all-MiniLM-L6-v2-five-scores / trainer_state.json
youssefkhalil320's picture
Upload folder using huggingface_hub
3e9eceb verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 94748,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004221724996833706,
"grad_norm": 99.84131622314453,
"learning_rate": 2.0474934036939315e-07,
"loss": 12.0647,
"step": 100
},
{
"epoch": 0.008443449993667413,
"grad_norm": 95.49885559082031,
"learning_rate": 4.1583113456464383e-07,
"loss": 11.7727,
"step": 200
},
{
"epoch": 0.012665174990501118,
"grad_norm": 66.75164794921875,
"learning_rate": 6.269129287598945e-07,
"loss": 11.1315,
"step": 300
},
{
"epoch": 0.016886899987334825,
"grad_norm": 65.58353424072266,
"learning_rate": 8.379947229551452e-07,
"loss": 10.8852,
"step": 400
},
{
"epoch": 0.02110862498416853,
"grad_norm": 66.87042999267578,
"learning_rate": 1.0469656992084432e-06,
"loss": 9.9168,
"step": 500
},
{
"epoch": 0.02110862498416853,
"eval_loss": 10.42082691192627,
"eval_runtime": 383.256,
"eval_samples_per_second": 494.424,
"eval_steps_per_second": 15.452,
"step": 500
},
{
"epoch": 0.025330349981002236,
"grad_norm": 112.13134765625,
"learning_rate": 1.258047493403694e-06,
"loss": 9.4099,
"step": 600
},
{
"epoch": 0.029552074977835945,
"grad_norm": 74.77991485595703,
"learning_rate": 1.4691292875989447e-06,
"loss": 8.5361,
"step": 700
},
{
"epoch": 0.03377379997466965,
"grad_norm": 66.13243103027344,
"learning_rate": 1.6802110817941955e-06,
"loss": 7.7286,
"step": 800
},
{
"epoch": 0.03799552497150336,
"grad_norm": 43.571102142333984,
"learning_rate": 1.8912928759894462e-06,
"loss": 7.0852,
"step": 900
},
{
"epoch": 0.04221724996833706,
"grad_norm": 39.42892074584961,
"learning_rate": 2.1023746701846966e-06,
"loss": 6.3646,
"step": 1000
},
{
"epoch": 0.04221724996833706,
"eval_loss": 6.634986400604248,
"eval_runtime": 371.6377,
"eval_samples_per_second": 509.881,
"eval_steps_per_second": 15.935,
"step": 1000
},
{
"epoch": 0.04643897496517077,
"grad_norm": 34.04639434814453,
"learning_rate": 2.3134564643799472e-06,
"loss": 6.1673,
"step": 1100
},
{
"epoch": 0.05066069996200447,
"grad_norm": 19.675539016723633,
"learning_rate": 2.5245382585751983e-06,
"loss": 5.5683,
"step": 1200
},
{
"epoch": 0.05488242495883818,
"grad_norm": 18.309850692749023,
"learning_rate": 2.7356200527704485e-06,
"loss": 5.4462,
"step": 1300
},
{
"epoch": 0.05910414995567189,
"grad_norm": 21.344575881958008,
"learning_rate": 2.9467018469656995e-06,
"loss": 5.303,
"step": 1400
},
{
"epoch": 0.06332587495250559,
"grad_norm": 18.55843162536621,
"learning_rate": 3.15778364116095e-06,
"loss": 5.1935,
"step": 1500
},
{
"epoch": 0.06332587495250559,
"eval_loss": 5.242936134338379,
"eval_runtime": 373.9106,
"eval_samples_per_second": 506.782,
"eval_steps_per_second": 15.838,
"step": 1500
},
{
"epoch": 0.0675475999493393,
"grad_norm": 15.893339157104492,
"learning_rate": 3.3688654353562008e-06,
"loss": 5.1856,
"step": 1600
},
{
"epoch": 0.07176932494617301,
"grad_norm": 34.76871871948242,
"learning_rate": 3.5799472295514514e-06,
"loss": 5.0136,
"step": 1700
},
{
"epoch": 0.07599104994300672,
"grad_norm": 20.7496280670166,
"learning_rate": 3.7910290237467025e-06,
"loss": 5.0667,
"step": 1800
},
{
"epoch": 0.08021277493984041,
"grad_norm": 30.79672622680664,
"learning_rate": 4.002110817941952e-06,
"loss": 4.9982,
"step": 1900
},
{
"epoch": 0.08443449993667412,
"grad_norm": 18.434511184692383,
"learning_rate": 4.213192612137204e-06,
"loss": 5.0429,
"step": 2000
},
{
"epoch": 0.08443449993667412,
"eval_loss": 5.009856224060059,
"eval_runtime": 383.4068,
"eval_samples_per_second": 494.23,
"eval_steps_per_second": 15.446,
"step": 2000
},
{
"epoch": 0.08865622493350783,
"grad_norm": 23.91575050354004,
"learning_rate": 4.424274406332454e-06,
"loss": 4.8719,
"step": 2100
},
{
"epoch": 0.09287794993034154,
"grad_norm": 16.657917022705078,
"learning_rate": 4.635356200527705e-06,
"loss": 4.8579,
"step": 2200
},
{
"epoch": 0.09709967492717525,
"grad_norm": 20.16551399230957,
"learning_rate": 4.846437994722956e-06,
"loss": 4.9282,
"step": 2300
},
{
"epoch": 0.10132139992400895,
"grad_norm": 26.069272994995117,
"learning_rate": 5.057519788918206e-06,
"loss": 4.9848,
"step": 2400
},
{
"epoch": 0.10554312492084265,
"grad_norm": 20.215301513671875,
"learning_rate": 5.268601583113458e-06,
"loss": 4.8974,
"step": 2500
},
{
"epoch": 0.10554312492084265,
"eval_loss": 4.907773971557617,
"eval_runtime": 383.6054,
"eval_samples_per_second": 493.974,
"eval_steps_per_second": 15.438,
"step": 2500
},
{
"epoch": 0.10976484991767636,
"grad_norm": 16.090208053588867,
"learning_rate": 5.4796833773087075e-06,
"loss": 4.9103,
"step": 2600
},
{
"epoch": 0.11398657491451007,
"grad_norm": 14.869135856628418,
"learning_rate": 5.690765171503958e-06,
"loss": 4.7459,
"step": 2700
},
{
"epoch": 0.11820829991134378,
"grad_norm": 40.26353454589844,
"learning_rate": 5.90184696569921e-06,
"loss": 4.8084,
"step": 2800
},
{
"epoch": 0.12243002490817748,
"grad_norm": 12.258187294006348,
"learning_rate": 6.112928759894459e-06,
"loss": 4.8221,
"step": 2900
},
{
"epoch": 0.12665174990501119,
"grad_norm": 14.802818298339844,
"learning_rate": 6.32401055408971e-06,
"loss": 4.7622,
"step": 3000
},
{
"epoch": 0.12665174990501119,
"eval_loss": 4.81698751449585,
"eval_runtime": 385.8745,
"eval_samples_per_second": 491.069,
"eval_steps_per_second": 15.347,
"step": 3000
},
{
"epoch": 0.1308734749018449,
"grad_norm": 34.78262710571289,
"learning_rate": 6.535092348284961e-06,
"loss": 4.7004,
"step": 3100
},
{
"epoch": 0.1350951998986786,
"grad_norm": 13.215925216674805,
"learning_rate": 6.746174142480212e-06,
"loss": 4.6912,
"step": 3200
},
{
"epoch": 0.1393169248955123,
"grad_norm": 26.275604248046875,
"learning_rate": 6.957255936675462e-06,
"loss": 4.6595,
"step": 3300
},
{
"epoch": 0.14353864989234602,
"grad_norm": 17.464338302612305,
"learning_rate": 7.1683377308707125e-06,
"loss": 4.7322,
"step": 3400
},
{
"epoch": 0.14776037488917973,
"grad_norm": 17.785873413085938,
"learning_rate": 7.379419525065964e-06,
"loss": 4.7575,
"step": 3500
},
{
"epoch": 0.14776037488917973,
"eval_loss": 4.719913959503174,
"eval_runtime": 380.6996,
"eval_samples_per_second": 497.744,
"eval_steps_per_second": 15.556,
"step": 3500
},
{
"epoch": 0.15198209988601344,
"grad_norm": 33.81704330444336,
"learning_rate": 7.590501319261215e-06,
"loss": 4.6443,
"step": 3600
},
{
"epoch": 0.15620382488284712,
"grad_norm": 37.59410095214844,
"learning_rate": 7.801583113456465e-06,
"loss": 4.6638,
"step": 3700
},
{
"epoch": 0.16042554987968083,
"grad_norm": 19.028879165649414,
"learning_rate": 8.012664907651716e-06,
"loss": 4.5958,
"step": 3800
},
{
"epoch": 0.16464727487651454,
"grad_norm": 17.87994956970215,
"learning_rate": 8.223746701846966e-06,
"loss": 4.6285,
"step": 3900
},
{
"epoch": 0.16886899987334825,
"grad_norm": 24.185325622558594,
"learning_rate": 8.434828496042217e-06,
"loss": 4.6347,
"step": 4000
},
{
"epoch": 0.16886899987334825,
"eval_loss": 4.655384063720703,
"eval_runtime": 389.5193,
"eval_samples_per_second": 486.474,
"eval_steps_per_second": 15.203,
"step": 4000
},
{
"epoch": 0.17309072487018196,
"grad_norm": 22.38553237915039,
"learning_rate": 8.645910290237468e-06,
"loss": 4.6558,
"step": 4100
},
{
"epoch": 0.17731244986701566,
"grad_norm": 17.568811416625977,
"learning_rate": 8.856992084432718e-06,
"loss": 4.6712,
"step": 4200
},
{
"epoch": 0.18153417486384937,
"grad_norm": 21.295473098754883,
"learning_rate": 9.068073878627969e-06,
"loss": 4.6126,
"step": 4300
},
{
"epoch": 0.18575589986068308,
"grad_norm": 43.39851760864258,
"learning_rate": 9.27915567282322e-06,
"loss": 4.6219,
"step": 4400
},
{
"epoch": 0.1899776248575168,
"grad_norm": 17.622400283813477,
"learning_rate": 9.488126649076517e-06,
"loss": 4.6101,
"step": 4500
},
{
"epoch": 0.1899776248575168,
"eval_loss": 4.6206207275390625,
"eval_runtime": 383.5173,
"eval_samples_per_second": 494.087,
"eval_steps_per_second": 15.441,
"step": 4500
},
{
"epoch": 0.1941993498543505,
"grad_norm": 16.083688735961914,
"learning_rate": 9.699208443271768e-06,
"loss": 4.7682,
"step": 4600
},
{
"epoch": 0.19842107485118418,
"grad_norm": 61.793235778808594,
"learning_rate": 9.91029023746702e-06,
"loss": 4.5385,
"step": 4700
},
{
"epoch": 0.2026427998480179,
"grad_norm": 19.626968383789062,
"learning_rate": 1.012137203166227e-05,
"loss": 4.6744,
"step": 4800
},
{
"epoch": 0.2068645248448516,
"grad_norm": 13.591843605041504,
"learning_rate": 1.033245382585752e-05,
"loss": 4.5383,
"step": 4900
},
{
"epoch": 0.2110862498416853,
"grad_norm": 26.316524505615234,
"learning_rate": 1.0543535620052772e-05,
"loss": 4.6095,
"step": 5000
},
{
"epoch": 0.2110862498416853,
"eval_loss": 4.626244068145752,
"eval_runtime": 385.0461,
"eval_samples_per_second": 492.125,
"eval_steps_per_second": 15.38,
"step": 5000
},
{
"epoch": 0.21530797483851902,
"grad_norm": 35.92019271850586,
"learning_rate": 1.0754617414248022e-05,
"loss": 4.6807,
"step": 5100
},
{
"epoch": 0.21952969983535273,
"grad_norm": 16.565446853637695,
"learning_rate": 1.0965699208443273e-05,
"loss": 4.4866,
"step": 5200
},
{
"epoch": 0.22375142483218644,
"grad_norm": 71.62289428710938,
"learning_rate": 1.1176781002638524e-05,
"loss": 4.5353,
"step": 5300
},
{
"epoch": 0.22797314982902014,
"grad_norm": 85.97599029541016,
"learning_rate": 1.1387862796833773e-05,
"loss": 4.5285,
"step": 5400
},
{
"epoch": 0.23219487482585385,
"grad_norm": 114.70708465576172,
"learning_rate": 1.1598944591029025e-05,
"loss": 4.5416,
"step": 5500
},
{
"epoch": 0.23219487482585385,
"eval_loss": 4.591431617736816,
"eval_runtime": 389.2061,
"eval_samples_per_second": 486.865,
"eval_steps_per_second": 15.216,
"step": 5500
},
{
"epoch": 0.23641659982268756,
"grad_norm": 18.64859962463379,
"learning_rate": 1.1810026385224276e-05,
"loss": 4.623,
"step": 5600
},
{
"epoch": 0.24063832481952124,
"grad_norm": 40.123992919921875,
"learning_rate": 1.2021108179419525e-05,
"loss": 4.5337,
"step": 5700
},
{
"epoch": 0.24486004981635495,
"grad_norm": 19.153076171875,
"learning_rate": 1.2232189973614777e-05,
"loss": 4.5726,
"step": 5800
},
{
"epoch": 0.24908177481318866,
"grad_norm": 16.48304557800293,
"learning_rate": 1.2443271767810027e-05,
"loss": 4.5467,
"step": 5900
},
{
"epoch": 0.25330349981002237,
"grad_norm": 35.746307373046875,
"learning_rate": 1.2654353562005276e-05,
"loss": 4.3986,
"step": 6000
},
{
"epoch": 0.25330349981002237,
"eval_loss": 4.601134300231934,
"eval_runtime": 382.9558,
"eval_samples_per_second": 494.812,
"eval_steps_per_second": 15.464,
"step": 6000
},
{
"epoch": 0.2575252248068561,
"grad_norm": 15.414148330688477,
"learning_rate": 1.2863324538258577e-05,
"loss": 4.559,
"step": 6100
},
{
"epoch": 0.2617469498036898,
"grad_norm": 18.076143264770508,
"learning_rate": 1.3074406332453826e-05,
"loss": 4.6066,
"step": 6200
},
{
"epoch": 0.2659686748005235,
"grad_norm": 19.2498836517334,
"learning_rate": 1.3285488126649078e-05,
"loss": 4.4445,
"step": 6300
},
{
"epoch": 0.2701903997973572,
"grad_norm": 17.76839828491211,
"learning_rate": 1.3496569920844329e-05,
"loss": 4.4518,
"step": 6400
},
{
"epoch": 0.2744121247941909,
"grad_norm": 17.81798553466797,
"learning_rate": 1.3707651715039578e-05,
"loss": 4.4761,
"step": 6500
},
{
"epoch": 0.2744121247941909,
"eval_loss": 4.5092549324035645,
"eval_runtime": 383.1229,
"eval_samples_per_second": 494.596,
"eval_steps_per_second": 15.457,
"step": 6500
},
{
"epoch": 0.2786338497910246,
"grad_norm": 26.38512420654297,
"learning_rate": 1.391873350923483e-05,
"loss": 4.3362,
"step": 6600
},
{
"epoch": 0.28285557478785833,
"grad_norm": 22.481346130371094,
"learning_rate": 1.4129815303430081e-05,
"loss": 4.4936,
"step": 6700
},
{
"epoch": 0.28707729978469204,
"grad_norm": 34.47163772583008,
"learning_rate": 1.434089709762533e-05,
"loss": 4.2397,
"step": 6800
},
{
"epoch": 0.29129902478152575,
"grad_norm": 14.515748023986816,
"learning_rate": 1.4551978891820582e-05,
"loss": 4.5243,
"step": 6900
},
{
"epoch": 0.29552074977835946,
"grad_norm": 13.278151512145996,
"learning_rate": 1.4763060686015833e-05,
"loss": 4.496,
"step": 7000
},
{
"epoch": 0.29552074977835946,
"eval_loss": 4.396859645843506,
"eval_runtime": 380.6896,
"eval_samples_per_second": 497.757,
"eval_steps_per_second": 15.556,
"step": 7000
},
{
"epoch": 0.29974247477519317,
"grad_norm": 28.896297454833984,
"learning_rate": 1.4974142480211082e-05,
"loss": 4.2558,
"step": 7100
},
{
"epoch": 0.3039641997720269,
"grad_norm": 41.2579231262207,
"learning_rate": 1.5185224274406334e-05,
"loss": 4.4691,
"step": 7200
},
{
"epoch": 0.30818592476886053,
"grad_norm": 23.71632957458496,
"learning_rate": 1.5396306068601585e-05,
"loss": 4.4819,
"step": 7300
},
{
"epoch": 0.31240764976569424,
"grad_norm": 31.86453628540039,
"learning_rate": 1.5607387862796834e-05,
"loss": 4.3785,
"step": 7400
},
{
"epoch": 0.31662937476252795,
"grad_norm": 27.288312911987305,
"learning_rate": 1.5818469656992086e-05,
"loss": 4.4214,
"step": 7500
},
{
"epoch": 0.31662937476252795,
"eval_loss": 4.419884204864502,
"eval_runtime": 381.558,
"eval_samples_per_second": 496.624,
"eval_steps_per_second": 15.521,
"step": 7500
},
{
"epoch": 0.32085109975936166,
"grad_norm": 31.071884155273438,
"learning_rate": 1.6029551451187338e-05,
"loss": 4.4935,
"step": 7600
},
{
"epoch": 0.32507282475619537,
"grad_norm": 18.19314956665039,
"learning_rate": 1.6240633245382587e-05,
"loss": 4.4238,
"step": 7700
},
{
"epoch": 0.3292945497530291,
"grad_norm": 23.160940170288086,
"learning_rate": 1.645171503957784e-05,
"loss": 4.5361,
"step": 7800
},
{
"epoch": 0.3335162747498628,
"grad_norm": 30.73809242248535,
"learning_rate": 1.666279683377309e-05,
"loss": 4.4284,
"step": 7900
},
{
"epoch": 0.3377379997466965,
"grad_norm": 17.99860954284668,
"learning_rate": 1.6873878627968337e-05,
"loss": 4.3918,
"step": 8000
},
{
"epoch": 0.3377379997466965,
"eval_loss": 4.397606372833252,
"eval_runtime": 384.2042,
"eval_samples_per_second": 493.204,
"eval_steps_per_second": 15.414,
"step": 8000
},
{
"epoch": 0.3419597247435302,
"grad_norm": 35.046939849853516,
"learning_rate": 1.708496042216359e-05,
"loss": 4.4622,
"step": 8100
},
{
"epoch": 0.3461814497403639,
"grad_norm": 15.606463432312012,
"learning_rate": 1.7296042216358842e-05,
"loss": 4.4128,
"step": 8200
},
{
"epoch": 0.3504031747371976,
"grad_norm": 22.16493034362793,
"learning_rate": 1.750712401055409e-05,
"loss": 4.1565,
"step": 8300
},
{
"epoch": 0.35462489973403133,
"grad_norm": 53.230464935302734,
"learning_rate": 1.7718205804749343e-05,
"loss": 4.3241,
"step": 8400
},
{
"epoch": 0.35884662473086504,
"grad_norm": 24.896638870239258,
"learning_rate": 1.7929287598944592e-05,
"loss": 4.2764,
"step": 8500
},
{
"epoch": 0.35884662473086504,
"eval_loss": 4.426083564758301,
"eval_runtime": 374.4401,
"eval_samples_per_second": 506.065,
"eval_steps_per_second": 15.816,
"step": 8500
},
{
"epoch": 0.36306834972769875,
"grad_norm": 37.623687744140625,
"learning_rate": 1.814036939313984e-05,
"loss": 4.2101,
"step": 8600
},
{
"epoch": 0.36729007472453246,
"grad_norm": 15.538634300231934,
"learning_rate": 1.8351451187335093e-05,
"loss": 4.4044,
"step": 8700
},
{
"epoch": 0.37151179972136616,
"grad_norm": 26.525089263916016,
"learning_rate": 1.8562532981530342e-05,
"loss": 4.254,
"step": 8800
},
{
"epoch": 0.3757335247181999,
"grad_norm": 23.026073455810547,
"learning_rate": 1.8771503957783643e-05,
"loss": 4.362,
"step": 8900
},
{
"epoch": 0.3799552497150336,
"grad_norm": 38.66673278808594,
"learning_rate": 1.8982585751978892e-05,
"loss": 4.3424,
"step": 9000
},
{
"epoch": 0.3799552497150336,
"eval_loss": 4.440927505493164,
"eval_runtime": 383.0189,
"eval_samples_per_second": 494.73,
"eval_steps_per_second": 15.461,
"step": 9000
},
{
"epoch": 0.3841769747118673,
"grad_norm": 33.175357818603516,
"learning_rate": 1.9193667546174144e-05,
"loss": 4.3383,
"step": 9100
},
{
"epoch": 0.388398699708701,
"grad_norm": 26.07857894897461,
"learning_rate": 1.9404749340369397e-05,
"loss": 4.4713,
"step": 9200
},
{
"epoch": 0.39262042470553465,
"grad_norm": 20.96491813659668,
"learning_rate": 1.9615831134564646e-05,
"loss": 4.2773,
"step": 9300
},
{
"epoch": 0.39684214970236836,
"grad_norm": 36.000953674316406,
"learning_rate": 1.9826912928759895e-05,
"loss": 4.2842,
"step": 9400
},
{
"epoch": 0.40106387469920207,
"grad_norm": 48.96345901489258,
"learning_rate": 1.99957782651015e-05,
"loss": 4.3301,
"step": 9500
},
{
"epoch": 0.40106387469920207,
"eval_loss": 4.345411777496338,
"eval_runtime": 388.2926,
"eval_samples_per_second": 488.011,
"eval_steps_per_second": 15.251,
"step": 9500
},
{
"epoch": 0.4052855996960358,
"grad_norm": 34.45381546020508,
"learning_rate": 1.9972324182332042e-05,
"loss": 4.3224,
"step": 9600
},
{
"epoch": 0.4095073246928695,
"grad_norm": 20.29124641418457,
"learning_rate": 1.9948870099562583e-05,
"loss": 4.3878,
"step": 9700
},
{
"epoch": 0.4137290496897032,
"grad_norm": 15.433843612670898,
"learning_rate": 1.9925416016793124e-05,
"loss": 4.3614,
"step": 9800
},
{
"epoch": 0.4179507746865369,
"grad_norm": 22.69325065612793,
"learning_rate": 1.9901961934023665e-05,
"loss": 4.3423,
"step": 9900
},
{
"epoch": 0.4221724996833706,
"grad_norm": 15.297199249267578,
"learning_rate": 1.987850785125421e-05,
"loss": 4.3576,
"step": 10000
},
{
"epoch": 0.4221724996833706,
"eval_loss": 4.354933261871338,
"eval_runtime": 386.7965,
"eval_samples_per_second": 489.898,
"eval_steps_per_second": 15.31,
"step": 10000
},
{
"epoch": 0.4263942246802043,
"grad_norm": 18.692224502563477,
"learning_rate": 1.985505376848475e-05,
"loss": 4.1451,
"step": 10100
},
{
"epoch": 0.43061594967703803,
"grad_norm": 16.447420120239258,
"learning_rate": 1.983159968571529e-05,
"loss": 4.3326,
"step": 10200
},
{
"epoch": 0.43483767467387174,
"grad_norm": 21.373384475708008,
"learning_rate": 1.9808145602945835e-05,
"loss": 4.2761,
"step": 10300
},
{
"epoch": 0.43905939967070545,
"grad_norm": 16.10767364501953,
"learning_rate": 1.9784691520176376e-05,
"loss": 4.2421,
"step": 10400
},
{
"epoch": 0.44328112466753916,
"grad_norm": 33.177181243896484,
"learning_rate": 1.9761237437406917e-05,
"loss": 4.262,
"step": 10500
},
{
"epoch": 0.44328112466753916,
"eval_loss": 4.349300861358643,
"eval_runtime": 386.4801,
"eval_samples_per_second": 490.299,
"eval_steps_per_second": 15.323,
"step": 10500
},
{
"epoch": 0.44750284966437287,
"grad_norm": 29.280973434448242,
"learning_rate": 1.973778335463746e-05,
"loss": 4.1227,
"step": 10600
},
{
"epoch": 0.4517245746612066,
"grad_norm": 26.586437225341797,
"learning_rate": 1.9714329271868002e-05,
"loss": 4.2365,
"step": 10700
},
{
"epoch": 0.4559462996580403,
"grad_norm": 34.612091064453125,
"learning_rate": 1.9690875189098543e-05,
"loss": 4.3528,
"step": 10800
},
{
"epoch": 0.460168024654874,
"grad_norm": 25.977516174316406,
"learning_rate": 1.9667421106329084e-05,
"loss": 4.077,
"step": 10900
},
{
"epoch": 0.4643897496517077,
"grad_norm": 17.730405807495117,
"learning_rate": 1.964396702355963e-05,
"loss": 4.0878,
"step": 11000
},
{
"epoch": 0.4643897496517077,
"eval_loss": 4.334869384765625,
"eval_runtime": 381.6891,
"eval_samples_per_second": 496.454,
"eval_steps_per_second": 15.515,
"step": 11000
},
{
"epoch": 0.4686114746485414,
"grad_norm": 23.120807647705078,
"learning_rate": 1.9620747481617864e-05,
"loss": 4.4246,
"step": 11100
},
{
"epoch": 0.4728331996453751,
"grad_norm": 62.60495376586914,
"learning_rate": 1.9597293398848408e-05,
"loss": 4.1019,
"step": 11200
},
{
"epoch": 0.47705492464220883,
"grad_norm": 38.34320068359375,
"learning_rate": 1.957383931607895e-05,
"loss": 4.2565,
"step": 11300
},
{
"epoch": 0.4812766496390425,
"grad_norm": 19.053138732910156,
"learning_rate": 1.955038523330949e-05,
"loss": 4.3177,
"step": 11400
},
{
"epoch": 0.4854983746358762,
"grad_norm": 25.41574478149414,
"learning_rate": 1.952693115054003e-05,
"loss": 4.1283,
"step": 11500
},
{
"epoch": 0.4854983746358762,
"eval_loss": 4.423605918884277,
"eval_runtime": 385.1053,
"eval_samples_per_second": 492.05,
"eval_steps_per_second": 15.378,
"step": 11500
},
{
"epoch": 0.4897200996327099,
"grad_norm": 16.951583862304688,
"learning_rate": 1.9503477067770572e-05,
"loss": 4.2232,
"step": 11600
},
{
"epoch": 0.4939418246295436,
"grad_norm": 19.988862991333008,
"learning_rate": 1.9480022985001116e-05,
"loss": 4.2347,
"step": 11700
},
{
"epoch": 0.4981635496263773,
"grad_norm": 29.073562622070312,
"learning_rate": 1.9456568902231657e-05,
"loss": 4.082,
"step": 11800
},
{
"epoch": 0.502385274623211,
"grad_norm": 57.14900207519531,
"learning_rate": 1.94331148194622e-05,
"loss": 4.2026,
"step": 11900
},
{
"epoch": 0.5066069996200447,
"grad_norm": 19.27651596069336,
"learning_rate": 1.9409660736692742e-05,
"loss": 4.2687,
"step": 12000
},
{
"epoch": 0.5066069996200447,
"eval_loss": 4.269065856933594,
"eval_runtime": 389.1571,
"eval_samples_per_second": 486.927,
"eval_steps_per_second": 15.218,
"step": 12000
},
{
"epoch": 0.5108287246168784,
"grad_norm": 47.312129974365234,
"learning_rate": 1.9386206653923283e-05,
"loss": 4.302,
"step": 12100
},
{
"epoch": 0.5150504496137122,
"grad_norm": 51.988243103027344,
"learning_rate": 1.9362752571153824e-05,
"loss": 4.0474,
"step": 12200
},
{
"epoch": 0.5192721746105459,
"grad_norm": 25.95807456970215,
"learning_rate": 1.933929848838437e-05,
"loss": 4.1286,
"step": 12300
},
{
"epoch": 0.5234938996073796,
"grad_norm": 29.327302932739258,
"learning_rate": 1.931584440561491e-05,
"loss": 4.3888,
"step": 12400
},
{
"epoch": 0.5277156246042133,
"grad_norm": 21.580862045288086,
"learning_rate": 1.929239032284545e-05,
"loss": 4.2339,
"step": 12500
},
{
"epoch": 0.5277156246042133,
"eval_loss": 4.2413506507873535,
"eval_runtime": 385.1064,
"eval_samples_per_second": 492.048,
"eval_steps_per_second": 15.378,
"step": 12500
},
{
"epoch": 0.531937349601047,
"grad_norm": 57.45950698852539,
"learning_rate": 1.926893624007599e-05,
"loss": 4.1976,
"step": 12600
},
{
"epoch": 0.5361590745978807,
"grad_norm": 40.33968734741211,
"learning_rate": 1.9245482157306535e-05,
"loss": 4.1851,
"step": 12700
},
{
"epoch": 0.5403807995947144,
"grad_norm": 33.016944885253906,
"learning_rate": 1.9222028074537076e-05,
"loss": 4.3969,
"step": 12800
},
{
"epoch": 0.5446025245915481,
"grad_norm": 33.44650650024414,
"learning_rate": 1.919857399176762e-05,
"loss": 4.5229,
"step": 12900
},
{
"epoch": 0.5488242495883818,
"grad_norm": 26.207275390625,
"learning_rate": 1.917511990899816e-05,
"loss": 4.2242,
"step": 13000
},
{
"epoch": 0.5488242495883818,
"eval_loss": 4.138918399810791,
"eval_runtime": 383.7736,
"eval_samples_per_second": 493.757,
"eval_steps_per_second": 15.431,
"step": 13000
},
{
"epoch": 0.5530459745852155,
"grad_norm": 17.512197494506836,
"learning_rate": 1.9151665826228703e-05,
"loss": 4.2804,
"step": 13100
},
{
"epoch": 0.5572676995820492,
"grad_norm": 27.008798599243164,
"learning_rate": 1.9128211743459243e-05,
"loss": 4.2097,
"step": 13200
},
{
"epoch": 0.561489424578883,
"grad_norm": 66.86945343017578,
"learning_rate": 1.9104757660689784e-05,
"loss": 3.9226,
"step": 13300
},
{
"epoch": 0.5657111495757167,
"grad_norm": 49.8387451171875,
"learning_rate": 1.9081538118748023e-05,
"loss": 4.2274,
"step": 13400
},
{
"epoch": 0.5699328745725504,
"grad_norm": 134.98756408691406,
"learning_rate": 1.9058084035978564e-05,
"loss": 4.0309,
"step": 13500
},
{
"epoch": 0.5699328745725504,
"eval_loss": 4.242140293121338,
"eval_runtime": 388.1121,
"eval_samples_per_second": 488.238,
"eval_steps_per_second": 15.258,
"step": 13500
},
{
"epoch": 0.5741545995693841,
"grad_norm": 16.55082130432129,
"learning_rate": 1.9034629953209108e-05,
"loss": 4.3429,
"step": 13600
},
{
"epoch": 0.5783763245662178,
"grad_norm": 17.452939987182617,
"learning_rate": 1.901117587043965e-05,
"loss": 4.0352,
"step": 13700
},
{
"epoch": 0.5825980495630515,
"grad_norm": 36.04256057739258,
"learning_rate": 1.898772178767019e-05,
"loss": 4.2926,
"step": 13800
},
{
"epoch": 0.5868197745598852,
"grad_norm": 38.001712799072266,
"learning_rate": 1.896426770490073e-05,
"loss": 4.3063,
"step": 13900
},
{
"epoch": 0.5910414995567189,
"grad_norm": 30.819929122924805,
"learning_rate": 1.894104816295897e-05,
"loss": 4.3172,
"step": 14000
},
{
"epoch": 0.5910414995567189,
"eval_loss": 4.2266845703125,
"eval_runtime": 374.8984,
"eval_samples_per_second": 505.446,
"eval_steps_per_second": 15.796,
"step": 14000
},
{
"epoch": 0.5952632245535526,
"grad_norm": 22.755001068115234,
"learning_rate": 1.891759408018951e-05,
"loss": 4.0057,
"step": 14100
},
{
"epoch": 0.5994849495503863,
"grad_norm": 109.07671356201172,
"learning_rate": 1.8894139997420055e-05,
"loss": 4.2081,
"step": 14200
},
{
"epoch": 0.60370667454722,
"grad_norm": 32.722652435302734,
"learning_rate": 1.8870685914650596e-05,
"loss": 4.2408,
"step": 14300
},
{
"epoch": 0.6079283995440538,
"grad_norm": 17.725366592407227,
"learning_rate": 1.8847231831881137e-05,
"loss": 4.1066,
"step": 14400
},
{
"epoch": 0.6121501245408874,
"grad_norm": 18.536985397338867,
"learning_rate": 1.8823777749111678e-05,
"loss": 4.1997,
"step": 14500
},
{
"epoch": 0.6121501245408874,
"eval_loss": 4.1797871589660645,
"eval_runtime": 387.8769,
"eval_samples_per_second": 488.534,
"eval_steps_per_second": 15.268,
"step": 14500
},
{
"epoch": 0.6163718495377211,
"grad_norm": 18.541019439697266,
"learning_rate": 1.880032366634222e-05,
"loss": 4.2364,
"step": 14600
},
{
"epoch": 0.6205935745345548,
"grad_norm": 20.86994743347168,
"learning_rate": 1.877686958357276e-05,
"loss": 4.1135,
"step": 14700
},
{
"epoch": 0.6248152995313885,
"grad_norm": 19.748693466186523,
"learning_rate": 1.8753415500803304e-05,
"loss": 4.0561,
"step": 14800
},
{
"epoch": 0.6290370245282222,
"grad_norm": 16.203369140625,
"learning_rate": 1.8729961418033845e-05,
"loss": 4.0347,
"step": 14900
},
{
"epoch": 0.6332587495250559,
"grad_norm": 38.8477668762207,
"learning_rate": 1.870650733526439e-05,
"loss": 4.1979,
"step": 15000
},
{
"epoch": 0.6332587495250559,
"eval_loss": 4.240939140319824,
"eval_runtime": 382.8365,
"eval_samples_per_second": 494.966,
"eval_steps_per_second": 15.469,
"step": 15000
},
{
"epoch": 0.6374804745218896,
"grad_norm": 44.9690055847168,
"learning_rate": 1.868305325249493e-05,
"loss": 4.0132,
"step": 15100
},
{
"epoch": 0.6417021995187233,
"grad_norm": 21.35906410217285,
"learning_rate": 1.865959916972547e-05,
"loss": 4.1131,
"step": 15200
},
{
"epoch": 0.645923924515557,
"grad_norm": 26.089805603027344,
"learning_rate": 1.8636145086956012e-05,
"loss": 3.8049,
"step": 15300
},
{
"epoch": 0.6501456495123907,
"grad_norm": 17.96413230895996,
"learning_rate": 1.8612691004186556e-05,
"loss": 3.9468,
"step": 15400
},
{
"epoch": 0.6543673745092244,
"grad_norm": 23.05537223815918,
"learning_rate": 1.8589236921417097e-05,
"loss": 4.17,
"step": 15500
},
{
"epoch": 0.6543673745092244,
"eval_loss": 4.1938157081604,
"eval_runtime": 389.3344,
"eval_samples_per_second": 486.705,
"eval_steps_per_second": 15.211,
"step": 15500
},
{
"epoch": 0.6585890995060582,
"grad_norm": 29.09811782836914,
"learning_rate": 1.8565782838647638e-05,
"loss": 4.2369,
"step": 15600
},
{
"epoch": 0.6628108245028919,
"grad_norm": 54.29764938354492,
"learning_rate": 1.854232875587818e-05,
"loss": 4.159,
"step": 15700
},
{
"epoch": 0.6670325494997256,
"grad_norm": 30.67020606994629,
"learning_rate": 1.8518874673108723e-05,
"loss": 4.1172,
"step": 15800
},
{
"epoch": 0.6712542744965593,
"grad_norm": 50.64274597167969,
"learning_rate": 1.8495420590339268e-05,
"loss": 4.01,
"step": 15900
},
{
"epoch": 0.675475999493393,
"grad_norm": 48.833839416503906,
"learning_rate": 1.847196650756981e-05,
"loss": 4.0204,
"step": 16000
},
{
"epoch": 0.675475999493393,
"eval_loss": 4.279551982879639,
"eval_runtime": 380.0,
"eval_samples_per_second": 498.661,
"eval_steps_per_second": 15.584,
"step": 16000
},
{
"epoch": 0.6796977244902267,
"grad_norm": 32.197509765625,
"learning_rate": 1.844851242480035e-05,
"loss": 4.0013,
"step": 16100
},
{
"epoch": 0.6839194494870604,
"grad_norm": 27.845190048217773,
"learning_rate": 1.842505834203089e-05,
"loss": 4.0174,
"step": 16200
},
{
"epoch": 0.6881411744838941,
"grad_norm": 57.353271484375,
"learning_rate": 1.840160425926143e-05,
"loss": 4.0616,
"step": 16300
},
{
"epoch": 0.6923628994807278,
"grad_norm": 49.316341400146484,
"learning_rate": 1.8378150176491972e-05,
"loss": 3.9944,
"step": 16400
},
{
"epoch": 0.6965846244775615,
"grad_norm": 69.53150939941406,
"learning_rate": 1.8354696093722516e-05,
"loss": 4.05,
"step": 16500
},
{
"epoch": 0.6965846244775615,
"eval_loss": 4.213247776031494,
"eval_runtime": 390.3133,
"eval_samples_per_second": 485.484,
"eval_steps_per_second": 15.172,
"step": 16500
},
{
"epoch": 0.7008063494743952,
"grad_norm": 101.47056579589844,
"learning_rate": 1.8331242010953057e-05,
"loss": 4.0769,
"step": 16600
},
{
"epoch": 0.705028074471229,
"grad_norm": 21.874292373657227,
"learning_rate": 1.8307787928183602e-05,
"loss": 4.1289,
"step": 16700
},
{
"epoch": 0.7092497994680627,
"grad_norm": 28.484107971191406,
"learning_rate": 1.8284333845414143e-05,
"loss": 4.0941,
"step": 16800
},
{
"epoch": 0.7134715244648964,
"grad_norm": 20.411182403564453,
"learning_rate": 1.8260879762644684e-05,
"loss": 4.2556,
"step": 16900
},
{
"epoch": 0.7176932494617301,
"grad_norm": 21.108642578125,
"learning_rate": 1.8237425679875224e-05,
"loss": 4.3075,
"step": 17000
},
{
"epoch": 0.7176932494617301,
"eval_loss": 4.128803253173828,
"eval_runtime": 387.8228,
"eval_samples_per_second": 488.602,
"eval_steps_per_second": 15.27,
"step": 17000
},
{
"epoch": 0.7219149744585638,
"grad_norm": 24.48760986328125,
"learning_rate": 1.821397159710577e-05,
"loss": 4.0751,
"step": 17100
},
{
"epoch": 0.7261366994553975,
"grad_norm": 28.520254135131836,
"learning_rate": 1.819051751433631e-05,
"loss": 4.0711,
"step": 17200
},
{
"epoch": 0.7303584244522312,
"grad_norm": 37.351932525634766,
"learning_rate": 1.816706343156685e-05,
"loss": 3.9483,
"step": 17300
},
{
"epoch": 0.7345801494490649,
"grad_norm": 32.99684143066406,
"learning_rate": 1.814360934879739e-05,
"loss": 4.3186,
"step": 17400
},
{
"epoch": 0.7388018744458986,
"grad_norm": 24.938167572021484,
"learning_rate": 1.8120155266027936e-05,
"loss": 3.932,
"step": 17500
},
{
"epoch": 0.7388018744458986,
"eval_loss": 4.114780902862549,
"eval_runtime": 379.5561,
"eval_samples_per_second": 499.244,
"eval_steps_per_second": 15.602,
"step": 17500
},
{
"epoch": 0.7430235994427323,
"grad_norm": 19.03070068359375,
"learning_rate": 1.8096701183258477e-05,
"loss": 3.8774,
"step": 17600
},
{
"epoch": 0.747245324439566,
"grad_norm": 20.461793899536133,
"learning_rate": 1.807324710048902e-05,
"loss": 4.2312,
"step": 17700
},
{
"epoch": 0.7514670494363997,
"grad_norm": 50.202266693115234,
"learning_rate": 1.8049793017719562e-05,
"loss": 3.9327,
"step": 17800
},
{
"epoch": 0.7556887744332335,
"grad_norm": 28.14083480834961,
"learning_rate": 1.8026338934950103e-05,
"loss": 4.2264,
"step": 17900
},
{
"epoch": 0.7599104994300672,
"grad_norm": 58.97740173339844,
"learning_rate": 1.8002884852180644e-05,
"loss": 3.9723,
"step": 18000
},
{
"epoch": 0.7599104994300672,
"eval_loss": 4.106083869934082,
"eval_runtime": 385.336,
"eval_samples_per_second": 491.755,
"eval_steps_per_second": 15.368,
"step": 18000
},
{
"epoch": 0.7641322244269009,
"grad_norm": 25.112369537353516,
"learning_rate": 1.7979430769411185e-05,
"loss": 4.1206,
"step": 18100
},
{
"epoch": 0.7683539494237346,
"grad_norm": 29.05199432373047,
"learning_rate": 1.795597668664173e-05,
"loss": 4.1744,
"step": 18200
},
{
"epoch": 0.7725756744205683,
"grad_norm": 44.24822998046875,
"learning_rate": 1.793252260387227e-05,
"loss": 3.89,
"step": 18300
},
{
"epoch": 0.776797399417402,
"grad_norm": 44.473838806152344,
"learning_rate": 1.790930306193051e-05,
"loss": 4.1414,
"step": 18400
},
{
"epoch": 0.7810191244142357,
"grad_norm": 29.81984519958496,
"learning_rate": 1.788584897916105e-05,
"loss": 4.0286,
"step": 18500
},
{
"epoch": 0.7810191244142357,
"eval_loss": 4.140493392944336,
"eval_runtime": 383.1405,
"eval_samples_per_second": 494.573,
"eval_steps_per_second": 15.456,
"step": 18500
},
{
"epoch": 0.7852408494110693,
"grad_norm": 67.35831451416016,
"learning_rate": 1.786239489639159e-05,
"loss": 3.885,
"step": 18600
},
{
"epoch": 0.789462574407903,
"grad_norm": 65.76871490478516,
"learning_rate": 1.783894081362213e-05,
"loss": 4.3785,
"step": 18700
},
{
"epoch": 0.7936842994047367,
"grad_norm": 22.527624130249023,
"learning_rate": 1.7815486730852672e-05,
"loss": 3.9304,
"step": 18800
},
{
"epoch": 0.7979060244015704,
"grad_norm": 62.7253532409668,
"learning_rate": 1.7792032648083217e-05,
"loss": 4.0831,
"step": 18900
},
{
"epoch": 0.8021277493984041,
"grad_norm": 23.038379669189453,
"learning_rate": 1.7768578565313758e-05,
"loss": 4.1698,
"step": 19000
},
{
"epoch": 0.8021277493984041,
"eval_loss": 4.099842071533203,
"eval_runtime": 383.8583,
"eval_samples_per_second": 493.648,
"eval_steps_per_second": 15.428,
"step": 19000
},
{
"epoch": 0.8063494743952379,
"grad_norm": 49.628074645996094,
"learning_rate": 1.7745124482544302e-05,
"loss": 3.9876,
"step": 19100
},
{
"epoch": 0.8105711993920716,
"grad_norm": 81.16788482666016,
"learning_rate": 1.7721670399774843e-05,
"loss": 3.9194,
"step": 19200
},
{
"epoch": 0.8147929243889053,
"grad_norm": 62.49740982055664,
"learning_rate": 1.7698216317005384e-05,
"loss": 3.9222,
"step": 19300
},
{
"epoch": 0.819014649385739,
"grad_norm": 33.98008728027344,
"learning_rate": 1.7674762234235928e-05,
"loss": 4.1863,
"step": 19400
},
{
"epoch": 0.8232363743825727,
"grad_norm": 22.858428955078125,
"learning_rate": 1.765130815146647e-05,
"loss": 4.0315,
"step": 19500
},
{
"epoch": 0.8232363743825727,
"eval_loss": 4.077751636505127,
"eval_runtime": 385.7439,
"eval_samples_per_second": 491.235,
"eval_steps_per_second": 15.352,
"step": 19500
},
{
"epoch": 0.8274580993794064,
"grad_norm": 24.006502151489258,
"learning_rate": 1.762785406869701e-05,
"loss": 3.9286,
"step": 19600
},
{
"epoch": 0.8316798243762401,
"grad_norm": 27.51500701904297,
"learning_rate": 1.760439998592755e-05,
"loss": 3.9605,
"step": 19700
},
{
"epoch": 0.8359015493730738,
"grad_norm": 77.57249450683594,
"learning_rate": 1.7580945903158092e-05,
"loss": 4.1991,
"step": 19800
},
{
"epoch": 0.8401232743699075,
"grad_norm": 30.322465896606445,
"learning_rate": 1.7557491820388636e-05,
"loss": 4.0311,
"step": 19900
},
{
"epoch": 0.8443449993667412,
"grad_norm": 27.484439849853516,
"learning_rate": 1.7534037737619177e-05,
"loss": 3.7869,
"step": 20000
},
{
"epoch": 0.8443449993667412,
"eval_loss": 4.174862384796143,
"eval_runtime": 387.7755,
"eval_samples_per_second": 488.662,
"eval_steps_per_second": 15.272,
"step": 20000
},
{
"epoch": 0.8485667243635749,
"grad_norm": 23.302629470825195,
"learning_rate": 1.751058365484972e-05,
"loss": 3.9232,
"step": 20100
},
{
"epoch": 0.8527884493604087,
"grad_norm": 23.86292839050293,
"learning_rate": 1.7487129572080262e-05,
"loss": 4.034,
"step": 20200
},
{
"epoch": 0.8570101743572424,
"grad_norm": 21.296226501464844,
"learning_rate": 1.7463675489310803e-05,
"loss": 4.2625,
"step": 20300
},
{
"epoch": 0.8612318993540761,
"grad_norm": 30.49806022644043,
"learning_rate": 1.7440221406541344e-05,
"loss": 3.983,
"step": 20400
},
{
"epoch": 0.8654536243509098,
"grad_norm": 27.714088439941406,
"learning_rate": 1.741700186459958e-05,
"loss": 4.2154,
"step": 20500
},
{
"epoch": 0.8654536243509098,
"eval_loss": 4.1056694984436035,
"eval_runtime": 381.9065,
"eval_samples_per_second": 496.171,
"eval_steps_per_second": 15.506,
"step": 20500
},
{
"epoch": 0.8696753493477435,
"grad_norm": 41.86836624145508,
"learning_rate": 1.7393547781830124e-05,
"loss": 4.1696,
"step": 20600
},
{
"epoch": 0.8738970743445772,
"grad_norm": 155.70619201660156,
"learning_rate": 1.7370093699060665e-05,
"loss": 3.8989,
"step": 20700
},
{
"epoch": 0.8781187993414109,
"grad_norm": 29.960683822631836,
"learning_rate": 1.734663961629121e-05,
"loss": 3.9004,
"step": 20800
},
{
"epoch": 0.8823405243382446,
"grad_norm": 35.051795959472656,
"learning_rate": 1.732318553352175e-05,
"loss": 4.2134,
"step": 20900
},
{
"epoch": 0.8865622493350783,
"grad_norm": 68.12458801269531,
"learning_rate": 1.729973145075229e-05,
"loss": 3.9789,
"step": 21000
},
{
"epoch": 0.8865622493350783,
"eval_loss": 4.088019371032715,
"eval_runtime": 387.7304,
"eval_samples_per_second": 488.718,
"eval_steps_per_second": 15.273,
"step": 21000
},
{
"epoch": 0.890783974331912,
"grad_norm": 32.287498474121094,
"learning_rate": 1.727627736798283e-05,
"loss": 4.2438,
"step": 21100
},
{
"epoch": 0.8950056993287457,
"grad_norm": 53.939903259277344,
"learning_rate": 1.7252823285213376e-05,
"loss": 3.9271,
"step": 21200
},
{
"epoch": 0.8992274243255794,
"grad_norm": 23.49491310119629,
"learning_rate": 1.7229369202443917e-05,
"loss": 3.9693,
"step": 21300
},
{
"epoch": 0.9034491493224132,
"grad_norm": 66.63202667236328,
"learning_rate": 1.7205915119674458e-05,
"loss": 4.0197,
"step": 21400
},
{
"epoch": 0.9076708743192469,
"grad_norm": 38.183998107910156,
"learning_rate": 1.7182461036905002e-05,
"loss": 4.1802,
"step": 21500
},
{
"epoch": 0.9076708743192469,
"eval_loss": 4.014532089233398,
"eval_runtime": 389.5421,
"eval_samples_per_second": 486.446,
"eval_steps_per_second": 15.202,
"step": 21500
},
{
"epoch": 0.9118925993160806,
"grad_norm": 25.80255126953125,
"learning_rate": 1.7159006954135543e-05,
"loss": 3.8818,
"step": 21600
},
{
"epoch": 0.9161143243129143,
"grad_norm": 105.8138198852539,
"learning_rate": 1.7135552871366084e-05,
"loss": 4.1069,
"step": 21700
},
{
"epoch": 0.920336049309748,
"grad_norm": 30.18889617919922,
"learning_rate": 1.7112098788596628e-05,
"loss": 3.7999,
"step": 21800
},
{
"epoch": 0.9245577743065817,
"grad_norm": 35.612117767333984,
"learning_rate": 1.708864470582717e-05,
"loss": 3.8949,
"step": 21900
},
{
"epoch": 0.9287794993034154,
"grad_norm": 40.16869354248047,
"learning_rate": 1.706519062305771e-05,
"loss": 3.9893,
"step": 22000
},
{
"epoch": 0.9287794993034154,
"eval_loss": 4.131321430206299,
"eval_runtime": 385.7035,
"eval_samples_per_second": 491.287,
"eval_steps_per_second": 15.354,
"step": 22000
},
{
"epoch": 0.9330012243002491,
"grad_norm": 44.418731689453125,
"learning_rate": 1.704173654028825e-05,
"loss": 4.0918,
"step": 22100
},
{
"epoch": 0.9372229492970828,
"grad_norm": 75.3628921508789,
"learning_rate": 1.7018282457518792e-05,
"loss": 4.0451,
"step": 22200
},
{
"epoch": 0.9414446742939165,
"grad_norm": 49.9603271484375,
"learning_rate": 1.6994828374749336e-05,
"loss": 3.9312,
"step": 22300
},
{
"epoch": 0.9456663992907502,
"grad_norm": 17.13250160217285,
"learning_rate": 1.6971374291979877e-05,
"loss": 4.117,
"step": 22400
},
{
"epoch": 0.949888124287584,
"grad_norm": 22.806734085083008,
"learning_rate": 1.694792020921042e-05,
"loss": 3.883,
"step": 22500
},
{
"epoch": 0.949888124287584,
"eval_loss": 4.10904598236084,
"eval_runtime": 385.0496,
"eval_samples_per_second": 492.121,
"eval_steps_per_second": 15.38,
"step": 22500
},
{
"epoch": 0.9541098492844177,
"grad_norm": 20.560272216796875,
"learning_rate": 1.6924700667268657e-05,
"loss": 3.6942,
"step": 22600
},
{
"epoch": 0.9583315742812513,
"grad_norm": 29.096519470214844,
"learning_rate": 1.6901246584499198e-05,
"loss": 4.1196,
"step": 22700
},
{
"epoch": 0.962553299278085,
"grad_norm": 37.949188232421875,
"learning_rate": 1.687779250172974e-05,
"loss": 3.9292,
"step": 22800
},
{
"epoch": 0.9667750242749187,
"grad_norm": 62.616546630859375,
"learning_rate": 1.685433841896028e-05,
"loss": 3.9081,
"step": 22900
},
{
"epoch": 0.9709967492717524,
"grad_norm": 25.162185668945312,
"learning_rate": 1.6830884336190824e-05,
"loss": 3.8169,
"step": 23000
},
{
"epoch": 0.9709967492717524,
"eval_loss": 4.123219966888428,
"eval_runtime": 384.0529,
"eval_samples_per_second": 493.398,
"eval_steps_per_second": 15.42,
"step": 23000
},
{
"epoch": 0.9752184742685861,
"grad_norm": 28.75921630859375,
"learning_rate": 1.6807664794249062e-05,
"loss": 3.8342,
"step": 23100
},
{
"epoch": 0.9794401992654198,
"grad_norm": 49.7982177734375,
"learning_rate": 1.6784210711479603e-05,
"loss": 4.078,
"step": 23200
},
{
"epoch": 0.9836619242622535,
"grad_norm": 20.842620849609375,
"learning_rate": 1.6760756628710144e-05,
"loss": 4.0002,
"step": 23300
},
{
"epoch": 0.9878836492590872,
"grad_norm": 27.644798278808594,
"learning_rate": 1.6737302545940685e-05,
"loss": 3.9373,
"step": 23400
},
{
"epoch": 0.9921053742559209,
"grad_norm": 45.985050201416016,
"learning_rate": 1.6713848463171226e-05,
"loss": 3.8344,
"step": 23500
},
{
"epoch": 0.9921053742559209,
"eval_loss": 4.156482696533203,
"eval_runtime": 388.5562,
"eval_samples_per_second": 487.68,
"eval_steps_per_second": 15.241,
"step": 23500
},
{
"epoch": 0.9963270992527546,
"grad_norm": 21.684967041015625,
"learning_rate": 1.6690394380401767e-05,
"loss": 4.2827,
"step": 23600
},
{
"epoch": 1.0005488242495884,
"grad_norm": 25.846668243408203,
"learning_rate": 1.666694029763231e-05,
"loss": 4.0298,
"step": 23700
},
{
"epoch": 1.004770549246422,
"grad_norm": 29.205032348632812,
"learning_rate": 1.6643486214862856e-05,
"loss": 3.9967,
"step": 23800
},
{
"epoch": 1.0089922742432558,
"grad_norm": 25.228853225708008,
"learning_rate": 1.6620032132093397e-05,
"loss": 3.7508,
"step": 23900
},
{
"epoch": 1.0132139992400895,
"grad_norm": 39.203895568847656,
"learning_rate": 1.6596578049323938e-05,
"loss": 3.8919,
"step": 24000
},
{
"epoch": 1.0132139992400895,
"eval_loss": 4.079037666320801,
"eval_runtime": 386.829,
"eval_samples_per_second": 489.857,
"eval_steps_per_second": 15.309,
"step": 24000
},
{
"epoch": 1.0174357242369232,
"grad_norm": 29.192537307739258,
"learning_rate": 1.657312396655448e-05,
"loss": 4.0181,
"step": 24100
},
{
"epoch": 1.021657449233757,
"grad_norm": 29.643613815307617,
"learning_rate": 1.6549669883785023e-05,
"loss": 3.7934,
"step": 24200
},
{
"epoch": 1.0258791742305906,
"grad_norm": 37.858001708984375,
"learning_rate": 1.6526215801015564e-05,
"loss": 3.8986,
"step": 24300
},
{
"epoch": 1.0301008992274243,
"grad_norm": 31.105730056762695,
"learning_rate": 1.6502761718246105e-05,
"loss": 3.9275,
"step": 24400
},
{
"epoch": 1.034322624224258,
"grad_norm": 36.47043991088867,
"learning_rate": 1.6479307635476646e-05,
"loss": 3.6911,
"step": 24500
},
{
"epoch": 1.034322624224258,
"eval_loss": 4.160192012786865,
"eval_runtime": 385.9759,
"eval_samples_per_second": 490.94,
"eval_steps_per_second": 15.343,
"step": 24500
},
{
"epoch": 1.0385443492210917,
"grad_norm": 36.14301681518555,
"learning_rate": 1.645585355270719e-05,
"loss": 3.5855,
"step": 24600
},
{
"epoch": 1.0427660742179254,
"grad_norm": 37.279823303222656,
"learning_rate": 1.643239946993773e-05,
"loss": 3.7875,
"step": 24700
},
{
"epoch": 1.0469877992147592,
"grad_norm": 18.79073715209961,
"learning_rate": 1.6408945387168275e-05,
"loss": 3.7999,
"step": 24800
},
{
"epoch": 1.0512095242115929,
"grad_norm": 38.917659759521484,
"learning_rate": 1.6385491304398816e-05,
"loss": 3.7718,
"step": 24900
},
{
"epoch": 1.0554312492084266,
"grad_norm": 23.614336013793945,
"learning_rate": 1.6362037221629357e-05,
"loss": 3.8362,
"step": 25000
},
{
"epoch": 1.0554312492084266,
"eval_loss": 4.038120746612549,
"eval_runtime": 386.8592,
"eval_samples_per_second": 489.819,
"eval_steps_per_second": 15.308,
"step": 25000
},
{
"epoch": 1.0596529742052603,
"grad_norm": 70.36714172363281,
"learning_rate": 1.6338583138859898e-05,
"loss": 3.8076,
"step": 25100
},
{
"epoch": 1.063874699202094,
"grad_norm": 20.698545455932617,
"learning_rate": 1.631512905609044e-05,
"loss": 3.8875,
"step": 25200
},
{
"epoch": 1.0680964241989277,
"grad_norm": 25.684085845947266,
"learning_rate": 1.629167497332098e-05,
"loss": 3.9675,
"step": 25300
},
{
"epoch": 1.0723181491957614,
"grad_norm": 24.55357551574707,
"learning_rate": 1.6268220890551524e-05,
"loss": 3.8451,
"step": 25400
},
{
"epoch": 1.0765398741925951,
"grad_norm": 41.31557846069336,
"learning_rate": 1.6244766807782065e-05,
"loss": 3.4346,
"step": 25500
},
{
"epoch": 1.0765398741925951,
"eval_loss": 4.199647903442383,
"eval_runtime": 384.2225,
"eval_samples_per_second": 493.18,
"eval_steps_per_second": 15.413,
"step": 25500
},
{
"epoch": 1.0807615991894288,
"grad_norm": 24.676942825317383,
"learning_rate": 1.622131272501261e-05,
"loss": 4.0584,
"step": 25600
},
{
"epoch": 1.0849833241862625,
"grad_norm": 53.181884765625,
"learning_rate": 1.619785864224315e-05,
"loss": 3.602,
"step": 25700
},
{
"epoch": 1.0892050491830962,
"grad_norm": 50.667518615722656,
"learning_rate": 1.617440455947369e-05,
"loss": 3.673,
"step": 25800
},
{
"epoch": 1.09342677417993,
"grad_norm": 145.48304748535156,
"learning_rate": 1.6150950476704235e-05,
"loss": 3.976,
"step": 25900
},
{
"epoch": 1.0976484991767637,
"grad_norm": 20.84932518005371,
"learning_rate": 1.6127496393934776e-05,
"loss": 3.8768,
"step": 26000
},
{
"epoch": 1.0976484991767637,
"eval_loss": 3.9983105659484863,
"eval_runtime": 381.5211,
"eval_samples_per_second": 496.672,
"eval_steps_per_second": 15.522,
"step": 26000
},
{
"epoch": 1.1018702241735974,
"grad_norm": 40.64849090576172,
"learning_rate": 1.6104042311165317e-05,
"loss": 3.7575,
"step": 26100
},
{
"epoch": 1.106091949170431,
"grad_norm": 35.289878845214844,
"learning_rate": 1.6080588228395858e-05,
"loss": 3.8101,
"step": 26200
},
{
"epoch": 1.1103136741672648,
"grad_norm": 48.63871383666992,
"learning_rate": 1.60571341456264e-05,
"loss": 4.104,
"step": 26300
},
{
"epoch": 1.1145353991640985,
"grad_norm": 107.08880615234375,
"learning_rate": 1.6033680062856943e-05,
"loss": 3.7139,
"step": 26400
},
{
"epoch": 1.1187571241609322,
"grad_norm": 24.861068725585938,
"learning_rate": 1.6010225980087484e-05,
"loss": 4.0391,
"step": 26500
},
{
"epoch": 1.1187571241609322,
"eval_loss": 4.00176477432251,
"eval_runtime": 390.396,
"eval_samples_per_second": 485.381,
"eval_steps_per_second": 15.169,
"step": 26500
},
{
"epoch": 1.122978849157766,
"grad_norm": 19.068403244018555,
"learning_rate": 1.598677189731803e-05,
"loss": 3.8449,
"step": 26600
},
{
"epoch": 1.1272005741545996,
"grad_norm": 37.11012649536133,
"learning_rate": 1.596331781454857e-05,
"loss": 3.7146,
"step": 26700
},
{
"epoch": 1.1314222991514333,
"grad_norm": 37.64760971069336,
"learning_rate": 1.593986373177911e-05,
"loss": 4.0576,
"step": 26800
},
{
"epoch": 1.135644024148267,
"grad_norm": 32.245750427246094,
"learning_rate": 1.591640964900965e-05,
"loss": 3.8831,
"step": 26900
},
{
"epoch": 1.1398657491451007,
"grad_norm": 27.54793930053711,
"learning_rate": 1.5892955566240192e-05,
"loss": 3.8161,
"step": 27000
},
{
"epoch": 1.1398657491451007,
"eval_loss": 4.0019025802612305,
"eval_runtime": 388.1527,
"eval_samples_per_second": 488.187,
"eval_steps_per_second": 15.257,
"step": 27000
},
{
"epoch": 1.1440874741419345,
"grad_norm": 73.00877380371094,
"learning_rate": 1.5869501483470737e-05,
"loss": 3.9283,
"step": 27100
},
{
"epoch": 1.1483091991387682,
"grad_norm": 180.4134063720703,
"learning_rate": 1.5846281941528975e-05,
"loss": 3.8637,
"step": 27200
},
{
"epoch": 1.1525309241356019,
"grad_norm": 28.230194091796875,
"learning_rate": 1.5822827858759516e-05,
"loss": 3.701,
"step": 27300
},
{
"epoch": 1.1567526491324356,
"grad_norm": 29.84754753112793,
"learning_rate": 1.5799373775990057e-05,
"loss": 3.9364,
"step": 27400
},
{
"epoch": 1.1609743741292693,
"grad_norm": 27.61314582824707,
"learning_rate": 1.5775919693220598e-05,
"loss": 3.7305,
"step": 27500
},
{
"epoch": 1.1609743741292693,
"eval_loss": 3.995920419692993,
"eval_runtime": 389.5266,
"eval_samples_per_second": 486.465,
"eval_steps_per_second": 15.203,
"step": 27500
},
{
"epoch": 1.165196099126103,
"grad_norm": 50.62400817871094,
"learning_rate": 1.575246561045114e-05,
"loss": 3.8542,
"step": 27600
},
{
"epoch": 1.1694178241229367,
"grad_norm": 50.76478576660156,
"learning_rate": 1.5729011527681683e-05,
"loss": 3.7249,
"step": 27700
},
{
"epoch": 1.1736395491197704,
"grad_norm": 30.597715377807617,
"learning_rate": 1.5705557444912224e-05,
"loss": 3.7223,
"step": 27800
},
{
"epoch": 1.1778612741166041,
"grad_norm": 121.53684997558594,
"learning_rate": 1.5682103362142765e-05,
"loss": 3.9777,
"step": 27900
},
{
"epoch": 1.1820829991134378,
"grad_norm": 161.595947265625,
"learning_rate": 1.565864927937331e-05,
"loss": 3.8036,
"step": 28000
},
{
"epoch": 1.1820829991134378,
"eval_loss": 4.054747104644775,
"eval_runtime": 387.9897,
"eval_samples_per_second": 488.392,
"eval_steps_per_second": 15.263,
"step": 28000
},
{
"epoch": 1.1863047241102715,
"grad_norm": 35.772239685058594,
"learning_rate": 1.563519519660385e-05,
"loss": 3.8635,
"step": 28100
},
{
"epoch": 1.1905264491071053,
"grad_norm": 20.932790756225586,
"learning_rate": 1.561174111383439e-05,
"loss": 3.8523,
"step": 28200
},
{
"epoch": 1.194748174103939,
"grad_norm": 20.581287384033203,
"learning_rate": 1.5588287031064936e-05,
"loss": 3.6757,
"step": 28300
},
{
"epoch": 1.1989698991007725,
"grad_norm": 62.859352111816406,
"learning_rate": 1.5564832948295477e-05,
"loss": 3.7519,
"step": 28400
},
{
"epoch": 1.2031916240976064,
"grad_norm": 29.448869705200195,
"learning_rate": 1.5541378865526017e-05,
"loss": 3.983,
"step": 28500
},
{
"epoch": 1.2031916240976064,
"eval_loss": 4.038938045501709,
"eval_runtime": 387.2247,
"eval_samples_per_second": 489.357,
"eval_steps_per_second": 15.293,
"step": 28500
},
{
"epoch": 1.2074133490944399,
"grad_norm": 28.24205780029297,
"learning_rate": 1.551792478275656e-05,
"loss": 3.8288,
"step": 28600
},
{
"epoch": 1.2116350740912738,
"grad_norm": 36.54047393798828,
"learning_rate": 1.54944706999871e-05,
"loss": 3.8074,
"step": 28700
},
{
"epoch": 1.2158567990881073,
"grad_norm": 64.28271484375,
"learning_rate": 1.5471016617217644e-05,
"loss": 3.714,
"step": 28800
},
{
"epoch": 1.2200785240849412,
"grad_norm": 43.12767028808594,
"learning_rate": 1.5447562534448185e-05,
"loss": 3.6594,
"step": 28900
},
{
"epoch": 1.2243002490817747,
"grad_norm": 206.6219940185547,
"learning_rate": 1.542410845167873e-05,
"loss": 3.9452,
"step": 29000
},
{
"epoch": 1.2243002490817747,
"eval_loss": 4.0273637771606445,
"eval_runtime": 384.7693,
"eval_samples_per_second": 492.48,
"eval_steps_per_second": 15.391,
"step": 29000
},
{
"epoch": 1.2285219740786086,
"grad_norm": 82.68762969970703,
"learning_rate": 1.540065436890927e-05,
"loss": 3.9906,
"step": 29100
},
{
"epoch": 1.2327436990754421,
"grad_norm": 93.16060638427734,
"learning_rate": 1.537720028613981e-05,
"loss": 3.9826,
"step": 29200
},
{
"epoch": 1.236965424072276,
"grad_norm": 28.004316329956055,
"learning_rate": 1.5353980744198046e-05,
"loss": 3.8635,
"step": 29300
},
{
"epoch": 1.2411871490691095,
"grad_norm": 26.93125343322754,
"learning_rate": 1.533052666142859e-05,
"loss": 3.9888,
"step": 29400
},
{
"epoch": 1.2454088740659432,
"grad_norm": 55.691139221191406,
"learning_rate": 1.530707257865913e-05,
"loss": 3.7248,
"step": 29500
},
{
"epoch": 1.2454088740659432,
"eval_loss": 4.028724193572998,
"eval_runtime": 387.9576,
"eval_samples_per_second": 488.432,
"eval_steps_per_second": 15.265,
"step": 29500
},
{
"epoch": 1.249630599062777,
"grad_norm": 52.49585723876953,
"learning_rate": 1.5283618495889675e-05,
"loss": 3.7484,
"step": 29600
},
{
"epoch": 1.253852324059611,
"grad_norm": 27.375572204589844,
"learning_rate": 1.5260164413120216e-05,
"loss": 3.9694,
"step": 29700
},
{
"epoch": 1.2580740490564444,
"grad_norm": 27.40184211730957,
"learning_rate": 1.5236710330350757e-05,
"loss": 4.059,
"step": 29800
},
{
"epoch": 1.2622957740532783,
"grad_norm": 38.194427490234375,
"learning_rate": 1.5213256247581298e-05,
"loss": 3.9358,
"step": 29900
},
{
"epoch": 1.2665174990501118,
"grad_norm": 30.75528907775879,
"learning_rate": 1.518980216481184e-05,
"loss": 3.8575,
"step": 30000
},
{
"epoch": 1.2665174990501118,
"eval_loss": 3.9483540058135986,
"eval_runtime": 386.0272,
"eval_samples_per_second": 490.875,
"eval_steps_per_second": 15.341,
"step": 30000
},
{
"epoch": 1.2707392240469455,
"grad_norm": 41.0136604309082,
"learning_rate": 1.5166348082042384e-05,
"loss": 3.8382,
"step": 30100
},
{
"epoch": 1.2749609490437792,
"grad_norm": 27.01508903503418,
"learning_rate": 1.5142893999272924e-05,
"loss": 3.73,
"step": 30200
},
{
"epoch": 1.279182674040613,
"grad_norm": 19.524795532226562,
"learning_rate": 1.5119674457331161e-05,
"loss": 4.0439,
"step": 30300
},
{
"epoch": 1.2834043990374466,
"grad_norm": 22.745532989501953,
"learning_rate": 1.5096220374561702e-05,
"loss": 3.8426,
"step": 30400
},
{
"epoch": 1.2876261240342803,
"grad_norm": 36.96416091918945,
"learning_rate": 1.5072766291792245e-05,
"loss": 3.7062,
"step": 30500
},
{
"epoch": 1.2876261240342803,
"eval_loss": 4.0188140869140625,
"eval_runtime": 385.3455,
"eval_samples_per_second": 491.743,
"eval_steps_per_second": 15.368,
"step": 30500
},
{
"epoch": 1.291847849031114,
"grad_norm": 29.24265480041504,
"learning_rate": 1.5049312209022786e-05,
"loss": 3.8926,
"step": 30600
},
{
"epoch": 1.2960695740279478,
"grad_norm": 38.205142974853516,
"learning_rate": 1.502585812625333e-05,
"loss": 4.0276,
"step": 30700
},
{
"epoch": 1.3002912990247815,
"grad_norm": 52.72880172729492,
"learning_rate": 1.5002404043483871e-05,
"loss": 3.6359,
"step": 30800
},
{
"epoch": 1.3045130240216152,
"grad_norm": 26.97931480407715,
"learning_rate": 1.4978949960714414e-05,
"loss": 4.0006,
"step": 30900
},
{
"epoch": 1.3087347490184489,
"grad_norm": 19.663036346435547,
"learning_rate": 1.4955495877944955e-05,
"loss": 3.8485,
"step": 31000
},
{
"epoch": 1.3087347490184489,
"eval_loss": 4.001898288726807,
"eval_runtime": 389.5661,
"eval_samples_per_second": 486.416,
"eval_steps_per_second": 15.202,
"step": 31000
},
{
"epoch": 1.3129564740152826,
"grad_norm": 55.0498161315918,
"learning_rate": 1.4932041795175496e-05,
"loss": 3.7892,
"step": 31100
},
{
"epoch": 1.3171781990121163,
"grad_norm": 28.248655319213867,
"learning_rate": 1.4908587712406038e-05,
"loss": 3.5783,
"step": 31200
},
{
"epoch": 1.32139992400895,
"grad_norm": 46.059234619140625,
"learning_rate": 1.488513362963658e-05,
"loss": 4.0018,
"step": 31300
},
{
"epoch": 1.3256216490057837,
"grad_norm": 36.40607833862305,
"learning_rate": 1.4861679546867123e-05,
"loss": 3.9542,
"step": 31400
},
{
"epoch": 1.3298433740026174,
"grad_norm": 44.54596710205078,
"learning_rate": 1.4838225464097664e-05,
"loss": 3.7739,
"step": 31500
},
{
"epoch": 1.3298433740026174,
"eval_loss": 3.987475633621216,
"eval_runtime": 389.5011,
"eval_samples_per_second": 486.497,
"eval_steps_per_second": 15.204,
"step": 31500
},
{
"epoch": 1.3340650989994511,
"grad_norm": 58.875572204589844,
"learning_rate": 1.4814771381328205e-05,
"loss": 3.8806,
"step": 31600
},
{
"epoch": 1.3382868239962848,
"grad_norm": 32.03676986694336,
"learning_rate": 1.4791317298558748e-05,
"loss": 4.176,
"step": 31700
},
{
"epoch": 1.3425085489931186,
"grad_norm": 30.134937286376953,
"learning_rate": 1.4767863215789289e-05,
"loss": 3.826,
"step": 31800
},
{
"epoch": 1.3467302739899523,
"grad_norm": 27.317060470581055,
"learning_rate": 1.4744409133019833e-05,
"loss": 3.8514,
"step": 31900
},
{
"epoch": 1.350951998986786,
"grad_norm": 28.351472854614258,
"learning_rate": 1.4720955050250374e-05,
"loss": 3.8261,
"step": 32000
},
{
"epoch": 1.350951998986786,
"eval_loss": 3.9716382026672363,
"eval_runtime": 389.2558,
"eval_samples_per_second": 486.803,
"eval_steps_per_second": 15.214,
"step": 32000
},
{
"epoch": 1.3551737239836197,
"grad_norm": 113.39342498779297,
"learning_rate": 1.4697500967480915e-05,
"loss": 3.8825,
"step": 32100
},
{
"epoch": 1.3593954489804534,
"grad_norm": 36.84119415283203,
"learning_rate": 1.4674046884711458e-05,
"loss": 3.6388,
"step": 32200
},
{
"epoch": 1.363617173977287,
"grad_norm": 26.57939910888672,
"learning_rate": 1.4650592801941998e-05,
"loss": 3.7851,
"step": 32300
},
{
"epoch": 1.3678388989741208,
"grad_norm": 73.95629119873047,
"learning_rate": 1.4627138719172543e-05,
"loss": 3.5687,
"step": 32400
},
{
"epoch": 1.3720606239709545,
"grad_norm": 36.70028305053711,
"learning_rate": 1.4603684636403084e-05,
"loss": 3.5408,
"step": 32500
},
{
"epoch": 1.3720606239709545,
"eval_loss": 3.9371213912963867,
"eval_runtime": 388.4609,
"eval_samples_per_second": 487.799,
"eval_steps_per_second": 15.245,
"step": 32500
},
{
"epoch": 1.3762823489677882,
"grad_norm": 30.751720428466797,
"learning_rate": 1.4580230553633625e-05,
"loss": 3.6995,
"step": 32600
},
{
"epoch": 1.380504073964622,
"grad_norm": 36.78146743774414,
"learning_rate": 1.4556776470864167e-05,
"loss": 3.882,
"step": 32700
},
{
"epoch": 1.3847257989614556,
"grad_norm": 25.60002899169922,
"learning_rate": 1.4533322388094708e-05,
"loss": 3.8703,
"step": 32800
},
{
"epoch": 1.3889475239582894,
"grad_norm": 67.25765991210938,
"learning_rate": 1.4509868305325249e-05,
"loss": 3.806,
"step": 32900
},
{
"epoch": 1.393169248955123,
"grad_norm": 28.73509979248047,
"learning_rate": 1.4486414222555793e-05,
"loss": 3.7826,
"step": 33000
},
{
"epoch": 1.393169248955123,
"eval_loss": 3.8901188373565674,
"eval_runtime": 387.4878,
"eval_samples_per_second": 489.024,
"eval_steps_per_second": 15.283,
"step": 33000
},
{
"epoch": 1.3973909739519568,
"grad_norm": 23.818187713623047,
"learning_rate": 1.4462960139786334e-05,
"loss": 3.7853,
"step": 33100
},
{
"epoch": 1.4016126989487905,
"grad_norm": 55.28591537475586,
"learning_rate": 1.4439506057016877e-05,
"loss": 3.5745,
"step": 33200
},
{
"epoch": 1.4058344239456242,
"grad_norm": 32.88700485229492,
"learning_rate": 1.4416051974247418e-05,
"loss": 3.5884,
"step": 33300
},
{
"epoch": 1.410056148942458,
"grad_norm": 60.0154914855957,
"learning_rate": 1.4392597891477959e-05,
"loss": 3.8678,
"step": 33400
},
{
"epoch": 1.4142778739392916,
"grad_norm": 51.1928825378418,
"learning_rate": 1.4369143808708501e-05,
"loss": 4.0917,
"step": 33500
},
{
"epoch": 1.4142778739392916,
"eval_loss": 3.9331603050231934,
"eval_runtime": 387.4535,
"eval_samples_per_second": 489.068,
"eval_steps_per_second": 15.284,
"step": 33500
},
{
"epoch": 1.4184995989361253,
"grad_norm": 37.41315841674805,
"learning_rate": 1.4345689725939044e-05,
"loss": 3.7125,
"step": 33600
},
{
"epoch": 1.422721323932959,
"grad_norm": 22.55731201171875,
"learning_rate": 1.4322470183997281e-05,
"loss": 3.7298,
"step": 33700
},
{
"epoch": 1.4269430489297927,
"grad_norm": 37.30392837524414,
"learning_rate": 1.4299016101227824e-05,
"loss": 3.9447,
"step": 33800
},
{
"epoch": 1.4311647739266264,
"grad_norm": 36.30218505859375,
"learning_rate": 1.4275562018458364e-05,
"loss": 3.7176,
"step": 33900
},
{
"epoch": 1.4353864989234602,
"grad_norm": 28.75307846069336,
"learning_rate": 1.4252107935688905e-05,
"loss": 3.6765,
"step": 34000
},
{
"epoch": 1.4353864989234602,
"eval_loss": 4.030208110809326,
"eval_runtime": 387.4948,
"eval_samples_per_second": 489.016,
"eval_steps_per_second": 15.283,
"step": 34000
},
{
"epoch": 1.4396082239202939,
"grad_norm": 80.53971099853516,
"learning_rate": 1.4228653852919448e-05,
"loss": 3.9847,
"step": 34100
},
{
"epoch": 1.4438299489171276,
"grad_norm": 62.22883605957031,
"learning_rate": 1.420519977014999e-05,
"loss": 3.7364,
"step": 34200
},
{
"epoch": 1.4480516739139613,
"grad_norm": 20.756181716918945,
"learning_rate": 1.4181745687380533e-05,
"loss": 3.8246,
"step": 34300
},
{
"epoch": 1.452273398910795,
"grad_norm": 39.283870697021484,
"learning_rate": 1.4158291604611074e-05,
"loss": 3.575,
"step": 34400
},
{
"epoch": 1.4564951239076287,
"grad_norm": 21.12331771850586,
"learning_rate": 1.4134837521841615e-05,
"loss": 3.814,
"step": 34500
},
{
"epoch": 1.4564951239076287,
"eval_loss": 3.9518651962280273,
"eval_runtime": 385.0216,
"eval_samples_per_second": 492.157,
"eval_steps_per_second": 15.381,
"step": 34500
},
{
"epoch": 1.4607168489044624,
"grad_norm": 101.86262512207031,
"learning_rate": 1.4111383439072158e-05,
"loss": 3.8708,
"step": 34600
},
{
"epoch": 1.4649385739012961,
"grad_norm": 22.11623191833496,
"learning_rate": 1.4087929356302699e-05,
"loss": 3.7277,
"step": 34700
},
{
"epoch": 1.4691602988981298,
"grad_norm": 21.435291290283203,
"learning_rate": 1.4064475273533243e-05,
"loss": 3.7758,
"step": 34800
},
{
"epoch": 1.4733820238949635,
"grad_norm": 42.297630310058594,
"learning_rate": 1.4041021190763784e-05,
"loss": 3.6727,
"step": 34900
},
{
"epoch": 1.4776037488917972,
"grad_norm": 133.29806518554688,
"learning_rate": 1.4017567107994325e-05,
"loss": 3.773,
"step": 35000
},
{
"epoch": 1.4776037488917972,
"eval_loss": 3.952765464782715,
"eval_runtime": 386.7776,
"eval_samples_per_second": 489.922,
"eval_steps_per_second": 15.311,
"step": 35000
},
{
"epoch": 1.481825473888631,
"grad_norm": 22.62911605834961,
"learning_rate": 1.3994113025224867e-05,
"loss": 4.0004,
"step": 35100
},
{
"epoch": 1.4860471988854647,
"grad_norm": 37.50686264038086,
"learning_rate": 1.3970658942455408e-05,
"loss": 3.8468,
"step": 35200
},
{
"epoch": 1.4902689238822984,
"grad_norm": 38.95034408569336,
"learning_rate": 1.394720485968595e-05,
"loss": 3.6814,
"step": 35300
},
{
"epoch": 1.494490648879132,
"grad_norm": 48.52857208251953,
"learning_rate": 1.3923750776916494e-05,
"loss": 3.8993,
"step": 35400
},
{
"epoch": 1.4987123738759658,
"grad_norm": 41.82535934448242,
"learning_rate": 1.3900296694147035e-05,
"loss": 3.8841,
"step": 35500
},
{
"epoch": 1.4987123738759658,
"eval_loss": 3.9401652812957764,
"eval_runtime": 386.7397,
"eval_samples_per_second": 489.97,
"eval_steps_per_second": 15.313,
"step": 35500
},
{
"epoch": 1.5029340988727995,
"grad_norm": 81.59280395507812,
"learning_rate": 1.3876842611377577e-05,
"loss": 3.8272,
"step": 35600
},
{
"epoch": 1.507155823869633,
"grad_norm": 100.15462493896484,
"learning_rate": 1.3853388528608118e-05,
"loss": 3.584,
"step": 35700
},
{
"epoch": 1.511377548866467,
"grad_norm": 27.60382843017578,
"learning_rate": 1.3829934445838659e-05,
"loss": 3.8424,
"step": 35800
},
{
"epoch": 1.5155992738633004,
"grad_norm": 40.822444915771484,
"learning_rate": 1.3806480363069203e-05,
"loss": 3.7274,
"step": 35900
},
{
"epoch": 1.5198209988601343,
"grad_norm": 31.995193481445312,
"learning_rate": 1.3783026280299744e-05,
"loss": 3.9671,
"step": 36000
},
{
"epoch": 1.5198209988601343,
"eval_loss": 3.9035112857818604,
"eval_runtime": 389.4034,
"eval_samples_per_second": 486.619,
"eval_steps_per_second": 15.208,
"step": 36000
},
{
"epoch": 1.5240427238569678,
"grad_norm": 31.883012771606445,
"learning_rate": 1.3759572197530287e-05,
"loss": 3.7078,
"step": 36100
},
{
"epoch": 1.5282644488538017,
"grad_norm": 33.22710418701172,
"learning_rate": 1.3736118114760828e-05,
"loss": 3.7524,
"step": 36200
},
{
"epoch": 1.5324861738506352,
"grad_norm": 22.334789276123047,
"learning_rate": 1.3712664031991369e-05,
"loss": 3.6992,
"step": 36300
},
{
"epoch": 1.5367078988474692,
"grad_norm": 35.32610321044922,
"learning_rate": 1.3689209949221911e-05,
"loss": 3.8152,
"step": 36400
},
{
"epoch": 1.5409296238443027,
"grad_norm": 46.904090881347656,
"learning_rate": 1.3665755866452454e-05,
"loss": 3.9007,
"step": 36500
},
{
"epoch": 1.5409296238443027,
"eval_loss": 3.9784727096557617,
"eval_runtime": 387.4936,
"eval_samples_per_second": 489.017,
"eval_steps_per_second": 15.283,
"step": 36500
},
{
"epoch": 1.5451513488411366,
"grad_norm": 105.48990631103516,
"learning_rate": 1.3642301783682997e-05,
"loss": 3.6302,
"step": 36600
},
{
"epoch": 1.54937307383797,
"grad_norm": 35.433895111083984,
"learning_rate": 1.3618847700913538e-05,
"loss": 3.6208,
"step": 36700
},
{
"epoch": 1.553594798834804,
"grad_norm": 43.27334976196289,
"learning_rate": 1.359539361814408e-05,
"loss": 3.6039,
"step": 36800
},
{
"epoch": 1.5578165238316375,
"grad_norm": 44.104942321777344,
"learning_rate": 1.3571939535374621e-05,
"loss": 3.7039,
"step": 36900
},
{
"epoch": 1.5620382488284714,
"grad_norm": 59.13047790527344,
"learning_rate": 1.3548485452605162e-05,
"loss": 3.7069,
"step": 37000
},
{
"epoch": 1.5620382488284714,
"eval_loss": 3.9214632511138916,
"eval_runtime": 388.3204,
"eval_samples_per_second": 487.976,
"eval_steps_per_second": 15.25,
"step": 37000
},
{
"epoch": 1.566259973825305,
"grad_norm": 43.375633239746094,
"learning_rate": 1.3525031369835706e-05,
"loss": 3.7246,
"step": 37100
},
{
"epoch": 1.5704816988221388,
"grad_norm": 266.7533874511719,
"learning_rate": 1.3501577287066247e-05,
"loss": 3.7269,
"step": 37200
},
{
"epoch": 1.5747034238189723,
"grad_norm": 58.76247787475586,
"learning_rate": 1.347812320429679e-05,
"loss": 3.6822,
"step": 37300
},
{
"epoch": 1.5789251488158063,
"grad_norm": 65.02640533447266,
"learning_rate": 1.345466912152733e-05,
"loss": 3.7083,
"step": 37400
},
{
"epoch": 1.5831468738126397,
"grad_norm": 57.553504943847656,
"learning_rate": 1.3431215038757872e-05,
"loss": 3.7095,
"step": 37500
},
{
"epoch": 1.5831468738126397,
"eval_loss": 3.966798782348633,
"eval_runtime": 389.3808,
"eval_samples_per_second": 486.647,
"eval_steps_per_second": 15.209,
"step": 37500
},
{
"epoch": 1.5873685988094737,
"grad_norm": 25.69357681274414,
"learning_rate": 1.3407760955988416e-05,
"loss": 3.4556,
"step": 37600
},
{
"epoch": 1.5915903238063072,
"grad_norm": 30.168212890625,
"learning_rate": 1.3384306873218957e-05,
"loss": 4.0595,
"step": 37700
},
{
"epoch": 1.595812048803141,
"grad_norm": 22.801198959350586,
"learning_rate": 1.3361087331277194e-05,
"loss": 3.6583,
"step": 37800
},
{
"epoch": 1.6000337737999746,
"grad_norm": 33.38633346557617,
"learning_rate": 1.3337633248507735e-05,
"loss": 3.5662,
"step": 37900
},
{
"epoch": 1.6042554987968085,
"grad_norm": 85.28038024902344,
"learning_rate": 1.3314179165738277e-05,
"loss": 3.6365,
"step": 38000
},
{
"epoch": 1.6042554987968085,
"eval_loss": 3.9034698009490967,
"eval_runtime": 383.621,
"eval_samples_per_second": 493.954,
"eval_steps_per_second": 15.437,
"step": 38000
},
{
"epoch": 1.608477223793642,
"grad_norm": 180.01327514648438,
"learning_rate": 1.3290725082968818e-05,
"loss": 3.6313,
"step": 38100
},
{
"epoch": 1.612698948790476,
"grad_norm": 32.02112579345703,
"learning_rate": 1.326727100019936e-05,
"loss": 3.8767,
"step": 38200
},
{
"epoch": 1.6169206737873094,
"grad_norm": 75.3433837890625,
"learning_rate": 1.3243816917429904e-05,
"loss": 3.9992,
"step": 38300
},
{
"epoch": 1.6211423987841433,
"grad_norm": 37.75492477416992,
"learning_rate": 1.3220362834660444e-05,
"loss": 3.554,
"step": 38400
},
{
"epoch": 1.6253641237809768,
"grad_norm": 230.31956481933594,
"learning_rate": 1.3196908751890987e-05,
"loss": 3.6862,
"step": 38500
},
{
"epoch": 1.6253641237809768,
"eval_loss": 3.890026330947876,
"eval_runtime": 388.3824,
"eval_samples_per_second": 487.898,
"eval_steps_per_second": 15.248,
"step": 38500
},
{
"epoch": 1.6295858487778108,
"grad_norm": 41.58514404296875,
"learning_rate": 1.3173689209949222e-05,
"loss": 3.7638,
"step": 38600
},
{
"epoch": 1.6338075737746443,
"grad_norm": 39.31863784790039,
"learning_rate": 1.3150235127179765e-05,
"loss": 3.6716,
"step": 38700
},
{
"epoch": 1.6380292987714782,
"grad_norm": 32.94029998779297,
"learning_rate": 1.3126781044410306e-05,
"loss": 3.8667,
"step": 38800
},
{
"epoch": 1.6422510237683117,
"grad_norm": 40.90266418457031,
"learning_rate": 1.3103326961640848e-05,
"loss": 3.5304,
"step": 38900
},
{
"epoch": 1.6464727487651456,
"grad_norm": 31.46969985961914,
"learning_rate": 1.3079872878871391e-05,
"loss": 3.955,
"step": 39000
},
{
"epoch": 1.6464727487651456,
"eval_loss": 3.889432668685913,
"eval_runtime": 389.0277,
"eval_samples_per_second": 487.089,
"eval_steps_per_second": 15.223,
"step": 39000
},
{
"epoch": 1.650694473761979,
"grad_norm": 22.939836502075195,
"learning_rate": 1.3056418796101934e-05,
"loss": 3.4049,
"step": 39100
},
{
"epoch": 1.654916198758813,
"grad_norm": 77.34811401367188,
"learning_rate": 1.3032964713332475e-05,
"loss": 3.663,
"step": 39200
},
{
"epoch": 1.6591379237556465,
"grad_norm": 145.7189178466797,
"learning_rate": 1.3009510630563016e-05,
"loss": 4.0267,
"step": 39300
},
{
"epoch": 1.6633596487524802,
"grad_norm": 20.231035232543945,
"learning_rate": 1.2986056547793558e-05,
"loss": 3.8868,
"step": 39400
},
{
"epoch": 1.667581373749314,
"grad_norm": 43.548316955566406,
"learning_rate": 1.29626024650241e-05,
"loss": 3.8984,
"step": 39500
},
{
"epoch": 1.667581373749314,
"eval_loss": 3.9277007579803467,
"eval_runtime": 389.9149,
"eval_samples_per_second": 485.98,
"eval_steps_per_second": 15.188,
"step": 39500
},
{
"epoch": 1.6718030987461476,
"grad_norm": 75.36473846435547,
"learning_rate": 1.2939148382254643e-05,
"loss": 3.575,
"step": 39600
},
{
"epoch": 1.6760248237429813,
"grad_norm": 82.61229705810547,
"learning_rate": 1.2915694299485184e-05,
"loss": 3.6966,
"step": 39700
},
{
"epoch": 1.680246548739815,
"grad_norm": 23.42165184020996,
"learning_rate": 1.2892240216715725e-05,
"loss": 4.0533,
"step": 39800
},
{
"epoch": 1.6844682737366488,
"grad_norm": 39.809532165527344,
"learning_rate": 1.2868786133946268e-05,
"loss": 3.6106,
"step": 39900
},
{
"epoch": 1.6886899987334825,
"grad_norm": 30.228683471679688,
"learning_rate": 1.2845332051176809e-05,
"loss": 3.6468,
"step": 40000
},
{
"epoch": 1.6886899987334825,
"eval_loss": 3.9425179958343506,
"eval_runtime": 389.7162,
"eval_samples_per_second": 486.228,
"eval_steps_per_second": 15.196,
"step": 40000
},
{
"epoch": 1.6929117237303162,
"grad_norm": 29.66598892211914,
"learning_rate": 1.2821877968407353e-05,
"loss": 3.7145,
"step": 40100
},
{
"epoch": 1.6971334487271499,
"grad_norm": 32.41665267944336,
"learning_rate": 1.2798423885637894e-05,
"loss": 3.6602,
"step": 40200
},
{
"epoch": 1.7013551737239836,
"grad_norm": 54.770538330078125,
"learning_rate": 1.2774969802868435e-05,
"loss": 3.5531,
"step": 40300
},
{
"epoch": 1.7055768987208173,
"grad_norm": 21.214088439941406,
"learning_rate": 1.2751515720098978e-05,
"loss": 3.7857,
"step": 40400
},
{
"epoch": 1.709798623717651,
"grad_norm": 18.517242431640625,
"learning_rate": 1.2728061637329518e-05,
"loss": 3.5586,
"step": 40500
},
{
"epoch": 1.709798623717651,
"eval_loss": 3.8483657836914062,
"eval_runtime": 386.5428,
"eval_samples_per_second": 490.22,
"eval_steps_per_second": 15.32,
"step": 40500
},
{
"epoch": 1.7140203487144847,
"grad_norm": 101.35408782958984,
"learning_rate": 1.270460755456006e-05,
"loss": 3.7711,
"step": 40600
},
{
"epoch": 1.7182420737113184,
"grad_norm": 32.46183395385742,
"learning_rate": 1.2681153471790604e-05,
"loss": 3.7135,
"step": 40700
},
{
"epoch": 1.7224637987081521,
"grad_norm": 38.50857162475586,
"learning_rate": 1.2657699389021145e-05,
"loss": 3.8785,
"step": 40800
},
{
"epoch": 1.7266855237049858,
"grad_norm": 21.530080795288086,
"learning_rate": 1.2634245306251687e-05,
"loss": 3.5577,
"step": 40900
},
{
"epoch": 1.7309072487018196,
"grad_norm": 33.64255142211914,
"learning_rate": 1.2610791223482228e-05,
"loss": 3.5783,
"step": 41000
},
{
"epoch": 1.7309072487018196,
"eval_loss": 3.9012844562530518,
"eval_runtime": 387.6772,
"eval_samples_per_second": 488.786,
"eval_steps_per_second": 15.276,
"step": 41000
},
{
"epoch": 1.7351289736986533,
"grad_norm": 175.7689971923828,
"learning_rate": 1.2587337140712769e-05,
"loss": 3.7346,
"step": 41100
},
{
"epoch": 1.739350698695487,
"grad_norm": 27.353748321533203,
"learning_rate": 1.2563883057943313e-05,
"loss": 3.5098,
"step": 41200
},
{
"epoch": 1.7435724236923207,
"grad_norm": 78.1789321899414,
"learning_rate": 1.2540428975173854e-05,
"loss": 4.0181,
"step": 41300
},
{
"epoch": 1.7477941486891544,
"grad_norm": 56.047279357910156,
"learning_rate": 1.2516974892404397e-05,
"loss": 3.8404,
"step": 41400
},
{
"epoch": 1.752015873685988,
"grad_norm": 24.14201545715332,
"learning_rate": 1.2493520809634938e-05,
"loss": 3.6327,
"step": 41500
},
{
"epoch": 1.752015873685988,
"eval_loss": 3.868438243865967,
"eval_runtime": 388.5976,
"eval_samples_per_second": 487.628,
"eval_steps_per_second": 15.239,
"step": 41500
},
{
"epoch": 1.7562375986828218,
"grad_norm": 21.085111618041992,
"learning_rate": 1.2470066726865479e-05,
"loss": 3.7503,
"step": 41600
},
{
"epoch": 1.7604593236796555,
"grad_norm": 23.111595153808594,
"learning_rate": 1.2446612644096021e-05,
"loss": 3.45,
"step": 41700
},
{
"epoch": 1.7646810486764892,
"grad_norm": 90.75397491455078,
"learning_rate": 1.2423158561326564e-05,
"loss": 3.9138,
"step": 41800
},
{
"epoch": 1.768902773673323,
"grad_norm": 55.01781463623047,
"learning_rate": 1.2399704478557107e-05,
"loss": 3.6061,
"step": 41900
},
{
"epoch": 1.7731244986701566,
"grad_norm": 19.09023094177246,
"learning_rate": 1.2376250395787648e-05,
"loss": 3.6603,
"step": 42000
},
{
"epoch": 1.7731244986701566,
"eval_loss": 3.795603036880493,
"eval_runtime": 387.4527,
"eval_samples_per_second": 489.069,
"eval_steps_per_second": 15.284,
"step": 42000
},
{
"epoch": 1.7773462236669904,
"grad_norm": 36.818756103515625,
"learning_rate": 1.2352796313018189e-05,
"loss": 3.6722,
"step": 42100
},
{
"epoch": 1.781567948663824,
"grad_norm": 35.12385559082031,
"learning_rate": 1.2329342230248731e-05,
"loss": 3.678,
"step": 42200
},
{
"epoch": 1.7857896736606578,
"grad_norm": 47.89170837402344,
"learning_rate": 1.2305888147479272e-05,
"loss": 3.5802,
"step": 42300
},
{
"epoch": 1.7900113986574915,
"grad_norm": 53.81658172607422,
"learning_rate": 1.2282434064709816e-05,
"loss": 3.8253,
"step": 42400
},
{
"epoch": 1.7942331236543252,
"grad_norm": 31.743684768676758,
"learning_rate": 1.2258979981940357e-05,
"loss": 3.7815,
"step": 42500
},
{
"epoch": 1.7942331236543252,
"eval_loss": 3.8192477226257324,
"eval_runtime": 388.2855,
"eval_samples_per_second": 488.02,
"eval_steps_per_second": 15.252,
"step": 42500
},
{
"epoch": 1.798454848651159,
"grad_norm": 29.357864379882812,
"learning_rate": 1.2235760439998594e-05,
"loss": 3.7021,
"step": 42600
},
{
"epoch": 1.8026765736479926,
"grad_norm": 25.66359519958496,
"learning_rate": 1.2212306357229135e-05,
"loss": 3.4263,
"step": 42700
},
{
"epoch": 1.8068982986448263,
"grad_norm": 52.50632095336914,
"learning_rate": 1.2188852274459678e-05,
"loss": 3.8781,
"step": 42800
},
{
"epoch": 1.81112002364166,
"grad_norm": 26.848169326782227,
"learning_rate": 1.2165398191690219e-05,
"loss": 3.5784,
"step": 42900
},
{
"epoch": 1.8153417486384937,
"grad_norm": 51.33030700683594,
"learning_rate": 1.2141944108920763e-05,
"loss": 3.9405,
"step": 43000
},
{
"epoch": 1.8153417486384937,
"eval_loss": 3.810027837753296,
"eval_runtime": 388.9312,
"eval_samples_per_second": 487.21,
"eval_steps_per_second": 15.226,
"step": 43000
},
{
"epoch": 1.8195634736353274,
"grad_norm": 53.251548767089844,
"learning_rate": 1.2118490026151304e-05,
"loss": 3.5516,
"step": 43100
},
{
"epoch": 1.8237851986321612,
"grad_norm": 50.611228942871094,
"learning_rate": 1.2095035943381845e-05,
"loss": 3.8322,
"step": 43200
},
{
"epoch": 1.8280069236289949,
"grad_norm": 61.101463317871094,
"learning_rate": 1.2071581860612387e-05,
"loss": 3.7948,
"step": 43300
},
{
"epoch": 1.8322286486258283,
"grad_norm": 25.454072952270508,
"learning_rate": 1.2048127777842928e-05,
"loss": 3.6175,
"step": 43400
},
{
"epoch": 1.8364503736226623,
"grad_norm": 36.41463851928711,
"learning_rate": 1.202467369507347e-05,
"loss": 3.5256,
"step": 43500
},
{
"epoch": 1.8364503736226623,
"eval_loss": 3.855207920074463,
"eval_runtime": 389.0478,
"eval_samples_per_second": 487.064,
"eval_steps_per_second": 15.222,
"step": 43500
},
{
"epoch": 1.8406720986194958,
"grad_norm": 85.75359344482422,
"learning_rate": 1.2001219612304014e-05,
"loss": 3.8199,
"step": 43600
},
{
"epoch": 1.8448938236163297,
"grad_norm": 90.45889282226562,
"learning_rate": 1.1977765529534555e-05,
"loss": 3.6168,
"step": 43700
},
{
"epoch": 1.8491155486131632,
"grad_norm": 40.536827087402344,
"learning_rate": 1.1954311446765097e-05,
"loss": 3.5648,
"step": 43800
},
{
"epoch": 1.8533372736099971,
"grad_norm": 34.12901306152344,
"learning_rate": 1.1930857363995638e-05,
"loss": 3.5584,
"step": 43900
},
{
"epoch": 1.8575589986068306,
"grad_norm": 33.35273742675781,
"learning_rate": 1.1907403281226179e-05,
"loss": 3.7623,
"step": 44000
},
{
"epoch": 1.8575589986068306,
"eval_loss": 3.8202288150787354,
"eval_runtime": 389.4093,
"eval_samples_per_second": 486.611,
"eval_steps_per_second": 15.208,
"step": 44000
},
{
"epoch": 1.8617807236036645,
"grad_norm": 37.19559860229492,
"learning_rate": 1.1883949198456722e-05,
"loss": 3.7884,
"step": 44100
},
{
"epoch": 1.866002448600498,
"grad_norm": 63.01133346557617,
"learning_rate": 1.1860495115687264e-05,
"loss": 3.6241,
"step": 44200
},
{
"epoch": 1.870224173597332,
"grad_norm": 58.19880294799805,
"learning_rate": 1.1837041032917807e-05,
"loss": 3.4533,
"step": 44300
},
{
"epoch": 1.8744458985941654,
"grad_norm": 46.244720458984375,
"learning_rate": 1.1813586950148348e-05,
"loss": 3.575,
"step": 44400
},
{
"epoch": 1.8786676235909994,
"grad_norm": 34.9444465637207,
"learning_rate": 1.1790132867378889e-05,
"loss": 3.6981,
"step": 44500
},
{
"epoch": 1.8786676235909994,
"eval_loss": 3.9079577922821045,
"eval_runtime": 388.7641,
"eval_samples_per_second": 487.419,
"eval_steps_per_second": 15.233,
"step": 44500
},
{
"epoch": 1.8828893485878329,
"grad_norm": 43.4091682434082,
"learning_rate": 1.1766913325437126e-05,
"loss": 3.6384,
"step": 44600
},
{
"epoch": 1.8871110735846668,
"grad_norm": 32.79497528076172,
"learning_rate": 1.1743459242667668e-05,
"loss": 3.8267,
"step": 44700
},
{
"epoch": 1.8913327985815003,
"grad_norm": 59.50571823120117,
"learning_rate": 1.1720005159898211e-05,
"loss": 3.5696,
"step": 44800
},
{
"epoch": 1.8955545235783342,
"grad_norm": 57.57821273803711,
"learning_rate": 1.1696551077128754e-05,
"loss": 3.5189,
"step": 44900
},
{
"epoch": 1.8997762485751677,
"grad_norm": 51.14540481567383,
"learning_rate": 1.1673096994359294e-05,
"loss": 3.7528,
"step": 45000
},
{
"epoch": 1.8997762485751677,
"eval_loss": 3.875887155532837,
"eval_runtime": 387.2131,
"eval_samples_per_second": 489.371,
"eval_steps_per_second": 15.294,
"step": 45000
},
{
"epoch": 1.9039979735720016,
"grad_norm": 23.407438278198242,
"learning_rate": 1.1649642911589835e-05,
"loss": 3.7572,
"step": 45100
},
{
"epoch": 1.908219698568835,
"grad_norm": 19.387798309326172,
"learning_rate": 1.1626188828820378e-05,
"loss": 3.7283,
"step": 45200
},
{
"epoch": 1.912441423565669,
"grad_norm": 44.595035552978516,
"learning_rate": 1.1602734746050919e-05,
"loss": 3.6185,
"step": 45300
},
{
"epoch": 1.9166631485625025,
"grad_norm": 40.92133331298828,
"learning_rate": 1.1579280663281463e-05,
"loss": 3.5348,
"step": 45400
},
{
"epoch": 1.9208848735593365,
"grad_norm": 272.9943542480469,
"learning_rate": 1.1555826580512004e-05,
"loss": 3.5366,
"step": 45500
},
{
"epoch": 1.9208848735593365,
"eval_loss": 3.971277952194214,
"eval_runtime": 389.7703,
"eval_samples_per_second": 486.161,
"eval_steps_per_second": 15.194,
"step": 45500
},
{
"epoch": 1.92510659855617,
"grad_norm": 30.447141647338867,
"learning_rate": 1.1532372497742545e-05,
"loss": 3.8358,
"step": 45600
},
{
"epoch": 1.9293283235530039,
"grad_norm": 40.50982666015625,
"learning_rate": 1.1508918414973088e-05,
"loss": 3.7831,
"step": 45700
},
{
"epoch": 1.9335500485498374,
"grad_norm": 91.95133209228516,
"learning_rate": 1.1485464332203629e-05,
"loss": 3.7524,
"step": 45800
},
{
"epoch": 1.9377717735466713,
"grad_norm": 89.98445892333984,
"learning_rate": 1.1462010249434173e-05,
"loss": 3.4533,
"step": 45900
},
{
"epoch": 1.9419934985435048,
"grad_norm": 33.398841857910156,
"learning_rate": 1.1438556166664714e-05,
"loss": 3.4622,
"step": 46000
},
{
"epoch": 1.9419934985435048,
"eval_loss": 3.890702962875366,
"eval_runtime": 388.4518,
"eval_samples_per_second": 487.811,
"eval_steps_per_second": 15.245,
"step": 46000
},
{
"epoch": 1.9462152235403387,
"grad_norm": 52.59076690673828,
"learning_rate": 1.1415102083895255e-05,
"loss": 3.7096,
"step": 46100
},
{
"epoch": 1.9504369485371722,
"grad_norm": 77.39093780517578,
"learning_rate": 1.1391648001125797e-05,
"loss": 3.5447,
"step": 46200
},
{
"epoch": 1.9546586735340061,
"grad_norm": 57.93257522583008,
"learning_rate": 1.1368193918356338e-05,
"loss": 3.601,
"step": 46300
},
{
"epoch": 1.9588803985308396,
"grad_norm": 63.35990524291992,
"learning_rate": 1.134473983558688e-05,
"loss": 3.6369,
"step": 46400
},
{
"epoch": 1.9631021235276735,
"grad_norm": 74.04061889648438,
"learning_rate": 1.1321285752817424e-05,
"loss": 3.8619,
"step": 46500
},
{
"epoch": 1.9631021235276735,
"eval_loss": 3.8416244983673096,
"eval_runtime": 388.455,
"eval_samples_per_second": 487.807,
"eval_steps_per_second": 15.245,
"step": 46500
},
{
"epoch": 1.967323848524507,
"grad_norm": 32.15678405761719,
"learning_rate": 1.1297831670047964e-05,
"loss": 3.6623,
"step": 46600
},
{
"epoch": 1.971545573521341,
"grad_norm": 134.01588439941406,
"learning_rate": 1.1274612128106201e-05,
"loss": 3.8225,
"step": 46700
},
{
"epoch": 1.9757672985181745,
"grad_norm": 38.84286880493164,
"learning_rate": 1.1251158045336742e-05,
"loss": 3.9126,
"step": 46800
},
{
"epoch": 1.9799890235150084,
"grad_norm": 27.815914154052734,
"learning_rate": 1.1227703962567285e-05,
"loss": 3.8429,
"step": 46900
},
{
"epoch": 1.9842107485118419,
"grad_norm": 66.18096160888672,
"learning_rate": 1.1204249879797826e-05,
"loss": 3.6607,
"step": 47000
},
{
"epoch": 1.9842107485118419,
"eval_loss": 3.8103389739990234,
"eval_runtime": 388.5622,
"eval_samples_per_second": 487.672,
"eval_steps_per_second": 15.241,
"step": 47000
},
{
"epoch": 1.9884324735086758,
"grad_norm": 61.14945602416992,
"learning_rate": 1.1181030337856063e-05,
"loss": 3.5708,
"step": 47100
},
{
"epoch": 1.9926541985055093,
"grad_norm": 35.80445861816406,
"learning_rate": 1.1157576255086604e-05,
"loss": 3.6346,
"step": 47200
},
{
"epoch": 1.9968759235023432,
"grad_norm": 47.520687103271484,
"learning_rate": 1.1134122172317148e-05,
"loss": 3.4577,
"step": 47300
},
{
"epoch": 2.0010976484991767,
"grad_norm": 72.3710708618164,
"learning_rate": 1.1110668089547689e-05,
"loss": 3.4326,
"step": 47400
},
{
"epoch": 2.0053193734960106,
"grad_norm": 62.152923583984375,
"learning_rate": 1.1087214006778232e-05,
"loss": 3.5431,
"step": 47500
},
{
"epoch": 2.0053193734960106,
"eval_loss": 3.831125497817993,
"eval_runtime": 383.5804,
"eval_samples_per_second": 494.006,
"eval_steps_per_second": 15.439,
"step": 47500
},
{
"epoch": 2.009541098492844,
"grad_norm": 37.57729721069336,
"learning_rate": 1.1063759924008772e-05,
"loss": 3.4852,
"step": 47600
},
{
"epoch": 2.013762823489678,
"grad_norm": 19.60400390625,
"learning_rate": 1.1040305841239313e-05,
"loss": 3.4037,
"step": 47700
},
{
"epoch": 2.0179845484865115,
"grad_norm": 43.43424987792969,
"learning_rate": 1.1016851758469858e-05,
"loss": 3.5685,
"step": 47800
},
{
"epoch": 2.0222062734833455,
"grad_norm": 48.39424514770508,
"learning_rate": 1.0993397675700399e-05,
"loss": 3.2866,
"step": 47900
},
{
"epoch": 2.026427998480179,
"grad_norm": 45.38421630859375,
"learning_rate": 1.0969943592930941e-05,
"loss": 3.3943,
"step": 48000
},
{
"epoch": 2.026427998480179,
"eval_loss": 3.944741725921631,
"eval_runtime": 389.6599,
"eval_samples_per_second": 486.298,
"eval_steps_per_second": 15.198,
"step": 48000
},
{
"epoch": 2.030649723477013,
"grad_norm": 34.202125549316406,
"learning_rate": 1.0946489510161482e-05,
"loss": 3.3675,
"step": 48100
},
{
"epoch": 2.0348714484738464,
"grad_norm": 33.32945251464844,
"learning_rate": 1.0923035427392023e-05,
"loss": 3.7605,
"step": 48200
},
{
"epoch": 2.0390931734706803,
"grad_norm": 22.147314071655273,
"learning_rate": 1.0899581344622566e-05,
"loss": 4.0646,
"step": 48300
},
{
"epoch": 2.043314898467514,
"grad_norm": 50.3154411315918,
"learning_rate": 1.0876127261853108e-05,
"loss": 3.4634,
"step": 48400
},
{
"epoch": 2.0475366234643477,
"grad_norm": 27.445606231689453,
"learning_rate": 1.0852673179083651e-05,
"loss": 3.5041,
"step": 48500
},
{
"epoch": 2.0475366234643477,
"eval_loss": 3.8681693077087402,
"eval_runtime": 388.4463,
"eval_samples_per_second": 487.818,
"eval_steps_per_second": 15.245,
"step": 48500
},
{
"epoch": 2.051758348461181,
"grad_norm": 58.96662521362305,
"learning_rate": 1.0829219096314192e-05,
"loss": 3.6862,
"step": 48600
},
{
"epoch": 2.055980073458015,
"grad_norm": 18.482248306274414,
"learning_rate": 1.0805765013544733e-05,
"loss": 3.5486,
"step": 48700
},
{
"epoch": 2.0602017984548486,
"grad_norm": 33.20754623413086,
"learning_rate": 1.0782310930775275e-05,
"loss": 3.6148,
"step": 48800
},
{
"epoch": 2.0644235234516826,
"grad_norm": 67.11410522460938,
"learning_rate": 1.0758856848005816e-05,
"loss": 3.3776,
"step": 48900
},
{
"epoch": 2.068645248448516,
"grad_norm": 52.77125930786133,
"learning_rate": 1.073540276523636e-05,
"loss": 3.4514,
"step": 49000
},
{
"epoch": 2.068645248448516,
"eval_loss": 3.8840792179107666,
"eval_runtime": 387.6216,
"eval_samples_per_second": 488.856,
"eval_steps_per_second": 15.278,
"step": 49000
},
{
"epoch": 2.07286697344535,
"grad_norm": 64.37210845947266,
"learning_rate": 1.0711948682466902e-05,
"loss": 3.4575,
"step": 49100
},
{
"epoch": 2.0770886984421835,
"grad_norm": 84.62837219238281,
"learning_rate": 1.0688494599697443e-05,
"loss": 3.4984,
"step": 49200
},
{
"epoch": 2.0813104234390174,
"grad_norm": 43.95246124267578,
"learning_rate": 1.0665040516927985e-05,
"loss": 3.3978,
"step": 49300
},
{
"epoch": 2.085532148435851,
"grad_norm": 58.663265228271484,
"learning_rate": 1.0641586434158526e-05,
"loss": 3.644,
"step": 49400
},
{
"epoch": 2.0897538734326844,
"grad_norm": 78.60660552978516,
"learning_rate": 1.061813235138907e-05,
"loss": 3.6412,
"step": 49500
},
{
"epoch": 2.0897538734326844,
"eval_loss": 3.8865935802459717,
"eval_runtime": 388.5454,
"eval_samples_per_second": 487.693,
"eval_steps_per_second": 15.241,
"step": 49500
},
{
"epoch": 2.0939755984295183,
"grad_norm": 71.96966552734375,
"learning_rate": 1.0594678268619611e-05,
"loss": 3.322,
"step": 49600
},
{
"epoch": 2.0981973234263522,
"grad_norm": 26.565078735351562,
"learning_rate": 1.0571224185850152e-05,
"loss": 3.7186,
"step": 49700
},
{
"epoch": 2.1024190484231857,
"grad_norm": 47.89260482788086,
"learning_rate": 1.0547770103080695e-05,
"loss": 3.3604,
"step": 49800
},
{
"epoch": 2.106640773420019,
"grad_norm": 26.684965133666992,
"learning_rate": 1.0524316020311236e-05,
"loss": 3.7262,
"step": 49900
},
{
"epoch": 2.110862498416853,
"grad_norm": 34.246253967285156,
"learning_rate": 1.0500861937541777e-05,
"loss": 3.541,
"step": 50000
},
{
"epoch": 2.110862498416853,
"eval_loss": 3.857327938079834,
"eval_runtime": 387.0381,
"eval_samples_per_second": 489.593,
"eval_steps_per_second": 15.301,
"step": 50000
},
{
"epoch": 2.1150842234136866,
"grad_norm": 79.23475646972656,
"learning_rate": 1.0477642395600014e-05,
"loss": 3.4695,
"step": 50100
},
{
"epoch": 2.1193059484105206,
"grad_norm": 28.924774169921875,
"learning_rate": 1.0454188312830558e-05,
"loss": 3.3366,
"step": 50200
},
{
"epoch": 2.123527673407354,
"grad_norm": 154.0482635498047,
"learning_rate": 1.0430734230061099e-05,
"loss": 3.6556,
"step": 50300
},
{
"epoch": 2.127749398404188,
"grad_norm": 21.952199935913086,
"learning_rate": 1.0407280147291641e-05,
"loss": 3.4016,
"step": 50400
},
{
"epoch": 2.1319711234010215,
"grad_norm": 34.361366271972656,
"learning_rate": 1.0383826064522182e-05,
"loss": 3.6733,
"step": 50500
},
{
"epoch": 2.1319711234010215,
"eval_loss": 3.9452035427093506,
"eval_runtime": 388.1734,
"eval_samples_per_second": 488.161,
"eval_steps_per_second": 15.256,
"step": 50500
},
{
"epoch": 2.1361928483978554,
"grad_norm": 25.264711380004883,
"learning_rate": 1.0360371981752723e-05,
"loss": 3.7553,
"step": 50600
},
{
"epoch": 2.140414573394689,
"grad_norm": 62.976287841796875,
"learning_rate": 1.0336917898983266e-05,
"loss": 3.6238,
"step": 50700
},
{
"epoch": 2.144636298391523,
"grad_norm": 29.24872398376465,
"learning_rate": 1.0313463816213809e-05,
"loss": 3.4566,
"step": 50800
},
{
"epoch": 2.1488580233883563,
"grad_norm": 40.195960998535156,
"learning_rate": 1.0290244274272045e-05,
"loss": 3.7676,
"step": 50900
},
{
"epoch": 2.1530797483851902,
"grad_norm": 53.48100280761719,
"learning_rate": 1.0266790191502586e-05,
"loss": 3.4482,
"step": 51000
},
{
"epoch": 2.1530797483851902,
"eval_loss": 3.895794153213501,
"eval_runtime": 389.8627,
"eval_samples_per_second": 486.046,
"eval_steps_per_second": 15.19,
"step": 51000
},
{
"epoch": 2.1573014733820237,
"grad_norm": 60.159095764160156,
"learning_rate": 1.0243336108733129e-05,
"loss": 3.409,
"step": 51100
},
{
"epoch": 2.1615231983788576,
"grad_norm": 67.60205078125,
"learning_rate": 1.021988202596367e-05,
"loss": 3.9074,
"step": 51200
},
{
"epoch": 2.165744923375691,
"grad_norm": 74.41675567626953,
"learning_rate": 1.019642794319421e-05,
"loss": 3.613,
"step": 51300
},
{
"epoch": 2.169966648372525,
"grad_norm": 143.5885009765625,
"learning_rate": 1.0172973860424755e-05,
"loss": 3.4835,
"step": 51400
},
{
"epoch": 2.1741883733693586,
"grad_norm": 71.70709991455078,
"learning_rate": 1.0149519777655296e-05,
"loss": 3.5494,
"step": 51500
},
{
"epoch": 2.1741883733693586,
"eval_loss": 3.788289785385132,
"eval_runtime": 387.1462,
"eval_samples_per_second": 489.456,
"eval_steps_per_second": 15.297,
"step": 51500
},
{
"epoch": 2.1784100983661925,
"grad_norm": 30.032886505126953,
"learning_rate": 1.0126065694885839e-05,
"loss": 3.5205,
"step": 51600
},
{
"epoch": 2.182631823363026,
"grad_norm": 53.41360092163086,
"learning_rate": 1.010261161211638e-05,
"loss": 3.4232,
"step": 51700
},
{
"epoch": 2.18685354835986,
"grad_norm": 69.20714569091797,
"learning_rate": 1.007915752934692e-05,
"loss": 3.6539,
"step": 51800
},
{
"epoch": 2.1910752733566934,
"grad_norm": 113.37935638427734,
"learning_rate": 1.0055703446577463e-05,
"loss": 3.6586,
"step": 51900
},
{
"epoch": 2.1952969983535273,
"grad_norm": 72.74465942382812,
"learning_rate": 1.0032249363808006e-05,
"loss": 3.3321,
"step": 52000
},
{
"epoch": 2.1952969983535273,
"eval_loss": 3.893357992172241,
"eval_runtime": 388.1116,
"eval_samples_per_second": 488.238,
"eval_steps_per_second": 15.258,
"step": 52000
},
{
"epoch": 2.199518723350361,
"grad_norm": 71.91123962402344,
"learning_rate": 1.0008795281038548e-05,
"loss": 3.7712,
"step": 52100
},
{
"epoch": 2.2037404483471947,
"grad_norm": 35.67560577392578,
"learning_rate": 9.98534119826909e-06,
"loss": 3.4973,
"step": 52200
},
{
"epoch": 2.2079621733440282,
"grad_norm": 122.19819641113281,
"learning_rate": 9.96188711549963e-06,
"loss": 3.5404,
"step": 52300
},
{
"epoch": 2.212183898340862,
"grad_norm": 42.20289993286133,
"learning_rate": 9.938433032730173e-06,
"loss": 3.4815,
"step": 52400
},
{
"epoch": 2.2164056233376956,
"grad_norm": 36.77101516723633,
"learning_rate": 9.914978949960715e-06,
"loss": 3.5586,
"step": 52500
},
{
"epoch": 2.2164056233376956,
"eval_loss": 3.8740973472595215,
"eval_runtime": 384.4478,
"eval_samples_per_second": 492.891,
"eval_steps_per_second": 15.404,
"step": 52500
},
{
"epoch": 2.2206273483345296,
"grad_norm": 30.4133243560791,
"learning_rate": 9.891524867191256e-06,
"loss": 3.5463,
"step": 52600
},
{
"epoch": 2.224849073331363,
"grad_norm": 47.02263259887695,
"learning_rate": 9.868070784421799e-06,
"loss": 3.537,
"step": 52700
},
{
"epoch": 2.229070798328197,
"grad_norm": 26.324251174926758,
"learning_rate": 9.844616701652342e-06,
"loss": 3.5299,
"step": 52800
},
{
"epoch": 2.2332925233250305,
"grad_norm": 203.6620635986328,
"learning_rate": 9.821162618882883e-06,
"loss": 3.5967,
"step": 52900
},
{
"epoch": 2.2375142483218644,
"grad_norm": 29.926071166992188,
"learning_rate": 9.797708536113425e-06,
"loss": 3.5156,
"step": 53000
},
{
"epoch": 2.2375142483218644,
"eval_loss": 3.8976686000823975,
"eval_runtime": 389.0602,
"eval_samples_per_second": 487.048,
"eval_steps_per_second": 15.221,
"step": 53000
},
{
"epoch": 2.241735973318698,
"grad_norm": 35.688720703125,
"learning_rate": 9.774254453343966e-06,
"loss": 3.6079,
"step": 53100
},
{
"epoch": 2.245957698315532,
"grad_norm": 47.552772521972656,
"learning_rate": 9.750800370574509e-06,
"loss": 3.4492,
"step": 53200
},
{
"epoch": 2.2501794233123653,
"grad_norm": 217.60997009277344,
"learning_rate": 9.727346287805051e-06,
"loss": 3.4611,
"step": 53300
},
{
"epoch": 2.2544011483091992,
"grad_norm": 38.56391906738281,
"learning_rate": 9.703892205035592e-06,
"loss": 3.1611,
"step": 53400
},
{
"epoch": 2.2586228733060327,
"grad_norm": 36.49523162841797,
"learning_rate": 9.680438122266135e-06,
"loss": 3.6554,
"step": 53500
},
{
"epoch": 2.2586228733060327,
"eval_loss": 3.844381809234619,
"eval_runtime": 385.0727,
"eval_samples_per_second": 492.092,
"eval_steps_per_second": 15.379,
"step": 53500
},
{
"epoch": 2.2628445983028667,
"grad_norm": 111.7251968383789,
"learning_rate": 9.656984039496676e-06,
"loss": 3.473,
"step": 53600
},
{
"epoch": 2.2670663232997,
"grad_norm": 26.387981414794922,
"learning_rate": 9.633529956727218e-06,
"loss": 3.8362,
"step": 53700
},
{
"epoch": 2.271288048296534,
"grad_norm": 51.86467742919922,
"learning_rate": 9.610310414785454e-06,
"loss": 3.8243,
"step": 53800
},
{
"epoch": 2.2755097732933676,
"grad_norm": 48.3861198425293,
"learning_rate": 9.586856332015996e-06,
"loss": 3.2957,
"step": 53900
},
{
"epoch": 2.2797314982902015,
"grad_norm": 96.6748046875,
"learning_rate": 9.563402249246539e-06,
"loss": 3.4227,
"step": 54000
},
{
"epoch": 2.2797314982902015,
"eval_loss": 3.865082025527954,
"eval_runtime": 388.8342,
"eval_samples_per_second": 487.331,
"eval_steps_per_second": 15.23,
"step": 54000
},
{
"epoch": 2.283953223287035,
"grad_norm": 145.63818359375,
"learning_rate": 9.53994816647708e-06,
"loss": 3.6771,
"step": 54100
},
{
"epoch": 2.288174948283869,
"grad_norm": 27.48724365234375,
"learning_rate": 9.516494083707622e-06,
"loss": 3.5287,
"step": 54200
},
{
"epoch": 2.2923966732807024,
"grad_norm": 33.88056564331055,
"learning_rate": 9.493040000938163e-06,
"loss": 3.6393,
"step": 54300
},
{
"epoch": 2.2966183982775363,
"grad_norm": 46.24734878540039,
"learning_rate": 9.469585918168706e-06,
"loss": 3.6447,
"step": 54400
},
{
"epoch": 2.30084012327437,
"grad_norm": 65.59355926513672,
"learning_rate": 9.446131835399249e-06,
"loss": 3.1714,
"step": 54500
},
{
"epoch": 2.30084012327437,
"eval_loss": 3.8695130348205566,
"eval_runtime": 386.6075,
"eval_samples_per_second": 490.138,
"eval_steps_per_second": 15.318,
"step": 54500
},
{
"epoch": 2.3050618482712038,
"grad_norm": 152.84823608398438,
"learning_rate": 9.42267775262979e-06,
"loss": 3.5703,
"step": 54600
},
{
"epoch": 2.3092835732680372,
"grad_norm": 49.877586364746094,
"learning_rate": 9.39922366986033e-06,
"loss": 3.6058,
"step": 54700
},
{
"epoch": 2.313505298264871,
"grad_norm": 35.91408920288086,
"learning_rate": 9.375769587090873e-06,
"loss": 3.3485,
"step": 54800
},
{
"epoch": 2.3177270232617047,
"grad_norm": 42.26518630981445,
"learning_rate": 9.352315504321416e-06,
"loss": 3.4143,
"step": 54900
},
{
"epoch": 2.3219487482585386,
"grad_norm": 75.06291961669922,
"learning_rate": 9.328861421551958e-06,
"loss": 3.461,
"step": 55000
},
{
"epoch": 2.3219487482585386,
"eval_loss": 3.828523874282837,
"eval_runtime": 385.5046,
"eval_samples_per_second": 491.54,
"eval_steps_per_second": 15.362,
"step": 55000
},
{
"epoch": 2.326170473255372,
"grad_norm": 119.58143615722656,
"learning_rate": 9.3054073387825e-06,
"loss": 3.7676,
"step": 55100
},
{
"epoch": 2.330392198252206,
"grad_norm": 151.1800079345703,
"learning_rate": 9.28195325601304e-06,
"loss": 3.7192,
"step": 55200
},
{
"epoch": 2.3346139232490395,
"grad_norm": 32.434814453125,
"learning_rate": 9.258499173243583e-06,
"loss": 3.3955,
"step": 55300
},
{
"epoch": 2.3388356482458734,
"grad_norm": 17.332210540771484,
"learning_rate": 9.235045090474125e-06,
"loss": 3.5533,
"step": 55400
},
{
"epoch": 2.343057373242707,
"grad_norm": 46.541019439697266,
"learning_rate": 9.211591007704666e-06,
"loss": 3.7335,
"step": 55500
},
{
"epoch": 2.343057373242707,
"eval_loss": 3.7877399921417236,
"eval_runtime": 386.5024,
"eval_samples_per_second": 490.271,
"eval_steps_per_second": 15.322,
"step": 55500
},
{
"epoch": 2.347279098239541,
"grad_norm": 48.02127456665039,
"learning_rate": 9.188136924935209e-06,
"loss": 3.3743,
"step": 55600
},
{
"epoch": 2.3515008232363743,
"grad_norm": 84.77007293701172,
"learning_rate": 9.164682842165752e-06,
"loss": 3.5598,
"step": 55700
},
{
"epoch": 2.3557225482332083,
"grad_norm": 52.11927795410156,
"learning_rate": 9.141228759396292e-06,
"loss": 3.5939,
"step": 55800
},
{
"epoch": 2.3599442732300417,
"grad_norm": 56.104732513427734,
"learning_rate": 9.117774676626835e-06,
"loss": 3.6577,
"step": 55900
},
{
"epoch": 2.3641659982268757,
"grad_norm": 29.97869300842285,
"learning_rate": 9.094320593857376e-06,
"loss": 3.407,
"step": 56000
},
{
"epoch": 2.3641659982268757,
"eval_loss": 3.789689540863037,
"eval_runtime": 382.469,
"eval_samples_per_second": 495.441,
"eval_steps_per_second": 15.484,
"step": 56000
},
{
"epoch": 2.368387723223709,
"grad_norm": 37.954345703125,
"learning_rate": 9.070866511087919e-06,
"loss": 3.5838,
"step": 56100
},
{
"epoch": 2.372609448220543,
"grad_norm": 47.05314636230469,
"learning_rate": 9.047412428318461e-06,
"loss": 3.5182,
"step": 56200
},
{
"epoch": 2.3768311732173766,
"grad_norm": 72.84585571289062,
"learning_rate": 9.023958345549002e-06,
"loss": 3.7881,
"step": 56300
},
{
"epoch": 2.3810528982142105,
"grad_norm": 164.88848876953125,
"learning_rate": 9.000504262779543e-06,
"loss": 3.772,
"step": 56400
},
{
"epoch": 2.385274623211044,
"grad_norm": 56.11479568481445,
"learning_rate": 8.977050180010086e-06,
"loss": 3.5779,
"step": 56500
},
{
"epoch": 2.385274623211044,
"eval_loss": 3.8028056621551514,
"eval_runtime": 387.2356,
"eval_samples_per_second": 489.343,
"eval_steps_per_second": 15.293,
"step": 56500
},
{
"epoch": 2.389496348207878,
"grad_norm": 31.131439208984375,
"learning_rate": 8.953596097240628e-06,
"loss": 3.4765,
"step": 56600
},
{
"epoch": 2.3937180732047114,
"grad_norm": 41.05427169799805,
"learning_rate": 8.930142014471171e-06,
"loss": 3.5122,
"step": 56700
},
{
"epoch": 2.397939798201545,
"grad_norm": 51.679901123046875,
"learning_rate": 8.906687931701712e-06,
"loss": 3.8646,
"step": 56800
},
{
"epoch": 2.402161523198379,
"grad_norm": 21.275638580322266,
"learning_rate": 8.883233848932253e-06,
"loss": 3.4861,
"step": 56900
},
{
"epoch": 2.4063832481952128,
"grad_norm": 42.77296829223633,
"learning_rate": 8.859779766162795e-06,
"loss": 3.4486,
"step": 57000
},
{
"epoch": 2.4063832481952128,
"eval_loss": 3.8668415546417236,
"eval_runtime": 386.271,
"eval_samples_per_second": 490.565,
"eval_steps_per_second": 15.331,
"step": 57000
},
{
"epoch": 2.4106049731920463,
"grad_norm": 60.467735290527344,
"learning_rate": 8.836325683393338e-06,
"loss": 3.4319,
"step": 57100
},
{
"epoch": 2.4148266981888797,
"grad_norm": 38.16925811767578,
"learning_rate": 8.812871600623879e-06,
"loss": 3.5801,
"step": 57200
},
{
"epoch": 2.4190484231857137,
"grad_norm": 35.175376892089844,
"learning_rate": 8.789417517854422e-06,
"loss": 3.4412,
"step": 57300
},
{
"epoch": 2.4232701481825476,
"grad_norm": 37.78458023071289,
"learning_rate": 8.765963435084963e-06,
"loss": 3.4917,
"step": 57400
},
{
"epoch": 2.427491873179381,
"grad_norm": 310.4344177246094,
"learning_rate": 8.742509352315505e-06,
"loss": 3.8994,
"step": 57500
},
{
"epoch": 2.427491873179381,
"eval_loss": 3.7693347930908203,
"eval_runtime": 385.0963,
"eval_samples_per_second": 492.061,
"eval_steps_per_second": 15.378,
"step": 57500
},
{
"epoch": 2.4317135981762146,
"grad_norm": 87.53755950927734,
"learning_rate": 8.719055269546048e-06,
"loss": 3.4321,
"step": 57600
},
{
"epoch": 2.4359353231730485,
"grad_norm": 30.359472274780273,
"learning_rate": 8.695601186776589e-06,
"loss": 3.4605,
"step": 57700
},
{
"epoch": 2.4401570481698824,
"grad_norm": 47.97589874267578,
"learning_rate": 8.67214710400713e-06,
"loss": 3.5348,
"step": 57800
},
{
"epoch": 2.444378773166716,
"grad_norm": 18.004453659057617,
"learning_rate": 8.648693021237672e-06,
"loss": 3.6156,
"step": 57900
},
{
"epoch": 2.4486004981635494,
"grad_norm": 74.85140991210938,
"learning_rate": 8.625238938468215e-06,
"loss": 3.6726,
"step": 58000
},
{
"epoch": 2.4486004981635494,
"eval_loss": 3.7412655353546143,
"eval_runtime": 389.1795,
"eval_samples_per_second": 486.899,
"eval_steps_per_second": 15.217,
"step": 58000
},
{
"epoch": 2.4528222231603833,
"grad_norm": 59.58810043334961,
"learning_rate": 8.601784855698756e-06,
"loss": 3.4447,
"step": 58100
},
{
"epoch": 2.4570439481572173,
"grad_norm": 55.2222785949707,
"learning_rate": 8.578330772929298e-06,
"loss": 3.5318,
"step": 58200
},
{
"epoch": 2.4612656731540508,
"grad_norm": 30.20258903503418,
"learning_rate": 8.55487669015984e-06,
"loss": 3.4284,
"step": 58300
},
{
"epoch": 2.4654873981508842,
"grad_norm": 75.82894134521484,
"learning_rate": 8.531422607390382e-06,
"loss": 3.3426,
"step": 58400
},
{
"epoch": 2.469709123147718,
"grad_norm": 81.5134506225586,
"learning_rate": 8.507968524620925e-06,
"loss": 3.5549,
"step": 58500
},
{
"epoch": 2.469709123147718,
"eval_loss": 3.8397915363311768,
"eval_runtime": 387.2912,
"eval_samples_per_second": 489.273,
"eval_steps_per_second": 15.291,
"step": 58500
},
{
"epoch": 2.473930848144552,
"grad_norm": 43.514183044433594,
"learning_rate": 8.484514441851465e-06,
"loss": 3.7305,
"step": 58600
},
{
"epoch": 2.4781525731413856,
"grad_norm": 44.70104217529297,
"learning_rate": 8.461060359082008e-06,
"loss": 3.4777,
"step": 58700
},
{
"epoch": 2.482374298138219,
"grad_norm": 41.51865768432617,
"learning_rate": 8.43760627631255e-06,
"loss": 3.625,
"step": 58800
},
{
"epoch": 2.486596023135053,
"grad_norm": 54.522674560546875,
"learning_rate": 8.414152193543092e-06,
"loss": 3.8084,
"step": 58900
},
{
"epoch": 2.4908177481318865,
"grad_norm": 118.72567749023438,
"learning_rate": 8.390698110773634e-06,
"loss": 3.6772,
"step": 59000
},
{
"epoch": 2.4908177481318865,
"eval_loss": 3.780479669570923,
"eval_runtime": 389.0844,
"eval_samples_per_second": 487.018,
"eval_steps_per_second": 15.22,
"step": 59000
},
{
"epoch": 2.4950394731287204,
"grad_norm": 34.044559478759766,
"learning_rate": 8.367478568831871e-06,
"loss": 3.4634,
"step": 59100
},
{
"epoch": 2.499261198125554,
"grad_norm": 93.83566284179688,
"learning_rate": 8.344024486062412e-06,
"loss": 3.5926,
"step": 59200
},
{
"epoch": 2.503482923122388,
"grad_norm": 53.263832092285156,
"learning_rate": 8.320570403292953e-06,
"loss": 3.303,
"step": 59300
},
{
"epoch": 2.507704648119222,
"grad_norm": 65.40522766113281,
"learning_rate": 8.297116320523496e-06,
"loss": 3.5749,
"step": 59400
},
{
"epoch": 2.5119263731160553,
"grad_norm": 65.15473937988281,
"learning_rate": 8.273662237754038e-06,
"loss": 3.7852,
"step": 59500
},
{
"epoch": 2.5119263731160553,
"eval_loss": 3.825472593307495,
"eval_runtime": 390.0313,
"eval_samples_per_second": 485.835,
"eval_steps_per_second": 15.183,
"step": 59500
},
{
"epoch": 2.5161480981128888,
"grad_norm": 29.49171257019043,
"learning_rate": 8.25020815498458e-06,
"loss": 3.6317,
"step": 59600
},
{
"epoch": 2.5203698231097227,
"grad_norm": 31.727855682373047,
"learning_rate": 8.226754072215122e-06,
"loss": 3.3228,
"step": 59700
},
{
"epoch": 2.5245915481065566,
"grad_norm": 30.480257034301758,
"learning_rate": 8.203299989445663e-06,
"loss": 3.4541,
"step": 59800
},
{
"epoch": 2.52881327310339,
"grad_norm": 29.858154296875,
"learning_rate": 8.179845906676205e-06,
"loss": 3.5879,
"step": 59900
},
{
"epoch": 2.5330349981002236,
"grad_norm": 38.494544982910156,
"learning_rate": 8.156391823906748e-06,
"loss": 3.6403,
"step": 60000
},
{
"epoch": 2.5330349981002236,
"eval_loss": 3.7743747234344482,
"eval_runtime": 377.1382,
"eval_samples_per_second": 502.445,
"eval_steps_per_second": 15.702,
"step": 60000
},
{
"epoch": 2.5372567230970575,
"grad_norm": 48.94626998901367,
"learning_rate": 8.132937741137289e-06,
"loss": 3.5289,
"step": 60100
},
{
"epoch": 2.541478448093891,
"grad_norm": 41.8023567199707,
"learning_rate": 8.109483658367832e-06,
"loss": 3.4,
"step": 60200
},
{
"epoch": 2.545700173090725,
"grad_norm": 64.16112518310547,
"learning_rate": 8.086029575598372e-06,
"loss": 3.4026,
"step": 60300
},
{
"epoch": 2.5499218980875584,
"grad_norm": 33.682769775390625,
"learning_rate": 8.062575492828915e-06,
"loss": 3.6348,
"step": 60400
},
{
"epoch": 2.5541436230843924,
"grad_norm": 36.693504333496094,
"learning_rate": 8.039121410059458e-06,
"loss": 3.3963,
"step": 60500
},
{
"epoch": 2.5541436230843924,
"eval_loss": 3.7308504581451416,
"eval_runtime": 377.3946,
"eval_samples_per_second": 502.103,
"eval_steps_per_second": 15.692,
"step": 60500
},
{
"epoch": 2.558365348081226,
"grad_norm": 25.65692138671875,
"learning_rate": 8.015667327289999e-06,
"loss": 3.2838,
"step": 60600
},
{
"epoch": 2.5625870730780598,
"grad_norm": 62.07941436767578,
"learning_rate": 7.99221324452054e-06,
"loss": 3.8537,
"step": 60700
},
{
"epoch": 2.5668087980748933,
"grad_norm": 46.93730163574219,
"learning_rate": 7.968759161751082e-06,
"loss": 3.3861,
"step": 60800
},
{
"epoch": 2.571030523071727,
"grad_norm": 55.51591491699219,
"learning_rate": 7.945305078981625e-06,
"loss": 3.5289,
"step": 60900
},
{
"epoch": 2.5752522480685607,
"grad_norm": 33.335594177246094,
"learning_rate": 7.921850996212166e-06,
"loss": 3.7611,
"step": 61000
},
{
"epoch": 2.5752522480685607,
"eval_loss": 3.7403736114501953,
"eval_runtime": 385.32,
"eval_samples_per_second": 491.776,
"eval_steps_per_second": 15.369,
"step": 61000
},
{
"epoch": 2.5794739730653946,
"grad_norm": 30.4799861907959,
"learning_rate": 7.898396913442708e-06,
"loss": 3.3036,
"step": 61100
},
{
"epoch": 2.583695698062228,
"grad_norm": 41.96533203125,
"learning_rate": 7.87494283067325e-06,
"loss": 3.4874,
"step": 61200
},
{
"epoch": 2.587917423059062,
"grad_norm": 38.74420928955078,
"learning_rate": 7.851488747903792e-06,
"loss": 3.3885,
"step": 61300
},
{
"epoch": 2.5921391480558955,
"grad_norm": 109.50426483154297,
"learning_rate": 7.828034665134334e-06,
"loss": 3.6008,
"step": 61400
},
{
"epoch": 2.5963608730527294,
"grad_norm": 45.021488189697266,
"learning_rate": 7.804580582364875e-06,
"loss": 3.7175,
"step": 61500
},
{
"epoch": 2.5963608730527294,
"eval_loss": 3.768545389175415,
"eval_runtime": 386.8942,
"eval_samples_per_second": 489.775,
"eval_steps_per_second": 15.307,
"step": 61500
},
{
"epoch": 2.600582598049563,
"grad_norm": 62.09092330932617,
"learning_rate": 7.781361040423112e-06,
"loss": 3.589,
"step": 61600
},
{
"epoch": 2.604804323046397,
"grad_norm": 62.47654724121094,
"learning_rate": 7.757906957653653e-06,
"loss": 3.6725,
"step": 61700
},
{
"epoch": 2.6090260480432304,
"grad_norm": 46.53767776489258,
"learning_rate": 7.734452874884196e-06,
"loss": 3.3397,
"step": 61800
},
{
"epoch": 2.6132477730400643,
"grad_norm": 46.46326446533203,
"learning_rate": 7.710998792114738e-06,
"loss": 3.562,
"step": 61900
},
{
"epoch": 2.6174694980368978,
"grad_norm": 48.67976379394531,
"learning_rate": 7.687544709345281e-06,
"loss": 3.5421,
"step": 62000
},
{
"epoch": 2.6174694980368978,
"eval_loss": 3.755615711212158,
"eval_runtime": 381.9504,
"eval_samples_per_second": 496.114,
"eval_steps_per_second": 15.505,
"step": 62000
},
{
"epoch": 2.6216912230337317,
"grad_norm": 29.504335403442383,
"learning_rate": 7.664090626575822e-06,
"loss": 3.462,
"step": 62100
},
{
"epoch": 2.625912948030565,
"grad_norm": 89.24791717529297,
"learning_rate": 7.640636543806363e-06,
"loss": 3.329,
"step": 62200
},
{
"epoch": 2.630134673027399,
"grad_norm": 97.40950012207031,
"learning_rate": 7.617182461036906e-06,
"loss": 3.4274,
"step": 62300
},
{
"epoch": 2.6343563980242326,
"grad_norm": 57.1513671875,
"learning_rate": 7.593728378267447e-06,
"loss": 3.4622,
"step": 62400
},
{
"epoch": 2.6385781230210665,
"grad_norm": 39.16256332397461,
"learning_rate": 7.570274295497989e-06,
"loss": 3.2924,
"step": 62500
},
{
"epoch": 2.6385781230210665,
"eval_loss": 3.8314058780670166,
"eval_runtime": 383.303,
"eval_samples_per_second": 494.363,
"eval_steps_per_second": 15.45,
"step": 62500
},
{
"epoch": 2.6427998480179,
"grad_norm": 72.36564636230469,
"learning_rate": 7.546820212728532e-06,
"loss": 3.5465,
"step": 62600
},
{
"epoch": 2.647021573014734,
"grad_norm": 82.64096069335938,
"learning_rate": 7.5233661299590735e-06,
"loss": 3.3618,
"step": 62700
},
{
"epoch": 2.6512432980115674,
"grad_norm": 41.66115188598633,
"learning_rate": 7.499912047189614e-06,
"loss": 3.2383,
"step": 62800
},
{
"epoch": 2.6554650230084014,
"grad_norm": 114.26473999023438,
"learning_rate": 7.476457964420157e-06,
"loss": 3.3542,
"step": 62900
},
{
"epoch": 2.659686748005235,
"grad_norm": 45.76057434082031,
"learning_rate": 7.453003881650699e-06,
"loss": 3.5209,
"step": 63000
},
{
"epoch": 2.659686748005235,
"eval_loss": 3.7941720485687256,
"eval_runtime": 378.5969,
"eval_samples_per_second": 500.509,
"eval_steps_per_second": 15.642,
"step": 63000
},
{
"epoch": 2.663908473002069,
"grad_norm": 44.19281768798828,
"learning_rate": 7.4295497988812406e-06,
"loss": 3.4416,
"step": 63100
},
{
"epoch": 2.6681301979989023,
"grad_norm": 29.7802677154541,
"learning_rate": 7.406095716111783e-06,
"loss": 3.5881,
"step": 63200
},
{
"epoch": 2.672351922995736,
"grad_norm": 23.825647354125977,
"learning_rate": 7.382641633342324e-06,
"loss": 3.7347,
"step": 63300
},
{
"epoch": 2.6765736479925697,
"grad_norm": 120.45719909667969,
"learning_rate": 7.359187550572866e-06,
"loss": 3.815,
"step": 63400
},
{
"epoch": 2.6807953729894036,
"grad_norm": 76.13922882080078,
"learning_rate": 7.3357334678034085e-06,
"loss": 3.4667,
"step": 63500
},
{
"epoch": 2.6807953729894036,
"eval_loss": 3.779139995574951,
"eval_runtime": 386.2584,
"eval_samples_per_second": 490.581,
"eval_steps_per_second": 15.332,
"step": 63500
},
{
"epoch": 2.685017097986237,
"grad_norm": 67.92314147949219,
"learning_rate": 7.31227938503395e-06,
"loss": 3.342,
"step": 63600
},
{
"epoch": 2.689238822983071,
"grad_norm": 49.04058837890625,
"learning_rate": 7.288825302264493e-06,
"loss": 3.6695,
"step": 63700
},
{
"epoch": 2.6934605479799045,
"grad_norm": 90.01438903808594,
"learning_rate": 7.265371219495035e-06,
"loss": 3.4863,
"step": 63800
},
{
"epoch": 2.6976822729767385,
"grad_norm": 103.12335968017578,
"learning_rate": 7.241917136725576e-06,
"loss": 3.741,
"step": 63900
},
{
"epoch": 2.701903997973572,
"grad_norm": 26.381654739379883,
"learning_rate": 7.2186975947838125e-06,
"loss": 3.6267,
"step": 64000
},
{
"epoch": 2.701903997973572,
"eval_loss": 3.8010313510894775,
"eval_runtime": 378.1577,
"eval_samples_per_second": 501.09,
"eval_steps_per_second": 15.66,
"step": 64000
},
{
"epoch": 2.7061257229704054,
"grad_norm": 39.231876373291016,
"learning_rate": 7.195243512014355e-06,
"loss": 3.2939,
"step": 64100
},
{
"epoch": 2.7103474479672394,
"grad_norm": 79.92709350585938,
"learning_rate": 7.171789429244896e-06,
"loss": 3.4422,
"step": 64200
},
{
"epoch": 2.7145691729640733,
"grad_norm": 55.60668182373047,
"learning_rate": 7.148335346475438e-06,
"loss": 3.7278,
"step": 64300
},
{
"epoch": 2.718790897960907,
"grad_norm": 67.64916229248047,
"learning_rate": 7.1248812637059804e-06,
"loss": 3.8691,
"step": 64400
},
{
"epoch": 2.7230126229577403,
"grad_norm": 75.73139953613281,
"learning_rate": 7.101427180936522e-06,
"loss": 3.611,
"step": 64500
},
{
"epoch": 2.7230126229577403,
"eval_loss": 3.717172145843506,
"eval_runtime": 384.6215,
"eval_samples_per_second": 492.669,
"eval_steps_per_second": 15.397,
"step": 64500
},
{
"epoch": 2.727234347954574,
"grad_norm": 47.768821716308594,
"learning_rate": 7.077973098167064e-06,
"loss": 3.5474,
"step": 64600
},
{
"epoch": 2.731456072951408,
"grad_norm": 112.48494720458984,
"learning_rate": 7.054519015397607e-06,
"loss": 3.5087,
"step": 64700
},
{
"epoch": 2.7356777979482416,
"grad_norm": 64.66181945800781,
"learning_rate": 7.0310649326281475e-06,
"loss": 3.4489,
"step": 64800
},
{
"epoch": 2.739899522945075,
"grad_norm": 35.16935348510742,
"learning_rate": 7.007610849858689e-06,
"loss": 3.6549,
"step": 64900
},
{
"epoch": 2.744121247941909,
"grad_norm": 36.89544677734375,
"learning_rate": 6.984156767089232e-06,
"loss": 3.3956,
"step": 65000
},
{
"epoch": 2.744121247941909,
"eval_loss": 3.735604763031006,
"eval_runtime": 383.8224,
"eval_samples_per_second": 493.694,
"eval_steps_per_second": 15.429,
"step": 65000
},
{
"epoch": 2.748342972938743,
"grad_norm": 37.28645706176758,
"learning_rate": 6.960702684319774e-06,
"loss": 3.5116,
"step": 65100
},
{
"epoch": 2.7525646979355765,
"grad_norm": 85.3537826538086,
"learning_rate": 6.937248601550316e-06,
"loss": 3.1777,
"step": 65200
},
{
"epoch": 2.75678642293241,
"grad_norm": 79.33281707763672,
"learning_rate": 6.913794518780857e-06,
"loss": 3.7644,
"step": 65300
},
{
"epoch": 2.761008147929244,
"grad_norm": 58.84652328491211,
"learning_rate": 6.890340436011399e-06,
"loss": 3.6376,
"step": 65400
},
{
"epoch": 2.765229872926078,
"grad_norm": 95.42400360107422,
"learning_rate": 6.866886353241942e-06,
"loss": 3.6153,
"step": 65500
},
{
"epoch": 2.765229872926078,
"eval_loss": 3.7485992908477783,
"eval_runtime": 386.3393,
"eval_samples_per_second": 490.478,
"eval_steps_per_second": 15.328,
"step": 65500
},
{
"epoch": 2.7694515979229113,
"grad_norm": 43.77649688720703,
"learning_rate": 6.843432270472483e-06,
"loss": 3.2564,
"step": 65600
},
{
"epoch": 2.7736733229197448,
"grad_norm": 26.90199089050293,
"learning_rate": 6.819978187703024e-06,
"loss": 3.5547,
"step": 65700
},
{
"epoch": 2.7778950479165787,
"grad_norm": 65.34386444091797,
"learning_rate": 6.796524104933567e-06,
"loss": 3.3283,
"step": 65800
},
{
"epoch": 2.7821167729134126,
"grad_norm": 39.57930374145508,
"learning_rate": 6.773070022164109e-06,
"loss": 3.4592,
"step": 65900
},
{
"epoch": 2.786338497910246,
"grad_norm": 53.023067474365234,
"learning_rate": 6.7496159393946505e-06,
"loss": 3.7505,
"step": 66000
},
{
"epoch": 2.786338497910246,
"eval_loss": 3.725229263305664,
"eval_runtime": 384.6647,
"eval_samples_per_second": 492.613,
"eval_steps_per_second": 15.395,
"step": 66000
},
{
"epoch": 2.7905602229070796,
"grad_norm": 30.69059181213379,
"learning_rate": 6.7263963974528865e-06,
"loss": 3.3761,
"step": 66100
},
{
"epoch": 2.7947819479039135,
"grad_norm": 67.33950805664062,
"learning_rate": 6.702942314683429e-06,
"loss": 3.6223,
"step": 66200
},
{
"epoch": 2.7990036729007475,
"grad_norm": 72.26821899414062,
"learning_rate": 6.679488231913971e-06,
"loss": 3.4702,
"step": 66300
},
{
"epoch": 2.803225397897581,
"grad_norm": 42.78792190551758,
"learning_rate": 6.656034149144513e-06,
"loss": 3.8666,
"step": 66400
},
{
"epoch": 2.8074471228944144,
"grad_norm": 36.370338439941406,
"learning_rate": 6.632580066375055e-06,
"loss": 3.2927,
"step": 66500
},
{
"epoch": 2.8074471228944144,
"eval_loss": 3.7299137115478516,
"eval_runtime": 379.7934,
"eval_samples_per_second": 498.932,
"eval_steps_per_second": 15.593,
"step": 66500
},
{
"epoch": 2.8116688478912484,
"grad_norm": 108.68196868896484,
"learning_rate": 6.609125983605596e-06,
"loss": 3.5424,
"step": 66600
},
{
"epoch": 2.8158905728880823,
"grad_norm": 43.846248626708984,
"learning_rate": 6.585671900836138e-06,
"loss": 3.5487,
"step": 66700
},
{
"epoch": 2.820112297884916,
"grad_norm": 39.46868133544922,
"learning_rate": 6.562217818066681e-06,
"loss": 3.3343,
"step": 66800
},
{
"epoch": 2.8243340228817493,
"grad_norm": 176.4261016845703,
"learning_rate": 6.538763735297222e-06,
"loss": 3.3005,
"step": 66900
},
{
"epoch": 2.828555747878583,
"grad_norm": 49.91089630126953,
"learning_rate": 6.515309652527765e-06,
"loss": 3.5036,
"step": 67000
},
{
"epoch": 2.828555747878583,
"eval_loss": 3.7751986980438232,
"eval_runtime": 382.91,
"eval_samples_per_second": 494.871,
"eval_steps_per_second": 15.466,
"step": 67000
},
{
"epoch": 2.832777472875417,
"grad_norm": 42.26047897338867,
"learning_rate": 6.491855569758306e-06,
"loss": 3.4419,
"step": 67100
},
{
"epoch": 2.8369991978722506,
"grad_norm": 61.47079849243164,
"learning_rate": 6.468401486988848e-06,
"loss": 3.3805,
"step": 67200
},
{
"epoch": 2.841220922869084,
"grad_norm": 36.838199615478516,
"learning_rate": 6.44494740421939e-06,
"loss": 3.3591,
"step": 67300
},
{
"epoch": 2.845442647865918,
"grad_norm": 38.40938186645508,
"learning_rate": 6.421493321449932e-06,
"loss": 3.738,
"step": 67400
},
{
"epoch": 2.849664372862752,
"grad_norm": 72.58192443847656,
"learning_rate": 6.398039238680474e-06,
"loss": 3.268,
"step": 67500
},
{
"epoch": 2.849664372862752,
"eval_loss": 3.765678882598877,
"eval_runtime": 368.4593,
"eval_samples_per_second": 514.279,
"eval_steps_per_second": 16.072,
"step": 67500
},
{
"epoch": 2.8538860978595855,
"grad_norm": 70.88404083251953,
"learning_rate": 6.3745851559110165e-06,
"loss": 3.4224,
"step": 67600
},
{
"epoch": 2.858107822856419,
"grad_norm": 32.05725860595703,
"learning_rate": 6.3511310731415574e-06,
"loss": 3.5734,
"step": 67700
},
{
"epoch": 2.862329547853253,
"grad_norm": 29.63147735595703,
"learning_rate": 6.327676990372099e-06,
"loss": 3.3804,
"step": 67800
},
{
"epoch": 2.866551272850087,
"grad_norm": 31.01226043701172,
"learning_rate": 6.304222907602642e-06,
"loss": 3.594,
"step": 67900
},
{
"epoch": 2.8707729978469203,
"grad_norm": 41.206783294677734,
"learning_rate": 6.280768824833184e-06,
"loss": 3.6526,
"step": 68000
},
{
"epoch": 2.8707729978469203,
"eval_loss": 3.779634714126587,
"eval_runtime": 382.5774,
"eval_samples_per_second": 495.301,
"eval_steps_per_second": 15.479,
"step": 68000
},
{
"epoch": 2.874994722843754,
"grad_norm": null,
"learning_rate": 6.25754928289142e-06,
"loss": 3.7921,
"step": 68100
},
{
"epoch": 2.8792164478405877,
"grad_norm": 196.81781005859375,
"learning_rate": 6.2340952001219614e-06,
"loss": 3.352,
"step": 68200
},
{
"epoch": 2.883438172837421,
"grad_norm": 62.34221267700195,
"learning_rate": 6.210641117352504e-06,
"loss": 3.7122,
"step": 68300
},
{
"epoch": 2.887659897834255,
"grad_norm": 45.04619216918945,
"learning_rate": 6.187187034583045e-06,
"loss": 3.5739,
"step": 68400
},
{
"epoch": 2.8918816228310886,
"grad_norm": 29.019325256347656,
"learning_rate": 6.163732951813588e-06,
"loss": 3.3912,
"step": 68500
},
{
"epoch": 2.8918816228310886,
"eval_loss": 3.7358908653259277,
"eval_runtime": 382.9272,
"eval_samples_per_second": 494.849,
"eval_steps_per_second": 15.465,
"step": 68500
},
{
"epoch": 2.8961033478279226,
"grad_norm": 36.82758331298828,
"learning_rate": 6.140278869044129e-06,
"loss": 3.8863,
"step": 68600
},
{
"epoch": 2.900325072824756,
"grad_norm": 45.10524368286133,
"learning_rate": 6.116824786274671e-06,
"loss": 3.6851,
"step": 68700
},
{
"epoch": 2.90454679782159,
"grad_norm": 47.0596923828125,
"learning_rate": 6.093370703505214e-06,
"loss": 3.1867,
"step": 68800
},
{
"epoch": 2.9087685228184235,
"grad_norm": 97.21186828613281,
"learning_rate": 6.0699166207357555e-06,
"loss": 3.2456,
"step": 68900
},
{
"epoch": 2.9129902478152574,
"grad_norm": 42.81410598754883,
"learning_rate": 6.0464625379662964e-06,
"loss": 3.447,
"step": 69000
},
{
"epoch": 2.9129902478152574,
"eval_loss": 3.7272026538848877,
"eval_runtime": 373.7122,
"eval_samples_per_second": 507.051,
"eval_steps_per_second": 15.846,
"step": 69000
},
{
"epoch": 2.917211972812091,
"grad_norm": 21.994388580322266,
"learning_rate": 6.023008455196839e-06,
"loss": 3.3142,
"step": 69100
},
{
"epoch": 2.921433697808925,
"grad_norm": 75.22715759277344,
"learning_rate": 5.999554372427381e-06,
"loss": 3.8019,
"step": 69200
},
{
"epoch": 2.9256554228057583,
"grad_norm": 108.84042358398438,
"learning_rate": 5.976100289657923e-06,
"loss": 3.6041,
"step": 69300
},
{
"epoch": 2.9298771478025922,
"grad_norm": 49.068809509277344,
"learning_rate": 5.952646206888465e-06,
"loss": 3.6291,
"step": 69400
},
{
"epoch": 2.9340988727994257,
"grad_norm": 27.290891647338867,
"learning_rate": 5.929192124119006e-06,
"loss": 3.5412,
"step": 69500
},
{
"epoch": 2.9340988727994257,
"eval_loss": 3.704101085662842,
"eval_runtime": 378.324,
"eval_samples_per_second": 500.87,
"eval_steps_per_second": 15.653,
"step": 69500
},
{
"epoch": 2.9383205977962596,
"grad_norm": 33.8206901550293,
"learning_rate": 5.905738041349548e-06,
"loss": 3.5873,
"step": 69600
},
{
"epoch": 2.942542322793093,
"grad_norm": 52.327754974365234,
"learning_rate": 5.8822839585800905e-06,
"loss": 3.6207,
"step": 69700
},
{
"epoch": 2.946764047789927,
"grad_norm": 193.724853515625,
"learning_rate": 5.858829875810632e-06,
"loss": 3.5858,
"step": 69800
},
{
"epoch": 2.9509857727867606,
"grad_norm": 51.89821243286133,
"learning_rate": 5.835375793041173e-06,
"loss": 3.5341,
"step": 69900
},
{
"epoch": 2.9552074977835945,
"grad_norm": 42.79822540283203,
"learning_rate": 5.811921710271716e-06,
"loss": 3.6644,
"step": 70000
},
{
"epoch": 2.9552074977835945,
"eval_loss": 3.740612268447876,
"eval_runtime": 378.139,
"eval_samples_per_second": 501.115,
"eval_steps_per_second": 15.661,
"step": 70000
},
{
"epoch": 2.959429222780428,
"grad_norm": 31.204317092895508,
"learning_rate": 5.788467627502258e-06,
"loss": 3.4947,
"step": 70100
},
{
"epoch": 2.963650947777262,
"grad_norm": 43.13151931762695,
"learning_rate": 5.765013544732799e-06,
"loss": 3.5763,
"step": 70200
},
{
"epoch": 2.9678726727740954,
"grad_norm": 91.4846420288086,
"learning_rate": 5.741559461963342e-06,
"loss": 3.6131,
"step": 70300
},
{
"epoch": 2.9720943977709293,
"grad_norm": 72.16215515136719,
"learning_rate": 5.718105379193884e-06,
"loss": 3.509,
"step": 70400
},
{
"epoch": 2.976316122767763,
"grad_norm": 80.8546371459961,
"learning_rate": 5.694651296424426e-06,
"loss": 3.4352,
"step": 70500
},
{
"epoch": 2.976316122767763,
"eval_loss": 3.7115588188171387,
"eval_runtime": 375.398,
"eval_samples_per_second": 504.774,
"eval_steps_per_second": 15.775,
"step": 70500
},
{
"epoch": 2.9805378477645967,
"grad_norm": 30.867778778076172,
"learning_rate": 5.671197213654967e-06,
"loss": 3.3115,
"step": 70600
},
{
"epoch": 2.9847595727614302,
"grad_norm": 77.84353637695312,
"learning_rate": 5.647743130885509e-06,
"loss": 3.2393,
"step": 70700
},
{
"epoch": 2.988981297758264,
"grad_norm": 87.11060333251953,
"learning_rate": 5.624289048116052e-06,
"loss": 3.3738,
"step": 70800
},
{
"epoch": 2.9932030227550976,
"grad_norm": 54.43430709838867,
"learning_rate": 5.6008349653465935e-06,
"loss": 3.424,
"step": 70900
},
{
"epoch": 2.9974247477519316,
"grad_norm": 48.77461242675781,
"learning_rate": 5.5773808825771344e-06,
"loss": 3.7252,
"step": 71000
},
{
"epoch": 2.9974247477519316,
"eval_loss": 3.7152390480041504,
"eval_runtime": 381.7179,
"eval_samples_per_second": 496.416,
"eval_steps_per_second": 15.514,
"step": 71000
},
{
"epoch": 3.001646472748765,
"grad_norm": 83.55622100830078,
"learning_rate": 5.553926799807677e-06,
"loss": 3.6013,
"step": 71100
},
{
"epoch": 3.005868197745599,
"grad_norm": 26.548662185668945,
"learning_rate": 5.530472717038219e-06,
"loss": 3.407,
"step": 71200
},
{
"epoch": 3.0100899227424325,
"grad_norm": 256.9148864746094,
"learning_rate": 5.507018634268761e-06,
"loss": 3.2695,
"step": 71300
},
{
"epoch": 3.0143116477392664,
"grad_norm": 70.4573974609375,
"learning_rate": 5.483564551499303e-06,
"loss": 3.3632,
"step": 71400
},
{
"epoch": 3.0185333727361,
"grad_norm": 34.62995529174805,
"learning_rate": 5.460110468729844e-06,
"loss": 2.95,
"step": 71500
},
{
"epoch": 3.0185333727361,
"eval_loss": 3.7984957695007324,
"eval_runtime": 381.4288,
"eval_samples_per_second": 496.793,
"eval_steps_per_second": 15.526,
"step": 71500
},
{
"epoch": 3.022755097732934,
"grad_norm": 47.09511184692383,
"learning_rate": 5.436656385960386e-06,
"loss": 3.164,
"step": 71600
},
{
"epoch": 3.0269768227297673,
"grad_norm": 43.297088623046875,
"learning_rate": 5.4132023031909285e-06,
"loss": 3.4829,
"step": 71700
},
{
"epoch": 3.0311985477266012,
"grad_norm": 119.86432647705078,
"learning_rate": 5.38974822042147e-06,
"loss": 3.7491,
"step": 71800
},
{
"epoch": 3.0354202727234347,
"grad_norm": 35.53313064575195,
"learning_rate": 5.366294137652012e-06,
"loss": 3.7257,
"step": 71900
},
{
"epoch": 3.0396419977202687,
"grad_norm": 82.5821304321289,
"learning_rate": 5.342840054882555e-06,
"loss": 3.701,
"step": 72000
},
{
"epoch": 3.0396419977202687,
"eval_loss": 3.749661684036255,
"eval_runtime": 378.3724,
"eval_samples_per_second": 500.806,
"eval_steps_per_second": 15.651,
"step": 72000
},
{
"epoch": 3.043863722717102,
"grad_norm": 46.78045654296875,
"learning_rate": 5.319385972113096e-06,
"loss": 3.105,
"step": 72100
},
{
"epoch": 3.048085447713936,
"grad_norm": 24.64872169494629,
"learning_rate": 5.295931889343638e-06,
"loss": 3.43,
"step": 72200
},
{
"epoch": 3.0523071727107696,
"grad_norm": 54.81727981567383,
"learning_rate": 5.272946888229569e-06,
"loss": 3.3014,
"step": 72300
},
{
"epoch": 3.0565288977076035,
"grad_norm": 35.038734436035156,
"learning_rate": 5.249492805460111e-06,
"loss": 3.2578,
"step": 72400
},
{
"epoch": 3.060750622704437,
"grad_norm": 63.87615203857422,
"learning_rate": 5.226038722690653e-06,
"loss": 3.4971,
"step": 72500
},
{
"epoch": 3.060750622704437,
"eval_loss": 3.7966361045837402,
"eval_runtime": 384.8477,
"eval_samples_per_second": 492.379,
"eval_steps_per_second": 15.388,
"step": 72500
},
{
"epoch": 3.064972347701271,
"grad_norm": 41.326568603515625,
"learning_rate": 5.202584639921195e-06,
"loss": 3.4542,
"step": 72600
},
{
"epoch": 3.0691940726981044,
"grad_norm": 38.29585647583008,
"learning_rate": 5.179130557151737e-06,
"loss": 3.4634,
"step": 72700
},
{
"epoch": 3.0734157976949383,
"grad_norm": 131.59951782226562,
"learning_rate": 5.155676474382278e-06,
"loss": 3.6576,
"step": 72800
},
{
"epoch": 3.077637522691772,
"grad_norm": 33.464576721191406,
"learning_rate": 5.13222239161282e-06,
"loss": 3.38,
"step": 72900
},
{
"epoch": 3.0818592476886058,
"grad_norm": 56.84671401977539,
"learning_rate": 5.108768308843363e-06,
"loss": 3.4615,
"step": 73000
},
{
"epoch": 3.0818592476886058,
"eval_loss": 3.733682870864868,
"eval_runtime": 379.8356,
"eval_samples_per_second": 498.876,
"eval_steps_per_second": 15.591,
"step": 73000
},
{
"epoch": 3.0860809726854392,
"grad_norm": 78.62255096435547,
"learning_rate": 5.0853142260739044e-06,
"loss": 3.6838,
"step": 73100
},
{
"epoch": 3.090302697682273,
"grad_norm": 40.18800735473633,
"learning_rate": 5.061860143304445e-06,
"loss": 3.2183,
"step": 73200
},
{
"epoch": 3.0945244226791067,
"grad_norm": 72.73197174072266,
"learning_rate": 5.038406060534988e-06,
"loss": 3.3281,
"step": 73300
},
{
"epoch": 3.0987461476759406,
"grad_norm": 91.64218139648438,
"learning_rate": 5.01495197776553e-06,
"loss": 3.2304,
"step": 73400
},
{
"epoch": 3.102967872672774,
"grad_norm": 43.35820007324219,
"learning_rate": 4.9914978949960715e-06,
"loss": 3.3314,
"step": 73500
},
{
"epoch": 3.102967872672774,
"eval_loss": 3.7586066722869873,
"eval_runtime": 377.7973,
"eval_samples_per_second": 501.568,
"eval_steps_per_second": 15.675,
"step": 73500
},
{
"epoch": 3.107189597669608,
"grad_norm": 175.2445526123047,
"learning_rate": 4.968043812226614e-06,
"loss": 3.0549,
"step": 73600
},
{
"epoch": 3.1114113226664415,
"grad_norm": 57.96905517578125,
"learning_rate": 4.944589729457155e-06,
"loss": 3.4578,
"step": 73700
},
{
"epoch": 3.1156330476632754,
"grad_norm": 32.305484771728516,
"learning_rate": 4.921135646687698e-06,
"loss": 3.3797,
"step": 73800
},
{
"epoch": 3.119854772660109,
"grad_norm": 32.543643951416016,
"learning_rate": 4.8976815639182395e-06,
"loss": 3.4435,
"step": 73900
},
{
"epoch": 3.124076497656943,
"grad_norm": 62.69993591308594,
"learning_rate": 4.874227481148781e-06,
"loss": 3.1522,
"step": 74000
},
{
"epoch": 3.124076497656943,
"eval_loss": 3.7354512214660645,
"eval_runtime": 379.0534,
"eval_samples_per_second": 499.906,
"eval_steps_per_second": 15.623,
"step": 74000
},
{
"epoch": 3.1282982226537763,
"grad_norm": 42.34346008300781,
"learning_rate": 4.850773398379323e-06,
"loss": 3.3775,
"step": 74100
},
{
"epoch": 3.13251994765061,
"grad_norm": 88.97834014892578,
"learning_rate": 4.827319315609866e-06,
"loss": 3.5751,
"step": 74200
},
{
"epoch": 3.1367416726474437,
"grad_norm": 99.53689575195312,
"learning_rate": 4.803865232840407e-06,
"loss": 3.5017,
"step": 74300
},
{
"epoch": 3.1409633976442777,
"grad_norm": 29.37308120727539,
"learning_rate": 4.780411150070949e-06,
"loss": 3.3353,
"step": 74400
},
{
"epoch": 3.145185122641111,
"grad_norm": 80.75165557861328,
"learning_rate": 4.757191608129185e-06,
"loss": 3.5746,
"step": 74500
},
{
"epoch": 3.145185122641111,
"eval_loss": 3.7411203384399414,
"eval_runtime": 374.8408,
"eval_samples_per_second": 505.524,
"eval_steps_per_second": 15.799,
"step": 74500
},
{
"epoch": 3.1494068476379447,
"grad_norm": 26.067644119262695,
"learning_rate": 4.733737525359727e-06,
"loss": 3.7003,
"step": 74600
},
{
"epoch": 3.1536285726347786,
"grad_norm": 32.2315788269043,
"learning_rate": 4.71028344259027e-06,
"loss": 3.0499,
"step": 74700
},
{
"epoch": 3.157850297631612,
"grad_norm": 49.030643463134766,
"learning_rate": 4.686829359820811e-06,
"loss": 3.3735,
"step": 74800
},
{
"epoch": 3.162072022628446,
"grad_norm": 103.1226577758789,
"learning_rate": 4.663375277051353e-06,
"loss": 3.5844,
"step": 74900
},
{
"epoch": 3.1662937476252795,
"grad_norm": 54.65353775024414,
"learning_rate": 4.639921194281895e-06,
"loss": 3.3551,
"step": 75000
},
{
"epoch": 3.1662937476252795,
"eval_loss": 3.8240253925323486,
"eval_runtime": 380.1742,
"eval_samples_per_second": 498.432,
"eval_steps_per_second": 15.577,
"step": 75000
},
{
"epoch": 3.1705154726221134,
"grad_norm": 48.09339904785156,
"learning_rate": 4.616467111512437e-06,
"loss": 3.262,
"step": 75100
},
{
"epoch": 3.174737197618947,
"grad_norm": 37.72649383544922,
"learning_rate": 4.5930130287429785e-06,
"loss": 3.4301,
"step": 75200
},
{
"epoch": 3.178958922615781,
"grad_norm": 46.34507369995117,
"learning_rate": 4.56955894597352e-06,
"loss": 3.464,
"step": 75300
},
{
"epoch": 3.1831806476126143,
"grad_norm": 68.04910278320312,
"learning_rate": 4.546104863204063e-06,
"loss": 3.4751,
"step": 75400
},
{
"epoch": 3.1874023726094483,
"grad_norm": 44.80384826660156,
"learning_rate": 4.522650780434605e-06,
"loss": 3.5351,
"step": 75500
},
{
"epoch": 3.1874023726094483,
"eval_loss": 3.7464685440063477,
"eval_runtime": 387.0901,
"eval_samples_per_second": 489.527,
"eval_steps_per_second": 15.299,
"step": 75500
},
{
"epoch": 3.1916240976062817,
"grad_norm": 51.414955139160156,
"learning_rate": 4.499196697665146e-06,
"loss": 3.2933,
"step": 75600
},
{
"epoch": 3.1958458226031157,
"grad_norm": 78.93827056884766,
"learning_rate": 4.475742614895688e-06,
"loss": 4.0448,
"step": 75700
},
{
"epoch": 3.200067547599949,
"grad_norm": 28.45920753479004,
"learning_rate": 4.452288532126231e-06,
"loss": 3.4882,
"step": 75800
},
{
"epoch": 3.204289272596783,
"grad_norm": 56.69211959838867,
"learning_rate": 4.428834449356772e-06,
"loss": 3.615,
"step": 75900
},
{
"epoch": 3.2085109975936166,
"grad_norm": 43.08451843261719,
"learning_rate": 4.405380366587314e-06,
"loss": 3.1492,
"step": 76000
},
{
"epoch": 3.2085109975936166,
"eval_loss": 3.7142016887664795,
"eval_runtime": 383.7896,
"eval_samples_per_second": 493.737,
"eval_steps_per_second": 15.43,
"step": 76000
},
{
"epoch": 3.2127327225904505,
"grad_norm": 43.29754638671875,
"learning_rate": 4.381926283817856e-06,
"loss": 3.0458,
"step": 76100
},
{
"epoch": 3.216954447587284,
"grad_norm": 87.42881774902344,
"learning_rate": 4.358472201048398e-06,
"loss": 3.6002,
"step": 76200
},
{
"epoch": 3.221176172584118,
"grad_norm": 119.25186920166016,
"learning_rate": 4.33501811827894e-06,
"loss": 3.3197,
"step": 76300
},
{
"epoch": 3.2253978975809514,
"grad_norm": 70.65145111083984,
"learning_rate": 4.3115640355094814e-06,
"loss": 3.3113,
"step": 76400
},
{
"epoch": 3.2296196225777853,
"grad_norm": 48.13566207885742,
"learning_rate": 4.288109952740024e-06,
"loss": 3.3607,
"step": 76500
},
{
"epoch": 3.2296196225777853,
"eval_loss": 3.777985095977783,
"eval_runtime": 381.0294,
"eval_samples_per_second": 497.313,
"eval_steps_per_second": 15.542,
"step": 76500
},
{
"epoch": 3.233841347574619,
"grad_norm": 43.97998809814453,
"learning_rate": 4.264655869970565e-06,
"loss": 3.3242,
"step": 76600
},
{
"epoch": 3.2380630725714528,
"grad_norm": 97.08489990234375,
"learning_rate": 4.241201787201108e-06,
"loss": 3.6477,
"step": 76700
},
{
"epoch": 3.2422847975682862,
"grad_norm": 63.02432632446289,
"learning_rate": 4.217747704431649e-06,
"loss": 3.1657,
"step": 76800
},
{
"epoch": 3.24650652256512,
"grad_norm": 34.82364273071289,
"learning_rate": 4.194528162489886e-06,
"loss": 3.0839,
"step": 76900
},
{
"epoch": 3.2507282475619537,
"grad_norm": 37.6974983215332,
"learning_rate": 4.171074079720427e-06,
"loss": 3.599,
"step": 77000
},
{
"epoch": 3.2507282475619537,
"eval_loss": 3.7618165016174316,
"eval_runtime": 380.91,
"eval_samples_per_second": 497.469,
"eval_steps_per_second": 15.547,
"step": 77000
},
{
"epoch": 3.2549499725587876,
"grad_norm": 44.467830657958984,
"learning_rate": 4.14761999695097e-06,
"loss": 3.1563,
"step": 77100
},
{
"epoch": 3.259171697555621,
"grad_norm": 25.788896560668945,
"learning_rate": 4.124165914181512e-06,
"loss": 3.1867,
"step": 77200
},
{
"epoch": 3.263393422552455,
"grad_norm": 57.96255874633789,
"learning_rate": 4.100711831412053e-06,
"loss": 3.5676,
"step": 77300
},
{
"epoch": 3.2676151475492885,
"grad_norm": 58.24321746826172,
"learning_rate": 4.077257748642595e-06,
"loss": 3.6313,
"step": 77400
},
{
"epoch": 3.2718368725461224,
"grad_norm": 48.765167236328125,
"learning_rate": 4.053803665873137e-06,
"loss": 3.2504,
"step": 77500
},
{
"epoch": 3.2718368725461224,
"eval_loss": 3.7300686836242676,
"eval_runtime": 379.0002,
"eval_samples_per_second": 499.976,
"eval_steps_per_second": 15.625,
"step": 77500
},
{
"epoch": 3.276058597542956,
"grad_norm": 89.91081237792969,
"learning_rate": 4.0303495831036795e-06,
"loss": 3.2488,
"step": 77600
},
{
"epoch": 3.28028032253979,
"grad_norm": 85.31146240234375,
"learning_rate": 4.0068955003342205e-06,
"loss": 3.0412,
"step": 77700
},
{
"epoch": 3.2845020475366233,
"grad_norm": 81.51136016845703,
"learning_rate": 3.983441417564763e-06,
"loss": 3.1514,
"step": 77800
},
{
"epoch": 3.2887237725334573,
"grad_norm": 33.587486267089844,
"learning_rate": 3.959987334795305e-06,
"loss": 2.9742,
"step": 77900
},
{
"epoch": 3.2929454975302908,
"grad_norm": 58.87001037597656,
"learning_rate": 3.936533252025847e-06,
"loss": 3.395,
"step": 78000
},
{
"epoch": 3.2929454975302908,
"eval_loss": 3.757880210876465,
"eval_runtime": 387.0641,
"eval_samples_per_second": 489.56,
"eval_steps_per_second": 15.3,
"step": 78000
},
{
"epoch": 3.2971672225271247,
"grad_norm": 101.9001693725586,
"learning_rate": 3.913079169256388e-06,
"loss": 3.5513,
"step": 78100
},
{
"epoch": 3.301388947523958,
"grad_norm": 59.32830047607422,
"learning_rate": 3.889625086486931e-06,
"loss": 3.3194,
"step": 78200
},
{
"epoch": 3.305610672520792,
"grad_norm": 40.857460021972656,
"learning_rate": 3.866171003717473e-06,
"loss": 3.2702,
"step": 78300
},
{
"epoch": 3.3098323975176256,
"grad_norm": 32.93022155761719,
"learning_rate": 3.8427169209480146e-06,
"loss": 3.322,
"step": 78400
},
{
"epoch": 3.3140541225144595,
"grad_norm": 50.25773239135742,
"learning_rate": 3.819262838178556e-06,
"loss": 3.5357,
"step": 78500
},
{
"epoch": 3.3140541225144595,
"eval_loss": 3.782740354537964,
"eval_runtime": 385.7272,
"eval_samples_per_second": 491.257,
"eval_steps_per_second": 15.353,
"step": 78500
},
{
"epoch": 3.318275847511293,
"grad_norm": 136.00146484375,
"learning_rate": 3.7958087554090985e-06,
"loss": 3.3831,
"step": 78600
},
{
"epoch": 3.322497572508127,
"grad_norm": 56.89426803588867,
"learning_rate": 3.77235467263964e-06,
"loss": 3.3878,
"step": 78700
},
{
"epoch": 3.3267192975049604,
"grad_norm": 39.877349853515625,
"learning_rate": 3.748900589870182e-06,
"loss": 3.2869,
"step": 78800
},
{
"epoch": 3.3309410225017944,
"grad_norm": 70.3539047241211,
"learning_rate": 3.725446507100724e-06,
"loss": 3.7636,
"step": 78900
},
{
"epoch": 3.335162747498628,
"grad_norm": 72.76617431640625,
"learning_rate": 3.701992424331266e-06,
"loss": 3.4089,
"step": 79000
},
{
"epoch": 3.335162747498628,
"eval_loss": 3.7984070777893066,
"eval_runtime": 375.5303,
"eval_samples_per_second": 504.596,
"eval_steps_per_second": 15.77,
"step": 79000
},
{
"epoch": 3.3393844724954618,
"grad_norm": 37.42483901977539,
"learning_rate": 3.6785383415618074e-06,
"loss": 3.3371,
"step": 79100
},
{
"epoch": 3.3436061974922953,
"grad_norm": 54.95014953613281,
"learning_rate": 3.6550842587923496e-06,
"loss": 3.5966,
"step": 79200
},
{
"epoch": 3.347827922489129,
"grad_norm": 231.54039001464844,
"learning_rate": 3.6316301760228918e-06,
"loss": 3.8318,
"step": 79300
},
{
"epoch": 3.3520496474859627,
"grad_norm": 22.132776260375977,
"learning_rate": 3.6084106340811283e-06,
"loss": 3.4452,
"step": 79400
},
{
"epoch": 3.3562713724827966,
"grad_norm": 46.317466735839844,
"learning_rate": 3.5849565513116696e-06,
"loss": 3.1789,
"step": 79500
},
{
"epoch": 3.3562713724827966,
"eval_loss": 3.728987693786621,
"eval_runtime": 381.2454,
"eval_samples_per_second": 497.032,
"eval_steps_per_second": 15.533,
"step": 79500
},
{
"epoch": 3.36049309747963,
"grad_norm": 18.88166046142578,
"learning_rate": 3.561502468542212e-06,
"loss": 3.1829,
"step": 79600
},
{
"epoch": 3.364714822476464,
"grad_norm": 61.83921813964844,
"learning_rate": 3.5380483857727536e-06,
"loss": 3.4624,
"step": 79700
},
{
"epoch": 3.3689365474732975,
"grad_norm": 43.87940216064453,
"learning_rate": 3.5145943030032958e-06,
"loss": 3.3163,
"step": 79800
},
{
"epoch": 3.3731582724701314,
"grad_norm": 35.34148406982422,
"learning_rate": 3.4911402202338375e-06,
"loss": 3.2591,
"step": 79900
},
{
"epoch": 3.377379997466965,
"grad_norm": 38.877281188964844,
"learning_rate": 3.4676861374643793e-06,
"loss": 3.2375,
"step": 80000
},
{
"epoch": 3.377379997466965,
"eval_loss": 3.715250015258789,
"eval_runtime": 385.7979,
"eval_samples_per_second": 491.166,
"eval_steps_per_second": 15.35,
"step": 80000
},
{
"epoch": 3.381601722463799,
"grad_norm": 94.71104431152344,
"learning_rate": 3.4442320546949215e-06,
"loss": 3.0596,
"step": 80100
},
{
"epoch": 3.3858234474606324,
"grad_norm": 87.31482696533203,
"learning_rate": 3.420777971925463e-06,
"loss": 3.2673,
"step": 80200
},
{
"epoch": 3.3900451724574663,
"grad_norm": 61.90350341796875,
"learning_rate": 3.397323889156005e-06,
"loss": 3.8284,
"step": 80300
},
{
"epoch": 3.3942668974542998,
"grad_norm": 36.948333740234375,
"learning_rate": 3.3738698063865473e-06,
"loss": 3.2518,
"step": 80400
},
{
"epoch": 3.3984886224511337,
"grad_norm": 74.06956481933594,
"learning_rate": 3.3504157236170886e-06,
"loss": 3.4214,
"step": 80500
},
{
"epoch": 3.3984886224511337,
"eval_loss": 3.7571513652801514,
"eval_runtime": 365.367,
"eval_samples_per_second": 518.632,
"eval_steps_per_second": 16.208,
"step": 80500
},
{
"epoch": 3.402710347447967,
"grad_norm": 243.53871154785156,
"learning_rate": 3.326961640847631e-06,
"loss": 3.3534,
"step": 80600
},
{
"epoch": 3.406932072444801,
"grad_norm": 23.158308029174805,
"learning_rate": 3.303507558078173e-06,
"loss": 3.7609,
"step": 80700
},
{
"epoch": 3.4111537974416346,
"grad_norm": 70.86226654052734,
"learning_rate": 3.2800534753087148e-06,
"loss": 3.7096,
"step": 80800
},
{
"epoch": 3.4153755224384685,
"grad_norm": 36.22040557861328,
"learning_rate": 3.2565993925392565e-06,
"loss": 2.9755,
"step": 80900
},
{
"epoch": 3.419597247435302,
"grad_norm": 110.31868743896484,
"learning_rate": 3.2331453097697983e-06,
"loss": 3.4585,
"step": 81000
},
{
"epoch": 3.419597247435302,
"eval_loss": 3.736179828643799,
"eval_runtime": 373.3817,
"eval_samples_per_second": 507.499,
"eval_steps_per_second": 15.86,
"step": 81000
},
{
"epoch": 3.423818972432136,
"grad_norm": 60.848907470703125,
"learning_rate": 3.2096912270003405e-06,
"loss": 3.5315,
"step": 81100
},
{
"epoch": 3.4280406974289694,
"grad_norm": 27.560346603393555,
"learning_rate": 3.186237144230882e-06,
"loss": 3.4276,
"step": 81200
},
{
"epoch": 3.4322624224258034,
"grad_norm": 46.1323127746582,
"learning_rate": 3.162783061461424e-06,
"loss": 3.5303,
"step": 81300
},
{
"epoch": 3.436484147422637,
"grad_norm": 55.932132720947266,
"learning_rate": 3.1393289786919662e-06,
"loss": 3.2272,
"step": 81400
},
{
"epoch": 3.440705872419471,
"grad_norm": 36.39140701293945,
"learning_rate": 3.1158748959225076e-06,
"loss": 2.9614,
"step": 81500
},
{
"epoch": 3.440705872419471,
"eval_loss": 3.7592084407806396,
"eval_runtime": 380.4214,
"eval_samples_per_second": 498.108,
"eval_steps_per_second": 15.567,
"step": 81500
},
{
"epoch": 3.4449275974163043,
"grad_norm": 63.25979232788086,
"learning_rate": 3.0924208131530498e-06,
"loss": 3.3272,
"step": 81600
},
{
"epoch": 3.449149322413138,
"grad_norm": 60.168540954589844,
"learning_rate": 3.068966730383592e-06,
"loss": 3.548,
"step": 81700
},
{
"epoch": 3.4533710474099717,
"grad_norm": 65.39545440673828,
"learning_rate": 3.0455126476141338e-06,
"loss": 3.5806,
"step": 81800
},
{
"epoch": 3.457592772406805,
"grad_norm": 71.17179107666016,
"learning_rate": 3.0220585648446755e-06,
"loss": 3.2915,
"step": 81900
},
{
"epoch": 3.461814497403639,
"grad_norm": 83.21878814697266,
"learning_rate": 2.9986044820752173e-06,
"loss": 3.4571,
"step": 82000
},
{
"epoch": 3.461814497403639,
"eval_loss": 3.7446765899658203,
"eval_runtime": 371.3966,
"eval_samples_per_second": 510.212,
"eval_steps_per_second": 15.945,
"step": 82000
},
{
"epoch": 3.466036222400473,
"grad_norm": 40.86414337158203,
"learning_rate": 2.9751503993057595e-06,
"loss": 3.2471,
"step": 82100
},
{
"epoch": 3.4702579473973065,
"grad_norm": 69.95441436767578,
"learning_rate": 2.9516963165363013e-06,
"loss": 3.3675,
"step": 82200
},
{
"epoch": 3.47447967239414,
"grad_norm": 87.65948486328125,
"learning_rate": 2.928242233766843e-06,
"loss": 3.039,
"step": 82300
},
{
"epoch": 3.478701397390974,
"grad_norm": 42.40869140625,
"learning_rate": 2.9047881509973852e-06,
"loss": 3.1737,
"step": 82400
},
{
"epoch": 3.482923122387808,
"grad_norm": 95.52741241455078,
"learning_rate": 2.8813340682279274e-06,
"loss": 3.5937,
"step": 82500
},
{
"epoch": 3.482923122387808,
"eval_loss": 3.7526960372924805,
"eval_runtime": 382.5828,
"eval_samples_per_second": 495.294,
"eval_steps_per_second": 15.479,
"step": 82500
},
{
"epoch": 3.4871448473846414,
"grad_norm": 99.28910064697266,
"learning_rate": 2.858114526286164e-06,
"loss": 3.3723,
"step": 82600
},
{
"epoch": 3.491366572381475,
"grad_norm": 84.63138580322266,
"learning_rate": 2.8346604435167053e-06,
"loss": 3.5835,
"step": 82700
},
{
"epoch": 3.495588297378309,
"grad_norm": 30.03119468688965,
"learning_rate": 2.8112063607472475e-06,
"loss": 3.3739,
"step": 82800
},
{
"epoch": 3.4998100223751427,
"grad_norm": 45.82896041870117,
"learning_rate": 2.7877522779777892e-06,
"loss": 3.3891,
"step": 82900
},
{
"epoch": 3.504031747371976,
"grad_norm": 64.97034454345703,
"learning_rate": 2.764298195208331e-06,
"loss": 3.5204,
"step": 83000
},
{
"epoch": 3.504031747371976,
"eval_loss": 3.7397613525390625,
"eval_runtime": 380.231,
"eval_samples_per_second": 498.358,
"eval_steps_per_second": 15.575,
"step": 83000
},
{
"epoch": 3.5082534723688097,
"grad_norm": 23.58553123474121,
"learning_rate": 2.7408441124388728e-06,
"loss": 3.0925,
"step": 83100
},
{
"epoch": 3.5124751973656436,
"grad_norm": 46.03030014038086,
"learning_rate": 2.717390029669415e-06,
"loss": 3.2285,
"step": 83200
},
{
"epoch": 3.5166969223624776,
"grad_norm": 51.1907844543457,
"learning_rate": 2.693935946899957e-06,
"loss": 3.4032,
"step": 83300
},
{
"epoch": 3.520918647359311,
"grad_norm": 72.13794708251953,
"learning_rate": 2.6704818641304985e-06,
"loss": 3.5367,
"step": 83400
},
{
"epoch": 3.5251403723561445,
"grad_norm": 31.403751373291016,
"learning_rate": 2.6470277813610407e-06,
"loss": 3.1513,
"step": 83500
},
{
"epoch": 3.5251403723561445,
"eval_loss": 3.7454099655151367,
"eval_runtime": 387.0416,
"eval_samples_per_second": 489.588,
"eval_steps_per_second": 15.301,
"step": 83500
},
{
"epoch": 3.5293620973529785,
"grad_norm": 45.70579147338867,
"learning_rate": 2.623573698591583e-06,
"loss": 3.292,
"step": 83600
},
{
"epoch": 3.5335838223498124,
"grad_norm": 53.186622619628906,
"learning_rate": 2.6001196158221242e-06,
"loss": 3.2018,
"step": 83700
},
{
"epoch": 3.537805547346646,
"grad_norm": 60.18638610839844,
"learning_rate": 2.5766655330526664e-06,
"loss": 3.4814,
"step": 83800
},
{
"epoch": 3.5420272723434794,
"grad_norm": 63.47237014770508,
"learning_rate": 2.5532114502832082e-06,
"loss": 3.2591,
"step": 83900
},
{
"epoch": 3.5462489973403133,
"grad_norm": 35.679569244384766,
"learning_rate": 2.52975736751375e-06,
"loss": 3.179,
"step": 84000
},
{
"epoch": 3.5462489973403133,
"eval_loss": 3.7721922397613525,
"eval_runtime": 368.2527,
"eval_samples_per_second": 514.568,
"eval_steps_per_second": 16.081,
"step": 84000
},
{
"epoch": 3.5504707223371472,
"grad_norm": 70.08448791503906,
"learning_rate": 2.5063032847442918e-06,
"loss": 3.3408,
"step": 84100
},
{
"epoch": 3.5546924473339807,
"grad_norm": 30.69338607788086,
"learning_rate": 2.482849201974834e-06,
"loss": 3.6131,
"step": 84200
},
{
"epoch": 3.558914172330814,
"grad_norm": 57.080509185791016,
"learning_rate": 2.4593951192053757e-06,
"loss": 3.2299,
"step": 84300
},
{
"epoch": 3.563135897327648,
"grad_norm": 76.74966430664062,
"learning_rate": 2.435941036435918e-06,
"loss": 3.3005,
"step": 84400
},
{
"epoch": 3.567357622324482,
"grad_norm": 33.93446731567383,
"learning_rate": 2.4124869536664597e-06,
"loss": 3.4731,
"step": 84500
},
{
"epoch": 3.567357622324482,
"eval_loss": 3.7457754611968994,
"eval_runtime": 380.3578,
"eval_samples_per_second": 498.191,
"eval_steps_per_second": 15.57,
"step": 84500
},
{
"epoch": 3.5715793473213155,
"grad_norm": 48.096961975097656,
"learning_rate": 2.3890328708970015e-06,
"loss": 3.2288,
"step": 84600
},
{
"epoch": 3.575801072318149,
"grad_norm": 107.95574188232422,
"learning_rate": 2.3655787881275437e-06,
"loss": 3.4384,
"step": 84700
},
{
"epoch": 3.580022797314983,
"grad_norm": 213.74327087402344,
"learning_rate": 2.3421247053580854e-06,
"loss": 3.6438,
"step": 84800
},
{
"epoch": 3.5842445223118164,
"grad_norm": 42.909175872802734,
"learning_rate": 2.318670622588627e-06,
"loss": 3.3293,
"step": 84900
},
{
"epoch": 3.5884662473086504,
"grad_norm": 32.89263916015625,
"learning_rate": 2.2952165398191694e-06,
"loss": 3.3555,
"step": 85000
},
{
"epoch": 3.5884662473086504,
"eval_loss": 3.7533552646636963,
"eval_runtime": 386.4534,
"eval_samples_per_second": 490.333,
"eval_steps_per_second": 15.324,
"step": 85000
},
{
"epoch": 3.592687972305484,
"grad_norm": 52.0558967590332,
"learning_rate": 2.271762457049711e-06,
"loss": 3.4791,
"step": 85100
},
{
"epoch": 3.596909697302318,
"grad_norm": 36.7622184753418,
"learning_rate": 2.248308374280253e-06,
"loss": 3.1024,
"step": 85200
},
{
"epoch": 3.6011314222991513,
"grad_norm": 37.515411376953125,
"learning_rate": 2.2248542915107947e-06,
"loss": 3.4605,
"step": 85300
},
{
"epoch": 3.605353147295985,
"grad_norm": 35.729393005371094,
"learning_rate": 2.201400208741337e-06,
"loss": 3.4317,
"step": 85400
},
{
"epoch": 3.6095748722928187,
"grad_norm": 117.54667663574219,
"learning_rate": 2.1779461259718787e-06,
"loss": 3.2913,
"step": 85500
},
{
"epoch": 3.6095748722928187,
"eval_loss": 3.703676223754883,
"eval_runtime": 374.3622,
"eval_samples_per_second": 506.17,
"eval_steps_per_second": 15.819,
"step": 85500
},
{
"epoch": 3.6137965972896526,
"grad_norm": 60.1458854675293,
"learning_rate": 2.1544920432024205e-06,
"loss": 3.3377,
"step": 85600
},
{
"epoch": 3.618018322286486,
"grad_norm": 74.54237365722656,
"learning_rate": 2.1310379604329627e-06,
"loss": 3.2746,
"step": 85700
},
{
"epoch": 3.62224004728332,
"grad_norm": 82.62213134765625,
"learning_rate": 2.1075838776635044e-06,
"loss": 3.4173,
"step": 85800
},
{
"epoch": 3.6264617722801535,
"grad_norm": 41.210052490234375,
"learning_rate": 2.0841297948940466e-06,
"loss": 3.4623,
"step": 85900
},
{
"epoch": 3.6306834972769875,
"grad_norm": 24.477449417114258,
"learning_rate": 2.0606757121245884e-06,
"loss": 3.2596,
"step": 86000
},
{
"epoch": 3.6306834972769875,
"eval_loss": 3.705226421356201,
"eval_runtime": 377.8636,
"eval_samples_per_second": 501.48,
"eval_steps_per_second": 15.672,
"step": 86000
},
{
"epoch": 3.634905222273821,
"grad_norm": 44.822235107421875,
"learning_rate": 2.03722162935513e-06,
"loss": 3.2322,
"step": 86100
},
{
"epoch": 3.639126947270655,
"grad_norm": 116.65343475341797,
"learning_rate": 2.013767546585672e-06,
"loss": 3.2082,
"step": 86200
},
{
"epoch": 3.6433486722674884,
"grad_norm": 38.853790283203125,
"learning_rate": 1.9903134638162137e-06,
"loss": 3.4993,
"step": 86300
},
{
"epoch": 3.6475703972643223,
"grad_norm": 45.420650482177734,
"learning_rate": 1.966859381046756e-06,
"loss": 3.3922,
"step": 86400
},
{
"epoch": 3.651792122261156,
"grad_norm": 57.968753814697266,
"learning_rate": 1.9434052982772977e-06,
"loss": 3.2275,
"step": 86500
},
{
"epoch": 3.651792122261156,
"eval_loss": 3.679260492324829,
"eval_runtime": 381.7795,
"eval_samples_per_second": 496.336,
"eval_steps_per_second": 15.512,
"step": 86500
},
{
"epoch": 3.6560138472579897,
"grad_norm": 44.483787536621094,
"learning_rate": 1.91995121550784e-06,
"loss": 3.3031,
"step": 86600
},
{
"epoch": 3.660235572254823,
"grad_norm": 57.32283020019531,
"learning_rate": 1.8967316735660761e-06,
"loss": 3.2876,
"step": 86700
},
{
"epoch": 3.664457297251657,
"grad_norm": 63.705631256103516,
"learning_rate": 1.8732775907966181e-06,
"loss": 3.5403,
"step": 86800
},
{
"epoch": 3.6686790222484906,
"grad_norm": 39.64052200317383,
"learning_rate": 1.8498235080271601e-06,
"loss": 3.3889,
"step": 86900
},
{
"epoch": 3.6729007472453246,
"grad_norm": 67.5136489868164,
"learning_rate": 1.8263694252577019e-06,
"loss": 3.3938,
"step": 87000
},
{
"epoch": 3.6729007472453246,
"eval_loss": 3.6927380561828613,
"eval_runtime": 390.1224,
"eval_samples_per_second": 485.722,
"eval_steps_per_second": 15.18,
"step": 87000
},
{
"epoch": 3.677122472242158,
"grad_norm": 65.40144348144531,
"learning_rate": 1.8029153424882437e-06,
"loss": 3.3169,
"step": 87100
},
{
"epoch": 3.681344197238992,
"grad_norm": 39.77958297729492,
"learning_rate": 1.7794612597187858e-06,
"loss": 3.1372,
"step": 87200
},
{
"epoch": 3.6855659222358255,
"grad_norm": 38.851959228515625,
"learning_rate": 1.7560071769493276e-06,
"loss": 3.0958,
"step": 87300
},
{
"epoch": 3.6897876472326594,
"grad_norm": 75.6167221069336,
"learning_rate": 1.7325530941798696e-06,
"loss": 3.1186,
"step": 87400
},
{
"epoch": 3.694009372229493,
"grad_norm": 61.1735725402832,
"learning_rate": 1.7090990114104114e-06,
"loss": 3.5419,
"step": 87500
},
{
"epoch": 3.694009372229493,
"eval_loss": 3.6852128505706787,
"eval_runtime": 386.9458,
"eval_samples_per_second": 489.709,
"eval_steps_per_second": 15.304,
"step": 87500
},
{
"epoch": 3.698231097226327,
"grad_norm": 66.27415466308594,
"learning_rate": 1.6856449286409531e-06,
"loss": 3.6208,
"step": 87600
},
{
"epoch": 3.7024528222231603,
"grad_norm": 75.61920928955078,
"learning_rate": 1.6621908458714953e-06,
"loss": 3.3815,
"step": 87700
},
{
"epoch": 3.7066745472199942,
"grad_norm": 77.92952728271484,
"learning_rate": 1.6387367631020371e-06,
"loss": 2.9388,
"step": 87800
},
{
"epoch": 3.7108962722168277,
"grad_norm": 47.41593551635742,
"learning_rate": 1.615282680332579e-06,
"loss": 3.2111,
"step": 87900
},
{
"epoch": 3.7151179972136616,
"grad_norm": 106.80548095703125,
"learning_rate": 1.5918285975631209e-06,
"loss": 3.4742,
"step": 88000
},
{
"epoch": 3.7151179972136616,
"eval_loss": 3.690481424331665,
"eval_runtime": 376.2552,
"eval_samples_per_second": 503.624,
"eval_steps_per_second": 15.739,
"step": 88000
},
{
"epoch": 3.719339722210495,
"grad_norm": 99.46328735351562,
"learning_rate": 1.5683745147936626e-06,
"loss": 3.2668,
"step": 88100
},
{
"epoch": 3.723561447207329,
"grad_norm": 40.66118240356445,
"learning_rate": 1.5451549728518993e-06,
"loss": 3.439,
"step": 88200
},
{
"epoch": 3.7277831722041626,
"grad_norm": 47.70708084106445,
"learning_rate": 1.5217008900824411e-06,
"loss": 3.3342,
"step": 88300
},
{
"epoch": 3.7320048972009965,
"grad_norm": 79.89665222167969,
"learning_rate": 1.498246807312983e-06,
"loss": 3.5079,
"step": 88400
},
{
"epoch": 3.73622662219783,
"grad_norm": 39.58213806152344,
"learning_rate": 1.474792724543525e-06,
"loss": 3.4446,
"step": 88500
},
{
"epoch": 3.73622662219783,
"eval_loss": 3.7126083374023438,
"eval_runtime": 378.45,
"eval_samples_per_second": 500.703,
"eval_steps_per_second": 15.648,
"step": 88500
},
{
"epoch": 3.740448347194664,
"grad_norm": 60.13318634033203,
"learning_rate": 1.4513386417740668e-06,
"loss": 3.3036,
"step": 88600
},
{
"epoch": 3.7446700721914974,
"grad_norm": 95.5160903930664,
"learning_rate": 1.4278845590046088e-06,
"loss": 3.323,
"step": 88700
},
{
"epoch": 3.748891797188331,
"grad_norm": 29.227277755737305,
"learning_rate": 1.4044304762351508e-06,
"loss": 3.2921,
"step": 88800
},
{
"epoch": 3.753113522185165,
"grad_norm": 46.99550247192383,
"learning_rate": 1.381210934293387e-06,
"loss": 3.3972,
"step": 88900
},
{
"epoch": 3.7573352471819987,
"grad_norm": 44.87736892700195,
"learning_rate": 1.3577568515239293e-06,
"loss": 3.3132,
"step": 89000
},
{
"epoch": 3.7573352471819987,
"eval_loss": 3.7031056880950928,
"eval_runtime": 387.1006,
"eval_samples_per_second": 489.514,
"eval_steps_per_second": 15.298,
"step": 89000
},
{
"epoch": 3.7615569721788322,
"grad_norm": 249.27423095703125,
"learning_rate": 1.334302768754471e-06,
"loss": 3.6181,
"step": 89100
},
{
"epoch": 3.7657786971756657,
"grad_norm": 56.87501907348633,
"learning_rate": 1.310848685985013e-06,
"loss": 3.41,
"step": 89200
},
{
"epoch": 3.7700004221724996,
"grad_norm": 30.149934768676758,
"learning_rate": 1.2873946032155548e-06,
"loss": 3.2602,
"step": 89300
},
{
"epoch": 3.7742221471693336,
"grad_norm": 81.41632080078125,
"learning_rate": 1.2639405204460966e-06,
"loss": 3.3742,
"step": 89400
},
{
"epoch": 3.778443872166167,
"grad_norm": 79.90999603271484,
"learning_rate": 1.2404864376766386e-06,
"loss": 3.2929,
"step": 89500
},
{
"epoch": 3.778443872166167,
"eval_loss": 3.7064220905303955,
"eval_runtime": 380.9556,
"eval_samples_per_second": 497.41,
"eval_steps_per_second": 15.545,
"step": 89500
},
{
"epoch": 3.7826655971630005,
"grad_norm": 86.91078186035156,
"learning_rate": 1.2170323549071805e-06,
"loss": 3.1366,
"step": 89600
},
{
"epoch": 3.7868873221598345,
"grad_norm": 55.73985290527344,
"learning_rate": 1.1935782721377225e-06,
"loss": 3.5312,
"step": 89700
},
{
"epoch": 3.7911090471566684,
"grad_norm": 47.79535675048828,
"learning_rate": 1.1701241893682643e-06,
"loss": 3.2735,
"step": 89800
},
{
"epoch": 3.795330772153502,
"grad_norm": 122.197021484375,
"learning_rate": 1.1466701065988063e-06,
"loss": 3.3797,
"step": 89900
},
{
"epoch": 3.7995524971503354,
"grad_norm": 83.0975341796875,
"learning_rate": 1.1232160238293483e-06,
"loss": 3.3003,
"step": 90000
},
{
"epoch": 3.7995524971503354,
"eval_loss": 3.713467597961426,
"eval_runtime": 391.6827,
"eval_samples_per_second": 483.787,
"eval_steps_per_second": 15.119,
"step": 90000
},
{
"epoch": 3.8037742221471693,
"grad_norm": 27.892484664916992,
"learning_rate": 1.09976194105989e-06,
"loss": 3.483,
"step": 90100
},
{
"epoch": 3.8079959471440032,
"grad_norm": 29.49579620361328,
"learning_rate": 1.076307858290432e-06,
"loss": 3.2309,
"step": 90200
},
{
"epoch": 3.8122176721408367,
"grad_norm": 48.57194900512695,
"learning_rate": 1.0528537755209738e-06,
"loss": 3.3767,
"step": 90300
},
{
"epoch": 3.81643939713767,
"grad_norm": 34.58513259887695,
"learning_rate": 1.0293996927515158e-06,
"loss": 2.8296,
"step": 90400
},
{
"epoch": 3.820661122134504,
"grad_norm": 70.50397491455078,
"learning_rate": 1.0059456099820578e-06,
"loss": 3.361,
"step": 90500
},
{
"epoch": 3.820661122134504,
"eval_loss": 3.714541435241699,
"eval_runtime": 387.3586,
"eval_samples_per_second": 489.188,
"eval_steps_per_second": 15.288,
"step": 90500
},
{
"epoch": 3.824882847131338,
"grad_norm": 49.018592834472656,
"learning_rate": 9.824915272125995e-07,
"loss": 3.3726,
"step": 90600
},
{
"epoch": 3.8291045721281716,
"grad_norm": 63.764713287353516,
"learning_rate": 9.590374444431415e-07,
"loss": 3.2925,
"step": 90700
},
{
"epoch": 3.833326297125005,
"grad_norm": 32.11079788208008,
"learning_rate": 9.355833616736834e-07,
"loss": 3.5113,
"step": 90800
},
{
"epoch": 3.837548022121839,
"grad_norm": 65.86510467529297,
"learning_rate": 9.121292789042254e-07,
"loss": 3.6037,
"step": 90900
},
{
"epoch": 3.841769747118673,
"grad_norm": 80.28820037841797,
"learning_rate": 8.886751961347673e-07,
"loss": 3.0925,
"step": 91000
},
{
"epoch": 3.841769747118673,
"eval_loss": 3.7223033905029297,
"eval_runtime": 379.9027,
"eval_samples_per_second": 498.788,
"eval_steps_per_second": 15.588,
"step": 91000
},
{
"epoch": 3.8459914721155064,
"grad_norm": 44.35951232910156,
"learning_rate": 8.652211133653092e-07,
"loss": 3.4363,
"step": 91100
},
{
"epoch": 3.85021319711234,
"grad_norm": 31.515623092651367,
"learning_rate": 8.41767030595851e-07,
"loss": 3.3181,
"step": 91200
},
{
"epoch": 3.854434922109174,
"grad_norm": 80.0140609741211,
"learning_rate": 8.183129478263929e-07,
"loss": 3.4216,
"step": 91300
},
{
"epoch": 3.8586566471060078,
"grad_norm": 32.14973068237305,
"learning_rate": 7.948588650569349e-07,
"loss": 3.1301,
"step": 91400
},
{
"epoch": 3.8628783721028412,
"grad_norm": 47.468631744384766,
"learning_rate": 7.714047822874768e-07,
"loss": 3.5791,
"step": 91500
},
{
"epoch": 3.8628783721028412,
"eval_loss": 3.714407205581665,
"eval_runtime": 386.9507,
"eval_samples_per_second": 489.703,
"eval_steps_per_second": 15.304,
"step": 91500
},
{
"epoch": 3.8671000970996747,
"grad_norm": 17.921409606933594,
"learning_rate": 7.479506995180187e-07,
"loss": 3.0492,
"step": 91600
},
{
"epoch": 3.8713218220965087,
"grad_norm": 46.39325714111328,
"learning_rate": 7.244966167485605e-07,
"loss": 3.4513,
"step": 91700
},
{
"epoch": 3.8755435470933426,
"grad_norm": 73.21107482910156,
"learning_rate": 7.010425339791024e-07,
"loss": 3.7442,
"step": 91800
},
{
"epoch": 3.879765272090176,
"grad_norm": 101.89634704589844,
"learning_rate": 6.775884512096444e-07,
"loss": 3.1566,
"step": 91900
},
{
"epoch": 3.8839869970870096,
"grad_norm": 69.1541748046875,
"learning_rate": 6.541343684401863e-07,
"loss": 3.3871,
"step": 92000
},
{
"epoch": 3.8839869970870096,
"eval_loss": 3.7024970054626465,
"eval_runtime": 386.5745,
"eval_samples_per_second": 490.18,
"eval_steps_per_second": 15.319,
"step": 92000
},
{
"epoch": 3.8882087220838435,
"grad_norm": 64.59981536865234,
"learning_rate": 6.306802856707282e-07,
"loss": 3.3478,
"step": 92100
},
{
"epoch": 3.8924304470806774,
"grad_norm": 48.25294494628906,
"learning_rate": 6.072262029012701e-07,
"loss": 3.2922,
"step": 92200
},
{
"epoch": 3.896652172077511,
"grad_norm": 33.20170974731445,
"learning_rate": 5.83772120131812e-07,
"loss": 3.0988,
"step": 92300
},
{
"epoch": 3.9008738970743444,
"grad_norm": 59.5956916809082,
"learning_rate": 5.603180373623539e-07,
"loss": 3.4383,
"step": 92400
},
{
"epoch": 3.9050956220711783,
"grad_norm": 29.315317153930664,
"learning_rate": 5.368639545928959e-07,
"loss": 3.175,
"step": 92500
},
{
"epoch": 3.9050956220711783,
"eval_loss": 3.702639579772949,
"eval_runtime": 374.6028,
"eval_samples_per_second": 505.845,
"eval_steps_per_second": 15.809,
"step": 92500
},
{
"epoch": 3.9093173470680123,
"grad_norm": 86.02645111083984,
"learning_rate": 5.134098718234377e-07,
"loss": 3.3831,
"step": 92600
},
{
"epoch": 3.9135390720648457,
"grad_norm": 53.895973205566406,
"learning_rate": 4.899557890539796e-07,
"loss": 3.3871,
"step": 92700
},
{
"epoch": 3.9177607970616792,
"grad_norm": 41.91706085205078,
"learning_rate": 4.6650170628452154e-07,
"loss": 3.5747,
"step": 92800
},
{
"epoch": 3.921982522058513,
"grad_norm": 35.26768112182617,
"learning_rate": 4.4304762351506336e-07,
"loss": 3.272,
"step": 92900
},
{
"epoch": 3.9262042470553467,
"grad_norm": 29.90506362915039,
"learning_rate": 4.195935407456053e-07,
"loss": 3.4294,
"step": 93000
},
{
"epoch": 3.9262042470553467,
"eval_loss": 3.6876306533813477,
"eval_runtime": 371.6904,
"eval_samples_per_second": 509.809,
"eval_steps_per_second": 15.933,
"step": 93000
},
{
"epoch": 3.9304259720521806,
"grad_norm": 28.437841415405273,
"learning_rate": 3.961394579761472e-07,
"loss": 3.6332,
"step": 93100
},
{
"epoch": 3.934647697049014,
"grad_norm": 40.40605163574219,
"learning_rate": 3.726853752066891e-07,
"loss": 3.626,
"step": 93200
},
{
"epoch": 3.938869422045848,
"grad_norm": 53.085670471191406,
"learning_rate": 3.4923129243723103e-07,
"loss": 3.5402,
"step": 93300
},
{
"epoch": 3.9430911470426815,
"grad_norm": 51.06597900390625,
"learning_rate": 3.2577720966777297e-07,
"loss": 3.348,
"step": 93400
},
{
"epoch": 3.9473128720395154,
"grad_norm": 48.22876739501953,
"learning_rate": 3.0232312689831484e-07,
"loss": 3.2556,
"step": 93500
},
{
"epoch": 3.9473128720395154,
"eval_loss": 3.6902225017547607,
"eval_runtime": 371.965,
"eval_samples_per_second": 509.432,
"eval_steps_per_second": 15.921,
"step": 93500
},
{
"epoch": 3.951534597036349,
"grad_norm": 43.102745056152344,
"learning_rate": 2.788690441288568e-07,
"loss": 3.5298,
"step": 93600
},
{
"epoch": 3.955756322033183,
"grad_norm": 38.394798278808594,
"learning_rate": 2.5541496135939865e-07,
"loss": 3.5247,
"step": 93700
},
{
"epoch": 3.9599780470300163,
"grad_norm": 33.49041748046875,
"learning_rate": 2.3196087858994056e-07,
"loss": 3.0763,
"step": 93800
},
{
"epoch": 3.9641997720268503,
"grad_norm": 36.5673828125,
"learning_rate": 2.0850679582048246e-07,
"loss": 3.2457,
"step": 93900
},
{
"epoch": 3.9684214970236837,
"grad_norm": 31.246381759643555,
"learning_rate": 1.8528725387871894e-07,
"loss": 3.1805,
"step": 94000
},
{
"epoch": 3.9684214970236837,
"eval_loss": 3.689732551574707,
"eval_runtime": 382.9877,
"eval_samples_per_second": 494.77,
"eval_steps_per_second": 15.463,
"step": 94000
},
{
"epoch": 3.9726432220205177,
"grad_norm": 51.14909362792969,
"learning_rate": 1.6183317110926085e-07,
"loss": 3.3959,
"step": 94100
},
{
"epoch": 3.976864947017351,
"grad_norm": 30.674938201904297,
"learning_rate": 1.3837908833980275e-07,
"loss": 3.205,
"step": 94200
},
{
"epoch": 3.981086672014185,
"grad_norm": 92.1559829711914,
"learning_rate": 1.1492500557034467e-07,
"loss": 3.3307,
"step": 94300
},
{
"epoch": 3.9853083970110186,
"grad_norm": 105.24581909179688,
"learning_rate": 9.147092280088657e-08,
"loss": 3.0448,
"step": 94400
},
{
"epoch": 3.9895301220078525,
"grad_norm": 43.431854248046875,
"learning_rate": 6.801684003142848e-08,
"loss": 3.0447,
"step": 94500
},
{
"epoch": 3.9895301220078525,
"eval_loss": 3.690552234649658,
"eval_runtime": 386.4079,
"eval_samples_per_second": 490.391,
"eval_steps_per_second": 15.326,
"step": 94500
},
{
"epoch": 3.993751847004686,
"grad_norm": 53.77738952636719,
"learning_rate": 4.4562757261970375e-08,
"loss": 3.3314,
"step": 94600
},
{
"epoch": 3.99797357200152,
"grad_norm": 63.30742645263672,
"learning_rate": 2.1108674492512286e-08,
"loss": 3.4516,
"step": 94700
}
],
"logging_steps": 100,
"max_steps": 94748,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}