{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 1, "global_step": 6834, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000292654375182909, "grad_norm": 16.360368728637695, "learning_rate": 4.999268364062043e-05, "loss": 0.6293, "step": 1 }, { "epoch": 0.000585308750365818, "grad_norm": 10.653396606445312, "learning_rate": 4.998536728124086e-05, "loss": 0.9532, "step": 2 }, { "epoch": 0.000877963125548727, "grad_norm": 4.163839817047119, "learning_rate": 4.9978050921861286e-05, "loss": 0.2345, "step": 3 }, { "epoch": 0.001170617500731636, "grad_norm": 3.368375539779663, "learning_rate": 4.9970734562481714e-05, "loss": 0.2212, "step": 4 }, { "epoch": 0.0014632718759145448, "grad_norm": 8.007204055786133, "learning_rate": 4.996341820310214e-05, "loss": 0.3395, "step": 5 }, { "epoch": 0.001755926251097454, "grad_norm": 9.681714057922363, "learning_rate": 4.995610184372256e-05, "loss": 0.4309, "step": 6 }, { "epoch": 0.002048580626280363, "grad_norm": 4.414303779602051, "learning_rate": 4.994878548434299e-05, "loss": 0.2745, "step": 7 }, { "epoch": 0.002341235001463272, "grad_norm": 3.7951767444610596, "learning_rate": 4.994146912496342e-05, "loss": 0.1274, "step": 8 }, { "epoch": 0.0026338893766461808, "grad_norm": 7.083094120025635, "learning_rate": 4.993415276558385e-05, "loss": 0.3291, "step": 9 }, { "epoch": 0.0029265437518290896, "grad_norm": 2.265326738357544, "learning_rate": 4.9926836406204275e-05, "loss": 0.1044, "step": 10 }, { "epoch": 0.003219198127011999, "grad_norm": 2.20458984375, "learning_rate": 4.99195200468247e-05, "loss": 0.1148, "step": 11 }, { "epoch": 0.003511852502194908, "grad_norm": 2.6887764930725098, "learning_rate": 4.991220368744513e-05, "loss": 0.1535, "step": 12 }, { "epoch": 0.0038045068773778167, "grad_norm": 5.465479850769043, "learning_rate": 4.990488732806556e-05, "loss": 0.2356, "step": 13 }, { "epoch": 0.004097161252560726, "grad_norm": 2.450127601623535, "learning_rate": 4.989757096868599e-05, "loss": 0.0974, "step": 14 }, { "epoch": 0.004389815627743635, "grad_norm": 5.298760890960693, "learning_rate": 4.9890254609306415e-05, "loss": 0.2099, "step": 15 }, { "epoch": 0.004682470002926544, "grad_norm": 1.242136836051941, "learning_rate": 4.9882938249926836e-05, "loss": 0.0506, "step": 16 }, { "epoch": 0.004975124378109453, "grad_norm": 3.672600269317627, "learning_rate": 4.9875621890547264e-05, "loss": 0.1305, "step": 17 }, { "epoch": 0.0052677787532923615, "grad_norm": 3.408050298690796, "learning_rate": 4.986830553116769e-05, "loss": 0.0902, "step": 18 }, { "epoch": 0.00556043312847527, "grad_norm": 9.07597541809082, "learning_rate": 4.986098917178812e-05, "loss": 0.1501, "step": 19 }, { "epoch": 0.005853087503658179, "grad_norm": 11.2225341796875, "learning_rate": 4.985367281240855e-05, "loss": 0.1522, "step": 20 }, { "epoch": 0.006145741878841089, "grad_norm": 7.519731521606445, "learning_rate": 4.9846356453028976e-05, "loss": 0.1536, "step": 21 }, { "epoch": 0.006438396254023998, "grad_norm": 10.621881484985352, "learning_rate": 4.9839040093649404e-05, "loss": 0.2526, "step": 22 }, { "epoch": 0.006731050629206907, "grad_norm": 11.814753532409668, "learning_rate": 4.983172373426983e-05, "loss": 0.2104, "step": 23 }, { "epoch": 0.007023705004389816, "grad_norm": 2.594482421875, "learning_rate": 4.982440737489026e-05, "loss": 0.0617, "step": 24 }, { "epoch": 0.0073163593795727245, "grad_norm": 2.499378204345703, "learning_rate": 4.981709101551069e-05, "loss": 0.0613, "step": 25 }, { "epoch": 0.007609013754755633, "grad_norm": 3.145395278930664, "learning_rate": 4.9809774656131116e-05, "loss": 0.0386, "step": 26 }, { "epoch": 0.007901668129938543, "grad_norm": 4.624070167541504, "learning_rate": 4.980245829675154e-05, "loss": 0.1326, "step": 27 }, { "epoch": 0.008194322505121452, "grad_norm": 9.242585182189941, "learning_rate": 4.9795141937371965e-05, "loss": 0.154, "step": 28 }, { "epoch": 0.008486976880304361, "grad_norm": 0.6491050720214844, "learning_rate": 4.978782557799239e-05, "loss": 0.0038, "step": 29 }, { "epoch": 0.00877963125548727, "grad_norm": 4.768040180206299, "learning_rate": 4.978050921861282e-05, "loss": 0.1239, "step": 30 }, { "epoch": 0.009072285630670179, "grad_norm": 5.549623966217041, "learning_rate": 4.977319285923325e-05, "loss": 0.267, "step": 31 }, { "epoch": 0.009364940005853088, "grad_norm": 7.482322692871094, "learning_rate": 4.9765876499853676e-05, "loss": 0.2245, "step": 32 }, { "epoch": 0.009657594381035996, "grad_norm": 11.393575668334961, "learning_rate": 4.9758560140474104e-05, "loss": 0.2126, "step": 33 }, { "epoch": 0.009950248756218905, "grad_norm": 22.630184173583984, "learning_rate": 4.975124378109453e-05, "loss": 0.6228, "step": 34 }, { "epoch": 0.010242903131401814, "grad_norm": 17.684167861938477, "learning_rate": 4.974392742171496e-05, "loss": 0.4424, "step": 35 }, { "epoch": 0.010535557506584723, "grad_norm": 5.006700038909912, "learning_rate": 4.973661106233539e-05, "loss": 0.1162, "step": 36 }, { "epoch": 0.010828211881767632, "grad_norm": 8.332120895385742, "learning_rate": 4.9729294702955816e-05, "loss": 0.2701, "step": 37 }, { "epoch": 0.01112086625695054, "grad_norm": 5.902966022491455, "learning_rate": 4.972197834357624e-05, "loss": 0.1282, "step": 38 }, { "epoch": 0.01141352063213345, "grad_norm": 3.1011571884155273, "learning_rate": 4.9714661984196665e-05, "loss": 0.0795, "step": 39 }, { "epoch": 0.011706175007316359, "grad_norm": 4.028739929199219, "learning_rate": 4.970734562481709e-05, "loss": 0.1002, "step": 40 }, { "epoch": 0.01199882938249927, "grad_norm": 4.598829746246338, "learning_rate": 4.970002926543752e-05, "loss": 0.1702, "step": 41 }, { "epoch": 0.012291483757682178, "grad_norm": 10.465580940246582, "learning_rate": 4.969271290605795e-05, "loss": 0.2358, "step": 42 }, { "epoch": 0.012584138132865087, "grad_norm": 16.530763626098633, "learning_rate": 4.968539654667838e-05, "loss": 0.3765, "step": 43 }, { "epoch": 0.012876792508047996, "grad_norm": 11.63476276397705, "learning_rate": 4.9678080187298805e-05, "loss": 0.259, "step": 44 }, { "epoch": 0.013169446883230905, "grad_norm": 7.477903366088867, "learning_rate": 4.967076382791923e-05, "loss": 0.1471, "step": 45 }, { "epoch": 0.013462101258413814, "grad_norm": 1.3951044082641602, "learning_rate": 4.966344746853966e-05, "loss": 0.055, "step": 46 }, { "epoch": 0.013754755633596722, "grad_norm": 5.2689595222473145, "learning_rate": 4.965613110916009e-05, "loss": 0.0843, "step": 47 }, { "epoch": 0.014047410008779631, "grad_norm": 1.5146514177322388, "learning_rate": 4.964881474978051e-05, "loss": 0.0214, "step": 48 }, { "epoch": 0.01434006438396254, "grad_norm": 1.2873971462249756, "learning_rate": 4.964149839040094e-05, "loss": 0.0206, "step": 49 }, { "epoch": 0.014632718759145449, "grad_norm": 2.304189920425415, "learning_rate": 4.9634182031021366e-05, "loss": 0.0219, "step": 50 }, { "epoch": 0.014925373134328358, "grad_norm": 3.5825271606445312, "learning_rate": 4.9626865671641794e-05, "loss": 0.0284, "step": 51 }, { "epoch": 0.015218027509511267, "grad_norm": 7.846745491027832, "learning_rate": 4.961954931226222e-05, "loss": 0.1349, "step": 52 }, { "epoch": 0.015510681884694176, "grad_norm": 0.15582777559757233, "learning_rate": 4.961223295288265e-05, "loss": 0.0021, "step": 53 }, { "epoch": 0.015803336259877086, "grad_norm": 0.20206129550933838, "learning_rate": 4.960491659350308e-05, "loss": 0.0023, "step": 54 }, { "epoch": 0.016095990635059995, "grad_norm": 8.418569564819336, "learning_rate": 4.9597600234123506e-05, "loss": 0.0311, "step": 55 }, { "epoch": 0.016388645010242904, "grad_norm": 0.4458349347114563, "learning_rate": 4.9590283874743934e-05, "loss": 0.0015, "step": 56 }, { "epoch": 0.016681299385425813, "grad_norm": 15.10268497467041, "learning_rate": 4.958296751536436e-05, "loss": 0.1544, "step": 57 }, { "epoch": 0.016973953760608722, "grad_norm": 3.0525522232055664, "learning_rate": 4.957565115598479e-05, "loss": 0.0124, "step": 58 }, { "epoch": 0.01726660813579163, "grad_norm": 0.11207275837659836, "learning_rate": 4.956833479660521e-05, "loss": 0.0007, "step": 59 }, { "epoch": 0.01755926251097454, "grad_norm": 4.801476001739502, "learning_rate": 4.956101843722564e-05, "loss": 0.0091, "step": 60 }, { "epoch": 0.01785191688615745, "grad_norm": 0.045167919248342514, "learning_rate": 4.955370207784607e-05, "loss": 0.0004, "step": 61 }, { "epoch": 0.018144571261340357, "grad_norm": 12.306194305419922, "learning_rate": 4.9546385718466495e-05, "loss": 0.0526, "step": 62 }, { "epoch": 0.018437225636523266, "grad_norm": 0.4786720871925354, "learning_rate": 4.953906935908692e-05, "loss": 0.0019, "step": 63 }, { "epoch": 0.018729880011706175, "grad_norm": 6.309378623962402, "learning_rate": 4.953175299970735e-05, "loss": 0.1884, "step": 64 }, { "epoch": 0.019022534386889084, "grad_norm": 3.3473260402679443, "learning_rate": 4.952443664032778e-05, "loss": 0.229, "step": 65 }, { "epoch": 0.019315188762071993, "grad_norm": 11.742388725280762, "learning_rate": 4.9517120280948206e-05, "loss": 0.3438, "step": 66 }, { "epoch": 0.0196078431372549, "grad_norm": 7.495570659637451, "learning_rate": 4.9509803921568634e-05, "loss": 0.0878, "step": 67 }, { "epoch": 0.01990049751243781, "grad_norm": 0.14235353469848633, "learning_rate": 4.950248756218906e-05, "loss": 0.002, "step": 68 }, { "epoch": 0.02019315188762072, "grad_norm": 2.739047050476074, "learning_rate": 4.9495171202809484e-05, "loss": 0.0183, "step": 69 }, { "epoch": 0.02048580626280363, "grad_norm": 2.6751646995544434, "learning_rate": 4.948785484342991e-05, "loss": 0.0417, "step": 70 }, { "epoch": 0.020778460637986537, "grad_norm": 4.878410339355469, "learning_rate": 4.948053848405034e-05, "loss": 0.0622, "step": 71 }, { "epoch": 0.021071115013169446, "grad_norm": 3.525151014328003, "learning_rate": 4.947322212467077e-05, "loss": 0.0839, "step": 72 }, { "epoch": 0.021363769388352355, "grad_norm": 0.149206280708313, "learning_rate": 4.9465905765291195e-05, "loss": 0.0018, "step": 73 }, { "epoch": 0.021656423763535264, "grad_norm": 0.9150102734565735, "learning_rate": 4.945858940591162e-05, "loss": 0.0048, "step": 74 }, { "epoch": 0.021949078138718173, "grad_norm": 0.07271959632635117, "learning_rate": 4.945127304653205e-05, "loss": 0.0014, "step": 75 }, { "epoch": 0.02224173251390108, "grad_norm": 7.08049201965332, "learning_rate": 4.944395668715248e-05, "loss": 0.058, "step": 76 }, { "epoch": 0.02253438688908399, "grad_norm": 3.4751858711242676, "learning_rate": 4.943664032777291e-05, "loss": 0.0364, "step": 77 }, { "epoch": 0.0228270412642669, "grad_norm": 7.454194068908691, "learning_rate": 4.9429323968393335e-05, "loss": 0.0815, "step": 78 }, { "epoch": 0.023119695639449808, "grad_norm": 1.007652759552002, "learning_rate": 4.9422007609013756e-05, "loss": 0.0064, "step": 79 }, { "epoch": 0.023412350014632717, "grad_norm": 4.862194061279297, "learning_rate": 4.9414691249634184e-05, "loss": 0.0276, "step": 80 }, { "epoch": 0.02370500438981563, "grad_norm": 13.2680082321167, "learning_rate": 4.940737489025461e-05, "loss": 0.3092, "step": 81 }, { "epoch": 0.02399765876499854, "grad_norm": 14.458672523498535, "learning_rate": 4.940005853087504e-05, "loss": 0.1856, "step": 82 }, { "epoch": 0.024290313140181447, "grad_norm": 11.084071159362793, "learning_rate": 4.939274217149547e-05, "loss": 0.2517, "step": 83 }, { "epoch": 0.024582967515364356, "grad_norm": 11.786449432373047, "learning_rate": 4.9385425812115896e-05, "loss": 0.2811, "step": 84 }, { "epoch": 0.024875621890547265, "grad_norm": 3.7863101959228516, "learning_rate": 4.9378109452736324e-05, "loss": 0.1465, "step": 85 }, { "epoch": 0.025168276265730174, "grad_norm": 3.508310556411743, "learning_rate": 4.937079309335675e-05, "loss": 0.0545, "step": 86 }, { "epoch": 0.025460930640913083, "grad_norm": 4.037002086639404, "learning_rate": 4.936347673397717e-05, "loss": 0.1531, "step": 87 }, { "epoch": 0.02575358501609599, "grad_norm": 15.45694351196289, "learning_rate": 4.93561603745976e-05, "loss": 0.4562, "step": 88 }, { "epoch": 0.0260462393912789, "grad_norm": 5.105620384216309, "learning_rate": 4.934884401521803e-05, "loss": 0.0803, "step": 89 }, { "epoch": 0.02633889376646181, "grad_norm": 3.120607852935791, "learning_rate": 4.934152765583846e-05, "loss": 0.037, "step": 90 }, { "epoch": 0.026631548141644718, "grad_norm": 13.44425106048584, "learning_rate": 4.9334211296458885e-05, "loss": 0.2523, "step": 91 }, { "epoch": 0.026924202516827627, "grad_norm": 8.555777549743652, "learning_rate": 4.932689493707931e-05, "loss": 0.1941, "step": 92 }, { "epoch": 0.027216856892010536, "grad_norm": 4.331564903259277, "learning_rate": 4.931957857769974e-05, "loss": 0.0826, "step": 93 }, { "epoch": 0.027509511267193445, "grad_norm": 4.442009449005127, "learning_rate": 4.931226221832017e-05, "loss": 0.0389, "step": 94 }, { "epoch": 0.027802165642376354, "grad_norm": 6.328910827636719, "learning_rate": 4.930494585894059e-05, "loss": 0.0539, "step": 95 }, { "epoch": 0.028094820017559263, "grad_norm": 7.026902675628662, "learning_rate": 4.929762949956102e-05, "loss": 0.0754, "step": 96 }, { "epoch": 0.02838747439274217, "grad_norm": 1.1635509729385376, "learning_rate": 4.9290313140181446e-05, "loss": 0.0121, "step": 97 }, { "epoch": 0.02868012876792508, "grad_norm": 1.090627908706665, "learning_rate": 4.9282996780801874e-05, "loss": 0.0102, "step": 98 }, { "epoch": 0.02897278314310799, "grad_norm": 7.56167459487915, "learning_rate": 4.92756804214223e-05, "loss": 0.2045, "step": 99 }, { "epoch": 0.029265437518290898, "grad_norm": 4.7205281257629395, "learning_rate": 4.926836406204273e-05, "loss": 0.0799, "step": 100 }, { "epoch": 0.029558091893473807, "grad_norm": 0.6290732026100159, "learning_rate": 4.926104770266316e-05, "loss": 0.0065, "step": 101 }, { "epoch": 0.029850746268656716, "grad_norm": 8.050834655761719, "learning_rate": 4.9253731343283586e-05, "loss": 0.1108, "step": 102 }, { "epoch": 0.030143400643839625, "grad_norm": 0.0645279660820961, "learning_rate": 4.924641498390401e-05, "loss": 0.0013, "step": 103 }, { "epoch": 0.030436055019022534, "grad_norm": 0.08164330571889877, "learning_rate": 4.9239098624524435e-05, "loss": 0.0014, "step": 104 }, { "epoch": 0.030728709394205442, "grad_norm": 1.1083030700683594, "learning_rate": 4.923178226514486e-05, "loss": 0.0069, "step": 105 }, { "epoch": 0.03102136376938835, "grad_norm": 0.28905317187309265, "learning_rate": 4.922446590576529e-05, "loss": 0.0047, "step": 106 }, { "epoch": 0.031314018144571264, "grad_norm": 0.8381115198135376, "learning_rate": 4.921714954638572e-05, "loss": 0.0054, "step": 107 }, { "epoch": 0.03160667251975417, "grad_norm": 5.30599308013916, "learning_rate": 4.9209833187006146e-05, "loss": 0.0412, "step": 108 }, { "epoch": 0.03189932689493708, "grad_norm": 1.0647958517074585, "learning_rate": 4.9202516827626574e-05, "loss": 0.0059, "step": 109 }, { "epoch": 0.03219198127011999, "grad_norm": 0.035897109657526016, "learning_rate": 4.9195200468247e-05, "loss": 0.0007, "step": 110 }, { "epoch": 0.0324846356453029, "grad_norm": 0.42506933212280273, "learning_rate": 4.918788410886743e-05, "loss": 0.0018, "step": 111 }, { "epoch": 0.03277729002048581, "grad_norm": 0.009726514108479023, "learning_rate": 4.918056774948785e-05, "loss": 0.0001, "step": 112 }, { "epoch": 0.03306994439566872, "grad_norm": 0.08547463268041611, "learning_rate": 4.917325139010828e-05, "loss": 0.0008, "step": 113 }, { "epoch": 0.033362598770851626, "grad_norm": 0.010455029085278511, "learning_rate": 4.916593503072871e-05, "loss": 0.0002, "step": 114 }, { "epoch": 0.033655253146034535, "grad_norm": 0.019560284912586212, "learning_rate": 4.9158618671349135e-05, "loss": 0.0003, "step": 115 }, { "epoch": 0.033947907521217444, "grad_norm": 0.008105482906103134, "learning_rate": 4.915130231196956e-05, "loss": 0.0001, "step": 116 }, { "epoch": 0.03424056189640035, "grad_norm": 0.03059670701622963, "learning_rate": 4.914398595258999e-05, "loss": 0.0003, "step": 117 }, { "epoch": 0.03453321627158326, "grad_norm": 10.641276359558105, "learning_rate": 4.913666959321042e-05, "loss": 0.27, "step": 118 }, { "epoch": 0.03482587064676617, "grad_norm": 6.722626686096191, "learning_rate": 4.912935323383085e-05, "loss": 0.0246, "step": 119 }, { "epoch": 0.03511852502194908, "grad_norm": 0.016005797311663628, "learning_rate": 4.9122036874451275e-05, "loss": 0.0002, "step": 120 }, { "epoch": 0.03541117939713199, "grad_norm": 3.1229820251464844, "learning_rate": 4.91147205150717e-05, "loss": 0.0201, "step": 121 }, { "epoch": 0.0357038337723149, "grad_norm": 0.020100802183151245, "learning_rate": 4.910740415569213e-05, "loss": 0.0002, "step": 122 }, { "epoch": 0.035996488147497806, "grad_norm": 0.1265154480934143, "learning_rate": 4.910008779631255e-05, "loss": 0.0008, "step": 123 }, { "epoch": 0.036289142522680715, "grad_norm": 0.003131201257929206, "learning_rate": 4.909277143693298e-05, "loss": 0.0001, "step": 124 }, { "epoch": 0.036581796897863623, "grad_norm": 14.657594680786133, "learning_rate": 4.908545507755341e-05, "loss": 0.0794, "step": 125 }, { "epoch": 0.03687445127304653, "grad_norm": 13.732138633728027, "learning_rate": 4.9078138718173836e-05, "loss": 0.1962, "step": 126 }, { "epoch": 0.03716710564822944, "grad_norm": 0.01592232845723629, "learning_rate": 4.9070822358794264e-05, "loss": 0.0003, "step": 127 }, { "epoch": 0.03745976002341235, "grad_norm": 9.321319580078125, "learning_rate": 4.906350599941469e-05, "loss": 0.079, "step": 128 }, { "epoch": 0.03775241439859526, "grad_norm": 0.34992310404777527, "learning_rate": 4.905618964003512e-05, "loss": 0.0014, "step": 129 }, { "epoch": 0.03804506877377817, "grad_norm": 11.185626029968262, "learning_rate": 4.904887328065555e-05, "loss": 0.1738, "step": 130 }, { "epoch": 0.03833772314896108, "grad_norm": 0.4017196297645569, "learning_rate": 4.9041556921275976e-05, "loss": 0.0015, "step": 131 }, { "epoch": 0.038630377524143986, "grad_norm": 9.26835823059082, "learning_rate": 4.9034240561896404e-05, "loss": 0.1436, "step": 132 }, { "epoch": 0.038923031899326895, "grad_norm": 0.014303785748779774, "learning_rate": 4.9026924202516825e-05, "loss": 0.0002, "step": 133 }, { "epoch": 0.0392156862745098, "grad_norm": 15.018467903137207, "learning_rate": 4.901960784313725e-05, "loss": 0.2495, "step": 134 }, { "epoch": 0.03950834064969271, "grad_norm": 17.677291870117188, "learning_rate": 4.901229148375768e-05, "loss": 0.5083, "step": 135 }, { "epoch": 0.03980099502487562, "grad_norm": 26.62687110900879, "learning_rate": 4.900497512437811e-05, "loss": 0.9146, "step": 136 }, { "epoch": 0.04009364940005853, "grad_norm": 1.2822232246398926, "learning_rate": 4.899765876499854e-05, "loss": 0.0047, "step": 137 }, { "epoch": 0.04038630377524144, "grad_norm": 0.004942530766129494, "learning_rate": 4.8990342405618965e-05, "loss": 0.0001, "step": 138 }, { "epoch": 0.04067895815042435, "grad_norm": 0.04992402717471123, "learning_rate": 4.898302604623939e-05, "loss": 0.0007, "step": 139 }, { "epoch": 0.04097161252560726, "grad_norm": 7.602818965911865, "learning_rate": 4.897570968685982e-05, "loss": 0.1186, "step": 140 }, { "epoch": 0.041264266900790166, "grad_norm": 5.089090347290039, "learning_rate": 4.896839332748025e-05, "loss": 0.0466, "step": 141 }, { "epoch": 0.041556921275973074, "grad_norm": 8.595590591430664, "learning_rate": 4.8961076968100676e-05, "loss": 0.1921, "step": 142 }, { "epoch": 0.04184957565115598, "grad_norm": 0.10725227743387222, "learning_rate": 4.8953760608721104e-05, "loss": 0.0013, "step": 143 }, { "epoch": 0.04214223002633889, "grad_norm": 0.0959903746843338, "learning_rate": 4.8946444249341526e-05, "loss": 0.0014, "step": 144 }, { "epoch": 0.0424348844015218, "grad_norm": 3.4017302989959717, "learning_rate": 4.8939127889961954e-05, "loss": 0.0167, "step": 145 }, { "epoch": 0.04272753877670471, "grad_norm": 1.009041666984558, "learning_rate": 4.893181153058238e-05, "loss": 0.0094, "step": 146 }, { "epoch": 0.04302019315188762, "grad_norm": 18.60599136352539, "learning_rate": 4.892449517120281e-05, "loss": 0.5468, "step": 147 }, { "epoch": 0.04331284752707053, "grad_norm": 8.110335350036621, "learning_rate": 4.891717881182324e-05, "loss": 0.3326, "step": 148 }, { "epoch": 0.043605501902253437, "grad_norm": 11.047513008117676, "learning_rate": 4.8909862452443665e-05, "loss": 0.1789, "step": 149 }, { "epoch": 0.043898156277436345, "grad_norm": 2.181462049484253, "learning_rate": 4.890254609306409e-05, "loss": 0.0298, "step": 150 }, { "epoch": 0.044190810652619254, "grad_norm": 0.3789248466491699, "learning_rate": 4.889522973368452e-05, "loss": 0.0063, "step": 151 }, { "epoch": 0.04448346502780216, "grad_norm": 3.644010305404663, "learning_rate": 4.888791337430495e-05, "loss": 0.0446, "step": 152 }, { "epoch": 0.04477611940298507, "grad_norm": 1.0178574323654175, "learning_rate": 4.888059701492538e-05, "loss": 0.012, "step": 153 }, { "epoch": 0.04506877377816798, "grad_norm": 1.6284925937652588, "learning_rate": 4.88732806555458e-05, "loss": 0.0119, "step": 154 }, { "epoch": 0.04536142815335089, "grad_norm": 0.4712274968624115, "learning_rate": 4.8865964296166226e-05, "loss": 0.0052, "step": 155 }, { "epoch": 0.0456540825285338, "grad_norm": 4.255475044250488, "learning_rate": 4.8858647936786654e-05, "loss": 0.0983, "step": 156 }, { "epoch": 0.04594673690371671, "grad_norm": 2.3100857734680176, "learning_rate": 4.885133157740708e-05, "loss": 0.0124, "step": 157 }, { "epoch": 0.046239391278899616, "grad_norm": 10.108885765075684, "learning_rate": 4.884401521802751e-05, "loss": 0.1775, "step": 158 }, { "epoch": 0.046532045654082525, "grad_norm": 0.29219287633895874, "learning_rate": 4.883669885864794e-05, "loss": 0.0025, "step": 159 }, { "epoch": 0.046824700029265434, "grad_norm": 8.007964134216309, "learning_rate": 4.8829382499268366e-05, "loss": 0.17, "step": 160 }, { "epoch": 0.04711735440444834, "grad_norm": 3.072378635406494, "learning_rate": 4.8822066139888794e-05, "loss": 0.0224, "step": 161 }, { "epoch": 0.04741000877963126, "grad_norm": 0.2821565270423889, "learning_rate": 4.881474978050922e-05, "loss": 0.0024, "step": 162 }, { "epoch": 0.04770266315481417, "grad_norm": 0.0806671753525734, "learning_rate": 4.880743342112965e-05, "loss": 0.0013, "step": 163 }, { "epoch": 0.04799531752999708, "grad_norm": 3.4239866733551025, "learning_rate": 4.880011706175008e-05, "loss": 0.1401, "step": 164 }, { "epoch": 0.048287971905179985, "grad_norm": 8.500259399414062, "learning_rate": 4.87928007023705e-05, "loss": 0.0536, "step": 165 }, { "epoch": 0.048580626280362894, "grad_norm": 11.746143341064453, "learning_rate": 4.878548434299093e-05, "loss": 0.2361, "step": 166 }, { "epoch": 0.0488732806555458, "grad_norm": 5.371679306030273, "learning_rate": 4.8778167983611355e-05, "loss": 0.0422, "step": 167 }, { "epoch": 0.04916593503072871, "grad_norm": 3.1472370624542236, "learning_rate": 4.877085162423178e-05, "loss": 0.0248, "step": 168 }, { "epoch": 0.04945858940591162, "grad_norm": 1.7468245029449463, "learning_rate": 4.876353526485221e-05, "loss": 0.0133, "step": 169 }, { "epoch": 0.04975124378109453, "grad_norm": 0.6838405132293701, "learning_rate": 4.875621890547264e-05, "loss": 0.0065, "step": 170 }, { "epoch": 0.05004389815627744, "grad_norm": 0.05950434133410454, "learning_rate": 4.874890254609307e-05, "loss": 0.0011, "step": 171 }, { "epoch": 0.05033655253146035, "grad_norm": 0.057430557906627655, "learning_rate": 4.8741586186713495e-05, "loss": 0.0007, "step": 172 }, { "epoch": 0.050629206906643257, "grad_norm": 0.7738732695579529, "learning_rate": 4.873426982733392e-05, "loss": 0.008, "step": 173 }, { "epoch": 0.050921861281826165, "grad_norm": 0.2586875855922699, "learning_rate": 4.872695346795435e-05, "loss": 0.0016, "step": 174 }, { "epoch": 0.051214515657009074, "grad_norm": 0.33069998025894165, "learning_rate": 4.871963710857478e-05, "loss": 0.0025, "step": 175 }, { "epoch": 0.05150717003219198, "grad_norm": 0.6992021799087524, "learning_rate": 4.87123207491952e-05, "loss": 0.0044, "step": 176 }, { "epoch": 0.05179982440737489, "grad_norm": 4.2504987716674805, "learning_rate": 4.870500438981563e-05, "loss": 0.1317, "step": 177 }, { "epoch": 0.0520924787825578, "grad_norm": 5.421692848205566, "learning_rate": 4.8697688030436056e-05, "loss": 0.0209, "step": 178 }, { "epoch": 0.05238513315774071, "grad_norm": 0.11863663792610168, "learning_rate": 4.8690371671056484e-05, "loss": 0.0016, "step": 179 }, { "epoch": 0.05267778753292362, "grad_norm": 11.441097259521484, "learning_rate": 4.868305531167691e-05, "loss": 0.0851, "step": 180 }, { "epoch": 0.05297044190810653, "grad_norm": 0.1788787990808487, "learning_rate": 4.867573895229734e-05, "loss": 0.0018, "step": 181 }, { "epoch": 0.053263096283289436, "grad_norm": 3.151461362838745, "learning_rate": 4.866842259291777e-05, "loss": 0.0311, "step": 182 }, { "epoch": 0.053555750658472345, "grad_norm": 0.7903239130973816, "learning_rate": 4.8661106233538195e-05, "loss": 0.0074, "step": 183 }, { "epoch": 0.053848405033655254, "grad_norm": 0.18016786873340607, "learning_rate": 4.865378987415862e-05, "loss": 0.0021, "step": 184 }, { "epoch": 0.05414105940883816, "grad_norm": 0.02187931537628174, "learning_rate": 4.864647351477905e-05, "loss": 0.0003, "step": 185 }, { "epoch": 0.05443371378402107, "grad_norm": 0.025294115766882896, "learning_rate": 4.863915715539947e-05, "loss": 0.0004, "step": 186 }, { "epoch": 0.05472636815920398, "grad_norm": 0.031194772571325302, "learning_rate": 4.86318407960199e-05, "loss": 0.0002, "step": 187 }, { "epoch": 0.05501902253438689, "grad_norm": 1.7829680442810059, "learning_rate": 4.862452443664033e-05, "loss": 0.0045, "step": 188 }, { "epoch": 0.0553116769095698, "grad_norm": 0.01780118979513645, "learning_rate": 4.8617208077260756e-05, "loss": 0.0002, "step": 189 }, { "epoch": 0.05560433128475271, "grad_norm": 0.2539866864681244, "learning_rate": 4.8609891717881184e-05, "loss": 0.0012, "step": 190 }, { "epoch": 0.055896985659935616, "grad_norm": 0.10223159939050674, "learning_rate": 4.860257535850161e-05, "loss": 0.0004, "step": 191 }, { "epoch": 0.056189640035118525, "grad_norm": 0.012053336016833782, "learning_rate": 4.859525899912204e-05, "loss": 0.0001, "step": 192 }, { "epoch": 0.056482294410301434, "grad_norm": 1.3330971002578735, "learning_rate": 4.858794263974247e-05, "loss": 0.0059, "step": 193 }, { "epoch": 0.05677494878548434, "grad_norm": 0.20015238225460052, "learning_rate": 4.8580626280362896e-05, "loss": 0.0008, "step": 194 }, { "epoch": 0.05706760316066725, "grad_norm": 0.0025256345979869366, "learning_rate": 4.8573309920983324e-05, "loss": 0.0001, "step": 195 }, { "epoch": 0.05736025753585016, "grad_norm": 0.008162226527929306, "learning_rate": 4.856599356160375e-05, "loss": 0.0001, "step": 196 }, { "epoch": 0.05765291191103307, "grad_norm": 0.002979808719828725, "learning_rate": 4.855867720222417e-05, "loss": 0.0, "step": 197 }, { "epoch": 0.05794556628621598, "grad_norm": 11.365145683288574, "learning_rate": 4.85513608428446e-05, "loss": 0.0807, "step": 198 }, { "epoch": 0.05823822066139889, "grad_norm": 3.422775983810425, "learning_rate": 4.854404448346503e-05, "loss": 0.0053, "step": 199 }, { "epoch": 0.058530875036581796, "grad_norm": 0.18176983296871185, "learning_rate": 4.853672812408546e-05, "loss": 0.0003, "step": 200 }, { "epoch": 0.058823529411764705, "grad_norm": 0.019310927018523216, "learning_rate": 4.8529411764705885e-05, "loss": 0.0001, "step": 201 }, { "epoch": 0.059116183786947614, "grad_norm": 0.0014688526280224323, "learning_rate": 4.852209540532631e-05, "loss": 0.0, "step": 202 }, { "epoch": 0.05940883816213052, "grad_norm": 11.9734468460083, "learning_rate": 4.851477904594674e-05, "loss": 0.0195, "step": 203 }, { "epoch": 0.05970149253731343, "grad_norm": 0.0035654143430292606, "learning_rate": 4.850746268656717e-05, "loss": 0.0001, "step": 204 }, { "epoch": 0.05999414691249634, "grad_norm": 0.23448404669761658, "learning_rate": 4.85001463271876e-05, "loss": 0.0011, "step": 205 }, { "epoch": 0.06028680128767925, "grad_norm": 0.01675930805504322, "learning_rate": 4.8492829967808025e-05, "loss": 0.0001, "step": 206 }, { "epoch": 0.06057945566286216, "grad_norm": 0.026205426082015038, "learning_rate": 4.8485513608428446e-05, "loss": 0.0001, "step": 207 }, { "epoch": 0.06087211003804507, "grad_norm": 0.7105171084403992, "learning_rate": 4.8478197249048874e-05, "loss": 0.0013, "step": 208 }, { "epoch": 0.061164764413227976, "grad_norm": 28.973526000976562, "learning_rate": 4.84708808896693e-05, "loss": 0.0618, "step": 209 }, { "epoch": 0.061457418788410885, "grad_norm": 8.843789100646973, "learning_rate": 4.846356453028973e-05, "loss": 0.13, "step": 210 }, { "epoch": 0.061750073163593794, "grad_norm": 0.002778812311589718, "learning_rate": 4.845624817091016e-05, "loss": 0.0, "step": 211 }, { "epoch": 0.0620427275387767, "grad_norm": 0.0018273483728989959, "learning_rate": 4.8448931811530586e-05, "loss": 0.0, "step": 212 }, { "epoch": 0.06233538191395961, "grad_norm": 0.0318334624171257, "learning_rate": 4.8441615452151014e-05, "loss": 0.0002, "step": 213 }, { "epoch": 0.06262803628914253, "grad_norm": 0.043673787266016006, "learning_rate": 4.843429909277144e-05, "loss": 0.0002, "step": 214 }, { "epoch": 0.06292069066432543, "grad_norm": 0.023164449259638786, "learning_rate": 4.842698273339187e-05, "loss": 0.0002, "step": 215 }, { "epoch": 0.06321334503950835, "grad_norm": 0.02091466635465622, "learning_rate": 4.84196663740123e-05, "loss": 0.0001, "step": 216 }, { "epoch": 0.06350599941469125, "grad_norm": 3.7927169799804688, "learning_rate": 4.8412350014632725e-05, "loss": 0.0116, "step": 217 }, { "epoch": 0.06379865378987416, "grad_norm": 0.012715407647192478, "learning_rate": 4.8405033655253146e-05, "loss": 0.0001, "step": 218 }, { "epoch": 0.06409130816505706, "grad_norm": 0.008732621558010578, "learning_rate": 4.8397717295873574e-05, "loss": 0.0001, "step": 219 }, { "epoch": 0.06438396254023998, "grad_norm": 13.74286937713623, "learning_rate": 4.8390400936494e-05, "loss": 0.1818, "step": 220 }, { "epoch": 0.06467661691542288, "grad_norm": 0.00698635121807456, "learning_rate": 4.838308457711443e-05, "loss": 0.0, "step": 221 }, { "epoch": 0.0649692712906058, "grad_norm": 13.672441482543945, "learning_rate": 4.837576821773486e-05, "loss": 0.0313, "step": 222 }, { "epoch": 0.0652619256657887, "grad_norm": 0.004722634330391884, "learning_rate": 4.8368451858355286e-05, "loss": 0.0, "step": 223 }, { "epoch": 0.06555458004097162, "grad_norm": 0.017278265208005905, "learning_rate": 4.8361135498975714e-05, "loss": 0.0001, "step": 224 }, { "epoch": 0.06584723441615452, "grad_norm": 0.5335111021995544, "learning_rate": 4.835381913959614e-05, "loss": 0.002, "step": 225 }, { "epoch": 0.06613988879133743, "grad_norm": 0.0020765329245477915, "learning_rate": 4.834650278021657e-05, "loss": 0.0, "step": 226 }, { "epoch": 0.06643254316652034, "grad_norm": 0.004643509164452553, "learning_rate": 4.8339186420837e-05, "loss": 0.0, "step": 227 }, { "epoch": 0.06672519754170325, "grad_norm": 5.808701992034912, "learning_rate": 4.8331870061457426e-05, "loss": 0.2854, "step": 228 }, { "epoch": 0.06701785191688615, "grad_norm": 0.0037125300150364637, "learning_rate": 4.832455370207785e-05, "loss": 0.0, "step": 229 }, { "epoch": 0.06731050629206907, "grad_norm": 0.017818965017795563, "learning_rate": 4.8317237342698275e-05, "loss": 0.0002, "step": 230 }, { "epoch": 0.06760316066725197, "grad_norm": 0.001532245078124106, "learning_rate": 4.83099209833187e-05, "loss": 0.0, "step": 231 }, { "epoch": 0.06789581504243489, "grad_norm": 0.17457996308803558, "learning_rate": 4.830260462393913e-05, "loss": 0.0007, "step": 232 }, { "epoch": 0.06818846941761779, "grad_norm": 8.137628555297852, "learning_rate": 4.829528826455956e-05, "loss": 0.2625, "step": 233 }, { "epoch": 0.0684811237928007, "grad_norm": 0.002676829230040312, "learning_rate": 4.828797190517999e-05, "loss": 0.0, "step": 234 }, { "epoch": 0.0687737781679836, "grad_norm": 0.7786100506782532, "learning_rate": 4.8280655545800415e-05, "loss": 0.0022, "step": 235 }, { "epoch": 0.06906643254316652, "grad_norm": 0.0047471970319747925, "learning_rate": 4.827333918642084e-05, "loss": 0.0, "step": 236 }, { "epoch": 0.06935908691834942, "grad_norm": 0.0006819628761149943, "learning_rate": 4.826602282704127e-05, "loss": 0.0, "step": 237 }, { "epoch": 0.06965174129353234, "grad_norm": 18.804973602294922, "learning_rate": 4.82587064676617e-05, "loss": 0.2542, "step": 238 }, { "epoch": 0.06994439566871524, "grad_norm": 0.018086520954966545, "learning_rate": 4.825139010828212e-05, "loss": 0.0001, "step": 239 }, { "epoch": 0.07023705004389816, "grad_norm": 0.01671360246837139, "learning_rate": 4.824407374890255e-05, "loss": 0.0001, "step": 240 }, { "epoch": 0.07052970441908106, "grad_norm": 11.613852500915527, "learning_rate": 4.8236757389522976e-05, "loss": 0.238, "step": 241 }, { "epoch": 0.07082235879426398, "grad_norm": 0.00957377627491951, "learning_rate": 4.8229441030143404e-05, "loss": 0.0001, "step": 242 }, { "epoch": 0.07111501316944688, "grad_norm": 1.9750438928604126, "learning_rate": 4.822212467076383e-05, "loss": 0.0119, "step": 243 }, { "epoch": 0.0714076675446298, "grad_norm": 0.006536509841680527, "learning_rate": 4.821480831138426e-05, "loss": 0.0001, "step": 244 }, { "epoch": 0.0717003219198127, "grad_norm": 0.0057691894471645355, "learning_rate": 4.820749195200469e-05, "loss": 0.0001, "step": 245 }, { "epoch": 0.07199297629499561, "grad_norm": 0.23802554607391357, "learning_rate": 4.8200175592625116e-05, "loss": 0.001, "step": 246 }, { "epoch": 0.07228563067017851, "grad_norm": 0.005101884715259075, "learning_rate": 4.8192859233245543e-05, "loss": 0.0001, "step": 247 }, { "epoch": 0.07257828504536143, "grad_norm": 0.0271473191678524, "learning_rate": 4.818554287386597e-05, "loss": 0.0004, "step": 248 }, { "epoch": 0.07287093942054433, "grad_norm": 0.3413756191730499, "learning_rate": 4.81782265144864e-05, "loss": 0.0024, "step": 249 }, { "epoch": 0.07316359379572725, "grad_norm": 5.48157262802124, "learning_rate": 4.817091015510682e-05, "loss": 0.0218, "step": 250 }, { "epoch": 0.07345624817091015, "grad_norm": 2.417241096496582, "learning_rate": 4.816359379572725e-05, "loss": 0.0151, "step": 251 }, { "epoch": 0.07374890254609306, "grad_norm": 3.9786200523376465, "learning_rate": 4.8156277436347676e-05, "loss": 0.0151, "step": 252 }, { "epoch": 0.07404155692127597, "grad_norm": 7.795351028442383, "learning_rate": 4.8148961076968104e-05, "loss": 0.1096, "step": 253 }, { "epoch": 0.07433421129645888, "grad_norm": 0.010204765945672989, "learning_rate": 4.814164471758853e-05, "loss": 0.0002, "step": 254 }, { "epoch": 0.07462686567164178, "grad_norm": 5.5745391845703125, "learning_rate": 4.813432835820896e-05, "loss": 0.0285, "step": 255 }, { "epoch": 0.0749195200468247, "grad_norm": 0.013587488792836666, "learning_rate": 4.812701199882939e-05, "loss": 0.0002, "step": 256 }, { "epoch": 0.0752121744220076, "grad_norm": 0.001782214269042015, "learning_rate": 4.8119695639449816e-05, "loss": 0.0, "step": 257 }, { "epoch": 0.07550482879719052, "grad_norm": 0.02017296850681305, "learning_rate": 4.8112379280070244e-05, "loss": 0.0002, "step": 258 }, { "epoch": 0.07579748317237343, "grad_norm": 12.804062843322754, "learning_rate": 4.810506292069067e-05, "loss": 0.185, "step": 259 }, { "epoch": 0.07609013754755634, "grad_norm": 0.011184900999069214, "learning_rate": 4.809774656131109e-05, "loss": 0.0002, "step": 260 }, { "epoch": 0.07638279192273925, "grad_norm": 0.5110180377960205, "learning_rate": 4.809043020193152e-05, "loss": 0.0034, "step": 261 }, { "epoch": 0.07667544629792215, "grad_norm": 0.5023605227470398, "learning_rate": 4.808311384255195e-05, "loss": 0.0035, "step": 262 }, { "epoch": 0.07696810067310507, "grad_norm": 1.2688344717025757, "learning_rate": 4.807579748317238e-05, "loss": 0.0092, "step": 263 }, { "epoch": 0.07726075504828797, "grad_norm": 0.09958023577928543, "learning_rate": 4.8068481123792805e-05, "loss": 0.0004, "step": 264 }, { "epoch": 0.07755340942347089, "grad_norm": 0.02350887842476368, "learning_rate": 4.806116476441323e-05, "loss": 0.0002, "step": 265 }, { "epoch": 0.07784606379865379, "grad_norm": 9.163583755493164, "learning_rate": 4.805384840503366e-05, "loss": 0.0357, "step": 266 }, { "epoch": 0.0781387181738367, "grad_norm": 0.0023382732179015875, "learning_rate": 4.804653204565409e-05, "loss": 0.0, "step": 267 }, { "epoch": 0.0784313725490196, "grad_norm": 12.967851638793945, "learning_rate": 4.803921568627452e-05, "loss": 0.1247, "step": 268 }, { "epoch": 0.07872402692420252, "grad_norm": 0.0019798458088189363, "learning_rate": 4.803189932689494e-05, "loss": 0.0, "step": 269 }, { "epoch": 0.07901668129938542, "grad_norm": 0.06770626455545425, "learning_rate": 4.8024582967515366e-05, "loss": 0.0004, "step": 270 }, { "epoch": 0.07930933567456834, "grad_norm": 0.08193594962358475, "learning_rate": 4.8017266608135794e-05, "loss": 0.0005, "step": 271 }, { "epoch": 0.07960199004975124, "grad_norm": 15.774731636047363, "learning_rate": 4.800995024875622e-05, "loss": 0.1289, "step": 272 }, { "epoch": 0.07989464442493416, "grad_norm": 3.4991703033447266, "learning_rate": 4.800263388937665e-05, "loss": 0.0112, "step": 273 }, { "epoch": 0.08018729880011706, "grad_norm": 0.045116156339645386, "learning_rate": 4.799531752999708e-05, "loss": 0.0004, "step": 274 }, { "epoch": 0.08047995317529998, "grad_norm": 0.13101589679718018, "learning_rate": 4.7988001170617506e-05, "loss": 0.0006, "step": 275 }, { "epoch": 0.08077260755048288, "grad_norm": 0.08923770487308502, "learning_rate": 4.7980684811237934e-05, "loss": 0.0006, "step": 276 }, { "epoch": 0.0810652619256658, "grad_norm": 0.7768778204917908, "learning_rate": 4.7973368451858355e-05, "loss": 0.0019, "step": 277 }, { "epoch": 0.0813579163008487, "grad_norm": 0.010617797262966633, "learning_rate": 4.796605209247878e-05, "loss": 0.0001, "step": 278 }, { "epoch": 0.08165057067603161, "grad_norm": 1.319770336151123, "learning_rate": 4.795873573309921e-05, "loss": 0.0037, "step": 279 }, { "epoch": 0.08194322505121451, "grad_norm": 1.0644629001617432, "learning_rate": 4.795141937371964e-05, "loss": 0.002, "step": 280 }, { "epoch": 0.08223587942639743, "grad_norm": 5.610535144805908, "learning_rate": 4.794410301434007e-05, "loss": 0.0108, "step": 281 }, { "epoch": 0.08252853380158033, "grad_norm": 0.0032586927991360426, "learning_rate": 4.7936786654960495e-05, "loss": 0.0, "step": 282 }, { "epoch": 0.08282118817676325, "grad_norm": 0.0010933985468000174, "learning_rate": 4.792947029558092e-05, "loss": 0.0, "step": 283 }, { "epoch": 0.08311384255194615, "grad_norm": 0.0033725625835359097, "learning_rate": 4.792215393620135e-05, "loss": 0.0001, "step": 284 }, { "epoch": 0.08340649692712906, "grad_norm": 0.05476341396570206, "learning_rate": 4.791483757682177e-05, "loss": 0.0002, "step": 285 }, { "epoch": 0.08369915130231197, "grad_norm": 0.0005282636848278344, "learning_rate": 4.79075212174422e-05, "loss": 0.0, "step": 286 }, { "epoch": 0.08399180567749488, "grad_norm": 2.6202609539031982, "learning_rate": 4.790020485806263e-05, "loss": 0.3312, "step": 287 }, { "epoch": 0.08428446005267778, "grad_norm": 0.24636919796466827, "learning_rate": 4.7892888498683056e-05, "loss": 0.0005, "step": 288 }, { "epoch": 0.0845771144278607, "grad_norm": 12.588053703308105, "learning_rate": 4.7885572139303484e-05, "loss": 0.0787, "step": 289 }, { "epoch": 0.0848697688030436, "grad_norm": 2.8286802768707275, "learning_rate": 4.787825577992391e-05, "loss": 0.0063, "step": 290 }, { "epoch": 0.08516242317822652, "grad_norm": 0.04766281321644783, "learning_rate": 4.787093942054434e-05, "loss": 0.0004, "step": 291 }, { "epoch": 0.08545507755340942, "grad_norm": 0.8390278816223145, "learning_rate": 4.786362306116477e-05, "loss": 0.004, "step": 292 }, { "epoch": 0.08574773192859234, "grad_norm": 7.186341762542725, "learning_rate": 4.785630670178519e-05, "loss": 0.0437, "step": 293 }, { "epoch": 0.08604038630377524, "grad_norm": 9.822579383850098, "learning_rate": 4.7848990342405616e-05, "loss": 0.0769, "step": 294 }, { "epoch": 0.08633304067895815, "grad_norm": 0.16112007200717926, "learning_rate": 4.7841673983026044e-05, "loss": 0.0017, "step": 295 }, { "epoch": 0.08662569505414106, "grad_norm": 0.14990444481372833, "learning_rate": 4.783435762364647e-05, "loss": 0.0012, "step": 296 }, { "epoch": 0.08691834942932397, "grad_norm": 0.037165552377700806, "learning_rate": 4.78270412642669e-05, "loss": 0.0006, "step": 297 }, { "epoch": 0.08721100380450687, "grad_norm": 0.586540937423706, "learning_rate": 4.781972490488733e-05, "loss": 0.0012, "step": 298 }, { "epoch": 0.08750365817968979, "grad_norm": 0.014206857420504093, "learning_rate": 4.7812408545507756e-05, "loss": 0.0003, "step": 299 }, { "epoch": 0.08779631255487269, "grad_norm": 1.0259332656860352, "learning_rate": 4.7805092186128184e-05, "loss": 0.0019, "step": 300 }, { "epoch": 0.0880889669300556, "grad_norm": 0.010009337216615677, "learning_rate": 4.779777582674861e-05, "loss": 0.0002, "step": 301 }, { "epoch": 0.08838162130523851, "grad_norm": 0.0075365579687058926, "learning_rate": 4.779045946736904e-05, "loss": 0.0002, "step": 302 }, { "epoch": 0.08867427568042142, "grad_norm": 12.360414505004883, "learning_rate": 4.778314310798946e-05, "loss": 0.1413, "step": 303 }, { "epoch": 0.08896693005560433, "grad_norm": 0.008117208257317543, "learning_rate": 4.777582674860989e-05, "loss": 0.0002, "step": 304 }, { "epoch": 0.08925958443078724, "grad_norm": 0.022172026336193085, "learning_rate": 4.776851038923032e-05, "loss": 0.0003, "step": 305 }, { "epoch": 0.08955223880597014, "grad_norm": 0.008608819916844368, "learning_rate": 4.7761194029850745e-05, "loss": 0.0002, "step": 306 }, { "epoch": 0.08984489318115306, "grad_norm": 0.021730391308665276, "learning_rate": 4.775387767047117e-05, "loss": 0.0002, "step": 307 }, { "epoch": 0.09013754755633596, "grad_norm": 0.7678189873695374, "learning_rate": 4.77465613110916e-05, "loss": 0.0024, "step": 308 }, { "epoch": 0.09043020193151888, "grad_norm": 0.003530798014253378, "learning_rate": 4.773924495171203e-05, "loss": 0.0001, "step": 309 }, { "epoch": 0.09072285630670178, "grad_norm": 0.10175793617963791, "learning_rate": 4.773192859233246e-05, "loss": 0.0005, "step": 310 }, { "epoch": 0.0910155106818847, "grad_norm": 0.005056069698184729, "learning_rate": 4.7724612232952885e-05, "loss": 0.0001, "step": 311 }, { "epoch": 0.0913081650570676, "grad_norm": 0.006732292473316193, "learning_rate": 4.771729587357331e-05, "loss": 0.0001, "step": 312 }, { "epoch": 0.09160081943225051, "grad_norm": 0.005457394290715456, "learning_rate": 4.770997951419374e-05, "loss": 0.0001, "step": 313 }, { "epoch": 0.09189347380743342, "grad_norm": 0.04522399604320526, "learning_rate": 4.770266315481416e-05, "loss": 0.0003, "step": 314 }, { "epoch": 0.09218612818261633, "grad_norm": 0.007834223099052906, "learning_rate": 4.769534679543459e-05, "loss": 0.0002, "step": 315 }, { "epoch": 0.09247878255779923, "grad_norm": 0.01890292391180992, "learning_rate": 4.768803043605502e-05, "loss": 0.0003, "step": 316 }, { "epoch": 0.09277143693298215, "grad_norm": 0.016568325459957123, "learning_rate": 4.7680714076675446e-05, "loss": 0.0002, "step": 317 }, { "epoch": 0.09306409130816505, "grad_norm": 0.08477871865034103, "learning_rate": 4.7673397717295874e-05, "loss": 0.0004, "step": 318 }, { "epoch": 0.09335674568334797, "grad_norm": 0.012408802285790443, "learning_rate": 4.76660813579163e-05, "loss": 0.0002, "step": 319 }, { "epoch": 0.09364940005853087, "grad_norm": 0.061891306191682816, "learning_rate": 4.765876499853673e-05, "loss": 0.0003, "step": 320 }, { "epoch": 0.09394205443371378, "grad_norm": 4.866892337799072, "learning_rate": 4.765144863915716e-05, "loss": 0.248, "step": 321 }, { "epoch": 0.09423470880889669, "grad_norm": 4.284445762634277, "learning_rate": 4.7644132279777586e-05, "loss": 0.0087, "step": 322 }, { "epoch": 0.0945273631840796, "grad_norm": 0.009381483308970928, "learning_rate": 4.7636815920398013e-05, "loss": 0.0002, "step": 323 }, { "epoch": 0.09482001755926252, "grad_norm": 13.310781478881836, "learning_rate": 4.7629499561018435e-05, "loss": 0.1202, "step": 324 }, { "epoch": 0.09511267193444542, "grad_norm": 23.32094383239746, "learning_rate": 4.762218320163886e-05, "loss": 0.1079, "step": 325 }, { "epoch": 0.09540532630962834, "grad_norm": 0.003830630797892809, "learning_rate": 4.761486684225929e-05, "loss": 0.0001, "step": 326 }, { "epoch": 0.09569798068481124, "grad_norm": 0.002424234990030527, "learning_rate": 4.760755048287972e-05, "loss": 0.0001, "step": 327 }, { "epoch": 0.09599063505999415, "grad_norm": 0.23865213990211487, "learning_rate": 4.7600234123500146e-05, "loss": 0.0009, "step": 328 }, { "epoch": 0.09628328943517706, "grad_norm": 0.004662353079766035, "learning_rate": 4.7592917764120574e-05, "loss": 0.0001, "step": 329 }, { "epoch": 0.09657594381035997, "grad_norm": 0.05005013570189476, "learning_rate": 4.7585601404741e-05, "loss": 0.0004, "step": 330 }, { "epoch": 0.09686859818554287, "grad_norm": 0.5038985013961792, "learning_rate": 4.757828504536143e-05, "loss": 0.0027, "step": 331 }, { "epoch": 0.09716125256072579, "grad_norm": 0.2572520077228546, "learning_rate": 4.757096868598186e-05, "loss": 0.001, "step": 332 }, { "epoch": 0.09745390693590869, "grad_norm": 3.4524786472320557, "learning_rate": 4.7563652326602286e-05, "loss": 0.0173, "step": 333 }, { "epoch": 0.0977465613110916, "grad_norm": 12.419319152832031, "learning_rate": 4.7556335967222714e-05, "loss": 0.1363, "step": 334 }, { "epoch": 0.09803921568627451, "grad_norm": 2.271554470062256, "learning_rate": 4.7549019607843135e-05, "loss": 0.0058, "step": 335 }, { "epoch": 0.09833187006145742, "grad_norm": 14.14339542388916, "learning_rate": 4.754170324846356e-05, "loss": 0.0725, "step": 336 }, { "epoch": 0.09862452443664033, "grad_norm": 5.047306537628174, "learning_rate": 4.753438688908399e-05, "loss": 0.0749, "step": 337 }, { "epoch": 0.09891717881182324, "grad_norm": 4.678935527801514, "learning_rate": 4.752707052970442e-05, "loss": 0.0092, "step": 338 }, { "epoch": 0.09920983318700614, "grad_norm": 0.040029630064964294, "learning_rate": 4.751975417032485e-05, "loss": 0.0005, "step": 339 }, { "epoch": 0.09950248756218906, "grad_norm": 0.9745525121688843, "learning_rate": 4.7512437810945275e-05, "loss": 0.0035, "step": 340 }, { "epoch": 0.09979514193737196, "grad_norm": 0.2308918535709381, "learning_rate": 4.75051214515657e-05, "loss": 0.0011, "step": 341 }, { "epoch": 0.10008779631255488, "grad_norm": 0.024273114278912544, "learning_rate": 4.749780509218613e-05, "loss": 0.0003, "step": 342 }, { "epoch": 0.10038045068773778, "grad_norm": 0.14329658448696136, "learning_rate": 4.749048873280656e-05, "loss": 0.0008, "step": 343 }, { "epoch": 0.1006731050629207, "grad_norm": 0.5504376292228699, "learning_rate": 4.748317237342699e-05, "loss": 0.0023, "step": 344 }, { "epoch": 0.1009657594381036, "grad_norm": 0.07725141942501068, "learning_rate": 4.747585601404741e-05, "loss": 0.0004, "step": 345 }, { "epoch": 0.10125841381328651, "grad_norm": 0.004685727413743734, "learning_rate": 4.7468539654667836e-05, "loss": 0.0001, "step": 346 }, { "epoch": 0.10155106818846941, "grad_norm": 5.258609771728516, "learning_rate": 4.7461223295288264e-05, "loss": 0.0163, "step": 347 }, { "epoch": 0.10184372256365233, "grad_norm": 0.005556050688028336, "learning_rate": 4.745390693590869e-05, "loss": 0.0001, "step": 348 }, { "epoch": 0.10213637693883523, "grad_norm": 0.02145569585263729, "learning_rate": 4.744659057652912e-05, "loss": 0.0002, "step": 349 }, { "epoch": 0.10242903131401815, "grad_norm": 0.01085591223090887, "learning_rate": 4.743927421714955e-05, "loss": 0.0001, "step": 350 }, { "epoch": 0.10272168568920105, "grad_norm": 0.05303160846233368, "learning_rate": 4.7431957857769976e-05, "loss": 0.0005, "step": 351 }, { "epoch": 0.10301434006438397, "grad_norm": 8.427594184875488, "learning_rate": 4.7424641498390404e-05, "loss": 0.1494, "step": 352 }, { "epoch": 0.10330699443956687, "grad_norm": 0.0912499874830246, "learning_rate": 4.741732513901083e-05, "loss": 0.0007, "step": 353 }, { "epoch": 0.10359964881474978, "grad_norm": 0.004164704121649265, "learning_rate": 4.741000877963126e-05, "loss": 0.0001, "step": 354 }, { "epoch": 0.10389230318993269, "grad_norm": 0.5572128295898438, "learning_rate": 4.740269242025169e-05, "loss": 0.0034, "step": 355 }, { "epoch": 0.1041849575651156, "grad_norm": 8.62136459350586, "learning_rate": 4.739537606087211e-05, "loss": 0.0587, "step": 356 }, { "epoch": 0.1044776119402985, "grad_norm": 2.725903272628784, "learning_rate": 4.738805970149254e-05, "loss": 0.0086, "step": 357 }, { "epoch": 0.10477026631548142, "grad_norm": 0.0027688341215252876, "learning_rate": 4.7380743342112965e-05, "loss": 0.0001, "step": 358 }, { "epoch": 0.10506292069066432, "grad_norm": 1.1569620370864868, "learning_rate": 4.737342698273339e-05, "loss": 0.0032, "step": 359 }, { "epoch": 0.10535557506584724, "grad_norm": 0.006273103877902031, "learning_rate": 4.736611062335382e-05, "loss": 0.0001, "step": 360 }, { "epoch": 0.10564822944103014, "grad_norm": 13.408610343933105, "learning_rate": 4.735879426397425e-05, "loss": 0.2888, "step": 361 }, { "epoch": 0.10594088381621306, "grad_norm": 4.661241054534912, "learning_rate": 4.7351477904594676e-05, "loss": 0.2044, "step": 362 }, { "epoch": 0.10623353819139596, "grad_norm": 6.516336441040039, "learning_rate": 4.7344161545215104e-05, "loss": 0.1134, "step": 363 }, { "epoch": 0.10652619256657887, "grad_norm": 0.00893083680421114, "learning_rate": 4.733684518583553e-05, "loss": 0.0002, "step": 364 }, { "epoch": 0.10681884694176177, "grad_norm": 0.010455523617565632, "learning_rate": 4.732952882645596e-05, "loss": 0.0002, "step": 365 }, { "epoch": 0.10711150131694469, "grad_norm": 6.539015293121338, "learning_rate": 4.732221246707639e-05, "loss": 0.0543, "step": 366 }, { "epoch": 0.10740415569212759, "grad_norm": 0.01176715549081564, "learning_rate": 4.731489610769681e-05, "loss": 0.0002, "step": 367 }, { "epoch": 0.10769681006731051, "grad_norm": 4.113800048828125, "learning_rate": 4.730757974831724e-05, "loss": 0.0278, "step": 368 }, { "epoch": 0.10798946444249341, "grad_norm": 11.07836627960205, "learning_rate": 4.7300263388937665e-05, "loss": 0.0611, "step": 369 }, { "epoch": 0.10828211881767633, "grad_norm": 0.5808396339416504, "learning_rate": 4.729294702955809e-05, "loss": 0.0023, "step": 370 }, { "epoch": 0.10857477319285923, "grad_norm": 0.02736765518784523, "learning_rate": 4.728563067017852e-05, "loss": 0.0003, "step": 371 }, { "epoch": 0.10886742756804214, "grad_norm": 6.302504539489746, "learning_rate": 4.727831431079895e-05, "loss": 0.1156, "step": 372 }, { "epoch": 0.10916008194322505, "grad_norm": 0.10329131036996841, "learning_rate": 4.727099795141938e-05, "loss": 0.0019, "step": 373 }, { "epoch": 0.10945273631840796, "grad_norm": 0.28105202317237854, "learning_rate": 4.7263681592039805e-05, "loss": 0.0041, "step": 374 }, { "epoch": 0.10974539069359086, "grad_norm": 0.2540721893310547, "learning_rate": 4.725636523266023e-05, "loss": 0.0038, "step": 375 }, { "epoch": 0.11003804506877378, "grad_norm": 0.22207878530025482, "learning_rate": 4.724904887328066e-05, "loss": 0.0015, "step": 376 }, { "epoch": 0.11033069944395668, "grad_norm": 3.857038736343384, "learning_rate": 4.724173251390108e-05, "loss": 0.086, "step": 377 }, { "epoch": 0.1106233538191396, "grad_norm": 2.7076306343078613, "learning_rate": 4.723441615452151e-05, "loss": 0.0153, "step": 378 }, { "epoch": 0.1109160081943225, "grad_norm": 0.03701874241232872, "learning_rate": 4.722709979514194e-05, "loss": 0.0007, "step": 379 }, { "epoch": 0.11120866256950541, "grad_norm": 0.05244317650794983, "learning_rate": 4.7219783435762366e-05, "loss": 0.0007, "step": 380 }, { "epoch": 0.11150131694468832, "grad_norm": 1.131405234336853, "learning_rate": 4.7212467076382794e-05, "loss": 0.0073, "step": 381 }, { "epoch": 0.11179397131987123, "grad_norm": 0.539952278137207, "learning_rate": 4.720515071700322e-05, "loss": 0.0044, "step": 382 }, { "epoch": 0.11208662569505413, "grad_norm": 0.02004413679242134, "learning_rate": 4.719783435762365e-05, "loss": 0.0004, "step": 383 }, { "epoch": 0.11237928007023705, "grad_norm": 0.018577802926301956, "learning_rate": 4.719051799824408e-05, "loss": 0.0002, "step": 384 }, { "epoch": 0.11267193444541995, "grad_norm": 0.09130682796239853, "learning_rate": 4.7183201638864506e-05, "loss": 0.0005, "step": 385 }, { "epoch": 0.11296458882060287, "grad_norm": 4.0011820793151855, "learning_rate": 4.7175885279484934e-05, "loss": 0.2058, "step": 386 }, { "epoch": 0.11325724319578578, "grad_norm": 7.127885818481445, "learning_rate": 4.716856892010536e-05, "loss": 0.0478, "step": 387 }, { "epoch": 0.11354989757096869, "grad_norm": 10.796640396118164, "learning_rate": 4.716125256072578e-05, "loss": 0.0723, "step": 388 }, { "epoch": 0.1138425519461516, "grad_norm": 0.36668506264686584, "learning_rate": 4.715393620134621e-05, "loss": 0.0016, "step": 389 }, { "epoch": 0.1141352063213345, "grad_norm": 3.7605903148651123, "learning_rate": 4.714661984196664e-05, "loss": 0.0238, "step": 390 }, { "epoch": 0.11442786069651742, "grad_norm": 8.716503143310547, "learning_rate": 4.713930348258707e-05, "loss": 0.1373, "step": 391 }, { "epoch": 0.11472051507170032, "grad_norm": 0.05329298600554466, "learning_rate": 4.7131987123207495e-05, "loss": 0.0004, "step": 392 }, { "epoch": 0.11501316944688324, "grad_norm": 6.454421043395996, "learning_rate": 4.712467076382792e-05, "loss": 0.0283, "step": 393 }, { "epoch": 0.11530582382206614, "grad_norm": 12.48361587524414, "learning_rate": 4.711735440444835e-05, "loss": 0.2669, "step": 394 }, { "epoch": 0.11559847819724905, "grad_norm": 2.376858949661255, "learning_rate": 4.711003804506878e-05, "loss": 0.2375, "step": 395 }, { "epoch": 0.11589113257243196, "grad_norm": 0.006335779093205929, "learning_rate": 4.7102721685689206e-05, "loss": 0.0001, "step": 396 }, { "epoch": 0.11618378694761487, "grad_norm": 0.9085770845413208, "learning_rate": 4.7095405326309634e-05, "loss": 0.0046, "step": 397 }, { "epoch": 0.11647644132279777, "grad_norm": 2.1395437717437744, "learning_rate": 4.7088088966930056e-05, "loss": 0.0096, "step": 398 }, { "epoch": 0.11676909569798069, "grad_norm": 1.47907292842865, "learning_rate": 4.7080772607550484e-05, "loss": 0.0114, "step": 399 }, { "epoch": 0.11706175007316359, "grad_norm": 6.841465473175049, "learning_rate": 4.707345624817091e-05, "loss": 0.0373, "step": 400 }, { "epoch": 0.11735440444834651, "grad_norm": 8.371074676513672, "learning_rate": 4.706613988879134e-05, "loss": 0.0509, "step": 401 }, { "epoch": 0.11764705882352941, "grad_norm": 0.017969602718949318, "learning_rate": 4.705882352941177e-05, "loss": 0.0004, "step": 402 }, { "epoch": 0.11793971319871233, "grad_norm": 0.12363521754741669, "learning_rate": 4.7051507170032195e-05, "loss": 0.0009, "step": 403 }, { "epoch": 0.11823236757389523, "grad_norm": 0.045213498175144196, "learning_rate": 4.704419081065262e-05, "loss": 0.001, "step": 404 }, { "epoch": 0.11852502194907814, "grad_norm": 3.5112273693084717, "learning_rate": 4.703687445127305e-05, "loss": 0.0107, "step": 405 }, { "epoch": 0.11881767632426105, "grad_norm": 0.22896206378936768, "learning_rate": 4.702955809189348e-05, "loss": 0.0018, "step": 406 }, { "epoch": 0.11911033069944396, "grad_norm": 2.320936441421509, "learning_rate": 4.702224173251391e-05, "loss": 0.0115, "step": 407 }, { "epoch": 0.11940298507462686, "grad_norm": 0.11919834464788437, "learning_rate": 4.7014925373134335e-05, "loss": 0.0014, "step": 408 }, { "epoch": 0.11969563944980978, "grad_norm": 0.05550792068243027, "learning_rate": 4.7007609013754756e-05, "loss": 0.0009, "step": 409 }, { "epoch": 0.11998829382499268, "grad_norm": 20.043973922729492, "learning_rate": 4.7000292654375184e-05, "loss": 0.0925, "step": 410 }, { "epoch": 0.1202809482001756, "grad_norm": 0.01288844645023346, "learning_rate": 4.699297629499561e-05, "loss": 0.0003, "step": 411 }, { "epoch": 0.1205736025753585, "grad_norm": 0.09516994655132294, "learning_rate": 4.698565993561604e-05, "loss": 0.0012, "step": 412 }, { "epoch": 0.12086625695054141, "grad_norm": 0.1977280080318451, "learning_rate": 4.697834357623647e-05, "loss": 0.0019, "step": 413 }, { "epoch": 0.12115891132572432, "grad_norm": 0.018458962440490723, "learning_rate": 4.6971027216856896e-05, "loss": 0.0003, "step": 414 }, { "epoch": 0.12145156570090723, "grad_norm": 0.01741596683859825, "learning_rate": 4.6963710857477324e-05, "loss": 0.0003, "step": 415 }, { "epoch": 0.12174422007609013, "grad_norm": 17.843650817871094, "learning_rate": 4.695639449809775e-05, "loss": 0.6303, "step": 416 }, { "epoch": 0.12203687445127305, "grad_norm": 1.637863278388977, "learning_rate": 4.694907813871818e-05, "loss": 0.0045, "step": 417 }, { "epoch": 0.12232952882645595, "grad_norm": 8.336296081542969, "learning_rate": 4.694176177933861e-05, "loss": 0.0237, "step": 418 }, { "epoch": 0.12262218320163887, "grad_norm": 0.021015867590904236, "learning_rate": 4.6934445419959036e-05, "loss": 0.0003, "step": 419 }, { "epoch": 0.12291483757682177, "grad_norm": 0.005957255605608225, "learning_rate": 4.692712906057946e-05, "loss": 0.0002, "step": 420 }, { "epoch": 0.12320749195200469, "grad_norm": 0.5390514135360718, "learning_rate": 4.6919812701199885e-05, "loss": 0.003, "step": 421 }, { "epoch": 0.12350014632718759, "grad_norm": 0.0064792693592607975, "learning_rate": 4.691249634182031e-05, "loss": 0.0002, "step": 422 }, { "epoch": 0.1237928007023705, "grad_norm": 0.06832041591405869, "learning_rate": 4.690517998244074e-05, "loss": 0.0006, "step": 423 }, { "epoch": 0.1240854550775534, "grad_norm": 13.302190780639648, "learning_rate": 4.689786362306117e-05, "loss": 0.6247, "step": 424 }, { "epoch": 0.12437810945273632, "grad_norm": 3.1761767864227295, "learning_rate": 4.68905472636816e-05, "loss": 0.2245, "step": 425 }, { "epoch": 0.12467076382791922, "grad_norm": 0.006169583182781935, "learning_rate": 4.6883230904302025e-05, "loss": 0.0001, "step": 426 }, { "epoch": 0.12496341820310214, "grad_norm": 7.2902445793151855, "learning_rate": 4.687591454492245e-05, "loss": 0.1874, "step": 427 }, { "epoch": 0.12525607257828505, "grad_norm": 0.026300964877009392, "learning_rate": 4.686859818554288e-05, "loss": 0.0005, "step": 428 }, { "epoch": 0.12554872695346794, "grad_norm": 0.057506389915943146, "learning_rate": 4.686128182616331e-05, "loss": 0.0012, "step": 429 }, { "epoch": 0.12584138132865086, "grad_norm": 0.14768287539482117, "learning_rate": 4.685396546678373e-05, "loss": 0.0017, "step": 430 }, { "epoch": 0.12613403570383377, "grad_norm": 3.4165399074554443, "learning_rate": 4.684664910740416e-05, "loss": 0.2147, "step": 431 }, { "epoch": 0.1264266900790167, "grad_norm": 1.6047797203063965, "learning_rate": 4.6839332748024586e-05, "loss": 0.0122, "step": 432 }, { "epoch": 0.12671934445419958, "grad_norm": 0.22430621087551117, "learning_rate": 4.6832016388645013e-05, "loss": 0.0042, "step": 433 }, { "epoch": 0.1270119988293825, "grad_norm": 10.875073432922363, "learning_rate": 4.682470002926544e-05, "loss": 0.1259, "step": 434 }, { "epoch": 0.1273046532045654, "grad_norm": 1.0021528005599976, "learning_rate": 4.681738366988587e-05, "loss": 0.0045, "step": 435 }, { "epoch": 0.12759730757974833, "grad_norm": 6.521137714385986, "learning_rate": 4.68100673105063e-05, "loss": 0.0449, "step": 436 }, { "epoch": 0.12788996195493121, "grad_norm": 0.07355611026287079, "learning_rate": 4.6802750951126725e-05, "loss": 0.0016, "step": 437 }, { "epoch": 0.12818261633011413, "grad_norm": 0.04785890877246857, "learning_rate": 4.679543459174715e-05, "loss": 0.0013, "step": 438 }, { "epoch": 0.12847527070529705, "grad_norm": 0.0513986274600029, "learning_rate": 4.678811823236758e-05, "loss": 0.0014, "step": 439 }, { "epoch": 0.12876792508047996, "grad_norm": 0.8326976895332336, "learning_rate": 4.678080187298801e-05, "loss": 0.0052, "step": 440 }, { "epoch": 0.12906057945566285, "grad_norm": 0.026337653398513794, "learning_rate": 4.677348551360843e-05, "loss": 0.0006, "step": 441 }, { "epoch": 0.12935323383084577, "grad_norm": 0.03743434324860573, "learning_rate": 4.676616915422886e-05, "loss": 0.0008, "step": 442 }, { "epoch": 0.12964588820602868, "grad_norm": 2.389418125152588, "learning_rate": 4.6758852794849286e-05, "loss": 0.0072, "step": 443 }, { "epoch": 0.1299385425812116, "grad_norm": 5.299021244049072, "learning_rate": 4.6751536435469714e-05, "loss": 0.2451, "step": 444 }, { "epoch": 0.13023119695639448, "grad_norm": 0.017874106764793396, "learning_rate": 4.674422007609014e-05, "loss": 0.0005, "step": 445 }, { "epoch": 0.1305238513315774, "grad_norm": 4.864397048950195, "learning_rate": 4.673690371671057e-05, "loss": 0.0796, "step": 446 }, { "epoch": 0.13081650570676032, "grad_norm": 0.06551877409219742, "learning_rate": 4.6729587357331e-05, "loss": 0.0012, "step": 447 }, { "epoch": 0.13110916008194323, "grad_norm": 0.039231959730386734, "learning_rate": 4.6722270997951426e-05, "loss": 0.0008, "step": 448 }, { "epoch": 0.13140181445712612, "grad_norm": 3.826850175857544, "learning_rate": 4.6714954638571854e-05, "loss": 0.0273, "step": 449 }, { "epoch": 0.13169446883230904, "grad_norm": 0.12824492156505585, "learning_rate": 4.6707638279192275e-05, "loss": 0.0018, "step": 450 }, { "epoch": 0.13198712320749195, "grad_norm": 3.2820913791656494, "learning_rate": 4.67003219198127e-05, "loss": 0.0184, "step": 451 }, { "epoch": 0.13227977758267487, "grad_norm": 0.05461831018328667, "learning_rate": 4.669300556043313e-05, "loss": 0.0009, "step": 452 }, { "epoch": 0.13257243195785778, "grad_norm": 0.10601229220628738, "learning_rate": 4.668568920105356e-05, "loss": 0.0014, "step": 453 }, { "epoch": 0.13286508633304067, "grad_norm": 0.0834844782948494, "learning_rate": 4.667837284167399e-05, "loss": 0.0013, "step": 454 }, { "epoch": 0.1331577407082236, "grad_norm": 3.3121674060821533, "learning_rate": 4.6671056482294415e-05, "loss": 0.1301, "step": 455 }, { "epoch": 0.1334503950834065, "grad_norm": 1.9951350688934326, "learning_rate": 4.666374012291484e-05, "loss": 0.013, "step": 456 }, { "epoch": 0.13374304945858942, "grad_norm": 4.8048930168151855, "learning_rate": 4.665642376353527e-05, "loss": 0.3001, "step": 457 }, { "epoch": 0.1340357038337723, "grad_norm": 0.02953229285776615, "learning_rate": 4.664910740415569e-05, "loss": 0.0007, "step": 458 }, { "epoch": 0.13432835820895522, "grad_norm": 1.4468019008636475, "learning_rate": 4.664179104477612e-05, "loss": 0.0078, "step": 459 }, { "epoch": 0.13462101258413814, "grad_norm": 0.08909465372562408, "learning_rate": 4.663447468539655e-05, "loss": 0.0014, "step": 460 }, { "epoch": 0.13491366695932105, "grad_norm": 0.17444898188114166, "learning_rate": 4.6627158326016976e-05, "loss": 0.0018, "step": 461 }, { "epoch": 0.13520632133450394, "grad_norm": 5.046396732330322, "learning_rate": 4.6619841966637404e-05, "loss": 0.025, "step": 462 }, { "epoch": 0.13549897570968686, "grad_norm": 0.37730321288108826, "learning_rate": 4.661252560725783e-05, "loss": 0.0031, "step": 463 }, { "epoch": 0.13579163008486977, "grad_norm": 10.075611114501953, "learning_rate": 4.660520924787826e-05, "loss": 0.106, "step": 464 }, { "epoch": 0.1360842844600527, "grad_norm": 0.08799745887517929, "learning_rate": 4.659789288849869e-05, "loss": 0.0012, "step": 465 }, { "epoch": 0.13637693883523558, "grad_norm": 0.4172189235687256, "learning_rate": 4.659057652911911e-05, "loss": 0.0036, "step": 466 }, { "epoch": 0.1366695932104185, "grad_norm": 0.1687348634004593, "learning_rate": 4.658326016973954e-05, "loss": 0.0019, "step": 467 }, { "epoch": 0.1369622475856014, "grad_norm": 0.40193888545036316, "learning_rate": 4.6575943810359965e-05, "loss": 0.0033, "step": 468 }, { "epoch": 0.13725490196078433, "grad_norm": 0.05767688527703285, "learning_rate": 4.656862745098039e-05, "loss": 0.0009, "step": 469 }, { "epoch": 0.1375475563359672, "grad_norm": 4.644579887390137, "learning_rate": 4.656131109160082e-05, "loss": 0.0194, "step": 470 }, { "epoch": 0.13784021071115013, "grad_norm": 0.018554113805294037, "learning_rate": 4.655399473222125e-05, "loss": 0.0004, "step": 471 }, { "epoch": 0.13813286508633305, "grad_norm": 0.3066471219062805, "learning_rate": 4.6546678372841676e-05, "loss": 0.0016, "step": 472 }, { "epoch": 0.13842551946151596, "grad_norm": 4.61218786239624, "learning_rate": 4.6539362013462104e-05, "loss": 0.0225, "step": 473 }, { "epoch": 0.13871817383669885, "grad_norm": 1.0650144815444946, "learning_rate": 4.6532045654082526e-05, "loss": 0.0039, "step": 474 }, { "epoch": 0.13901082821188177, "grad_norm": 0.009785358794033527, "learning_rate": 4.6524729294702954e-05, "loss": 0.0002, "step": 475 }, { "epoch": 0.13930348258706468, "grad_norm": 1.6635420322418213, "learning_rate": 4.651741293532338e-05, "loss": 0.0076, "step": 476 }, { "epoch": 0.1395961369622476, "grad_norm": 3.513559341430664, "learning_rate": 4.651009657594381e-05, "loss": 0.0048, "step": 477 }, { "epoch": 0.13988879133743048, "grad_norm": 7.961835861206055, "learning_rate": 4.650278021656424e-05, "loss": 0.1191, "step": 478 }, { "epoch": 0.1401814457126134, "grad_norm": 0.03598767891526222, "learning_rate": 4.6495463857184665e-05, "loss": 0.0003, "step": 479 }, { "epoch": 0.14047410008779632, "grad_norm": 14.998804092407227, "learning_rate": 4.648814749780509e-05, "loss": 0.0693, "step": 480 }, { "epoch": 0.14076675446297923, "grad_norm": 0.04784742742776871, "learning_rate": 4.648083113842552e-05, "loss": 0.0004, "step": 481 }, { "epoch": 0.14105940883816212, "grad_norm": 10.379205703735352, "learning_rate": 4.647351477904595e-05, "loss": 0.2895, "step": 482 }, { "epoch": 0.14135206321334504, "grad_norm": 0.007849453948438168, "learning_rate": 4.646619841966637e-05, "loss": 0.0001, "step": 483 }, { "epoch": 0.14164471758852795, "grad_norm": 0.429504930973053, "learning_rate": 4.64588820602868e-05, "loss": 0.0019, "step": 484 }, { "epoch": 0.14193737196371087, "grad_norm": 4.247195243835449, "learning_rate": 4.6451565700907226e-05, "loss": 0.2315, "step": 485 }, { "epoch": 0.14223002633889376, "grad_norm": 0.00903258752077818, "learning_rate": 4.6444249341527654e-05, "loss": 0.0002, "step": 486 }, { "epoch": 0.14252268071407667, "grad_norm": 0.26237669587135315, "learning_rate": 4.643693298214808e-05, "loss": 0.0017, "step": 487 }, { "epoch": 0.1428153350892596, "grad_norm": 0.1016833707690239, "learning_rate": 4.642961662276851e-05, "loss": 0.0005, "step": 488 }, { "epoch": 0.1431079894644425, "grad_norm": 2.977407217025757, "learning_rate": 4.642230026338894e-05, "loss": 0.0126, "step": 489 }, { "epoch": 0.1434006438396254, "grad_norm": 2.0009970664978027, "learning_rate": 4.6414983904009366e-05, "loss": 0.3651, "step": 490 }, { "epoch": 0.1436932982148083, "grad_norm": 0.0034312792122364044, "learning_rate": 4.6407667544629794e-05, "loss": 0.0001, "step": 491 }, { "epoch": 0.14398595258999122, "grad_norm": 0.011009294539690018, "learning_rate": 4.640035118525022e-05, "loss": 0.0003, "step": 492 }, { "epoch": 0.14427860696517414, "grad_norm": 11.964802742004395, "learning_rate": 4.639303482587065e-05, "loss": 0.0375, "step": 493 }, { "epoch": 0.14457126134035703, "grad_norm": 0.011789514683187008, "learning_rate": 4.638571846649107e-05, "loss": 0.0003, "step": 494 }, { "epoch": 0.14486391571553994, "grad_norm": 12.071039199829102, "learning_rate": 4.63784021071115e-05, "loss": 0.052, "step": 495 }, { "epoch": 0.14515657009072286, "grad_norm": 0.056693222373723984, "learning_rate": 4.637108574773193e-05, "loss": 0.0011, "step": 496 }, { "epoch": 0.14544922446590577, "grad_norm": 0.6062968969345093, "learning_rate": 4.6363769388352355e-05, "loss": 0.0066, "step": 497 }, { "epoch": 0.14574187884108866, "grad_norm": 0.1888841986656189, "learning_rate": 4.635645302897278e-05, "loss": 0.0032, "step": 498 }, { "epoch": 0.14603453321627158, "grad_norm": 0.270308256149292, "learning_rate": 4.634913666959321e-05, "loss": 0.0046, "step": 499 }, { "epoch": 0.1463271875914545, "grad_norm": 13.553791999816895, "learning_rate": 4.634182031021364e-05, "loss": 0.3013, "step": 500 }, { "epoch": 0.1466198419666374, "grad_norm": 9.983979225158691, "learning_rate": 4.633450395083407e-05, "loss": 0.2404, "step": 501 }, { "epoch": 0.1469124963418203, "grad_norm": 0.04803050309419632, "learning_rate": 4.6327187591454495e-05, "loss": 0.0011, "step": 502 }, { "epoch": 0.1472051507170032, "grad_norm": 7.278826713562012, "learning_rate": 4.631987123207492e-05, "loss": 0.0522, "step": 503 }, { "epoch": 0.14749780509218613, "grad_norm": 0.9605130553245544, "learning_rate": 4.631255487269535e-05, "loss": 0.0057, "step": 504 }, { "epoch": 0.14779045946736905, "grad_norm": 0.018609413877129555, "learning_rate": 4.630523851331577e-05, "loss": 0.0004, "step": 505 }, { "epoch": 0.14808311384255193, "grad_norm": 9.427624702453613, "learning_rate": 4.62979221539362e-05, "loss": 0.1103, "step": 506 }, { "epoch": 0.14837576821773485, "grad_norm": 9.950340270996094, "learning_rate": 4.629060579455663e-05, "loss": 0.1134, "step": 507 }, { "epoch": 0.14866842259291776, "grad_norm": 0.006434707902371883, "learning_rate": 4.6283289435177056e-05, "loss": 0.0001, "step": 508 }, { "epoch": 0.14896107696810068, "grad_norm": 7.503812313079834, "learning_rate": 4.6275973075797484e-05, "loss": 0.0604, "step": 509 }, { "epoch": 0.14925373134328357, "grad_norm": 0.28119999170303345, "learning_rate": 4.626865671641791e-05, "loss": 0.004, "step": 510 }, { "epoch": 0.14954638571846648, "grad_norm": 0.01554897427558899, "learning_rate": 4.626134035703834e-05, "loss": 0.0005, "step": 511 }, { "epoch": 0.1498390400936494, "grad_norm": 0.039249517023563385, "learning_rate": 4.625402399765877e-05, "loss": 0.0009, "step": 512 }, { "epoch": 0.15013169446883232, "grad_norm": 6.196067810058594, "learning_rate": 4.6246707638279195e-05, "loss": 0.0234, "step": 513 }, { "epoch": 0.1504243488440152, "grad_norm": 1.96540105342865, "learning_rate": 4.623939127889962e-05, "loss": 0.0051, "step": 514 }, { "epoch": 0.15071700321919812, "grad_norm": 0.01271872315555811, "learning_rate": 4.6232074919520044e-05, "loss": 0.0004, "step": 515 }, { "epoch": 0.15100965759438104, "grad_norm": 4.049911975860596, "learning_rate": 4.622475856014047e-05, "loss": 0.1171, "step": 516 }, { "epoch": 0.15130231196956395, "grad_norm": 0.03024102747440338, "learning_rate": 4.62174422007609e-05, "loss": 0.0008, "step": 517 }, { "epoch": 0.15159496634474687, "grad_norm": 0.016120215877890587, "learning_rate": 4.621012584138133e-05, "loss": 0.0002, "step": 518 }, { "epoch": 0.15188762071992976, "grad_norm": 0.02355908416211605, "learning_rate": 4.6202809482001756e-05, "loss": 0.0006, "step": 519 }, { "epoch": 0.15218027509511267, "grad_norm": 0.0034723831340670586, "learning_rate": 4.6195493122622184e-05, "loss": 0.0001, "step": 520 }, { "epoch": 0.1524729294702956, "grad_norm": 7.882876396179199, "learning_rate": 4.618817676324261e-05, "loss": 0.0366, "step": 521 }, { "epoch": 0.1527655838454785, "grad_norm": 0.08890720456838608, "learning_rate": 4.618086040386304e-05, "loss": 0.0015, "step": 522 }, { "epoch": 0.1530582382206614, "grad_norm": 0.041916172951459885, "learning_rate": 4.617354404448347e-05, "loss": 0.0009, "step": 523 }, { "epoch": 0.1533508925958443, "grad_norm": 0.06541915237903595, "learning_rate": 4.6166227685103896e-05, "loss": 0.0009, "step": 524 }, { "epoch": 0.15364354697102722, "grad_norm": 0.030568156391382217, "learning_rate": 4.6158911325724324e-05, "loss": 0.0007, "step": 525 }, { "epoch": 0.15393620134621014, "grad_norm": 0.005763526074588299, "learning_rate": 4.6151594966344745e-05, "loss": 0.0001, "step": 526 }, { "epoch": 0.15422885572139303, "grad_norm": 0.03486074134707451, "learning_rate": 4.614427860696517e-05, "loss": 0.0006, "step": 527 }, { "epoch": 0.15452151009657594, "grad_norm": 0.004300899337977171, "learning_rate": 4.61369622475856e-05, "loss": 0.0001, "step": 528 }, { "epoch": 0.15481416447175886, "grad_norm": 1.4314249753952026, "learning_rate": 4.612964588820603e-05, "loss": 0.0127, "step": 529 }, { "epoch": 0.15510681884694177, "grad_norm": 0.026534054428339005, "learning_rate": 4.612232952882646e-05, "loss": 0.0005, "step": 530 }, { "epoch": 0.15539947322212466, "grad_norm": 0.032224852591753006, "learning_rate": 4.6115013169446885e-05, "loss": 0.0004, "step": 531 }, { "epoch": 0.15569212759730758, "grad_norm": 9.825483322143555, "learning_rate": 4.610769681006731e-05, "loss": 0.0773, "step": 532 }, { "epoch": 0.1559847819724905, "grad_norm": 1.552971363067627, "learning_rate": 4.610038045068774e-05, "loss": 0.004, "step": 533 }, { "epoch": 0.1562774363476734, "grad_norm": 8.752983093261719, "learning_rate": 4.609306409130817e-05, "loss": 0.0626, "step": 534 }, { "epoch": 0.1565700907228563, "grad_norm": 0.9281281232833862, "learning_rate": 4.60857477319286e-05, "loss": 0.0047, "step": 535 }, { "epoch": 0.1568627450980392, "grad_norm": 5.323596000671387, "learning_rate": 4.607843137254902e-05, "loss": 0.1174, "step": 536 }, { "epoch": 0.15715539947322213, "grad_norm": 0.03563448414206505, "learning_rate": 4.6071115013169446e-05, "loss": 0.0003, "step": 537 }, { "epoch": 0.15744805384840505, "grad_norm": 0.016390468925237656, "learning_rate": 4.6063798653789874e-05, "loss": 0.0004, "step": 538 }, { "epoch": 0.15774070822358793, "grad_norm": 0.040976326912641525, "learning_rate": 4.60564822944103e-05, "loss": 0.0004, "step": 539 }, { "epoch": 0.15803336259877085, "grad_norm": 0.12268855422735214, "learning_rate": 4.604916593503073e-05, "loss": 0.0008, "step": 540 }, { "epoch": 0.15832601697395376, "grad_norm": 0.006396189332008362, "learning_rate": 4.604184957565116e-05, "loss": 0.0001, "step": 541 }, { "epoch": 0.15861867134913668, "grad_norm": 0.20329192280769348, "learning_rate": 4.6034533216271586e-05, "loss": 0.0012, "step": 542 }, { "epoch": 0.15891132572431957, "grad_norm": 6.874823093414307, "learning_rate": 4.6027216856892013e-05, "loss": 0.2398, "step": 543 }, { "epoch": 0.15920398009950248, "grad_norm": 0.5490859150886536, "learning_rate": 4.601990049751244e-05, "loss": 0.0039, "step": 544 }, { "epoch": 0.1594966344746854, "grad_norm": 0.047674331814050674, "learning_rate": 4.601258413813287e-05, "loss": 0.0007, "step": 545 }, { "epoch": 0.15978928884986832, "grad_norm": 4.832897186279297, "learning_rate": 4.60052677787533e-05, "loss": 0.2455, "step": 546 }, { "epoch": 0.1600819432250512, "grad_norm": 4.292545318603516, "learning_rate": 4.599795141937372e-05, "loss": 0.0733, "step": 547 }, { "epoch": 0.16037459760023412, "grad_norm": 0.06033731997013092, "learning_rate": 4.5990635059994146e-05, "loss": 0.0009, "step": 548 }, { "epoch": 0.16066725197541704, "grad_norm": 0.011225472204387188, "learning_rate": 4.5983318700614574e-05, "loss": 0.0002, "step": 549 }, { "epoch": 0.16095990635059995, "grad_norm": 1.4956049919128418, "learning_rate": 4.5976002341235e-05, "loss": 0.0051, "step": 550 }, { "epoch": 0.16125256072578284, "grad_norm": 0.024007853120565414, "learning_rate": 4.596868598185543e-05, "loss": 0.0006, "step": 551 }, { "epoch": 0.16154521510096576, "grad_norm": 1.055830955505371, "learning_rate": 4.596136962247586e-05, "loss": 0.0063, "step": 552 }, { "epoch": 0.16183786947614867, "grad_norm": 3.8573505878448486, "learning_rate": 4.5954053263096286e-05, "loss": 0.0249, "step": 553 }, { "epoch": 0.1621305238513316, "grad_norm": 0.03098081424832344, "learning_rate": 4.5946736903716714e-05, "loss": 0.0005, "step": 554 }, { "epoch": 0.16242317822651448, "grad_norm": 4.926241397857666, "learning_rate": 4.593942054433714e-05, "loss": 0.0513, "step": 555 }, { "epoch": 0.1627158326016974, "grad_norm": 0.04326876625418663, "learning_rate": 4.593210418495757e-05, "loss": 0.0006, "step": 556 }, { "epoch": 0.1630084869768803, "grad_norm": 21.93604850769043, "learning_rate": 4.5924787825578e-05, "loss": 0.0702, "step": 557 }, { "epoch": 0.16330114135206322, "grad_norm": 4.877942085266113, "learning_rate": 4.591747146619842e-05, "loss": 0.1671, "step": 558 }, { "epoch": 0.1635937957272461, "grad_norm": 0.2925262153148651, "learning_rate": 4.591015510681885e-05, "loss": 0.0025, "step": 559 }, { "epoch": 0.16388645010242903, "grad_norm": 15.076410293579102, "learning_rate": 4.5902838747439275e-05, "loss": 0.083, "step": 560 }, { "epoch": 0.16417910447761194, "grad_norm": 6.79744815826416, "learning_rate": 4.58955223880597e-05, "loss": 0.0981, "step": 561 }, { "epoch": 0.16447175885279486, "grad_norm": 5.972877025604248, "learning_rate": 4.588820602868013e-05, "loss": 0.0384, "step": 562 }, { "epoch": 0.16476441322797775, "grad_norm": 0.05233852192759514, "learning_rate": 4.588088966930056e-05, "loss": 0.0009, "step": 563 }, { "epoch": 0.16505706760316066, "grad_norm": 0.03200345113873482, "learning_rate": 4.587357330992099e-05, "loss": 0.0008, "step": 564 }, { "epoch": 0.16534972197834358, "grad_norm": 0.38592442870140076, "learning_rate": 4.5866256950541415e-05, "loss": 0.0041, "step": 565 }, { "epoch": 0.1656423763535265, "grad_norm": 11.63177490234375, "learning_rate": 4.585894059116184e-05, "loss": 0.2104, "step": 566 }, { "epoch": 0.16593503072870938, "grad_norm": 0.04096129164099693, "learning_rate": 4.585162423178227e-05, "loss": 0.001, "step": 567 }, { "epoch": 0.1662276851038923, "grad_norm": 0.05317602679133415, "learning_rate": 4.584430787240269e-05, "loss": 0.0011, "step": 568 }, { "epoch": 0.1665203394790752, "grad_norm": 0.1012129858136177, "learning_rate": 4.583699151302312e-05, "loss": 0.0016, "step": 569 }, { "epoch": 0.16681299385425813, "grad_norm": 1.065040946006775, "learning_rate": 4.582967515364355e-05, "loss": 0.0051, "step": 570 }, { "epoch": 0.16710564822944102, "grad_norm": 4.996240615844727, "learning_rate": 4.5822358794263976e-05, "loss": 0.0522, "step": 571 }, { "epoch": 0.16739830260462393, "grad_norm": 0.16447268426418304, "learning_rate": 4.5815042434884404e-05, "loss": 0.0015, "step": 572 }, { "epoch": 0.16769095697980685, "grad_norm": 4.073005199432373, "learning_rate": 4.580772607550483e-05, "loss": 0.0199, "step": 573 }, { "epoch": 0.16798361135498976, "grad_norm": 0.011858438141644001, "learning_rate": 4.580040971612526e-05, "loss": 0.0003, "step": 574 }, { "epoch": 0.16827626573017265, "grad_norm": 0.06540852785110474, "learning_rate": 4.579309335674569e-05, "loss": 0.0008, "step": 575 }, { "epoch": 0.16856892010535557, "grad_norm": 0.6887805461883545, "learning_rate": 4.5785776997366116e-05, "loss": 0.0074, "step": 576 }, { "epoch": 0.16886157448053848, "grad_norm": 0.4583264887332916, "learning_rate": 4.5778460637986543e-05, "loss": 0.0056, "step": 577 }, { "epoch": 0.1691542288557214, "grad_norm": 0.017056701704859734, "learning_rate": 4.577114427860697e-05, "loss": 0.0003, "step": 578 }, { "epoch": 0.1694468832309043, "grad_norm": 0.004097268916666508, "learning_rate": 4.576382791922739e-05, "loss": 0.0001, "step": 579 }, { "epoch": 0.1697395376060872, "grad_norm": 0.012352986261248589, "learning_rate": 4.575651155984782e-05, "loss": 0.0003, "step": 580 }, { "epoch": 0.17003219198127012, "grad_norm": 0.0030429689213633537, "learning_rate": 4.574919520046825e-05, "loss": 0.0001, "step": 581 }, { "epoch": 0.17032484635645304, "grad_norm": 0.2785024344921112, "learning_rate": 4.5741878841088676e-05, "loss": 0.0013, "step": 582 }, { "epoch": 0.17061750073163595, "grad_norm": 0.05230037495493889, "learning_rate": 4.5734562481709104e-05, "loss": 0.0004, "step": 583 }, { "epoch": 0.17091015510681884, "grad_norm": 0.00787578895688057, "learning_rate": 4.572724612232953e-05, "loss": 0.0002, "step": 584 }, { "epoch": 0.17120280948200176, "grad_norm": 0.4896010458469391, "learning_rate": 4.571992976294996e-05, "loss": 0.0026, "step": 585 }, { "epoch": 0.17149546385718467, "grad_norm": 0.0032089147716760635, "learning_rate": 4.571261340357039e-05, "loss": 0.0001, "step": 586 }, { "epoch": 0.1717881182323676, "grad_norm": 0.011004786007106304, "learning_rate": 4.5705297044190816e-05, "loss": 0.0002, "step": 587 }, { "epoch": 0.17208077260755048, "grad_norm": 0.0479825995862484, "learning_rate": 4.5697980684811244e-05, "loss": 0.0005, "step": 588 }, { "epoch": 0.1723734269827334, "grad_norm": 0.0029730063397437334, "learning_rate": 4.5690664325431665e-05, "loss": 0.0001, "step": 589 }, { "epoch": 0.1726660813579163, "grad_norm": 0.015313234180212021, "learning_rate": 4.568334796605209e-05, "loss": 0.0003, "step": 590 }, { "epoch": 0.17295873573309922, "grad_norm": 0.017034409567713737, "learning_rate": 4.567603160667252e-05, "loss": 0.0002, "step": 591 }, { "epoch": 0.1732513901082821, "grad_norm": 0.4127567708492279, "learning_rate": 4.566871524729295e-05, "loss": 0.0016, "step": 592 }, { "epoch": 0.17354404448346503, "grad_norm": 0.015415014699101448, "learning_rate": 4.566139888791338e-05, "loss": 0.0002, "step": 593 }, { "epoch": 0.17383669885864794, "grad_norm": 0.0043084463104605675, "learning_rate": 4.5654082528533805e-05, "loss": 0.0001, "step": 594 }, { "epoch": 0.17412935323383086, "grad_norm": 0.0045915767550468445, "learning_rate": 4.564676616915423e-05, "loss": 0.0001, "step": 595 }, { "epoch": 0.17442200760901375, "grad_norm": 0.014552316628396511, "learning_rate": 4.563944980977466e-05, "loss": 0.0002, "step": 596 }, { "epoch": 0.17471466198419666, "grad_norm": 0.015970690175890923, "learning_rate": 4.563213345039509e-05, "loss": 0.0002, "step": 597 }, { "epoch": 0.17500731635937958, "grad_norm": 0.0026189619675278664, "learning_rate": 4.562481709101552e-05, "loss": 0.0001, "step": 598 }, { "epoch": 0.1752999707345625, "grad_norm": 0.0028786526527255774, "learning_rate": 4.5617500731635945e-05, "loss": 0.0001, "step": 599 }, { "epoch": 0.17559262510974538, "grad_norm": 0.015483339317142963, "learning_rate": 4.5610184372256366e-05, "loss": 0.0002, "step": 600 }, { "epoch": 0.1758852794849283, "grad_norm": 0.05966808274388313, "learning_rate": 4.5602868012876794e-05, "loss": 0.0003, "step": 601 }, { "epoch": 0.1761779338601112, "grad_norm": 0.006986284162849188, "learning_rate": 4.559555165349722e-05, "loss": 0.0001, "step": 602 }, { "epoch": 0.17647058823529413, "grad_norm": 0.0017342488281428814, "learning_rate": 4.558823529411765e-05, "loss": 0.0, "step": 603 }, { "epoch": 0.17676324261047702, "grad_norm": 0.1534646898508072, "learning_rate": 4.558091893473808e-05, "loss": 0.0007, "step": 604 }, { "epoch": 0.17705589698565993, "grad_norm": 0.025370581075549126, "learning_rate": 4.5573602575358506e-05, "loss": 0.0001, "step": 605 }, { "epoch": 0.17734855136084285, "grad_norm": 0.0031309896148741245, "learning_rate": 4.5566286215978934e-05, "loss": 0.0001, "step": 606 }, { "epoch": 0.17764120573602576, "grad_norm": 0.003534929594025016, "learning_rate": 4.555896985659936e-05, "loss": 0.0001, "step": 607 }, { "epoch": 0.17793386011120865, "grad_norm": 1.1894418001174927, "learning_rate": 4.555165349721979e-05, "loss": 0.0026, "step": 608 }, { "epoch": 0.17822651448639157, "grad_norm": 0.0013738750712946057, "learning_rate": 4.554433713784022e-05, "loss": 0.0, "step": 609 }, { "epoch": 0.17851916886157448, "grad_norm": 3.998434066772461, "learning_rate": 4.5537020778460645e-05, "loss": 0.2078, "step": 610 }, { "epoch": 0.1788118232367574, "grad_norm": 0.041833844035863876, "learning_rate": 4.552970441908107e-05, "loss": 0.0002, "step": 611 }, { "epoch": 0.1791044776119403, "grad_norm": 0.0021251097787171602, "learning_rate": 4.5522388059701495e-05, "loss": 0.0, "step": 612 }, { "epoch": 0.1793971319871232, "grad_norm": 4.649111270904541, "learning_rate": 4.551507170032192e-05, "loss": 0.1573, "step": 613 }, { "epoch": 0.17968978636230612, "grad_norm": 0.46552494168281555, "learning_rate": 4.550775534094235e-05, "loss": 0.0023, "step": 614 }, { "epoch": 0.17998244073748904, "grad_norm": 0.030416730791330338, "learning_rate": 4.550043898156278e-05, "loss": 0.0004, "step": 615 }, { "epoch": 0.18027509511267192, "grad_norm": 4.120852947235107, "learning_rate": 4.5493122622183206e-05, "loss": 0.1734, "step": 616 }, { "epoch": 0.18056774948785484, "grad_norm": 0.8814420104026794, "learning_rate": 4.5485806262803634e-05, "loss": 0.0056, "step": 617 }, { "epoch": 0.18086040386303776, "grad_norm": 0.9062165021896362, "learning_rate": 4.547848990342406e-05, "loss": 0.0061, "step": 618 }, { "epoch": 0.18115305823822067, "grad_norm": 0.49208158254623413, "learning_rate": 4.547117354404449e-05, "loss": 0.0048, "step": 619 }, { "epoch": 0.18144571261340356, "grad_norm": 0.15719246864318848, "learning_rate": 4.546385718466492e-05, "loss": 0.0019, "step": 620 }, { "epoch": 0.18173836698858647, "grad_norm": 0.005446034017950296, "learning_rate": 4.545654082528534e-05, "loss": 0.0001, "step": 621 }, { "epoch": 0.1820310213637694, "grad_norm": 0.009686796925961971, "learning_rate": 4.544922446590577e-05, "loss": 0.0002, "step": 622 }, { "epoch": 0.1823236757389523, "grad_norm": 0.31266066431999207, "learning_rate": 4.5441908106526195e-05, "loss": 0.0009, "step": 623 }, { "epoch": 0.1826163301141352, "grad_norm": 0.46191877126693726, "learning_rate": 4.543459174714662e-05, "loss": 0.0016, "step": 624 }, { "epoch": 0.1829089844893181, "grad_norm": 0.0038166295271366835, "learning_rate": 4.542727538776705e-05, "loss": 0.0001, "step": 625 }, { "epoch": 0.18320163886450103, "grad_norm": 0.07279108464717865, "learning_rate": 4.541995902838748e-05, "loss": 0.0003, "step": 626 }, { "epoch": 0.18349429323968394, "grad_norm": 8.50744342803955, "learning_rate": 4.541264266900791e-05, "loss": 0.1231, "step": 627 }, { "epoch": 0.18378694761486683, "grad_norm": 9.774702072143555, "learning_rate": 4.5405326309628335e-05, "loss": 0.3219, "step": 628 }, { "epoch": 0.18407960199004975, "grad_norm": 0.024404382333159447, "learning_rate": 4.539800995024876e-05, "loss": 0.0002, "step": 629 }, { "epoch": 0.18437225636523266, "grad_norm": 0.0032634008675813675, "learning_rate": 4.539069359086919e-05, "loss": 0.0001, "step": 630 }, { "epoch": 0.18466491074041558, "grad_norm": 0.003782094456255436, "learning_rate": 4.538337723148962e-05, "loss": 0.0001, "step": 631 }, { "epoch": 0.18495756511559847, "grad_norm": 0.004177380353212357, "learning_rate": 4.537606087211004e-05, "loss": 0.0001, "step": 632 }, { "epoch": 0.18525021949078138, "grad_norm": 8.427203178405762, "learning_rate": 4.536874451273047e-05, "loss": 0.0314, "step": 633 }, { "epoch": 0.1855428738659643, "grad_norm": 0.07821492105722427, "learning_rate": 4.5361428153350896e-05, "loss": 0.0007, "step": 634 }, { "epoch": 0.1858355282411472, "grad_norm": 0.7422145009040833, "learning_rate": 4.5354111793971324e-05, "loss": 0.0033, "step": 635 }, { "epoch": 0.1861281826163301, "grad_norm": 0.04130464419722557, "learning_rate": 4.534679543459175e-05, "loss": 0.0007, "step": 636 }, { "epoch": 0.18642083699151302, "grad_norm": 0.12119077146053314, "learning_rate": 4.533947907521218e-05, "loss": 0.0008, "step": 637 }, { "epoch": 0.18671349136669593, "grad_norm": 0.020508840680122375, "learning_rate": 4.533216271583261e-05, "loss": 0.0003, "step": 638 }, { "epoch": 0.18700614574187885, "grad_norm": 11.085396766662598, "learning_rate": 4.5324846356453036e-05, "loss": 0.1443, "step": 639 }, { "epoch": 0.18729880011706174, "grad_norm": 0.008764538913965225, "learning_rate": 4.531752999707346e-05, "loss": 0.0001, "step": 640 }, { "epoch": 0.18759145449224465, "grad_norm": 6.063767433166504, "learning_rate": 4.5310213637693885e-05, "loss": 0.0167, "step": 641 }, { "epoch": 0.18788410886742757, "grad_norm": 0.004624322522431612, "learning_rate": 4.530289727831431e-05, "loss": 0.0001, "step": 642 }, { "epoch": 0.18817676324261048, "grad_norm": 5.405406951904297, "learning_rate": 4.529558091893474e-05, "loss": 0.2235, "step": 643 }, { "epoch": 0.18846941761779337, "grad_norm": 2.0690550804138184, "learning_rate": 4.528826455955517e-05, "loss": 0.193, "step": 644 }, { "epoch": 0.1887620719929763, "grad_norm": 7.38582706451416, "learning_rate": 4.52809482001756e-05, "loss": 0.1021, "step": 645 }, { "epoch": 0.1890547263681592, "grad_norm": 6.473080158233643, "learning_rate": 4.5273631840796025e-05, "loss": 0.1266, "step": 646 }, { "epoch": 0.18934738074334212, "grad_norm": 1.8652619123458862, "learning_rate": 4.526631548141645e-05, "loss": 0.0047, "step": 647 }, { "epoch": 0.18964003511852504, "grad_norm": 4.676137447357178, "learning_rate": 4.5258999122036874e-05, "loss": 0.2113, "step": 648 }, { "epoch": 0.18993268949370792, "grad_norm": 0.1120869368314743, "learning_rate": 4.52516827626573e-05, "loss": 0.0022, "step": 649 }, { "epoch": 0.19022534386889084, "grad_norm": 0.43787863850593567, "learning_rate": 4.524436640327773e-05, "loss": 0.008, "step": 650 }, { "epoch": 0.19051799824407376, "grad_norm": 1.8265527486801147, "learning_rate": 4.523705004389816e-05, "loss": 0.0165, "step": 651 }, { "epoch": 0.19081065261925667, "grad_norm": 14.509317398071289, "learning_rate": 4.5229733684518586e-05, "loss": 0.2503, "step": 652 }, { "epoch": 0.19110330699443956, "grad_norm": 14.345330238342285, "learning_rate": 4.5222417325139013e-05, "loss": 0.1041, "step": 653 }, { "epoch": 0.19139596136962247, "grad_norm": 8.671512603759766, "learning_rate": 4.521510096575944e-05, "loss": 0.2108, "step": 654 }, { "epoch": 0.1916886157448054, "grad_norm": 10.393280029296875, "learning_rate": 4.520778460637987e-05, "loss": 0.179, "step": 655 }, { "epoch": 0.1919812701199883, "grad_norm": 3.7070400714874268, "learning_rate": 4.520046824700029e-05, "loss": 0.0498, "step": 656 }, { "epoch": 0.1922739244951712, "grad_norm": 0.06468917429447174, "learning_rate": 4.519315188762072e-05, "loss": 0.0014, "step": 657 }, { "epoch": 0.1925665788703541, "grad_norm": 0.12241574376821518, "learning_rate": 4.5185835528241146e-05, "loss": 0.0027, "step": 658 }, { "epoch": 0.19285923324553703, "grad_norm": 0.07482358813285828, "learning_rate": 4.5178519168861574e-05, "loss": 0.0016, "step": 659 }, { "epoch": 0.19315188762071994, "grad_norm": 4.144123554229736, "learning_rate": 4.5171202809482e-05, "loss": 0.1355, "step": 660 }, { "epoch": 0.19344454199590283, "grad_norm": 0.0915708839893341, "learning_rate": 4.516388645010243e-05, "loss": 0.0021, "step": 661 }, { "epoch": 0.19373719637108575, "grad_norm": 0.09312600642442703, "learning_rate": 4.515657009072286e-05, "loss": 0.002, "step": 662 }, { "epoch": 0.19402985074626866, "grad_norm": 3.0293922424316406, "learning_rate": 4.5149253731343286e-05, "loss": 0.1203, "step": 663 }, { "epoch": 0.19432250512145158, "grad_norm": 0.09050063043832779, "learning_rate": 4.514193737196371e-05, "loss": 0.0022, "step": 664 }, { "epoch": 0.19461515949663447, "grad_norm": 0.5784649848937988, "learning_rate": 4.5134621012584135e-05, "loss": 0.006, "step": 665 }, { "epoch": 0.19490781387181738, "grad_norm": 3.1412320137023926, "learning_rate": 4.512730465320456e-05, "loss": 0.0228, "step": 666 }, { "epoch": 0.1952004682470003, "grad_norm": 5.518512725830078, "learning_rate": 4.511998829382499e-05, "loss": 0.061, "step": 667 }, { "epoch": 0.1954931226221832, "grad_norm": 0.11754651367664337, "learning_rate": 4.511267193444542e-05, "loss": 0.0025, "step": 668 }, { "epoch": 0.1957857769973661, "grad_norm": 0.18826378881931305, "learning_rate": 4.510535557506585e-05, "loss": 0.0033, "step": 669 }, { "epoch": 0.19607843137254902, "grad_norm": 0.12070680409669876, "learning_rate": 4.5098039215686275e-05, "loss": 0.0022, "step": 670 }, { "epoch": 0.19637108574773193, "grad_norm": 0.133676216006279, "learning_rate": 4.50907228563067e-05, "loss": 0.0022, "step": 671 }, { "epoch": 0.19666374012291485, "grad_norm": 0.669661283493042, "learning_rate": 4.508340649692713e-05, "loss": 0.0086, "step": 672 }, { "epoch": 0.19695639449809774, "grad_norm": 0.17344354093074799, "learning_rate": 4.507609013754756e-05, "loss": 0.0023, "step": 673 }, { "epoch": 0.19724904887328065, "grad_norm": 0.12338634580373764, "learning_rate": 4.506877377816798e-05, "loss": 0.0021, "step": 674 }, { "epoch": 0.19754170324846357, "grad_norm": 0.06881749629974365, "learning_rate": 4.506145741878841e-05, "loss": 0.001, "step": 675 }, { "epoch": 0.19783435762364648, "grad_norm": 0.038889095187187195, "learning_rate": 4.5054141059408836e-05, "loss": 0.0008, "step": 676 }, { "epoch": 0.19812701199882937, "grad_norm": 0.014144466258585453, "learning_rate": 4.5046824700029264e-05, "loss": 0.0003, "step": 677 }, { "epoch": 0.1984196663740123, "grad_norm": 0.16300074756145477, "learning_rate": 4.503950834064969e-05, "loss": 0.001, "step": 678 }, { "epoch": 0.1987123207491952, "grad_norm": 4.094395160675049, "learning_rate": 4.503219198127012e-05, "loss": 0.0307, "step": 679 }, { "epoch": 0.19900497512437812, "grad_norm": 0.00436925096437335, "learning_rate": 4.502487562189055e-05, "loss": 0.0001, "step": 680 }, { "epoch": 0.199297629499561, "grad_norm": 0.013027762062847614, "learning_rate": 4.5017559262510976e-05, "loss": 0.0002, "step": 681 }, { "epoch": 0.19959028387474392, "grad_norm": 0.031487055122852325, "learning_rate": 4.5010242903131404e-05, "loss": 0.0004, "step": 682 }, { "epoch": 0.19988293824992684, "grad_norm": 6.214965343475342, "learning_rate": 4.500292654375183e-05, "loss": 0.1875, "step": 683 }, { "epoch": 0.20017559262510976, "grad_norm": 0.6754149198532104, "learning_rate": 4.499561018437226e-05, "loss": 0.004, "step": 684 }, { "epoch": 0.20046824700029264, "grad_norm": 0.68619704246521, "learning_rate": 4.498829382499268e-05, "loss": 0.0044, "step": 685 }, { "epoch": 0.20076090137547556, "grad_norm": 0.06463277339935303, "learning_rate": 4.498097746561311e-05, "loss": 0.0007, "step": 686 }, { "epoch": 0.20105355575065847, "grad_norm": 0.10290379077196121, "learning_rate": 4.497366110623354e-05, "loss": 0.0009, "step": 687 }, { "epoch": 0.2013462101258414, "grad_norm": 9.652802467346191, "learning_rate": 4.4966344746853965e-05, "loss": 0.0612, "step": 688 }, { "epoch": 0.20163886450102428, "grad_norm": 0.0042547243647277355, "learning_rate": 4.495902838747439e-05, "loss": 0.0001, "step": 689 }, { "epoch": 0.2019315188762072, "grad_norm": 0.004024684429168701, "learning_rate": 4.495171202809482e-05, "loss": 0.0001, "step": 690 }, { "epoch": 0.2022241732513901, "grad_norm": 0.2425258457660675, "learning_rate": 4.494439566871525e-05, "loss": 0.0008, "step": 691 }, { "epoch": 0.20251682762657303, "grad_norm": 0.2894352376461029, "learning_rate": 4.4937079309335676e-05, "loss": 0.0011, "step": 692 }, { "epoch": 0.20280948200175591, "grad_norm": 14.928994178771973, "learning_rate": 4.4929762949956104e-05, "loss": 0.3129, "step": 693 }, { "epoch": 0.20310213637693883, "grad_norm": 7.792734622955322, "learning_rate": 4.492244659057653e-05, "loss": 0.1719, "step": 694 }, { "epoch": 0.20339479075212175, "grad_norm": 0.8874497413635254, "learning_rate": 4.491513023119696e-05, "loss": 0.0033, "step": 695 }, { "epoch": 0.20368744512730466, "grad_norm": 0.013569723814725876, "learning_rate": 4.490781387181738e-05, "loss": 0.0002, "step": 696 }, { "epoch": 0.20398009950248755, "grad_norm": 0.1591472625732422, "learning_rate": 4.490049751243781e-05, "loss": 0.0006, "step": 697 }, { "epoch": 0.20427275387767047, "grad_norm": 6.823668479919434, "learning_rate": 4.489318115305824e-05, "loss": 0.1749, "step": 698 }, { "epoch": 0.20456540825285338, "grad_norm": 0.015314355492591858, "learning_rate": 4.4885864793678665e-05, "loss": 0.0002, "step": 699 }, { "epoch": 0.2048580626280363, "grad_norm": 0.023964963853359222, "learning_rate": 4.487854843429909e-05, "loss": 0.0002, "step": 700 }, { "epoch": 0.20515071700321919, "grad_norm": 0.0187582578510046, "learning_rate": 4.487123207491952e-05, "loss": 0.0002, "step": 701 }, { "epoch": 0.2054433713784021, "grad_norm": 0.02765386924147606, "learning_rate": 4.486391571553995e-05, "loss": 0.0002, "step": 702 }, { "epoch": 0.20573602575358502, "grad_norm": 18.86209487915039, "learning_rate": 4.485659935616038e-05, "loss": 0.4532, "step": 703 }, { "epoch": 0.20602868012876793, "grad_norm": 8.715555191040039, "learning_rate": 4.4849282996780805e-05, "loss": 0.0359, "step": 704 }, { "epoch": 0.20632133450395082, "grad_norm": 0.07234326750040054, "learning_rate": 4.484196663740123e-05, "loss": 0.0006, "step": 705 }, { "epoch": 0.20661398887913374, "grad_norm": 0.045028142631053925, "learning_rate": 4.4834650278021654e-05, "loss": 0.0004, "step": 706 }, { "epoch": 0.20690664325431665, "grad_norm": 0.016763372346758842, "learning_rate": 4.482733391864208e-05, "loss": 0.0002, "step": 707 }, { "epoch": 0.20719929762949957, "grad_norm": 7.2011284828186035, "learning_rate": 4.482001755926251e-05, "loss": 0.1084, "step": 708 }, { "epoch": 0.20749195200468246, "grad_norm": 0.03069467470049858, "learning_rate": 4.481270119988294e-05, "loss": 0.0004, "step": 709 }, { "epoch": 0.20778460637986537, "grad_norm": 0.028901129961013794, "learning_rate": 4.4805384840503366e-05, "loss": 0.0004, "step": 710 }, { "epoch": 0.2080772607550483, "grad_norm": 0.00795326754450798, "learning_rate": 4.4798068481123794e-05, "loss": 0.0001, "step": 711 }, { "epoch": 0.2083699151302312, "grad_norm": 1.5992717742919922, "learning_rate": 4.479075212174422e-05, "loss": 0.0046, "step": 712 }, { "epoch": 0.20866256950541412, "grad_norm": 0.018271101638674736, "learning_rate": 4.478343576236465e-05, "loss": 0.0003, "step": 713 }, { "epoch": 0.208955223880597, "grad_norm": 0.28536123037338257, "learning_rate": 4.477611940298508e-05, "loss": 0.0015, "step": 714 }, { "epoch": 0.20924787825577992, "grad_norm": 0.010522237978875637, "learning_rate": 4.4768803043605506e-05, "loss": 0.0002, "step": 715 }, { "epoch": 0.20954053263096284, "grad_norm": 0.053751010447740555, "learning_rate": 4.4761486684225934e-05, "loss": 0.0006, "step": 716 }, { "epoch": 0.20983318700614575, "grad_norm": 0.0036731716245412827, "learning_rate": 4.4754170324846355e-05, "loss": 0.0001, "step": 717 }, { "epoch": 0.21012584138132864, "grad_norm": 0.004985527601093054, "learning_rate": 4.474685396546678e-05, "loss": 0.0001, "step": 718 }, { "epoch": 0.21041849575651156, "grad_norm": 0.004694411531090736, "learning_rate": 4.473953760608721e-05, "loss": 0.0001, "step": 719 }, { "epoch": 0.21071115013169447, "grad_norm": 0.0014005025150254369, "learning_rate": 4.473222124670764e-05, "loss": 0.0, "step": 720 }, { "epoch": 0.2110038045068774, "grad_norm": 1.1884040832519531, "learning_rate": 4.472490488732807e-05, "loss": 0.0041, "step": 721 }, { "epoch": 0.21129645888206028, "grad_norm": 0.0041051763109862804, "learning_rate": 4.4717588527948495e-05, "loss": 0.0001, "step": 722 }, { "epoch": 0.2115891132572432, "grad_norm": 0.001686644391156733, "learning_rate": 4.471027216856892e-05, "loss": 0.0, "step": 723 }, { "epoch": 0.2118817676324261, "grad_norm": 0.020543327555060387, "learning_rate": 4.470295580918935e-05, "loss": 0.0002, "step": 724 }, { "epoch": 0.21217442200760903, "grad_norm": 4.9859232902526855, "learning_rate": 4.469563944980978e-05, "loss": 0.0254, "step": 725 }, { "epoch": 0.21246707638279191, "grad_norm": 5.037113189697266, "learning_rate": 4.4688323090430206e-05, "loss": 0.1981, "step": 726 }, { "epoch": 0.21275973075797483, "grad_norm": 0.017563283443450928, "learning_rate": 4.468100673105063e-05, "loss": 0.0002, "step": 727 }, { "epoch": 0.21305238513315775, "grad_norm": 0.07505333423614502, "learning_rate": 4.4673690371671056e-05, "loss": 0.0006, "step": 728 }, { "epoch": 0.21334503950834066, "grad_norm": 0.003406686242669821, "learning_rate": 4.4666374012291483e-05, "loss": 0.0001, "step": 729 }, { "epoch": 0.21363769388352355, "grad_norm": 0.034339457750320435, "learning_rate": 4.465905765291191e-05, "loss": 0.0003, "step": 730 }, { "epoch": 0.21393034825870647, "grad_norm": 0.014503705315291882, "learning_rate": 4.465174129353234e-05, "loss": 0.0002, "step": 731 }, { "epoch": 0.21422300263388938, "grad_norm": 0.0040726629085838795, "learning_rate": 4.464442493415277e-05, "loss": 0.0001, "step": 732 }, { "epoch": 0.2145156570090723, "grad_norm": 0.0007581388927064836, "learning_rate": 4.4637108574773195e-05, "loss": 0.0, "step": 733 }, { "epoch": 0.21480831138425519, "grad_norm": 8.548843383789062, "learning_rate": 4.462979221539362e-05, "loss": 0.0632, "step": 734 }, { "epoch": 0.2151009657594381, "grad_norm": 1.2448025941848755, "learning_rate": 4.462247585601405e-05, "loss": 0.0068, "step": 735 }, { "epoch": 0.21539362013462102, "grad_norm": 0.013233359903097153, "learning_rate": 4.461515949663448e-05, "loss": 0.0002, "step": 736 }, { "epoch": 0.21568627450980393, "grad_norm": 3.531691312789917, "learning_rate": 4.460784313725491e-05, "loss": 0.2103, "step": 737 }, { "epoch": 0.21597892888498682, "grad_norm": 0.05020001903176308, "learning_rate": 4.460052677787533e-05, "loss": 0.0003, "step": 738 }, { "epoch": 0.21627158326016974, "grad_norm": 1.1592589616775513, "learning_rate": 4.4593210418495756e-05, "loss": 0.0047, "step": 739 }, { "epoch": 0.21656423763535265, "grad_norm": 12.470833778381348, "learning_rate": 4.4585894059116184e-05, "loss": 0.1691, "step": 740 }, { "epoch": 0.21685689201053557, "grad_norm": 0.032759133726358414, "learning_rate": 4.457857769973661e-05, "loss": 0.0004, "step": 741 }, { "epoch": 0.21714954638571846, "grad_norm": 0.06250883638858795, "learning_rate": 4.457126134035704e-05, "loss": 0.0005, "step": 742 }, { "epoch": 0.21744220076090137, "grad_norm": 0.12659525871276855, "learning_rate": 4.456394498097747e-05, "loss": 0.0013, "step": 743 }, { "epoch": 0.2177348551360843, "grad_norm": 0.06023351475596428, "learning_rate": 4.4556628621597896e-05, "loss": 0.0005, "step": 744 }, { "epoch": 0.2180275095112672, "grad_norm": 0.01682423986494541, "learning_rate": 4.4549312262218324e-05, "loss": 0.0003, "step": 745 }, { "epoch": 0.2183201638864501, "grad_norm": 0.016918722540140152, "learning_rate": 4.454199590283875e-05, "loss": 0.0004, "step": 746 }, { "epoch": 0.218612818261633, "grad_norm": 0.5990718007087708, "learning_rate": 4.453467954345918e-05, "loss": 0.004, "step": 747 }, { "epoch": 0.21890547263681592, "grad_norm": 3.5272769927978516, "learning_rate": 4.452736318407961e-05, "loss": 0.1258, "step": 748 }, { "epoch": 0.21919812701199884, "grad_norm": 0.029189743101596832, "learning_rate": 4.452004682470003e-05, "loss": 0.0005, "step": 749 }, { "epoch": 0.21949078138718173, "grad_norm": 0.09604718536138535, "learning_rate": 4.451273046532046e-05, "loss": 0.0009, "step": 750 }, { "epoch": 0.21978343576236464, "grad_norm": 0.01348460279405117, "learning_rate": 4.4505414105940885e-05, "loss": 0.0004, "step": 751 }, { "epoch": 0.22007609013754756, "grad_norm": 0.020246319472789764, "learning_rate": 4.449809774656131e-05, "loss": 0.0005, "step": 752 }, { "epoch": 0.22036874451273047, "grad_norm": 0.014105631969869137, "learning_rate": 4.449078138718174e-05, "loss": 0.0003, "step": 753 }, { "epoch": 0.22066139888791336, "grad_norm": 0.07877824455499649, "learning_rate": 4.448346502780217e-05, "loss": 0.001, "step": 754 }, { "epoch": 0.22095405326309628, "grad_norm": 1.8267985582351685, "learning_rate": 4.44761486684226e-05, "loss": 0.0129, "step": 755 }, { "epoch": 0.2212467076382792, "grad_norm": 0.6814830899238586, "learning_rate": 4.4468832309043025e-05, "loss": 0.0047, "step": 756 }, { "epoch": 0.2215393620134621, "grad_norm": 0.25179216265678406, "learning_rate": 4.446151594966345e-05, "loss": 0.0023, "step": 757 }, { "epoch": 0.221832016388645, "grad_norm": 0.9964368343353271, "learning_rate": 4.445419959028388e-05, "loss": 0.0064, "step": 758 }, { "epoch": 0.2221246707638279, "grad_norm": 0.05321267619729042, "learning_rate": 4.44468832309043e-05, "loss": 0.0008, "step": 759 }, { "epoch": 0.22241732513901083, "grad_norm": 2.7881665229797363, "learning_rate": 4.443956687152473e-05, "loss": 0.2114, "step": 760 }, { "epoch": 0.22270997951419375, "grad_norm": 0.029525209218263626, "learning_rate": 4.443225051214516e-05, "loss": 0.0005, "step": 761 }, { "epoch": 0.22300263388937663, "grad_norm": 0.019108805805444717, "learning_rate": 4.4424934152765586e-05, "loss": 0.0004, "step": 762 }, { "epoch": 0.22329528826455955, "grad_norm": 0.5585194230079651, "learning_rate": 4.4417617793386013e-05, "loss": 0.0035, "step": 763 }, { "epoch": 0.22358794263974247, "grad_norm": 0.012019911780953407, "learning_rate": 4.441030143400644e-05, "loss": 0.0002, "step": 764 }, { "epoch": 0.22388059701492538, "grad_norm": 0.020603956654667854, "learning_rate": 4.440298507462687e-05, "loss": 0.0006, "step": 765 }, { "epoch": 0.22417325139010827, "grad_norm": 5.0426836013793945, "learning_rate": 4.43956687152473e-05, "loss": 0.023, "step": 766 }, { "epoch": 0.22446590576529118, "grad_norm": 2.308359384536743, "learning_rate": 4.4388352355867725e-05, "loss": 0.2389, "step": 767 }, { "epoch": 0.2247585601404741, "grad_norm": 4.962514400482178, "learning_rate": 4.438103599648815e-05, "loss": 0.0525, "step": 768 }, { "epoch": 0.22505121451565702, "grad_norm": 0.06218094751238823, "learning_rate": 4.437371963710858e-05, "loss": 0.0008, "step": 769 }, { "epoch": 0.2253438688908399, "grad_norm": 0.8410007953643799, "learning_rate": 4.4366403277729e-05, "loss": 0.0056, "step": 770 }, { "epoch": 0.22563652326602282, "grad_norm": 3.1172614097595215, "learning_rate": 4.435908691834943e-05, "loss": 0.0257, "step": 771 }, { "epoch": 0.22592917764120574, "grad_norm": 0.033022698014974594, "learning_rate": 4.435177055896986e-05, "loss": 0.0006, "step": 772 }, { "epoch": 0.22622183201638865, "grad_norm": 0.10443870723247528, "learning_rate": 4.4344454199590286e-05, "loss": 0.0015, "step": 773 }, { "epoch": 0.22651448639157157, "grad_norm": 0.13600991666316986, "learning_rate": 4.4337137840210714e-05, "loss": 0.0017, "step": 774 }, { "epoch": 0.22680714076675446, "grad_norm": 3.21096134185791, "learning_rate": 4.432982148083114e-05, "loss": 0.0155, "step": 775 }, { "epoch": 0.22709979514193737, "grad_norm": 2.4363033771514893, "learning_rate": 4.432250512145157e-05, "loss": 0.0151, "step": 776 }, { "epoch": 0.2273924495171203, "grad_norm": 0.03876936063170433, "learning_rate": 4.4315188762072e-05, "loss": 0.0011, "step": 777 }, { "epoch": 0.2276851038923032, "grad_norm": 0.023100847378373146, "learning_rate": 4.4307872402692426e-05, "loss": 0.0007, "step": 778 }, { "epoch": 0.2279777582674861, "grad_norm": 0.018950048834085464, "learning_rate": 4.4300556043312854e-05, "loss": 0.0004, "step": 779 }, { "epoch": 0.228270412642669, "grad_norm": 0.0254242904484272, "learning_rate": 4.4293239683933275e-05, "loss": 0.0008, "step": 780 }, { "epoch": 0.22856306701785192, "grad_norm": 0.7936460971832275, "learning_rate": 4.42859233245537e-05, "loss": 0.0046, "step": 781 }, { "epoch": 0.22885572139303484, "grad_norm": 0.050249021500349045, "learning_rate": 4.427860696517413e-05, "loss": 0.001, "step": 782 }, { "epoch": 0.22914837576821773, "grad_norm": 0.08169472962617874, "learning_rate": 4.427129060579456e-05, "loss": 0.0012, "step": 783 }, { "epoch": 0.22944103014340064, "grad_norm": 6.499297618865967, "learning_rate": 4.426397424641499e-05, "loss": 0.0624, "step": 784 }, { "epoch": 0.22973368451858356, "grad_norm": 0.02524263970553875, "learning_rate": 4.4256657887035415e-05, "loss": 0.0005, "step": 785 }, { "epoch": 0.23002633889376647, "grad_norm": 5.125154495239258, "learning_rate": 4.424934152765584e-05, "loss": 0.0226, "step": 786 }, { "epoch": 0.23031899326894936, "grad_norm": 6.894948482513428, "learning_rate": 4.424202516827627e-05, "loss": 0.0668, "step": 787 }, { "epoch": 0.23061164764413228, "grad_norm": 5.20366096496582, "learning_rate": 4.42347088088967e-05, "loss": 0.0671, "step": 788 }, { "epoch": 0.2309043020193152, "grad_norm": 0.05931296572089195, "learning_rate": 4.422739244951713e-05, "loss": 0.0009, "step": 789 }, { "epoch": 0.2311969563944981, "grad_norm": 0.00888920109719038, "learning_rate": 4.4220076090137555e-05, "loss": 0.0003, "step": 790 }, { "epoch": 0.231489610769681, "grad_norm": 0.012214032001793385, "learning_rate": 4.4212759730757976e-05, "loss": 0.0003, "step": 791 }, { "epoch": 0.2317822651448639, "grad_norm": 0.01478834543377161, "learning_rate": 4.4205443371378404e-05, "loss": 0.0004, "step": 792 }, { "epoch": 0.23207491952004683, "grad_norm": 3.1242382526397705, "learning_rate": 4.419812701199883e-05, "loss": 0.0407, "step": 793 }, { "epoch": 0.23236757389522975, "grad_norm": 0.013407468795776367, "learning_rate": 4.419081065261926e-05, "loss": 0.0003, "step": 794 }, { "epoch": 0.23266022827041263, "grad_norm": 0.014789867214858532, "learning_rate": 4.418349429323969e-05, "loss": 0.0004, "step": 795 }, { "epoch": 0.23295288264559555, "grad_norm": 0.01953650824725628, "learning_rate": 4.4176177933860115e-05, "loss": 0.0004, "step": 796 }, { "epoch": 0.23324553702077847, "grad_norm": 0.03375176712870598, "learning_rate": 4.4168861574480543e-05, "loss": 0.0008, "step": 797 }, { "epoch": 0.23353819139596138, "grad_norm": 0.05466015264391899, "learning_rate": 4.416154521510097e-05, "loss": 0.0009, "step": 798 }, { "epoch": 0.23383084577114427, "grad_norm": 0.00844351015985012, "learning_rate": 4.41542288557214e-05, "loss": 0.0002, "step": 799 }, { "epoch": 0.23412350014632718, "grad_norm": 0.008441613055765629, "learning_rate": 4.414691249634183e-05, "loss": 0.0003, "step": 800 }, { "epoch": 0.2344161545215101, "grad_norm": 0.048100389540195465, "learning_rate": 4.4139596136962255e-05, "loss": 0.0008, "step": 801 }, { "epoch": 0.23470880889669302, "grad_norm": 6.122220516204834, "learning_rate": 4.4132279777582676e-05, "loss": 0.0348, "step": 802 }, { "epoch": 0.2350014632718759, "grad_norm": 0.006385037675499916, "learning_rate": 4.4124963418203104e-05, "loss": 0.0002, "step": 803 }, { "epoch": 0.23529411764705882, "grad_norm": 0.019083580002188683, "learning_rate": 4.411764705882353e-05, "loss": 0.0003, "step": 804 }, { "epoch": 0.23558677202224174, "grad_norm": 0.01814999058842659, "learning_rate": 4.411033069944396e-05, "loss": 0.0004, "step": 805 }, { "epoch": 0.23587942639742465, "grad_norm": 5.577958583831787, "learning_rate": 4.410301434006439e-05, "loss": 0.0333, "step": 806 }, { "epoch": 0.23617208077260754, "grad_norm": 2.3570194244384766, "learning_rate": 4.4095697980684816e-05, "loss": 0.0099, "step": 807 }, { "epoch": 0.23646473514779046, "grad_norm": 0.007592096459120512, "learning_rate": 4.4088381621305244e-05, "loss": 0.0002, "step": 808 }, { "epoch": 0.23675738952297337, "grad_norm": 1.1854114532470703, "learning_rate": 4.408106526192567e-05, "loss": 0.0053, "step": 809 }, { "epoch": 0.2370500438981563, "grad_norm": 0.4952651560306549, "learning_rate": 4.40737489025461e-05, "loss": 0.0022, "step": 810 }, { "epoch": 0.23734269827333918, "grad_norm": 0.014262191019952297, "learning_rate": 4.406643254316653e-05, "loss": 0.0003, "step": 811 }, { "epoch": 0.2376353526485221, "grad_norm": 0.02860407717525959, "learning_rate": 4.405911618378695e-05, "loss": 0.0005, "step": 812 }, { "epoch": 0.237928007023705, "grad_norm": 1.1772172451019287, "learning_rate": 4.405179982440738e-05, "loss": 0.0047, "step": 813 }, { "epoch": 0.23822066139888792, "grad_norm": 4.126893043518066, "learning_rate": 4.4044483465027805e-05, "loss": 0.0133, "step": 814 }, { "epoch": 0.2385133157740708, "grad_norm": 4.438233852386475, "learning_rate": 4.403716710564823e-05, "loss": 0.1691, "step": 815 }, { "epoch": 0.23880597014925373, "grad_norm": 1.1588010787963867, "learning_rate": 4.402985074626866e-05, "loss": 0.0035, "step": 816 }, { "epoch": 0.23909862452443664, "grad_norm": 0.06502325087785721, "learning_rate": 4.402253438688909e-05, "loss": 0.0006, "step": 817 }, { "epoch": 0.23939127889961956, "grad_norm": 0.0038198577240109444, "learning_rate": 4.401521802750952e-05, "loss": 0.0001, "step": 818 }, { "epoch": 0.23968393327480245, "grad_norm": 0.004416614770889282, "learning_rate": 4.4007901668129945e-05, "loss": 0.0001, "step": 819 }, { "epoch": 0.23997658764998536, "grad_norm": 3.8089776039123535, "learning_rate": 4.400058530875037e-05, "loss": 0.1348, "step": 820 }, { "epoch": 0.24026924202516828, "grad_norm": 0.49052879214286804, "learning_rate": 4.3993268949370794e-05, "loss": 0.0011, "step": 821 }, { "epoch": 0.2405618964003512, "grad_norm": 7.548944473266602, "learning_rate": 4.398595258999122e-05, "loss": 0.058, "step": 822 }, { "epoch": 0.24085455077553408, "grad_norm": 0.008259841240942478, "learning_rate": 4.397863623061165e-05, "loss": 0.0002, "step": 823 }, { "epoch": 0.241147205150717, "grad_norm": 0.049051620066165924, "learning_rate": 4.397131987123208e-05, "loss": 0.0008, "step": 824 }, { "epoch": 0.2414398595258999, "grad_norm": 0.03358924388885498, "learning_rate": 4.3964003511852506e-05, "loss": 0.0007, "step": 825 }, { "epoch": 0.24173251390108283, "grad_norm": 0.38548314571380615, "learning_rate": 4.3956687152472934e-05, "loss": 0.0027, "step": 826 }, { "epoch": 0.24202516827626572, "grad_norm": 0.1643989384174347, "learning_rate": 4.394937079309336e-05, "loss": 0.0023, "step": 827 }, { "epoch": 0.24231782265144863, "grad_norm": 0.02894510142505169, "learning_rate": 4.394205443371379e-05, "loss": 0.0005, "step": 828 }, { "epoch": 0.24261047702663155, "grad_norm": 0.06682289391756058, "learning_rate": 4.393473807433421e-05, "loss": 0.0009, "step": 829 }, { "epoch": 0.24290313140181446, "grad_norm": 1.2807080745697021, "learning_rate": 4.392742171495464e-05, "loss": 0.0079, "step": 830 }, { "epoch": 0.24319578577699735, "grad_norm": 0.08425843715667725, "learning_rate": 4.392010535557507e-05, "loss": 0.0005, "step": 831 }, { "epoch": 0.24348844015218027, "grad_norm": 0.10579044371843338, "learning_rate": 4.3912788996195495e-05, "loss": 0.0011, "step": 832 }, { "epoch": 0.24378109452736318, "grad_norm": 0.05214914306998253, "learning_rate": 4.390547263681592e-05, "loss": 0.0006, "step": 833 }, { "epoch": 0.2440737489025461, "grad_norm": 0.014113985002040863, "learning_rate": 4.389815627743635e-05, "loss": 0.0003, "step": 834 }, { "epoch": 0.244366403277729, "grad_norm": 0.004227771423757076, "learning_rate": 4.389083991805678e-05, "loss": 0.0001, "step": 835 }, { "epoch": 0.2446590576529119, "grad_norm": 0.005914884619414806, "learning_rate": 4.3883523558677206e-05, "loss": 0.0001, "step": 836 }, { "epoch": 0.24495171202809482, "grad_norm": 0.004017166327685118, "learning_rate": 4.387620719929763e-05, "loss": 0.0001, "step": 837 }, { "epoch": 0.24524436640327774, "grad_norm": 0.003356066532433033, "learning_rate": 4.3868890839918056e-05, "loss": 0.0001, "step": 838 }, { "epoch": 0.24553702077846065, "grad_norm": 2.8616511821746826, "learning_rate": 4.3861574480538483e-05, "loss": 0.0039, "step": 839 }, { "epoch": 0.24582967515364354, "grad_norm": 0.2694534957408905, "learning_rate": 4.385425812115891e-05, "loss": 0.001, "step": 840 }, { "epoch": 0.24612232952882646, "grad_norm": 0.003960182890295982, "learning_rate": 4.384694176177934e-05, "loss": 0.0001, "step": 841 }, { "epoch": 0.24641498390400937, "grad_norm": 0.1128448098897934, "learning_rate": 4.383962540239977e-05, "loss": 0.0004, "step": 842 }, { "epoch": 0.2467076382791923, "grad_norm": 0.0031736004166305065, "learning_rate": 4.3832309043020195e-05, "loss": 0.0001, "step": 843 }, { "epoch": 0.24700029265437518, "grad_norm": 0.011300680227577686, "learning_rate": 4.382499268364062e-05, "loss": 0.0002, "step": 844 }, { "epoch": 0.2472929470295581, "grad_norm": 0.006444556172937155, "learning_rate": 4.3817676324261044e-05, "loss": 0.0001, "step": 845 }, { "epoch": 0.247585601404741, "grad_norm": 0.07491226494312286, "learning_rate": 4.381035996488147e-05, "loss": 0.0006, "step": 846 }, { "epoch": 0.24787825577992392, "grad_norm": 0.0015734657645225525, "learning_rate": 4.38030436055019e-05, "loss": 0.0, "step": 847 }, { "epoch": 0.2481709101551068, "grad_norm": 3.5701217651367188, "learning_rate": 4.379572724612233e-05, "loss": 0.2921, "step": 848 }, { "epoch": 0.24846356453028973, "grad_norm": 0.04214177653193474, "learning_rate": 4.3788410886742756e-05, "loss": 0.0003, "step": 849 }, { "epoch": 0.24875621890547264, "grad_norm": 12.9547119140625, "learning_rate": 4.3781094527363184e-05, "loss": 0.0718, "step": 850 }, { "epoch": 0.24904887328065556, "grad_norm": 0.034693583846092224, "learning_rate": 4.377377816798361e-05, "loss": 0.0003, "step": 851 }, { "epoch": 0.24934152765583845, "grad_norm": 0.028678277507424355, "learning_rate": 4.376646180860404e-05, "loss": 0.0004, "step": 852 }, { "epoch": 0.24963418203102136, "grad_norm": 0.05104019492864609, "learning_rate": 4.375914544922447e-05, "loss": 0.0006, "step": 853 }, { "epoch": 0.24992683640620428, "grad_norm": 0.3913678824901581, "learning_rate": 4.3751829089844896e-05, "loss": 0.0018, "step": 854 }, { "epoch": 0.25021949078138717, "grad_norm": 0.0270369965583086, "learning_rate": 4.374451273046532e-05, "loss": 0.0004, "step": 855 }, { "epoch": 0.2505121451565701, "grad_norm": 0.012402846477925777, "learning_rate": 4.3737196371085745e-05, "loss": 0.0002, "step": 856 }, { "epoch": 0.250804799531753, "grad_norm": 0.015872538089752197, "learning_rate": 4.372988001170617e-05, "loss": 0.0003, "step": 857 }, { "epoch": 0.2510974539069359, "grad_norm": 0.007675354368984699, "learning_rate": 4.37225636523266e-05, "loss": 0.0001, "step": 858 }, { "epoch": 0.25139010828211883, "grad_norm": 0.005604118574410677, "learning_rate": 4.371524729294703e-05, "loss": 0.0001, "step": 859 }, { "epoch": 0.2516827626573017, "grad_norm": 0.011989112012088299, "learning_rate": 4.370793093356746e-05, "loss": 0.0003, "step": 860 }, { "epoch": 0.25197541703248466, "grad_norm": 0.005313977133482695, "learning_rate": 4.3700614574187885e-05, "loss": 0.0001, "step": 861 }, { "epoch": 0.25226807140766755, "grad_norm": 0.022295720875263214, "learning_rate": 4.369329821480831e-05, "loss": 0.0004, "step": 862 }, { "epoch": 0.25256072578285044, "grad_norm": 0.0032608299516141415, "learning_rate": 4.368598185542874e-05, "loss": 0.0001, "step": 863 }, { "epoch": 0.2528533801580334, "grad_norm": 13.01710033416748, "learning_rate": 4.367866549604917e-05, "loss": 0.0695, "step": 864 }, { "epoch": 0.25314603453321627, "grad_norm": 0.022410310804843903, "learning_rate": 4.367134913666959e-05, "loss": 0.0002, "step": 865 }, { "epoch": 0.25343868890839916, "grad_norm": 0.008261977694928646, "learning_rate": 4.366403277729002e-05, "loss": 0.0002, "step": 866 }, { "epoch": 0.2537313432835821, "grad_norm": 0.1107669249176979, "learning_rate": 4.3656716417910446e-05, "loss": 0.0005, "step": 867 }, { "epoch": 0.254023997658765, "grad_norm": 0.01423464436084032, "learning_rate": 4.3649400058530874e-05, "loss": 0.0003, "step": 868 }, { "epoch": 0.25431665203394793, "grad_norm": 0.015357548370957375, "learning_rate": 4.36420836991513e-05, "loss": 0.0003, "step": 869 }, { "epoch": 0.2546093064091308, "grad_norm": 0.006338243838399649, "learning_rate": 4.363476733977173e-05, "loss": 0.0001, "step": 870 }, { "epoch": 0.2549019607843137, "grad_norm": 0.012789924629032612, "learning_rate": 4.362745098039216e-05, "loss": 0.0002, "step": 871 }, { "epoch": 0.25519461515949665, "grad_norm": 0.006994856055825949, "learning_rate": 4.3620134621012586e-05, "loss": 0.0001, "step": 872 }, { "epoch": 0.25548726953467954, "grad_norm": 6.519273281097412, "learning_rate": 4.3612818261633013e-05, "loss": 0.1762, "step": 873 }, { "epoch": 0.25577992390986243, "grad_norm": 10.568449974060059, "learning_rate": 4.360550190225344e-05, "loss": 0.0666, "step": 874 }, { "epoch": 0.25607257828504537, "grad_norm": 0.27735745906829834, "learning_rate": 4.359818554287387e-05, "loss": 0.0008, "step": 875 }, { "epoch": 0.25636523266022826, "grad_norm": 1.7639248371124268, "learning_rate": 4.359086918349429e-05, "loss": 0.0049, "step": 876 }, { "epoch": 0.2566578870354112, "grad_norm": 0.015343848615884781, "learning_rate": 4.358355282411472e-05, "loss": 0.0001, "step": 877 }, { "epoch": 0.2569505414105941, "grad_norm": 0.010843094438314438, "learning_rate": 4.3576236464735146e-05, "loss": 0.0001, "step": 878 }, { "epoch": 0.257243195785777, "grad_norm": 1.0062090158462524, "learning_rate": 4.3568920105355574e-05, "loss": 0.0035, "step": 879 }, { "epoch": 0.2575358501609599, "grad_norm": 0.0034330443013459444, "learning_rate": 4.3561603745976e-05, "loss": 0.0001, "step": 880 }, { "epoch": 0.2578285045361428, "grad_norm": 0.02194824256002903, "learning_rate": 4.355428738659643e-05, "loss": 0.0002, "step": 881 }, { "epoch": 0.2581211589113257, "grad_norm": 0.9777836203575134, "learning_rate": 4.354697102721686e-05, "loss": 0.0039, "step": 882 }, { "epoch": 0.25841381328650864, "grad_norm": 0.003030191408470273, "learning_rate": 4.3539654667837286e-05, "loss": 0.0001, "step": 883 }, { "epoch": 0.25870646766169153, "grad_norm": 0.0041368212550878525, "learning_rate": 4.3532338308457714e-05, "loss": 0.0001, "step": 884 }, { "epoch": 0.2589991220368745, "grad_norm": 0.011940663680434227, "learning_rate": 4.352502194907814e-05, "loss": 0.0002, "step": 885 }, { "epoch": 0.25929177641205736, "grad_norm": 0.0035515271592885256, "learning_rate": 4.351770558969857e-05, "loss": 0.0001, "step": 886 }, { "epoch": 0.25958443078724025, "grad_norm": 5.958016395568848, "learning_rate": 4.351038923031899e-05, "loss": 0.1927, "step": 887 }, { "epoch": 0.2598770851624232, "grad_norm": 0.012839854694902897, "learning_rate": 4.350307287093942e-05, "loss": 0.0002, "step": 888 }, { "epoch": 0.2601697395376061, "grad_norm": 0.02360195852816105, "learning_rate": 4.349575651155985e-05, "loss": 0.0004, "step": 889 }, { "epoch": 0.26046239391278897, "grad_norm": 0.008355548605322838, "learning_rate": 4.3488440152180275e-05, "loss": 0.0002, "step": 890 }, { "epoch": 0.2607550482879719, "grad_norm": 0.010975964367389679, "learning_rate": 4.34811237928007e-05, "loss": 0.0002, "step": 891 }, { "epoch": 0.2610477026631548, "grad_norm": 0.005606422666460276, "learning_rate": 4.347380743342113e-05, "loss": 0.0001, "step": 892 }, { "epoch": 0.26134035703833774, "grad_norm": 0.0077472287230193615, "learning_rate": 4.346649107404156e-05, "loss": 0.0001, "step": 893 }, { "epoch": 0.26163301141352063, "grad_norm": 0.016559531912207603, "learning_rate": 4.345917471466199e-05, "loss": 0.0002, "step": 894 }, { "epoch": 0.2619256657887035, "grad_norm": 0.005465401802212, "learning_rate": 4.3451858355282415e-05, "loss": 0.0001, "step": 895 }, { "epoch": 0.26221832016388646, "grad_norm": 0.011149146594107151, "learning_rate": 4.344454199590284e-05, "loss": 0.0001, "step": 896 }, { "epoch": 0.26251097453906935, "grad_norm": 0.020837359130382538, "learning_rate": 4.3437225636523264e-05, "loss": 0.0002, "step": 897 }, { "epoch": 0.26280362891425224, "grad_norm": 9.050426483154297, "learning_rate": 4.342990927714369e-05, "loss": 0.024, "step": 898 }, { "epoch": 0.2630962832894352, "grad_norm": 0.4160962700843811, "learning_rate": 4.342259291776412e-05, "loss": 0.0014, "step": 899 }, { "epoch": 0.2633889376646181, "grad_norm": 0.048738472163677216, "learning_rate": 4.341527655838455e-05, "loss": 0.0004, "step": 900 }, { "epoch": 0.263681592039801, "grad_norm": 0.007950617000460625, "learning_rate": 4.3407960199004976e-05, "loss": 0.0002, "step": 901 }, { "epoch": 0.2639742464149839, "grad_norm": 0.036539409309625626, "learning_rate": 4.3400643839625404e-05, "loss": 0.0004, "step": 902 }, { "epoch": 0.2642669007901668, "grad_norm": 0.34213629364967346, "learning_rate": 4.339332748024583e-05, "loss": 0.0013, "step": 903 }, { "epoch": 0.26455955516534974, "grad_norm": 12.875584602355957, "learning_rate": 4.338601112086626e-05, "loss": 0.0282, "step": 904 }, { "epoch": 0.2648522095405326, "grad_norm": 1.863190770149231, "learning_rate": 4.337869476148669e-05, "loss": 0.0067, "step": 905 }, { "epoch": 0.26514486391571557, "grad_norm": 0.0017428912688046694, "learning_rate": 4.3371378402107115e-05, "loss": 0.0, "step": 906 }, { "epoch": 0.26543751829089846, "grad_norm": 0.0031504349317401648, "learning_rate": 4.3364062042727543e-05, "loss": 0.0, "step": 907 }, { "epoch": 0.26573017266608134, "grad_norm": 0.04455633834004402, "learning_rate": 4.3356745683347965e-05, "loss": 0.0003, "step": 908 }, { "epoch": 0.2660228270412643, "grad_norm": 0.27286088466644287, "learning_rate": 4.334942932396839e-05, "loss": 0.0018, "step": 909 }, { "epoch": 0.2663154814164472, "grad_norm": 6.610106468200684, "learning_rate": 4.334211296458882e-05, "loss": 0.045, "step": 910 }, { "epoch": 0.26660813579163006, "grad_norm": 0.0290259700268507, "learning_rate": 4.333479660520925e-05, "loss": 0.0003, "step": 911 }, { "epoch": 0.266900790166813, "grad_norm": 0.006859095301479101, "learning_rate": 4.3327480245829676e-05, "loss": 0.0001, "step": 912 }, { "epoch": 0.2671934445419959, "grad_norm": 0.007041125558316708, "learning_rate": 4.3320163886450104e-05, "loss": 0.0001, "step": 913 }, { "epoch": 0.26748609891717884, "grad_norm": 0.001733655110001564, "learning_rate": 4.331284752707053e-05, "loss": 0.0, "step": 914 }, { "epoch": 0.2677787532923617, "grad_norm": 0.005434884224087, "learning_rate": 4.330553116769096e-05, "loss": 0.0001, "step": 915 }, { "epoch": 0.2680714076675446, "grad_norm": 16.0704345703125, "learning_rate": 4.329821480831139e-05, "loss": 0.0313, "step": 916 }, { "epoch": 0.26836406204272756, "grad_norm": 0.001067645032890141, "learning_rate": 4.3290898448931816e-05, "loss": 0.0, "step": 917 }, { "epoch": 0.26865671641791045, "grad_norm": 0.0016751672374084592, "learning_rate": 4.328358208955224e-05, "loss": 0.0, "step": 918 }, { "epoch": 0.26894937079309333, "grad_norm": 0.01004913542419672, "learning_rate": 4.3276265730172665e-05, "loss": 0.0001, "step": 919 }, { "epoch": 0.2692420251682763, "grad_norm": 0.004952297545969486, "learning_rate": 4.326894937079309e-05, "loss": 0.0001, "step": 920 }, { "epoch": 0.26953467954345917, "grad_norm": 14.358283996582031, "learning_rate": 4.326163301141352e-05, "loss": 0.1519, "step": 921 }, { "epoch": 0.2698273339186421, "grad_norm": 12.556432723999023, "learning_rate": 4.325431665203395e-05, "loss": 0.0425, "step": 922 }, { "epoch": 0.270119988293825, "grad_norm": 0.0035831343848258257, "learning_rate": 4.324700029265438e-05, "loss": 0.0001, "step": 923 }, { "epoch": 0.2704126426690079, "grad_norm": 0.002111797221004963, "learning_rate": 4.3239683933274805e-05, "loss": 0.0, "step": 924 }, { "epoch": 0.27070529704419083, "grad_norm": 0.011011890135705471, "learning_rate": 4.323236757389523e-05, "loss": 0.0001, "step": 925 }, { "epoch": 0.2709979514193737, "grad_norm": 0.0017403267556801438, "learning_rate": 4.322505121451566e-05, "loss": 0.0, "step": 926 }, { "epoch": 0.2712906057945566, "grad_norm": 0.041412338614463806, "learning_rate": 4.321773485513609e-05, "loss": 0.0002, "step": 927 }, { "epoch": 0.27158326016973955, "grad_norm": 0.014695284888148308, "learning_rate": 4.321041849575652e-05, "loss": 0.0002, "step": 928 }, { "epoch": 0.27187591454492244, "grad_norm": 11.386469841003418, "learning_rate": 4.320310213637694e-05, "loss": 0.0317, "step": 929 }, { "epoch": 0.2721685689201054, "grad_norm": 3.229830503463745, "learning_rate": 4.3195785776997366e-05, "loss": 0.3326, "step": 930 }, { "epoch": 0.27246122329528827, "grad_norm": 0.0017760074697434902, "learning_rate": 4.3188469417617794e-05, "loss": 0.0, "step": 931 }, { "epoch": 0.27275387767047116, "grad_norm": 0.014468281529843807, "learning_rate": 4.318115305823822e-05, "loss": 0.0001, "step": 932 }, { "epoch": 0.2730465320456541, "grad_norm": 0.0017373122973367572, "learning_rate": 4.317383669885865e-05, "loss": 0.0, "step": 933 }, { "epoch": 0.273339186420837, "grad_norm": 0.008035775274038315, "learning_rate": 4.316652033947908e-05, "loss": 0.0001, "step": 934 }, { "epoch": 0.2736318407960199, "grad_norm": 0.045467838644981384, "learning_rate": 4.3159203980099506e-05, "loss": 0.0003, "step": 935 }, { "epoch": 0.2739244951712028, "grad_norm": 0.23960766196250916, "learning_rate": 4.3151887620719934e-05, "loss": 0.0009, "step": 936 }, { "epoch": 0.2742171495463857, "grad_norm": 0.42729949951171875, "learning_rate": 4.314457126134036e-05, "loss": 0.001, "step": 937 }, { "epoch": 0.27450980392156865, "grad_norm": 3.776639699935913, "learning_rate": 4.313725490196079e-05, "loss": 0.0164, "step": 938 }, { "epoch": 0.27480245829675154, "grad_norm": 0.010938864201307297, "learning_rate": 4.312993854258122e-05, "loss": 0.0002, "step": 939 }, { "epoch": 0.2750951126719344, "grad_norm": 0.025770241394639015, "learning_rate": 4.312262218320164e-05, "loss": 0.0002, "step": 940 }, { "epoch": 0.27538776704711737, "grad_norm": 0.03667419031262398, "learning_rate": 4.311530582382207e-05, "loss": 0.0003, "step": 941 }, { "epoch": 0.27568042142230026, "grad_norm": 5.51930570602417, "learning_rate": 4.3107989464442495e-05, "loss": 0.1458, "step": 942 }, { "epoch": 0.27597307579748315, "grad_norm": 5.226438045501709, "learning_rate": 4.310067310506292e-05, "loss": 0.0088, "step": 943 }, { "epoch": 0.2762657301726661, "grad_norm": 0.23687642812728882, "learning_rate": 4.309335674568335e-05, "loss": 0.0007, "step": 944 }, { "epoch": 0.276558384547849, "grad_norm": 0.021837415173649788, "learning_rate": 4.308604038630378e-05, "loss": 0.0004, "step": 945 }, { "epoch": 0.2768510389230319, "grad_norm": 2.7103466987609863, "learning_rate": 4.3078724026924206e-05, "loss": 0.0068, "step": 946 }, { "epoch": 0.2771436932982148, "grad_norm": 12.31233024597168, "learning_rate": 4.3071407667544634e-05, "loss": 0.0343, "step": 947 }, { "epoch": 0.2774363476733977, "grad_norm": 4.28306770324707, "learning_rate": 4.306409130816506e-05, "loss": 0.0128, "step": 948 }, { "epoch": 0.27772900204858064, "grad_norm": 0.038289640098810196, "learning_rate": 4.305677494878549e-05, "loss": 0.0008, "step": 949 }, { "epoch": 0.27802165642376353, "grad_norm": 0.031705744564533234, "learning_rate": 4.304945858940591e-05, "loss": 0.0006, "step": 950 }, { "epoch": 0.2783143107989464, "grad_norm": 0.027836626395583153, "learning_rate": 4.304214223002634e-05, "loss": 0.0006, "step": 951 }, { "epoch": 0.27860696517412936, "grad_norm": 0.013349486514925957, "learning_rate": 4.303482587064677e-05, "loss": 0.0003, "step": 952 }, { "epoch": 0.27889961954931225, "grad_norm": 0.010819119401276112, "learning_rate": 4.3027509511267195e-05, "loss": 0.0002, "step": 953 }, { "epoch": 0.2791922739244952, "grad_norm": 0.011443225666880608, "learning_rate": 4.302019315188762e-05, "loss": 0.0002, "step": 954 }, { "epoch": 0.2794849282996781, "grad_norm": 8.645218849182129, "learning_rate": 4.301287679250805e-05, "loss": 0.0937, "step": 955 }, { "epoch": 0.27977758267486097, "grad_norm": 0.011645263060927391, "learning_rate": 4.300556043312848e-05, "loss": 0.0002, "step": 956 }, { "epoch": 0.2800702370500439, "grad_norm": 0.012442460283637047, "learning_rate": 4.299824407374891e-05, "loss": 0.0002, "step": 957 }, { "epoch": 0.2803628914252268, "grad_norm": 24.188045501708984, "learning_rate": 4.2990927714369335e-05, "loss": 0.0763, "step": 958 }, { "epoch": 0.2806555458004097, "grad_norm": 0.016478197649121284, "learning_rate": 4.298361135498976e-05, "loss": 0.0003, "step": 959 }, { "epoch": 0.28094820017559263, "grad_norm": 0.04385501518845558, "learning_rate": 4.297629499561019e-05, "loss": 0.0005, "step": 960 }, { "epoch": 0.2812408545507755, "grad_norm": 0.009579029865562916, "learning_rate": 4.296897863623061e-05, "loss": 0.0002, "step": 961 }, { "epoch": 0.28153350892595846, "grad_norm": 0.009482062421739101, "learning_rate": 4.296166227685104e-05, "loss": 0.0002, "step": 962 }, { "epoch": 0.28182616330114135, "grad_norm": 0.018784578889608383, "learning_rate": 4.295434591747147e-05, "loss": 0.0003, "step": 963 }, { "epoch": 0.28211881767632424, "grad_norm": 0.013786012306809425, "learning_rate": 4.2947029558091896e-05, "loss": 0.0003, "step": 964 }, { "epoch": 0.2824114720515072, "grad_norm": 0.07157467305660248, "learning_rate": 4.2939713198712324e-05, "loss": 0.0011, "step": 965 }, { "epoch": 0.28270412642669007, "grad_norm": 0.007835990749299526, "learning_rate": 4.293239683933275e-05, "loss": 0.0002, "step": 966 }, { "epoch": 0.28299678080187296, "grad_norm": 0.01569865457713604, "learning_rate": 4.292508047995318e-05, "loss": 0.0003, "step": 967 }, { "epoch": 0.2832894351770559, "grad_norm": 0.013751539401710033, "learning_rate": 4.291776412057361e-05, "loss": 0.0003, "step": 968 }, { "epoch": 0.2835820895522388, "grad_norm": 0.01229290384799242, "learning_rate": 4.2910447761194036e-05, "loss": 0.0002, "step": 969 }, { "epoch": 0.28387474392742174, "grad_norm": 0.012345947325229645, "learning_rate": 4.2903131401814464e-05, "loss": 0.0003, "step": 970 }, { "epoch": 0.2841673983026046, "grad_norm": 0.010408922098577023, "learning_rate": 4.2895815042434885e-05, "loss": 0.0002, "step": 971 }, { "epoch": 0.2844600526777875, "grad_norm": 0.00517492787912488, "learning_rate": 4.288849868305531e-05, "loss": 0.0001, "step": 972 }, { "epoch": 0.28475270705297046, "grad_norm": 0.02397426962852478, "learning_rate": 4.288118232367574e-05, "loss": 0.0004, "step": 973 }, { "epoch": 0.28504536142815334, "grad_norm": 0.010111522860825062, "learning_rate": 4.287386596429617e-05, "loss": 0.0002, "step": 974 }, { "epoch": 0.2853380158033363, "grad_norm": 5.845407962799072, "learning_rate": 4.28665496049166e-05, "loss": 0.0151, "step": 975 }, { "epoch": 0.2856306701785192, "grad_norm": 0.1198161393404007, "learning_rate": 4.2859233245537025e-05, "loss": 0.0008, "step": 976 }, { "epoch": 0.28592332455370206, "grad_norm": 0.01586102321743965, "learning_rate": 4.285191688615745e-05, "loss": 0.0003, "step": 977 }, { "epoch": 0.286215978928885, "grad_norm": 6.835101127624512, "learning_rate": 4.284460052677788e-05, "loss": 0.1479, "step": 978 }, { "epoch": 0.2865086333040679, "grad_norm": 0.00831079576164484, "learning_rate": 4.283728416739831e-05, "loss": 0.0001, "step": 979 }, { "epoch": 0.2868012876792508, "grad_norm": 0.006643155124038458, "learning_rate": 4.2829967808018736e-05, "loss": 0.0001, "step": 980 }, { "epoch": 0.2870939420544337, "grad_norm": 0.003035582136362791, "learning_rate": 4.2822651448639164e-05, "loss": 0.0001, "step": 981 }, { "epoch": 0.2873865964296166, "grad_norm": 3.952209234237671, "learning_rate": 4.2815335089259586e-05, "loss": 0.0115, "step": 982 }, { "epoch": 0.28767925080479956, "grad_norm": 0.01735353283584118, "learning_rate": 4.2808018729880013e-05, "loss": 0.0002, "step": 983 }, { "epoch": 0.28797190517998245, "grad_norm": 0.004753570072352886, "learning_rate": 4.280070237050044e-05, "loss": 0.0001, "step": 984 }, { "epoch": 0.28826455955516533, "grad_norm": 0.003133102785795927, "learning_rate": 4.279338601112087e-05, "loss": 0.0001, "step": 985 }, { "epoch": 0.2885572139303483, "grad_norm": 0.0429200679063797, "learning_rate": 4.27860696517413e-05, "loss": 0.0003, "step": 986 }, { "epoch": 0.28884986830553117, "grad_norm": 0.006937104742974043, "learning_rate": 4.2778753292361725e-05, "loss": 0.0001, "step": 987 }, { "epoch": 0.28914252268071405, "grad_norm": 0.004405143670737743, "learning_rate": 4.277143693298215e-05, "loss": 0.0001, "step": 988 }, { "epoch": 0.289435177055897, "grad_norm": 0.04413849487900734, "learning_rate": 4.276412057360258e-05, "loss": 0.0002, "step": 989 }, { "epoch": 0.2897278314310799, "grad_norm": 0.0017203919123858213, "learning_rate": 4.275680421422301e-05, "loss": 0.0, "step": 990 }, { "epoch": 0.29002048580626283, "grad_norm": 0.005167375318706036, "learning_rate": 4.274948785484344e-05, "loss": 0.0001, "step": 991 }, { "epoch": 0.2903131401814457, "grad_norm": 0.005177373066544533, "learning_rate": 4.2742171495463865e-05, "loss": 0.0001, "step": 992 }, { "epoch": 0.2906057945566286, "grad_norm": 0.0017759123584255576, "learning_rate": 4.2734855136084286e-05, "loss": 0.0, "step": 993 }, { "epoch": 0.29089844893181155, "grad_norm": 3.8305413722991943, "learning_rate": 4.2727538776704714e-05, "loss": 0.16, "step": 994 }, { "epoch": 0.29119110330699444, "grad_norm": 0.002959401113912463, "learning_rate": 4.272022241732514e-05, "loss": 0.0001, "step": 995 }, { "epoch": 0.2914837576821773, "grad_norm": 7.244873046875, "learning_rate": 4.271290605794557e-05, "loss": 0.2784, "step": 996 }, { "epoch": 0.29177641205736027, "grad_norm": 0.01972855255007744, "learning_rate": 4.2705589698566e-05, "loss": 0.0003, "step": 997 }, { "epoch": 0.29206906643254316, "grad_norm": 0.026749806478619576, "learning_rate": 4.2698273339186426e-05, "loss": 0.0003, "step": 998 }, { "epoch": 0.2923617208077261, "grad_norm": 0.015836404636502266, "learning_rate": 4.2690956979806854e-05, "loss": 0.0003, "step": 999 }, { "epoch": 0.292654375182909, "grad_norm": 0.022398140281438828, "learning_rate": 4.268364062042728e-05, "loss": 0.0004, "step": 1000 }, { "epoch": 0.2929470295580919, "grad_norm": 0.10159856826066971, "learning_rate": 4.267632426104771e-05, "loss": 0.0015, "step": 1001 }, { "epoch": 0.2932396839332748, "grad_norm": 13.095772743225098, "learning_rate": 4.266900790166813e-05, "loss": 0.1974, "step": 1002 }, { "epoch": 0.2935323383084577, "grad_norm": 0.294485867023468, "learning_rate": 4.266169154228856e-05, "loss": 0.0034, "step": 1003 }, { "epoch": 0.2938249926836406, "grad_norm": 0.6832722425460815, "learning_rate": 4.265437518290899e-05, "loss": 0.0056, "step": 1004 }, { "epoch": 0.29411764705882354, "grad_norm": 1.7126188278198242, "learning_rate": 4.2647058823529415e-05, "loss": 0.0082, "step": 1005 }, { "epoch": 0.2944103014340064, "grad_norm": 0.010100563988089561, "learning_rate": 4.263974246414984e-05, "loss": 0.0002, "step": 1006 }, { "epoch": 0.29470295580918937, "grad_norm": 0.010684849694371223, "learning_rate": 4.263242610477027e-05, "loss": 0.0002, "step": 1007 }, { "epoch": 0.29499561018437226, "grad_norm": 0.06480780988931656, "learning_rate": 4.26251097453907e-05, "loss": 0.0006, "step": 1008 }, { "epoch": 0.29528826455955515, "grad_norm": 0.40292733907699585, "learning_rate": 4.2617793386011127e-05, "loss": 0.0014, "step": 1009 }, { "epoch": 0.2955809189347381, "grad_norm": 5.591978073120117, "learning_rate": 4.261047702663155e-05, "loss": 0.0735, "step": 1010 }, { "epoch": 0.295873573309921, "grad_norm": 0.004439515061676502, "learning_rate": 4.2603160667251976e-05, "loss": 0.0001, "step": 1011 }, { "epoch": 0.29616622768510387, "grad_norm": 0.2625431418418884, "learning_rate": 4.2595844307872404e-05, "loss": 0.0013, "step": 1012 }, { "epoch": 0.2964588820602868, "grad_norm": 0.8707738518714905, "learning_rate": 4.258852794849283e-05, "loss": 0.0023, "step": 1013 }, { "epoch": 0.2967515364354697, "grad_norm": 0.00825866125524044, "learning_rate": 4.258121158911326e-05, "loss": 0.0002, "step": 1014 }, { "epoch": 0.29704419081065264, "grad_norm": 0.011433348059654236, "learning_rate": 4.257389522973369e-05, "loss": 0.0002, "step": 1015 }, { "epoch": 0.29733684518583553, "grad_norm": 0.005431619007140398, "learning_rate": 4.2566578870354115e-05, "loss": 0.0001, "step": 1016 }, { "epoch": 0.2976294995610184, "grad_norm": 0.006892085541039705, "learning_rate": 4.2559262510974543e-05, "loss": 0.0001, "step": 1017 }, { "epoch": 0.29792215393620136, "grad_norm": 0.016305092722177505, "learning_rate": 4.255194615159497e-05, "loss": 0.0002, "step": 1018 }, { "epoch": 0.29821480831138425, "grad_norm": 0.015026643872261047, "learning_rate": 4.254462979221539e-05, "loss": 0.0002, "step": 1019 }, { "epoch": 0.29850746268656714, "grad_norm": 0.025266500189900398, "learning_rate": 4.253731343283582e-05, "loss": 0.0004, "step": 1020 }, { "epoch": 0.2988001170617501, "grad_norm": 0.014823542907834053, "learning_rate": 4.252999707345625e-05, "loss": 0.0003, "step": 1021 }, { "epoch": 0.29909277143693297, "grad_norm": 0.04029303789138794, "learning_rate": 4.2522680714076676e-05, "loss": 0.0004, "step": 1022 }, { "epoch": 0.2993854258121159, "grad_norm": 3.154346466064453, "learning_rate": 4.2515364354697104e-05, "loss": 0.1832, "step": 1023 }, { "epoch": 0.2996780801872988, "grad_norm": 0.040032967925071716, "learning_rate": 4.250804799531753e-05, "loss": 0.0006, "step": 1024 }, { "epoch": 0.2999707345624817, "grad_norm": 7.443057537078857, "learning_rate": 4.250073163593796e-05, "loss": 0.017, "step": 1025 }, { "epoch": 0.30026338893766463, "grad_norm": 0.3301943838596344, "learning_rate": 4.249341527655839e-05, "loss": 0.0031, "step": 1026 }, { "epoch": 0.3005560433128475, "grad_norm": 0.009938620030879974, "learning_rate": 4.248609891717881e-05, "loss": 0.0002, "step": 1027 }, { "epoch": 0.3008486976880304, "grad_norm": 0.0500522255897522, "learning_rate": 4.247878255779924e-05, "loss": 0.0007, "step": 1028 }, { "epoch": 0.30114135206321335, "grad_norm": 2.4152634143829346, "learning_rate": 4.2471466198419665e-05, "loss": 0.298, "step": 1029 }, { "epoch": 0.30143400643839624, "grad_norm": 1.904123067855835, "learning_rate": 4.246414983904009e-05, "loss": 0.0096, "step": 1030 }, { "epoch": 0.3017266608135792, "grad_norm": 0.4908471405506134, "learning_rate": 4.245683347966052e-05, "loss": 0.0031, "step": 1031 }, { "epoch": 0.30201931518876207, "grad_norm": 0.007599519100040197, "learning_rate": 4.244951712028095e-05, "loss": 0.0002, "step": 1032 }, { "epoch": 0.30231196956394496, "grad_norm": 8.015567779541016, "learning_rate": 4.244220076090138e-05, "loss": 0.0568, "step": 1033 }, { "epoch": 0.3026046239391279, "grad_norm": 0.13986808061599731, "learning_rate": 4.2434884401521805e-05, "loss": 0.0007, "step": 1034 }, { "epoch": 0.3028972783143108, "grad_norm": 0.00213990593329072, "learning_rate": 4.2427568042142226e-05, "loss": 0.0001, "step": 1035 }, { "epoch": 0.30318993268949374, "grad_norm": 0.9953876733779907, "learning_rate": 4.2420251682762654e-05, "loss": 0.0055, "step": 1036 }, { "epoch": 0.3034825870646766, "grad_norm": 0.012336323037743568, "learning_rate": 4.241293532338308e-05, "loss": 0.0002, "step": 1037 }, { "epoch": 0.3037752414398595, "grad_norm": 8.195555686950684, "learning_rate": 4.240561896400351e-05, "loss": 0.0533, "step": 1038 }, { "epoch": 0.30406789581504245, "grad_norm": 0.0049442751333117485, "learning_rate": 4.239830260462394e-05, "loss": 0.0001, "step": 1039 }, { "epoch": 0.30436055019022534, "grad_norm": 0.010625405237078667, "learning_rate": 4.2390986245244366e-05, "loss": 0.0003, "step": 1040 }, { "epoch": 0.30465320456540823, "grad_norm": 0.019566647708415985, "learning_rate": 4.2383669885864794e-05, "loss": 0.0003, "step": 1041 }, { "epoch": 0.3049458589405912, "grad_norm": 7.804142951965332, "learning_rate": 4.237635352648522e-05, "loss": 0.0354, "step": 1042 }, { "epoch": 0.30523851331577406, "grad_norm": 0.4666447639465332, "learning_rate": 4.236903716710565e-05, "loss": 0.0027, "step": 1043 }, { "epoch": 0.305531167690957, "grad_norm": 0.0783332884311676, "learning_rate": 4.236172080772608e-05, "loss": 0.0012, "step": 1044 }, { "epoch": 0.3058238220661399, "grad_norm": 9.876611709594727, "learning_rate": 4.2354404448346506e-05, "loss": 0.0625, "step": 1045 }, { "epoch": 0.3061164764413228, "grad_norm": 3.9588825702667236, "learning_rate": 4.234708808896693e-05, "loss": 0.0243, "step": 1046 }, { "epoch": 0.3064091308165057, "grad_norm": 0.024225052446126938, "learning_rate": 4.2339771729587355e-05, "loss": 0.0004, "step": 1047 }, { "epoch": 0.3067017851916886, "grad_norm": 0.15110638737678528, "learning_rate": 4.233245537020778e-05, "loss": 0.001, "step": 1048 }, { "epoch": 0.3069944395668715, "grad_norm": 0.0236615389585495, "learning_rate": 4.232513901082821e-05, "loss": 0.0004, "step": 1049 }, { "epoch": 0.30728709394205445, "grad_norm": 0.01637859269976616, "learning_rate": 4.231782265144864e-05, "loss": 0.0004, "step": 1050 }, { "epoch": 0.30757974831723733, "grad_norm": 0.015215440653264523, "learning_rate": 4.231050629206907e-05, "loss": 0.0003, "step": 1051 }, { "epoch": 0.3078724026924203, "grad_norm": 0.7368258833885193, "learning_rate": 4.2303189932689495e-05, "loss": 0.0038, "step": 1052 }, { "epoch": 0.30816505706760317, "grad_norm": 7.859990119934082, "learning_rate": 4.229587357330992e-05, "loss": 0.0521, "step": 1053 }, { "epoch": 0.30845771144278605, "grad_norm": 0.011727051809430122, "learning_rate": 4.228855721393035e-05, "loss": 0.0003, "step": 1054 }, { "epoch": 0.308750365817969, "grad_norm": 0.007139664608985186, "learning_rate": 4.228124085455078e-05, "loss": 0.0002, "step": 1055 }, { "epoch": 0.3090430201931519, "grad_norm": 1.1220675706863403, "learning_rate": 4.22739244951712e-05, "loss": 0.0031, "step": 1056 }, { "epoch": 0.3093356745683348, "grad_norm": 0.004913520999252796, "learning_rate": 4.226660813579163e-05, "loss": 0.0001, "step": 1057 }, { "epoch": 0.3096283289435177, "grad_norm": 0.047684524208307266, "learning_rate": 4.2259291776412056e-05, "loss": 0.0006, "step": 1058 }, { "epoch": 0.3099209833187006, "grad_norm": 0.07091566175222397, "learning_rate": 4.2251975417032483e-05, "loss": 0.0009, "step": 1059 }, { "epoch": 0.31021363769388355, "grad_norm": 5.7323899269104, "learning_rate": 4.224465905765291e-05, "loss": 0.014, "step": 1060 }, { "epoch": 0.31050629206906644, "grad_norm": 10.569522857666016, "learning_rate": 4.223734269827334e-05, "loss": 0.0419, "step": 1061 }, { "epoch": 0.3107989464442493, "grad_norm": 0.01675260253250599, "learning_rate": 4.223002633889377e-05, "loss": 0.0003, "step": 1062 }, { "epoch": 0.31109160081943227, "grad_norm": 0.010008047334849834, "learning_rate": 4.2222709979514195e-05, "loss": 0.0002, "step": 1063 }, { "epoch": 0.31138425519461516, "grad_norm": 0.27900147438049316, "learning_rate": 4.221539362013462e-05, "loss": 0.0026, "step": 1064 }, { "epoch": 0.31167690956979804, "grad_norm": 0.007643221411854029, "learning_rate": 4.220807726075505e-05, "loss": 0.0002, "step": 1065 }, { "epoch": 0.311969563944981, "grad_norm": 0.9489452242851257, "learning_rate": 4.220076090137548e-05, "loss": 0.0053, "step": 1066 }, { "epoch": 0.3122622183201639, "grad_norm": 4.815219402313232, "learning_rate": 4.21934445419959e-05, "loss": 0.1514, "step": 1067 }, { "epoch": 0.3125548726953468, "grad_norm": 0.020156513899564743, "learning_rate": 4.218612818261633e-05, "loss": 0.0003, "step": 1068 }, { "epoch": 0.3128475270705297, "grad_norm": 0.25733235478401184, "learning_rate": 4.2178811823236756e-05, "loss": 0.0007, "step": 1069 }, { "epoch": 0.3131401814457126, "grad_norm": 0.6491882801055908, "learning_rate": 4.2171495463857184e-05, "loss": 0.0016, "step": 1070 }, { "epoch": 0.31343283582089554, "grad_norm": 0.0020534771028906107, "learning_rate": 4.216417910447761e-05, "loss": 0.0001, "step": 1071 }, { "epoch": 0.3137254901960784, "grad_norm": 0.004543904215097427, "learning_rate": 4.215686274509804e-05, "loss": 0.0001, "step": 1072 }, { "epoch": 0.3140181445712613, "grad_norm": 0.006593961734324694, "learning_rate": 4.214954638571847e-05, "loss": 0.0002, "step": 1073 }, { "epoch": 0.31431079894644426, "grad_norm": 0.002780098468065262, "learning_rate": 4.2142230026338896e-05, "loss": 0.0001, "step": 1074 }, { "epoch": 0.31460345332162715, "grad_norm": 0.005017926450818777, "learning_rate": 4.2134913666959324e-05, "loss": 0.0001, "step": 1075 }, { "epoch": 0.3148961076968101, "grad_norm": 0.0033071120269596577, "learning_rate": 4.212759730757975e-05, "loss": 0.0001, "step": 1076 }, { "epoch": 0.315188762071993, "grad_norm": 0.007819382473826408, "learning_rate": 4.212028094820018e-05, "loss": 0.0001, "step": 1077 }, { "epoch": 0.31548141644717587, "grad_norm": 0.004965700674802065, "learning_rate": 4.21129645888206e-05, "loss": 0.0001, "step": 1078 }, { "epoch": 0.3157740708223588, "grad_norm": 0.07569081336259842, "learning_rate": 4.210564822944103e-05, "loss": 0.0005, "step": 1079 }, { "epoch": 0.3160667251975417, "grad_norm": 0.0030827720183879137, "learning_rate": 4.209833187006146e-05, "loss": 0.0001, "step": 1080 }, { "epoch": 0.3163593795727246, "grad_norm": 0.006564725656062365, "learning_rate": 4.2091015510681885e-05, "loss": 0.0002, "step": 1081 }, { "epoch": 0.31665203394790753, "grad_norm": 0.004230671562254429, "learning_rate": 4.208369915130231e-05, "loss": 0.0001, "step": 1082 }, { "epoch": 0.3169446883230904, "grad_norm": 0.12409224361181259, "learning_rate": 4.207638279192274e-05, "loss": 0.0007, "step": 1083 }, { "epoch": 0.31723734269827336, "grad_norm": 0.44127991795539856, "learning_rate": 4.206906643254317e-05, "loss": 0.0011, "step": 1084 }, { "epoch": 0.31752999707345625, "grad_norm": 0.0017722928896546364, "learning_rate": 4.20617500731636e-05, "loss": 0.0001, "step": 1085 }, { "epoch": 0.31782265144863914, "grad_norm": 0.0450950562953949, "learning_rate": 4.2054433713784025e-05, "loss": 0.0004, "step": 1086 }, { "epoch": 0.3181153058238221, "grad_norm": 0.00162849563639611, "learning_rate": 4.204711735440445e-05, "loss": 0.0, "step": 1087 }, { "epoch": 0.31840796019900497, "grad_norm": 0.003102143993601203, "learning_rate": 4.2039800995024874e-05, "loss": 0.0001, "step": 1088 }, { "epoch": 0.31870061457418786, "grad_norm": 11.183212280273438, "learning_rate": 4.20324846356453e-05, "loss": 0.0843, "step": 1089 }, { "epoch": 0.3189932689493708, "grad_norm": 6.125216960906982, "learning_rate": 4.202516827626573e-05, "loss": 0.2432, "step": 1090 }, { "epoch": 0.3192859233245537, "grad_norm": 15.360879898071289, "learning_rate": 4.201785191688616e-05, "loss": 0.0511, "step": 1091 }, { "epoch": 0.31957857769973663, "grad_norm": 0.09148026257753372, "learning_rate": 4.2010535557506585e-05, "loss": 0.0007, "step": 1092 }, { "epoch": 0.3198712320749195, "grad_norm": 0.059560466557741165, "learning_rate": 4.2003219198127013e-05, "loss": 0.0005, "step": 1093 }, { "epoch": 0.3201638864501024, "grad_norm": 0.15515701472759247, "learning_rate": 4.199590283874744e-05, "loss": 0.0008, "step": 1094 }, { "epoch": 0.32045654082528535, "grad_norm": 0.5345580577850342, "learning_rate": 4.198858647936787e-05, "loss": 0.0022, "step": 1095 }, { "epoch": 0.32074919520046824, "grad_norm": 9.372148513793945, "learning_rate": 4.19812701199883e-05, "loss": 0.1242, "step": 1096 }, { "epoch": 0.3210418495756512, "grad_norm": 0.04386669024825096, "learning_rate": 4.1973953760608725e-05, "loss": 0.0005, "step": 1097 }, { "epoch": 0.32133450395083407, "grad_norm": 0.02652975730597973, "learning_rate": 4.196663740122915e-05, "loss": 0.0002, "step": 1098 }, { "epoch": 0.32162715832601696, "grad_norm": 0.0688885748386383, "learning_rate": 4.1959321041849574e-05, "loss": 0.0004, "step": 1099 }, { "epoch": 0.3219198127011999, "grad_norm": 7.224321365356445, "learning_rate": 4.195200468247e-05, "loss": 0.0256, "step": 1100 }, { "epoch": 0.3222124670763828, "grad_norm": 0.005632548127323389, "learning_rate": 4.194468832309043e-05, "loss": 0.0001, "step": 1101 }, { "epoch": 0.3225051214515657, "grad_norm": 6.820813179016113, "learning_rate": 4.193737196371086e-05, "loss": 0.1765, "step": 1102 }, { "epoch": 0.3227977758267486, "grad_norm": 0.0025644320994615555, "learning_rate": 4.1930055604331286e-05, "loss": 0.0001, "step": 1103 }, { "epoch": 0.3230904302019315, "grad_norm": 0.015021145343780518, "learning_rate": 4.1922739244951714e-05, "loss": 0.0001, "step": 1104 }, { "epoch": 0.32338308457711445, "grad_norm": 4.52939510345459, "learning_rate": 4.191542288557214e-05, "loss": 0.0091, "step": 1105 }, { "epoch": 0.32367573895229734, "grad_norm": 4.507110118865967, "learning_rate": 4.190810652619257e-05, "loss": 0.0182, "step": 1106 }, { "epoch": 0.32396839332748023, "grad_norm": 0.005083255935460329, "learning_rate": 4.1900790166813e-05, "loss": 0.0001, "step": 1107 }, { "epoch": 0.3242610477026632, "grad_norm": 0.018605127930641174, "learning_rate": 4.1893473807433426e-05, "loss": 0.0003, "step": 1108 }, { "epoch": 0.32455370207784606, "grad_norm": 0.02068709395825863, "learning_rate": 4.188615744805385e-05, "loss": 0.0003, "step": 1109 }, { "epoch": 0.32484635645302895, "grad_norm": 0.020834336057305336, "learning_rate": 4.1878841088674275e-05, "loss": 0.0003, "step": 1110 }, { "epoch": 0.3251390108282119, "grad_norm": 0.25753891468048096, "learning_rate": 4.18715247292947e-05, "loss": 0.0014, "step": 1111 }, { "epoch": 0.3254316652033948, "grad_norm": 5.986315727233887, "learning_rate": 4.186420836991513e-05, "loss": 0.1455, "step": 1112 }, { "epoch": 0.3257243195785777, "grad_norm": 0.11490517854690552, "learning_rate": 4.185689201053556e-05, "loss": 0.001, "step": 1113 }, { "epoch": 0.3260169739537606, "grad_norm": 8.565180778503418, "learning_rate": 4.184957565115599e-05, "loss": 0.0544, "step": 1114 }, { "epoch": 0.3263096283289435, "grad_norm": 0.002345768269151449, "learning_rate": 4.1842259291776415e-05, "loss": 0.0, "step": 1115 }, { "epoch": 0.32660228270412645, "grad_norm": 8.543319702148438, "learning_rate": 4.183494293239684e-05, "loss": 0.0232, "step": 1116 }, { "epoch": 0.32689493707930933, "grad_norm": 4.645630359649658, "learning_rate": 4.182762657301727e-05, "loss": 0.0164, "step": 1117 }, { "epoch": 0.3271875914544922, "grad_norm": 0.0025222499389201403, "learning_rate": 4.18203102136377e-05, "loss": 0.0001, "step": 1118 }, { "epoch": 0.32748024582967517, "grad_norm": 0.007890268228948116, "learning_rate": 4.1812993854258127e-05, "loss": 0.0002, "step": 1119 }, { "epoch": 0.32777290020485805, "grad_norm": 0.03280539810657501, "learning_rate": 4.180567749487855e-05, "loss": 0.0005, "step": 1120 }, { "epoch": 0.328065554580041, "grad_norm": 1.9922877550125122, "learning_rate": 4.1798361135498976e-05, "loss": 0.0093, "step": 1121 }, { "epoch": 0.3283582089552239, "grad_norm": 5.538125514984131, "learning_rate": 4.1791044776119404e-05, "loss": 0.013, "step": 1122 }, { "epoch": 0.3286508633304068, "grad_norm": 3.149320602416992, "learning_rate": 4.178372841673983e-05, "loss": 0.0151, "step": 1123 }, { "epoch": 0.3289435177055897, "grad_norm": 0.19224414229393005, "learning_rate": 4.177641205736026e-05, "loss": 0.0013, "step": 1124 }, { "epoch": 0.3292361720807726, "grad_norm": 0.16442540287971497, "learning_rate": 4.176909569798069e-05, "loss": 0.0011, "step": 1125 }, { "epoch": 0.3295288264559555, "grad_norm": 0.014610587619245052, "learning_rate": 4.1761779338601115e-05, "loss": 0.0003, "step": 1126 }, { "epoch": 0.32982148083113844, "grad_norm": 5.707912445068359, "learning_rate": 4.1754462979221543e-05, "loss": 0.1486, "step": 1127 }, { "epoch": 0.3301141352063213, "grad_norm": 0.023597678169608116, "learning_rate": 4.174714661984197e-05, "loss": 0.0004, "step": 1128 }, { "epoch": 0.33040678958150427, "grad_norm": 0.05142216011881828, "learning_rate": 4.17398302604624e-05, "loss": 0.0008, "step": 1129 }, { "epoch": 0.33069944395668716, "grad_norm": 3.5134053230285645, "learning_rate": 4.173251390108283e-05, "loss": 0.0108, "step": 1130 }, { "epoch": 0.33099209833187004, "grad_norm": 0.02050555869936943, "learning_rate": 4.172519754170325e-05, "loss": 0.0003, "step": 1131 }, { "epoch": 0.331284752707053, "grad_norm": 0.07703938335180283, "learning_rate": 4.1717881182323676e-05, "loss": 0.0011, "step": 1132 }, { "epoch": 0.3315774070822359, "grad_norm": 2.8332552909851074, "learning_rate": 4.1710564822944104e-05, "loss": 0.006, "step": 1133 }, { "epoch": 0.33187006145741876, "grad_norm": 0.015761759132146835, "learning_rate": 4.170324846356453e-05, "loss": 0.0003, "step": 1134 }, { "epoch": 0.3321627158326017, "grad_norm": 3.626243829727173, "learning_rate": 4.169593210418496e-05, "loss": 0.018, "step": 1135 }, { "epoch": 0.3324553702077846, "grad_norm": 0.04060226306319237, "learning_rate": 4.168861574480539e-05, "loss": 0.0005, "step": 1136 }, { "epoch": 0.33274802458296754, "grad_norm": 0.09494542330503464, "learning_rate": 4.1681299385425816e-05, "loss": 0.0008, "step": 1137 }, { "epoch": 0.3330406789581504, "grad_norm": 0.5154232382774353, "learning_rate": 4.1673983026046244e-05, "loss": 0.0026, "step": 1138 }, { "epoch": 0.3333333333333333, "grad_norm": 0.011833082884550095, "learning_rate": 4.166666666666667e-05, "loss": 0.0002, "step": 1139 }, { "epoch": 0.33362598770851626, "grad_norm": 0.4022832214832306, "learning_rate": 4.16593503072871e-05, "loss": 0.0012, "step": 1140 }, { "epoch": 0.33391864208369915, "grad_norm": 2.553866386413574, "learning_rate": 4.165203394790752e-05, "loss": 0.2013, "step": 1141 }, { "epoch": 0.33421129645888203, "grad_norm": 0.004225094802677631, "learning_rate": 4.164471758852795e-05, "loss": 0.0001, "step": 1142 }, { "epoch": 0.334503950834065, "grad_norm": 0.03997182846069336, "learning_rate": 4.163740122914838e-05, "loss": 0.0004, "step": 1143 }, { "epoch": 0.33479660520924787, "grad_norm": 0.016258778050541878, "learning_rate": 4.1630084869768805e-05, "loss": 0.0002, "step": 1144 }, { "epoch": 0.3350892595844308, "grad_norm": 15.144865989685059, "learning_rate": 4.162276851038923e-05, "loss": 0.0768, "step": 1145 }, { "epoch": 0.3353819139596137, "grad_norm": 0.0572894886136055, "learning_rate": 4.161545215100966e-05, "loss": 0.0007, "step": 1146 }, { "epoch": 0.3356745683347966, "grad_norm": 0.36676284670829773, "learning_rate": 4.160813579163009e-05, "loss": 0.0037, "step": 1147 }, { "epoch": 0.33596722270997953, "grad_norm": 0.058127518743276596, "learning_rate": 4.160081943225052e-05, "loss": 0.0007, "step": 1148 }, { "epoch": 0.3362598770851624, "grad_norm": 5.779123783111572, "learning_rate": 4.1593503072870945e-05, "loss": 0.0924, "step": 1149 }, { "epoch": 0.3365525314603453, "grad_norm": 0.017173565924167633, "learning_rate": 4.158618671349137e-05, "loss": 0.0003, "step": 1150 }, { "epoch": 0.33684518583552825, "grad_norm": 0.0010117714991793036, "learning_rate": 4.15788703541118e-05, "loss": 0.0, "step": 1151 }, { "epoch": 0.33713784021071114, "grad_norm": 0.10073135793209076, "learning_rate": 4.157155399473222e-05, "loss": 0.0015, "step": 1152 }, { "epoch": 0.3374304945858941, "grad_norm": 0.706798791885376, "learning_rate": 4.156423763535265e-05, "loss": 0.0061, "step": 1153 }, { "epoch": 0.33772314896107697, "grad_norm": 7.5920257568359375, "learning_rate": 4.155692127597308e-05, "loss": 0.0663, "step": 1154 }, { "epoch": 0.33801580333625986, "grad_norm": 0.046216003596782684, "learning_rate": 4.1549604916593506e-05, "loss": 0.0004, "step": 1155 }, { "epoch": 0.3383084577114428, "grad_norm": 0.029077712446451187, "learning_rate": 4.1542288557213934e-05, "loss": 0.0006, "step": 1156 }, { "epoch": 0.3386011120866257, "grad_norm": 0.04728193208575249, "learning_rate": 4.153497219783436e-05, "loss": 0.0005, "step": 1157 }, { "epoch": 0.3388937664618086, "grad_norm": 0.018949152901768684, "learning_rate": 4.152765583845479e-05, "loss": 0.0003, "step": 1158 }, { "epoch": 0.3391864208369915, "grad_norm": 0.002709601540118456, "learning_rate": 4.152033947907522e-05, "loss": 0.0001, "step": 1159 }, { "epoch": 0.3394790752121744, "grad_norm": 0.409335196018219, "learning_rate": 4.1513023119695645e-05, "loss": 0.002, "step": 1160 }, { "epoch": 0.33977172958735735, "grad_norm": 0.00565820187330246, "learning_rate": 4.1505706760316073e-05, "loss": 0.0002, "step": 1161 }, { "epoch": 0.34006438396254024, "grad_norm": 0.003459826810285449, "learning_rate": 4.1498390400936495e-05, "loss": 0.0001, "step": 1162 }, { "epoch": 0.34035703833772313, "grad_norm": 0.011411946266889572, "learning_rate": 4.149107404155692e-05, "loss": 0.0002, "step": 1163 }, { "epoch": 0.34064969271290607, "grad_norm": 0.003876009490340948, "learning_rate": 4.148375768217735e-05, "loss": 0.0001, "step": 1164 }, { "epoch": 0.34094234708808896, "grad_norm": 1.1270101070404053, "learning_rate": 4.147644132279778e-05, "loss": 0.0037, "step": 1165 }, { "epoch": 0.3412350014632719, "grad_norm": 0.0018890424398705363, "learning_rate": 4.1469124963418206e-05, "loss": 0.0, "step": 1166 }, { "epoch": 0.3415276558384548, "grad_norm": 0.5412814617156982, "learning_rate": 4.1461808604038634e-05, "loss": 0.0013, "step": 1167 }, { "epoch": 0.3418203102136377, "grad_norm": 0.23688070476055145, "learning_rate": 4.145449224465906e-05, "loss": 0.001, "step": 1168 }, { "epoch": 0.3421129645888206, "grad_norm": 0.005186624825000763, "learning_rate": 4.144717588527949e-05, "loss": 0.0001, "step": 1169 }, { "epoch": 0.3424056189640035, "grad_norm": 0.0717591941356659, "learning_rate": 4.143985952589992e-05, "loss": 0.0003, "step": 1170 }, { "epoch": 0.3426982733391864, "grad_norm": 0.004381998907774687, "learning_rate": 4.1432543166520346e-05, "loss": 0.0001, "step": 1171 }, { "epoch": 0.34299092771436934, "grad_norm": 0.004648114088922739, "learning_rate": 4.1425226807140774e-05, "loss": 0.0001, "step": 1172 }, { "epoch": 0.34328358208955223, "grad_norm": 0.002697630086913705, "learning_rate": 4.1417910447761195e-05, "loss": 0.0001, "step": 1173 }, { "epoch": 0.3435762364647352, "grad_norm": 0.003834686242043972, "learning_rate": 4.141059408838162e-05, "loss": 0.0001, "step": 1174 }, { "epoch": 0.34386889083991806, "grad_norm": 0.18755541741847992, "learning_rate": 4.140327772900205e-05, "loss": 0.0007, "step": 1175 }, { "epoch": 0.34416154521510095, "grad_norm": 0.0005057503585703671, "learning_rate": 4.139596136962248e-05, "loss": 0.0, "step": 1176 }, { "epoch": 0.3444541995902839, "grad_norm": 0.003325084690004587, "learning_rate": 4.138864501024291e-05, "loss": 0.0001, "step": 1177 }, { "epoch": 0.3447468539654668, "grad_norm": 0.0005210234085097909, "learning_rate": 4.1381328650863335e-05, "loss": 0.0, "step": 1178 }, { "epoch": 0.34503950834064967, "grad_norm": 0.004745765123516321, "learning_rate": 4.137401229148376e-05, "loss": 0.0001, "step": 1179 }, { "epoch": 0.3453321627158326, "grad_norm": 12.660859107971191, "learning_rate": 4.136669593210419e-05, "loss": 0.0821, "step": 1180 }, { "epoch": 0.3456248170910155, "grad_norm": 0.024795109406113625, "learning_rate": 4.135937957272462e-05, "loss": 0.0003, "step": 1181 }, { "epoch": 0.34591747146619845, "grad_norm": 0.0026485335547477007, "learning_rate": 4.135206321334505e-05, "loss": 0.0001, "step": 1182 }, { "epoch": 0.34621012584138133, "grad_norm": 0.0020074001513421535, "learning_rate": 4.1344746853965475e-05, "loss": 0.0, "step": 1183 }, { "epoch": 0.3465027802165642, "grad_norm": 0.0006435702671296895, "learning_rate": 4.1337430494585896e-05, "loss": 0.0, "step": 1184 }, { "epoch": 0.34679543459174716, "grad_norm": 0.0022121912334114313, "learning_rate": 4.1330114135206324e-05, "loss": 0.0001, "step": 1185 }, { "epoch": 0.34708808896693005, "grad_norm": 0.0025052560959011316, "learning_rate": 4.132279777582675e-05, "loss": 0.0001, "step": 1186 }, { "epoch": 0.34738074334211294, "grad_norm": 1.232609748840332, "learning_rate": 4.131548141644718e-05, "loss": 0.0041, "step": 1187 }, { "epoch": 0.3476733977172959, "grad_norm": 0.7070603966712952, "learning_rate": 4.130816505706761e-05, "loss": 0.0011, "step": 1188 }, { "epoch": 0.3479660520924788, "grad_norm": 0.16078346967697144, "learning_rate": 4.1300848697688036e-05, "loss": 0.0007, "step": 1189 }, { "epoch": 0.3482587064676617, "grad_norm": 0.0014520692639052868, "learning_rate": 4.1293532338308464e-05, "loss": 0.0, "step": 1190 }, { "epoch": 0.3485513608428446, "grad_norm": 0.0011567205656319857, "learning_rate": 4.128621597892889e-05, "loss": 0.0, "step": 1191 }, { "epoch": 0.3488440152180275, "grad_norm": 0.0019392389804124832, "learning_rate": 4.127889961954931e-05, "loss": 0.0001, "step": 1192 }, { "epoch": 0.34913666959321044, "grad_norm": 0.0016712337965145707, "learning_rate": 4.127158326016974e-05, "loss": 0.0, "step": 1193 }, { "epoch": 0.3494293239683933, "grad_norm": 0.21385757625102997, "learning_rate": 4.126426690079017e-05, "loss": 0.0005, "step": 1194 }, { "epoch": 0.3497219783435762, "grad_norm": 0.0075780716724693775, "learning_rate": 4.1256950541410597e-05, "loss": 0.0001, "step": 1195 }, { "epoch": 0.35001463271875916, "grad_norm": 0.001782455830834806, "learning_rate": 4.1249634182031025e-05, "loss": 0.0, "step": 1196 }, { "epoch": 0.35030728709394204, "grad_norm": 0.016709130257368088, "learning_rate": 4.124231782265145e-05, "loss": 0.0001, "step": 1197 }, { "epoch": 0.350599941469125, "grad_norm": 0.39221397042274475, "learning_rate": 4.123500146327188e-05, "loss": 0.001, "step": 1198 }, { "epoch": 0.3508925958443079, "grad_norm": 0.001340279122814536, "learning_rate": 4.122768510389231e-05, "loss": 0.0, "step": 1199 }, { "epoch": 0.35118525021949076, "grad_norm": 0.01328748557716608, "learning_rate": 4.122036874451273e-05, "loss": 0.0001, "step": 1200 }, { "epoch": 0.3514779045946737, "grad_norm": 0.001047693658620119, "learning_rate": 4.121305238513316e-05, "loss": 0.0, "step": 1201 }, { "epoch": 0.3517705589698566, "grad_norm": 0.0032731464598327875, "learning_rate": 4.1205736025753585e-05, "loss": 0.0001, "step": 1202 }, { "epoch": 0.3520632133450395, "grad_norm": 0.8166394829750061, "learning_rate": 4.1198419666374013e-05, "loss": 0.0019, "step": 1203 }, { "epoch": 0.3523558677202224, "grad_norm": 0.014093336649239063, "learning_rate": 4.119110330699444e-05, "loss": 0.0001, "step": 1204 }, { "epoch": 0.3526485220954053, "grad_norm": 21.849721908569336, "learning_rate": 4.118378694761487e-05, "loss": 0.0372, "step": 1205 }, { "epoch": 0.35294117647058826, "grad_norm": 0.0012545472709462047, "learning_rate": 4.11764705882353e-05, "loss": 0.0, "step": 1206 }, { "epoch": 0.35323383084577115, "grad_norm": 0.0010050591081380844, "learning_rate": 4.1169154228855725e-05, "loss": 0.0, "step": 1207 }, { "epoch": 0.35352648522095403, "grad_norm": 0.0011364357778802514, "learning_rate": 4.1161837869476146e-05, "loss": 0.0, "step": 1208 }, { "epoch": 0.353819139596137, "grad_norm": 0.2661976218223572, "learning_rate": 4.1154521510096574e-05, "loss": 0.0005, "step": 1209 }, { "epoch": 0.35411179397131987, "grad_norm": 0.0009252326563000679, "learning_rate": 4.1147205150717e-05, "loss": 0.0, "step": 1210 }, { "epoch": 0.35440444834650275, "grad_norm": 0.0011284584179520607, "learning_rate": 4.113988879133743e-05, "loss": 0.0, "step": 1211 }, { "epoch": 0.3546971027216857, "grad_norm": 0.010301393456757069, "learning_rate": 4.113257243195786e-05, "loss": 0.0, "step": 1212 }, { "epoch": 0.3549897570968686, "grad_norm": 1.3835265636444092, "learning_rate": 4.1125256072578286e-05, "loss": 0.0043, "step": 1213 }, { "epoch": 0.35528241147205153, "grad_norm": 0.0026471379678696394, "learning_rate": 4.1117939713198714e-05, "loss": 0.0001, "step": 1214 }, { "epoch": 0.3555750658472344, "grad_norm": 0.0007697929395362735, "learning_rate": 4.111062335381914e-05, "loss": 0.0, "step": 1215 }, { "epoch": 0.3558677202224173, "grad_norm": 0.00047303663450293243, "learning_rate": 4.110330699443956e-05, "loss": 0.0, "step": 1216 }, { "epoch": 0.35616037459760025, "grad_norm": 0.0036289000418037176, "learning_rate": 4.109599063505999e-05, "loss": 0.0, "step": 1217 }, { "epoch": 0.35645302897278314, "grad_norm": 0.0009147240780293941, "learning_rate": 4.108867427568042e-05, "loss": 0.0, "step": 1218 }, { "epoch": 0.356745683347966, "grad_norm": 0.0012819399125874043, "learning_rate": 4.108135791630085e-05, "loss": 0.0, "step": 1219 }, { "epoch": 0.35703833772314897, "grad_norm": 0.0010691086063161492, "learning_rate": 4.1074041556921275e-05, "loss": 0.0, "step": 1220 }, { "epoch": 0.35733099209833186, "grad_norm": 0.0008675382705405354, "learning_rate": 4.10667251975417e-05, "loss": 0.0, "step": 1221 }, { "epoch": 0.3576236464735148, "grad_norm": 10.915205955505371, "learning_rate": 4.105940883816213e-05, "loss": 0.4804, "step": 1222 }, { "epoch": 0.3579163008486977, "grad_norm": 0.0017219093861058354, "learning_rate": 4.105209247878256e-05, "loss": 0.0, "step": 1223 }, { "epoch": 0.3582089552238806, "grad_norm": 0.0019681653939187527, "learning_rate": 4.104477611940299e-05, "loss": 0.0, "step": 1224 }, { "epoch": 0.3585016095990635, "grad_norm": 0.0012229308485984802, "learning_rate": 4.1037459760023415e-05, "loss": 0.0, "step": 1225 }, { "epoch": 0.3587942639742464, "grad_norm": 0.0015450023347511888, "learning_rate": 4.1030143400643836e-05, "loss": 0.0, "step": 1226 }, { "epoch": 0.35908691834942935, "grad_norm": 0.008542041294276714, "learning_rate": 4.1022827041264264e-05, "loss": 0.0001, "step": 1227 }, { "epoch": 0.35937957272461224, "grad_norm": 0.007675641216337681, "learning_rate": 4.101551068188469e-05, "loss": 0.0001, "step": 1228 }, { "epoch": 0.3596722270997951, "grad_norm": 4.740835189819336, "learning_rate": 4.100819432250512e-05, "loss": 0.0509, "step": 1229 }, { "epoch": 0.35996488147497807, "grad_norm": 12.755969047546387, "learning_rate": 4.100087796312555e-05, "loss": 0.1278, "step": 1230 }, { "epoch": 0.36025753585016096, "grad_norm": 0.02200162783265114, "learning_rate": 4.0993561603745976e-05, "loss": 0.0001, "step": 1231 }, { "epoch": 0.36055019022534385, "grad_norm": 0.008278402499854565, "learning_rate": 4.0986245244366404e-05, "loss": 0.0001, "step": 1232 }, { "epoch": 0.3608428446005268, "grad_norm": 0.07329938560724258, "learning_rate": 4.097892888498683e-05, "loss": 0.0005, "step": 1233 }, { "epoch": 0.3611354989757097, "grad_norm": 0.0015238827327266335, "learning_rate": 4.097161252560726e-05, "loss": 0.0, "step": 1234 }, { "epoch": 0.3614281533508926, "grad_norm": 7.457364559173584, "learning_rate": 4.096429616622769e-05, "loss": 0.0106, "step": 1235 }, { "epoch": 0.3617208077260755, "grad_norm": 0.0005767598049715161, "learning_rate": 4.0956979806848115e-05, "loss": 0.0, "step": 1236 }, { "epoch": 0.3620134621012584, "grad_norm": 0.004051645752042532, "learning_rate": 4.094966344746854e-05, "loss": 0.0001, "step": 1237 }, { "epoch": 0.36230611647644134, "grad_norm": 0.0036045070737600327, "learning_rate": 4.0942347088088965e-05, "loss": 0.0001, "step": 1238 }, { "epoch": 0.36259877085162423, "grad_norm": 0.05607904866337776, "learning_rate": 4.093503072870939e-05, "loss": 0.0004, "step": 1239 }, { "epoch": 0.3628914252268071, "grad_norm": 0.010820225812494755, "learning_rate": 4.092771436932982e-05, "loss": 0.0002, "step": 1240 }, { "epoch": 0.36318407960199006, "grad_norm": 0.0014110167976468801, "learning_rate": 4.092039800995025e-05, "loss": 0.0, "step": 1241 }, { "epoch": 0.36347673397717295, "grad_norm": 0.06000782176852226, "learning_rate": 4.0913081650570676e-05, "loss": 0.0005, "step": 1242 }, { "epoch": 0.3637693883523559, "grad_norm": 0.1307305246591568, "learning_rate": 4.0905765291191104e-05, "loss": 0.0007, "step": 1243 }, { "epoch": 0.3640620427275388, "grad_norm": 0.0020625912584364414, "learning_rate": 4.089844893181153e-05, "loss": 0.0, "step": 1244 }, { "epoch": 0.36435469710272167, "grad_norm": 0.002705375896766782, "learning_rate": 4.089113257243196e-05, "loss": 0.0001, "step": 1245 }, { "epoch": 0.3646473514779046, "grad_norm": 0.43489834666252136, "learning_rate": 4.088381621305239e-05, "loss": 0.0007, "step": 1246 }, { "epoch": 0.3649400058530875, "grad_norm": 0.0008786004618741572, "learning_rate": 4.087649985367281e-05, "loss": 0.0, "step": 1247 }, { "epoch": 0.3652326602282704, "grad_norm": 0.9479494690895081, "learning_rate": 4.086918349429324e-05, "loss": 0.0029, "step": 1248 }, { "epoch": 0.36552531460345333, "grad_norm": 0.08482043445110321, "learning_rate": 4.0861867134913665e-05, "loss": 0.0002, "step": 1249 }, { "epoch": 0.3658179689786362, "grad_norm": 0.33638685941696167, "learning_rate": 4.085455077553409e-05, "loss": 0.0013, "step": 1250 }, { "epoch": 0.36611062335381916, "grad_norm": 0.004500244278460741, "learning_rate": 4.084723441615452e-05, "loss": 0.0001, "step": 1251 }, { "epoch": 0.36640327772900205, "grad_norm": 0.03262517228722572, "learning_rate": 4.083991805677495e-05, "loss": 0.0002, "step": 1252 }, { "epoch": 0.36669593210418494, "grad_norm": 0.0035471178125590086, "learning_rate": 4.083260169739538e-05, "loss": 0.0001, "step": 1253 }, { "epoch": 0.3669885864793679, "grad_norm": 0.010737705044448376, "learning_rate": 4.0825285338015805e-05, "loss": 0.0001, "step": 1254 }, { "epoch": 0.3672812408545508, "grad_norm": 0.0018867601174861193, "learning_rate": 4.081796897863623e-05, "loss": 0.0, "step": 1255 }, { "epoch": 0.36757389522973366, "grad_norm": 0.016890184953808784, "learning_rate": 4.081065261925666e-05, "loss": 0.0001, "step": 1256 }, { "epoch": 0.3678665496049166, "grad_norm": 9.655845642089844, "learning_rate": 4.080333625987709e-05, "loss": 0.0169, "step": 1257 }, { "epoch": 0.3681592039800995, "grad_norm": 14.044231414794922, "learning_rate": 4.079601990049751e-05, "loss": 0.1856, "step": 1258 }, { "epoch": 0.36845185835528244, "grad_norm": 13.366278648376465, "learning_rate": 4.078870354111794e-05, "loss": 0.0625, "step": 1259 }, { "epoch": 0.3687445127304653, "grad_norm": 0.0026375851593911648, "learning_rate": 4.0781387181738366e-05, "loss": 0.0001, "step": 1260 }, { "epoch": 0.3690371671056482, "grad_norm": 0.0007985808770172298, "learning_rate": 4.0774070822358794e-05, "loss": 0.0, "step": 1261 }, { "epoch": 0.36932982148083116, "grad_norm": 0.0016761808656156063, "learning_rate": 4.076675446297922e-05, "loss": 0.0, "step": 1262 }, { "epoch": 0.36962247585601404, "grad_norm": 0.001301329699344933, "learning_rate": 4.075943810359965e-05, "loss": 0.0, "step": 1263 }, { "epoch": 0.36991513023119693, "grad_norm": 0.004801755305379629, "learning_rate": 4.075212174422008e-05, "loss": 0.0001, "step": 1264 }, { "epoch": 0.3702077846063799, "grad_norm": 0.0013198426458984613, "learning_rate": 4.0744805384840506e-05, "loss": 0.0, "step": 1265 }, { "epoch": 0.37050043898156276, "grad_norm": 6.025171756744385, "learning_rate": 4.0737489025460934e-05, "loss": 0.2881, "step": 1266 }, { "epoch": 0.3707930933567457, "grad_norm": 0.0029023438692092896, "learning_rate": 4.073017266608136e-05, "loss": 0.0, "step": 1267 }, { "epoch": 0.3710857477319286, "grad_norm": 0.004703729413449764, "learning_rate": 4.072285630670179e-05, "loss": 0.0001, "step": 1268 }, { "epoch": 0.3713784021071115, "grad_norm": 0.001049618935212493, "learning_rate": 4.071553994732221e-05, "loss": 0.0, "step": 1269 }, { "epoch": 0.3716710564822944, "grad_norm": 0.0010271676583215594, "learning_rate": 4.070822358794264e-05, "loss": 0.0, "step": 1270 }, { "epoch": 0.3719637108574773, "grad_norm": 0.08657903224229813, "learning_rate": 4.070090722856307e-05, "loss": 0.0003, "step": 1271 }, { "epoch": 0.3722563652326602, "grad_norm": 0.002311570104211569, "learning_rate": 4.0693590869183495e-05, "loss": 0.0001, "step": 1272 }, { "epoch": 0.37254901960784315, "grad_norm": 0.0010429377434775233, "learning_rate": 4.068627450980392e-05, "loss": 0.0, "step": 1273 }, { "epoch": 0.37284167398302603, "grad_norm": 0.0025792322121560574, "learning_rate": 4.067895815042435e-05, "loss": 0.0001, "step": 1274 }, { "epoch": 0.373134328358209, "grad_norm": 0.0026215538382530212, "learning_rate": 4.067164179104478e-05, "loss": 0.0001, "step": 1275 }, { "epoch": 0.37342698273339187, "grad_norm": 3.9294819831848145, "learning_rate": 4.0664325431665206e-05, "loss": 0.0067, "step": 1276 }, { "epoch": 0.37371963710857475, "grad_norm": 0.010046660900115967, "learning_rate": 4.0657009072285634e-05, "loss": 0.0002, "step": 1277 }, { "epoch": 0.3740122914837577, "grad_norm": 0.002739732852205634, "learning_rate": 4.064969271290606e-05, "loss": 0.0001, "step": 1278 }, { "epoch": 0.3743049458589406, "grad_norm": 0.0019405756611377, "learning_rate": 4.0642376353526483e-05, "loss": 0.0, "step": 1279 }, { "epoch": 0.3745976002341235, "grad_norm": 0.0013102261582389474, "learning_rate": 4.063505999414691e-05, "loss": 0.0, "step": 1280 }, { "epoch": 0.3748902546093064, "grad_norm": 0.0016861463664099574, "learning_rate": 4.062774363476734e-05, "loss": 0.0, "step": 1281 }, { "epoch": 0.3751829089844893, "grad_norm": 0.018297750502824783, "learning_rate": 4.062042727538777e-05, "loss": 0.0002, "step": 1282 }, { "epoch": 0.37547556335967225, "grad_norm": 0.007014001719653606, "learning_rate": 4.0613110916008195e-05, "loss": 0.0001, "step": 1283 }, { "epoch": 0.37576821773485514, "grad_norm": 0.005624879617244005, "learning_rate": 4.060579455662862e-05, "loss": 0.0001, "step": 1284 }, { "epoch": 0.376060872110038, "grad_norm": 0.003993480000644922, "learning_rate": 4.059847819724905e-05, "loss": 0.0001, "step": 1285 }, { "epoch": 0.37635352648522097, "grad_norm": 0.03830829635262489, "learning_rate": 4.059116183786948e-05, "loss": 0.0002, "step": 1286 }, { "epoch": 0.37664618086040386, "grad_norm": 0.008096047677099705, "learning_rate": 4.058384547848991e-05, "loss": 0.0001, "step": 1287 }, { "epoch": 0.37693883523558674, "grad_norm": 0.0047937557101249695, "learning_rate": 4.0576529119110335e-05, "loss": 0.0001, "step": 1288 }, { "epoch": 0.3772314896107697, "grad_norm": 0.0018073159735649824, "learning_rate": 4.056921275973076e-05, "loss": 0.0001, "step": 1289 }, { "epoch": 0.3775241439859526, "grad_norm": 0.004695659503340721, "learning_rate": 4.0561896400351184e-05, "loss": 0.0001, "step": 1290 }, { "epoch": 0.3778167983611355, "grad_norm": 0.002083779312670231, "learning_rate": 4.055458004097161e-05, "loss": 0.0, "step": 1291 }, { "epoch": 0.3781094527363184, "grad_norm": 0.011576264165341854, "learning_rate": 4.054726368159204e-05, "loss": 0.0001, "step": 1292 }, { "epoch": 0.3784021071115013, "grad_norm": 0.002925613196566701, "learning_rate": 4.053994732221247e-05, "loss": 0.0001, "step": 1293 }, { "epoch": 0.37869476148668424, "grad_norm": 0.0024837800301611423, "learning_rate": 4.0532630962832896e-05, "loss": 0.0001, "step": 1294 }, { "epoch": 0.3789874158618671, "grad_norm": 0.0065367016941308975, "learning_rate": 4.0525314603453324e-05, "loss": 0.0001, "step": 1295 }, { "epoch": 0.37928007023705007, "grad_norm": 0.000926087552215904, "learning_rate": 4.051799824407375e-05, "loss": 0.0, "step": 1296 }, { "epoch": 0.37957272461223296, "grad_norm": 0.0016152521129697561, "learning_rate": 4.051068188469418e-05, "loss": 0.0, "step": 1297 }, { "epoch": 0.37986537898741585, "grad_norm": 0.015473726205527782, "learning_rate": 4.050336552531461e-05, "loss": 0.0002, "step": 1298 }, { "epoch": 0.3801580333625988, "grad_norm": 0.022625859826803207, "learning_rate": 4.0496049165935036e-05, "loss": 0.0002, "step": 1299 }, { "epoch": 0.3804506877377817, "grad_norm": 0.0019020326435565948, "learning_rate": 4.048873280655546e-05, "loss": 0.0, "step": 1300 }, { "epoch": 0.38074334211296457, "grad_norm": 0.0017117613460868597, "learning_rate": 4.0481416447175885e-05, "loss": 0.0, "step": 1301 }, { "epoch": 0.3810359964881475, "grad_norm": 0.002542115282267332, "learning_rate": 4.047410008779631e-05, "loss": 0.0001, "step": 1302 }, { "epoch": 0.3813286508633304, "grad_norm": 19.405616760253906, "learning_rate": 4.046678372841674e-05, "loss": 0.0528, "step": 1303 }, { "epoch": 0.38162130523851334, "grad_norm": 0.4098086655139923, "learning_rate": 4.045946736903717e-05, "loss": 0.001, "step": 1304 }, { "epoch": 0.38191395961369623, "grad_norm": 0.009942581877112389, "learning_rate": 4.0452151009657597e-05, "loss": 0.0001, "step": 1305 }, { "epoch": 0.3822066139888791, "grad_norm": 3.136842727661133, "learning_rate": 4.0444834650278025e-05, "loss": 0.2425, "step": 1306 }, { "epoch": 0.38249926836406206, "grad_norm": 0.002530040917918086, "learning_rate": 4.043751829089845e-05, "loss": 0.0001, "step": 1307 }, { "epoch": 0.38279192273924495, "grad_norm": 1.0293583869934082, "learning_rate": 4.043020193151888e-05, "loss": 0.003, "step": 1308 }, { "epoch": 0.38308457711442784, "grad_norm": 0.005790709052234888, "learning_rate": 4.042288557213931e-05, "loss": 0.0001, "step": 1309 }, { "epoch": 0.3833772314896108, "grad_norm": 0.008284694515168667, "learning_rate": 4.0415569212759736e-05, "loss": 0.0001, "step": 1310 }, { "epoch": 0.38366988586479367, "grad_norm": 0.0034184777177870274, "learning_rate": 4.040825285338016e-05, "loss": 0.0001, "step": 1311 }, { "epoch": 0.3839625402399766, "grad_norm": 1.2440024614334106, "learning_rate": 4.0400936494000585e-05, "loss": 0.0069, "step": 1312 }, { "epoch": 0.3842551946151595, "grad_norm": 0.01935744099318981, "learning_rate": 4.0393620134621013e-05, "loss": 0.0003, "step": 1313 }, { "epoch": 0.3845478489903424, "grad_norm": 0.007463513873517513, "learning_rate": 4.038630377524144e-05, "loss": 0.0002, "step": 1314 }, { "epoch": 0.38484050336552533, "grad_norm": 0.05692553520202637, "learning_rate": 4.037898741586187e-05, "loss": 0.0008, "step": 1315 }, { "epoch": 0.3851331577407082, "grad_norm": 4.072761058807373, "learning_rate": 4.03716710564823e-05, "loss": 0.2337, "step": 1316 }, { "epoch": 0.3854258121158911, "grad_norm": 7.714447975158691, "learning_rate": 4.0364354697102725e-05, "loss": 0.1148, "step": 1317 }, { "epoch": 0.38571846649107405, "grad_norm": 0.3765358328819275, "learning_rate": 4.035703833772315e-05, "loss": 0.0031, "step": 1318 }, { "epoch": 0.38601112086625694, "grad_norm": 13.320783615112305, "learning_rate": 4.034972197834358e-05, "loss": 0.1663, "step": 1319 }, { "epoch": 0.3863037752414399, "grad_norm": 0.00583779439330101, "learning_rate": 4.034240561896401e-05, "loss": 0.0002, "step": 1320 }, { "epoch": 0.38659642961662277, "grad_norm": 0.005912384018301964, "learning_rate": 4.033508925958444e-05, "loss": 0.0002, "step": 1321 }, { "epoch": 0.38688908399180566, "grad_norm": 5.1954522132873535, "learning_rate": 4.032777290020486e-05, "loss": 0.1351, "step": 1322 }, { "epoch": 0.3871817383669886, "grad_norm": 10.917094230651855, "learning_rate": 4.0320456540825286e-05, "loss": 0.237, "step": 1323 }, { "epoch": 0.3874743927421715, "grad_norm": 0.014544911682605743, "learning_rate": 4.0313140181445714e-05, "loss": 0.0004, "step": 1324 }, { "epoch": 0.3877670471173544, "grad_norm": 3.4436397552490234, "learning_rate": 4.030582382206614e-05, "loss": 0.0618, "step": 1325 }, { "epoch": 0.3880597014925373, "grad_norm": 2.7318570613861084, "learning_rate": 4.029850746268657e-05, "loss": 0.1711, "step": 1326 }, { "epoch": 0.3883523558677202, "grad_norm": 1.5790855884552002, "learning_rate": 4.0291191103307e-05, "loss": 0.0179, "step": 1327 }, { "epoch": 0.38864501024290316, "grad_norm": 3.171313762664795, "learning_rate": 4.0283874743927426e-05, "loss": 0.1463, "step": 1328 }, { "epoch": 0.38893766461808604, "grad_norm": 0.06968532502651215, "learning_rate": 4.0276558384547854e-05, "loss": 0.0014, "step": 1329 }, { "epoch": 0.38923031899326893, "grad_norm": 5.104183197021484, "learning_rate": 4.026924202516828e-05, "loss": 0.0801, "step": 1330 }, { "epoch": 0.3895229733684519, "grad_norm": 1.3833603858947754, "learning_rate": 4.026192566578871e-05, "loss": 0.0159, "step": 1331 }, { "epoch": 0.38981562774363476, "grad_norm": 4.297267913818359, "learning_rate": 4.025460930640913e-05, "loss": 0.0527, "step": 1332 }, { "epoch": 0.39010828211881765, "grad_norm": 0.034918636083602905, "learning_rate": 4.024729294702956e-05, "loss": 0.0008, "step": 1333 }, { "epoch": 0.3904009364940006, "grad_norm": 0.22305458784103394, "learning_rate": 4.023997658764999e-05, "loss": 0.0036, "step": 1334 }, { "epoch": 0.3906935908691835, "grad_norm": 0.03878597170114517, "learning_rate": 4.0232660228270415e-05, "loss": 0.0008, "step": 1335 }, { "epoch": 0.3909862452443664, "grad_norm": 0.02626451477408409, "learning_rate": 4.022534386889084e-05, "loss": 0.0007, "step": 1336 }, { "epoch": 0.3912788996195493, "grad_norm": 0.02288234233856201, "learning_rate": 4.021802750951127e-05, "loss": 0.0004, "step": 1337 }, { "epoch": 0.3915715539947322, "grad_norm": 0.026334531605243683, "learning_rate": 4.02107111501317e-05, "loss": 0.0006, "step": 1338 }, { "epoch": 0.39186420836991515, "grad_norm": 0.027551332488656044, "learning_rate": 4.0203394790752127e-05, "loss": 0.0004, "step": 1339 }, { "epoch": 0.39215686274509803, "grad_norm": 2.991457939147949, "learning_rate": 4.0196078431372555e-05, "loss": 0.0121, "step": 1340 }, { "epoch": 0.3924495171202809, "grad_norm": 0.13049960136413574, "learning_rate": 4.018876207199298e-05, "loss": 0.0013, "step": 1341 }, { "epoch": 0.39274217149546387, "grad_norm": 0.016805050894618034, "learning_rate": 4.018144571261341e-05, "loss": 0.0004, "step": 1342 }, { "epoch": 0.39303482587064675, "grad_norm": 0.01396495383232832, "learning_rate": 4.017412935323383e-05, "loss": 0.0003, "step": 1343 }, { "epoch": 0.3933274802458297, "grad_norm": 0.23971877992153168, "learning_rate": 4.016681299385426e-05, "loss": 0.001, "step": 1344 }, { "epoch": 0.3936201346210126, "grad_norm": 0.04552144929766655, "learning_rate": 4.015949663447469e-05, "loss": 0.0005, "step": 1345 }, { "epoch": 0.3939127889961955, "grad_norm": 0.006005613133311272, "learning_rate": 4.0152180275095115e-05, "loss": 0.0001, "step": 1346 }, { "epoch": 0.3942054433713784, "grad_norm": 0.03240946680307388, "learning_rate": 4.0144863915715543e-05, "loss": 0.0003, "step": 1347 }, { "epoch": 0.3944980977465613, "grad_norm": 0.02361867018043995, "learning_rate": 4.013754755633597e-05, "loss": 0.0003, "step": 1348 }, { "epoch": 0.3947907521217442, "grad_norm": 1.2638202905654907, "learning_rate": 4.01302311969564e-05, "loss": 0.0057, "step": 1349 }, { "epoch": 0.39508340649692714, "grad_norm": 0.002966237021610141, "learning_rate": 4.012291483757683e-05, "loss": 0.0001, "step": 1350 }, { "epoch": 0.39537606087211, "grad_norm": 3.348245143890381, "learning_rate": 4.0115598478197255e-05, "loss": 0.2752, "step": 1351 }, { "epoch": 0.39566871524729297, "grad_norm": 0.0027857578825205564, "learning_rate": 4.010828211881768e-05, "loss": 0.0001, "step": 1352 }, { "epoch": 0.39596136962247586, "grad_norm": 0.013372791931033134, "learning_rate": 4.010096575943811e-05, "loss": 0.0002, "step": 1353 }, { "epoch": 0.39625402399765874, "grad_norm": 0.07573962211608887, "learning_rate": 4.009364940005853e-05, "loss": 0.0007, "step": 1354 }, { "epoch": 0.3965466783728417, "grad_norm": 0.09464661031961441, "learning_rate": 4.008633304067896e-05, "loss": 0.0011, "step": 1355 }, { "epoch": 0.3968393327480246, "grad_norm": 9.230062484741211, "learning_rate": 4.007901668129939e-05, "loss": 0.1205, "step": 1356 }, { "epoch": 0.3971319871232075, "grad_norm": 6.861654281616211, "learning_rate": 4.0071700321919816e-05, "loss": 0.0189, "step": 1357 }, { "epoch": 0.3974246414983904, "grad_norm": 2.2964892387390137, "learning_rate": 4.0064383962540244e-05, "loss": 0.0201, "step": 1358 }, { "epoch": 0.3977172958735733, "grad_norm": 0.1187041699886322, "learning_rate": 4.005706760316067e-05, "loss": 0.0009, "step": 1359 }, { "epoch": 0.39800995024875624, "grad_norm": 0.45186883211135864, "learning_rate": 4.00497512437811e-05, "loss": 0.0022, "step": 1360 }, { "epoch": 0.3983026046239391, "grad_norm": 0.014179177582263947, "learning_rate": 4.004243488440153e-05, "loss": 0.0004, "step": 1361 }, { "epoch": 0.398595258999122, "grad_norm": 0.0133640356361866, "learning_rate": 4.0035118525021956e-05, "loss": 0.0003, "step": 1362 }, { "epoch": 0.39888791337430496, "grad_norm": 0.19518370926380157, "learning_rate": 4.0027802165642384e-05, "loss": 0.001, "step": 1363 }, { "epoch": 0.39918056774948785, "grad_norm": 5.994090557098389, "learning_rate": 4.0020485806262805e-05, "loss": 0.0362, "step": 1364 }, { "epoch": 0.3994732221246708, "grad_norm": 12.498130798339844, "learning_rate": 4.001316944688323e-05, "loss": 0.096, "step": 1365 }, { "epoch": 0.3997658764998537, "grad_norm": 0.07240074127912521, "learning_rate": 4.000585308750366e-05, "loss": 0.0008, "step": 1366 }, { "epoch": 0.40005853087503657, "grad_norm": 0.06022394821047783, "learning_rate": 3.999853672812409e-05, "loss": 0.001, "step": 1367 }, { "epoch": 0.4003511852502195, "grad_norm": 0.023105381056666374, "learning_rate": 3.999122036874452e-05, "loss": 0.0005, "step": 1368 }, { "epoch": 0.4006438396254024, "grad_norm": 0.024272827431559563, "learning_rate": 3.9983904009364945e-05, "loss": 0.0005, "step": 1369 }, { "epoch": 0.4009364940005853, "grad_norm": 0.013460013084113598, "learning_rate": 3.997658764998537e-05, "loss": 0.0003, "step": 1370 }, { "epoch": 0.40122914837576823, "grad_norm": 0.6552921533584595, "learning_rate": 3.99692712906058e-05, "loss": 0.0035, "step": 1371 }, { "epoch": 0.4015218027509511, "grad_norm": 0.03980037197470665, "learning_rate": 3.996195493122623e-05, "loss": 0.0006, "step": 1372 }, { "epoch": 0.40181445712613406, "grad_norm": 0.04527165740728378, "learning_rate": 3.995463857184665e-05, "loss": 0.0009, "step": 1373 }, { "epoch": 0.40210711150131695, "grad_norm": 0.021924695000052452, "learning_rate": 3.994732221246708e-05, "loss": 0.0005, "step": 1374 }, { "epoch": 0.40239976587649984, "grad_norm": 0.02878180518746376, "learning_rate": 3.9940005853087506e-05, "loss": 0.0006, "step": 1375 }, { "epoch": 0.4026924202516828, "grad_norm": 0.04821598529815674, "learning_rate": 3.9932689493707934e-05, "loss": 0.0008, "step": 1376 }, { "epoch": 0.40298507462686567, "grad_norm": 1.2048909664154053, "learning_rate": 3.992537313432836e-05, "loss": 0.0047, "step": 1377 }, { "epoch": 0.40327772900204856, "grad_norm": 0.01729060895740986, "learning_rate": 3.991805677494879e-05, "loss": 0.0004, "step": 1378 }, { "epoch": 0.4035703833772315, "grad_norm": 3.4174089431762695, "learning_rate": 3.991074041556922e-05, "loss": 0.1988, "step": 1379 }, { "epoch": 0.4038630377524144, "grad_norm": 0.06411304324865341, "learning_rate": 3.9903424056189645e-05, "loss": 0.0007, "step": 1380 }, { "epoch": 0.40415569212759733, "grad_norm": 0.008149644359946251, "learning_rate": 3.989610769681007e-05, "loss": 0.0002, "step": 1381 }, { "epoch": 0.4044483465027802, "grad_norm": 0.07035718113183975, "learning_rate": 3.9888791337430495e-05, "loss": 0.0007, "step": 1382 }, { "epoch": 0.4047410008779631, "grad_norm": 0.004200743976980448, "learning_rate": 3.988147497805092e-05, "loss": 0.0001, "step": 1383 }, { "epoch": 0.40503365525314605, "grad_norm": 0.01093363855034113, "learning_rate": 3.987415861867135e-05, "loss": 0.0002, "step": 1384 }, { "epoch": 0.40532630962832894, "grad_norm": 0.005739153828471899, "learning_rate": 3.986684225929178e-05, "loss": 0.0001, "step": 1385 }, { "epoch": 0.40561896400351183, "grad_norm": 0.0033381276298314333, "learning_rate": 3.9859525899912206e-05, "loss": 0.0001, "step": 1386 }, { "epoch": 0.40591161837869477, "grad_norm": 0.06442870944738388, "learning_rate": 3.9852209540532634e-05, "loss": 0.0008, "step": 1387 }, { "epoch": 0.40620427275387766, "grad_norm": 0.014269592240452766, "learning_rate": 3.984489318115306e-05, "loss": 0.0002, "step": 1388 }, { "epoch": 0.4064969271290606, "grad_norm": 12.036171913146973, "learning_rate": 3.9837576821773483e-05, "loss": 0.1214, "step": 1389 }, { "epoch": 0.4067895815042435, "grad_norm": 9.762030601501465, "learning_rate": 3.983026046239391e-05, "loss": 0.132, "step": 1390 }, { "epoch": 0.4070822358794264, "grad_norm": 0.011800894513726234, "learning_rate": 3.982294410301434e-05, "loss": 0.0003, "step": 1391 }, { "epoch": 0.4073748902546093, "grad_norm": 0.014297746121883392, "learning_rate": 3.981562774363477e-05, "loss": 0.0002, "step": 1392 }, { "epoch": 0.4076675446297922, "grad_norm": 0.01088868360966444, "learning_rate": 3.9808311384255195e-05, "loss": 0.0002, "step": 1393 }, { "epoch": 0.4079601990049751, "grad_norm": 0.012100731022655964, "learning_rate": 3.980099502487562e-05, "loss": 0.0003, "step": 1394 }, { "epoch": 0.40825285338015804, "grad_norm": 0.0045606642961502075, "learning_rate": 3.979367866549605e-05, "loss": 0.0001, "step": 1395 }, { "epoch": 0.40854550775534093, "grad_norm": 0.007512548007071018, "learning_rate": 3.978636230611648e-05, "loss": 0.0002, "step": 1396 }, { "epoch": 0.4088381621305239, "grad_norm": 8.761828422546387, "learning_rate": 3.977904594673691e-05, "loss": 0.1879, "step": 1397 }, { "epoch": 0.40913081650570676, "grad_norm": 0.007708055432885885, "learning_rate": 3.977172958735733e-05, "loss": 0.0002, "step": 1398 }, { "epoch": 0.40942347088088965, "grad_norm": 0.022862451151013374, "learning_rate": 3.9764413227977756e-05, "loss": 0.0004, "step": 1399 }, { "epoch": 0.4097161252560726, "grad_norm": 0.01622886210680008, "learning_rate": 3.9757096868598184e-05, "loss": 0.0002, "step": 1400 }, { "epoch": 0.4100087796312555, "grad_norm": 0.004277550149708986, "learning_rate": 3.974978050921861e-05, "loss": 0.0001, "step": 1401 }, { "epoch": 0.41030143400643837, "grad_norm": 0.6454548239707947, "learning_rate": 3.974246414983904e-05, "loss": 0.0028, "step": 1402 }, { "epoch": 0.4105940883816213, "grad_norm": 0.007324654143303633, "learning_rate": 3.973514779045947e-05, "loss": 0.0002, "step": 1403 }, { "epoch": 0.4108867427568042, "grad_norm": 0.06323648244142532, "learning_rate": 3.9727831431079896e-05, "loss": 0.0005, "step": 1404 }, { "epoch": 0.41117939713198715, "grad_norm": 0.005309247877448797, "learning_rate": 3.9720515071700324e-05, "loss": 0.0001, "step": 1405 }, { "epoch": 0.41147205150717003, "grad_norm": 0.08161300420761108, "learning_rate": 3.971319871232075e-05, "loss": 0.0007, "step": 1406 }, { "epoch": 0.4117647058823529, "grad_norm": 0.009725712239742279, "learning_rate": 3.970588235294117e-05, "loss": 0.0002, "step": 1407 }, { "epoch": 0.41205736025753587, "grad_norm": 0.0238348376005888, "learning_rate": 3.96985659935616e-05, "loss": 0.0003, "step": 1408 }, { "epoch": 0.41235001463271875, "grad_norm": 0.014538202434778214, "learning_rate": 3.969124963418203e-05, "loss": 0.0004, "step": 1409 }, { "epoch": 0.41264266900790164, "grad_norm": 0.01194294448941946, "learning_rate": 3.968393327480246e-05, "loss": 0.0003, "step": 1410 }, { "epoch": 0.4129353233830846, "grad_norm": 10.102177619934082, "learning_rate": 3.9676616915422885e-05, "loss": 0.0828, "step": 1411 }, { "epoch": 0.4132279777582675, "grad_norm": 0.0121079720556736, "learning_rate": 3.966930055604331e-05, "loss": 0.0002, "step": 1412 }, { "epoch": 0.4135206321334504, "grad_norm": 0.01376876700669527, "learning_rate": 3.966198419666374e-05, "loss": 0.0003, "step": 1413 }, { "epoch": 0.4138132865086333, "grad_norm": 0.013423971831798553, "learning_rate": 3.965466783728417e-05, "loss": 0.0003, "step": 1414 }, { "epoch": 0.4141059408838162, "grad_norm": 0.018480490893125534, "learning_rate": 3.9647351477904597e-05, "loss": 0.0002, "step": 1415 }, { "epoch": 0.41439859525899914, "grad_norm": 0.010010740719735622, "learning_rate": 3.9640035118525025e-05, "loss": 0.0002, "step": 1416 }, { "epoch": 0.414691249634182, "grad_norm": 0.04045334458351135, "learning_rate": 3.9632718759145446e-05, "loss": 0.0006, "step": 1417 }, { "epoch": 0.4149839040093649, "grad_norm": 0.015969226136803627, "learning_rate": 3.9625402399765874e-05, "loss": 0.0004, "step": 1418 }, { "epoch": 0.41527655838454786, "grad_norm": 0.012548488564789295, "learning_rate": 3.96180860403863e-05, "loss": 0.0003, "step": 1419 }, { "epoch": 0.41556921275973074, "grad_norm": 0.002381518017500639, "learning_rate": 3.961076968100673e-05, "loss": 0.0001, "step": 1420 }, { "epoch": 0.4158618671349137, "grad_norm": 4.0952887535095215, "learning_rate": 3.960345332162716e-05, "loss": 0.0229, "step": 1421 }, { "epoch": 0.4161545215100966, "grad_norm": 0.039171867072582245, "learning_rate": 3.9596136962247585e-05, "loss": 0.0006, "step": 1422 }, { "epoch": 0.41644717588527946, "grad_norm": 0.05395697057247162, "learning_rate": 3.9588820602868013e-05, "loss": 0.0005, "step": 1423 }, { "epoch": 0.4167398302604624, "grad_norm": 2.978508949279785, "learning_rate": 3.958150424348844e-05, "loss": 0.216, "step": 1424 }, { "epoch": 0.4170324846356453, "grad_norm": 0.11305846273899078, "learning_rate": 3.957418788410887e-05, "loss": 0.0012, "step": 1425 }, { "epoch": 0.41732513901082824, "grad_norm": 1.4474173784255981, "learning_rate": 3.95668715247293e-05, "loss": 0.0048, "step": 1426 }, { "epoch": 0.4176177933860111, "grad_norm": 0.029976138845086098, "learning_rate": 3.9559555165349725e-05, "loss": 0.0006, "step": 1427 }, { "epoch": 0.417910447761194, "grad_norm": 0.0030669139232486486, "learning_rate": 3.9552238805970146e-05, "loss": 0.0001, "step": 1428 }, { "epoch": 0.41820310213637696, "grad_norm": 0.02776286192238331, "learning_rate": 3.9544922446590574e-05, "loss": 0.0005, "step": 1429 }, { "epoch": 0.41849575651155985, "grad_norm": 0.009369317442178726, "learning_rate": 3.9537606087211e-05, "loss": 0.0003, "step": 1430 }, { "epoch": 0.41878841088674273, "grad_norm": 0.05225818604230881, "learning_rate": 3.953028972783143e-05, "loss": 0.0007, "step": 1431 }, { "epoch": 0.4190810652619257, "grad_norm": 0.013598539866507053, "learning_rate": 3.952297336845186e-05, "loss": 0.0004, "step": 1432 }, { "epoch": 0.41937371963710857, "grad_norm": 0.018940743058919907, "learning_rate": 3.9515657009072286e-05, "loss": 0.0003, "step": 1433 }, { "epoch": 0.4196663740122915, "grad_norm": 0.014823894016444683, "learning_rate": 3.9508340649692714e-05, "loss": 0.0004, "step": 1434 }, { "epoch": 0.4199590283874744, "grad_norm": 0.016674788668751717, "learning_rate": 3.950102429031314e-05, "loss": 0.0004, "step": 1435 }, { "epoch": 0.4202516827626573, "grad_norm": 0.02102663926780224, "learning_rate": 3.949370793093357e-05, "loss": 0.0005, "step": 1436 }, { "epoch": 0.42054433713784023, "grad_norm": 0.01617257483303547, "learning_rate": 3.9486391571554e-05, "loss": 0.0003, "step": 1437 }, { "epoch": 0.4208369915130231, "grad_norm": 18.397354125976562, "learning_rate": 3.947907521217442e-05, "loss": 0.0812, "step": 1438 }, { "epoch": 0.421129645888206, "grad_norm": 4.920436382293701, "learning_rate": 3.947175885279485e-05, "loss": 0.051, "step": 1439 }, { "epoch": 0.42142230026338895, "grad_norm": 0.019231706857681274, "learning_rate": 3.9464442493415275e-05, "loss": 0.0004, "step": 1440 }, { "epoch": 0.42171495463857184, "grad_norm": 0.009344886988401413, "learning_rate": 3.94571261340357e-05, "loss": 0.0002, "step": 1441 }, { "epoch": 0.4220076090137548, "grad_norm": 0.019241783767938614, "learning_rate": 3.944980977465613e-05, "loss": 0.0004, "step": 1442 }, { "epoch": 0.42230026338893767, "grad_norm": 0.11743147671222687, "learning_rate": 3.944249341527656e-05, "loss": 0.0009, "step": 1443 }, { "epoch": 0.42259291776412056, "grad_norm": 0.027256010100245476, "learning_rate": 3.943517705589699e-05, "loss": 0.0005, "step": 1444 }, { "epoch": 0.4228855721393035, "grad_norm": 0.005137627013027668, "learning_rate": 3.9427860696517415e-05, "loss": 0.0001, "step": 1445 }, { "epoch": 0.4231782265144864, "grad_norm": 4.843735694885254, "learning_rate": 3.942054433713784e-05, "loss": 0.1782, "step": 1446 }, { "epoch": 0.4234708808896693, "grad_norm": 5.0710649490356445, "learning_rate": 3.941322797775827e-05, "loss": 0.0176, "step": 1447 }, { "epoch": 0.4237635352648522, "grad_norm": 1.0121043920516968, "learning_rate": 3.94059116183787e-05, "loss": 0.0036, "step": 1448 }, { "epoch": 0.4240561896400351, "grad_norm": 0.01022071111947298, "learning_rate": 3.939859525899912e-05, "loss": 0.0002, "step": 1449 }, { "epoch": 0.42434884401521805, "grad_norm": 0.011404616758227348, "learning_rate": 3.939127889961955e-05, "loss": 0.0002, "step": 1450 }, { "epoch": 0.42464149839040094, "grad_norm": 8.245271682739258, "learning_rate": 3.9383962540239976e-05, "loss": 0.2487, "step": 1451 }, { "epoch": 0.42493415276558383, "grad_norm": 6.751049995422363, "learning_rate": 3.9376646180860404e-05, "loss": 0.0898, "step": 1452 }, { "epoch": 0.42522680714076677, "grad_norm": 0.009213696233928204, "learning_rate": 3.936932982148083e-05, "loss": 0.0003, "step": 1453 }, { "epoch": 0.42551946151594966, "grad_norm": 0.008370252326130867, "learning_rate": 3.936201346210126e-05, "loss": 0.0002, "step": 1454 }, { "epoch": 0.42581211589113255, "grad_norm": 0.01335230190306902, "learning_rate": 3.935469710272169e-05, "loss": 0.0003, "step": 1455 }, { "epoch": 0.4261047702663155, "grad_norm": 0.013338599354028702, "learning_rate": 3.9347380743342115e-05, "loss": 0.0003, "step": 1456 }, { "epoch": 0.4263974246414984, "grad_norm": 0.057257991284132004, "learning_rate": 3.9340064383962543e-05, "loss": 0.0005, "step": 1457 }, { "epoch": 0.4266900790166813, "grad_norm": 0.03446501865983009, "learning_rate": 3.933274802458297e-05, "loss": 0.0006, "step": 1458 }, { "epoch": 0.4269827333918642, "grad_norm": 4.571585178375244, "learning_rate": 3.93254316652034e-05, "loss": 0.0939, "step": 1459 }, { "epoch": 0.4272753877670471, "grad_norm": 0.02036476694047451, "learning_rate": 3.931811530582382e-05, "loss": 0.0005, "step": 1460 }, { "epoch": 0.42756804214223004, "grad_norm": 0.020145880058407784, "learning_rate": 3.931079894644425e-05, "loss": 0.0005, "step": 1461 }, { "epoch": 0.42786069651741293, "grad_norm": 0.022871676832437515, "learning_rate": 3.9303482587064676e-05, "loss": 0.0005, "step": 1462 }, { "epoch": 0.4281533508925958, "grad_norm": 0.07374252378940582, "learning_rate": 3.9296166227685104e-05, "loss": 0.0008, "step": 1463 }, { "epoch": 0.42844600526777876, "grad_norm": 0.02100226655602455, "learning_rate": 3.928884986830553e-05, "loss": 0.0004, "step": 1464 }, { "epoch": 0.42873865964296165, "grad_norm": 0.012682802975177765, "learning_rate": 3.928153350892596e-05, "loss": 0.0003, "step": 1465 }, { "epoch": 0.4290313140181446, "grad_norm": 0.01567976363003254, "learning_rate": 3.927421714954639e-05, "loss": 0.0005, "step": 1466 }, { "epoch": 0.4293239683933275, "grad_norm": 6.267890930175781, "learning_rate": 3.9266900790166816e-05, "loss": 0.0788, "step": 1467 }, { "epoch": 0.42961662276851037, "grad_norm": 0.006946507375687361, "learning_rate": 3.9259584430787244e-05, "loss": 0.0002, "step": 1468 }, { "epoch": 0.4299092771436933, "grad_norm": 0.010431395843625069, "learning_rate": 3.925226807140767e-05, "loss": 0.0002, "step": 1469 }, { "epoch": 0.4302019315188762, "grad_norm": 0.1004369780421257, "learning_rate": 3.924495171202809e-05, "loss": 0.001, "step": 1470 }, { "epoch": 0.4304945858940591, "grad_norm": 0.03698040917515755, "learning_rate": 3.923763535264852e-05, "loss": 0.0006, "step": 1471 }, { "epoch": 0.43078724026924203, "grad_norm": 0.18772642314434052, "learning_rate": 3.923031899326895e-05, "loss": 0.0018, "step": 1472 }, { "epoch": 0.4310798946444249, "grad_norm": 4.989668846130371, "learning_rate": 3.922300263388938e-05, "loss": 0.0318, "step": 1473 }, { "epoch": 0.43137254901960786, "grad_norm": 0.10625988245010376, "learning_rate": 3.9215686274509805e-05, "loss": 0.0013, "step": 1474 }, { "epoch": 0.43166520339479075, "grad_norm": 0.0064647323451936245, "learning_rate": 3.920836991513023e-05, "loss": 0.0001, "step": 1475 }, { "epoch": 0.43195785776997364, "grad_norm": 0.01839122176170349, "learning_rate": 3.920105355575066e-05, "loss": 0.0004, "step": 1476 }, { "epoch": 0.4322505121451566, "grad_norm": 0.023326152935624123, "learning_rate": 3.919373719637109e-05, "loss": 0.0005, "step": 1477 }, { "epoch": 0.4325431665203395, "grad_norm": 0.009359706193208694, "learning_rate": 3.918642083699152e-05, "loss": 0.0002, "step": 1478 }, { "epoch": 0.43283582089552236, "grad_norm": 0.009548685513436794, "learning_rate": 3.9179104477611945e-05, "loss": 0.0002, "step": 1479 }, { "epoch": 0.4331284752707053, "grad_norm": 0.4187501072883606, "learning_rate": 3.917178811823237e-05, "loss": 0.0029, "step": 1480 }, { "epoch": 0.4334211296458882, "grad_norm": 0.09940121322870255, "learning_rate": 3.9164471758852794e-05, "loss": 0.0008, "step": 1481 }, { "epoch": 0.43371378402107114, "grad_norm": 0.4981006383895874, "learning_rate": 3.915715539947322e-05, "loss": 0.0023, "step": 1482 }, { "epoch": 0.434006438396254, "grad_norm": 0.00698661245405674, "learning_rate": 3.914983904009365e-05, "loss": 0.0002, "step": 1483 }, { "epoch": 0.4342990927714369, "grad_norm": 0.016817551106214523, "learning_rate": 3.914252268071408e-05, "loss": 0.0004, "step": 1484 }, { "epoch": 0.43459174714661986, "grad_norm": 0.005778777413070202, "learning_rate": 3.9135206321334506e-05, "loss": 0.0001, "step": 1485 }, { "epoch": 0.43488440152180274, "grad_norm": 0.0036035231314599514, "learning_rate": 3.9127889961954934e-05, "loss": 0.0001, "step": 1486 }, { "epoch": 0.4351770558969857, "grad_norm": 0.007998858578503132, "learning_rate": 3.912057360257536e-05, "loss": 0.0001, "step": 1487 }, { "epoch": 0.4354697102721686, "grad_norm": 1.1437442302703857, "learning_rate": 3.911325724319579e-05, "loss": 0.0076, "step": 1488 }, { "epoch": 0.43576236464735146, "grad_norm": 0.0033962379675358534, "learning_rate": 3.910594088381622e-05, "loss": 0.0001, "step": 1489 }, { "epoch": 0.4360550190225344, "grad_norm": 0.010588807053864002, "learning_rate": 3.9098624524436645e-05, "loss": 0.0002, "step": 1490 }, { "epoch": 0.4363476733977173, "grad_norm": 0.006284533068537712, "learning_rate": 3.9091308165057067e-05, "loss": 0.0001, "step": 1491 }, { "epoch": 0.4366403277729002, "grad_norm": 0.007512817159295082, "learning_rate": 3.9083991805677495e-05, "loss": 0.0001, "step": 1492 }, { "epoch": 0.4369329821480831, "grad_norm": 0.5369182825088501, "learning_rate": 3.907667544629792e-05, "loss": 0.0025, "step": 1493 }, { "epoch": 0.437225636523266, "grad_norm": 0.00482457410544157, "learning_rate": 3.906935908691835e-05, "loss": 0.0001, "step": 1494 }, { "epoch": 0.43751829089844896, "grad_norm": 0.021206054836511612, "learning_rate": 3.906204272753878e-05, "loss": 0.0002, "step": 1495 }, { "epoch": 0.43781094527363185, "grad_norm": 0.007357397116720676, "learning_rate": 3.9054726368159206e-05, "loss": 0.0001, "step": 1496 }, { "epoch": 0.43810359964881473, "grad_norm": 0.0006888994830660522, "learning_rate": 3.9047410008779634e-05, "loss": 0.0, "step": 1497 }, { "epoch": 0.4383962540239977, "grad_norm": 0.00313826696947217, "learning_rate": 3.904009364940006e-05, "loss": 0.0001, "step": 1498 }, { "epoch": 0.43868890839918057, "grad_norm": 0.007373438682407141, "learning_rate": 3.903277729002049e-05, "loss": 0.0001, "step": 1499 }, { "epoch": 0.43898156277436345, "grad_norm": 0.06540460139513016, "learning_rate": 3.902546093064092e-05, "loss": 0.0004, "step": 1500 }, { "epoch": 0.4392742171495464, "grad_norm": 11.630769729614258, "learning_rate": 3.9018144571261346e-05, "loss": 0.1478, "step": 1501 }, { "epoch": 0.4395668715247293, "grad_norm": 1.5413963794708252, "learning_rate": 3.901082821188177e-05, "loss": 0.0044, "step": 1502 }, { "epoch": 0.43985952589991223, "grad_norm": 0.0013103276723995805, "learning_rate": 3.9003511852502195e-05, "loss": 0.0, "step": 1503 }, { "epoch": 0.4401521802750951, "grad_norm": 0.0021003279834985733, "learning_rate": 3.899619549312262e-05, "loss": 0.0001, "step": 1504 }, { "epoch": 0.440444834650278, "grad_norm": 4.0217976570129395, "learning_rate": 3.898887913374305e-05, "loss": 0.1454, "step": 1505 }, { "epoch": 0.44073748902546095, "grad_norm": 0.05071612820029259, "learning_rate": 3.898156277436348e-05, "loss": 0.0002, "step": 1506 }, { "epoch": 0.44103014340064384, "grad_norm": 0.0028761134017258883, "learning_rate": 3.897424641498391e-05, "loss": 0.0001, "step": 1507 }, { "epoch": 0.4413227977758267, "grad_norm": 1.7916882038116455, "learning_rate": 3.8966930055604335e-05, "loss": 0.0058, "step": 1508 }, { "epoch": 0.44161545215100967, "grad_norm": 0.0016702886205166578, "learning_rate": 3.895961369622476e-05, "loss": 0.0, "step": 1509 }, { "epoch": 0.44190810652619256, "grad_norm": 0.06896457821130753, "learning_rate": 3.895229733684519e-05, "loss": 0.0004, "step": 1510 }, { "epoch": 0.4422007609013755, "grad_norm": 0.013217308558523655, "learning_rate": 3.894498097746562e-05, "loss": 0.0002, "step": 1511 }, { "epoch": 0.4424934152765584, "grad_norm": 0.007142780348658562, "learning_rate": 3.893766461808605e-05, "loss": 0.0001, "step": 1512 }, { "epoch": 0.4427860696517413, "grad_norm": 0.10819875448942184, "learning_rate": 3.893034825870647e-05, "loss": 0.0005, "step": 1513 }, { "epoch": 0.4430787240269242, "grad_norm": 0.4063895344734192, "learning_rate": 3.8923031899326896e-05, "loss": 0.0009, "step": 1514 }, { "epoch": 0.4433713784021071, "grad_norm": 2.506284713745117, "learning_rate": 3.8915715539947324e-05, "loss": 0.249, "step": 1515 }, { "epoch": 0.44366403277729, "grad_norm": 0.004622430540621281, "learning_rate": 3.890839918056775e-05, "loss": 0.0001, "step": 1516 }, { "epoch": 0.44395668715247294, "grad_norm": 0.007036368362605572, "learning_rate": 3.890108282118818e-05, "loss": 0.0001, "step": 1517 }, { "epoch": 0.4442493415276558, "grad_norm": 0.0023173808585852385, "learning_rate": 3.889376646180861e-05, "loss": 0.0, "step": 1518 }, { "epoch": 0.44454199590283877, "grad_norm": 2.48722243309021, "learning_rate": 3.8886450102429036e-05, "loss": 0.1683, "step": 1519 }, { "epoch": 0.44483465027802166, "grad_norm": 0.0057946923188865185, "learning_rate": 3.8879133743049464e-05, "loss": 0.0001, "step": 1520 }, { "epoch": 0.44512730465320455, "grad_norm": 0.09324245899915695, "learning_rate": 3.887181738366989e-05, "loss": 0.0014, "step": 1521 }, { "epoch": 0.4454199590283875, "grad_norm": 0.3141811490058899, "learning_rate": 3.886450102429032e-05, "loss": 0.0023, "step": 1522 }, { "epoch": 0.4457126134035704, "grad_norm": 0.010908433236181736, "learning_rate": 3.885718466491074e-05, "loss": 0.0002, "step": 1523 }, { "epoch": 0.44600526777875327, "grad_norm": 0.04157865792512894, "learning_rate": 3.884986830553117e-05, "loss": 0.0007, "step": 1524 }, { "epoch": 0.4462979221539362, "grad_norm": 0.3245861232280731, "learning_rate": 3.8842551946151597e-05, "loss": 0.0029, "step": 1525 }, { "epoch": 0.4465905765291191, "grad_norm": 0.08890893310308456, "learning_rate": 3.8835235586772025e-05, "loss": 0.0012, "step": 1526 }, { "epoch": 0.44688323090430204, "grad_norm": 0.18287794291973114, "learning_rate": 3.882791922739245e-05, "loss": 0.0031, "step": 1527 }, { "epoch": 0.44717588527948493, "grad_norm": 0.14691399037837982, "learning_rate": 3.882060286801288e-05, "loss": 0.0013, "step": 1528 }, { "epoch": 0.4474685396546678, "grad_norm": 0.008837338536977768, "learning_rate": 3.881328650863331e-05, "loss": 0.0002, "step": 1529 }, { "epoch": 0.44776119402985076, "grad_norm": 0.15746049582958221, "learning_rate": 3.8805970149253736e-05, "loss": 0.0016, "step": 1530 }, { "epoch": 0.44805384840503365, "grad_norm": 1.1990667581558228, "learning_rate": 3.8798653789874164e-05, "loss": 0.0195, "step": 1531 }, { "epoch": 0.44834650278021654, "grad_norm": 10.3333740234375, "learning_rate": 3.879133743049459e-05, "loss": 0.1041, "step": 1532 }, { "epoch": 0.4486391571553995, "grad_norm": 0.15595147013664246, "learning_rate": 3.878402107111502e-05, "loss": 0.0015, "step": 1533 }, { "epoch": 0.44893181153058237, "grad_norm": 0.018397051841020584, "learning_rate": 3.877670471173544e-05, "loss": 0.0004, "step": 1534 }, { "epoch": 0.4492244659057653, "grad_norm": 0.00914598349481821, "learning_rate": 3.876938835235587e-05, "loss": 0.0002, "step": 1535 }, { "epoch": 0.4495171202809482, "grad_norm": 0.10302092880010605, "learning_rate": 3.87620719929763e-05, "loss": 0.0014, "step": 1536 }, { "epoch": 0.4498097746561311, "grad_norm": 0.062038298696279526, "learning_rate": 3.8754755633596725e-05, "loss": 0.0011, "step": 1537 }, { "epoch": 0.45010242903131403, "grad_norm": 0.1500018686056137, "learning_rate": 3.874743927421715e-05, "loss": 0.0015, "step": 1538 }, { "epoch": 0.4503950834064969, "grad_norm": 0.016570372506976128, "learning_rate": 3.874012291483758e-05, "loss": 0.0004, "step": 1539 }, { "epoch": 0.4506877377816798, "grad_norm": 0.01027642097324133, "learning_rate": 3.873280655545801e-05, "loss": 0.0002, "step": 1540 }, { "epoch": 0.45098039215686275, "grad_norm": 0.011809996329247952, "learning_rate": 3.872549019607844e-05, "loss": 0.0003, "step": 1541 }, { "epoch": 0.45127304653204564, "grad_norm": 0.05385143309831619, "learning_rate": 3.8718173836698865e-05, "loss": 0.0004, "step": 1542 }, { "epoch": 0.4515657009072286, "grad_norm": 0.19968895614147186, "learning_rate": 3.871085747731929e-05, "loss": 0.001, "step": 1543 }, { "epoch": 0.4518583552824115, "grad_norm": 0.004949803464114666, "learning_rate": 3.870354111793972e-05, "loss": 0.0001, "step": 1544 }, { "epoch": 0.45215100965759436, "grad_norm": 0.007569814566522837, "learning_rate": 3.869622475856014e-05, "loss": 0.0002, "step": 1545 }, { "epoch": 0.4524436640327773, "grad_norm": 0.006508437916636467, "learning_rate": 3.868890839918057e-05, "loss": 0.0002, "step": 1546 }, { "epoch": 0.4527363184079602, "grad_norm": 0.07787645608186722, "learning_rate": 3.8681592039801e-05, "loss": 0.0008, "step": 1547 }, { "epoch": 0.45302897278314314, "grad_norm": 0.01738792657852173, "learning_rate": 3.8674275680421426e-05, "loss": 0.0003, "step": 1548 }, { "epoch": 0.453321627158326, "grad_norm": 0.2992579936981201, "learning_rate": 3.8666959321041854e-05, "loss": 0.001, "step": 1549 }, { "epoch": 0.4536142815335089, "grad_norm": 0.061419274657964706, "learning_rate": 3.865964296166228e-05, "loss": 0.0004, "step": 1550 }, { "epoch": 0.45390693590869186, "grad_norm": 0.013273519463837147, "learning_rate": 3.865232660228271e-05, "loss": 0.0003, "step": 1551 }, { "epoch": 0.45419959028387474, "grad_norm": 0.004993957933038473, "learning_rate": 3.864501024290314e-05, "loss": 0.0001, "step": 1552 }, { "epoch": 0.45449224465905763, "grad_norm": 0.04157587140798569, "learning_rate": 3.8637693883523566e-05, "loss": 0.0007, "step": 1553 }, { "epoch": 0.4547848990342406, "grad_norm": 0.004822755232453346, "learning_rate": 3.8630377524143994e-05, "loss": 0.0001, "step": 1554 }, { "epoch": 0.45507755340942346, "grad_norm": 0.007024036720395088, "learning_rate": 3.8623061164764415e-05, "loss": 0.0001, "step": 1555 }, { "epoch": 0.4553702077846064, "grad_norm": 0.0037519552279263735, "learning_rate": 3.861574480538484e-05, "loss": 0.0001, "step": 1556 }, { "epoch": 0.4556628621597893, "grad_norm": 0.003447313094511628, "learning_rate": 3.860842844600527e-05, "loss": 0.0001, "step": 1557 }, { "epoch": 0.4559555165349722, "grad_norm": 0.002084544859826565, "learning_rate": 3.86011120866257e-05, "loss": 0.0001, "step": 1558 }, { "epoch": 0.4562481709101551, "grad_norm": 0.0013182968832552433, "learning_rate": 3.8593795727246127e-05, "loss": 0.0, "step": 1559 }, { "epoch": 0.456540825285338, "grad_norm": 0.0034747810568660498, "learning_rate": 3.8586479367866555e-05, "loss": 0.0001, "step": 1560 }, { "epoch": 0.4568334796605209, "grad_norm": 0.041806068271398544, "learning_rate": 3.857916300848698e-05, "loss": 0.0002, "step": 1561 }, { "epoch": 0.45712613403570385, "grad_norm": 0.0035612063948065042, "learning_rate": 3.857184664910741e-05, "loss": 0.0001, "step": 1562 }, { "epoch": 0.45741878841088673, "grad_norm": 0.008042696863412857, "learning_rate": 3.856453028972783e-05, "loss": 0.0001, "step": 1563 }, { "epoch": 0.4577114427860697, "grad_norm": 0.0036441178526729345, "learning_rate": 3.855721393034826e-05, "loss": 0.0001, "step": 1564 }, { "epoch": 0.45800409716125257, "grad_norm": 0.014042911119759083, "learning_rate": 3.854989757096869e-05, "loss": 0.0001, "step": 1565 }, { "epoch": 0.45829675153643545, "grad_norm": 0.0040075695142149925, "learning_rate": 3.8542581211589115e-05, "loss": 0.0001, "step": 1566 }, { "epoch": 0.4585894059116184, "grad_norm": 0.0015805475413799286, "learning_rate": 3.8535264852209543e-05, "loss": 0.0, "step": 1567 }, { "epoch": 0.4588820602868013, "grad_norm": 0.007362937089055777, "learning_rate": 3.852794849282997e-05, "loss": 0.0001, "step": 1568 }, { "epoch": 0.4591747146619842, "grad_norm": 0.0010071613360196352, "learning_rate": 3.85206321334504e-05, "loss": 0.0, "step": 1569 }, { "epoch": 0.4594673690371671, "grad_norm": 0.0006803566357120872, "learning_rate": 3.851331577407083e-05, "loss": 0.0, "step": 1570 }, { "epoch": 0.45976002341235, "grad_norm": 0.0031457000877708197, "learning_rate": 3.850599941469125e-05, "loss": 0.0001, "step": 1571 }, { "epoch": 0.46005267778753295, "grad_norm": 0.010785883292555809, "learning_rate": 3.8498683055311676e-05, "loss": 0.0001, "step": 1572 }, { "epoch": 0.46034533216271584, "grad_norm": 4.436830043792725, "learning_rate": 3.8491366695932104e-05, "loss": 0.1064, "step": 1573 }, { "epoch": 0.4606379865378987, "grad_norm": 0.012927822768688202, "learning_rate": 3.848405033655253e-05, "loss": 0.0001, "step": 1574 }, { "epoch": 0.46093064091308167, "grad_norm": 0.020288215950131416, "learning_rate": 3.847673397717296e-05, "loss": 0.0001, "step": 1575 }, { "epoch": 0.46122329528826456, "grad_norm": 0.006180945783853531, "learning_rate": 3.846941761779339e-05, "loss": 0.0001, "step": 1576 }, { "epoch": 0.46151594966344744, "grad_norm": 4.199718475341797, "learning_rate": 3.8462101258413816e-05, "loss": 0.0074, "step": 1577 }, { "epoch": 0.4618086040386304, "grad_norm": 0.7159136533737183, "learning_rate": 3.8454784899034244e-05, "loss": 0.0059, "step": 1578 }, { "epoch": 0.4621012584138133, "grad_norm": 0.0019133149180561304, "learning_rate": 3.8447468539654665e-05, "loss": 0.0, "step": 1579 }, { "epoch": 0.4623939127889962, "grad_norm": 0.0006168386898934841, "learning_rate": 3.844015218027509e-05, "loss": 0.0, "step": 1580 }, { "epoch": 0.4626865671641791, "grad_norm": 0.0026355122681707144, "learning_rate": 3.843283582089552e-05, "loss": 0.0, "step": 1581 }, { "epoch": 0.462979221539362, "grad_norm": 0.0010602109832689166, "learning_rate": 3.842551946151595e-05, "loss": 0.0, "step": 1582 }, { "epoch": 0.46327187591454494, "grad_norm": 0.0019133257446810603, "learning_rate": 3.841820310213638e-05, "loss": 0.0, "step": 1583 }, { "epoch": 0.4635645302897278, "grad_norm": 0.0017432968597859144, "learning_rate": 3.8410886742756805e-05, "loss": 0.0, "step": 1584 }, { "epoch": 0.4638571846649107, "grad_norm": 4.2040534019470215, "learning_rate": 3.840357038337723e-05, "loss": 0.0155, "step": 1585 }, { "epoch": 0.46414983904009366, "grad_norm": 13.846515655517578, "learning_rate": 3.839625402399766e-05, "loss": 0.0595, "step": 1586 }, { "epoch": 0.46444249341527655, "grad_norm": 8.511911392211914, "learning_rate": 3.838893766461808e-05, "loss": 0.1244, "step": 1587 }, { "epoch": 0.4647351477904595, "grad_norm": 0.0030939967837184668, "learning_rate": 3.838162130523851e-05, "loss": 0.0, "step": 1588 }, { "epoch": 0.4650278021656424, "grad_norm": 0.8021648526191711, "learning_rate": 3.837430494585894e-05, "loss": 0.0017, "step": 1589 }, { "epoch": 0.46532045654082527, "grad_norm": 0.002673780545592308, "learning_rate": 3.8366988586479366e-05, "loss": 0.0001, "step": 1590 }, { "epoch": 0.4656131109160082, "grad_norm": 8.91903018951416, "learning_rate": 3.8359672227099794e-05, "loss": 0.1671, "step": 1591 }, { "epoch": 0.4659057652911911, "grad_norm": 0.002431961242109537, "learning_rate": 3.835235586772022e-05, "loss": 0.0, "step": 1592 }, { "epoch": 0.466198419666374, "grad_norm": 7.502378463745117, "learning_rate": 3.834503950834065e-05, "loss": 0.0763, "step": 1593 }, { "epoch": 0.46649107404155693, "grad_norm": 0.017539622262120247, "learning_rate": 3.833772314896108e-05, "loss": 0.0002, "step": 1594 }, { "epoch": 0.4667837284167398, "grad_norm": 0.17085938155651093, "learning_rate": 3.8330406789581506e-05, "loss": 0.001, "step": 1595 }, { "epoch": 0.46707638279192276, "grad_norm": 1.729633092880249, "learning_rate": 3.8323090430201934e-05, "loss": 0.2641, "step": 1596 }, { "epoch": 0.46736903716710565, "grad_norm": 0.06899145990610123, "learning_rate": 3.831577407082236e-05, "loss": 0.0006, "step": 1597 }, { "epoch": 0.46766169154228854, "grad_norm": 0.004920002538710833, "learning_rate": 3.830845771144278e-05, "loss": 0.0001, "step": 1598 }, { "epoch": 0.4679543459174715, "grad_norm": 6.381435871124268, "learning_rate": 3.830114135206321e-05, "loss": 0.0941, "step": 1599 }, { "epoch": 0.46824700029265437, "grad_norm": 0.010226819664239883, "learning_rate": 3.829382499268364e-05, "loss": 0.0002, "step": 1600 }, { "epoch": 0.46853965466783726, "grad_norm": 7.104165077209473, "learning_rate": 3.8286508633304067e-05, "loss": 0.1188, "step": 1601 }, { "epoch": 0.4688323090430202, "grad_norm": 0.02988087385892868, "learning_rate": 3.8279192273924495e-05, "loss": 0.0006, "step": 1602 }, { "epoch": 0.4691249634182031, "grad_norm": 0.07559366524219513, "learning_rate": 3.827187591454492e-05, "loss": 0.0013, "step": 1603 }, { "epoch": 0.46941761779338603, "grad_norm": 0.11377298086881638, "learning_rate": 3.826455955516535e-05, "loss": 0.0023, "step": 1604 }, { "epoch": 0.4697102721685689, "grad_norm": 0.29434069991111755, "learning_rate": 3.825724319578578e-05, "loss": 0.0044, "step": 1605 }, { "epoch": 0.4700029265437518, "grad_norm": 0.18786948919296265, "learning_rate": 3.8249926836406206e-05, "loss": 0.004, "step": 1606 }, { "epoch": 0.47029558091893475, "grad_norm": 0.12316965311765671, "learning_rate": 3.8242610477026634e-05, "loss": 0.0024, "step": 1607 }, { "epoch": 0.47058823529411764, "grad_norm": 0.03546321019530296, "learning_rate": 3.8235294117647055e-05, "loss": 0.0007, "step": 1608 }, { "epoch": 0.47088088966930053, "grad_norm": 0.08442742377519608, "learning_rate": 3.8227977758267483e-05, "loss": 0.0018, "step": 1609 }, { "epoch": 0.47117354404448347, "grad_norm": 0.07069230824708939, "learning_rate": 3.822066139888791e-05, "loss": 0.0006, "step": 1610 }, { "epoch": 0.47146619841966636, "grad_norm": 0.08098865300416946, "learning_rate": 3.821334503950834e-05, "loss": 0.0016, "step": 1611 }, { "epoch": 0.4717588527948493, "grad_norm": 0.1017969399690628, "learning_rate": 3.820602868012877e-05, "loss": 0.0017, "step": 1612 }, { "epoch": 0.4720515071700322, "grad_norm": 0.05172237381339073, "learning_rate": 3.8198712320749195e-05, "loss": 0.0011, "step": 1613 }, { "epoch": 0.4723441615452151, "grad_norm": 6.959278106689453, "learning_rate": 3.819139596136962e-05, "loss": 0.0355, "step": 1614 }, { "epoch": 0.472636815920398, "grad_norm": 0.010147632099688053, "learning_rate": 3.818407960199005e-05, "loss": 0.0003, "step": 1615 }, { "epoch": 0.4729294702955809, "grad_norm": 0.08733673393726349, "learning_rate": 3.817676324261048e-05, "loss": 0.0007, "step": 1616 }, { "epoch": 0.47322212467076386, "grad_norm": 0.015223219059407711, "learning_rate": 3.816944688323091e-05, "loss": 0.0004, "step": 1617 }, { "epoch": 0.47351477904594674, "grad_norm": 2.2570912837982178, "learning_rate": 3.8162130523851335e-05, "loss": 0.0089, "step": 1618 }, { "epoch": 0.47380743342112963, "grad_norm": 2.3515350818634033, "learning_rate": 3.8154814164471756e-05, "loss": 0.0106, "step": 1619 }, { "epoch": 0.4741000877963126, "grad_norm": 0.02242353744804859, "learning_rate": 3.8147497805092184e-05, "loss": 0.0004, "step": 1620 }, { "epoch": 0.47439274217149546, "grad_norm": 0.10555027425289154, "learning_rate": 3.814018144571261e-05, "loss": 0.0011, "step": 1621 }, { "epoch": 0.47468539654667835, "grad_norm": 4.0205769538879395, "learning_rate": 3.813286508633304e-05, "loss": 0.0072, "step": 1622 }, { "epoch": 0.4749780509218613, "grad_norm": 2.659857749938965, "learning_rate": 3.812554872695347e-05, "loss": 0.0142, "step": 1623 }, { "epoch": 0.4752707052970442, "grad_norm": 6.0880608558654785, "learning_rate": 3.8118232367573896e-05, "loss": 0.1141, "step": 1624 }, { "epoch": 0.4755633596722271, "grad_norm": 0.007533062249422073, "learning_rate": 3.8110916008194324e-05, "loss": 0.0002, "step": 1625 }, { "epoch": 0.47585601404741, "grad_norm": 0.09547273814678192, "learning_rate": 3.810359964881475e-05, "loss": 0.0005, "step": 1626 }, { "epoch": 0.4761486684225929, "grad_norm": 0.22707970440387726, "learning_rate": 3.809628328943518e-05, "loss": 0.0009, "step": 1627 }, { "epoch": 0.47644132279777585, "grad_norm": 0.0026254430413246155, "learning_rate": 3.808896693005561e-05, "loss": 0.0, "step": 1628 }, { "epoch": 0.47673397717295873, "grad_norm": 0.016741158440709114, "learning_rate": 3.808165057067603e-05, "loss": 0.0003, "step": 1629 }, { "epoch": 0.4770266315481416, "grad_norm": 10.494945526123047, "learning_rate": 3.807433421129646e-05, "loss": 0.0877, "step": 1630 }, { "epoch": 0.47731928592332457, "grad_norm": 13.728397369384766, "learning_rate": 3.8067017851916885e-05, "loss": 0.1001, "step": 1631 }, { "epoch": 0.47761194029850745, "grad_norm": 0.011857084929943085, "learning_rate": 3.805970149253731e-05, "loss": 0.0002, "step": 1632 }, { "epoch": 0.4779045946736904, "grad_norm": 8.238283157348633, "learning_rate": 3.805238513315774e-05, "loss": 0.0229, "step": 1633 }, { "epoch": 0.4781972490488733, "grad_norm": 7.15416955947876, "learning_rate": 3.804506877377817e-05, "loss": 0.0276, "step": 1634 }, { "epoch": 0.4784899034240562, "grad_norm": 0.009290005080401897, "learning_rate": 3.8037752414398597e-05, "loss": 0.0002, "step": 1635 }, { "epoch": 0.4787825577992391, "grad_norm": 0.009782211855053902, "learning_rate": 3.8030436055019025e-05, "loss": 0.0002, "step": 1636 }, { "epoch": 0.479075212174422, "grad_norm": 0.0036196867004036903, "learning_rate": 3.802311969563945e-05, "loss": 0.0001, "step": 1637 }, { "epoch": 0.4793678665496049, "grad_norm": 0.007514093536883593, "learning_rate": 3.801580333625988e-05, "loss": 0.0002, "step": 1638 }, { "epoch": 0.47966052092478784, "grad_norm": 0.07700416445732117, "learning_rate": 3.800848697688031e-05, "loss": 0.0008, "step": 1639 }, { "epoch": 0.4799531752999707, "grad_norm": 0.06622344255447388, "learning_rate": 3.800117061750073e-05, "loss": 0.0004, "step": 1640 }, { "epoch": 0.48024582967515367, "grad_norm": 0.03312865272164345, "learning_rate": 3.799385425812116e-05, "loss": 0.0004, "step": 1641 }, { "epoch": 0.48053848405033656, "grad_norm": 8.950428009033203, "learning_rate": 3.7986537898741585e-05, "loss": 0.0752, "step": 1642 }, { "epoch": 0.48083113842551944, "grad_norm": 0.01622932218015194, "learning_rate": 3.7979221539362013e-05, "loss": 0.0001, "step": 1643 }, { "epoch": 0.4811237928007024, "grad_norm": 5.980432510375977, "learning_rate": 3.797190517998244e-05, "loss": 0.0757, "step": 1644 }, { "epoch": 0.4814164471758853, "grad_norm": 5.4728217124938965, "learning_rate": 3.796458882060287e-05, "loss": 0.13, "step": 1645 }, { "epoch": 0.48170910155106816, "grad_norm": 0.004476544447243214, "learning_rate": 3.79572724612233e-05, "loss": 0.0001, "step": 1646 }, { "epoch": 0.4820017559262511, "grad_norm": 0.0021811951883137226, "learning_rate": 3.7949956101843725e-05, "loss": 0.0001, "step": 1647 }, { "epoch": 0.482294410301434, "grad_norm": 0.0010316645493730903, "learning_rate": 3.794263974246415e-05, "loss": 0.0, "step": 1648 }, { "epoch": 0.48258706467661694, "grad_norm": 0.25734391808509827, "learning_rate": 3.793532338308458e-05, "loss": 0.0017, "step": 1649 }, { "epoch": 0.4828797190517998, "grad_norm": 0.0003380229463800788, "learning_rate": 3.792800702370501e-05, "loss": 0.0, "step": 1650 }, { "epoch": 0.4831723734269827, "grad_norm": 18.4676513671875, "learning_rate": 3.792069066432543e-05, "loss": 0.0973, "step": 1651 }, { "epoch": 0.48346502780216566, "grad_norm": 0.015821581706404686, "learning_rate": 3.791337430494586e-05, "loss": 0.0003, "step": 1652 }, { "epoch": 0.48375768217734855, "grad_norm": 2.85701060295105, "learning_rate": 3.7906057945566286e-05, "loss": 0.0129, "step": 1653 }, { "epoch": 0.48405033655253143, "grad_norm": 0.15178117156028748, "learning_rate": 3.7898741586186714e-05, "loss": 0.0019, "step": 1654 }, { "epoch": 0.4843429909277144, "grad_norm": 0.014793816953897476, "learning_rate": 3.789142522680714e-05, "loss": 0.0003, "step": 1655 }, { "epoch": 0.48463564530289727, "grad_norm": 9.333024024963379, "learning_rate": 3.788410886742757e-05, "loss": 0.0552, "step": 1656 }, { "epoch": 0.4849282996780802, "grad_norm": 0.03246520087122917, "learning_rate": 3.7876792508048e-05, "loss": 0.0005, "step": 1657 }, { "epoch": 0.4852209540532631, "grad_norm": 0.005863872356712818, "learning_rate": 3.7869476148668426e-05, "loss": 0.0001, "step": 1658 }, { "epoch": 0.485513608428446, "grad_norm": 0.02286364510655403, "learning_rate": 3.7862159789288854e-05, "loss": 0.0006, "step": 1659 }, { "epoch": 0.48580626280362893, "grad_norm": 0.017278209328651428, "learning_rate": 3.785484342990928e-05, "loss": 0.0003, "step": 1660 }, { "epoch": 0.4860989171788118, "grad_norm": 9.417853355407715, "learning_rate": 3.78475270705297e-05, "loss": 0.0752, "step": 1661 }, { "epoch": 0.4863915715539947, "grad_norm": 0.08602503687143326, "learning_rate": 3.784021071115013e-05, "loss": 0.0004, "step": 1662 }, { "epoch": 0.48668422592917765, "grad_norm": 0.05982265621423721, "learning_rate": 3.783289435177056e-05, "loss": 0.0003, "step": 1663 }, { "epoch": 0.48697688030436054, "grad_norm": 5.121674060821533, "learning_rate": 3.782557799239099e-05, "loss": 0.0837, "step": 1664 }, { "epoch": 0.4872695346795435, "grad_norm": 0.00907017569988966, "learning_rate": 3.7818261633011415e-05, "loss": 0.0003, "step": 1665 }, { "epoch": 0.48756218905472637, "grad_norm": 0.10281817615032196, "learning_rate": 3.781094527363184e-05, "loss": 0.0006, "step": 1666 }, { "epoch": 0.48785484342990926, "grad_norm": 2.749037504196167, "learning_rate": 3.780362891425227e-05, "loss": 0.0626, "step": 1667 }, { "epoch": 0.4881474978050922, "grad_norm": 1.7417001724243164, "learning_rate": 3.77963125548727e-05, "loss": 0.0048, "step": 1668 }, { "epoch": 0.4884401521802751, "grad_norm": 0.02646813727915287, "learning_rate": 3.7788996195493127e-05, "loss": 0.0003, "step": 1669 }, { "epoch": 0.488732806555458, "grad_norm": 0.040591128170490265, "learning_rate": 3.7781679836113555e-05, "loss": 0.0008, "step": 1670 }, { "epoch": 0.4890254609306409, "grad_norm": 0.09096373617649078, "learning_rate": 3.777436347673398e-05, "loss": 0.0014, "step": 1671 }, { "epoch": 0.4893181153058238, "grad_norm": 0.06142628937959671, "learning_rate": 3.7767047117354404e-05, "loss": 0.001, "step": 1672 }, { "epoch": 0.48961076968100675, "grad_norm": 0.011314328759908676, "learning_rate": 3.775973075797483e-05, "loss": 0.0002, "step": 1673 }, { "epoch": 0.48990342405618964, "grad_norm": 8.231575012207031, "learning_rate": 3.775241439859526e-05, "loss": 0.093, "step": 1674 }, { "epoch": 0.49019607843137253, "grad_norm": 0.314681738615036, "learning_rate": 3.774509803921569e-05, "loss": 0.0042, "step": 1675 }, { "epoch": 0.49048873280655547, "grad_norm": 0.04359268397092819, "learning_rate": 3.7737781679836115e-05, "loss": 0.0008, "step": 1676 }, { "epoch": 0.49078138718173836, "grad_norm": 1.3855690956115723, "learning_rate": 3.773046532045654e-05, "loss": 0.0126, "step": 1677 }, { "epoch": 0.4910740415569213, "grad_norm": 4.025027751922607, "learning_rate": 3.772314896107697e-05, "loss": 0.0629, "step": 1678 }, { "epoch": 0.4913666959321042, "grad_norm": 0.0015903118764981627, "learning_rate": 3.77158326016974e-05, "loss": 0.0, "step": 1679 }, { "epoch": 0.4916593503072871, "grad_norm": 0.015470651909708977, "learning_rate": 3.770851624231783e-05, "loss": 0.0003, "step": 1680 }, { "epoch": 0.49195200468247, "grad_norm": 0.005371852777898312, "learning_rate": 3.7701199882938255e-05, "loss": 0.0001, "step": 1681 }, { "epoch": 0.4922446590576529, "grad_norm": 0.02501101791858673, "learning_rate": 3.7693883523558676e-05, "loss": 0.0003, "step": 1682 }, { "epoch": 0.4925373134328358, "grad_norm": 0.01687714457511902, "learning_rate": 3.7686567164179104e-05, "loss": 0.0003, "step": 1683 }, { "epoch": 0.49282996780801874, "grad_norm": 0.009274466894567013, "learning_rate": 3.767925080479953e-05, "loss": 0.0002, "step": 1684 }, { "epoch": 0.49312262218320163, "grad_norm": 0.008989389054477215, "learning_rate": 3.767193444541996e-05, "loss": 0.0002, "step": 1685 }, { "epoch": 0.4934152765583846, "grad_norm": 0.02986481972038746, "learning_rate": 3.766461808604039e-05, "loss": 0.0004, "step": 1686 }, { "epoch": 0.49370793093356746, "grad_norm": 0.0024271649308502674, "learning_rate": 3.7657301726660816e-05, "loss": 0.0001, "step": 1687 }, { "epoch": 0.49400058530875035, "grad_norm": 0.14183920621871948, "learning_rate": 3.7649985367281244e-05, "loss": 0.0005, "step": 1688 }, { "epoch": 0.4942932396839333, "grad_norm": 0.12646184861660004, "learning_rate": 3.764266900790167e-05, "loss": 0.001, "step": 1689 }, { "epoch": 0.4945858940591162, "grad_norm": 0.18008200824260712, "learning_rate": 3.76353526485221e-05, "loss": 0.0007, "step": 1690 }, { "epoch": 0.49487854843429907, "grad_norm": 2.948591709136963, "learning_rate": 3.762803628914253e-05, "loss": 0.0058, "step": 1691 }, { "epoch": 0.495171202809482, "grad_norm": 3.9043712615966797, "learning_rate": 3.7620719929762956e-05, "loss": 0.2148, "step": 1692 }, { "epoch": 0.4954638571846649, "grad_norm": 0.014103816822171211, "learning_rate": 3.761340357038338e-05, "loss": 0.0002, "step": 1693 }, { "epoch": 0.49575651155984785, "grad_norm": 0.047968197613954544, "learning_rate": 3.7606087211003805e-05, "loss": 0.0006, "step": 1694 }, { "epoch": 0.49604916593503073, "grad_norm": 0.008447016589343548, "learning_rate": 3.759877085162423e-05, "loss": 0.0002, "step": 1695 }, { "epoch": 0.4963418203102136, "grad_norm": 0.004565671551972628, "learning_rate": 3.759145449224466e-05, "loss": 0.0001, "step": 1696 }, { "epoch": 0.49663447468539657, "grad_norm": 0.009304909966886044, "learning_rate": 3.758413813286509e-05, "loss": 0.0001, "step": 1697 }, { "epoch": 0.49692712906057945, "grad_norm": 3.502692699432373, "learning_rate": 3.757682177348552e-05, "loss": 0.2776, "step": 1698 }, { "epoch": 0.49721978343576234, "grad_norm": 0.012169056572020054, "learning_rate": 3.7569505414105945e-05, "loss": 0.0001, "step": 1699 }, { "epoch": 0.4975124378109453, "grad_norm": 0.0030158180743455887, "learning_rate": 3.756218905472637e-05, "loss": 0.0001, "step": 1700 }, { "epoch": 0.4978050921861282, "grad_norm": 0.013154564425349236, "learning_rate": 3.75548726953468e-05, "loss": 0.0003, "step": 1701 }, { "epoch": 0.4980977465613111, "grad_norm": 0.001938572502695024, "learning_rate": 3.754755633596723e-05, "loss": 0.0, "step": 1702 }, { "epoch": 0.498390400936494, "grad_norm": 0.02460942603647709, "learning_rate": 3.7540239976587657e-05, "loss": 0.0002, "step": 1703 }, { "epoch": 0.4986830553116769, "grad_norm": 8.89484691619873, "learning_rate": 3.753292361720808e-05, "loss": 0.0953, "step": 1704 }, { "epoch": 0.49897570968685984, "grad_norm": 0.3271154463291168, "learning_rate": 3.7525607257828506e-05, "loss": 0.0018, "step": 1705 }, { "epoch": 0.4992683640620427, "grad_norm": 0.03161986917257309, "learning_rate": 3.7518290898448934e-05, "loss": 0.0005, "step": 1706 }, { "epoch": 0.4995610184372256, "grad_norm": 0.9015589952468872, "learning_rate": 3.751097453906936e-05, "loss": 0.0048, "step": 1707 }, { "epoch": 0.49985367281240856, "grad_norm": 0.00638006255030632, "learning_rate": 3.750365817968979e-05, "loss": 0.0002, "step": 1708 }, { "epoch": 0.5001463271875914, "grad_norm": 4.401444435119629, "learning_rate": 3.749634182031022e-05, "loss": 0.0155, "step": 1709 }, { "epoch": 0.5004389815627743, "grad_norm": 0.011124534532427788, "learning_rate": 3.7489025460930645e-05, "loss": 0.0002, "step": 1710 }, { "epoch": 0.5007316359379572, "grad_norm": 0.10676395893096924, "learning_rate": 3.748170910155107e-05, "loss": 0.0009, "step": 1711 }, { "epoch": 0.5010242903131402, "grad_norm": 0.010841317474842072, "learning_rate": 3.74743927421715e-05, "loss": 0.0002, "step": 1712 }, { "epoch": 0.5013169446883231, "grad_norm": 0.37011250853538513, "learning_rate": 3.746707638279193e-05, "loss": 0.0024, "step": 1713 }, { "epoch": 0.501609599063506, "grad_norm": 4.9328107833862305, "learning_rate": 3.745976002341235e-05, "loss": 0.0161, "step": 1714 }, { "epoch": 0.5019022534386889, "grad_norm": 0.00960032269358635, "learning_rate": 3.745244366403278e-05, "loss": 0.0002, "step": 1715 }, { "epoch": 0.5021949078138718, "grad_norm": 0.14403264224529266, "learning_rate": 3.7445127304653206e-05, "loss": 0.0024, "step": 1716 }, { "epoch": 0.5024875621890548, "grad_norm": 0.0046923803165555, "learning_rate": 3.7437810945273634e-05, "loss": 0.0001, "step": 1717 }, { "epoch": 0.5027802165642377, "grad_norm": 0.003075518412515521, "learning_rate": 3.743049458589406e-05, "loss": 0.0001, "step": 1718 }, { "epoch": 0.5030728709394205, "grad_norm": 0.020254552364349365, "learning_rate": 3.742317822651449e-05, "loss": 0.0003, "step": 1719 }, { "epoch": 0.5033655253146034, "grad_norm": 0.009212339296936989, "learning_rate": 3.741586186713492e-05, "loss": 0.0001, "step": 1720 }, { "epoch": 0.5036581796897863, "grad_norm": 0.015108101069927216, "learning_rate": 3.7408545507755346e-05, "loss": 0.0003, "step": 1721 }, { "epoch": 0.5039508340649693, "grad_norm": 0.0037316512316465378, "learning_rate": 3.7401229148375774e-05, "loss": 0.0001, "step": 1722 }, { "epoch": 0.5042434884401522, "grad_norm": 0.20514242351055145, "learning_rate": 3.73939127889962e-05, "loss": 0.0013, "step": 1723 }, { "epoch": 0.5045361428153351, "grad_norm": 0.0022379914298653603, "learning_rate": 3.738659642961663e-05, "loss": 0.0001, "step": 1724 }, { "epoch": 0.504828797190518, "grad_norm": 0.0064598931930959225, "learning_rate": 3.737928007023705e-05, "loss": 0.0002, "step": 1725 }, { "epoch": 0.5051214515657009, "grad_norm": 0.01024326216429472, "learning_rate": 3.737196371085748e-05, "loss": 0.0002, "step": 1726 }, { "epoch": 0.5054141059408838, "grad_norm": 0.004229737911373377, "learning_rate": 3.736464735147791e-05, "loss": 0.0001, "step": 1727 }, { "epoch": 0.5057067603160668, "grad_norm": 0.0021727506536990404, "learning_rate": 3.7357330992098335e-05, "loss": 0.0, "step": 1728 }, { "epoch": 0.5059994146912496, "grad_norm": 8.523039817810059, "learning_rate": 3.735001463271876e-05, "loss": 0.0721, "step": 1729 }, { "epoch": 0.5062920690664325, "grad_norm": 0.0025433769915252924, "learning_rate": 3.734269827333919e-05, "loss": 0.0001, "step": 1730 }, { "epoch": 0.5065847234416154, "grad_norm": 0.0037780781276524067, "learning_rate": 3.733538191395962e-05, "loss": 0.0001, "step": 1731 }, { "epoch": 0.5068773778167983, "grad_norm": 0.0531008280813694, "learning_rate": 3.732806555458005e-05, "loss": 0.0005, "step": 1732 }, { "epoch": 0.5071700321919813, "grad_norm": 0.006564264185726643, "learning_rate": 3.7320749195200475e-05, "loss": 0.0001, "step": 1733 }, { "epoch": 0.5074626865671642, "grad_norm": 0.5764566659927368, "learning_rate": 3.73134328358209e-05, "loss": 0.0028, "step": 1734 }, { "epoch": 0.5077553409423471, "grad_norm": 0.001747317728586495, "learning_rate": 3.730611647644133e-05, "loss": 0.0, "step": 1735 }, { "epoch": 0.50804799531753, "grad_norm": 0.005131816025823355, "learning_rate": 3.729880011706175e-05, "loss": 0.0001, "step": 1736 }, { "epoch": 0.5083406496927129, "grad_norm": 0.2529681324958801, "learning_rate": 3.729148375768218e-05, "loss": 0.0011, "step": 1737 }, { "epoch": 0.5086333040678959, "grad_norm": 0.00605887221172452, "learning_rate": 3.728416739830261e-05, "loss": 0.0001, "step": 1738 }, { "epoch": 0.5089259584430788, "grad_norm": 0.15838930010795593, "learning_rate": 3.7276851038923036e-05, "loss": 0.001, "step": 1739 }, { "epoch": 0.5092186128182616, "grad_norm": 0.008716394193470478, "learning_rate": 3.7269534679543464e-05, "loss": 0.0001, "step": 1740 }, { "epoch": 0.5095112671934445, "grad_norm": 0.10244045406579971, "learning_rate": 3.726221832016389e-05, "loss": 0.0006, "step": 1741 }, { "epoch": 0.5098039215686274, "grad_norm": 0.014128678478300571, "learning_rate": 3.725490196078432e-05, "loss": 0.0002, "step": 1742 }, { "epoch": 0.5100965759438104, "grad_norm": 0.024465791881084442, "learning_rate": 3.724758560140475e-05, "loss": 0.0004, "step": 1743 }, { "epoch": 0.5103892303189933, "grad_norm": 0.10913330316543579, "learning_rate": 3.724026924202517e-05, "loss": 0.0007, "step": 1744 }, { "epoch": 0.5106818846941762, "grad_norm": 0.01726347953081131, "learning_rate": 3.7232952882645597e-05, "loss": 0.0002, "step": 1745 }, { "epoch": 0.5109745390693591, "grad_norm": 0.08208633214235306, "learning_rate": 3.7225636523266025e-05, "loss": 0.0008, "step": 1746 }, { "epoch": 0.511267193444542, "grad_norm": 10.661100387573242, "learning_rate": 3.721832016388645e-05, "loss": 0.0384, "step": 1747 }, { "epoch": 0.5115598478197249, "grad_norm": 0.0018042756710201502, "learning_rate": 3.721100380450688e-05, "loss": 0.0, "step": 1748 }, { "epoch": 0.5118525021949079, "grad_norm": 0.014229393564164639, "learning_rate": 3.720368744512731e-05, "loss": 0.0001, "step": 1749 }, { "epoch": 0.5121451565700907, "grad_norm": 0.002377843949943781, "learning_rate": 3.7196371085747736e-05, "loss": 0.0001, "step": 1750 }, { "epoch": 0.5124378109452736, "grad_norm": 0.08071793615818024, "learning_rate": 3.7189054726368164e-05, "loss": 0.0004, "step": 1751 }, { "epoch": 0.5127304653204565, "grad_norm": 0.0018894653767347336, "learning_rate": 3.7181738366988585e-05, "loss": 0.0, "step": 1752 }, { "epoch": 0.5130231196956394, "grad_norm": 0.0011113358195871115, "learning_rate": 3.7174422007609013e-05, "loss": 0.0, "step": 1753 }, { "epoch": 0.5133157740708224, "grad_norm": 0.002135826740413904, "learning_rate": 3.716710564822944e-05, "loss": 0.0001, "step": 1754 }, { "epoch": 0.5136084284460053, "grad_norm": 0.00445591239258647, "learning_rate": 3.715978928884987e-05, "loss": 0.0001, "step": 1755 }, { "epoch": 0.5139010828211882, "grad_norm": 0.0021990472450852394, "learning_rate": 3.71524729294703e-05, "loss": 0.0, "step": 1756 }, { "epoch": 0.5141937371963711, "grad_norm": 0.11762271821498871, "learning_rate": 3.7145156570090725e-05, "loss": 0.0004, "step": 1757 }, { "epoch": 0.514486391571554, "grad_norm": 6.813816547393799, "learning_rate": 3.713784021071115e-05, "loss": 0.096, "step": 1758 }, { "epoch": 0.514779045946737, "grad_norm": 0.007990210317075253, "learning_rate": 3.713052385133158e-05, "loss": 0.0001, "step": 1759 }, { "epoch": 0.5150717003219198, "grad_norm": 0.0036788741126656532, "learning_rate": 3.7123207491952e-05, "loss": 0.0001, "step": 1760 }, { "epoch": 0.5153643546971027, "grad_norm": 0.0008727388922125101, "learning_rate": 3.711589113257243e-05, "loss": 0.0, "step": 1761 }, { "epoch": 0.5156570090722856, "grad_norm": 0.0029930968303233385, "learning_rate": 3.710857477319286e-05, "loss": 0.0001, "step": 1762 }, { "epoch": 0.5159496634474685, "grad_norm": 0.013158080168068409, "learning_rate": 3.7101258413813286e-05, "loss": 0.0001, "step": 1763 }, { "epoch": 0.5162423178226514, "grad_norm": 0.001070450060069561, "learning_rate": 3.7093942054433714e-05, "loss": 0.0, "step": 1764 }, { "epoch": 0.5165349721978344, "grad_norm": 0.03361155837774277, "learning_rate": 3.708662569505414e-05, "loss": 0.0002, "step": 1765 }, { "epoch": 0.5168276265730173, "grad_norm": 0.044139012694358826, "learning_rate": 3.707930933567457e-05, "loss": 0.0005, "step": 1766 }, { "epoch": 0.5171202809482002, "grad_norm": 0.004443437326699495, "learning_rate": 3.7071992976295e-05, "loss": 0.0001, "step": 1767 }, { "epoch": 0.5174129353233831, "grad_norm": 0.01623067446053028, "learning_rate": 3.706467661691542e-05, "loss": 0.0002, "step": 1768 }, { "epoch": 0.517705589698566, "grad_norm": 0.0013135349145159125, "learning_rate": 3.705736025753585e-05, "loss": 0.0, "step": 1769 }, { "epoch": 0.517998244073749, "grad_norm": 0.006052326411008835, "learning_rate": 3.7050043898156275e-05, "loss": 0.0001, "step": 1770 }, { "epoch": 0.5182908984489318, "grad_norm": 0.015305147506296635, "learning_rate": 3.70427275387767e-05, "loss": 0.0002, "step": 1771 }, { "epoch": 0.5185835528241147, "grad_norm": 0.0007871999987401068, "learning_rate": 3.703541117939713e-05, "loss": 0.0, "step": 1772 }, { "epoch": 0.5188762071992976, "grad_norm": 0.005125640891492367, "learning_rate": 3.702809482001756e-05, "loss": 0.0001, "step": 1773 }, { "epoch": 0.5191688615744805, "grad_norm": 0.004210739862173796, "learning_rate": 3.702077846063799e-05, "loss": 0.0001, "step": 1774 }, { "epoch": 0.5194615159496635, "grad_norm": 8.280128479003906, "learning_rate": 3.7013462101258415e-05, "loss": 0.0457, "step": 1775 }, { "epoch": 0.5197541703248464, "grad_norm": 9.250022888183594, "learning_rate": 3.700614574187884e-05, "loss": 0.0734, "step": 1776 }, { "epoch": 0.5200468247000293, "grad_norm": 6.377782344818115, "learning_rate": 3.699882938249927e-05, "loss": 0.1639, "step": 1777 }, { "epoch": 0.5203394790752122, "grad_norm": 0.008695557713508606, "learning_rate": 3.699151302311969e-05, "loss": 0.0001, "step": 1778 }, { "epoch": 0.520632133450395, "grad_norm": 4.113627910614014, "learning_rate": 3.698419666374012e-05, "loss": 0.0101, "step": 1779 }, { "epoch": 0.5209247878255779, "grad_norm": 0.01942027173936367, "learning_rate": 3.697688030436055e-05, "loss": 0.0002, "step": 1780 }, { "epoch": 0.5212174422007609, "grad_norm": 0.0015956436982378364, "learning_rate": 3.6969563944980976e-05, "loss": 0.0, "step": 1781 }, { "epoch": 0.5215100965759438, "grad_norm": 0.004826758522540331, "learning_rate": 3.6962247585601404e-05, "loss": 0.0001, "step": 1782 }, { "epoch": 0.5218027509511267, "grad_norm": 0.006228056736290455, "learning_rate": 3.695493122622183e-05, "loss": 0.0001, "step": 1783 }, { "epoch": 0.5220954053263096, "grad_norm": 0.0038263732567429543, "learning_rate": 3.694761486684226e-05, "loss": 0.0001, "step": 1784 }, { "epoch": 0.5223880597014925, "grad_norm": 0.0545361191034317, "learning_rate": 3.694029850746269e-05, "loss": 0.0006, "step": 1785 }, { "epoch": 0.5226807140766755, "grad_norm": 6.925477027893066, "learning_rate": 3.6932982148083115e-05, "loss": 0.1366, "step": 1786 }, { "epoch": 0.5229733684518584, "grad_norm": 0.03318082541227341, "learning_rate": 3.692566578870354e-05, "loss": 0.0003, "step": 1787 }, { "epoch": 0.5232660228270413, "grad_norm": 0.02219540998339653, "learning_rate": 3.691834942932397e-05, "loss": 0.0003, "step": 1788 }, { "epoch": 0.5235586772022242, "grad_norm": 0.10277865082025528, "learning_rate": 3.691103306994439e-05, "loss": 0.0005, "step": 1789 }, { "epoch": 0.523851331577407, "grad_norm": 0.007709465455263853, "learning_rate": 3.690371671056482e-05, "loss": 0.0001, "step": 1790 }, { "epoch": 0.52414398595259, "grad_norm": 0.003918229136615992, "learning_rate": 3.689640035118525e-05, "loss": 0.0001, "step": 1791 }, { "epoch": 0.5244366403277729, "grad_norm": 0.7273104190826416, "learning_rate": 3.6889083991805676e-05, "loss": 0.0058, "step": 1792 }, { "epoch": 0.5247292947029558, "grad_norm": 0.02427494153380394, "learning_rate": 3.6881767632426104e-05, "loss": 0.0002, "step": 1793 }, { "epoch": 0.5250219490781387, "grad_norm": 0.3889729082584381, "learning_rate": 3.687445127304653e-05, "loss": 0.0017, "step": 1794 }, { "epoch": 0.5253146034533216, "grad_norm": 0.30237486958503723, "learning_rate": 3.686713491366696e-05, "loss": 0.0016, "step": 1795 }, { "epoch": 0.5256072578285045, "grad_norm": 0.003748288843780756, "learning_rate": 3.685981855428739e-05, "loss": 0.0001, "step": 1796 }, { "epoch": 0.5258999122036875, "grad_norm": 9.525371551513672, "learning_rate": 3.6852502194907816e-05, "loss": 0.0617, "step": 1797 }, { "epoch": 0.5261925665788704, "grad_norm": 0.034219879657030106, "learning_rate": 3.6845185835528244e-05, "loss": 0.0003, "step": 1798 }, { "epoch": 0.5264852209540533, "grad_norm": 0.052397776395082474, "learning_rate": 3.6837869476148665e-05, "loss": 0.0003, "step": 1799 }, { "epoch": 0.5267778753292361, "grad_norm": 0.0034943039063364267, "learning_rate": 3.683055311676909e-05, "loss": 0.0001, "step": 1800 }, { "epoch": 0.527070529704419, "grad_norm": 0.00827542133629322, "learning_rate": 3.682323675738952e-05, "loss": 0.0001, "step": 1801 }, { "epoch": 0.527363184079602, "grad_norm": 0.06492964178323746, "learning_rate": 3.681592039800995e-05, "loss": 0.0004, "step": 1802 }, { "epoch": 0.5276558384547849, "grad_norm": 0.06064360961318016, "learning_rate": 3.680860403863038e-05, "loss": 0.0004, "step": 1803 }, { "epoch": 0.5279484928299678, "grad_norm": 0.03688051551580429, "learning_rate": 3.6801287679250805e-05, "loss": 0.0003, "step": 1804 }, { "epoch": 0.5282411472051507, "grad_norm": 0.0004697742697317153, "learning_rate": 3.679397131987123e-05, "loss": 0.0, "step": 1805 }, { "epoch": 0.5285338015803336, "grad_norm": 0.03380119800567627, "learning_rate": 3.678665496049166e-05, "loss": 0.0002, "step": 1806 }, { "epoch": 0.5288264559555166, "grad_norm": 0.0014248295919969678, "learning_rate": 3.677933860111209e-05, "loss": 0.0, "step": 1807 }, { "epoch": 0.5291191103306995, "grad_norm": 0.0007322686142288148, "learning_rate": 3.677202224173252e-05, "loss": 0.0, "step": 1808 }, { "epoch": 0.5294117647058824, "grad_norm": 0.01017748098820448, "learning_rate": 3.6764705882352945e-05, "loss": 0.0001, "step": 1809 }, { "epoch": 0.5297044190810652, "grad_norm": 0.08894390612840652, "learning_rate": 3.6757389522973366e-05, "loss": 0.0004, "step": 1810 }, { "epoch": 0.5299970734562481, "grad_norm": 0.002421896904706955, "learning_rate": 3.6750073163593794e-05, "loss": 0.0, "step": 1811 }, { "epoch": 0.5302897278314311, "grad_norm": 0.0010194090427830815, "learning_rate": 3.674275680421422e-05, "loss": 0.0, "step": 1812 }, { "epoch": 0.530582382206614, "grad_norm": 0.0037550644483417273, "learning_rate": 3.673544044483465e-05, "loss": 0.0001, "step": 1813 }, { "epoch": 0.5308750365817969, "grad_norm": 0.006121156271547079, "learning_rate": 3.672812408545508e-05, "loss": 0.0001, "step": 1814 }, { "epoch": 0.5311676909569798, "grad_norm": 0.0014041484100744128, "learning_rate": 3.6720807726075506e-05, "loss": 0.0, "step": 1815 }, { "epoch": 0.5314603453321627, "grad_norm": 0.0009004389285109937, "learning_rate": 3.6713491366695934e-05, "loss": 0.0, "step": 1816 }, { "epoch": 0.5317529997073456, "grad_norm": 0.1951526403427124, "learning_rate": 3.670617500731636e-05, "loss": 0.0008, "step": 1817 }, { "epoch": 0.5320456540825286, "grad_norm": 0.0010145616251975298, "learning_rate": 3.669885864793679e-05, "loss": 0.0, "step": 1818 }, { "epoch": 0.5323383084577115, "grad_norm": 0.0022146685514599085, "learning_rate": 3.669154228855722e-05, "loss": 0.0, "step": 1819 }, { "epoch": 0.5326309628328944, "grad_norm": 0.0035102490801364183, "learning_rate": 3.668422592917764e-05, "loss": 0.0, "step": 1820 }, { "epoch": 0.5329236172080772, "grad_norm": 0.006640933454036713, "learning_rate": 3.6676909569798067e-05, "loss": 0.0001, "step": 1821 }, { "epoch": 0.5332162715832601, "grad_norm": 0.010873553343117237, "learning_rate": 3.6669593210418495e-05, "loss": 0.0001, "step": 1822 }, { "epoch": 0.5335089259584431, "grad_norm": 0.5120856761932373, "learning_rate": 3.666227685103892e-05, "loss": 0.0017, "step": 1823 }, { "epoch": 0.533801580333626, "grad_norm": 0.0015129977837204933, "learning_rate": 3.665496049165935e-05, "loss": 0.0, "step": 1824 }, { "epoch": 0.5340942347088089, "grad_norm": 0.023982318118214607, "learning_rate": 3.664764413227978e-05, "loss": 0.0002, "step": 1825 }, { "epoch": 0.5343868890839918, "grad_norm": 0.007661939598619938, "learning_rate": 3.6640327772900206e-05, "loss": 0.0001, "step": 1826 }, { "epoch": 0.5346795434591747, "grad_norm": 0.0005000335513614118, "learning_rate": 3.6633011413520634e-05, "loss": 0.0, "step": 1827 }, { "epoch": 0.5349721978343577, "grad_norm": 0.0013710459461435676, "learning_rate": 3.662569505414106e-05, "loss": 0.0, "step": 1828 }, { "epoch": 0.5352648522095406, "grad_norm": 0.009977270849049091, "learning_rate": 3.661837869476149e-05, "loss": 0.0001, "step": 1829 }, { "epoch": 0.5355575065847235, "grad_norm": 0.0027754006441682577, "learning_rate": 3.661106233538192e-05, "loss": 0.0, "step": 1830 }, { "epoch": 0.5358501609599063, "grad_norm": 0.0007236430537886918, "learning_rate": 3.660374597600234e-05, "loss": 0.0, "step": 1831 }, { "epoch": 0.5361428153350892, "grad_norm": 0.0009558421443216503, "learning_rate": 3.659642961662277e-05, "loss": 0.0, "step": 1832 }, { "epoch": 0.5364354697102721, "grad_norm": 0.00018310759332962334, "learning_rate": 3.6589113257243195e-05, "loss": 0.0, "step": 1833 }, { "epoch": 0.5367281240854551, "grad_norm": 0.00036073499359190464, "learning_rate": 3.658179689786362e-05, "loss": 0.0, "step": 1834 }, { "epoch": 0.537020778460638, "grad_norm": 5.54606819152832, "learning_rate": 3.657448053848405e-05, "loss": 0.0191, "step": 1835 }, { "epoch": 0.5373134328358209, "grad_norm": 0.000820028712041676, "learning_rate": 3.656716417910448e-05, "loss": 0.0, "step": 1836 }, { "epoch": 0.5376060872110038, "grad_norm": 0.000855633057653904, "learning_rate": 3.655984781972491e-05, "loss": 0.0, "step": 1837 }, { "epoch": 0.5378987415861867, "grad_norm": 0.0016642031259834766, "learning_rate": 3.6552531460345335e-05, "loss": 0.0, "step": 1838 }, { "epoch": 0.5381913959613697, "grad_norm": 0.0009357350063510239, "learning_rate": 3.654521510096576e-05, "loss": 0.0, "step": 1839 }, { "epoch": 0.5384840503365526, "grad_norm": 0.0006593746365979314, "learning_rate": 3.653789874158619e-05, "loss": 0.0, "step": 1840 }, { "epoch": 0.5387767047117354, "grad_norm": 0.06413095444440842, "learning_rate": 3.653058238220662e-05, "loss": 0.0004, "step": 1841 }, { "epoch": 0.5390693590869183, "grad_norm": 0.003475102363154292, "learning_rate": 3.652326602282704e-05, "loss": 0.0, "step": 1842 }, { "epoch": 0.5393620134621012, "grad_norm": 0.00027950009098276496, "learning_rate": 3.651594966344747e-05, "loss": 0.0, "step": 1843 }, { "epoch": 0.5396546678372842, "grad_norm": 0.0063551911152899265, "learning_rate": 3.6508633304067896e-05, "loss": 0.0001, "step": 1844 }, { "epoch": 0.5399473222124671, "grad_norm": 0.0005005158018320799, "learning_rate": 3.6501316944688324e-05, "loss": 0.0, "step": 1845 }, { "epoch": 0.54023997658765, "grad_norm": 0.001076875370927155, "learning_rate": 3.649400058530875e-05, "loss": 0.0, "step": 1846 }, { "epoch": 0.5405326309628329, "grad_norm": 0.00039572734385728836, "learning_rate": 3.648668422592918e-05, "loss": 0.0, "step": 1847 }, { "epoch": 0.5408252853380158, "grad_norm": 0.0002550368953961879, "learning_rate": 3.647936786654961e-05, "loss": 0.0, "step": 1848 }, { "epoch": 0.5411179397131987, "grad_norm": 0.001221882994286716, "learning_rate": 3.6472051507170036e-05, "loss": 0.0, "step": 1849 }, { "epoch": 0.5414105940883817, "grad_norm": 0.0008454864728264511, "learning_rate": 3.6464735147790464e-05, "loss": 0.0, "step": 1850 }, { "epoch": 0.5417032484635645, "grad_norm": 9.2111382400617e-05, "learning_rate": 3.645741878841089e-05, "loss": 0.0, "step": 1851 }, { "epoch": 0.5419959028387474, "grad_norm": 8.438952445983887, "learning_rate": 3.645010242903131e-05, "loss": 0.2633, "step": 1852 }, { "epoch": 0.5422885572139303, "grad_norm": 0.001004392164759338, "learning_rate": 3.644278606965174e-05, "loss": 0.0, "step": 1853 }, { "epoch": 0.5425812115891132, "grad_norm": 0.00137075234670192, "learning_rate": 3.643546971027217e-05, "loss": 0.0, "step": 1854 }, { "epoch": 0.5428738659642962, "grad_norm": 0.008688335306942463, "learning_rate": 3.6428153350892597e-05, "loss": 0.0001, "step": 1855 }, { "epoch": 0.5431665203394791, "grad_norm": 0.0057664671912789345, "learning_rate": 3.6420836991513025e-05, "loss": 0.0001, "step": 1856 }, { "epoch": 0.543459174714662, "grad_norm": 0.00026519360835663974, "learning_rate": 3.641352063213345e-05, "loss": 0.0, "step": 1857 }, { "epoch": 0.5437518290898449, "grad_norm": 5.946147441864014, "learning_rate": 3.640620427275388e-05, "loss": 0.0278, "step": 1858 }, { "epoch": 0.5440444834650278, "grad_norm": 0.0016452086856588721, "learning_rate": 3.639888791337431e-05, "loss": 0.0, "step": 1859 }, { "epoch": 0.5443371378402108, "grad_norm": 0.001900315866805613, "learning_rate": 3.6391571553994736e-05, "loss": 0.0, "step": 1860 }, { "epoch": 0.5446297922153936, "grad_norm": 0.001699809799902141, "learning_rate": 3.6384255194615164e-05, "loss": 0.0, "step": 1861 }, { "epoch": 0.5449224465905765, "grad_norm": 0.004050768446177244, "learning_rate": 3.637693883523559e-05, "loss": 0.0001, "step": 1862 }, { "epoch": 0.5452151009657594, "grad_norm": 0.0055578239262104034, "learning_rate": 3.6369622475856013e-05, "loss": 0.0001, "step": 1863 }, { "epoch": 0.5455077553409423, "grad_norm": 0.002650769893079996, "learning_rate": 3.636230611647644e-05, "loss": 0.0, "step": 1864 }, { "epoch": 0.5458004097161252, "grad_norm": 0.006541771348565817, "learning_rate": 3.635498975709687e-05, "loss": 0.0001, "step": 1865 }, { "epoch": 0.5460930640913082, "grad_norm": 0.005475026089698076, "learning_rate": 3.63476733977173e-05, "loss": 0.0001, "step": 1866 }, { "epoch": 0.5463857184664911, "grad_norm": 0.014294413849711418, "learning_rate": 3.6340357038337725e-05, "loss": 0.0002, "step": 1867 }, { "epoch": 0.546678372841674, "grad_norm": 5.496011257171631, "learning_rate": 3.633304067895815e-05, "loss": 0.2543, "step": 1868 }, { "epoch": 0.5469710272168569, "grad_norm": 0.007919173687696457, "learning_rate": 3.632572431957858e-05, "loss": 0.0001, "step": 1869 }, { "epoch": 0.5472636815920398, "grad_norm": 0.06703829765319824, "learning_rate": 3.631840796019901e-05, "loss": 0.0007, "step": 1870 }, { "epoch": 0.5475563359672228, "grad_norm": 0.01958063803613186, "learning_rate": 3.631109160081944e-05, "loss": 0.0004, "step": 1871 }, { "epoch": 0.5478489903424056, "grad_norm": 0.009955674409866333, "learning_rate": 3.6303775241439865e-05, "loss": 0.0002, "step": 1872 }, { "epoch": 0.5481416447175885, "grad_norm": 0.013306597247719765, "learning_rate": 3.6296458882060286e-05, "loss": 0.0003, "step": 1873 }, { "epoch": 0.5484342990927714, "grad_norm": 0.005448292475193739, "learning_rate": 3.6289142522680714e-05, "loss": 0.0001, "step": 1874 }, { "epoch": 0.5487269534679543, "grad_norm": 0.020972304046154022, "learning_rate": 3.628182616330114e-05, "loss": 0.0004, "step": 1875 }, { "epoch": 0.5490196078431373, "grad_norm": 0.013444705866277218, "learning_rate": 3.627450980392157e-05, "loss": 0.0002, "step": 1876 }, { "epoch": 0.5493122622183202, "grad_norm": 0.005087513942271471, "learning_rate": 3.6267193444542e-05, "loss": 0.0001, "step": 1877 }, { "epoch": 0.5496049165935031, "grad_norm": 0.007403940428048372, "learning_rate": 3.6259877085162426e-05, "loss": 0.0001, "step": 1878 }, { "epoch": 0.549897570968686, "grad_norm": 0.0029638975393027067, "learning_rate": 3.6252560725782854e-05, "loss": 0.0001, "step": 1879 }, { "epoch": 0.5501902253438689, "grad_norm": 0.0048151337541639805, "learning_rate": 3.624524436640328e-05, "loss": 0.0001, "step": 1880 }, { "epoch": 0.5504828797190519, "grad_norm": 0.008215599693357944, "learning_rate": 3.623792800702371e-05, "loss": 0.0002, "step": 1881 }, { "epoch": 0.5507755340942347, "grad_norm": 0.0034256833605468273, "learning_rate": 3.623061164764414e-05, "loss": 0.0001, "step": 1882 }, { "epoch": 0.5510681884694176, "grad_norm": 0.007427630480378866, "learning_rate": 3.6223295288264566e-05, "loss": 0.0001, "step": 1883 }, { "epoch": 0.5513608428446005, "grad_norm": 0.01656688190996647, "learning_rate": 3.621597892888499e-05, "loss": 0.0001, "step": 1884 }, { "epoch": 0.5516534972197834, "grad_norm": 0.005414238199591637, "learning_rate": 3.6208662569505415e-05, "loss": 0.0001, "step": 1885 }, { "epoch": 0.5519461515949663, "grad_norm": 0.002928486093878746, "learning_rate": 3.620134621012584e-05, "loss": 0.0001, "step": 1886 }, { "epoch": 0.5522388059701493, "grad_norm": 0.004565770737826824, "learning_rate": 3.619402985074627e-05, "loss": 0.0001, "step": 1887 }, { "epoch": 0.5525314603453322, "grad_norm": 0.0021436475217342377, "learning_rate": 3.61867134913667e-05, "loss": 0.0001, "step": 1888 }, { "epoch": 0.5528241147205151, "grad_norm": 0.024198435246944427, "learning_rate": 3.6179397131987127e-05, "loss": 0.0002, "step": 1889 }, { "epoch": 0.553116769095698, "grad_norm": 4.87838888168335, "learning_rate": 3.6172080772607554e-05, "loss": 0.0229, "step": 1890 }, { "epoch": 0.5534094234708808, "grad_norm": 0.12384998053312302, "learning_rate": 3.616476441322798e-05, "loss": 0.0006, "step": 1891 }, { "epoch": 0.5537020778460638, "grad_norm": 0.15898260474205017, "learning_rate": 3.615744805384841e-05, "loss": 0.0007, "step": 1892 }, { "epoch": 0.5539947322212467, "grad_norm": 0.055284690111875534, "learning_rate": 3.615013169446884e-05, "loss": 0.0004, "step": 1893 }, { "epoch": 0.5542873865964296, "grad_norm": 0.002289955737069249, "learning_rate": 3.6142815335089266e-05, "loss": 0.0001, "step": 1894 }, { "epoch": 0.5545800409716125, "grad_norm": 0.0055839489214122295, "learning_rate": 3.613549897570969e-05, "loss": 0.0001, "step": 1895 }, { "epoch": 0.5548726953467954, "grad_norm": 0.040371961891651154, "learning_rate": 3.6128182616330115e-05, "loss": 0.0004, "step": 1896 }, { "epoch": 0.5551653497219784, "grad_norm": 0.006794141139835119, "learning_rate": 3.612086625695054e-05, "loss": 0.0001, "step": 1897 }, { "epoch": 0.5554580040971613, "grad_norm": 0.0622115284204483, "learning_rate": 3.611354989757097e-05, "loss": 0.0004, "step": 1898 }, { "epoch": 0.5557506584723442, "grad_norm": 0.002630019560456276, "learning_rate": 3.61062335381914e-05, "loss": 0.0001, "step": 1899 }, { "epoch": 0.5560433128475271, "grad_norm": 0.0018102923640981317, "learning_rate": 3.609891717881183e-05, "loss": 0.0, "step": 1900 }, { "epoch": 0.55633596722271, "grad_norm": 0.08920388668775558, "learning_rate": 3.6091600819432255e-05, "loss": 0.0006, "step": 1901 }, { "epoch": 0.5566286215978928, "grad_norm": 0.0019470619736239314, "learning_rate": 3.608428446005268e-05, "loss": 0.0, "step": 1902 }, { "epoch": 0.5569212759730758, "grad_norm": 0.005832942668348551, "learning_rate": 3.607696810067311e-05, "loss": 0.0001, "step": 1903 }, { "epoch": 0.5572139303482587, "grad_norm": 5.806044578552246, "learning_rate": 3.606965174129354e-05, "loss": 0.1801, "step": 1904 }, { "epoch": 0.5575065847234416, "grad_norm": 0.005481138359755278, "learning_rate": 3.606233538191396e-05, "loss": 0.0001, "step": 1905 }, { "epoch": 0.5577992390986245, "grad_norm": 0.008103495463728905, "learning_rate": 3.605501902253439e-05, "loss": 0.0001, "step": 1906 }, { "epoch": 0.5580918934738074, "grad_norm": 2.412076473236084, "learning_rate": 3.6047702663154816e-05, "loss": 0.0053, "step": 1907 }, { "epoch": 0.5583845478489904, "grad_norm": 0.002062377519905567, "learning_rate": 3.6040386303775244e-05, "loss": 0.0001, "step": 1908 }, { "epoch": 0.5586772022241733, "grad_norm": 0.00176193134393543, "learning_rate": 3.603306994439567e-05, "loss": 0.0001, "step": 1909 }, { "epoch": 0.5589698565993562, "grad_norm": 0.0025971047580242157, "learning_rate": 3.60257535850161e-05, "loss": 0.0001, "step": 1910 }, { "epoch": 0.559262510974539, "grad_norm": 0.00832816306501627, "learning_rate": 3.601843722563653e-05, "loss": 0.0002, "step": 1911 }, { "epoch": 0.5595551653497219, "grad_norm": 4.229795455932617, "learning_rate": 3.6011120866256956e-05, "loss": 0.1842, "step": 1912 }, { "epoch": 0.5598478197249049, "grad_norm": 0.0056917197071015835, "learning_rate": 3.6003804506877384e-05, "loss": 0.0001, "step": 1913 }, { "epoch": 0.5601404741000878, "grad_norm": 2.5309674739837646, "learning_rate": 3.599648814749781e-05, "loss": 0.0057, "step": 1914 }, { "epoch": 0.5604331284752707, "grad_norm": 0.008329502306878567, "learning_rate": 3.598917178811824e-05, "loss": 0.0002, "step": 1915 }, { "epoch": 0.5607257828504536, "grad_norm": 0.005598751828074455, "learning_rate": 3.598185542873866e-05, "loss": 0.0001, "step": 1916 }, { "epoch": 0.5610184372256365, "grad_norm": 0.00646185502409935, "learning_rate": 3.597453906935909e-05, "loss": 0.0001, "step": 1917 }, { "epoch": 0.5613110916008194, "grad_norm": 0.006866334471851587, "learning_rate": 3.596722270997952e-05, "loss": 0.0002, "step": 1918 }, { "epoch": 0.5616037459760024, "grad_norm": 0.0034606242552399635, "learning_rate": 3.5959906350599945e-05, "loss": 0.0001, "step": 1919 }, { "epoch": 0.5618964003511853, "grad_norm": 0.004204215481877327, "learning_rate": 3.595258999122037e-05, "loss": 0.0001, "step": 1920 }, { "epoch": 0.5621890547263682, "grad_norm": 0.006527795922011137, "learning_rate": 3.59452736318408e-05, "loss": 0.0001, "step": 1921 }, { "epoch": 0.562481709101551, "grad_norm": 0.08269781619310379, "learning_rate": 3.593795727246123e-05, "loss": 0.0005, "step": 1922 }, { "epoch": 0.5627743634767339, "grad_norm": 0.0049555618315935135, "learning_rate": 3.5930640913081657e-05, "loss": 0.0001, "step": 1923 }, { "epoch": 0.5630670178519169, "grad_norm": 0.17137527465820312, "learning_rate": 3.5923324553702084e-05, "loss": 0.0011, "step": 1924 }, { "epoch": 0.5633596722270998, "grad_norm": 0.03199278190732002, "learning_rate": 3.591600819432251e-05, "loss": 0.0003, "step": 1925 }, { "epoch": 0.5636523266022827, "grad_norm": 0.021704381331801414, "learning_rate": 3.5908691834942934e-05, "loss": 0.0003, "step": 1926 }, { "epoch": 0.5639449809774656, "grad_norm": 0.0018218193436041474, "learning_rate": 3.590137547556336e-05, "loss": 0.0, "step": 1927 }, { "epoch": 0.5642376353526485, "grad_norm": 1.291187047958374, "learning_rate": 3.589405911618379e-05, "loss": 0.0039, "step": 1928 }, { "epoch": 0.5645302897278315, "grad_norm": 4.635223865509033, "learning_rate": 3.588674275680422e-05, "loss": 0.2324, "step": 1929 }, { "epoch": 0.5648229441030144, "grad_norm": 0.0054092868231236935, "learning_rate": 3.5879426397424645e-05, "loss": 0.0001, "step": 1930 }, { "epoch": 0.5651155984781973, "grad_norm": 0.008922097273170948, "learning_rate": 3.587211003804507e-05, "loss": 0.0002, "step": 1931 }, { "epoch": 0.5654082528533801, "grad_norm": 0.002301436150446534, "learning_rate": 3.58647936786655e-05, "loss": 0.0001, "step": 1932 }, { "epoch": 0.565700907228563, "grad_norm": 0.005360117182135582, "learning_rate": 3.585747731928593e-05, "loss": 0.0001, "step": 1933 }, { "epoch": 0.5659935616037459, "grad_norm": 0.012410847470164299, "learning_rate": 3.585016095990635e-05, "loss": 0.0003, "step": 1934 }, { "epoch": 0.5662862159789289, "grad_norm": 0.12954729795455933, "learning_rate": 3.584284460052678e-05, "loss": 0.002, "step": 1935 }, { "epoch": 0.5665788703541118, "grad_norm": 0.04091344401240349, "learning_rate": 3.5835528241147206e-05, "loss": 0.0007, "step": 1936 }, { "epoch": 0.5668715247292947, "grad_norm": 0.11257024854421616, "learning_rate": 3.5828211881767634e-05, "loss": 0.0011, "step": 1937 }, { "epoch": 0.5671641791044776, "grad_norm": 0.07596535235643387, "learning_rate": 3.582089552238806e-05, "loss": 0.0015, "step": 1938 }, { "epoch": 0.5674568334796605, "grad_norm": 0.14624670147895813, "learning_rate": 3.581357916300849e-05, "loss": 0.0018, "step": 1939 }, { "epoch": 0.5677494878548435, "grad_norm": 0.23475168645381927, "learning_rate": 3.580626280362892e-05, "loss": 0.0019, "step": 1940 }, { "epoch": 0.5680421422300264, "grad_norm": 0.02123354561626911, "learning_rate": 3.5798946444249346e-05, "loss": 0.0003, "step": 1941 }, { "epoch": 0.5683347966052092, "grad_norm": 0.09904786199331284, "learning_rate": 3.579163008486977e-05, "loss": 0.001, "step": 1942 }, { "epoch": 0.5686274509803921, "grad_norm": 7.794208526611328, "learning_rate": 3.5784313725490195e-05, "loss": 0.1359, "step": 1943 }, { "epoch": 0.568920105355575, "grad_norm": 0.014828276820480824, "learning_rate": 3.577699736611062e-05, "loss": 0.0003, "step": 1944 }, { "epoch": 0.569212759730758, "grad_norm": 0.08800917863845825, "learning_rate": 3.576968100673105e-05, "loss": 0.0012, "step": 1945 }, { "epoch": 0.5695054141059409, "grad_norm": 0.02050822600722313, "learning_rate": 3.576236464735148e-05, "loss": 0.0003, "step": 1946 }, { "epoch": 0.5697980684811238, "grad_norm": 0.006806317251175642, "learning_rate": 3.575504828797191e-05, "loss": 0.0002, "step": 1947 }, { "epoch": 0.5700907228563067, "grad_norm": 0.008568843826651573, "learning_rate": 3.5747731928592335e-05, "loss": 0.0002, "step": 1948 }, { "epoch": 0.5703833772314896, "grad_norm": 0.003100211266428232, "learning_rate": 3.574041556921276e-05, "loss": 0.0001, "step": 1949 }, { "epoch": 0.5706760316066726, "grad_norm": 0.14013731479644775, "learning_rate": 3.5733099209833184e-05, "loss": 0.0006, "step": 1950 }, { "epoch": 0.5709686859818555, "grad_norm": 0.13786007463932037, "learning_rate": 3.572578285045361e-05, "loss": 0.0017, "step": 1951 }, { "epoch": 0.5712613403570383, "grad_norm": 0.0029935319907963276, "learning_rate": 3.571846649107404e-05, "loss": 0.0001, "step": 1952 }, { "epoch": 0.5715539947322212, "grad_norm": 1.9697563648223877, "learning_rate": 3.571115013169447e-05, "loss": 0.2066, "step": 1953 }, { "epoch": 0.5718466491074041, "grad_norm": 0.024167198687791824, "learning_rate": 3.5703833772314896e-05, "loss": 0.0003, "step": 1954 }, { "epoch": 0.572139303482587, "grad_norm": 0.005826764740049839, "learning_rate": 3.5696517412935324e-05, "loss": 0.0001, "step": 1955 }, { "epoch": 0.57243195785777, "grad_norm": 1.85420823097229, "learning_rate": 3.568920105355575e-05, "loss": 0.0104, "step": 1956 }, { "epoch": 0.5727246122329529, "grad_norm": 0.15664705634117126, "learning_rate": 3.568188469417618e-05, "loss": 0.0021, "step": 1957 }, { "epoch": 0.5730172666081358, "grad_norm": 6.557300090789795, "learning_rate": 3.56745683347966e-05, "loss": 0.2577, "step": 1958 }, { "epoch": 0.5733099209833187, "grad_norm": 0.13520510494709015, "learning_rate": 3.566725197541703e-05, "loss": 0.0021, "step": 1959 }, { "epoch": 0.5736025753585016, "grad_norm": 0.031301479786634445, "learning_rate": 3.565993561603746e-05, "loss": 0.0005, "step": 1960 }, { "epoch": 0.5738952297336846, "grad_norm": 0.015609705820679665, "learning_rate": 3.5652619256657885e-05, "loss": 0.0004, "step": 1961 }, { "epoch": 0.5741878841088675, "grad_norm": 0.052701354026794434, "learning_rate": 3.564530289727831e-05, "loss": 0.001, "step": 1962 }, { "epoch": 0.5744805384840503, "grad_norm": 0.007138526998460293, "learning_rate": 3.563798653789874e-05, "loss": 0.0002, "step": 1963 }, { "epoch": 0.5747731928592332, "grad_norm": 0.059247542172670364, "learning_rate": 3.563067017851917e-05, "loss": 0.0008, "step": 1964 }, { "epoch": 0.5750658472344161, "grad_norm": 0.011258046142756939, "learning_rate": 3.5623353819139597e-05, "loss": 0.0002, "step": 1965 }, { "epoch": 0.5753585016095991, "grad_norm": 0.011694671586155891, "learning_rate": 3.5616037459760025e-05, "loss": 0.0003, "step": 1966 }, { "epoch": 0.575651155984782, "grad_norm": 0.02896781824529171, "learning_rate": 3.560872110038045e-05, "loss": 0.0006, "step": 1967 }, { "epoch": 0.5759438103599649, "grad_norm": 4.3482890129089355, "learning_rate": 3.560140474100088e-05, "loss": 0.1504, "step": 1968 }, { "epoch": 0.5762364647351478, "grad_norm": 0.2739729583263397, "learning_rate": 3.55940883816213e-05, "loss": 0.0012, "step": 1969 }, { "epoch": 0.5765291191103307, "grad_norm": 0.013300075195729733, "learning_rate": 3.558677202224173e-05, "loss": 0.0003, "step": 1970 }, { "epoch": 0.5768217734855136, "grad_norm": 0.9926109910011292, "learning_rate": 3.557945566286216e-05, "loss": 0.0046, "step": 1971 }, { "epoch": 0.5771144278606966, "grad_norm": 0.014598459005355835, "learning_rate": 3.5572139303482585e-05, "loss": 0.0003, "step": 1972 }, { "epoch": 0.5774070822358794, "grad_norm": 0.02771534025669098, "learning_rate": 3.556482294410301e-05, "loss": 0.0005, "step": 1973 }, { "epoch": 0.5776997366110623, "grad_norm": 0.058283593505620956, "learning_rate": 3.555750658472344e-05, "loss": 0.0011, "step": 1974 }, { "epoch": 0.5779923909862452, "grad_norm": 0.9233146905899048, "learning_rate": 3.555019022534387e-05, "loss": 0.0078, "step": 1975 }, { "epoch": 0.5782850453614281, "grad_norm": 0.0344410240650177, "learning_rate": 3.55428738659643e-05, "loss": 0.0007, "step": 1976 }, { "epoch": 0.5785776997366111, "grad_norm": 0.9516710638999939, "learning_rate": 3.5535557506584725e-05, "loss": 0.0086, "step": 1977 }, { "epoch": 0.578870354111794, "grad_norm": 0.01560552790760994, "learning_rate": 3.552824114720515e-05, "loss": 0.0003, "step": 1978 }, { "epoch": 0.5791630084869769, "grad_norm": 0.36015036702156067, "learning_rate": 3.552092478782558e-05, "loss": 0.0016, "step": 1979 }, { "epoch": 0.5794556628621598, "grad_norm": 0.013730215840041637, "learning_rate": 3.5513608428446e-05, "loss": 0.0003, "step": 1980 }, { "epoch": 0.5797483172373427, "grad_norm": 0.182551309466362, "learning_rate": 3.550629206906643e-05, "loss": 0.0013, "step": 1981 }, { "epoch": 0.5800409716125257, "grad_norm": 0.010926664806902409, "learning_rate": 3.549897570968686e-05, "loss": 0.0002, "step": 1982 }, { "epoch": 0.5803336259877085, "grad_norm": 0.00620716018602252, "learning_rate": 3.5491659350307286e-05, "loss": 0.0001, "step": 1983 }, { "epoch": 0.5806262803628914, "grad_norm": 0.010993091389536858, "learning_rate": 3.5484342990927714e-05, "loss": 0.0002, "step": 1984 }, { "epoch": 0.5809189347380743, "grad_norm": 0.0013883326901122928, "learning_rate": 3.547702663154814e-05, "loss": 0.0, "step": 1985 }, { "epoch": 0.5812115891132572, "grad_norm": 0.0055152433924376965, "learning_rate": 3.546971027216857e-05, "loss": 0.0001, "step": 1986 }, { "epoch": 0.5815042434884401, "grad_norm": 7.650897026062012, "learning_rate": 3.5462393912789e-05, "loss": 0.0135, "step": 1987 }, { "epoch": 0.5817968978636231, "grad_norm": 0.0018448525806888938, "learning_rate": 3.5455077553409426e-05, "loss": 0.0001, "step": 1988 }, { "epoch": 0.582089552238806, "grad_norm": 0.007545731961727142, "learning_rate": 3.5447761194029854e-05, "loss": 0.0001, "step": 1989 }, { "epoch": 0.5823822066139889, "grad_norm": 0.002951526315882802, "learning_rate": 3.5440444834650275e-05, "loss": 0.0001, "step": 1990 }, { "epoch": 0.5826748609891718, "grad_norm": 0.006255722139030695, "learning_rate": 3.54331284752707e-05, "loss": 0.0002, "step": 1991 }, { "epoch": 0.5829675153643546, "grad_norm": 0.011105085723102093, "learning_rate": 3.542581211589113e-05, "loss": 0.0002, "step": 1992 }, { "epoch": 0.5832601697395376, "grad_norm": 0.05295855924487114, "learning_rate": 3.541849575651156e-05, "loss": 0.0006, "step": 1993 }, { "epoch": 0.5835528241147205, "grad_norm": 0.3890467584133148, "learning_rate": 3.541117939713199e-05, "loss": 0.0014, "step": 1994 }, { "epoch": 0.5838454784899034, "grad_norm": 0.10936211794614792, "learning_rate": 3.5403863037752415e-05, "loss": 0.0011, "step": 1995 }, { "epoch": 0.5841381328650863, "grad_norm": 0.0033165684435516596, "learning_rate": 3.539654667837284e-05, "loss": 0.0001, "step": 1996 }, { "epoch": 0.5844307872402692, "grad_norm": 0.1402590274810791, "learning_rate": 3.538923031899327e-05, "loss": 0.0007, "step": 1997 }, { "epoch": 0.5847234416154522, "grad_norm": 0.003403761889785528, "learning_rate": 3.53819139596137e-05, "loss": 0.0001, "step": 1998 }, { "epoch": 0.5850160959906351, "grad_norm": 0.010263781994581223, "learning_rate": 3.5374597600234127e-05, "loss": 0.0001, "step": 1999 }, { "epoch": 0.585308750365818, "grad_norm": 0.001706862822175026, "learning_rate": 3.5367281240854554e-05, "loss": 0.0, "step": 2000 }, { "epoch": 0.5856014047410009, "grad_norm": 0.03570732846856117, "learning_rate": 3.5359964881474976e-05, "loss": 0.0003, "step": 2001 }, { "epoch": 0.5858940591161838, "grad_norm": 0.0018397400854155421, "learning_rate": 3.5352648522095404e-05, "loss": 0.0, "step": 2002 }, { "epoch": 0.5861867134913668, "grad_norm": 0.0006071400130167603, "learning_rate": 3.534533216271583e-05, "loss": 0.0, "step": 2003 }, { "epoch": 0.5864793678665496, "grad_norm": 0.012687459588050842, "learning_rate": 3.533801580333626e-05, "loss": 0.0002, "step": 2004 }, { "epoch": 0.5867720222417325, "grad_norm": 0.0034372718073427677, "learning_rate": 3.533069944395669e-05, "loss": 0.0001, "step": 2005 }, { "epoch": 0.5870646766169154, "grad_norm": 0.00016561755910515785, "learning_rate": 3.5323383084577115e-05, "loss": 0.0, "step": 2006 }, { "epoch": 0.5873573309920983, "grad_norm": 0.006079351529479027, "learning_rate": 3.531606672519754e-05, "loss": 0.0001, "step": 2007 }, { "epoch": 0.5876499853672812, "grad_norm": 0.3710552752017975, "learning_rate": 3.530875036581797e-05, "loss": 0.0008, "step": 2008 }, { "epoch": 0.5879426397424642, "grad_norm": 0.0008314733277074993, "learning_rate": 3.53014340064384e-05, "loss": 0.0, "step": 2009 }, { "epoch": 0.5882352941176471, "grad_norm": 0.023041611537337303, "learning_rate": 3.529411764705883e-05, "loss": 0.0001, "step": 2010 }, { "epoch": 0.58852794849283, "grad_norm": 0.0008120948914438486, "learning_rate": 3.528680128767925e-05, "loss": 0.0, "step": 2011 }, { "epoch": 0.5888206028680129, "grad_norm": 0.0011115546803921461, "learning_rate": 3.5279484928299676e-05, "loss": 0.0, "step": 2012 }, { "epoch": 0.5891132572431957, "grad_norm": 0.009538223035633564, "learning_rate": 3.5272168568920104e-05, "loss": 0.0002, "step": 2013 }, { "epoch": 0.5894059116183787, "grad_norm": 0.007257380057126284, "learning_rate": 3.526485220954053e-05, "loss": 0.0001, "step": 2014 }, { "epoch": 0.5896985659935616, "grad_norm": 0.0024914678651839495, "learning_rate": 3.525753585016096e-05, "loss": 0.0001, "step": 2015 }, { "epoch": 0.5899912203687445, "grad_norm": 0.004597298800945282, "learning_rate": 3.525021949078139e-05, "loss": 0.0001, "step": 2016 }, { "epoch": 0.5902838747439274, "grad_norm": 0.0012197772739455104, "learning_rate": 3.5242903131401816e-05, "loss": 0.0, "step": 2017 }, { "epoch": 0.5905765291191103, "grad_norm": 0.0014564783778041601, "learning_rate": 3.5235586772022244e-05, "loss": 0.0, "step": 2018 }, { "epoch": 0.5908691834942933, "grad_norm": 0.000955626368522644, "learning_rate": 3.522827041264267e-05, "loss": 0.0, "step": 2019 }, { "epoch": 0.5911618378694762, "grad_norm": 0.0003709029115270823, "learning_rate": 3.52209540532631e-05, "loss": 0.0, "step": 2020 }, { "epoch": 0.5914544922446591, "grad_norm": 0.11591051518917084, "learning_rate": 3.521363769388353e-05, "loss": 0.0005, "step": 2021 }, { "epoch": 0.591747146619842, "grad_norm": 0.001540387631393969, "learning_rate": 3.520632133450395e-05, "loss": 0.0, "step": 2022 }, { "epoch": 0.5920398009950248, "grad_norm": 1.1781518459320068, "learning_rate": 3.519900497512438e-05, "loss": 0.0042, "step": 2023 }, { "epoch": 0.5923324553702077, "grad_norm": 0.0009852898074313998, "learning_rate": 3.5191688615744805e-05, "loss": 0.0, "step": 2024 }, { "epoch": 0.5926251097453907, "grad_norm": 0.0017510764300823212, "learning_rate": 3.518437225636523e-05, "loss": 0.0, "step": 2025 }, { "epoch": 0.5929177641205736, "grad_norm": 0.004155490547418594, "learning_rate": 3.517705589698566e-05, "loss": 0.0001, "step": 2026 }, { "epoch": 0.5932104184957565, "grad_norm": 0.08749844878911972, "learning_rate": 3.516973953760609e-05, "loss": 0.0005, "step": 2027 }, { "epoch": 0.5935030728709394, "grad_norm": 0.0029032984748482704, "learning_rate": 3.516242317822652e-05, "loss": 0.0, "step": 2028 }, { "epoch": 0.5937957272461223, "grad_norm": 0.0009475258993916214, "learning_rate": 3.5155106818846945e-05, "loss": 0.0, "step": 2029 }, { "epoch": 0.5940883816213053, "grad_norm": 0.0036558397114276886, "learning_rate": 3.514779045946737e-05, "loss": 0.0, "step": 2030 }, { "epoch": 0.5943810359964882, "grad_norm": 0.0013729785569012165, "learning_rate": 3.51404741000878e-05, "loss": 0.0, "step": 2031 }, { "epoch": 0.5946736903716711, "grad_norm": 0.06336381286382675, "learning_rate": 3.513315774070823e-05, "loss": 0.0002, "step": 2032 }, { "epoch": 0.594966344746854, "grad_norm": 0.23235172033309937, "learning_rate": 3.512584138132865e-05, "loss": 0.0011, "step": 2033 }, { "epoch": 0.5952589991220368, "grad_norm": 0.08274620771408081, "learning_rate": 3.511852502194908e-05, "loss": 0.0004, "step": 2034 }, { "epoch": 0.5955516534972198, "grad_norm": 11.089838981628418, "learning_rate": 3.5111208662569506e-05, "loss": 0.0176, "step": 2035 }, { "epoch": 0.5958443078724027, "grad_norm": 0.001164097455330193, "learning_rate": 3.5103892303189934e-05, "loss": 0.0, "step": 2036 }, { "epoch": 0.5961369622475856, "grad_norm": 0.0018230958376079798, "learning_rate": 3.509657594381036e-05, "loss": 0.0, "step": 2037 }, { "epoch": 0.5964296166227685, "grad_norm": 0.0010525623802095652, "learning_rate": 3.508925958443079e-05, "loss": 0.0, "step": 2038 }, { "epoch": 0.5967222709979514, "grad_norm": 0.000734026194550097, "learning_rate": 3.508194322505122e-05, "loss": 0.0, "step": 2039 }, { "epoch": 0.5970149253731343, "grad_norm": 0.12918123602867126, "learning_rate": 3.5074626865671645e-05, "loss": 0.0003, "step": 2040 }, { "epoch": 0.5973075797483173, "grad_norm": 0.1338168829679489, "learning_rate": 3.506731050629207e-05, "loss": 0.0005, "step": 2041 }, { "epoch": 0.5976002341235002, "grad_norm": 0.0012683144304901361, "learning_rate": 3.50599941469125e-05, "loss": 0.0, "step": 2042 }, { "epoch": 0.597892888498683, "grad_norm": 0.0012570394901558757, "learning_rate": 3.505267778753292e-05, "loss": 0.0, "step": 2043 }, { "epoch": 0.5981855428738659, "grad_norm": 0.00037313997745513916, "learning_rate": 3.504536142815335e-05, "loss": 0.0, "step": 2044 }, { "epoch": 0.5984781972490488, "grad_norm": 0.0002580972795840353, "learning_rate": 3.503804506877378e-05, "loss": 0.0, "step": 2045 }, { "epoch": 0.5987708516242318, "grad_norm": 0.0012548977974802256, "learning_rate": 3.5030728709394206e-05, "loss": 0.0, "step": 2046 }, { "epoch": 0.5990635059994147, "grad_norm": 0.00040712079498916864, "learning_rate": 3.5023412350014634e-05, "loss": 0.0, "step": 2047 }, { "epoch": 0.5993561603745976, "grad_norm": 0.0012564021162688732, "learning_rate": 3.501609599063506e-05, "loss": 0.0, "step": 2048 }, { "epoch": 0.5996488147497805, "grad_norm": 0.0002876602520700544, "learning_rate": 3.500877963125549e-05, "loss": 0.0, "step": 2049 }, { "epoch": 0.5999414691249634, "grad_norm": 0.00044737564167007804, "learning_rate": 3.500146327187592e-05, "loss": 0.0, "step": 2050 }, { "epoch": 0.6002341235001464, "grad_norm": 25.57400894165039, "learning_rate": 3.4994146912496346e-05, "loss": 0.0318, "step": 2051 }, { "epoch": 0.6005267778753293, "grad_norm": 0.0019366320921108127, "learning_rate": 3.4986830553116774e-05, "loss": 0.0, "step": 2052 }, { "epoch": 0.6008194322505122, "grad_norm": 0.032917775213718414, "learning_rate": 3.49795141937372e-05, "loss": 0.0001, "step": 2053 }, { "epoch": 0.601112086625695, "grad_norm": 0.0001589482999406755, "learning_rate": 3.497219783435762e-05, "loss": 0.0, "step": 2054 }, { "epoch": 0.6014047410008779, "grad_norm": 0.0005509228794835508, "learning_rate": 3.496488147497805e-05, "loss": 0.0, "step": 2055 }, { "epoch": 0.6016973953760608, "grad_norm": 21.30624008178711, "learning_rate": 3.495756511559848e-05, "loss": 0.0818, "step": 2056 }, { "epoch": 0.6019900497512438, "grad_norm": 0.0004659033438656479, "learning_rate": 3.495024875621891e-05, "loss": 0.0, "step": 2057 }, { "epoch": 0.6022827041264267, "grad_norm": 0.0006594470469281077, "learning_rate": 3.4942932396839335e-05, "loss": 0.0, "step": 2058 }, { "epoch": 0.6025753585016096, "grad_norm": 0.003137239022180438, "learning_rate": 3.493561603745976e-05, "loss": 0.0, "step": 2059 }, { "epoch": 0.6028680128767925, "grad_norm": 4.280525207519531, "learning_rate": 3.492829967808019e-05, "loss": 0.0094, "step": 2060 }, { "epoch": 0.6031606672519754, "grad_norm": 0.013040806166827679, "learning_rate": 3.492098331870062e-05, "loss": 0.0001, "step": 2061 }, { "epoch": 0.6034533216271584, "grad_norm": 0.0018619210459291935, "learning_rate": 3.491366695932105e-05, "loss": 0.0, "step": 2062 }, { "epoch": 0.6037459760023413, "grad_norm": 0.004644501954317093, "learning_rate": 3.4906350599941475e-05, "loss": 0.0, "step": 2063 }, { "epoch": 0.6040386303775241, "grad_norm": 0.0015707537531852722, "learning_rate": 3.4899034240561896e-05, "loss": 0.0, "step": 2064 }, { "epoch": 0.604331284752707, "grad_norm": 13.5844144821167, "learning_rate": 3.4891717881182324e-05, "loss": 0.2497, "step": 2065 }, { "epoch": 0.6046239391278899, "grad_norm": 0.007252132520079613, "learning_rate": 3.488440152180275e-05, "loss": 0.0, "step": 2066 }, { "epoch": 0.6049165935030729, "grad_norm": 0.001173017080873251, "learning_rate": 3.487708516242318e-05, "loss": 0.0, "step": 2067 }, { "epoch": 0.6052092478782558, "grad_norm": 0.02849682979285717, "learning_rate": 3.486976880304361e-05, "loss": 0.0001, "step": 2068 }, { "epoch": 0.6055019022534387, "grad_norm": 0.5588051676750183, "learning_rate": 3.4862452443664036e-05, "loss": 0.0015, "step": 2069 }, { "epoch": 0.6057945566286216, "grad_norm": 13.102985382080078, "learning_rate": 3.4855136084284464e-05, "loss": 0.0956, "step": 2070 }, { "epoch": 0.6060872110038045, "grad_norm": 0.00035603917785920203, "learning_rate": 3.484781972490489e-05, "loss": 0.0, "step": 2071 }, { "epoch": 0.6063798653789875, "grad_norm": 2.3206276893615723, "learning_rate": 3.484050336552532e-05, "loss": 0.0066, "step": 2072 }, { "epoch": 0.6066725197541704, "grad_norm": 0.0003959232126362622, "learning_rate": 3.483318700614575e-05, "loss": 0.0, "step": 2073 }, { "epoch": 0.6069651741293532, "grad_norm": 0.00046805141028016806, "learning_rate": 3.4825870646766175e-05, "loss": 0.0, "step": 2074 }, { "epoch": 0.6072578285045361, "grad_norm": 0.041566260159015656, "learning_rate": 3.4818554287386597e-05, "loss": 0.0002, "step": 2075 }, { "epoch": 0.607550482879719, "grad_norm": 0.0005712300189770758, "learning_rate": 3.4811237928007024e-05, "loss": 0.0, "step": 2076 }, { "epoch": 0.6078431372549019, "grad_norm": 0.02407524734735489, "learning_rate": 3.480392156862745e-05, "loss": 0.0001, "step": 2077 }, { "epoch": 0.6081357916300849, "grad_norm": 0.000892152835149318, "learning_rate": 3.479660520924788e-05, "loss": 0.0, "step": 2078 }, { "epoch": 0.6084284460052678, "grad_norm": 7.137636661529541, "learning_rate": 3.478928884986831e-05, "loss": 0.0089, "step": 2079 }, { "epoch": 0.6087211003804507, "grad_norm": 0.0011299810139462352, "learning_rate": 3.4781972490488736e-05, "loss": 0.0, "step": 2080 }, { "epoch": 0.6090137547556336, "grad_norm": 0.0006017218693159521, "learning_rate": 3.4774656131109164e-05, "loss": 0.0, "step": 2081 }, { "epoch": 0.6093064091308165, "grad_norm": 0.0011323533253744245, "learning_rate": 3.476733977172959e-05, "loss": 0.0, "step": 2082 }, { "epoch": 0.6095990635059995, "grad_norm": 0.04937488213181496, "learning_rate": 3.476002341235002e-05, "loss": 0.0001, "step": 2083 }, { "epoch": 0.6098917178811823, "grad_norm": 13.617822647094727, "learning_rate": 3.475270705297045e-05, "loss": 0.1336, "step": 2084 }, { "epoch": 0.6101843722563652, "grad_norm": 0.01302315853536129, "learning_rate": 3.4745390693590876e-05, "loss": 0.0001, "step": 2085 }, { "epoch": 0.6104770266315481, "grad_norm": 0.002732899971306324, "learning_rate": 3.47380743342113e-05, "loss": 0.0001, "step": 2086 }, { "epoch": 0.610769681006731, "grad_norm": 0.09016023576259613, "learning_rate": 3.4730757974831725e-05, "loss": 0.0004, "step": 2087 }, { "epoch": 0.611062335381914, "grad_norm": 0.0027853124774992466, "learning_rate": 3.472344161545215e-05, "loss": 0.0, "step": 2088 }, { "epoch": 0.6113549897570969, "grad_norm": 0.02375302091240883, "learning_rate": 3.471612525607258e-05, "loss": 0.0001, "step": 2089 }, { "epoch": 0.6116476441322798, "grad_norm": 9.565156936645508, "learning_rate": 3.470880889669301e-05, "loss": 0.0315, "step": 2090 }, { "epoch": 0.6119402985074627, "grad_norm": 0.001845332677476108, "learning_rate": 3.470149253731344e-05, "loss": 0.0, "step": 2091 }, { "epoch": 0.6122329528826456, "grad_norm": 0.0012049475917592645, "learning_rate": 3.4694176177933865e-05, "loss": 0.0, "step": 2092 }, { "epoch": 0.6125256072578285, "grad_norm": 0.04229447618126869, "learning_rate": 3.468685981855429e-05, "loss": 0.0001, "step": 2093 }, { "epoch": 0.6128182616330115, "grad_norm": 0.0011197493877261877, "learning_rate": 3.467954345917472e-05, "loss": 0.0, "step": 2094 }, { "epoch": 0.6131109160081943, "grad_norm": 0.0012051331577822566, "learning_rate": 3.467222709979515e-05, "loss": 0.0, "step": 2095 }, { "epoch": 0.6134035703833772, "grad_norm": 0.003435655264183879, "learning_rate": 3.466491074041557e-05, "loss": 0.0, "step": 2096 }, { "epoch": 0.6136962247585601, "grad_norm": 0.0005401496891863644, "learning_rate": 3.4657594381036e-05, "loss": 0.0, "step": 2097 }, { "epoch": 0.613988879133743, "grad_norm": 0.0385284349322319, "learning_rate": 3.4650278021656426e-05, "loss": 0.0002, "step": 2098 }, { "epoch": 0.614281533508926, "grad_norm": 0.004104093182832003, "learning_rate": 3.4642961662276854e-05, "loss": 0.0, "step": 2099 }, { "epoch": 0.6145741878841089, "grad_norm": 0.001205161795951426, "learning_rate": 3.463564530289728e-05, "loss": 0.0, "step": 2100 }, { "epoch": 0.6148668422592918, "grad_norm": 0.0005487053422257304, "learning_rate": 3.462832894351771e-05, "loss": 0.0, "step": 2101 }, { "epoch": 0.6151594966344747, "grad_norm": 0.002301194006577134, "learning_rate": 3.462101258413814e-05, "loss": 0.0, "step": 2102 }, { "epoch": 0.6154521510096576, "grad_norm": 0.06975915282964706, "learning_rate": 3.4613696224758566e-05, "loss": 0.0003, "step": 2103 }, { "epoch": 0.6157448053848406, "grad_norm": 2.4628310203552246, "learning_rate": 3.4606379865378994e-05, "loss": 0.003, "step": 2104 }, { "epoch": 0.6160374597600234, "grad_norm": 0.0005157164996489882, "learning_rate": 3.459906350599942e-05, "loss": 0.0, "step": 2105 }, { "epoch": 0.6163301141352063, "grad_norm": 0.0017777555622160435, "learning_rate": 3.459174714661985e-05, "loss": 0.0, "step": 2106 }, { "epoch": 0.6166227685103892, "grad_norm": 0.0010313241509720683, "learning_rate": 3.458443078724027e-05, "loss": 0.0, "step": 2107 }, { "epoch": 0.6169154228855721, "grad_norm": 0.0005064535071142018, "learning_rate": 3.45771144278607e-05, "loss": 0.0, "step": 2108 }, { "epoch": 0.617208077260755, "grad_norm": 0.0007273271330632269, "learning_rate": 3.4569798068481127e-05, "loss": 0.0, "step": 2109 }, { "epoch": 0.617500731635938, "grad_norm": 0.0013223080895841122, "learning_rate": 3.4562481709101554e-05, "loss": 0.0, "step": 2110 }, { "epoch": 0.6177933860111209, "grad_norm": 0.0034261911641806364, "learning_rate": 3.455516534972198e-05, "loss": 0.0, "step": 2111 }, { "epoch": 0.6180860403863038, "grad_norm": 0.0024893530644476414, "learning_rate": 3.454784899034241e-05, "loss": 0.0, "step": 2112 }, { "epoch": 0.6183786947614867, "grad_norm": 13.454879760742188, "learning_rate": 3.454053263096284e-05, "loss": 0.0475, "step": 2113 }, { "epoch": 0.6186713491366695, "grad_norm": 0.0004851063422393054, "learning_rate": 3.4533216271583266e-05, "loss": 0.0, "step": 2114 }, { "epoch": 0.6189640035118525, "grad_norm": 0.00036992091918364167, "learning_rate": 3.452589991220369e-05, "loss": 0.0, "step": 2115 }, { "epoch": 0.6192566578870354, "grad_norm": 0.0005809810827486217, "learning_rate": 3.4518583552824115e-05, "loss": 0.0, "step": 2116 }, { "epoch": 0.6195493122622183, "grad_norm": 0.0004890324780717492, "learning_rate": 3.451126719344454e-05, "loss": 0.0, "step": 2117 }, { "epoch": 0.6198419666374012, "grad_norm": 0.0031506626401096582, "learning_rate": 3.450395083406497e-05, "loss": 0.0, "step": 2118 }, { "epoch": 0.6201346210125841, "grad_norm": 1.1961182355880737, "learning_rate": 3.44966344746854e-05, "loss": 0.002, "step": 2119 }, { "epoch": 0.6204272753877671, "grad_norm": 0.00019465781224425882, "learning_rate": 3.448931811530583e-05, "loss": 0.0, "step": 2120 }, { "epoch": 0.62071992976295, "grad_norm": 0.0013921204954385757, "learning_rate": 3.4482001755926255e-05, "loss": 0.0, "step": 2121 }, { "epoch": 0.6210125841381329, "grad_norm": 0.00045760799548588693, "learning_rate": 3.447468539654668e-05, "loss": 0.0, "step": 2122 }, { "epoch": 0.6213052385133158, "grad_norm": 0.002929717069491744, "learning_rate": 3.4467369037167104e-05, "loss": 0.0, "step": 2123 }, { "epoch": 0.6215978928884986, "grad_norm": 0.0006322418921627104, "learning_rate": 3.446005267778753e-05, "loss": 0.0, "step": 2124 }, { "epoch": 0.6218905472636815, "grad_norm": 0.00026546447770670056, "learning_rate": 3.445273631840796e-05, "loss": 0.0, "step": 2125 }, { "epoch": 0.6221832016388645, "grad_norm": 0.0008917133673094213, "learning_rate": 3.444541995902839e-05, "loss": 0.0, "step": 2126 }, { "epoch": 0.6224758560140474, "grad_norm": 0.10912005603313446, "learning_rate": 3.4438103599648816e-05, "loss": 0.0003, "step": 2127 }, { "epoch": 0.6227685103892303, "grad_norm": 0.0013562339590862393, "learning_rate": 3.4430787240269244e-05, "loss": 0.0, "step": 2128 }, { "epoch": 0.6230611647644132, "grad_norm": 4.516958713531494, "learning_rate": 3.442347088088967e-05, "loss": 0.191, "step": 2129 }, { "epoch": 0.6233538191395961, "grad_norm": 0.00037854915717616677, "learning_rate": 3.44161545215101e-05, "loss": 0.0, "step": 2130 }, { "epoch": 0.6236464735147791, "grad_norm": 0.00032612564973533154, "learning_rate": 3.440883816213052e-05, "loss": 0.0, "step": 2131 }, { "epoch": 0.623939127889962, "grad_norm": 0.0030479480046778917, "learning_rate": 3.440152180275095e-05, "loss": 0.0, "step": 2132 }, { "epoch": 0.6242317822651449, "grad_norm": 0.0015022482257336378, "learning_rate": 3.439420544337138e-05, "loss": 0.0, "step": 2133 }, { "epoch": 0.6245244366403278, "grad_norm": 0.0004719913122244179, "learning_rate": 3.4386889083991805e-05, "loss": 0.0, "step": 2134 }, { "epoch": 0.6248170910155106, "grad_norm": 0.0012320306850597262, "learning_rate": 3.437957272461223e-05, "loss": 0.0, "step": 2135 }, { "epoch": 0.6251097453906936, "grad_norm": 1.3019874095916748, "learning_rate": 3.437225636523266e-05, "loss": 0.0043, "step": 2136 }, { "epoch": 0.6254023997658765, "grad_norm": 0.004123196937143803, "learning_rate": 3.436494000585309e-05, "loss": 0.0001, "step": 2137 }, { "epoch": 0.6256950541410594, "grad_norm": 0.004008032847195864, "learning_rate": 3.435762364647352e-05, "loss": 0.0001, "step": 2138 }, { "epoch": 0.6259877085162423, "grad_norm": 0.000680704484693706, "learning_rate": 3.435030728709394e-05, "loss": 0.0, "step": 2139 }, { "epoch": 0.6262803628914252, "grad_norm": 0.0005675011198036373, "learning_rate": 3.4342990927714366e-05, "loss": 0.0, "step": 2140 }, { "epoch": 0.6265730172666082, "grad_norm": 0.03600315377116203, "learning_rate": 3.4335674568334794e-05, "loss": 0.0003, "step": 2141 }, { "epoch": 0.6268656716417911, "grad_norm": 0.00038307654904201627, "learning_rate": 3.432835820895522e-05, "loss": 0.0, "step": 2142 }, { "epoch": 0.627158326016974, "grad_norm": 0.0026059714145958424, "learning_rate": 3.432104184957565e-05, "loss": 0.0, "step": 2143 }, { "epoch": 0.6274509803921569, "grad_norm": 0.0010734302923083305, "learning_rate": 3.431372549019608e-05, "loss": 0.0, "step": 2144 }, { "epoch": 0.6277436347673397, "grad_norm": 0.0013862098567187786, "learning_rate": 3.4306409130816506e-05, "loss": 0.0, "step": 2145 }, { "epoch": 0.6280362891425226, "grad_norm": 0.0029694773256778717, "learning_rate": 3.4299092771436934e-05, "loss": 0.0001, "step": 2146 }, { "epoch": 0.6283289435177056, "grad_norm": 0.35501620173454285, "learning_rate": 3.429177641205736e-05, "loss": 0.0006, "step": 2147 }, { "epoch": 0.6286215978928885, "grad_norm": 0.0010110613657161593, "learning_rate": 3.428446005267779e-05, "loss": 0.0, "step": 2148 }, { "epoch": 0.6289142522680714, "grad_norm": 0.003870262997224927, "learning_rate": 3.427714369329821e-05, "loss": 0.0001, "step": 2149 }, { "epoch": 0.6292069066432543, "grad_norm": 0.0006364476284943521, "learning_rate": 3.426982733391864e-05, "loss": 0.0, "step": 2150 }, { "epoch": 0.6294995610184372, "grad_norm": 0.000568551302421838, "learning_rate": 3.4262510974539067e-05, "loss": 0.0, "step": 2151 }, { "epoch": 0.6297922153936202, "grad_norm": 0.016431095078587532, "learning_rate": 3.4255194615159495e-05, "loss": 0.0001, "step": 2152 }, { "epoch": 0.6300848697688031, "grad_norm": 5.688620090484619, "learning_rate": 3.424787825577992e-05, "loss": 0.1346, "step": 2153 }, { "epoch": 0.630377524143986, "grad_norm": 0.0009870333597064018, "learning_rate": 3.424056189640035e-05, "loss": 0.0, "step": 2154 }, { "epoch": 0.6306701785191688, "grad_norm": 0.007537742145359516, "learning_rate": 3.423324553702078e-05, "loss": 0.0001, "step": 2155 }, { "epoch": 0.6309628328943517, "grad_norm": 0.003192983567714691, "learning_rate": 3.4225929177641206e-05, "loss": 0.0001, "step": 2156 }, { "epoch": 0.6312554872695347, "grad_norm": 0.0012039679568260908, "learning_rate": 3.4218612818261634e-05, "loss": 0.0, "step": 2157 }, { "epoch": 0.6315481416447176, "grad_norm": 0.05299927666783333, "learning_rate": 3.421129645888206e-05, "loss": 0.0005, "step": 2158 }, { "epoch": 0.6318407960199005, "grad_norm": 0.11062745749950409, "learning_rate": 3.420398009950249e-05, "loss": 0.0009, "step": 2159 }, { "epoch": 0.6321334503950834, "grad_norm": 0.3257235586643219, "learning_rate": 3.419666374012291e-05, "loss": 0.002, "step": 2160 }, { "epoch": 0.6324261047702663, "grad_norm": 0.0014030158054083586, "learning_rate": 3.418934738074334e-05, "loss": 0.0, "step": 2161 }, { "epoch": 0.6327187591454492, "grad_norm": 4.451821327209473, "learning_rate": 3.418203102136377e-05, "loss": 0.2209, "step": 2162 }, { "epoch": 0.6330114135206322, "grad_norm": 0.053129829466342926, "learning_rate": 3.4174714661984195e-05, "loss": 0.0003, "step": 2163 }, { "epoch": 0.6333040678958151, "grad_norm": 0.002650732407346368, "learning_rate": 3.416739830260462e-05, "loss": 0.0001, "step": 2164 }, { "epoch": 0.633596722270998, "grad_norm": 0.0010525553952902555, "learning_rate": 3.416008194322505e-05, "loss": 0.0, "step": 2165 }, { "epoch": 0.6338893766461808, "grad_norm": 0.0010408986127004027, "learning_rate": 3.415276558384548e-05, "loss": 0.0, "step": 2166 }, { "epoch": 0.6341820310213637, "grad_norm": 0.0022852777037769556, "learning_rate": 3.414544922446591e-05, "loss": 0.0, "step": 2167 }, { "epoch": 0.6344746853965467, "grad_norm": 2.6670784950256348, "learning_rate": 3.4138132865086335e-05, "loss": 0.3112, "step": 2168 }, { "epoch": 0.6347673397717296, "grad_norm": 0.009369021281599998, "learning_rate": 3.413081650570676e-05, "loss": 0.0002, "step": 2169 }, { "epoch": 0.6350599941469125, "grad_norm": 0.004362731706351042, "learning_rate": 3.412350014632719e-05, "loss": 0.0001, "step": 2170 }, { "epoch": 0.6353526485220954, "grad_norm": 0.022487707436084747, "learning_rate": 3.411618378694761e-05, "loss": 0.0004, "step": 2171 }, { "epoch": 0.6356453028972783, "grad_norm": 0.06050792708992958, "learning_rate": 3.410886742756804e-05, "loss": 0.0009, "step": 2172 }, { "epoch": 0.6359379572724613, "grad_norm": 0.05857773497700691, "learning_rate": 3.410155106818847e-05, "loss": 0.0011, "step": 2173 }, { "epoch": 0.6362306116476442, "grad_norm": 0.5926027297973633, "learning_rate": 3.4094234708808896e-05, "loss": 0.0063, "step": 2174 }, { "epoch": 0.636523266022827, "grad_norm": 0.14563141763210297, "learning_rate": 3.4086918349429324e-05, "loss": 0.0022, "step": 2175 }, { "epoch": 0.6368159203980099, "grad_norm": 0.0783274695277214, "learning_rate": 3.407960199004975e-05, "loss": 0.0015, "step": 2176 }, { "epoch": 0.6371085747731928, "grad_norm": 0.1807767003774643, "learning_rate": 3.407228563067018e-05, "loss": 0.0022, "step": 2177 }, { "epoch": 0.6374012291483757, "grad_norm": 0.06697569787502289, "learning_rate": 3.406496927129061e-05, "loss": 0.0012, "step": 2178 }, { "epoch": 0.6376938835235587, "grad_norm": 4.3055644035339355, "learning_rate": 3.4057652911911036e-05, "loss": 0.0112, "step": 2179 }, { "epoch": 0.6379865378987416, "grad_norm": 0.01856839284300804, "learning_rate": 3.4050336552531464e-05, "loss": 0.0004, "step": 2180 }, { "epoch": 0.6382791922739245, "grad_norm": 0.027890879660844803, "learning_rate": 3.4043020193151885e-05, "loss": 0.0005, "step": 2181 }, { "epoch": 0.6385718466491074, "grad_norm": 0.030753012746572495, "learning_rate": 3.403570383377231e-05, "loss": 0.0006, "step": 2182 }, { "epoch": 0.6388645010242903, "grad_norm": 0.01352408342063427, "learning_rate": 3.402838747439274e-05, "loss": 0.0003, "step": 2183 }, { "epoch": 0.6391571553994733, "grad_norm": 0.009226124733686447, "learning_rate": 3.402107111501317e-05, "loss": 0.0002, "step": 2184 }, { "epoch": 0.6394498097746562, "grad_norm": 0.01322910189628601, "learning_rate": 3.4013754755633597e-05, "loss": 0.0003, "step": 2185 }, { "epoch": 0.639742464149839, "grad_norm": 0.015596003271639347, "learning_rate": 3.4006438396254024e-05, "loss": 0.0004, "step": 2186 }, { "epoch": 0.6400351185250219, "grad_norm": 0.006880583707243204, "learning_rate": 3.399912203687445e-05, "loss": 0.0002, "step": 2187 }, { "epoch": 0.6403277729002048, "grad_norm": 0.010393386706709862, "learning_rate": 3.399180567749488e-05, "loss": 0.0003, "step": 2188 }, { "epoch": 0.6406204272753878, "grad_norm": 0.008832590654492378, "learning_rate": 3.398448931811531e-05, "loss": 0.0002, "step": 2189 }, { "epoch": 0.6409130816505707, "grad_norm": 0.02388172410428524, "learning_rate": 3.3977172958735736e-05, "loss": 0.0003, "step": 2190 }, { "epoch": 0.6412057360257536, "grad_norm": 0.04809088632464409, "learning_rate": 3.3969856599356164e-05, "loss": 0.0004, "step": 2191 }, { "epoch": 0.6414983904009365, "grad_norm": 0.003923215437680483, "learning_rate": 3.3962540239976585e-05, "loss": 0.0001, "step": 2192 }, { "epoch": 0.6417910447761194, "grad_norm": 0.007215586956590414, "learning_rate": 3.395522388059701e-05, "loss": 0.0002, "step": 2193 }, { "epoch": 0.6420836991513024, "grad_norm": 0.0031919616740196943, "learning_rate": 3.394790752121744e-05, "loss": 0.0001, "step": 2194 }, { "epoch": 0.6423763535264853, "grad_norm": 14.256848335266113, "learning_rate": 3.394059116183787e-05, "loss": 0.1273, "step": 2195 }, { "epoch": 0.6426690079016681, "grad_norm": 0.0035253202077001333, "learning_rate": 3.39332748024583e-05, "loss": 0.0001, "step": 2196 }, { "epoch": 0.642961662276851, "grad_norm": 1.0613200664520264, "learning_rate": 3.3925958443078725e-05, "loss": 0.0027, "step": 2197 }, { "epoch": 0.6432543166520339, "grad_norm": 0.004639477469027042, "learning_rate": 3.391864208369915e-05, "loss": 0.0001, "step": 2198 }, { "epoch": 0.6435469710272168, "grad_norm": 0.0037832509260624647, "learning_rate": 3.391132572431958e-05, "loss": 0.0001, "step": 2199 }, { "epoch": 0.6438396254023998, "grad_norm": 0.004914381075650454, "learning_rate": 3.390400936494001e-05, "loss": 0.0001, "step": 2200 }, { "epoch": 0.6441322797775827, "grad_norm": 0.01405208557844162, "learning_rate": 3.389669300556044e-05, "loss": 0.0002, "step": 2201 }, { "epoch": 0.6444249341527656, "grad_norm": 0.03063022904098034, "learning_rate": 3.388937664618086e-05, "loss": 0.0004, "step": 2202 }, { "epoch": 0.6447175885279485, "grad_norm": 0.00888260081410408, "learning_rate": 3.3882060286801286e-05, "loss": 0.0001, "step": 2203 }, { "epoch": 0.6450102429031314, "grad_norm": 0.2707194983959198, "learning_rate": 3.3874743927421714e-05, "loss": 0.0012, "step": 2204 }, { "epoch": 0.6453028972783144, "grad_norm": 11.670083045959473, "learning_rate": 3.386742756804214e-05, "loss": 0.3591, "step": 2205 }, { "epoch": 0.6455955516534972, "grad_norm": 0.04171549156308174, "learning_rate": 3.386011120866257e-05, "loss": 0.0003, "step": 2206 }, { "epoch": 0.6458882060286801, "grad_norm": 0.004247634205967188, "learning_rate": 3.3852794849283e-05, "loss": 0.0001, "step": 2207 }, { "epoch": 0.646180860403863, "grad_norm": 0.00270745693705976, "learning_rate": 3.3845478489903426e-05, "loss": 0.0001, "step": 2208 }, { "epoch": 0.6464735147790459, "grad_norm": 0.015967080369591713, "learning_rate": 3.3838162130523854e-05, "loss": 0.0002, "step": 2209 }, { "epoch": 0.6467661691542289, "grad_norm": 0.006121322978287935, "learning_rate": 3.383084577114428e-05, "loss": 0.0001, "step": 2210 }, { "epoch": 0.6470588235294118, "grad_norm": 0.010700903832912445, "learning_rate": 3.382352941176471e-05, "loss": 0.0002, "step": 2211 }, { "epoch": 0.6473514779045947, "grad_norm": 5.404544353485107, "learning_rate": 3.381621305238514e-05, "loss": 0.1908, "step": 2212 }, { "epoch": 0.6476441322797776, "grad_norm": 5.264749526977539, "learning_rate": 3.380889669300556e-05, "loss": 0.0133, "step": 2213 }, { "epoch": 0.6479367866549605, "grad_norm": 0.004720195196568966, "learning_rate": 3.380158033362599e-05, "loss": 0.0001, "step": 2214 }, { "epoch": 0.6482294410301433, "grad_norm": 0.006057473830878735, "learning_rate": 3.3794263974246415e-05, "loss": 0.0001, "step": 2215 }, { "epoch": 0.6485220954053263, "grad_norm": 0.003476094687357545, "learning_rate": 3.378694761486684e-05, "loss": 0.0001, "step": 2216 }, { "epoch": 0.6488147497805092, "grad_norm": 0.0119353411719203, "learning_rate": 3.377963125548727e-05, "loss": 0.0002, "step": 2217 }, { "epoch": 0.6491074041556921, "grad_norm": 0.004302394576370716, "learning_rate": 3.37723148961077e-05, "loss": 0.0001, "step": 2218 }, { "epoch": 0.649400058530875, "grad_norm": 0.0031735720112919807, "learning_rate": 3.3764998536728127e-05, "loss": 0.0001, "step": 2219 }, { "epoch": 0.6496927129060579, "grad_norm": 0.003988498356193304, "learning_rate": 3.3757682177348554e-05, "loss": 0.0001, "step": 2220 }, { "epoch": 0.6499853672812409, "grad_norm": 0.004030546173453331, "learning_rate": 3.375036581796898e-05, "loss": 0.0001, "step": 2221 }, { "epoch": 0.6502780216564238, "grad_norm": 0.011398413218557835, "learning_rate": 3.374304945858941e-05, "loss": 0.0003, "step": 2222 }, { "epoch": 0.6505706760316067, "grad_norm": 0.011405447497963905, "learning_rate": 3.373573309920984e-05, "loss": 0.0002, "step": 2223 }, { "epoch": 0.6508633304067896, "grad_norm": 0.01702878251671791, "learning_rate": 3.372841673983026e-05, "loss": 0.0003, "step": 2224 }, { "epoch": 0.6511559847819725, "grad_norm": 11.716462135314941, "learning_rate": 3.372110038045069e-05, "loss": 0.0334, "step": 2225 }, { "epoch": 0.6514486391571555, "grad_norm": 0.019067659974098206, "learning_rate": 3.3713784021071115e-05, "loss": 0.0002, "step": 2226 }, { "epoch": 0.6517412935323383, "grad_norm": 0.044808294624090195, "learning_rate": 3.370646766169154e-05, "loss": 0.0005, "step": 2227 }, { "epoch": 0.6520339479075212, "grad_norm": 0.1213802695274353, "learning_rate": 3.369915130231197e-05, "loss": 0.0007, "step": 2228 }, { "epoch": 0.6523266022827041, "grad_norm": 0.00575533602386713, "learning_rate": 3.36918349429324e-05, "loss": 0.0002, "step": 2229 }, { "epoch": 0.652619256657887, "grad_norm": 0.07758716493844986, "learning_rate": 3.368451858355283e-05, "loss": 0.0007, "step": 2230 }, { "epoch": 0.6529119110330699, "grad_norm": 0.0531279556453228, "learning_rate": 3.3677202224173255e-05, "loss": 0.0005, "step": 2231 }, { "epoch": 0.6532045654082529, "grad_norm": 0.0054940213449299335, "learning_rate": 3.366988586479368e-05, "loss": 0.0001, "step": 2232 }, { "epoch": 0.6534972197834358, "grad_norm": 0.09491506218910217, "learning_rate": 3.366256950541411e-05, "loss": 0.0005, "step": 2233 }, { "epoch": 0.6537898741586187, "grad_norm": 0.00495274318382144, "learning_rate": 3.365525314603453e-05, "loss": 0.0001, "step": 2234 }, { "epoch": 0.6540825285338016, "grad_norm": 0.007331415545195341, "learning_rate": 3.364793678665496e-05, "loss": 0.0001, "step": 2235 }, { "epoch": 0.6543751829089844, "grad_norm": 0.008214449509978294, "learning_rate": 3.364062042727539e-05, "loss": 0.0002, "step": 2236 }, { "epoch": 0.6546678372841674, "grad_norm": 0.004883881658315659, "learning_rate": 3.3633304067895816e-05, "loss": 0.0001, "step": 2237 }, { "epoch": 0.6549604916593503, "grad_norm": 0.00528697669506073, "learning_rate": 3.3625987708516244e-05, "loss": 0.0001, "step": 2238 }, { "epoch": 0.6552531460345332, "grad_norm": 5.757792949676514, "learning_rate": 3.361867134913667e-05, "loss": 0.0566, "step": 2239 }, { "epoch": 0.6555458004097161, "grad_norm": 0.1155574768781662, "learning_rate": 3.36113549897571e-05, "loss": 0.001, "step": 2240 }, { "epoch": 0.655838454784899, "grad_norm": 0.003949552774429321, "learning_rate": 3.360403863037753e-05, "loss": 0.0001, "step": 2241 }, { "epoch": 0.656131109160082, "grad_norm": 0.005053091794252396, "learning_rate": 3.3596722270997956e-05, "loss": 0.0001, "step": 2242 }, { "epoch": 0.6564237635352649, "grad_norm": 0.0025514643639326096, "learning_rate": 3.3589405911618384e-05, "loss": 0.0001, "step": 2243 }, { "epoch": 0.6567164179104478, "grad_norm": 0.0027982911560684443, "learning_rate": 3.358208955223881e-05, "loss": 0.0001, "step": 2244 }, { "epoch": 0.6570090722856307, "grad_norm": 0.13875851035118103, "learning_rate": 3.357477319285923e-05, "loss": 0.0006, "step": 2245 }, { "epoch": 0.6573017266608135, "grad_norm": 0.002096978249028325, "learning_rate": 3.356745683347966e-05, "loss": 0.0001, "step": 2246 }, { "epoch": 0.6575943810359964, "grad_norm": 0.6293127536773682, "learning_rate": 3.356014047410009e-05, "loss": 0.0022, "step": 2247 }, { "epoch": 0.6578870354111794, "grad_norm": 0.001957373693585396, "learning_rate": 3.355282411472052e-05, "loss": 0.0001, "step": 2248 }, { "epoch": 0.6581796897863623, "grad_norm": 0.03885246813297272, "learning_rate": 3.3545507755340945e-05, "loss": 0.0003, "step": 2249 }, { "epoch": 0.6584723441615452, "grad_norm": 0.00180476950481534, "learning_rate": 3.353819139596137e-05, "loss": 0.0, "step": 2250 }, { "epoch": 0.6587649985367281, "grad_norm": 0.008187128230929375, "learning_rate": 3.35308750365818e-05, "loss": 0.0002, "step": 2251 }, { "epoch": 0.659057652911911, "grad_norm": 0.0025216848589479923, "learning_rate": 3.352355867720223e-05, "loss": 0.0001, "step": 2252 }, { "epoch": 0.659350307287094, "grad_norm": 0.0019639465026557446, "learning_rate": 3.3516242317822657e-05, "loss": 0.0001, "step": 2253 }, { "epoch": 0.6596429616622769, "grad_norm": 0.04376494884490967, "learning_rate": 3.3508925958443084e-05, "loss": 0.0002, "step": 2254 }, { "epoch": 0.6599356160374598, "grad_norm": 0.38673028349876404, "learning_rate": 3.350160959906351e-05, "loss": 0.001, "step": 2255 }, { "epoch": 0.6602282704126426, "grad_norm": 0.005738749168813229, "learning_rate": 3.3494293239683934e-05, "loss": 0.0001, "step": 2256 }, { "epoch": 0.6605209247878255, "grad_norm": 2.2870633602142334, "learning_rate": 3.348697688030436e-05, "loss": 0.0064, "step": 2257 }, { "epoch": 0.6608135791630085, "grad_norm": 0.0019669472239911556, "learning_rate": 3.347966052092479e-05, "loss": 0.0, "step": 2258 }, { "epoch": 0.6611062335381914, "grad_norm": 0.002588227391242981, "learning_rate": 3.347234416154522e-05, "loss": 0.0001, "step": 2259 }, { "epoch": 0.6613988879133743, "grad_norm": 0.001294539077207446, "learning_rate": 3.3465027802165645e-05, "loss": 0.0, "step": 2260 }, { "epoch": 0.6616915422885572, "grad_norm": 0.00048353031161241233, "learning_rate": 3.345771144278607e-05, "loss": 0.0, "step": 2261 }, { "epoch": 0.6619841966637401, "grad_norm": 0.0015470280777662992, "learning_rate": 3.34503950834065e-05, "loss": 0.0, "step": 2262 }, { "epoch": 0.6622768510389231, "grad_norm": 4.904445171356201, "learning_rate": 3.344307872402693e-05, "loss": 0.011, "step": 2263 }, { "epoch": 0.662569505414106, "grad_norm": 2.363774538040161, "learning_rate": 3.343576236464736e-05, "loss": 0.0055, "step": 2264 }, { "epoch": 0.6628621597892889, "grad_norm": 0.0030767517164349556, "learning_rate": 3.3428446005267785e-05, "loss": 0.0001, "step": 2265 }, { "epoch": 0.6631548141644718, "grad_norm": 0.008060317486524582, "learning_rate": 3.3421129645888206e-05, "loss": 0.0001, "step": 2266 }, { "epoch": 0.6634474685396546, "grad_norm": 0.03407042846083641, "learning_rate": 3.3413813286508634e-05, "loss": 0.0002, "step": 2267 }, { "epoch": 0.6637401229148375, "grad_norm": 0.0033131930977106094, "learning_rate": 3.340649692712906e-05, "loss": 0.0001, "step": 2268 }, { "epoch": 0.6640327772900205, "grad_norm": 0.007835319265723228, "learning_rate": 3.339918056774949e-05, "loss": 0.0001, "step": 2269 }, { "epoch": 0.6643254316652034, "grad_norm": 0.001285523409023881, "learning_rate": 3.339186420836992e-05, "loss": 0.0, "step": 2270 }, { "epoch": 0.6646180860403863, "grad_norm": 0.010365760885179043, "learning_rate": 3.3384547848990346e-05, "loss": 0.0001, "step": 2271 }, { "epoch": 0.6649107404155692, "grad_norm": 0.0191858671605587, "learning_rate": 3.3377231489610774e-05, "loss": 0.0001, "step": 2272 }, { "epoch": 0.6652033947907521, "grad_norm": 15.162426948547363, "learning_rate": 3.33699151302312e-05, "loss": 0.0439, "step": 2273 }, { "epoch": 0.6654960491659351, "grad_norm": 0.0013493781443685293, "learning_rate": 3.336259877085163e-05, "loss": 0.0, "step": 2274 }, { "epoch": 0.665788703541118, "grad_norm": 0.0016791290836408734, "learning_rate": 3.335528241147206e-05, "loss": 0.0, "step": 2275 }, { "epoch": 0.6660813579163009, "grad_norm": 0.001705312286503613, "learning_rate": 3.3347966052092486e-05, "loss": 0.0, "step": 2276 }, { "epoch": 0.6663740122914837, "grad_norm": 0.001440341817215085, "learning_rate": 3.334064969271291e-05, "loss": 0.0, "step": 2277 }, { "epoch": 0.6666666666666666, "grad_norm": 2.711886167526245, "learning_rate": 3.3333333333333335e-05, "loss": 0.0065, "step": 2278 }, { "epoch": 0.6669593210418496, "grad_norm": 0.00304962950758636, "learning_rate": 3.332601697395376e-05, "loss": 0.0001, "step": 2279 }, { "epoch": 0.6672519754170325, "grad_norm": 5.063312530517578, "learning_rate": 3.331870061457419e-05, "loss": 0.2016, "step": 2280 }, { "epoch": 0.6675446297922154, "grad_norm": 0.013917754404246807, "learning_rate": 3.331138425519462e-05, "loss": 0.0001, "step": 2281 }, { "epoch": 0.6678372841673983, "grad_norm": 0.0017429639119654894, "learning_rate": 3.330406789581505e-05, "loss": 0.0, "step": 2282 }, { "epoch": 0.6681299385425812, "grad_norm": 0.0008035599021241069, "learning_rate": 3.3296751536435475e-05, "loss": 0.0, "step": 2283 }, { "epoch": 0.6684225929177641, "grad_norm": 0.0014676072169095278, "learning_rate": 3.32894351770559e-05, "loss": 0.0, "step": 2284 }, { "epoch": 0.6687152472929471, "grad_norm": 0.000980646931566298, "learning_rate": 3.328211881767633e-05, "loss": 0.0, "step": 2285 }, { "epoch": 0.66900790166813, "grad_norm": 5.8885579109191895, "learning_rate": 3.327480245829676e-05, "loss": 0.0173, "step": 2286 }, { "epoch": 0.6693005560433128, "grad_norm": 0.0011138279223814607, "learning_rate": 3.326748609891718e-05, "loss": 0.0, "step": 2287 }, { "epoch": 0.6695932104184957, "grad_norm": 1.5539418458938599, "learning_rate": 3.326016973953761e-05, "loss": 0.0044, "step": 2288 }, { "epoch": 0.6698858647936786, "grad_norm": 0.0023576596286147833, "learning_rate": 3.3252853380158036e-05, "loss": 0.0001, "step": 2289 }, { "epoch": 0.6701785191688616, "grad_norm": 0.00780422193929553, "learning_rate": 3.3245537020778464e-05, "loss": 0.0001, "step": 2290 }, { "epoch": 0.6704711735440445, "grad_norm": 0.004599100910127163, "learning_rate": 3.323822066139889e-05, "loss": 0.0001, "step": 2291 }, { "epoch": 0.6707638279192274, "grad_norm": 0.0034668464213609695, "learning_rate": 3.323090430201932e-05, "loss": 0.0001, "step": 2292 }, { "epoch": 0.6710564822944103, "grad_norm": 0.155696839094162, "learning_rate": 3.322358794263975e-05, "loss": 0.0009, "step": 2293 }, { "epoch": 0.6713491366695932, "grad_norm": 0.0032399471383541822, "learning_rate": 3.3216271583260175e-05, "loss": 0.0001, "step": 2294 }, { "epoch": 0.6716417910447762, "grad_norm": 2.8677914142608643, "learning_rate": 3.32089552238806e-05, "loss": 0.0065, "step": 2295 }, { "epoch": 0.6719344454199591, "grad_norm": 0.001276138355024159, "learning_rate": 3.320163886450103e-05, "loss": 0.0, "step": 2296 }, { "epoch": 0.672227099795142, "grad_norm": 0.0016609752783551812, "learning_rate": 3.319432250512145e-05, "loss": 0.0, "step": 2297 }, { "epoch": 0.6725197541703248, "grad_norm": 0.0012457565171644092, "learning_rate": 3.318700614574188e-05, "loss": 0.0, "step": 2298 }, { "epoch": 0.6728124085455077, "grad_norm": 0.000976709881797433, "learning_rate": 3.317968978636231e-05, "loss": 0.0, "step": 2299 }, { "epoch": 0.6731050629206906, "grad_norm": 0.00264275586232543, "learning_rate": 3.3172373426982736e-05, "loss": 0.0001, "step": 2300 }, { "epoch": 0.6733977172958736, "grad_norm": 0.0018932694802060723, "learning_rate": 3.3165057067603164e-05, "loss": 0.0, "step": 2301 }, { "epoch": 0.6736903716710565, "grad_norm": 0.006679253187030554, "learning_rate": 3.315774070822359e-05, "loss": 0.0001, "step": 2302 }, { "epoch": 0.6739830260462394, "grad_norm": 0.0025321717839688063, "learning_rate": 3.315042434884402e-05, "loss": 0.0001, "step": 2303 }, { "epoch": 0.6742756804214223, "grad_norm": 0.00163306575268507, "learning_rate": 3.314310798946445e-05, "loss": 0.0, "step": 2304 }, { "epoch": 0.6745683347966052, "grad_norm": 0.0017003427492454648, "learning_rate": 3.313579163008487e-05, "loss": 0.0, "step": 2305 }, { "epoch": 0.6748609891717882, "grad_norm": 0.0021983168553560972, "learning_rate": 3.31284752707053e-05, "loss": 0.0001, "step": 2306 }, { "epoch": 0.675153643546971, "grad_norm": 0.0022590961307287216, "learning_rate": 3.3121158911325725e-05, "loss": 0.0, "step": 2307 }, { "epoch": 0.6754462979221539, "grad_norm": 0.00828596856445074, "learning_rate": 3.311384255194615e-05, "loss": 0.0001, "step": 2308 }, { "epoch": 0.6757389522973368, "grad_norm": 0.006290559656918049, "learning_rate": 3.310652619256658e-05, "loss": 0.0001, "step": 2309 }, { "epoch": 0.6760316066725197, "grad_norm": 0.0021867703180760145, "learning_rate": 3.309920983318701e-05, "loss": 0.0, "step": 2310 }, { "epoch": 0.6763242610477027, "grad_norm": 0.001774878241121769, "learning_rate": 3.309189347380744e-05, "loss": 0.0, "step": 2311 }, { "epoch": 0.6766169154228856, "grad_norm": 10.279358863830566, "learning_rate": 3.3084577114427865e-05, "loss": 0.077, "step": 2312 }, { "epoch": 0.6769095697980685, "grad_norm": 0.002121191006153822, "learning_rate": 3.3077260755048286e-05, "loss": 0.0, "step": 2313 }, { "epoch": 0.6772022241732514, "grad_norm": 2.7408127784729004, "learning_rate": 3.3069944395668714e-05, "loss": 0.234, "step": 2314 }, { "epoch": 0.6774948785484343, "grad_norm": 0.0018349678721278906, "learning_rate": 3.306262803628914e-05, "loss": 0.0, "step": 2315 }, { "epoch": 0.6777875329236172, "grad_norm": 4.3838348388671875, "learning_rate": 3.305531167690957e-05, "loss": 0.1928, "step": 2316 }, { "epoch": 0.6780801872988002, "grad_norm": 0.0037601948715746403, "learning_rate": 3.304799531753e-05, "loss": 0.0001, "step": 2317 }, { "epoch": 0.678372841673983, "grad_norm": 0.06992469727993011, "learning_rate": 3.3040678958150426e-05, "loss": 0.0008, "step": 2318 }, { "epoch": 0.6786654960491659, "grad_norm": 20.966833114624023, "learning_rate": 3.3033362598770854e-05, "loss": 0.0917, "step": 2319 }, { "epoch": 0.6789581504243488, "grad_norm": 0.028865372762084007, "learning_rate": 3.302604623939128e-05, "loss": 0.0005, "step": 2320 }, { "epoch": 0.6792508047995317, "grad_norm": 0.012698938138782978, "learning_rate": 3.30187298800117e-05, "loss": 0.0003, "step": 2321 }, { "epoch": 0.6795434591747147, "grad_norm": 0.006503901444375515, "learning_rate": 3.301141352063213e-05, "loss": 0.0001, "step": 2322 }, { "epoch": 0.6798361135498976, "grad_norm": 0.12220772355794907, "learning_rate": 3.300409716125256e-05, "loss": 0.0017, "step": 2323 }, { "epoch": 0.6801287679250805, "grad_norm": 0.011729221791028976, "learning_rate": 3.299678080187299e-05, "loss": 0.0002, "step": 2324 }, { "epoch": 0.6804214223002634, "grad_norm": 0.07524167001247406, "learning_rate": 3.2989464442493415e-05, "loss": 0.0013, "step": 2325 }, { "epoch": 0.6807140766754463, "grad_norm": 2.473724842071533, "learning_rate": 3.298214808311384e-05, "loss": 0.0126, "step": 2326 }, { "epoch": 0.6810067310506293, "grad_norm": 0.01172594353556633, "learning_rate": 3.297483172373427e-05, "loss": 0.0002, "step": 2327 }, { "epoch": 0.6812993854258121, "grad_norm": 0.4100677967071533, "learning_rate": 3.29675153643547e-05, "loss": 0.0046, "step": 2328 }, { "epoch": 0.681592039800995, "grad_norm": 0.0013916159514337778, "learning_rate": 3.2960199004975127e-05, "loss": 0.0, "step": 2329 }, { "epoch": 0.6818846941761779, "grad_norm": 0.022183818742632866, "learning_rate": 3.295288264559555e-05, "loss": 0.0004, "step": 2330 }, { "epoch": 0.6821773485513608, "grad_norm": 0.014194161631166935, "learning_rate": 3.2945566286215976e-05, "loss": 0.0003, "step": 2331 }, { "epoch": 0.6824700029265438, "grad_norm": 0.010851086117327213, "learning_rate": 3.2938249926836404e-05, "loss": 0.0003, "step": 2332 }, { "epoch": 0.6827626573017267, "grad_norm": 0.004293947480618954, "learning_rate": 3.293093356745683e-05, "loss": 0.0001, "step": 2333 }, { "epoch": 0.6830553116769096, "grad_norm": 0.009280310943722725, "learning_rate": 3.292361720807726e-05, "loss": 0.0002, "step": 2334 }, { "epoch": 0.6833479660520925, "grad_norm": 0.008533764630556107, "learning_rate": 3.291630084869769e-05, "loss": 0.0002, "step": 2335 }, { "epoch": 0.6836406204272754, "grad_norm": 0.006480607204139233, "learning_rate": 3.2908984489318115e-05, "loss": 0.0002, "step": 2336 }, { "epoch": 0.6839332748024582, "grad_norm": 0.011746611446142197, "learning_rate": 3.290166812993854e-05, "loss": 0.0003, "step": 2337 }, { "epoch": 0.6842259291776412, "grad_norm": 0.010013306513428688, "learning_rate": 3.289435177055897e-05, "loss": 0.0002, "step": 2338 }, { "epoch": 0.6845185835528241, "grad_norm": 0.012896952219307423, "learning_rate": 3.28870354111794e-05, "loss": 0.0003, "step": 2339 }, { "epoch": 0.684811237928007, "grad_norm": 0.00540511030703783, "learning_rate": 3.287971905179982e-05, "loss": 0.0002, "step": 2340 }, { "epoch": 0.6851038923031899, "grad_norm": 5.056729316711426, "learning_rate": 3.287240269242025e-05, "loss": 0.069, "step": 2341 }, { "epoch": 0.6853965466783728, "grad_norm": 0.05942735821008682, "learning_rate": 3.2865086333040676e-05, "loss": 0.0005, "step": 2342 }, { "epoch": 0.6856892010535558, "grad_norm": 0.01774713769555092, "learning_rate": 3.2857769973661104e-05, "loss": 0.0004, "step": 2343 }, { "epoch": 0.6859818554287387, "grad_norm": 0.14372679591178894, "learning_rate": 3.285045361428153e-05, "loss": 0.0009, "step": 2344 }, { "epoch": 0.6862745098039216, "grad_norm": 0.0036756719928234816, "learning_rate": 3.284313725490196e-05, "loss": 0.0001, "step": 2345 }, { "epoch": 0.6865671641791045, "grad_norm": 0.0028616893105208874, "learning_rate": 3.283582089552239e-05, "loss": 0.0001, "step": 2346 }, { "epoch": 0.6868598185542873, "grad_norm": 0.00771837355569005, "learning_rate": 3.2828504536142816e-05, "loss": 0.0002, "step": 2347 }, { "epoch": 0.6871524729294703, "grad_norm": 5.727898120880127, "learning_rate": 3.2821188176763244e-05, "loss": 0.0814, "step": 2348 }, { "epoch": 0.6874451273046532, "grad_norm": 0.005702680442482233, "learning_rate": 3.281387181738367e-05, "loss": 0.0001, "step": 2349 }, { "epoch": 0.6877377816798361, "grad_norm": 0.008673261851072311, "learning_rate": 3.28065554580041e-05, "loss": 0.0002, "step": 2350 }, { "epoch": 0.688030436055019, "grad_norm": 0.011376095935702324, "learning_rate": 3.279923909862452e-05, "loss": 0.0002, "step": 2351 }, { "epoch": 0.6883230904302019, "grad_norm": 0.021226489916443825, "learning_rate": 3.279192273924495e-05, "loss": 0.0003, "step": 2352 }, { "epoch": 0.6886157448053848, "grad_norm": 0.05789355933666229, "learning_rate": 3.278460637986538e-05, "loss": 0.001, "step": 2353 }, { "epoch": 0.6889083991805678, "grad_norm": 0.18417315185070038, "learning_rate": 3.2777290020485805e-05, "loss": 0.0012, "step": 2354 }, { "epoch": 0.6892010535557507, "grad_norm": 0.016051622107625008, "learning_rate": 3.276997366110623e-05, "loss": 0.0003, "step": 2355 }, { "epoch": 0.6894937079309336, "grad_norm": 0.040032170712947845, "learning_rate": 3.276265730172666e-05, "loss": 0.0005, "step": 2356 }, { "epoch": 0.6897863623061165, "grad_norm": 14.222600936889648, "learning_rate": 3.275534094234709e-05, "loss": 0.1511, "step": 2357 }, { "epoch": 0.6900790166812993, "grad_norm": 0.10259919613599777, "learning_rate": 3.274802458296752e-05, "loss": 0.0014, "step": 2358 }, { "epoch": 0.6903716710564823, "grad_norm": 0.026048608124256134, "learning_rate": 3.2740708223587945e-05, "loss": 0.0004, "step": 2359 }, { "epoch": 0.6906643254316652, "grad_norm": 0.031028462573885918, "learning_rate": 3.273339186420837e-05, "loss": 0.0006, "step": 2360 }, { "epoch": 0.6909569798068481, "grad_norm": 0.018124297261238098, "learning_rate": 3.27260755048288e-05, "loss": 0.0003, "step": 2361 }, { "epoch": 0.691249634182031, "grad_norm": 0.00908717978745699, "learning_rate": 3.271875914544922e-05, "loss": 0.0002, "step": 2362 }, { "epoch": 0.6915422885572139, "grad_norm": 0.004855715204030275, "learning_rate": 3.271144278606965e-05, "loss": 0.0001, "step": 2363 }, { "epoch": 0.6918349429323969, "grad_norm": 0.02341291308403015, "learning_rate": 3.270412642669008e-05, "loss": 0.0004, "step": 2364 }, { "epoch": 0.6921275973075798, "grad_norm": 0.05145770311355591, "learning_rate": 3.2696810067310506e-05, "loss": 0.0007, "step": 2365 }, { "epoch": 0.6924202516827627, "grad_norm": 0.027379680424928665, "learning_rate": 3.2689493707930934e-05, "loss": 0.0004, "step": 2366 }, { "epoch": 0.6927129060579456, "grad_norm": 0.010610656812787056, "learning_rate": 3.268217734855136e-05, "loss": 0.0002, "step": 2367 }, { "epoch": 0.6930055604331284, "grad_norm": 0.06512395292520523, "learning_rate": 3.267486098917179e-05, "loss": 0.0005, "step": 2368 }, { "epoch": 0.6932982148083113, "grad_norm": 0.004508286714553833, "learning_rate": 3.266754462979222e-05, "loss": 0.0001, "step": 2369 }, { "epoch": 0.6935908691834943, "grad_norm": 0.04418289661407471, "learning_rate": 3.2660228270412645e-05, "loss": 0.0004, "step": 2370 }, { "epoch": 0.6938835235586772, "grad_norm": 0.04784709960222244, "learning_rate": 3.265291191103307e-05, "loss": 0.0002, "step": 2371 }, { "epoch": 0.6941761779338601, "grad_norm": 5.457385540008545, "learning_rate": 3.2645595551653494e-05, "loss": 0.0266, "step": 2372 }, { "epoch": 0.694468832309043, "grad_norm": 0.004906444810330868, "learning_rate": 3.263827919227392e-05, "loss": 0.0001, "step": 2373 }, { "epoch": 0.6947614866842259, "grad_norm": 0.0034356743562966585, "learning_rate": 3.263096283289435e-05, "loss": 0.0001, "step": 2374 }, { "epoch": 0.6950541410594089, "grad_norm": 0.0035204212181270123, "learning_rate": 3.262364647351478e-05, "loss": 0.0001, "step": 2375 }, { "epoch": 0.6953467954345918, "grad_norm": 0.006286072079092264, "learning_rate": 3.2616330114135206e-05, "loss": 0.0001, "step": 2376 }, { "epoch": 0.6956394498097747, "grad_norm": 0.021691780537366867, "learning_rate": 3.2609013754755634e-05, "loss": 0.0002, "step": 2377 }, { "epoch": 0.6959321041849575, "grad_norm": 0.0023452823515981436, "learning_rate": 3.260169739537606e-05, "loss": 0.0001, "step": 2378 }, { "epoch": 0.6962247585601404, "grad_norm": 0.0005349525599740446, "learning_rate": 3.259438103599649e-05, "loss": 0.0, "step": 2379 }, { "epoch": 0.6965174129353234, "grad_norm": 0.01140750851482153, "learning_rate": 3.258706467661692e-05, "loss": 0.0002, "step": 2380 }, { "epoch": 0.6968100673105063, "grad_norm": 5.882567882537842, "learning_rate": 3.2579748317237346e-05, "loss": 0.0526, "step": 2381 }, { "epoch": 0.6971027216856892, "grad_norm": 0.05859658122062683, "learning_rate": 3.2572431957857774e-05, "loss": 0.0006, "step": 2382 }, { "epoch": 0.6973953760608721, "grad_norm": 0.03432149067521095, "learning_rate": 3.2565115598478195e-05, "loss": 0.0004, "step": 2383 }, { "epoch": 0.697688030436055, "grad_norm": 2.2817699909210205, "learning_rate": 3.255779923909862e-05, "loss": 0.0092, "step": 2384 }, { "epoch": 0.6979806848112379, "grad_norm": 3.7983312606811523, "learning_rate": 3.255048287971905e-05, "loss": 0.2313, "step": 2385 }, { "epoch": 0.6982733391864209, "grad_norm": 11.28543472290039, "learning_rate": 3.254316652033948e-05, "loss": 0.1115, "step": 2386 }, { "epoch": 0.6985659935616038, "grad_norm": 0.011281290091574192, "learning_rate": 3.253585016095991e-05, "loss": 0.0002, "step": 2387 }, { "epoch": 0.6988586479367866, "grad_norm": 0.016820495948195457, "learning_rate": 3.2528533801580335e-05, "loss": 0.0002, "step": 2388 }, { "epoch": 0.6991513023119695, "grad_norm": 0.07670903205871582, "learning_rate": 3.252121744220076e-05, "loss": 0.0007, "step": 2389 }, { "epoch": 0.6994439566871524, "grad_norm": 2.261282205581665, "learning_rate": 3.251390108282119e-05, "loss": 0.009, "step": 2390 }, { "epoch": 0.6997366110623354, "grad_norm": 6.105201244354248, "learning_rate": 3.250658472344162e-05, "loss": 0.057, "step": 2391 }, { "epoch": 0.7000292654375183, "grad_norm": 0.01005995087325573, "learning_rate": 3.249926836406205e-05, "loss": 0.0002, "step": 2392 }, { "epoch": 0.7003219198127012, "grad_norm": 5.082892894744873, "learning_rate": 3.249195200468247e-05, "loss": 0.0362, "step": 2393 }, { "epoch": 0.7006145741878841, "grad_norm": 0.10898295044898987, "learning_rate": 3.2484635645302896e-05, "loss": 0.0009, "step": 2394 }, { "epoch": 0.700907228563067, "grad_norm": 0.008598077110946178, "learning_rate": 3.2477319285923324e-05, "loss": 0.0002, "step": 2395 }, { "epoch": 0.70119988293825, "grad_norm": 0.0072875553742051125, "learning_rate": 3.247000292654375e-05, "loss": 0.0002, "step": 2396 }, { "epoch": 0.7014925373134329, "grad_norm": 0.01747208461165428, "learning_rate": 3.246268656716418e-05, "loss": 0.0003, "step": 2397 }, { "epoch": 0.7017851916886158, "grad_norm": 0.06051646173000336, "learning_rate": 3.245537020778461e-05, "loss": 0.0007, "step": 2398 }, { "epoch": 0.7020778460637986, "grad_norm": 1.8474124670028687, "learning_rate": 3.2448053848405036e-05, "loss": 0.0113, "step": 2399 }, { "epoch": 0.7023705004389815, "grad_norm": 0.006191436201334, "learning_rate": 3.2440737489025464e-05, "loss": 0.0001, "step": 2400 }, { "epoch": 0.7026631548141645, "grad_norm": 0.22403065860271454, "learning_rate": 3.243342112964589e-05, "loss": 0.0017, "step": 2401 }, { "epoch": 0.7029558091893474, "grad_norm": 0.09544986486434937, "learning_rate": 3.242610477026632e-05, "loss": 0.0009, "step": 2402 }, { "epoch": 0.7032484635645303, "grad_norm": 0.0061316415667533875, "learning_rate": 3.241878841088675e-05, "loss": 0.0001, "step": 2403 }, { "epoch": 0.7035411179397132, "grad_norm": 0.0035257881972938776, "learning_rate": 3.241147205150717e-05, "loss": 0.0001, "step": 2404 }, { "epoch": 0.7038337723148961, "grad_norm": 0.0012417975813150406, "learning_rate": 3.2404155692127597e-05, "loss": 0.0, "step": 2405 }, { "epoch": 0.704126426690079, "grad_norm": 0.006698724813759327, "learning_rate": 3.2396839332748024e-05, "loss": 0.0002, "step": 2406 }, { "epoch": 0.704419081065262, "grad_norm": 0.005079933907836676, "learning_rate": 3.238952297336845e-05, "loss": 0.0001, "step": 2407 }, { "epoch": 0.7047117354404449, "grad_norm": 0.005624685902148485, "learning_rate": 3.238220661398888e-05, "loss": 0.0002, "step": 2408 }, { "epoch": 0.7050043898156277, "grad_norm": 3.10202693939209, "learning_rate": 3.237489025460931e-05, "loss": 0.0255, "step": 2409 }, { "epoch": 0.7052970441908106, "grad_norm": 0.003435496473684907, "learning_rate": 3.2367573895229736e-05, "loss": 0.0001, "step": 2410 }, { "epoch": 0.7055896985659935, "grad_norm": 0.005849195644259453, "learning_rate": 3.2360257535850164e-05, "loss": 0.0001, "step": 2411 }, { "epoch": 0.7058823529411765, "grad_norm": 0.0022957089822739363, "learning_rate": 3.235294117647059e-05, "loss": 0.0, "step": 2412 }, { "epoch": 0.7061750073163594, "grad_norm": 0.0034467079676687717, "learning_rate": 3.234562481709102e-05, "loss": 0.0001, "step": 2413 }, { "epoch": 0.7064676616915423, "grad_norm": 0.0022936102468520403, "learning_rate": 3.233830845771145e-05, "loss": 0.0001, "step": 2414 }, { "epoch": 0.7067603160667252, "grad_norm": 0.04932126775383949, "learning_rate": 3.233099209833187e-05, "loss": 0.0003, "step": 2415 }, { "epoch": 0.7070529704419081, "grad_norm": 0.001915495377033949, "learning_rate": 3.23236757389523e-05, "loss": 0.0001, "step": 2416 }, { "epoch": 0.7073456248170911, "grad_norm": 0.13249242305755615, "learning_rate": 3.2316359379572725e-05, "loss": 0.0006, "step": 2417 }, { "epoch": 0.707638279192274, "grad_norm": 0.0058246999979019165, "learning_rate": 3.230904302019315e-05, "loss": 0.0001, "step": 2418 }, { "epoch": 0.7079309335674568, "grad_norm": 7.282301902770996, "learning_rate": 3.230172666081358e-05, "loss": 0.0545, "step": 2419 }, { "epoch": 0.7082235879426397, "grad_norm": 0.04346953332424164, "learning_rate": 3.229441030143401e-05, "loss": 0.0002, "step": 2420 }, { "epoch": 0.7085162423178226, "grad_norm": 0.44268798828125, "learning_rate": 3.228709394205444e-05, "loss": 0.0019, "step": 2421 }, { "epoch": 0.7088088966930055, "grad_norm": 0.0017859560903161764, "learning_rate": 3.2279777582674865e-05, "loss": 0.0, "step": 2422 }, { "epoch": 0.7091015510681885, "grad_norm": 0.022358566522598267, "learning_rate": 3.227246122329529e-05, "loss": 0.0002, "step": 2423 }, { "epoch": 0.7093942054433714, "grad_norm": 0.005604309495538473, "learning_rate": 3.226514486391572e-05, "loss": 0.0001, "step": 2424 }, { "epoch": 0.7096868598185543, "grad_norm": 0.011078868061304092, "learning_rate": 3.225782850453614e-05, "loss": 0.0001, "step": 2425 }, { "epoch": 0.7099795141937372, "grad_norm": 0.10173115134239197, "learning_rate": 3.225051214515657e-05, "loss": 0.0004, "step": 2426 }, { "epoch": 0.7102721685689201, "grad_norm": 0.001832267502322793, "learning_rate": 3.2243195785777e-05, "loss": 0.0, "step": 2427 }, { "epoch": 0.7105648229441031, "grad_norm": 0.0018338344525545835, "learning_rate": 3.2235879426397426e-05, "loss": 0.0, "step": 2428 }, { "epoch": 0.710857477319286, "grad_norm": 0.004381049424409866, "learning_rate": 3.2228563067017854e-05, "loss": 0.0001, "step": 2429 }, { "epoch": 0.7111501316944688, "grad_norm": 6.8078293800354, "learning_rate": 3.222124670763828e-05, "loss": 0.0201, "step": 2430 }, { "epoch": 0.7114427860696517, "grad_norm": 0.0014487294247373939, "learning_rate": 3.221393034825871e-05, "loss": 0.0, "step": 2431 }, { "epoch": 0.7117354404448346, "grad_norm": 0.001356608932837844, "learning_rate": 3.220661398887914e-05, "loss": 0.0, "step": 2432 }, { "epoch": 0.7120280948200176, "grad_norm": 0.0017179345013573766, "learning_rate": 3.2199297629499566e-05, "loss": 0.0, "step": 2433 }, { "epoch": 0.7123207491952005, "grad_norm": 0.0007872664136812091, "learning_rate": 3.2191981270119994e-05, "loss": 0.0, "step": 2434 }, { "epoch": 0.7126134035703834, "grad_norm": 0.004386863671243191, "learning_rate": 3.218466491074042e-05, "loss": 0.0001, "step": 2435 }, { "epoch": 0.7129060579455663, "grad_norm": 0.0011565539753064513, "learning_rate": 3.217734855136084e-05, "loss": 0.0, "step": 2436 }, { "epoch": 0.7131987123207492, "grad_norm": 0.0011458956869319081, "learning_rate": 3.217003219198127e-05, "loss": 0.0, "step": 2437 }, { "epoch": 0.713491366695932, "grad_norm": 0.0006992155686020851, "learning_rate": 3.21627158326017e-05, "loss": 0.0, "step": 2438 }, { "epoch": 0.713784021071115, "grad_norm": 0.0013631065376102924, "learning_rate": 3.2155399473222127e-05, "loss": 0.0, "step": 2439 }, { "epoch": 0.7140766754462979, "grad_norm": 0.0007857891614548862, "learning_rate": 3.2148083113842554e-05, "loss": 0.0, "step": 2440 }, { "epoch": 0.7143693298214808, "grad_norm": 0.04865026846528053, "learning_rate": 3.214076675446298e-05, "loss": 0.0003, "step": 2441 }, { "epoch": 0.7146619841966637, "grad_norm": 6.565481185913086, "learning_rate": 3.213345039508341e-05, "loss": 0.2418, "step": 2442 }, { "epoch": 0.7149546385718466, "grad_norm": 0.0007474518497474492, "learning_rate": 3.212613403570384e-05, "loss": 0.0, "step": 2443 }, { "epoch": 0.7152472929470296, "grad_norm": 0.0006861622678115964, "learning_rate": 3.2118817676324266e-05, "loss": 0.0, "step": 2444 }, { "epoch": 0.7155399473222125, "grad_norm": 0.017203431576490402, "learning_rate": 3.2111501316944694e-05, "loss": 0.0002, "step": 2445 }, { "epoch": 0.7158326016973954, "grad_norm": 0.0011003983672708273, "learning_rate": 3.210418495756512e-05, "loss": 0.0, "step": 2446 }, { "epoch": 0.7161252560725783, "grad_norm": 0.00291343592107296, "learning_rate": 3.209686859818554e-05, "loss": 0.0001, "step": 2447 }, { "epoch": 0.7164179104477612, "grad_norm": 0.0013620432000607252, "learning_rate": 3.208955223880597e-05, "loss": 0.0, "step": 2448 }, { "epoch": 0.7167105648229442, "grad_norm": 0.4714560806751251, "learning_rate": 3.20822358794264e-05, "loss": 0.0012, "step": 2449 }, { "epoch": 0.717003219198127, "grad_norm": 0.018954362720251083, "learning_rate": 3.207491952004683e-05, "loss": 0.0001, "step": 2450 }, { "epoch": 0.7172958735733099, "grad_norm": 0.001146254246123135, "learning_rate": 3.2067603160667255e-05, "loss": 0.0, "step": 2451 }, { "epoch": 0.7175885279484928, "grad_norm": 0.001986933872103691, "learning_rate": 3.206028680128768e-05, "loss": 0.0, "step": 2452 }, { "epoch": 0.7178811823236757, "grad_norm": 0.002456973074004054, "learning_rate": 3.205297044190811e-05, "loss": 0.0001, "step": 2453 }, { "epoch": 0.7181738366988587, "grad_norm": 0.0031812170054763556, "learning_rate": 3.204565408252854e-05, "loss": 0.0001, "step": 2454 }, { "epoch": 0.7184664910740416, "grad_norm": 0.0009945594938471913, "learning_rate": 3.203833772314897e-05, "loss": 0.0, "step": 2455 }, { "epoch": 0.7187591454492245, "grad_norm": 0.0007926966063678265, "learning_rate": 3.2031021363769395e-05, "loss": 0.0, "step": 2456 }, { "epoch": 0.7190517998244074, "grad_norm": 0.003633065614849329, "learning_rate": 3.2023705004389816e-05, "loss": 0.0001, "step": 2457 }, { "epoch": 0.7193444541995903, "grad_norm": 0.0019254813669249415, "learning_rate": 3.2016388645010244e-05, "loss": 0.0, "step": 2458 }, { "epoch": 0.7196371085747731, "grad_norm": 0.0014255027053877711, "learning_rate": 3.200907228563067e-05, "loss": 0.0, "step": 2459 }, { "epoch": 0.7199297629499561, "grad_norm": 0.0015393865760415792, "learning_rate": 3.20017559262511e-05, "loss": 0.0, "step": 2460 }, { "epoch": 0.720222417325139, "grad_norm": 0.0018572082044556737, "learning_rate": 3.199443956687153e-05, "loss": 0.0, "step": 2461 }, { "epoch": 0.7205150717003219, "grad_norm": 5.579071998596191, "learning_rate": 3.1987123207491956e-05, "loss": 0.0084, "step": 2462 }, { "epoch": 0.7208077260755048, "grad_norm": 0.0015191318234428763, "learning_rate": 3.1979806848112384e-05, "loss": 0.0, "step": 2463 }, { "epoch": 0.7211003804506877, "grad_norm": 0.0009283102699555457, "learning_rate": 3.197249048873281e-05, "loss": 0.0, "step": 2464 }, { "epoch": 0.7213930348258707, "grad_norm": 0.0024321016389876604, "learning_rate": 3.196517412935324e-05, "loss": 0.0, "step": 2465 }, { "epoch": 0.7216856892010536, "grad_norm": 0.005111926235258579, "learning_rate": 3.195785776997367e-05, "loss": 0.0001, "step": 2466 }, { "epoch": 0.7219783435762365, "grad_norm": 0.000839318847283721, "learning_rate": 3.1950541410594096e-05, "loss": 0.0, "step": 2467 }, { "epoch": 0.7222709979514194, "grad_norm": 0.0005801770021207631, "learning_rate": 3.194322505121452e-05, "loss": 0.0, "step": 2468 }, { "epoch": 0.7225636523266022, "grad_norm": 0.0018791498150676489, "learning_rate": 3.1935908691834945e-05, "loss": 0.0001, "step": 2469 }, { "epoch": 0.7228563067017852, "grad_norm": 0.00031380855944007635, "learning_rate": 3.192859233245537e-05, "loss": 0.0, "step": 2470 }, { "epoch": 0.7231489610769681, "grad_norm": 9.160697937011719, "learning_rate": 3.19212759730758e-05, "loss": 0.1265, "step": 2471 }, { "epoch": 0.723441615452151, "grad_norm": 0.000885524321347475, "learning_rate": 3.191395961369623e-05, "loss": 0.0, "step": 2472 }, { "epoch": 0.7237342698273339, "grad_norm": 0.0005429118173196912, "learning_rate": 3.1906643254316656e-05, "loss": 0.0, "step": 2473 }, { "epoch": 0.7240269242025168, "grad_norm": 0.0030722729861736298, "learning_rate": 3.1899326894937084e-05, "loss": 0.0, "step": 2474 }, { "epoch": 0.7243195785776997, "grad_norm": 0.5611380934715271, "learning_rate": 3.189201053555751e-05, "loss": 0.0011, "step": 2475 }, { "epoch": 0.7246122329528827, "grad_norm": 0.0014798748306930065, "learning_rate": 3.188469417617794e-05, "loss": 0.0, "step": 2476 }, { "epoch": 0.7249048873280656, "grad_norm": 0.0015373347559943795, "learning_rate": 3.187737781679837e-05, "loss": 0.0, "step": 2477 }, { "epoch": 0.7251975417032485, "grad_norm": 0.007843293249607086, "learning_rate": 3.187006145741879e-05, "loss": 0.0001, "step": 2478 }, { "epoch": 0.7254901960784313, "grad_norm": 0.06513158231973648, "learning_rate": 3.186274509803922e-05, "loss": 0.0006, "step": 2479 }, { "epoch": 0.7257828504536142, "grad_norm": 0.0035666245967149734, "learning_rate": 3.1855428738659645e-05, "loss": 0.0001, "step": 2480 }, { "epoch": 0.7260755048287972, "grad_norm": 0.0018813954666256905, "learning_rate": 3.184811237928007e-05, "loss": 0.0001, "step": 2481 }, { "epoch": 0.7263681592039801, "grad_norm": 0.0012866331962868571, "learning_rate": 3.18407960199005e-05, "loss": 0.0, "step": 2482 }, { "epoch": 0.726660813579163, "grad_norm": 0.002225383883342147, "learning_rate": 3.183347966052093e-05, "loss": 0.0, "step": 2483 }, { "epoch": 0.7269534679543459, "grad_norm": 0.006104538217186928, "learning_rate": 3.182616330114136e-05, "loss": 0.0001, "step": 2484 }, { "epoch": 0.7272461223295288, "grad_norm": 2.0093445777893066, "learning_rate": 3.1818846941761785e-05, "loss": 0.0038, "step": 2485 }, { "epoch": 0.7275387767047118, "grad_norm": 0.0065629081800580025, "learning_rate": 3.1811530582382206e-05, "loss": 0.0001, "step": 2486 }, { "epoch": 0.7278314310798947, "grad_norm": 4.3598103523254395, "learning_rate": 3.1804214223002634e-05, "loss": 0.0149, "step": 2487 }, { "epoch": 0.7281240854550776, "grad_norm": 0.004508009646087885, "learning_rate": 3.179689786362306e-05, "loss": 0.0001, "step": 2488 }, { "epoch": 0.7284167398302605, "grad_norm": 0.00616801343858242, "learning_rate": 3.178958150424349e-05, "loss": 0.0001, "step": 2489 }, { "epoch": 0.7287093942054433, "grad_norm": 1.723605751991272, "learning_rate": 3.178226514486392e-05, "loss": 0.0039, "step": 2490 }, { "epoch": 0.7290020485806262, "grad_norm": 0.0013585267588496208, "learning_rate": 3.1774948785484346e-05, "loss": 0.0, "step": 2491 }, { "epoch": 0.7292947029558092, "grad_norm": 0.004794291220605373, "learning_rate": 3.1767632426104774e-05, "loss": 0.0001, "step": 2492 }, { "epoch": 0.7295873573309921, "grad_norm": 0.003616244299337268, "learning_rate": 3.17603160667252e-05, "loss": 0.0001, "step": 2493 }, { "epoch": 0.729880011706175, "grad_norm": 0.004236791282892227, "learning_rate": 3.175299970734562e-05, "loss": 0.0001, "step": 2494 }, { "epoch": 0.7301726660813579, "grad_norm": 0.0011380615178495646, "learning_rate": 3.174568334796605e-05, "loss": 0.0, "step": 2495 }, { "epoch": 0.7304653204565408, "grad_norm": 0.0005486432346515357, "learning_rate": 3.173836698858648e-05, "loss": 0.0, "step": 2496 }, { "epoch": 0.7307579748317238, "grad_norm": 0.0009223229717463255, "learning_rate": 3.173105062920691e-05, "loss": 0.0, "step": 2497 }, { "epoch": 0.7310506292069067, "grad_norm": 0.058350205421447754, "learning_rate": 3.1723734269827335e-05, "loss": 0.0002, "step": 2498 }, { "epoch": 0.7313432835820896, "grad_norm": 0.0006941952742636204, "learning_rate": 3.171641791044776e-05, "loss": 0.0, "step": 2499 }, { "epoch": 0.7316359379572724, "grad_norm": 0.004078224301338196, "learning_rate": 3.170910155106819e-05, "loss": 0.0001, "step": 2500 }, { "epoch": 0.7319285923324553, "grad_norm": 0.014169774949550629, "learning_rate": 3.170178519168862e-05, "loss": 0.0002, "step": 2501 }, { "epoch": 0.7322212467076383, "grad_norm": 0.004702478647232056, "learning_rate": 3.169446883230904e-05, "loss": 0.0, "step": 2502 }, { "epoch": 0.7325139010828212, "grad_norm": 0.00040812077349983156, "learning_rate": 3.168715247292947e-05, "loss": 0.0, "step": 2503 }, { "epoch": 0.7328065554580041, "grad_norm": 0.016532549634575844, "learning_rate": 3.1679836113549896e-05, "loss": 0.0001, "step": 2504 }, { "epoch": 0.733099209833187, "grad_norm": 0.0008503179415129125, "learning_rate": 3.1672519754170324e-05, "loss": 0.0, "step": 2505 }, { "epoch": 0.7333918642083699, "grad_norm": 0.024454286321997643, "learning_rate": 3.166520339479075e-05, "loss": 0.0001, "step": 2506 }, { "epoch": 0.7336845185835528, "grad_norm": 10.554803848266602, "learning_rate": 3.165788703541118e-05, "loss": 0.1, "step": 2507 }, { "epoch": 0.7339771729587358, "grad_norm": 0.0017941119149327278, "learning_rate": 3.165057067603161e-05, "loss": 0.0, "step": 2508 }, { "epoch": 0.7342698273339187, "grad_norm": 0.001540915691293776, "learning_rate": 3.1643254316652036e-05, "loss": 0.0, "step": 2509 }, { "epoch": 0.7345624817091015, "grad_norm": 0.0006537814042530954, "learning_rate": 3.163593795727246e-05, "loss": 0.0, "step": 2510 }, { "epoch": 0.7348551360842844, "grad_norm": 0.0022911422420293093, "learning_rate": 3.1628621597892885e-05, "loss": 0.0, "step": 2511 }, { "epoch": 0.7351477904594673, "grad_norm": 0.0009376522502861917, "learning_rate": 3.162130523851331e-05, "loss": 0.0, "step": 2512 }, { "epoch": 0.7354404448346503, "grad_norm": 0.004998430144041777, "learning_rate": 3.161398887913374e-05, "loss": 0.0001, "step": 2513 }, { "epoch": 0.7357330992098332, "grad_norm": 0.0005815597833134234, "learning_rate": 3.160667251975417e-05, "loss": 0.0, "step": 2514 }, { "epoch": 0.7360257535850161, "grad_norm": 0.006281370297074318, "learning_rate": 3.1599356160374597e-05, "loss": 0.0001, "step": 2515 }, { "epoch": 0.736318407960199, "grad_norm": 0.0013787158532068133, "learning_rate": 3.1592039800995024e-05, "loss": 0.0, "step": 2516 }, { "epoch": 0.7366110623353819, "grad_norm": 0.0031138560734689236, "learning_rate": 3.158472344161545e-05, "loss": 0.0, "step": 2517 }, { "epoch": 0.7369037167105649, "grad_norm": 5.179323673248291, "learning_rate": 3.157740708223588e-05, "loss": 0.1861, "step": 2518 }, { "epoch": 0.7371963710857478, "grad_norm": 0.0007730225916020572, "learning_rate": 3.157009072285631e-05, "loss": 0.0, "step": 2519 }, { "epoch": 0.7374890254609306, "grad_norm": 0.0030811233446002007, "learning_rate": 3.1562774363476736e-05, "loss": 0.0, "step": 2520 }, { "epoch": 0.7377816798361135, "grad_norm": 7.347812652587891, "learning_rate": 3.155545800409716e-05, "loss": 0.0458, "step": 2521 }, { "epoch": 0.7380743342112964, "grad_norm": 0.001976597122848034, "learning_rate": 3.1548141644717585e-05, "loss": 0.0001, "step": 2522 }, { "epoch": 0.7383669885864794, "grad_norm": 0.0035360397305339575, "learning_rate": 3.154082528533801e-05, "loss": 0.0001, "step": 2523 }, { "epoch": 0.7386596429616623, "grad_norm": 0.028700485825538635, "learning_rate": 3.153350892595844e-05, "loss": 0.0002, "step": 2524 }, { "epoch": 0.7389522973368452, "grad_norm": 0.04599932208657265, "learning_rate": 3.152619256657887e-05, "loss": 0.0003, "step": 2525 }, { "epoch": 0.7392449517120281, "grad_norm": 0.020827241241931915, "learning_rate": 3.15188762071993e-05, "loss": 0.0002, "step": 2526 }, { "epoch": 0.739537606087211, "grad_norm": 0.004416859708726406, "learning_rate": 3.1511559847819725e-05, "loss": 0.0, "step": 2527 }, { "epoch": 0.7398302604623939, "grad_norm": 0.00646563433110714, "learning_rate": 3.150424348844015e-05, "loss": 0.0001, "step": 2528 }, { "epoch": 0.7401229148375769, "grad_norm": 0.0018413313664495945, "learning_rate": 3.149692712906058e-05, "loss": 0.0, "step": 2529 }, { "epoch": 0.7404155692127597, "grad_norm": 0.002748123137280345, "learning_rate": 3.148961076968101e-05, "loss": 0.0001, "step": 2530 }, { "epoch": 0.7407082235879426, "grad_norm": 0.010115600191056728, "learning_rate": 3.148229441030143e-05, "loss": 0.0001, "step": 2531 }, { "epoch": 0.7410008779631255, "grad_norm": 0.05594480037689209, "learning_rate": 3.147497805092186e-05, "loss": 0.0003, "step": 2532 }, { "epoch": 0.7412935323383084, "grad_norm": 0.000752597872633487, "learning_rate": 3.1467661691542286e-05, "loss": 0.0, "step": 2533 }, { "epoch": 0.7415861867134914, "grad_norm": 0.0009483325993642211, "learning_rate": 3.1460345332162714e-05, "loss": 0.0, "step": 2534 }, { "epoch": 0.7418788410886743, "grad_norm": 0.001707609393633902, "learning_rate": 3.145302897278314e-05, "loss": 0.0, "step": 2535 }, { "epoch": 0.7421714954638572, "grad_norm": 0.012230713851749897, "learning_rate": 3.144571261340357e-05, "loss": 0.0001, "step": 2536 }, { "epoch": 0.7424641498390401, "grad_norm": 0.0016634443309158087, "learning_rate": 3.1438396254024e-05, "loss": 0.0, "step": 2537 }, { "epoch": 0.742756804214223, "grad_norm": 0.008607159368693829, "learning_rate": 3.1431079894644426e-05, "loss": 0.0002, "step": 2538 }, { "epoch": 0.743049458589406, "grad_norm": 0.01903282105922699, "learning_rate": 3.1423763535264854e-05, "loss": 0.0002, "step": 2539 }, { "epoch": 0.7433421129645889, "grad_norm": 0.001686413073912263, "learning_rate": 3.141644717588528e-05, "loss": 0.0, "step": 2540 }, { "epoch": 0.7436347673397717, "grad_norm": 0.001426143106073141, "learning_rate": 3.140913081650571e-05, "loss": 0.0, "step": 2541 }, { "epoch": 0.7439274217149546, "grad_norm": 0.002940902952104807, "learning_rate": 3.140181445712613e-05, "loss": 0.0001, "step": 2542 }, { "epoch": 0.7442200760901375, "grad_norm": 0.0014354386366903782, "learning_rate": 3.139449809774656e-05, "loss": 0.0, "step": 2543 }, { "epoch": 0.7445127304653204, "grad_norm": 0.0016779705183580518, "learning_rate": 3.138718173836699e-05, "loss": 0.0, "step": 2544 }, { "epoch": 0.7448053848405034, "grad_norm": 0.0019332582596689463, "learning_rate": 3.1379865378987415e-05, "loss": 0.0001, "step": 2545 }, { "epoch": 0.7450980392156863, "grad_norm": 0.007013517431914806, "learning_rate": 3.137254901960784e-05, "loss": 0.0001, "step": 2546 }, { "epoch": 0.7453906935908692, "grad_norm": 0.001592212007381022, "learning_rate": 3.136523266022827e-05, "loss": 0.0, "step": 2547 }, { "epoch": 0.7456833479660521, "grad_norm": 0.0041719297878444195, "learning_rate": 3.13579163008487e-05, "loss": 0.0001, "step": 2548 }, { "epoch": 0.745976002341235, "grad_norm": 0.0015470386715605855, "learning_rate": 3.1350599941469126e-05, "loss": 0.0, "step": 2549 }, { "epoch": 0.746268656716418, "grad_norm": 0.14043869078159332, "learning_rate": 3.1343283582089554e-05, "loss": 0.0007, "step": 2550 }, { "epoch": 0.7465613110916008, "grad_norm": 0.012669500894844532, "learning_rate": 3.133596722270998e-05, "loss": 0.0001, "step": 2551 }, { "epoch": 0.7468539654667837, "grad_norm": 0.005013457499444485, "learning_rate": 3.132865086333041e-05, "loss": 0.0001, "step": 2552 }, { "epoch": 0.7471466198419666, "grad_norm": 0.00242857588455081, "learning_rate": 3.132133450395083e-05, "loss": 0.0001, "step": 2553 }, { "epoch": 0.7474392742171495, "grad_norm": 0.001480822917073965, "learning_rate": 3.131401814457126e-05, "loss": 0.0, "step": 2554 }, { "epoch": 0.7477319285923325, "grad_norm": 0.005603375378996134, "learning_rate": 3.130670178519169e-05, "loss": 0.0001, "step": 2555 }, { "epoch": 0.7480245829675154, "grad_norm": 0.018584266304969788, "learning_rate": 3.1299385425812115e-05, "loss": 0.0002, "step": 2556 }, { "epoch": 0.7483172373426983, "grad_norm": 0.002079001860693097, "learning_rate": 3.129206906643254e-05, "loss": 0.0, "step": 2557 }, { "epoch": 0.7486098917178812, "grad_norm": 0.018753597512841225, "learning_rate": 3.128475270705297e-05, "loss": 0.0001, "step": 2558 }, { "epoch": 0.7489025460930641, "grad_norm": 0.0020891628228127956, "learning_rate": 3.12774363476734e-05, "loss": 0.0001, "step": 2559 }, { "epoch": 0.749195200468247, "grad_norm": 0.0036165923811495304, "learning_rate": 3.127011998829383e-05, "loss": 0.0001, "step": 2560 }, { "epoch": 0.74948785484343, "grad_norm": 0.0031803473830223083, "learning_rate": 3.1262803628914255e-05, "loss": 0.0001, "step": 2561 }, { "epoch": 0.7497805092186128, "grad_norm": 0.0026807805988937616, "learning_rate": 3.125548726953468e-05, "loss": 0.0001, "step": 2562 }, { "epoch": 0.7500731635937957, "grad_norm": 0.03576747328042984, "learning_rate": 3.1248170910155104e-05, "loss": 0.0002, "step": 2563 }, { "epoch": 0.7503658179689786, "grad_norm": 0.00326459645293653, "learning_rate": 3.124085455077553e-05, "loss": 0.0001, "step": 2564 }, { "epoch": 0.7506584723441615, "grad_norm": 0.0017964920261874795, "learning_rate": 3.123353819139596e-05, "loss": 0.0, "step": 2565 }, { "epoch": 0.7509511267193445, "grad_norm": 0.0020052739419043064, "learning_rate": 3.122622183201639e-05, "loss": 0.0001, "step": 2566 }, { "epoch": 0.7512437810945274, "grad_norm": 0.0005410740268416703, "learning_rate": 3.1218905472636816e-05, "loss": 0.0, "step": 2567 }, { "epoch": 0.7515364354697103, "grad_norm": 0.07815296202898026, "learning_rate": 3.1211589113257244e-05, "loss": 0.0003, "step": 2568 }, { "epoch": 0.7518290898448932, "grad_norm": 0.006977943703532219, "learning_rate": 3.120427275387767e-05, "loss": 0.0001, "step": 2569 }, { "epoch": 0.752121744220076, "grad_norm": 0.011482861824333668, "learning_rate": 3.11969563944981e-05, "loss": 0.0001, "step": 2570 }, { "epoch": 0.752414398595259, "grad_norm": 0.00260129664093256, "learning_rate": 3.118964003511853e-05, "loss": 0.0, "step": 2571 }, { "epoch": 0.7527070529704419, "grad_norm": 0.0028190468437969685, "learning_rate": 3.1182323675738956e-05, "loss": 0.0001, "step": 2572 }, { "epoch": 0.7529997073456248, "grad_norm": 0.006121335085481405, "learning_rate": 3.1175007316359384e-05, "loss": 0.0001, "step": 2573 }, { "epoch": 0.7532923617208077, "grad_norm": 0.0015971810789778829, "learning_rate": 3.1167690956979805e-05, "loss": 0.0, "step": 2574 }, { "epoch": 0.7535850160959906, "grad_norm": 0.016662990674376488, "learning_rate": 3.116037459760023e-05, "loss": 0.0002, "step": 2575 }, { "epoch": 0.7538776704711735, "grad_norm": 0.000863375433254987, "learning_rate": 3.115305823822066e-05, "loss": 0.0, "step": 2576 }, { "epoch": 0.7541703248463565, "grad_norm": 0.0011870183516293764, "learning_rate": 3.114574187884109e-05, "loss": 0.0, "step": 2577 }, { "epoch": 0.7544629792215394, "grad_norm": 0.004520727321505547, "learning_rate": 3.113842551946152e-05, "loss": 0.0001, "step": 2578 }, { "epoch": 0.7547556335967223, "grad_norm": 0.006965481676161289, "learning_rate": 3.1131109160081945e-05, "loss": 0.0001, "step": 2579 }, { "epoch": 0.7550482879719052, "grad_norm": 0.001343530253507197, "learning_rate": 3.112379280070237e-05, "loss": 0.0, "step": 2580 }, { "epoch": 0.755340942347088, "grad_norm": 0.0017632056260481477, "learning_rate": 3.11164764413228e-05, "loss": 0.0, "step": 2581 }, { "epoch": 0.755633596722271, "grad_norm": 0.0017508604796603322, "learning_rate": 3.110916008194323e-05, "loss": 0.0, "step": 2582 }, { "epoch": 0.7559262510974539, "grad_norm": 0.0011078683892264962, "learning_rate": 3.1101843722563656e-05, "loss": 0.0, "step": 2583 }, { "epoch": 0.7562189054726368, "grad_norm": 0.0008804899407550693, "learning_rate": 3.109452736318408e-05, "loss": 0.0, "step": 2584 }, { "epoch": 0.7565115598478197, "grad_norm": 0.07887193560600281, "learning_rate": 3.1087211003804506e-05, "loss": 0.0004, "step": 2585 }, { "epoch": 0.7568042142230026, "grad_norm": 0.005430367775261402, "learning_rate": 3.1079894644424934e-05, "loss": 0.0001, "step": 2586 }, { "epoch": 0.7570968685981856, "grad_norm": 0.0016514130402356386, "learning_rate": 3.107257828504536e-05, "loss": 0.0, "step": 2587 }, { "epoch": 0.7573895229733685, "grad_norm": 0.00218368717469275, "learning_rate": 3.106526192566579e-05, "loss": 0.0, "step": 2588 }, { "epoch": 0.7576821773485514, "grad_norm": 0.0007050017593428493, "learning_rate": 3.105794556628622e-05, "loss": 0.0, "step": 2589 }, { "epoch": 0.7579748317237343, "grad_norm": 0.0005608153296634555, "learning_rate": 3.1050629206906645e-05, "loss": 0.0, "step": 2590 }, { "epoch": 0.7582674860989171, "grad_norm": 0.000953198061324656, "learning_rate": 3.104331284752707e-05, "loss": 0.0, "step": 2591 }, { "epoch": 0.7585601404741001, "grad_norm": 0.001409333199262619, "learning_rate": 3.10359964881475e-05, "loss": 0.0, "step": 2592 }, { "epoch": 0.758852794849283, "grad_norm": 0.0007091196603141725, "learning_rate": 3.102868012876793e-05, "loss": 0.0, "step": 2593 }, { "epoch": 0.7591454492244659, "grad_norm": 0.0006789021426811814, "learning_rate": 3.102136376938836e-05, "loss": 0.0, "step": 2594 }, { "epoch": 0.7594381035996488, "grad_norm": 0.0012679515639320016, "learning_rate": 3.101404741000878e-05, "loss": 0.0, "step": 2595 }, { "epoch": 0.7597307579748317, "grad_norm": 0.0006374386139214039, "learning_rate": 3.1006731050629206e-05, "loss": 0.0, "step": 2596 }, { "epoch": 0.7600234123500146, "grad_norm": 7.963675498962402, "learning_rate": 3.0999414691249634e-05, "loss": 0.0368, "step": 2597 }, { "epoch": 0.7603160667251976, "grad_norm": 0.000828925461973995, "learning_rate": 3.099209833187006e-05, "loss": 0.0, "step": 2598 }, { "epoch": 0.7606087211003805, "grad_norm": 0.0005160178407095373, "learning_rate": 3.098478197249049e-05, "loss": 0.0, "step": 2599 }, { "epoch": 0.7609013754755634, "grad_norm": 0.0015012379735708237, "learning_rate": 3.097746561311092e-05, "loss": 0.0, "step": 2600 }, { "epoch": 0.7611940298507462, "grad_norm": 0.0016636387445032597, "learning_rate": 3.0970149253731346e-05, "loss": 0.0, "step": 2601 }, { "epoch": 0.7614866842259291, "grad_norm": 0.0010978828649967909, "learning_rate": 3.0962832894351774e-05, "loss": 0.0, "step": 2602 }, { "epoch": 0.7617793386011121, "grad_norm": 0.0026254807598888874, "learning_rate": 3.09555165349722e-05, "loss": 0.0001, "step": 2603 }, { "epoch": 0.762071992976295, "grad_norm": 0.007941869087517262, "learning_rate": 3.094820017559263e-05, "loss": 0.0001, "step": 2604 }, { "epoch": 0.7623646473514779, "grad_norm": 0.1966180056333542, "learning_rate": 3.094088381621306e-05, "loss": 0.0005, "step": 2605 }, { "epoch": 0.7626573017266608, "grad_norm": 0.0033722524531185627, "learning_rate": 3.093356745683348e-05, "loss": 0.0001, "step": 2606 }, { "epoch": 0.7629499561018437, "grad_norm": 0.032422102987766266, "learning_rate": 3.092625109745391e-05, "loss": 0.0004, "step": 2607 }, { "epoch": 0.7632426104770267, "grad_norm": 0.010612317360937595, "learning_rate": 3.0918934738074335e-05, "loss": 0.0001, "step": 2608 }, { "epoch": 0.7635352648522096, "grad_norm": 0.4488622546195984, "learning_rate": 3.091161837869476e-05, "loss": 0.0019, "step": 2609 }, { "epoch": 0.7638279192273925, "grad_norm": 0.0019041658379137516, "learning_rate": 3.090430201931519e-05, "loss": 0.0, "step": 2610 }, { "epoch": 0.7641205736025753, "grad_norm": 0.0013012103736400604, "learning_rate": 3.089698565993562e-05, "loss": 0.0, "step": 2611 }, { "epoch": 0.7644132279777582, "grad_norm": 0.0009472115198150277, "learning_rate": 3.088966930055605e-05, "loss": 0.0, "step": 2612 }, { "epoch": 0.7647058823529411, "grad_norm": 0.0014884189004078507, "learning_rate": 3.0882352941176475e-05, "loss": 0.0, "step": 2613 }, { "epoch": 0.7649985367281241, "grad_norm": 0.017035644501447678, "learning_rate": 3.08750365817969e-05, "loss": 0.0001, "step": 2614 }, { "epoch": 0.765291191103307, "grad_norm": 0.0269364845007658, "learning_rate": 3.086772022241733e-05, "loss": 0.0002, "step": 2615 }, { "epoch": 0.7655838454784899, "grad_norm": 0.011261007748544216, "learning_rate": 3.086040386303775e-05, "loss": 0.0001, "step": 2616 }, { "epoch": 0.7658764998536728, "grad_norm": 0.0022833487018942833, "learning_rate": 3.085308750365818e-05, "loss": 0.0, "step": 2617 }, { "epoch": 0.7661691542288557, "grad_norm": 0.0006077625439502299, "learning_rate": 3.084577114427861e-05, "loss": 0.0, "step": 2618 }, { "epoch": 0.7664618086040387, "grad_norm": 0.0006674678879790008, "learning_rate": 3.0838454784899036e-05, "loss": 0.0, "step": 2619 }, { "epoch": 0.7667544629792216, "grad_norm": 6.559497356414795, "learning_rate": 3.0831138425519464e-05, "loss": 0.0244, "step": 2620 }, { "epoch": 0.7670471173544045, "grad_norm": 0.0005603537429124117, "learning_rate": 3.082382206613989e-05, "loss": 0.0, "step": 2621 }, { "epoch": 0.7673397717295873, "grad_norm": 0.03798175975680351, "learning_rate": 3.081650570676032e-05, "loss": 0.0002, "step": 2622 }, { "epoch": 0.7676324261047702, "grad_norm": 0.000654637289699167, "learning_rate": 3.080918934738075e-05, "loss": 0.0, "step": 2623 }, { "epoch": 0.7679250804799532, "grad_norm": 0.0007277166587300599, "learning_rate": 3.0801872988001175e-05, "loss": 0.0, "step": 2624 }, { "epoch": 0.7682177348551361, "grad_norm": 0.0008071591728366911, "learning_rate": 3.07945566286216e-05, "loss": 0.0, "step": 2625 }, { "epoch": 0.768510389230319, "grad_norm": 0.026575203984975815, "learning_rate": 3.078724026924203e-05, "loss": 0.0001, "step": 2626 }, { "epoch": 0.7688030436055019, "grad_norm": 0.0014964027795940638, "learning_rate": 3.077992390986245e-05, "loss": 0.0, "step": 2627 }, { "epoch": 0.7690956979806848, "grad_norm": 0.0101132458075881, "learning_rate": 3.077260755048288e-05, "loss": 0.0001, "step": 2628 }, { "epoch": 0.7693883523558677, "grad_norm": 0.007380722090601921, "learning_rate": 3.076529119110331e-05, "loss": 0.0, "step": 2629 }, { "epoch": 0.7696810067310507, "grad_norm": 0.0011052581248804927, "learning_rate": 3.0757974831723736e-05, "loss": 0.0, "step": 2630 }, { "epoch": 0.7699736611062336, "grad_norm": 24.553504943847656, "learning_rate": 3.0750658472344164e-05, "loss": 0.037, "step": 2631 }, { "epoch": 0.7702663154814164, "grad_norm": 0.040060315281152725, "learning_rate": 3.074334211296459e-05, "loss": 0.0002, "step": 2632 }, { "epoch": 0.7705589698565993, "grad_norm": 0.0004694352683145553, "learning_rate": 3.073602575358502e-05, "loss": 0.0, "step": 2633 }, { "epoch": 0.7708516242317822, "grad_norm": 18.785173416137695, "learning_rate": 3.072870939420545e-05, "loss": 0.1991, "step": 2634 }, { "epoch": 0.7711442786069652, "grad_norm": 0.0014431950403377414, "learning_rate": 3.0721393034825876e-05, "loss": 0.0, "step": 2635 }, { "epoch": 0.7714369329821481, "grad_norm": 8.37049388885498, "learning_rate": 3.0714076675446304e-05, "loss": 0.1484, "step": 2636 }, { "epoch": 0.771729587357331, "grad_norm": 0.001613378757610917, "learning_rate": 3.070676031606673e-05, "loss": 0.0, "step": 2637 }, { "epoch": 0.7720222417325139, "grad_norm": 6.582139015197754, "learning_rate": 3.069944395668715e-05, "loss": 0.0371, "step": 2638 }, { "epoch": 0.7723148961076968, "grad_norm": 0.51373291015625, "learning_rate": 3.069212759730758e-05, "loss": 0.0014, "step": 2639 }, { "epoch": 0.7726075504828798, "grad_norm": 18.354637145996094, "learning_rate": 3.068481123792801e-05, "loss": 0.1058, "step": 2640 }, { "epoch": 0.7729002048580627, "grad_norm": 0.016380123794078827, "learning_rate": 3.067749487854844e-05, "loss": 0.0001, "step": 2641 }, { "epoch": 0.7731928592332455, "grad_norm": 0.001659467234276235, "learning_rate": 3.0670178519168865e-05, "loss": 0.0, "step": 2642 }, { "epoch": 0.7734855136084284, "grad_norm": 0.0006678146310150623, "learning_rate": 3.066286215978929e-05, "loss": 0.0, "step": 2643 }, { "epoch": 0.7737781679836113, "grad_norm": 0.13738636672496796, "learning_rate": 3.065554580040972e-05, "loss": 0.0005, "step": 2644 }, { "epoch": 0.7740708223587943, "grad_norm": 0.0005895581562072039, "learning_rate": 3.064822944103015e-05, "loss": 0.0, "step": 2645 }, { "epoch": 0.7743634767339772, "grad_norm": 0.05153066664934158, "learning_rate": 3.064091308165058e-05, "loss": 0.0003, "step": 2646 }, { "epoch": 0.7746561311091601, "grad_norm": 0.002324148081243038, "learning_rate": 3.0633596722271005e-05, "loss": 0.0, "step": 2647 }, { "epoch": 0.774948785484343, "grad_norm": 0.004383832681924105, "learning_rate": 3.0626280362891426e-05, "loss": 0.0001, "step": 2648 }, { "epoch": 0.7752414398595259, "grad_norm": 0.005408088676631451, "learning_rate": 3.0618964003511854e-05, "loss": 0.0001, "step": 2649 }, { "epoch": 0.7755340942347088, "grad_norm": 0.021713461726903915, "learning_rate": 3.061164764413228e-05, "loss": 0.0001, "step": 2650 }, { "epoch": 0.7758267486098918, "grad_norm": 0.003193659009411931, "learning_rate": 3.060433128475271e-05, "loss": 0.0001, "step": 2651 }, { "epoch": 0.7761194029850746, "grad_norm": 8.086435317993164, "learning_rate": 3.059701492537314e-05, "loss": 0.1508, "step": 2652 }, { "epoch": 0.7764120573602575, "grad_norm": 0.012869983911514282, "learning_rate": 3.0589698565993566e-05, "loss": 0.0002, "step": 2653 }, { "epoch": 0.7767047117354404, "grad_norm": 0.01338435709476471, "learning_rate": 3.0582382206613994e-05, "loss": 0.0001, "step": 2654 }, { "epoch": 0.7769973661106233, "grad_norm": 0.0036994232796132565, "learning_rate": 3.057506584723442e-05, "loss": 0.0001, "step": 2655 }, { "epoch": 0.7772900204858063, "grad_norm": 0.0025011140387505293, "learning_rate": 3.056774948785485e-05, "loss": 0.0, "step": 2656 }, { "epoch": 0.7775826748609892, "grad_norm": 9.787667274475098, "learning_rate": 3.056043312847528e-05, "loss": 0.1411, "step": 2657 }, { "epoch": 0.7778753292361721, "grad_norm": 0.08533408492803574, "learning_rate": 3.0553116769095705e-05, "loss": 0.0004, "step": 2658 }, { "epoch": 0.778167983611355, "grad_norm": 0.024230292066931725, "learning_rate": 3.0545800409716126e-05, "loss": 0.0004, "step": 2659 }, { "epoch": 0.7784606379865379, "grad_norm": 0.007551426533609629, "learning_rate": 3.0538484050336554e-05, "loss": 0.0001, "step": 2660 }, { "epoch": 0.7787532923617209, "grad_norm": 0.005011970642954111, "learning_rate": 3.053116769095698e-05, "loss": 0.0001, "step": 2661 }, { "epoch": 0.7790459467369037, "grad_norm": 0.004218139685690403, "learning_rate": 3.052385133157741e-05, "loss": 0.0001, "step": 2662 }, { "epoch": 0.7793386011120866, "grad_norm": 0.0030332435853779316, "learning_rate": 3.0516534972197835e-05, "loss": 0.0001, "step": 2663 }, { "epoch": 0.7796312554872695, "grad_norm": 0.14285200834274292, "learning_rate": 3.0509218612818263e-05, "loss": 0.0005, "step": 2664 }, { "epoch": 0.7799239098624524, "grad_norm": 6.031563758850098, "learning_rate": 3.050190225343869e-05, "loss": 0.0279, "step": 2665 }, { "epoch": 0.7802165642376353, "grad_norm": 0.005401406437158585, "learning_rate": 3.049458589405912e-05, "loss": 0.0001, "step": 2666 }, { "epoch": 0.7805092186128183, "grad_norm": 0.005855087656527758, "learning_rate": 3.0487269534679547e-05, "loss": 0.0001, "step": 2667 }, { "epoch": 0.7808018729880012, "grad_norm": 0.006042785011231899, "learning_rate": 3.0479953175299975e-05, "loss": 0.0001, "step": 2668 }, { "epoch": 0.7810945273631841, "grad_norm": 0.0059804487973451614, "learning_rate": 3.0472636815920396e-05, "loss": 0.0001, "step": 2669 }, { "epoch": 0.781387181738367, "grad_norm": 0.01024722307920456, "learning_rate": 3.0465320456540824e-05, "loss": 0.0001, "step": 2670 }, { "epoch": 0.7816798361135499, "grad_norm": 15.585776329040527, "learning_rate": 3.0458004097161252e-05, "loss": 0.2139, "step": 2671 }, { "epoch": 0.7819724904887329, "grad_norm": 0.005884439684450626, "learning_rate": 3.045068773778168e-05, "loss": 0.0001, "step": 2672 }, { "epoch": 0.7822651448639157, "grad_norm": 0.016200028359889984, "learning_rate": 3.0443371378402108e-05, "loss": 0.0002, "step": 2673 }, { "epoch": 0.7825577992390986, "grad_norm": 0.008441347628831863, "learning_rate": 3.0436055019022536e-05, "loss": 0.0001, "step": 2674 }, { "epoch": 0.7828504536142815, "grad_norm": 0.6087526082992554, "learning_rate": 3.0428738659642964e-05, "loss": 0.0025, "step": 2675 }, { "epoch": 0.7831431079894644, "grad_norm": 0.013706745579838753, "learning_rate": 3.042142230026339e-05, "loss": 0.0001, "step": 2676 }, { "epoch": 0.7834357623646474, "grad_norm": 0.012047035619616508, "learning_rate": 3.041410594088382e-05, "loss": 0.0002, "step": 2677 }, { "epoch": 0.7837284167398303, "grad_norm": 0.0010208478197455406, "learning_rate": 3.0406789581504247e-05, "loss": 0.0, "step": 2678 }, { "epoch": 0.7840210711150132, "grad_norm": 0.008369138464331627, "learning_rate": 3.0399473222124675e-05, "loss": 0.0001, "step": 2679 }, { "epoch": 0.7843137254901961, "grad_norm": 0.01685381308197975, "learning_rate": 3.0392156862745097e-05, "loss": 0.0002, "step": 2680 }, { "epoch": 0.784606379865379, "grad_norm": 0.004259116016328335, "learning_rate": 3.0384840503365524e-05, "loss": 0.0001, "step": 2681 }, { "epoch": 0.7848990342405618, "grad_norm": 7.433053970336914, "learning_rate": 3.0377524143985952e-05, "loss": 0.0862, "step": 2682 }, { "epoch": 0.7851916886157448, "grad_norm": 0.004358033649623394, "learning_rate": 3.037020778460638e-05, "loss": 0.0001, "step": 2683 }, { "epoch": 0.7854843429909277, "grad_norm": 0.2607213854789734, "learning_rate": 3.036289142522681e-05, "loss": 0.0012, "step": 2684 }, { "epoch": 0.7857769973661106, "grad_norm": 0.1773947924375534, "learning_rate": 3.0355575065847236e-05, "loss": 0.0011, "step": 2685 }, { "epoch": 0.7860696517412935, "grad_norm": 0.0020568312611430883, "learning_rate": 3.0348258706467664e-05, "loss": 0.0, "step": 2686 }, { "epoch": 0.7863623061164764, "grad_norm": 3.2226767539978027, "learning_rate": 3.0340942347088092e-05, "loss": 0.0175, "step": 2687 }, { "epoch": 0.7866549604916594, "grad_norm": 0.8149426579475403, "learning_rate": 3.033362598770852e-05, "loss": 0.0041, "step": 2688 }, { "epoch": 0.7869476148668423, "grad_norm": 0.008766296319663525, "learning_rate": 3.0326309628328948e-05, "loss": 0.0001, "step": 2689 }, { "epoch": 0.7872402692420252, "grad_norm": 0.008891499601304531, "learning_rate": 3.0318993268949376e-05, "loss": 0.0002, "step": 2690 }, { "epoch": 0.7875329236172081, "grad_norm": 0.019587429240345955, "learning_rate": 3.0311676909569797e-05, "loss": 0.0003, "step": 2691 }, { "epoch": 0.787825577992391, "grad_norm": 0.0022967993281781673, "learning_rate": 3.0304360550190225e-05, "loss": 0.0001, "step": 2692 }, { "epoch": 0.788118232367574, "grad_norm": 0.057845134288072586, "learning_rate": 3.0297044190810653e-05, "loss": 0.0004, "step": 2693 }, { "epoch": 0.7884108867427568, "grad_norm": 0.01102588139474392, "learning_rate": 3.028972783143108e-05, "loss": 0.0002, "step": 2694 }, { "epoch": 0.7887035411179397, "grad_norm": 5.191501617431641, "learning_rate": 3.028241147205151e-05, "loss": 0.1501, "step": 2695 }, { "epoch": 0.7889961954931226, "grad_norm": 0.02012191154062748, "learning_rate": 3.0275095112671937e-05, "loss": 0.0002, "step": 2696 }, { "epoch": 0.7892888498683055, "grad_norm": 0.002266214694827795, "learning_rate": 3.0267778753292365e-05, "loss": 0.0, "step": 2697 }, { "epoch": 0.7895815042434884, "grad_norm": 0.017709849402308464, "learning_rate": 3.0260462393912793e-05, "loss": 0.0002, "step": 2698 }, { "epoch": 0.7898741586186714, "grad_norm": 0.08727958798408508, "learning_rate": 3.025314603453322e-05, "loss": 0.0009, "step": 2699 }, { "epoch": 0.7901668129938543, "grad_norm": 0.0019750255160033703, "learning_rate": 3.024582967515365e-05, "loss": 0.0001, "step": 2700 }, { "epoch": 0.7904594673690372, "grad_norm": 0.08402732014656067, "learning_rate": 3.023851331577407e-05, "loss": 0.0008, "step": 2701 }, { "epoch": 0.79075212174422, "grad_norm": 0.008780294097959995, "learning_rate": 3.0231196956394498e-05, "loss": 0.0001, "step": 2702 }, { "epoch": 0.7910447761194029, "grad_norm": 0.0077883535996079445, "learning_rate": 3.0223880597014926e-05, "loss": 0.0001, "step": 2703 }, { "epoch": 0.7913374304945859, "grad_norm": 0.08220571279525757, "learning_rate": 3.0216564237635354e-05, "loss": 0.0006, "step": 2704 }, { "epoch": 0.7916300848697688, "grad_norm": 0.11518245935440063, "learning_rate": 3.0209247878255782e-05, "loss": 0.0008, "step": 2705 }, { "epoch": 0.7919227392449517, "grad_norm": 0.010135181248188019, "learning_rate": 3.020193151887621e-05, "loss": 0.0002, "step": 2706 }, { "epoch": 0.7922153936201346, "grad_norm": 0.0184785146266222, "learning_rate": 3.0194615159496638e-05, "loss": 0.0002, "step": 2707 }, { "epoch": 0.7925080479953175, "grad_norm": 0.02294454351067543, "learning_rate": 3.0187298800117066e-05, "loss": 0.0002, "step": 2708 }, { "epoch": 0.7928007023705005, "grad_norm": 0.014910697937011719, "learning_rate": 3.0179982440737494e-05, "loss": 0.0002, "step": 2709 }, { "epoch": 0.7930933567456834, "grad_norm": 0.0005895071662962437, "learning_rate": 3.017266608135792e-05, "loss": 0.0, "step": 2710 }, { "epoch": 0.7933860111208663, "grad_norm": 0.0028742244467139244, "learning_rate": 3.0165349721978346e-05, "loss": 0.0001, "step": 2711 }, { "epoch": 0.7936786654960492, "grad_norm": 0.019316155463457108, "learning_rate": 3.015803336259877e-05, "loss": 0.0002, "step": 2712 }, { "epoch": 0.793971319871232, "grad_norm": 0.00208294321782887, "learning_rate": 3.01507170032192e-05, "loss": 0.0, "step": 2713 }, { "epoch": 0.794263974246415, "grad_norm": 0.24471159279346466, "learning_rate": 3.0143400643839626e-05, "loss": 0.0015, "step": 2714 }, { "epoch": 0.7945566286215979, "grad_norm": 0.0016943076625466347, "learning_rate": 3.0136084284460054e-05, "loss": 0.0, "step": 2715 }, { "epoch": 0.7948492829967808, "grad_norm": 0.06629418581724167, "learning_rate": 3.0128767925080482e-05, "loss": 0.0004, "step": 2716 }, { "epoch": 0.7951419373719637, "grad_norm": 0.25379154086112976, "learning_rate": 3.012145156570091e-05, "loss": 0.0017, "step": 2717 }, { "epoch": 0.7954345917471466, "grad_norm": 0.00131731026340276, "learning_rate": 3.0114135206321338e-05, "loss": 0.0, "step": 2718 }, { "epoch": 0.7957272461223295, "grad_norm": 0.012910161167383194, "learning_rate": 3.0106818846941763e-05, "loss": 0.0002, "step": 2719 }, { "epoch": 0.7960199004975125, "grad_norm": 0.0022732571233063936, "learning_rate": 3.009950248756219e-05, "loss": 0.0, "step": 2720 }, { "epoch": 0.7963125548726954, "grad_norm": 0.0017511585028842092, "learning_rate": 3.009218612818262e-05, "loss": 0.0, "step": 2721 }, { "epoch": 0.7966052092478783, "grad_norm": 0.0016778865829110146, "learning_rate": 3.0084869768803043e-05, "loss": 0.0, "step": 2722 }, { "epoch": 0.7968978636230611, "grad_norm": 0.0068087331019341946, "learning_rate": 3.007755340942347e-05, "loss": 0.0001, "step": 2723 }, { "epoch": 0.797190517998244, "grad_norm": 0.0014441823586821556, "learning_rate": 3.00702370500439e-05, "loss": 0.0, "step": 2724 }, { "epoch": 0.797483172373427, "grad_norm": 0.0011686611687764525, "learning_rate": 3.0062920690664327e-05, "loss": 0.0, "step": 2725 }, { "epoch": 0.7977758267486099, "grad_norm": 0.8683436512947083, "learning_rate": 3.0055604331284755e-05, "loss": 0.0021, "step": 2726 }, { "epoch": 0.7980684811237928, "grad_norm": 0.02965354174375534, "learning_rate": 3.004828797190518e-05, "loss": 0.0002, "step": 2727 }, { "epoch": 0.7983611354989757, "grad_norm": 0.008388472720980644, "learning_rate": 3.0040971612525608e-05, "loss": 0.0001, "step": 2728 }, { "epoch": 0.7986537898741586, "grad_norm": 0.003930417355149984, "learning_rate": 3.0033655253146036e-05, "loss": 0.0001, "step": 2729 }, { "epoch": 0.7989464442493416, "grad_norm": 0.0007542763487435877, "learning_rate": 3.0026338893766464e-05, "loss": 0.0, "step": 2730 }, { "epoch": 0.7992390986245245, "grad_norm": 0.0011721195187419653, "learning_rate": 3.001902253438689e-05, "loss": 0.0, "step": 2731 }, { "epoch": 0.7995317529997074, "grad_norm": 9.9760103225708, "learning_rate": 3.001170617500732e-05, "loss": 0.0838, "step": 2732 }, { "epoch": 0.7998244073748902, "grad_norm": 0.011321314610540867, "learning_rate": 3.0004389815627744e-05, "loss": 0.0001, "step": 2733 }, { "epoch": 0.8001170617500731, "grad_norm": 0.0004917927435599267, "learning_rate": 2.9997073456248172e-05, "loss": 0.0, "step": 2734 }, { "epoch": 0.800409716125256, "grad_norm": 0.005929878912866116, "learning_rate": 2.9989757096868597e-05, "loss": 0.0001, "step": 2735 }, { "epoch": 0.800702370500439, "grad_norm": 0.002472624648362398, "learning_rate": 2.9982440737489024e-05, "loss": 0.0, "step": 2736 }, { "epoch": 0.8009950248756219, "grad_norm": 0.009232684969902039, "learning_rate": 2.9975124378109452e-05, "loss": 0.0001, "step": 2737 }, { "epoch": 0.8012876792508048, "grad_norm": 0.004552721511572599, "learning_rate": 2.996780801872988e-05, "loss": 0.0001, "step": 2738 }, { "epoch": 0.8015803336259877, "grad_norm": 0.0003590781125240028, "learning_rate": 2.9960491659350308e-05, "loss": 0.0, "step": 2739 }, { "epoch": 0.8018729880011706, "grad_norm": 0.0006961704348213971, "learning_rate": 2.9953175299970736e-05, "loss": 0.0, "step": 2740 }, { "epoch": 0.8021656423763536, "grad_norm": 0.002296611201018095, "learning_rate": 2.9945858940591164e-05, "loss": 0.0, "step": 2741 }, { "epoch": 0.8024582967515365, "grad_norm": 0.0015559152234345675, "learning_rate": 2.9938542581211592e-05, "loss": 0.0, "step": 2742 }, { "epoch": 0.8027509511267193, "grad_norm": 0.0012545986101031303, "learning_rate": 2.993122622183202e-05, "loss": 0.0, "step": 2743 }, { "epoch": 0.8030436055019022, "grad_norm": 0.0018432078650221229, "learning_rate": 2.992390986245244e-05, "loss": 0.0, "step": 2744 }, { "epoch": 0.8033362598770851, "grad_norm": 11.844979286193848, "learning_rate": 2.991659350307287e-05, "loss": 0.0336, "step": 2745 }, { "epoch": 0.8036289142522681, "grad_norm": 0.0009786701994016767, "learning_rate": 2.9909277143693297e-05, "loss": 0.0, "step": 2746 }, { "epoch": 0.803921568627451, "grad_norm": 0.055951718240976334, "learning_rate": 2.9901960784313725e-05, "loss": 0.0002, "step": 2747 }, { "epoch": 0.8042142230026339, "grad_norm": 0.0006754621281288564, "learning_rate": 2.9894644424934153e-05, "loss": 0.0, "step": 2748 }, { "epoch": 0.8045068773778168, "grad_norm": 0.012983710505068302, "learning_rate": 2.988732806555458e-05, "loss": 0.0001, "step": 2749 }, { "epoch": 0.8047995317529997, "grad_norm": 0.0006265717092901468, "learning_rate": 2.988001170617501e-05, "loss": 0.0, "step": 2750 }, { "epoch": 0.8050921861281826, "grad_norm": 0.00151202199049294, "learning_rate": 2.9872695346795437e-05, "loss": 0.0, "step": 2751 }, { "epoch": 0.8053848405033656, "grad_norm": 0.0005175884580239654, "learning_rate": 2.9865378987415865e-05, "loss": 0.0, "step": 2752 }, { "epoch": 0.8056774948785485, "grad_norm": 0.0009235625620931387, "learning_rate": 2.9858062628036293e-05, "loss": 0.0, "step": 2753 }, { "epoch": 0.8059701492537313, "grad_norm": 9.69883918762207, "learning_rate": 2.9850746268656714e-05, "loss": 0.1382, "step": 2754 }, { "epoch": 0.8062628036289142, "grad_norm": 0.0002538673870731145, "learning_rate": 2.9843429909277142e-05, "loss": 0.0, "step": 2755 }, { "epoch": 0.8065554580040971, "grad_norm": 0.20520877838134766, "learning_rate": 2.983611354989757e-05, "loss": 0.0008, "step": 2756 }, { "epoch": 0.8068481123792801, "grad_norm": 0.02855227142572403, "learning_rate": 2.9828797190517998e-05, "loss": 0.0002, "step": 2757 }, { "epoch": 0.807140766754463, "grad_norm": 0.0006890453514643013, "learning_rate": 2.9821480831138426e-05, "loss": 0.0, "step": 2758 }, { "epoch": 0.8074334211296459, "grad_norm": 0.0015851255739107728, "learning_rate": 2.9814164471758854e-05, "loss": 0.0, "step": 2759 }, { "epoch": 0.8077260755048288, "grad_norm": 0.0012363274581730366, "learning_rate": 2.9806848112379282e-05, "loss": 0.0, "step": 2760 }, { "epoch": 0.8080187298800117, "grad_norm": 0.0014085586881265044, "learning_rate": 2.979953175299971e-05, "loss": 0.0, "step": 2761 }, { "epoch": 0.8083113842551947, "grad_norm": 0.0009393296204507351, "learning_rate": 2.9792215393620138e-05, "loss": 0.0, "step": 2762 }, { "epoch": 0.8086040386303776, "grad_norm": 0.00327840237878263, "learning_rate": 2.9784899034240566e-05, "loss": 0.0, "step": 2763 }, { "epoch": 0.8088966930055604, "grad_norm": 0.0021783942356705666, "learning_rate": 2.9777582674860994e-05, "loss": 0.0, "step": 2764 }, { "epoch": 0.8091893473807433, "grad_norm": 0.01263353694230318, "learning_rate": 2.9770266315481415e-05, "loss": 0.0001, "step": 2765 }, { "epoch": 0.8094820017559262, "grad_norm": 0.001956967869773507, "learning_rate": 2.9762949956101843e-05, "loss": 0.0, "step": 2766 }, { "epoch": 0.8097746561311091, "grad_norm": 0.00973919965326786, "learning_rate": 2.975563359672227e-05, "loss": 0.0001, "step": 2767 }, { "epoch": 0.8100673105062921, "grad_norm": 0.2914956510066986, "learning_rate": 2.97483172373427e-05, "loss": 0.0016, "step": 2768 }, { "epoch": 0.810359964881475, "grad_norm": 0.04364306107163429, "learning_rate": 2.9741000877963126e-05, "loss": 0.0003, "step": 2769 }, { "epoch": 0.8106526192566579, "grad_norm": 0.008897513151168823, "learning_rate": 2.9733684518583554e-05, "loss": 0.0001, "step": 2770 }, { "epoch": 0.8109452736318408, "grad_norm": 0.0018379120156168938, "learning_rate": 2.9726368159203982e-05, "loss": 0.0, "step": 2771 }, { "epoch": 0.8112379280070237, "grad_norm": 0.0035767194349318743, "learning_rate": 2.971905179982441e-05, "loss": 0.0001, "step": 2772 }, { "epoch": 0.8115305823822067, "grad_norm": 0.12195797264575958, "learning_rate": 2.9711735440444838e-05, "loss": 0.0005, "step": 2773 }, { "epoch": 0.8118232367573895, "grad_norm": 0.0010022588539868593, "learning_rate": 2.9704419081065266e-05, "loss": 0.0, "step": 2774 }, { "epoch": 0.8121158911325724, "grad_norm": 0.0004416050505824387, "learning_rate": 2.9697102721685687e-05, "loss": 0.0, "step": 2775 }, { "epoch": 0.8124085455077553, "grad_norm": 0.002000578213483095, "learning_rate": 2.9689786362306115e-05, "loss": 0.0, "step": 2776 }, { "epoch": 0.8127011998829382, "grad_norm": 0.0013109841383993626, "learning_rate": 2.9682470002926543e-05, "loss": 0.0, "step": 2777 }, { "epoch": 0.8129938542581212, "grad_norm": 0.002561133122071624, "learning_rate": 2.967515364354697e-05, "loss": 0.0, "step": 2778 }, { "epoch": 0.8132865086333041, "grad_norm": 0.001267809304408729, "learning_rate": 2.96678372841674e-05, "loss": 0.0, "step": 2779 }, { "epoch": 0.813579163008487, "grad_norm": 0.0004984505940228701, "learning_rate": 2.9660520924787827e-05, "loss": 0.0, "step": 2780 }, { "epoch": 0.8138718173836699, "grad_norm": 0.006047519389539957, "learning_rate": 2.9653204565408255e-05, "loss": 0.0001, "step": 2781 }, { "epoch": 0.8141644717588528, "grad_norm": 0.0013361189048737288, "learning_rate": 2.9645888206028683e-05, "loss": 0.0, "step": 2782 }, { "epoch": 0.8144571261340358, "grad_norm": 0.0007503708475269377, "learning_rate": 2.963857184664911e-05, "loss": 0.0, "step": 2783 }, { "epoch": 0.8147497805092186, "grad_norm": 0.05241904780268669, "learning_rate": 2.963125548726954e-05, "loss": 0.0003, "step": 2784 }, { "epoch": 0.8150424348844015, "grad_norm": 0.0008833020692691207, "learning_rate": 2.9623939127889967e-05, "loss": 0.0, "step": 2785 }, { "epoch": 0.8153350892595844, "grad_norm": 0.00027996645076200366, "learning_rate": 2.9616622768510388e-05, "loss": 0.0, "step": 2786 }, { "epoch": 0.8156277436347673, "grad_norm": 0.01899244263768196, "learning_rate": 2.9609306409130816e-05, "loss": 0.0001, "step": 2787 }, { "epoch": 0.8159203980099502, "grad_norm": 0.001283986959606409, "learning_rate": 2.9601990049751244e-05, "loss": 0.0, "step": 2788 }, { "epoch": 0.8162130523851332, "grad_norm": 0.00158262113109231, "learning_rate": 2.9594673690371672e-05, "loss": 0.0, "step": 2789 }, { "epoch": 0.8165057067603161, "grad_norm": 0.0007797530852258205, "learning_rate": 2.95873573309921e-05, "loss": 0.0, "step": 2790 }, { "epoch": 0.816798361135499, "grad_norm": 0.000604621774982661, "learning_rate": 2.9580040971612528e-05, "loss": 0.0, "step": 2791 }, { "epoch": 0.8170910155106819, "grad_norm": 0.37448152899742126, "learning_rate": 2.9572724612232956e-05, "loss": 0.0012, "step": 2792 }, { "epoch": 0.8173836698858647, "grad_norm": 0.0010730307549238205, "learning_rate": 2.9565408252853384e-05, "loss": 0.0, "step": 2793 }, { "epoch": 0.8176763242610477, "grad_norm": 0.0008598492713645101, "learning_rate": 2.955809189347381e-05, "loss": 0.0, "step": 2794 }, { "epoch": 0.8179689786362306, "grad_norm": 0.0010620629182085395, "learning_rate": 2.955077553409424e-05, "loss": 0.0, "step": 2795 }, { "epoch": 0.8182616330114135, "grad_norm": 0.006837981753051281, "learning_rate": 2.9543459174714668e-05, "loss": 0.0001, "step": 2796 }, { "epoch": 0.8185542873865964, "grad_norm": 0.0009813921060413122, "learning_rate": 2.953614281533509e-05, "loss": 0.0, "step": 2797 }, { "epoch": 0.8188469417617793, "grad_norm": 0.024175820872187614, "learning_rate": 2.9528826455955517e-05, "loss": 0.0001, "step": 2798 }, { "epoch": 0.8191395961369623, "grad_norm": 0.011615071445703506, "learning_rate": 2.9521510096575945e-05, "loss": 0.0001, "step": 2799 }, { "epoch": 0.8194322505121452, "grad_norm": 0.0013745144242420793, "learning_rate": 2.9514193737196373e-05, "loss": 0.0, "step": 2800 }, { "epoch": 0.8197249048873281, "grad_norm": 0.0020426204428076744, "learning_rate": 2.95068773778168e-05, "loss": 0.0, "step": 2801 }, { "epoch": 0.820017559262511, "grad_norm": 0.0008682936313562095, "learning_rate": 2.949956101843723e-05, "loss": 0.0, "step": 2802 }, { "epoch": 0.8203102136376939, "grad_norm": 0.0026835596654564142, "learning_rate": 2.9492244659057656e-05, "loss": 0.0001, "step": 2803 }, { "epoch": 0.8206028680128767, "grad_norm": 0.01051302533596754, "learning_rate": 2.9484928299678084e-05, "loss": 0.0001, "step": 2804 }, { "epoch": 0.8208955223880597, "grad_norm": 0.00984960701316595, "learning_rate": 2.9477611940298512e-05, "loss": 0.0001, "step": 2805 }, { "epoch": 0.8211881767632426, "grad_norm": 0.000248409021878615, "learning_rate": 2.9470295580918937e-05, "loss": 0.0, "step": 2806 }, { "epoch": 0.8214808311384255, "grad_norm": 2.868433713912964, "learning_rate": 2.946297922153936e-05, "loss": 0.0068, "step": 2807 }, { "epoch": 0.8217734855136084, "grad_norm": 0.0009482511668466032, "learning_rate": 2.945566286215979e-05, "loss": 0.0, "step": 2808 }, { "epoch": 0.8220661398887913, "grad_norm": 0.0003328518941998482, "learning_rate": 2.9448346502780217e-05, "loss": 0.0, "step": 2809 }, { "epoch": 0.8223587942639743, "grad_norm": 0.008871063590049744, "learning_rate": 2.9441030143400645e-05, "loss": 0.0, "step": 2810 }, { "epoch": 0.8226514486391572, "grad_norm": 0.00042780142393894494, "learning_rate": 2.9433713784021073e-05, "loss": 0.0, "step": 2811 }, { "epoch": 0.8229441030143401, "grad_norm": 0.0029195339884608984, "learning_rate": 2.94263974246415e-05, "loss": 0.0, "step": 2812 }, { "epoch": 0.823236757389523, "grad_norm": 0.0002829222066793591, "learning_rate": 2.941908106526193e-05, "loss": 0.0, "step": 2813 }, { "epoch": 0.8235294117647058, "grad_norm": 0.0001001347482088022, "learning_rate": 2.9411764705882354e-05, "loss": 0.0, "step": 2814 }, { "epoch": 0.8238220661398888, "grad_norm": 0.0007854485884308815, "learning_rate": 2.9404448346502782e-05, "loss": 0.0, "step": 2815 }, { "epoch": 0.8241147205150717, "grad_norm": 2.1147656440734863, "learning_rate": 2.939713198712321e-05, "loss": 0.0046, "step": 2816 }, { "epoch": 0.8244073748902546, "grad_norm": 0.5754238367080688, "learning_rate": 2.9389815627743638e-05, "loss": 0.0016, "step": 2817 }, { "epoch": 0.8247000292654375, "grad_norm": 0.00013969867723062634, "learning_rate": 2.9382499268364062e-05, "loss": 0.0, "step": 2818 }, { "epoch": 0.8249926836406204, "grad_norm": 0.016234902665019035, "learning_rate": 2.937518290898449e-05, "loss": 0.0001, "step": 2819 }, { "epoch": 0.8252853380158033, "grad_norm": 0.0002993481757584959, "learning_rate": 2.9367866549604918e-05, "loss": 0.0, "step": 2820 }, { "epoch": 0.8255779923909863, "grad_norm": 0.00902794860303402, "learning_rate": 2.9360550190225346e-05, "loss": 0.0001, "step": 2821 }, { "epoch": 0.8258706467661692, "grad_norm": 0.0005412195459939539, "learning_rate": 2.935323383084577e-05, "loss": 0.0, "step": 2822 }, { "epoch": 0.8261633011413521, "grad_norm": 0.0006353114731609821, "learning_rate": 2.93459174714662e-05, "loss": 0.0, "step": 2823 }, { "epoch": 0.826455955516535, "grad_norm": 0.0005318338517099619, "learning_rate": 2.9338601112086626e-05, "loss": 0.0, "step": 2824 }, { "epoch": 0.8267486098917178, "grad_norm": 0.006010818760842085, "learning_rate": 2.9331284752707054e-05, "loss": 0.0, "step": 2825 }, { "epoch": 0.8270412642669008, "grad_norm": 0.014696326106786728, "learning_rate": 2.9323968393327482e-05, "loss": 0.0001, "step": 2826 }, { "epoch": 0.8273339186420837, "grad_norm": 0.02982524037361145, "learning_rate": 2.931665203394791e-05, "loss": 0.0001, "step": 2827 }, { "epoch": 0.8276265730172666, "grad_norm": 0.060100823640823364, "learning_rate": 2.9309335674568338e-05, "loss": 0.0004, "step": 2828 }, { "epoch": 0.8279192273924495, "grad_norm": 0.007390571292489767, "learning_rate": 2.9302019315188763e-05, "loss": 0.0, "step": 2829 }, { "epoch": 0.8282118817676324, "grad_norm": 0.0005158820422366261, "learning_rate": 2.9294702955809187e-05, "loss": 0.0, "step": 2830 }, { "epoch": 0.8285045361428154, "grad_norm": 0.43470558524131775, "learning_rate": 2.9287386596429615e-05, "loss": 0.0011, "step": 2831 }, { "epoch": 0.8287971905179983, "grad_norm": 14.537055969238281, "learning_rate": 2.9280070237050043e-05, "loss": 0.0477, "step": 2832 }, { "epoch": 0.8290898448931812, "grad_norm": 0.0005331973661668599, "learning_rate": 2.927275387767047e-05, "loss": 0.0, "step": 2833 }, { "epoch": 0.829382499268364, "grad_norm": 0.0004824897332582623, "learning_rate": 2.92654375182909e-05, "loss": 0.0, "step": 2834 }, { "epoch": 0.8296751536435469, "grad_norm": 4.611309051513672, "learning_rate": 2.9258121158911327e-05, "loss": 0.2556, "step": 2835 }, { "epoch": 0.8299678080187298, "grad_norm": 0.006274438463151455, "learning_rate": 2.9250804799531755e-05, "loss": 0.0, "step": 2836 }, { "epoch": 0.8302604623939128, "grad_norm": 0.00011106379679404199, "learning_rate": 2.9243488440152183e-05, "loss": 0.0, "step": 2837 }, { "epoch": 0.8305531167690957, "grad_norm": 0.10963346064090729, "learning_rate": 2.923617208077261e-05, "loss": 0.0005, "step": 2838 }, { "epoch": 0.8308457711442786, "grad_norm": 0.0020755550358444452, "learning_rate": 2.9228855721393032e-05, "loss": 0.0, "step": 2839 }, { "epoch": 0.8311384255194615, "grad_norm": 0.00033546079066582024, "learning_rate": 2.922153936201346e-05, "loss": 0.0, "step": 2840 }, { "epoch": 0.8314310798946444, "grad_norm": 0.0009139064350165427, "learning_rate": 2.9214223002633888e-05, "loss": 0.0, "step": 2841 }, { "epoch": 0.8317237342698274, "grad_norm": 0.1672952026128769, "learning_rate": 2.9206906643254316e-05, "loss": 0.0008, "step": 2842 }, { "epoch": 0.8320163886450103, "grad_norm": 2.1475164890289307, "learning_rate": 2.9199590283874744e-05, "loss": 0.0057, "step": 2843 }, { "epoch": 0.8323090430201932, "grad_norm": 0.0007393267005681992, "learning_rate": 2.9192273924495172e-05, "loss": 0.0, "step": 2844 }, { "epoch": 0.832601697395376, "grad_norm": 0.02002253197133541, "learning_rate": 2.91849575651156e-05, "loss": 0.0001, "step": 2845 }, { "epoch": 0.8328943517705589, "grad_norm": 0.02759724296629429, "learning_rate": 2.9177641205736028e-05, "loss": 0.0001, "step": 2846 }, { "epoch": 0.8331870061457419, "grad_norm": 0.004802546929568052, "learning_rate": 2.9170324846356456e-05, "loss": 0.0, "step": 2847 }, { "epoch": 0.8334796605209248, "grad_norm": 0.000317930564051494, "learning_rate": 2.9163008486976884e-05, "loss": 0.0, "step": 2848 }, { "epoch": 0.8337723148961077, "grad_norm": 0.0012358769308775663, "learning_rate": 2.915569212759731e-05, "loss": 0.0, "step": 2849 }, { "epoch": 0.8340649692712906, "grad_norm": 0.8042240738868713, "learning_rate": 2.9148375768217733e-05, "loss": 0.0028, "step": 2850 }, { "epoch": 0.8343576236464735, "grad_norm": 0.0007900151540525258, "learning_rate": 2.914105940883816e-05, "loss": 0.0, "step": 2851 }, { "epoch": 0.8346502780216565, "grad_norm": 0.0005253091221675277, "learning_rate": 2.913374304945859e-05, "loss": 0.0, "step": 2852 }, { "epoch": 0.8349429323968394, "grad_norm": 0.002171743893995881, "learning_rate": 2.9126426690079017e-05, "loss": 0.0, "step": 2853 }, { "epoch": 0.8352355867720223, "grad_norm": 0.0003047423670068383, "learning_rate": 2.9119110330699445e-05, "loss": 0.0, "step": 2854 }, { "epoch": 0.8355282411472051, "grad_norm": 0.0017876947531476617, "learning_rate": 2.9111793971319873e-05, "loss": 0.0, "step": 2855 }, { "epoch": 0.835820895522388, "grad_norm": 0.0004605422727763653, "learning_rate": 2.91044776119403e-05, "loss": 0.0, "step": 2856 }, { "epoch": 0.8361135498975709, "grad_norm": 0.0010576428612694144, "learning_rate": 2.909716125256073e-05, "loss": 0.0, "step": 2857 }, { "epoch": 0.8364062042727539, "grad_norm": 0.0007708283956162632, "learning_rate": 2.9089844893181156e-05, "loss": 0.0, "step": 2858 }, { "epoch": 0.8366988586479368, "grad_norm": 0.0009573538554832339, "learning_rate": 2.9082528533801584e-05, "loss": 0.0, "step": 2859 }, { "epoch": 0.8369915130231197, "grad_norm": 0.000981238903477788, "learning_rate": 2.9075212174422006e-05, "loss": 0.0, "step": 2860 }, { "epoch": 0.8372841673983026, "grad_norm": 1.0129269361495972, "learning_rate": 2.9067895815042434e-05, "loss": 0.0033, "step": 2861 }, { "epoch": 0.8375768217734855, "grad_norm": 0.00044342450564727187, "learning_rate": 2.906057945566286e-05, "loss": 0.0, "step": 2862 }, { "epoch": 0.8378694761486685, "grad_norm": 0.014190975576639175, "learning_rate": 2.905326309628329e-05, "loss": 0.0001, "step": 2863 }, { "epoch": 0.8381621305238514, "grad_norm": 7.357584476470947, "learning_rate": 2.9045946736903717e-05, "loss": 0.1928, "step": 2864 }, { "epoch": 0.8384547848990342, "grad_norm": 0.005698964465409517, "learning_rate": 2.9038630377524145e-05, "loss": 0.0001, "step": 2865 }, { "epoch": 0.8387474392742171, "grad_norm": 0.013709690421819687, "learning_rate": 2.9031314018144573e-05, "loss": 0.0001, "step": 2866 }, { "epoch": 0.8390400936494, "grad_norm": 0.008576486259698868, "learning_rate": 2.9023997658765e-05, "loss": 0.0001, "step": 2867 }, { "epoch": 0.839332748024583, "grad_norm": 0.001753227086737752, "learning_rate": 2.901668129938543e-05, "loss": 0.0, "step": 2868 }, { "epoch": 0.8396254023997659, "grad_norm": 11.56413459777832, "learning_rate": 2.9009364940005857e-05, "loss": 0.0517, "step": 2869 }, { "epoch": 0.8399180567749488, "grad_norm": 0.003031209111213684, "learning_rate": 2.9002048580626285e-05, "loss": 0.0, "step": 2870 }, { "epoch": 0.8402107111501317, "grad_norm": 16.875307083129883, "learning_rate": 2.8994732221246706e-05, "loss": 0.0994, "step": 2871 }, { "epoch": 0.8405033655253146, "grad_norm": 0.0014646684285253286, "learning_rate": 2.8987415861867134e-05, "loss": 0.0, "step": 2872 }, { "epoch": 0.8407960199004975, "grad_norm": 0.0011212496319785714, "learning_rate": 2.8980099502487562e-05, "loss": 0.0, "step": 2873 }, { "epoch": 0.8410886742756805, "grad_norm": 0.0007398988236673176, "learning_rate": 2.897278314310799e-05, "loss": 0.0, "step": 2874 }, { "epoch": 0.8413813286508633, "grad_norm": 0.000355487602064386, "learning_rate": 2.8965466783728418e-05, "loss": 0.0, "step": 2875 }, { "epoch": 0.8416739830260462, "grad_norm": 0.0025122477672994137, "learning_rate": 2.8958150424348846e-05, "loss": 0.0, "step": 2876 }, { "epoch": 0.8419666374012291, "grad_norm": 0.0020755138248205185, "learning_rate": 2.8950834064969274e-05, "loss": 0.0, "step": 2877 }, { "epoch": 0.842259291776412, "grad_norm": 10.628597259521484, "learning_rate": 2.8943517705589702e-05, "loss": 0.1175, "step": 2878 }, { "epoch": 0.842551946151595, "grad_norm": 0.3478909730911255, "learning_rate": 2.893620134621013e-05, "loss": 0.0008, "step": 2879 }, { "epoch": 0.8428446005267779, "grad_norm": 0.0006228281417861581, "learning_rate": 2.8928884986830558e-05, "loss": 0.0, "step": 2880 }, { "epoch": 0.8431372549019608, "grad_norm": 0.009975981898605824, "learning_rate": 2.8921568627450986e-05, "loss": 0.0001, "step": 2881 }, { "epoch": 0.8434299092771437, "grad_norm": 0.00953388586640358, "learning_rate": 2.8914252268071407e-05, "loss": 0.0001, "step": 2882 }, { "epoch": 0.8437225636523266, "grad_norm": 11.161602020263672, "learning_rate": 2.8906935908691835e-05, "loss": 0.1049, "step": 2883 }, { "epoch": 0.8440152180275096, "grad_norm": 0.001079063513316214, "learning_rate": 2.8899619549312263e-05, "loss": 0.0, "step": 2884 }, { "epoch": 0.8443078724026924, "grad_norm": 0.0013235245132818818, "learning_rate": 2.889230318993269e-05, "loss": 0.0, "step": 2885 }, { "epoch": 0.8446005267778753, "grad_norm": 0.0013966573169454932, "learning_rate": 2.888498683055312e-05, "loss": 0.0, "step": 2886 }, { "epoch": 0.8448931811530582, "grad_norm": 0.001036423142068088, "learning_rate": 2.8877670471173547e-05, "loss": 0.0, "step": 2887 }, { "epoch": 0.8451858355282411, "grad_norm": 3.4299228191375732, "learning_rate": 2.8870354111793975e-05, "loss": 0.0154, "step": 2888 }, { "epoch": 0.845478489903424, "grad_norm": 0.004335775505751371, "learning_rate": 2.8863037752414403e-05, "loss": 0.0001, "step": 2889 }, { "epoch": 0.845771144278607, "grad_norm": 0.0013231680495664477, "learning_rate": 2.885572139303483e-05, "loss": 0.0, "step": 2890 }, { "epoch": 0.8460637986537899, "grad_norm": 6.0724663734436035, "learning_rate": 2.884840503365526e-05, "loss": 0.0914, "step": 2891 }, { "epoch": 0.8463564530289728, "grad_norm": 0.01388559490442276, "learning_rate": 2.884108867427568e-05, "loss": 0.0002, "step": 2892 }, { "epoch": 0.8466491074041557, "grad_norm": 0.0005023994017392397, "learning_rate": 2.8833772314896108e-05, "loss": 0.0, "step": 2893 }, { "epoch": 0.8469417617793386, "grad_norm": 0.0015240572392940521, "learning_rate": 2.8826455955516536e-05, "loss": 0.0, "step": 2894 }, { "epoch": 0.8472344161545216, "grad_norm": 0.021614955738186836, "learning_rate": 2.8819139596136964e-05, "loss": 0.0002, "step": 2895 }, { "epoch": 0.8475270705297044, "grad_norm": 0.006151202600449324, "learning_rate": 2.881182323675739e-05, "loss": 0.0001, "step": 2896 }, { "epoch": 0.8478197249048873, "grad_norm": 0.0035713522229343653, "learning_rate": 2.880450687737782e-05, "loss": 0.0001, "step": 2897 }, { "epoch": 0.8481123792800702, "grad_norm": 0.0021826657466590405, "learning_rate": 2.8797190517998247e-05, "loss": 0.0, "step": 2898 }, { "epoch": 0.8484050336552531, "grad_norm": 0.0003361078561283648, "learning_rate": 2.8789874158618675e-05, "loss": 0.0, "step": 2899 }, { "epoch": 0.8486976880304361, "grad_norm": 0.014857086353003979, "learning_rate": 2.87825577992391e-05, "loss": 0.0001, "step": 2900 }, { "epoch": 0.848990342405619, "grad_norm": 0.00031917868182063103, "learning_rate": 2.8775241439859528e-05, "loss": 0.0, "step": 2901 }, { "epoch": 0.8492829967808019, "grad_norm": 15.271645545959473, "learning_rate": 2.8767925080479956e-05, "loss": 0.1044, "step": 2902 }, { "epoch": 0.8495756511559848, "grad_norm": 0.0025875659193843603, "learning_rate": 2.876060872110038e-05, "loss": 0.0, "step": 2903 }, { "epoch": 0.8498683055311677, "grad_norm": 0.004049929790198803, "learning_rate": 2.8753292361720808e-05, "loss": 0.0001, "step": 2904 }, { "epoch": 0.8501609599063507, "grad_norm": 0.10546530038118362, "learning_rate": 2.8745976002341236e-05, "loss": 0.0007, "step": 2905 }, { "epoch": 0.8504536142815335, "grad_norm": 0.0020989186596125364, "learning_rate": 2.8738659642961664e-05, "loss": 0.0, "step": 2906 }, { "epoch": 0.8507462686567164, "grad_norm": 0.026156364008784294, "learning_rate": 2.8731343283582092e-05, "loss": 0.0003, "step": 2907 }, { "epoch": 0.8510389230318993, "grad_norm": 0.0009036893025040627, "learning_rate": 2.8724026924202517e-05, "loss": 0.0, "step": 2908 }, { "epoch": 0.8513315774070822, "grad_norm": 5.80036735534668, "learning_rate": 2.8716710564822945e-05, "loss": 0.0665, "step": 2909 }, { "epoch": 0.8516242317822651, "grad_norm": 0.0036561263259500265, "learning_rate": 2.8709394205443373e-05, "loss": 0.0001, "step": 2910 }, { "epoch": 0.8519168861574481, "grad_norm": 0.0010862121125683188, "learning_rate": 2.87020778460638e-05, "loss": 0.0, "step": 2911 }, { "epoch": 0.852209540532631, "grad_norm": 0.0026491752360016108, "learning_rate": 2.869476148668423e-05, "loss": 0.0001, "step": 2912 }, { "epoch": 0.8525021949078139, "grad_norm": 0.0032964395359158516, "learning_rate": 2.8687445127304653e-05, "loss": 0.0001, "step": 2913 }, { "epoch": 0.8527948492829968, "grad_norm": 0.008203510195016861, "learning_rate": 2.868012876792508e-05, "loss": 0.0001, "step": 2914 }, { "epoch": 0.8530875036581796, "grad_norm": 0.00029596927925013006, "learning_rate": 2.867281240854551e-05, "loss": 0.0, "step": 2915 }, { "epoch": 0.8533801580333626, "grad_norm": 1.0822234153747559, "learning_rate": 2.8665496049165937e-05, "loss": 0.0047, "step": 2916 }, { "epoch": 0.8536728124085455, "grad_norm": 0.003979158587753773, "learning_rate": 2.865817968978636e-05, "loss": 0.0001, "step": 2917 }, { "epoch": 0.8539654667837284, "grad_norm": 0.0018156712176278234, "learning_rate": 2.865086333040679e-05, "loss": 0.0, "step": 2918 }, { "epoch": 0.8542581211589113, "grad_norm": 0.006645577028393745, "learning_rate": 2.8643546971027217e-05, "loss": 0.0001, "step": 2919 }, { "epoch": 0.8545507755340942, "grad_norm": 0.0036059929989278316, "learning_rate": 2.8636230611647645e-05, "loss": 0.0001, "step": 2920 }, { "epoch": 0.8548434299092772, "grad_norm": 0.0005414785700850189, "learning_rate": 2.8628914252268073e-05, "loss": 0.0, "step": 2921 }, { "epoch": 0.8551360842844601, "grad_norm": 0.00039202620973810554, "learning_rate": 2.86215978928885e-05, "loss": 0.0, "step": 2922 }, { "epoch": 0.855428738659643, "grad_norm": 0.0024136367719620466, "learning_rate": 2.861428153350893e-05, "loss": 0.0, "step": 2923 }, { "epoch": 0.8557213930348259, "grad_norm": 0.0005258520250208676, "learning_rate": 2.8606965174129354e-05, "loss": 0.0, "step": 2924 }, { "epoch": 0.8560140474100087, "grad_norm": 0.00053996971109882, "learning_rate": 2.859964881474978e-05, "loss": 0.0, "step": 2925 }, { "epoch": 0.8563067017851916, "grad_norm": 0.005987247452139854, "learning_rate": 2.8592332455370206e-05, "loss": 0.0001, "step": 2926 }, { "epoch": 0.8565993561603746, "grad_norm": 0.0031112045980989933, "learning_rate": 2.8585016095990634e-05, "loss": 0.0001, "step": 2927 }, { "epoch": 0.8568920105355575, "grad_norm": 0.0008803472737781703, "learning_rate": 2.8577699736611062e-05, "loss": 0.0, "step": 2928 }, { "epoch": 0.8571846649107404, "grad_norm": 0.0006713624461553991, "learning_rate": 2.857038337723149e-05, "loss": 0.0, "step": 2929 }, { "epoch": 0.8574773192859233, "grad_norm": 0.0008445119019597769, "learning_rate": 2.8563067017851918e-05, "loss": 0.0, "step": 2930 }, { "epoch": 0.8577699736611062, "grad_norm": 0.0017099477117881179, "learning_rate": 2.8555750658472346e-05, "loss": 0.0, "step": 2931 }, { "epoch": 0.8580626280362892, "grad_norm": 0.0013064577942714095, "learning_rate": 2.8548434299092774e-05, "loss": 0.0, "step": 2932 }, { "epoch": 0.8583552824114721, "grad_norm": 0.05210070684552193, "learning_rate": 2.8541117939713202e-05, "loss": 0.0003, "step": 2933 }, { "epoch": 0.858647936786655, "grad_norm": 0.0011810348369181156, "learning_rate": 2.853380158033363e-05, "loss": 0.0, "step": 2934 }, { "epoch": 0.8589405911618379, "grad_norm": 0.016888394951820374, "learning_rate": 2.852648522095405e-05, "loss": 0.0001, "step": 2935 }, { "epoch": 0.8592332455370207, "grad_norm": 3.309821128845215, "learning_rate": 2.851916886157448e-05, "loss": 0.0082, "step": 2936 }, { "epoch": 0.8595258999122037, "grad_norm": 0.004678688012063503, "learning_rate": 2.8511852502194907e-05, "loss": 0.0001, "step": 2937 }, { "epoch": 0.8598185542873866, "grad_norm": 0.0011509255273267627, "learning_rate": 2.8504536142815335e-05, "loss": 0.0, "step": 2938 }, { "epoch": 0.8601112086625695, "grad_norm": 0.0011290708789601922, "learning_rate": 2.8497219783435763e-05, "loss": 0.0, "step": 2939 }, { "epoch": 0.8604038630377524, "grad_norm": 0.0021077506244182587, "learning_rate": 2.848990342405619e-05, "loss": 0.0, "step": 2940 }, { "epoch": 0.8606965174129353, "grad_norm": 0.0008194958209060133, "learning_rate": 2.848258706467662e-05, "loss": 0.0, "step": 2941 }, { "epoch": 0.8609891717881182, "grad_norm": 0.00027053439407609403, "learning_rate": 2.8475270705297047e-05, "loss": 0.0, "step": 2942 }, { "epoch": 0.8612818261633012, "grad_norm": 0.002523620380088687, "learning_rate": 2.8467954345917475e-05, "loss": 0.0, "step": 2943 }, { "epoch": 0.8615744805384841, "grad_norm": 0.00047940504737198353, "learning_rate": 2.8460637986537903e-05, "loss": 0.0, "step": 2944 }, { "epoch": 0.861867134913667, "grad_norm": 0.0005567455664277077, "learning_rate": 2.8453321627158324e-05, "loss": 0.0, "step": 2945 }, { "epoch": 0.8621597892888498, "grad_norm": 0.0006807685131207108, "learning_rate": 2.8446005267778752e-05, "loss": 0.0, "step": 2946 }, { "epoch": 0.8624524436640327, "grad_norm": 10.231255531311035, "learning_rate": 2.843868890839918e-05, "loss": 0.0348, "step": 2947 }, { "epoch": 0.8627450980392157, "grad_norm": 0.0005288583342917264, "learning_rate": 2.8431372549019608e-05, "loss": 0.0, "step": 2948 }, { "epoch": 0.8630377524143986, "grad_norm": 13.133936882019043, "learning_rate": 2.8424056189640036e-05, "loss": 0.0892, "step": 2949 }, { "epoch": 0.8633304067895815, "grad_norm": 0.2714103162288666, "learning_rate": 2.8416739830260464e-05, "loss": 0.0011, "step": 2950 }, { "epoch": 0.8636230611647644, "grad_norm": 0.0431370884180069, "learning_rate": 2.840942347088089e-05, "loss": 0.0003, "step": 2951 }, { "epoch": 0.8639157155399473, "grad_norm": 0.011839745566248894, "learning_rate": 2.840210711150132e-05, "loss": 0.0001, "step": 2952 }, { "epoch": 0.8642083699151303, "grad_norm": 0.0012024779571220279, "learning_rate": 2.8394790752121747e-05, "loss": 0.0, "step": 2953 }, { "epoch": 0.8645010242903132, "grad_norm": 0.0008249465608969331, "learning_rate": 2.8387474392742175e-05, "loss": 0.0, "step": 2954 }, { "epoch": 0.8647936786654961, "grad_norm": 5.485530376434326, "learning_rate": 2.8380158033362603e-05, "loss": 0.15, "step": 2955 }, { "epoch": 0.865086333040679, "grad_norm": 0.0009516372811049223, "learning_rate": 2.8372841673983024e-05, "loss": 0.0, "step": 2956 }, { "epoch": 0.8653789874158618, "grad_norm": 0.004190864972770214, "learning_rate": 2.8365525314603452e-05, "loss": 0.0001, "step": 2957 }, { "epoch": 0.8656716417910447, "grad_norm": 0.0011981577845290303, "learning_rate": 2.835820895522388e-05, "loss": 0.0, "step": 2958 }, { "epoch": 0.8659642961662277, "grad_norm": 0.011278665624558926, "learning_rate": 2.8350892595844308e-05, "loss": 0.0001, "step": 2959 }, { "epoch": 0.8662569505414106, "grad_norm": 0.0007399932947009802, "learning_rate": 2.8343576236464736e-05, "loss": 0.0, "step": 2960 }, { "epoch": 0.8665496049165935, "grad_norm": 0.0057645561173558235, "learning_rate": 2.8336259877085164e-05, "loss": 0.0001, "step": 2961 }, { "epoch": 0.8668422592917764, "grad_norm": 0.027825020253658295, "learning_rate": 2.8328943517705592e-05, "loss": 0.0002, "step": 2962 }, { "epoch": 0.8671349136669593, "grad_norm": 0.03514590486884117, "learning_rate": 2.832162715832602e-05, "loss": 0.0001, "step": 2963 }, { "epoch": 0.8674275680421423, "grad_norm": 0.03552441671490669, "learning_rate": 2.8314310798946448e-05, "loss": 0.0002, "step": 2964 }, { "epoch": 0.8677202224173252, "grad_norm": 0.005665469449013472, "learning_rate": 2.8306994439566876e-05, "loss": 0.0001, "step": 2965 }, { "epoch": 0.868012876792508, "grad_norm": 0.4229358434677124, "learning_rate": 2.8299678080187297e-05, "loss": 0.0016, "step": 2966 }, { "epoch": 0.8683055311676909, "grad_norm": 0.3137822449207306, "learning_rate": 2.8292361720807725e-05, "loss": 0.0018, "step": 2967 }, { "epoch": 0.8685981855428738, "grad_norm": 0.002080155536532402, "learning_rate": 2.8285045361428153e-05, "loss": 0.0001, "step": 2968 }, { "epoch": 0.8688908399180568, "grad_norm": 0.0007805654313415289, "learning_rate": 2.827772900204858e-05, "loss": 0.0, "step": 2969 }, { "epoch": 0.8691834942932397, "grad_norm": 0.002341997576877475, "learning_rate": 2.827041264266901e-05, "loss": 0.0001, "step": 2970 }, { "epoch": 0.8694761486684226, "grad_norm": 0.00467673409730196, "learning_rate": 2.8263096283289437e-05, "loss": 0.0, "step": 2971 }, { "epoch": 0.8697688030436055, "grad_norm": 0.0031030906829982996, "learning_rate": 2.8255779923909865e-05, "loss": 0.0001, "step": 2972 }, { "epoch": 0.8700614574187884, "grad_norm": 0.0004940580110996962, "learning_rate": 2.8248463564530293e-05, "loss": 0.0, "step": 2973 }, { "epoch": 0.8703541117939714, "grad_norm": 0.0012688792776316404, "learning_rate": 2.824114720515072e-05, "loss": 0.0, "step": 2974 }, { "epoch": 0.8706467661691543, "grad_norm": 0.03329317644238472, "learning_rate": 2.823383084577115e-05, "loss": 0.0004, "step": 2975 }, { "epoch": 0.8709394205443372, "grad_norm": 0.05283841863274574, "learning_rate": 2.8226514486391577e-05, "loss": 0.0003, "step": 2976 }, { "epoch": 0.87123207491952, "grad_norm": 3.264716863632202, "learning_rate": 2.8219198127011998e-05, "loss": 0.0067, "step": 2977 }, { "epoch": 0.8715247292947029, "grad_norm": 0.009444973431527615, "learning_rate": 2.8211881767632426e-05, "loss": 0.0001, "step": 2978 }, { "epoch": 0.8718173836698858, "grad_norm": 0.0039008858148008585, "learning_rate": 2.8204565408252854e-05, "loss": 0.0001, "step": 2979 }, { "epoch": 0.8721100380450688, "grad_norm": 0.0014422088861465454, "learning_rate": 2.819724904887328e-05, "loss": 0.0, "step": 2980 }, { "epoch": 0.8724026924202517, "grad_norm": 0.006031569559127092, "learning_rate": 2.818993268949371e-05, "loss": 0.0001, "step": 2981 }, { "epoch": 0.8726953467954346, "grad_norm": 0.0257797259837389, "learning_rate": 2.8182616330114138e-05, "loss": 0.0003, "step": 2982 }, { "epoch": 0.8729880011706175, "grad_norm": 0.02930227294564247, "learning_rate": 2.8175299970734566e-05, "loss": 0.0003, "step": 2983 }, { "epoch": 0.8732806555458004, "grad_norm": 0.021732913330197334, "learning_rate": 2.8167983611354993e-05, "loss": 0.0002, "step": 2984 }, { "epoch": 0.8735733099209834, "grad_norm": 0.005003643687814474, "learning_rate": 2.816066725197542e-05, "loss": 0.0001, "step": 2985 }, { "epoch": 0.8738659642961663, "grad_norm": 0.014906673692166805, "learning_rate": 2.815335089259585e-05, "loss": 0.0002, "step": 2986 }, { "epoch": 0.8741586186713491, "grad_norm": 0.003577263094484806, "learning_rate": 2.8146034533216274e-05, "loss": 0.0001, "step": 2987 }, { "epoch": 0.874451273046532, "grad_norm": 0.6646336317062378, "learning_rate": 2.81387181738367e-05, "loss": 0.003, "step": 2988 }, { "epoch": 0.8747439274217149, "grad_norm": 0.01488159317523241, "learning_rate": 2.8131401814457126e-05, "loss": 0.0001, "step": 2989 }, { "epoch": 0.8750365817968979, "grad_norm": 6.319203853607178, "learning_rate": 2.8124085455077554e-05, "loss": 0.2414, "step": 2990 }, { "epoch": 0.8753292361720808, "grad_norm": 0.0002458333328831941, "learning_rate": 2.8116769095697982e-05, "loss": 0.0, "step": 2991 }, { "epoch": 0.8756218905472637, "grad_norm": 0.0015180050395429134, "learning_rate": 2.810945273631841e-05, "loss": 0.0, "step": 2992 }, { "epoch": 0.8759145449224466, "grad_norm": 0.00021006070892326534, "learning_rate": 2.8102136376938838e-05, "loss": 0.0, "step": 2993 }, { "epoch": 0.8762071992976295, "grad_norm": 0.00108836661092937, "learning_rate": 2.8094820017559266e-05, "loss": 0.0, "step": 2994 }, { "epoch": 0.8764998536728124, "grad_norm": 0.000900108425412327, "learning_rate": 2.808750365817969e-05, "loss": 0.0, "step": 2995 }, { "epoch": 0.8767925080479954, "grad_norm": 0.0017437812639400363, "learning_rate": 2.808018729880012e-05, "loss": 0.0, "step": 2996 }, { "epoch": 0.8770851624231782, "grad_norm": 0.03165074437856674, "learning_rate": 2.8072870939420547e-05, "loss": 0.0002, "step": 2997 }, { "epoch": 0.8773778167983611, "grad_norm": 0.0017918091034516692, "learning_rate": 2.806555458004097e-05, "loss": 0.0, "step": 2998 }, { "epoch": 0.877670471173544, "grad_norm": 0.03214191272854805, "learning_rate": 2.80582382206614e-05, "loss": 0.0002, "step": 2999 }, { "epoch": 0.8779631255487269, "grad_norm": 0.001971530495211482, "learning_rate": 2.8050921861281827e-05, "loss": 0.0, "step": 3000 }, { "epoch": 0.8782557799239099, "grad_norm": 7.419076919555664, "learning_rate": 2.8043605501902255e-05, "loss": 0.0408, "step": 3001 }, { "epoch": 0.8785484342990928, "grad_norm": 0.18764744699001312, "learning_rate": 2.8036289142522683e-05, "loss": 0.0012, "step": 3002 }, { "epoch": 0.8788410886742757, "grad_norm": 0.03457354009151459, "learning_rate": 2.8028972783143108e-05, "loss": 0.0003, "step": 3003 }, { "epoch": 0.8791337430494586, "grad_norm": 0.0014142461586743593, "learning_rate": 2.8021656423763536e-05, "loss": 0.0, "step": 3004 }, { "epoch": 0.8794263974246415, "grad_norm": 0.021508987993001938, "learning_rate": 2.8014340064383964e-05, "loss": 0.0001, "step": 3005 }, { "epoch": 0.8797190517998245, "grad_norm": 0.00040742603596299887, "learning_rate": 2.800702370500439e-05, "loss": 0.0, "step": 3006 }, { "epoch": 0.8800117061750073, "grad_norm": 0.001650528167374432, "learning_rate": 2.799970734562482e-05, "loss": 0.0, "step": 3007 }, { "epoch": 0.8803043605501902, "grad_norm": 0.002789534628391266, "learning_rate": 2.7992390986245247e-05, "loss": 0.0, "step": 3008 }, { "epoch": 0.8805970149253731, "grad_norm": 0.0006244821124710143, "learning_rate": 2.7985074626865672e-05, "loss": 0.0, "step": 3009 }, { "epoch": 0.880889669300556, "grad_norm": 0.003858131356537342, "learning_rate": 2.79777582674861e-05, "loss": 0.0001, "step": 3010 }, { "epoch": 0.8811823236757389, "grad_norm": 0.0015467230696231127, "learning_rate": 2.7970441908106524e-05, "loss": 0.0, "step": 3011 }, { "epoch": 0.8814749780509219, "grad_norm": 0.0018741864478215575, "learning_rate": 2.7963125548726952e-05, "loss": 0.0, "step": 3012 }, { "epoch": 0.8817676324261048, "grad_norm": 0.0017647253116592765, "learning_rate": 2.795580918934738e-05, "loss": 0.0, "step": 3013 }, { "epoch": 0.8820602868012877, "grad_norm": 0.0015287159476429224, "learning_rate": 2.7948492829967808e-05, "loss": 0.0, "step": 3014 }, { "epoch": 0.8823529411764706, "grad_norm": 0.0004398477612994611, "learning_rate": 2.7941176470588236e-05, "loss": 0.0, "step": 3015 }, { "epoch": 0.8826455955516535, "grad_norm": 0.0002965580206364393, "learning_rate": 2.7933860111208664e-05, "loss": 0.0, "step": 3016 }, { "epoch": 0.8829382499268364, "grad_norm": 0.0009375095833092928, "learning_rate": 2.7926543751829092e-05, "loss": 0.0, "step": 3017 }, { "epoch": 0.8832309043020193, "grad_norm": 0.0011114409426227212, "learning_rate": 2.791922739244952e-05, "loss": 0.0, "step": 3018 }, { "epoch": 0.8835235586772022, "grad_norm": 0.000699687167070806, "learning_rate": 2.7911911033069948e-05, "loss": 0.0, "step": 3019 }, { "epoch": 0.8838162130523851, "grad_norm": 0.001565412851050496, "learning_rate": 2.790459467369037e-05, "loss": 0.0, "step": 3020 }, { "epoch": 0.884108867427568, "grad_norm": 0.02848893404006958, "learning_rate": 2.7897278314310797e-05, "loss": 0.0002, "step": 3021 }, { "epoch": 0.884401521802751, "grad_norm": 0.0034479028545320034, "learning_rate": 2.7889961954931225e-05, "loss": 0.0001, "step": 3022 }, { "epoch": 0.8846941761779339, "grad_norm": 0.0010538590140640736, "learning_rate": 2.7882645595551653e-05, "loss": 0.0, "step": 3023 }, { "epoch": 0.8849868305531168, "grad_norm": 0.0004554866754915565, "learning_rate": 2.787532923617208e-05, "loss": 0.0, "step": 3024 }, { "epoch": 0.8852794849282997, "grad_norm": 0.010468067601323128, "learning_rate": 2.786801287679251e-05, "loss": 0.0001, "step": 3025 }, { "epoch": 0.8855721393034826, "grad_norm": 0.0002714238653425127, "learning_rate": 2.7860696517412937e-05, "loss": 0.0, "step": 3026 }, { "epoch": 0.8858647936786654, "grad_norm": 0.006320650223642588, "learning_rate": 2.7853380158033365e-05, "loss": 0.0001, "step": 3027 }, { "epoch": 0.8861574480538484, "grad_norm": 0.010992786847054958, "learning_rate": 2.7846063798653793e-05, "loss": 0.0001, "step": 3028 }, { "epoch": 0.8864501024290313, "grad_norm": 0.0016546098049730062, "learning_rate": 2.783874743927422e-05, "loss": 0.0, "step": 3029 }, { "epoch": 0.8867427568042142, "grad_norm": 0.002650155918672681, "learning_rate": 2.7831431079894642e-05, "loss": 0.0001, "step": 3030 }, { "epoch": 0.8870354111793971, "grad_norm": 0.0011368491686880589, "learning_rate": 2.782411472051507e-05, "loss": 0.0, "step": 3031 }, { "epoch": 0.88732806555458, "grad_norm": 0.0006310976459644735, "learning_rate": 2.7816798361135498e-05, "loss": 0.0, "step": 3032 }, { "epoch": 0.887620719929763, "grad_norm": 0.009759177453815937, "learning_rate": 2.7809482001755926e-05, "loss": 0.0001, "step": 3033 }, { "epoch": 0.8879133743049459, "grad_norm": 0.7568905353546143, "learning_rate": 2.7802165642376354e-05, "loss": 0.0035, "step": 3034 }, { "epoch": 0.8882060286801288, "grad_norm": 0.0009570059482939541, "learning_rate": 2.779484928299678e-05, "loss": 0.0, "step": 3035 }, { "epoch": 0.8884986830553117, "grad_norm": 0.002224736148491502, "learning_rate": 2.778753292361721e-05, "loss": 0.0, "step": 3036 }, { "epoch": 0.8887913374304945, "grad_norm": 0.018412360921502113, "learning_rate": 2.7780216564237638e-05, "loss": 0.0001, "step": 3037 }, { "epoch": 0.8890839918056775, "grad_norm": 0.0011815952602773905, "learning_rate": 2.7772900204858066e-05, "loss": 0.0, "step": 3038 }, { "epoch": 0.8893766461808604, "grad_norm": 0.0011148378252983093, "learning_rate": 2.7765583845478493e-05, "loss": 0.0, "step": 3039 }, { "epoch": 0.8896693005560433, "grad_norm": 0.0016473204595968127, "learning_rate": 2.775826748609892e-05, "loss": 0.0, "step": 3040 }, { "epoch": 0.8899619549312262, "grad_norm": 0.002104657469317317, "learning_rate": 2.7750951126719343e-05, "loss": 0.0, "step": 3041 }, { "epoch": 0.8902546093064091, "grad_norm": 0.0009894543327391148, "learning_rate": 2.774363476733977e-05, "loss": 0.0, "step": 3042 }, { "epoch": 0.8905472636815921, "grad_norm": 0.0005898270173929632, "learning_rate": 2.77363184079602e-05, "loss": 0.0, "step": 3043 }, { "epoch": 0.890839918056775, "grad_norm": 0.0008705668733455241, "learning_rate": 2.7729002048580626e-05, "loss": 0.0, "step": 3044 }, { "epoch": 0.8911325724319579, "grad_norm": 0.0008987328037619591, "learning_rate": 2.7721685689201054e-05, "loss": 0.0, "step": 3045 }, { "epoch": 0.8914252268071408, "grad_norm": 0.02109546959400177, "learning_rate": 2.7714369329821482e-05, "loss": 0.0002, "step": 3046 }, { "epoch": 0.8917178811823236, "grad_norm": 0.0006454390822909772, "learning_rate": 2.770705297044191e-05, "loss": 0.0, "step": 3047 }, { "epoch": 0.8920105355575065, "grad_norm": 0.0006983923376537859, "learning_rate": 2.7699736611062338e-05, "loss": 0.0, "step": 3048 }, { "epoch": 0.8923031899326895, "grad_norm": 0.001889776554889977, "learning_rate": 2.7692420251682766e-05, "loss": 0.0, "step": 3049 }, { "epoch": 0.8925958443078724, "grad_norm": 2.0714917182922363, "learning_rate": 2.7685103892303194e-05, "loss": 0.0056, "step": 3050 }, { "epoch": 0.8928884986830553, "grad_norm": 0.03971258923411369, "learning_rate": 2.7677787532923615e-05, "loss": 0.0003, "step": 3051 }, { "epoch": 0.8931811530582382, "grad_norm": 0.0026012498419731855, "learning_rate": 2.7670471173544043e-05, "loss": 0.0, "step": 3052 }, { "epoch": 0.8934738074334211, "grad_norm": 0.019663626328110695, "learning_rate": 2.766315481416447e-05, "loss": 0.0002, "step": 3053 }, { "epoch": 0.8937664618086041, "grad_norm": 0.0005580433062277734, "learning_rate": 2.76558384547849e-05, "loss": 0.0, "step": 3054 }, { "epoch": 0.894059116183787, "grad_norm": 0.0014611040242016315, "learning_rate": 2.7648522095405327e-05, "loss": 0.0, "step": 3055 }, { "epoch": 0.8943517705589699, "grad_norm": 0.0009221715736202896, "learning_rate": 2.7641205736025755e-05, "loss": 0.0, "step": 3056 }, { "epoch": 0.8946444249341527, "grad_norm": 0.00030951714143157005, "learning_rate": 2.7633889376646183e-05, "loss": 0.0, "step": 3057 }, { "epoch": 0.8949370793093356, "grad_norm": 0.00037072168197482824, "learning_rate": 2.762657301726661e-05, "loss": 0.0, "step": 3058 }, { "epoch": 0.8952297336845186, "grad_norm": 0.0011817470658570528, "learning_rate": 2.761925665788704e-05, "loss": 0.0, "step": 3059 }, { "epoch": 0.8955223880597015, "grad_norm": 0.0008959379629231989, "learning_rate": 2.7611940298507467e-05, "loss": 0.0, "step": 3060 }, { "epoch": 0.8958150424348844, "grad_norm": 0.00038279814179986715, "learning_rate": 2.7604623939127895e-05, "loss": 0.0, "step": 3061 }, { "epoch": 0.8961076968100673, "grad_norm": 1.5234986543655396, "learning_rate": 2.7597307579748316e-05, "loss": 0.0032, "step": 3062 }, { "epoch": 0.8964003511852502, "grad_norm": 0.0007021187921054661, "learning_rate": 2.7589991220368744e-05, "loss": 0.0, "step": 3063 }, { "epoch": 0.8966930055604331, "grad_norm": 0.0006102448678575456, "learning_rate": 2.7582674860989172e-05, "loss": 0.0, "step": 3064 }, { "epoch": 0.8969856599356161, "grad_norm": 8.276649475097656, "learning_rate": 2.75753585016096e-05, "loss": 0.04, "step": 3065 }, { "epoch": 0.897278314310799, "grad_norm": 11.039922714233398, "learning_rate": 2.7568042142230028e-05, "loss": 0.0428, "step": 3066 }, { "epoch": 0.8975709686859819, "grad_norm": 0.00047941674711182714, "learning_rate": 2.7560725782850456e-05, "loss": 0.0, "step": 3067 }, { "epoch": 0.8978636230611647, "grad_norm": 0.0007145427516661584, "learning_rate": 2.7553409423470884e-05, "loss": 0.0, "step": 3068 }, { "epoch": 0.8981562774363476, "grad_norm": 0.002789260121062398, "learning_rate": 2.754609306409131e-05, "loss": 0.0, "step": 3069 }, { "epoch": 0.8984489318115306, "grad_norm": 0.000519847497344017, "learning_rate": 2.753877670471174e-05, "loss": 0.0, "step": 3070 }, { "epoch": 0.8987415861867135, "grad_norm": 0.001295949099585414, "learning_rate": 2.7531460345332168e-05, "loss": 0.0, "step": 3071 }, { "epoch": 0.8990342405618964, "grad_norm": 0.0006953420233912766, "learning_rate": 2.7524143985952596e-05, "loss": 0.0, "step": 3072 }, { "epoch": 0.8993268949370793, "grad_norm": 0.001858212286606431, "learning_rate": 2.7516827626573017e-05, "loss": 0.0, "step": 3073 }, { "epoch": 0.8996195493122622, "grad_norm": 0.007349081337451935, "learning_rate": 2.7509511267193445e-05, "loss": 0.0001, "step": 3074 }, { "epoch": 0.8999122036874452, "grad_norm": 0.0004943243111483753, "learning_rate": 2.7502194907813873e-05, "loss": 0.0, "step": 3075 }, { "epoch": 0.9002048580626281, "grad_norm": 0.0209695715457201, "learning_rate": 2.74948785484343e-05, "loss": 0.0002, "step": 3076 }, { "epoch": 0.900497512437811, "grad_norm": 0.00024704058887436986, "learning_rate": 2.748756218905473e-05, "loss": 0.0, "step": 3077 }, { "epoch": 0.9007901668129938, "grad_norm": 0.0019234855426475406, "learning_rate": 2.7480245829675156e-05, "loss": 0.0, "step": 3078 }, { "epoch": 0.9010828211881767, "grad_norm": 0.0011848368449136615, "learning_rate": 2.7472929470295584e-05, "loss": 0.0, "step": 3079 }, { "epoch": 0.9013754755633596, "grad_norm": 0.006680505815893412, "learning_rate": 2.7465613110916012e-05, "loss": 0.0001, "step": 3080 }, { "epoch": 0.9016681299385426, "grad_norm": 0.006065751425921917, "learning_rate": 2.745829675153644e-05, "loss": 0.0001, "step": 3081 }, { "epoch": 0.9019607843137255, "grad_norm": 0.0021666118409484625, "learning_rate": 2.7450980392156865e-05, "loss": 0.0, "step": 3082 }, { "epoch": 0.9022534386889084, "grad_norm": 0.0295471902936697, "learning_rate": 2.744366403277729e-05, "loss": 0.0001, "step": 3083 }, { "epoch": 0.9025460930640913, "grad_norm": 0.0008403001120314002, "learning_rate": 2.7436347673397717e-05, "loss": 0.0, "step": 3084 }, { "epoch": 0.9028387474392742, "grad_norm": 0.0363955982029438, "learning_rate": 2.7429031314018145e-05, "loss": 0.0002, "step": 3085 }, { "epoch": 0.9031314018144572, "grad_norm": 0.003731783712282777, "learning_rate": 2.7421714954638573e-05, "loss": 0.0, "step": 3086 }, { "epoch": 0.9034240561896401, "grad_norm": 0.0009479248546995223, "learning_rate": 2.7414398595259e-05, "loss": 0.0, "step": 3087 }, { "epoch": 0.903716710564823, "grad_norm": 0.08302231878042221, "learning_rate": 2.740708223587943e-05, "loss": 0.0004, "step": 3088 }, { "epoch": 0.9040093649400058, "grad_norm": 0.012477376498281956, "learning_rate": 2.7399765876499857e-05, "loss": 0.0001, "step": 3089 }, { "epoch": 0.9043020193151887, "grad_norm": 0.004335212055593729, "learning_rate": 2.739244951712028e-05, "loss": 0.0, "step": 3090 }, { "epoch": 0.9045946736903717, "grad_norm": 0.0004597943334374577, "learning_rate": 2.738513315774071e-05, "loss": 0.0, "step": 3091 }, { "epoch": 0.9048873280655546, "grad_norm": 0.0008222962496802211, "learning_rate": 2.7377816798361138e-05, "loss": 0.0, "step": 3092 }, { "epoch": 0.9051799824407375, "grad_norm": 0.024342892691493034, "learning_rate": 2.7370500438981566e-05, "loss": 0.0002, "step": 3093 }, { "epoch": 0.9054726368159204, "grad_norm": 0.0015932725509628654, "learning_rate": 2.736318407960199e-05, "loss": 0.0, "step": 3094 }, { "epoch": 0.9057652911911033, "grad_norm": 0.01010272279381752, "learning_rate": 2.7355867720222418e-05, "loss": 0.0001, "step": 3095 }, { "epoch": 0.9060579455662863, "grad_norm": 0.00036504954914562404, "learning_rate": 2.7348551360842846e-05, "loss": 0.0, "step": 3096 }, { "epoch": 0.9063505999414692, "grad_norm": 0.0004988229484297335, "learning_rate": 2.7341235001463274e-05, "loss": 0.0, "step": 3097 }, { "epoch": 0.906643254316652, "grad_norm": 0.040057141333818436, "learning_rate": 2.73339186420837e-05, "loss": 0.0002, "step": 3098 }, { "epoch": 0.9069359086918349, "grad_norm": 0.00041190991760231555, "learning_rate": 2.7326602282704126e-05, "loss": 0.0, "step": 3099 }, { "epoch": 0.9072285630670178, "grad_norm": 0.0003888402134180069, "learning_rate": 2.7319285923324554e-05, "loss": 0.0, "step": 3100 }, { "epoch": 0.9075212174422007, "grad_norm": 0.0060388194397091866, "learning_rate": 2.7311969563944982e-05, "loss": 0.0, "step": 3101 }, { "epoch": 0.9078138718173837, "grad_norm": 0.00023331062402576208, "learning_rate": 2.730465320456541e-05, "loss": 0.0, "step": 3102 }, { "epoch": 0.9081065261925666, "grad_norm": 0.003220916260033846, "learning_rate": 2.7297336845185838e-05, "loss": 0.0, "step": 3103 }, { "epoch": 0.9083991805677495, "grad_norm": 0.00027145931380800903, "learning_rate": 2.7290020485806263e-05, "loss": 0.0, "step": 3104 }, { "epoch": 0.9086918349429324, "grad_norm": 0.0002177278947783634, "learning_rate": 2.728270412642669e-05, "loss": 0.0, "step": 3105 }, { "epoch": 0.9089844893181153, "grad_norm": 0.0003419017593842, "learning_rate": 2.7275387767047115e-05, "loss": 0.0, "step": 3106 }, { "epoch": 0.9092771436932983, "grad_norm": 0.00884864293038845, "learning_rate": 2.7268071407667543e-05, "loss": 0.0001, "step": 3107 }, { "epoch": 0.9095697980684812, "grad_norm": 0.009763057343661785, "learning_rate": 2.726075504828797e-05, "loss": 0.0001, "step": 3108 }, { "epoch": 0.909862452443664, "grad_norm": 0.0008707040688022971, "learning_rate": 2.72534386889084e-05, "loss": 0.0, "step": 3109 }, { "epoch": 0.9101551068188469, "grad_norm": 0.0002548248157836497, "learning_rate": 2.7246122329528827e-05, "loss": 0.0, "step": 3110 }, { "epoch": 0.9104477611940298, "grad_norm": 0.00037588385748676956, "learning_rate": 2.7238805970149255e-05, "loss": 0.0, "step": 3111 }, { "epoch": 0.9107404155692128, "grad_norm": 0.0010243757860735059, "learning_rate": 2.7231489610769683e-05, "loss": 0.0, "step": 3112 }, { "epoch": 0.9110330699443957, "grad_norm": 0.0007129418663680553, "learning_rate": 2.722417325139011e-05, "loss": 0.0, "step": 3113 }, { "epoch": 0.9113257243195786, "grad_norm": 0.000473760039312765, "learning_rate": 2.721685689201054e-05, "loss": 0.0, "step": 3114 }, { "epoch": 0.9116183786947615, "grad_norm": 0.00029461312806233764, "learning_rate": 2.720954053263096e-05, "loss": 0.0, "step": 3115 }, { "epoch": 0.9119110330699444, "grad_norm": 0.0006559625035151839, "learning_rate": 2.7202224173251388e-05, "loss": 0.0, "step": 3116 }, { "epoch": 0.9122036874451273, "grad_norm": 0.05397697910666466, "learning_rate": 2.7194907813871816e-05, "loss": 0.0003, "step": 3117 }, { "epoch": 0.9124963418203103, "grad_norm": 0.0002400352677796036, "learning_rate": 2.7187591454492244e-05, "loss": 0.0, "step": 3118 }, { "epoch": 0.9127889961954931, "grad_norm": 0.37905460596084595, "learning_rate": 2.7180275095112672e-05, "loss": 0.0008, "step": 3119 }, { "epoch": 0.913081650570676, "grad_norm": 0.0003496919816825539, "learning_rate": 2.71729587357331e-05, "loss": 0.0, "step": 3120 }, { "epoch": 0.9133743049458589, "grad_norm": 0.000749769329559058, "learning_rate": 2.7165642376353528e-05, "loss": 0.0, "step": 3121 }, { "epoch": 0.9136669593210418, "grad_norm": 0.005695801693946123, "learning_rate": 2.7158326016973956e-05, "loss": 0.0, "step": 3122 }, { "epoch": 0.9139596136962248, "grad_norm": 0.00032327789813280106, "learning_rate": 2.7151009657594384e-05, "loss": 0.0, "step": 3123 }, { "epoch": 0.9142522680714077, "grad_norm": 0.00010946913971565664, "learning_rate": 2.714369329821481e-05, "loss": 0.0, "step": 3124 }, { "epoch": 0.9145449224465906, "grad_norm": 0.000152256601722911, "learning_rate": 2.713637693883524e-05, "loss": 0.0, "step": 3125 }, { "epoch": 0.9148375768217735, "grad_norm": 0.0009119808673858643, "learning_rate": 2.712906057945566e-05, "loss": 0.0, "step": 3126 }, { "epoch": 0.9151302311969564, "grad_norm": 0.00021451925567816943, "learning_rate": 2.712174422007609e-05, "loss": 0.0, "step": 3127 }, { "epoch": 0.9154228855721394, "grad_norm": 0.005684440489858389, "learning_rate": 2.7114427860696517e-05, "loss": 0.0, "step": 3128 }, { "epoch": 0.9157155399473222, "grad_norm": 0.0006643623928539455, "learning_rate": 2.7107111501316945e-05, "loss": 0.0, "step": 3129 }, { "epoch": 0.9160081943225051, "grad_norm": 0.00016802526079118252, "learning_rate": 2.7099795141937373e-05, "loss": 0.0, "step": 3130 }, { "epoch": 0.916300848697688, "grad_norm": 7.920035362243652, "learning_rate": 2.70924787825578e-05, "loss": 0.0216, "step": 3131 }, { "epoch": 0.9165935030728709, "grad_norm": 0.00022262395941652358, "learning_rate": 2.708516242317823e-05, "loss": 0.0, "step": 3132 }, { "epoch": 0.9168861574480538, "grad_norm": 0.00024071757798083127, "learning_rate": 2.7077846063798656e-05, "loss": 0.0, "step": 3133 }, { "epoch": 0.9171788118232368, "grad_norm": 0.00023998609685804695, "learning_rate": 2.7070529704419084e-05, "loss": 0.0, "step": 3134 }, { "epoch": 0.9174714661984197, "grad_norm": 0.0003449968062341213, "learning_rate": 2.7063213345039512e-05, "loss": 0.0, "step": 3135 }, { "epoch": 0.9177641205736026, "grad_norm": 0.0002522218564990908, "learning_rate": 2.7055896985659934e-05, "loss": 0.0, "step": 3136 }, { "epoch": 0.9180567749487855, "grad_norm": 0.0006752046756446362, "learning_rate": 2.704858062628036e-05, "loss": 0.0, "step": 3137 }, { "epoch": 0.9183494293239683, "grad_norm": 0.0299922376871109, "learning_rate": 2.704126426690079e-05, "loss": 0.0001, "step": 3138 }, { "epoch": 0.9186420836991513, "grad_norm": 0.01105990819633007, "learning_rate": 2.7033947907521217e-05, "loss": 0.0001, "step": 3139 }, { "epoch": 0.9189347380743342, "grad_norm": 0.15487124025821686, "learning_rate": 2.7026631548141645e-05, "loss": 0.0004, "step": 3140 }, { "epoch": 0.9192273924495171, "grad_norm": 0.15040268003940582, "learning_rate": 2.7019315188762073e-05, "loss": 0.0004, "step": 3141 }, { "epoch": 0.9195200468247, "grad_norm": 0.0076209064573049545, "learning_rate": 2.70119988293825e-05, "loss": 0.0, "step": 3142 }, { "epoch": 0.9198127011998829, "grad_norm": 0.03322245180606842, "learning_rate": 2.700468247000293e-05, "loss": 0.0002, "step": 3143 }, { "epoch": 0.9201053555750659, "grad_norm": 0.0031604596879333258, "learning_rate": 2.6997366110623357e-05, "loss": 0.0, "step": 3144 }, { "epoch": 0.9203980099502488, "grad_norm": 0.00011157146946061403, "learning_rate": 2.6990049751243785e-05, "loss": 0.0, "step": 3145 }, { "epoch": 0.9206906643254317, "grad_norm": 0.00012219483323860914, "learning_rate": 2.6982733391864213e-05, "loss": 0.0, "step": 3146 }, { "epoch": 0.9209833187006146, "grad_norm": 0.9359664916992188, "learning_rate": 2.6975417032484634e-05, "loss": 0.0021, "step": 3147 }, { "epoch": 0.9212759730757974, "grad_norm": 0.0023510432802140713, "learning_rate": 2.6968100673105062e-05, "loss": 0.0, "step": 3148 }, { "epoch": 0.9215686274509803, "grad_norm": 0.00047637286479584873, "learning_rate": 2.696078431372549e-05, "loss": 0.0, "step": 3149 }, { "epoch": 0.9218612818261633, "grad_norm": 0.00031984152155928314, "learning_rate": 2.6953467954345918e-05, "loss": 0.0, "step": 3150 }, { "epoch": 0.9221539362013462, "grad_norm": 0.00027832211344502866, "learning_rate": 2.6946151594966346e-05, "loss": 0.0, "step": 3151 }, { "epoch": 0.9224465905765291, "grad_norm": 0.00016787606000434607, "learning_rate": 2.6938835235586774e-05, "loss": 0.0, "step": 3152 }, { "epoch": 0.922739244951712, "grad_norm": 0.0006489913794212043, "learning_rate": 2.6931518876207202e-05, "loss": 0.0, "step": 3153 }, { "epoch": 0.9230318993268949, "grad_norm": 8.885025454219431e-05, "learning_rate": 2.692420251682763e-05, "loss": 0.0, "step": 3154 }, { "epoch": 0.9233245537020779, "grad_norm": 8.944397268351167e-05, "learning_rate": 2.6916886157448058e-05, "loss": 0.0, "step": 3155 }, { "epoch": 0.9236172080772608, "grad_norm": 8.712815179023892e-05, "learning_rate": 2.6909569798068486e-05, "loss": 0.0, "step": 3156 }, { "epoch": 0.9239098624524437, "grad_norm": 0.0003146608651150018, "learning_rate": 2.6902253438688914e-05, "loss": 0.0, "step": 3157 }, { "epoch": 0.9242025168276266, "grad_norm": 0.00010711107461247593, "learning_rate": 2.6894937079309335e-05, "loss": 0.0, "step": 3158 }, { "epoch": 0.9244951712028094, "grad_norm": 0.00014019818627275527, "learning_rate": 2.6887620719929763e-05, "loss": 0.0, "step": 3159 }, { "epoch": 0.9247878255779924, "grad_norm": 0.00018984345661010593, "learning_rate": 2.688030436055019e-05, "loss": 0.0, "step": 3160 }, { "epoch": 0.9250804799531753, "grad_norm": 0.00010449805267853662, "learning_rate": 2.687298800117062e-05, "loss": 0.0, "step": 3161 }, { "epoch": 0.9253731343283582, "grad_norm": 0.05337434262037277, "learning_rate": 2.6865671641791047e-05, "loss": 0.0002, "step": 3162 }, { "epoch": 0.9256657887035411, "grad_norm": 13.599655151367188, "learning_rate": 2.6858355282411475e-05, "loss": 0.0578, "step": 3163 }, { "epoch": 0.925958443078724, "grad_norm": 0.00011503922723932192, "learning_rate": 2.6851038923031903e-05, "loss": 0.0, "step": 3164 }, { "epoch": 0.926251097453907, "grad_norm": 0.00014844737597741187, "learning_rate": 2.684372256365233e-05, "loss": 0.0, "step": 3165 }, { "epoch": 0.9265437518290899, "grad_norm": 0.00011671489482978359, "learning_rate": 2.683640620427276e-05, "loss": 0.0, "step": 3166 }, { "epoch": 0.9268364062042728, "grad_norm": 8.918878302210942e-05, "learning_rate": 2.6829089844893186e-05, "loss": 0.0, "step": 3167 }, { "epoch": 0.9271290605794557, "grad_norm": 0.00026356399757787585, "learning_rate": 2.6821773485513608e-05, "loss": 0.0, "step": 3168 }, { "epoch": 0.9274217149546385, "grad_norm": 0.00011778157204389572, "learning_rate": 2.6814457126134036e-05, "loss": 0.0, "step": 3169 }, { "epoch": 0.9277143693298214, "grad_norm": 0.00011025131971109658, "learning_rate": 2.6807140766754463e-05, "loss": 0.0, "step": 3170 }, { "epoch": 0.9280070237050044, "grad_norm": 0.0004025105736218393, "learning_rate": 2.679982440737489e-05, "loss": 0.0, "step": 3171 }, { "epoch": 0.9282996780801873, "grad_norm": 0.007656876929104328, "learning_rate": 2.679250804799532e-05, "loss": 0.0, "step": 3172 }, { "epoch": 0.9285923324553702, "grad_norm": 0.0003975860890932381, "learning_rate": 2.6785191688615747e-05, "loss": 0.0, "step": 3173 }, { "epoch": 0.9288849868305531, "grad_norm": 0.001018122653476894, "learning_rate": 2.6777875329236175e-05, "loss": 0.0, "step": 3174 }, { "epoch": 0.929177641205736, "grad_norm": 0.0026253890246152878, "learning_rate": 2.6770558969856603e-05, "loss": 0.0, "step": 3175 }, { "epoch": 0.929470295580919, "grad_norm": 0.00011285820073680952, "learning_rate": 2.676324261047703e-05, "loss": 0.0, "step": 3176 }, { "epoch": 0.9297629499561019, "grad_norm": 0.006903521716594696, "learning_rate": 2.6755926251097456e-05, "loss": 0.0, "step": 3177 }, { "epoch": 0.9300556043312848, "grad_norm": 0.0006428345805034041, "learning_rate": 2.6748609891717884e-05, "loss": 0.0, "step": 3178 }, { "epoch": 0.9303482587064676, "grad_norm": 0.00021251077123451978, "learning_rate": 2.6741293532338308e-05, "loss": 0.0, "step": 3179 }, { "epoch": 0.9306409130816505, "grad_norm": 0.0028959973715245724, "learning_rate": 2.6733977172958736e-05, "loss": 0.0, "step": 3180 }, { "epoch": 0.9309335674568335, "grad_norm": 25.46250343322754, "learning_rate": 2.6726660813579164e-05, "loss": 0.0592, "step": 3181 }, { "epoch": 0.9312262218320164, "grad_norm": 0.9013259410858154, "learning_rate": 2.6719344454199592e-05, "loss": 0.0015, "step": 3182 }, { "epoch": 0.9315188762071993, "grad_norm": 0.00012032218364765868, "learning_rate": 2.671202809482002e-05, "loss": 0.0, "step": 3183 }, { "epoch": 0.9318115305823822, "grad_norm": 0.061714570969343185, "learning_rate": 2.6704711735440448e-05, "loss": 0.0003, "step": 3184 }, { "epoch": 0.9321041849575651, "grad_norm": 0.003692185739055276, "learning_rate": 2.6697395376060873e-05, "loss": 0.0, "step": 3185 }, { "epoch": 0.932396839332748, "grad_norm": 8.049399375915527, "learning_rate": 2.66900790166813e-05, "loss": 0.0124, "step": 3186 }, { "epoch": 0.932689493707931, "grad_norm": 7.663945143576711e-05, "learning_rate": 2.668276265730173e-05, "loss": 0.0, "step": 3187 }, { "epoch": 0.9329821480831139, "grad_norm": 9.304416744271293e-05, "learning_rate": 2.6675446297922156e-05, "loss": 0.0, "step": 3188 }, { "epoch": 0.9332748024582967, "grad_norm": 3.968400415033102e-05, "learning_rate": 2.666812993854258e-05, "loss": 0.0, "step": 3189 }, { "epoch": 0.9335674568334796, "grad_norm": 0.018485285341739655, "learning_rate": 2.666081357916301e-05, "loss": 0.0001, "step": 3190 }, { "epoch": 0.9338601112086625, "grad_norm": 0.000890921161044389, "learning_rate": 2.6653497219783437e-05, "loss": 0.0, "step": 3191 }, { "epoch": 0.9341527655838455, "grad_norm": 0.00018055552209261805, "learning_rate": 2.6646180860403865e-05, "loss": 0.0, "step": 3192 }, { "epoch": 0.9344454199590284, "grad_norm": 0.0004028046387247741, "learning_rate": 2.663886450102429e-05, "loss": 0.0, "step": 3193 }, { "epoch": 0.9347380743342113, "grad_norm": 0.021287068724632263, "learning_rate": 2.6631548141644717e-05, "loss": 0.0001, "step": 3194 }, { "epoch": 0.9350307287093942, "grad_norm": 21.67721939086914, "learning_rate": 2.6624231782265145e-05, "loss": 0.0499, "step": 3195 }, { "epoch": 0.9353233830845771, "grad_norm": 0.0007531607989221811, "learning_rate": 2.6616915422885573e-05, "loss": 0.0, "step": 3196 }, { "epoch": 0.9356160374597601, "grad_norm": 0.00021644483786076307, "learning_rate": 2.6609599063506e-05, "loss": 0.0, "step": 3197 }, { "epoch": 0.935908691834943, "grad_norm": 0.002615989651530981, "learning_rate": 2.660228270412643e-05, "loss": 0.0, "step": 3198 }, { "epoch": 0.9362013462101259, "grad_norm": 0.00013541265798266977, "learning_rate": 2.6594966344746857e-05, "loss": 0.0, "step": 3199 }, { "epoch": 0.9364940005853087, "grad_norm": 0.00019994725880678743, "learning_rate": 2.658764998536728e-05, "loss": 0.0, "step": 3200 }, { "epoch": 0.9367866549604916, "grad_norm": 0.001617005211301148, "learning_rate": 2.6580333625987706e-05, "loss": 0.0, "step": 3201 }, { "epoch": 0.9370793093356745, "grad_norm": 0.0008476759539917111, "learning_rate": 2.6573017266608134e-05, "loss": 0.0, "step": 3202 }, { "epoch": 0.9373719637108575, "grad_norm": 0.0002651185786817223, "learning_rate": 2.6565700907228562e-05, "loss": 0.0, "step": 3203 }, { "epoch": 0.9376646180860404, "grad_norm": 0.0006702494574710727, "learning_rate": 2.655838454784899e-05, "loss": 0.0, "step": 3204 }, { "epoch": 0.9379572724612233, "grad_norm": 0.0005622314638458192, "learning_rate": 2.6551068188469418e-05, "loss": 0.0, "step": 3205 }, { "epoch": 0.9382499268364062, "grad_norm": 9.157440185546875, "learning_rate": 2.6543751829089846e-05, "loss": 0.0189, "step": 3206 }, { "epoch": 0.9385425812115891, "grad_norm": 0.024700021371245384, "learning_rate": 2.6536435469710274e-05, "loss": 0.0004, "step": 3207 }, { "epoch": 0.9388352355867721, "grad_norm": 21.003908157348633, "learning_rate": 2.6529119110330702e-05, "loss": 0.3449, "step": 3208 }, { "epoch": 0.939127889961955, "grad_norm": 0.0007336666458286345, "learning_rate": 2.652180275095113e-05, "loss": 0.0, "step": 3209 }, { "epoch": 0.9394205443371378, "grad_norm": 0.0008810044964775443, "learning_rate": 2.6514486391571558e-05, "loss": 0.0, "step": 3210 }, { "epoch": 0.9397131987123207, "grad_norm": 0.0016570580191910267, "learning_rate": 2.650717003219198e-05, "loss": 0.0, "step": 3211 }, { "epoch": 0.9400058530875036, "grad_norm": 0.00028649967862293124, "learning_rate": 2.6499853672812407e-05, "loss": 0.0, "step": 3212 }, { "epoch": 0.9402985074626866, "grad_norm": 0.0003914633998647332, "learning_rate": 2.6492537313432835e-05, "loss": 0.0, "step": 3213 }, { "epoch": 0.9405911618378695, "grad_norm": 0.000271236669505015, "learning_rate": 2.6485220954053263e-05, "loss": 0.0, "step": 3214 }, { "epoch": 0.9408838162130524, "grad_norm": 0.0004488322592806071, "learning_rate": 2.647790459467369e-05, "loss": 0.0, "step": 3215 }, { "epoch": 0.9411764705882353, "grad_norm": 1.9253593683242798, "learning_rate": 2.647058823529412e-05, "loss": 0.0028, "step": 3216 }, { "epoch": 0.9414691249634182, "grad_norm": 0.061426468193531036, "learning_rate": 2.6463271875914547e-05, "loss": 0.0002, "step": 3217 }, { "epoch": 0.9417617793386011, "grad_norm": 9.975981712341309, "learning_rate": 2.6455955516534975e-05, "loss": 0.0201, "step": 3218 }, { "epoch": 0.9420544337137841, "grad_norm": 0.055726345628499985, "learning_rate": 2.6448639157155403e-05, "loss": 0.0002, "step": 3219 }, { "epoch": 0.9423470880889669, "grad_norm": 22.72140121459961, "learning_rate": 2.644132279777583e-05, "loss": 0.0659, "step": 3220 }, { "epoch": 0.9426397424641498, "grad_norm": 0.001565615995787084, "learning_rate": 2.643400643839625e-05, "loss": 0.0, "step": 3221 }, { "epoch": 0.9429323968393327, "grad_norm": 0.0002118179836543277, "learning_rate": 2.642669007901668e-05, "loss": 0.0, "step": 3222 }, { "epoch": 0.9432250512145156, "grad_norm": 0.00017638143617659807, "learning_rate": 2.6419373719637108e-05, "loss": 0.0, "step": 3223 }, { "epoch": 0.9435177055896986, "grad_norm": 0.008058113977313042, "learning_rate": 2.6412057360257536e-05, "loss": 0.0, "step": 3224 }, { "epoch": 0.9438103599648815, "grad_norm": 0.00033659982727840543, "learning_rate": 2.6404741000877963e-05, "loss": 0.0, "step": 3225 }, { "epoch": 0.9441030143400644, "grad_norm": 0.0003911230305675417, "learning_rate": 2.639742464149839e-05, "loss": 0.0, "step": 3226 }, { "epoch": 0.9443956687152473, "grad_norm": 0.0328420028090477, "learning_rate": 2.639010828211882e-05, "loss": 0.0002, "step": 3227 }, { "epoch": 0.9446883230904302, "grad_norm": 14.501241683959961, "learning_rate": 2.6382791922739247e-05, "loss": 0.1735, "step": 3228 }, { "epoch": 0.9449809774656132, "grad_norm": 0.6613368391990662, "learning_rate": 2.6375475563359675e-05, "loss": 0.0013, "step": 3229 }, { "epoch": 0.945273631840796, "grad_norm": 9.179767608642578, "learning_rate": 2.6368159203980103e-05, "loss": 0.1605, "step": 3230 }, { "epoch": 0.9455662862159789, "grad_norm": 8.901062965393066, "learning_rate": 2.636084284460053e-05, "loss": 0.0386, "step": 3231 }, { "epoch": 0.9458589405911618, "grad_norm": 0.00024640775518491864, "learning_rate": 2.6353526485220952e-05, "loss": 0.0, "step": 3232 }, { "epoch": 0.9461515949663447, "grad_norm": 0.0004362701147329062, "learning_rate": 2.634621012584138e-05, "loss": 0.0, "step": 3233 }, { "epoch": 0.9464442493415277, "grad_norm": 0.0012233969755470753, "learning_rate": 2.6338893766461808e-05, "loss": 0.0, "step": 3234 }, { "epoch": 0.9467369037167106, "grad_norm": 0.0006996267475187778, "learning_rate": 2.6331577407082236e-05, "loss": 0.0, "step": 3235 }, { "epoch": 0.9470295580918935, "grad_norm": 0.020976858213543892, "learning_rate": 2.6324261047702664e-05, "loss": 0.0002, "step": 3236 }, { "epoch": 0.9473222124670764, "grad_norm": 0.00276585784740746, "learning_rate": 2.6316944688323092e-05, "loss": 0.0, "step": 3237 }, { "epoch": 0.9476148668422593, "grad_norm": 22.640687942504883, "learning_rate": 2.630962832894352e-05, "loss": 0.0774, "step": 3238 }, { "epoch": 0.9479075212174422, "grad_norm": 0.007990888319909573, "learning_rate": 2.6302311969563948e-05, "loss": 0.0001, "step": 3239 }, { "epoch": 0.9482001755926251, "grad_norm": 11.964275360107422, "learning_rate": 2.6294995610184376e-05, "loss": 0.1295, "step": 3240 }, { "epoch": 0.948492829967808, "grad_norm": 12.758728981018066, "learning_rate": 2.6287679250804804e-05, "loss": 0.1538, "step": 3241 }, { "epoch": 0.9487854843429909, "grad_norm": 0.0004913790035061538, "learning_rate": 2.6280362891425225e-05, "loss": 0.0, "step": 3242 }, { "epoch": 0.9490781387181738, "grad_norm": 0.0006277439533732831, "learning_rate": 2.6273046532045653e-05, "loss": 0.0, "step": 3243 }, { "epoch": 0.9493707930933567, "grad_norm": 0.0029538183007389307, "learning_rate": 2.626573017266608e-05, "loss": 0.0, "step": 3244 }, { "epoch": 0.9496634474685397, "grad_norm": 0.013482455164194107, "learning_rate": 2.625841381328651e-05, "loss": 0.0001, "step": 3245 }, { "epoch": 0.9499561018437226, "grad_norm": 0.001173302996903658, "learning_rate": 2.6251097453906937e-05, "loss": 0.0, "step": 3246 }, { "epoch": 0.9502487562189055, "grad_norm": 0.0009915552800521255, "learning_rate": 2.6243781094527365e-05, "loss": 0.0, "step": 3247 }, { "epoch": 0.9505414105940884, "grad_norm": 0.0009076668065972626, "learning_rate": 2.6236464735147793e-05, "loss": 0.0, "step": 3248 }, { "epoch": 0.9508340649692713, "grad_norm": 0.012135384604334831, "learning_rate": 2.622914837576822e-05, "loss": 0.0001, "step": 3249 }, { "epoch": 0.9511267193444543, "grad_norm": 0.00030453107319772243, "learning_rate": 2.622183201638865e-05, "loss": 0.0, "step": 3250 }, { "epoch": 0.9514193737196371, "grad_norm": 0.0006158557371236384, "learning_rate": 2.6214515657009077e-05, "loss": 0.0, "step": 3251 }, { "epoch": 0.95171202809482, "grad_norm": 0.0005525677115656435, "learning_rate": 2.6207199297629505e-05, "loss": 0.0, "step": 3252 }, { "epoch": 0.9520046824700029, "grad_norm": 0.011062745936214924, "learning_rate": 2.6199882938249926e-05, "loss": 0.0001, "step": 3253 }, { "epoch": 0.9522973368451858, "grad_norm": 3.0753278732299805, "learning_rate": 2.6192566578870354e-05, "loss": 0.0056, "step": 3254 }, { "epoch": 0.9525899912203687, "grad_norm": 0.00169378484133631, "learning_rate": 2.618525021949078e-05, "loss": 0.0, "step": 3255 }, { "epoch": 0.9528826455955517, "grad_norm": 2.393634557723999, "learning_rate": 2.617793386011121e-05, "loss": 0.0049, "step": 3256 }, { "epoch": 0.9531752999707346, "grad_norm": 0.0027157398872077465, "learning_rate": 2.6170617500731638e-05, "loss": 0.0, "step": 3257 }, { "epoch": 0.9534679543459175, "grad_norm": 0.0003597254981286824, "learning_rate": 2.6163301141352066e-05, "loss": 0.0, "step": 3258 }, { "epoch": 0.9537606087211004, "grad_norm": 0.0020881923846900463, "learning_rate": 2.6155984781972493e-05, "loss": 0.0, "step": 3259 }, { "epoch": 0.9540532630962832, "grad_norm": 0.0012122966581955552, "learning_rate": 2.614866842259292e-05, "loss": 0.0, "step": 3260 }, { "epoch": 0.9543459174714662, "grad_norm": 0.002034051576629281, "learning_rate": 2.614135206321335e-05, "loss": 0.0, "step": 3261 }, { "epoch": 0.9546385718466491, "grad_norm": 0.004289246164262295, "learning_rate": 2.6134035703833777e-05, "loss": 0.0001, "step": 3262 }, { "epoch": 0.954931226221832, "grad_norm": 9.942416363628581e-05, "learning_rate": 2.6126719344454202e-05, "loss": 0.0, "step": 3263 }, { "epoch": 0.9552238805970149, "grad_norm": 0.2390059232711792, "learning_rate": 2.6119402985074626e-05, "loss": 0.0007, "step": 3264 }, { "epoch": 0.9555165349721978, "grad_norm": 0.028897233307361603, "learning_rate": 2.6112086625695054e-05, "loss": 0.0002, "step": 3265 }, { "epoch": 0.9558091893473808, "grad_norm": 0.5268934369087219, "learning_rate": 2.6104770266315482e-05, "loss": 0.0018, "step": 3266 }, { "epoch": 0.9561018437225637, "grad_norm": 0.3045479655265808, "learning_rate": 2.609745390693591e-05, "loss": 0.0007, "step": 3267 }, { "epoch": 0.9563944980977466, "grad_norm": 0.0052806478925049305, "learning_rate": 2.6090137547556338e-05, "loss": 0.0001, "step": 3268 }, { "epoch": 0.9566871524729295, "grad_norm": 4.63300895690918, "learning_rate": 2.6082821188176766e-05, "loss": 0.2525, "step": 3269 }, { "epoch": 0.9569798068481123, "grad_norm": 0.0004651243216358125, "learning_rate": 2.6075504828797194e-05, "loss": 0.0, "step": 3270 }, { "epoch": 0.9572724612232952, "grad_norm": 0.023930110037326813, "learning_rate": 2.606818846941762e-05, "loss": 0.0001, "step": 3271 }, { "epoch": 0.9575651155984782, "grad_norm": 0.0017391935689374804, "learning_rate": 2.6060872110038047e-05, "loss": 0.0, "step": 3272 }, { "epoch": 0.9578577699736611, "grad_norm": 1.1058738231658936, "learning_rate": 2.6053555750658475e-05, "loss": 0.0033, "step": 3273 }, { "epoch": 0.958150424348844, "grad_norm": 0.006419755984097719, "learning_rate": 2.60462393912789e-05, "loss": 0.0001, "step": 3274 }, { "epoch": 0.9584430787240269, "grad_norm": 0.012653257697820663, "learning_rate": 2.6038923031899327e-05, "loss": 0.0001, "step": 3275 }, { "epoch": 0.9587357330992098, "grad_norm": 1.6761451959609985, "learning_rate": 2.6031606672519755e-05, "loss": 0.0049, "step": 3276 }, { "epoch": 0.9590283874743928, "grad_norm": 0.004024661611765623, "learning_rate": 2.6024290313140183e-05, "loss": 0.0, "step": 3277 }, { "epoch": 0.9593210418495757, "grad_norm": 0.018077418208122253, "learning_rate": 2.601697395376061e-05, "loss": 0.0001, "step": 3278 }, { "epoch": 0.9596136962247586, "grad_norm": 0.0010719620622694492, "learning_rate": 2.6009657594381036e-05, "loss": 0.0, "step": 3279 }, { "epoch": 0.9599063505999414, "grad_norm": 0.005239298567175865, "learning_rate": 2.6002341235001463e-05, "loss": 0.0001, "step": 3280 }, { "epoch": 0.9601990049751243, "grad_norm": 0.007799623534083366, "learning_rate": 2.599502487562189e-05, "loss": 0.0001, "step": 3281 }, { "epoch": 0.9604916593503073, "grad_norm": 0.018759027123451233, "learning_rate": 2.598770851624232e-05, "loss": 0.0002, "step": 3282 }, { "epoch": 0.9607843137254902, "grad_norm": 0.028783559799194336, "learning_rate": 2.5980392156862747e-05, "loss": 0.0002, "step": 3283 }, { "epoch": 0.9610769681006731, "grad_norm": 0.018134091049432755, "learning_rate": 2.5973075797483175e-05, "loss": 0.0001, "step": 3284 }, { "epoch": 0.961369622475856, "grad_norm": 0.0472426600754261, "learning_rate": 2.59657594381036e-05, "loss": 0.0003, "step": 3285 }, { "epoch": 0.9616622768510389, "grad_norm": 0.01522039994597435, "learning_rate": 2.5958443078724028e-05, "loss": 0.0001, "step": 3286 }, { "epoch": 0.9619549312262218, "grad_norm": 11.493000984191895, "learning_rate": 2.5951126719344452e-05, "loss": 0.108, "step": 3287 }, { "epoch": 0.9622475856014048, "grad_norm": 0.10344574600458145, "learning_rate": 2.594381035996488e-05, "loss": 0.0004, "step": 3288 }, { "epoch": 0.9625402399765877, "grad_norm": 1.291723370552063, "learning_rate": 2.5936494000585308e-05, "loss": 0.0027, "step": 3289 }, { "epoch": 0.9628328943517706, "grad_norm": 0.011618785560131073, "learning_rate": 2.5929177641205736e-05, "loss": 0.0001, "step": 3290 }, { "epoch": 0.9631255487269534, "grad_norm": 0.0016451601404696703, "learning_rate": 2.5921861281826164e-05, "loss": 0.0, "step": 3291 }, { "epoch": 0.9634182031021363, "grad_norm": 0.04313068464398384, "learning_rate": 2.5914544922446592e-05, "loss": 0.0003, "step": 3292 }, { "epoch": 0.9637108574773193, "grad_norm": 0.00029622757574543357, "learning_rate": 2.590722856306702e-05, "loss": 0.0, "step": 3293 }, { "epoch": 0.9640035118525022, "grad_norm": 0.00045671130646951497, "learning_rate": 2.5899912203687448e-05, "loss": 0.0, "step": 3294 }, { "epoch": 0.9642961662276851, "grad_norm": 0.002273781690746546, "learning_rate": 2.5892595844307873e-05, "loss": 0.0, "step": 3295 }, { "epoch": 0.964588820602868, "grad_norm": 0.0009717473876662552, "learning_rate": 2.5885279484928297e-05, "loss": 0.0, "step": 3296 }, { "epoch": 0.9648814749780509, "grad_norm": 0.0005301141645759344, "learning_rate": 2.5877963125548725e-05, "loss": 0.0, "step": 3297 }, { "epoch": 0.9651741293532339, "grad_norm": 0.0018403874710202217, "learning_rate": 2.5870646766169153e-05, "loss": 0.0, "step": 3298 }, { "epoch": 0.9654667837284168, "grad_norm": 0.0006049801595509052, "learning_rate": 2.586333040678958e-05, "loss": 0.0, "step": 3299 }, { "epoch": 0.9657594381035997, "grad_norm": 0.00038890886935405433, "learning_rate": 2.585601404741001e-05, "loss": 0.0, "step": 3300 }, { "epoch": 0.9660520924787825, "grad_norm": 0.0020230012014508247, "learning_rate": 2.5848697688030437e-05, "loss": 0.0, "step": 3301 }, { "epoch": 0.9663447468539654, "grad_norm": 0.07929252833127975, "learning_rate": 2.5841381328650865e-05, "loss": 0.0003, "step": 3302 }, { "epoch": 0.9666374012291484, "grad_norm": 0.0015152136329561472, "learning_rate": 2.5834064969271293e-05, "loss": 0.0, "step": 3303 }, { "epoch": 0.9669300556043313, "grad_norm": 0.13640671968460083, "learning_rate": 2.582674860989172e-05, "loss": 0.0005, "step": 3304 }, { "epoch": 0.9672227099795142, "grad_norm": 0.0044171796180307865, "learning_rate": 2.581943225051215e-05, "loss": 0.0001, "step": 3305 }, { "epoch": 0.9675153643546971, "grad_norm": 3.793172597885132, "learning_rate": 2.581211589113257e-05, "loss": 0.0068, "step": 3306 }, { "epoch": 0.96780801872988, "grad_norm": 0.000829192460514605, "learning_rate": 2.5804799531752998e-05, "loss": 0.0, "step": 3307 }, { "epoch": 0.9681006731050629, "grad_norm": 0.018820637837052345, "learning_rate": 2.5797483172373426e-05, "loss": 0.0001, "step": 3308 }, { "epoch": 0.9683933274802459, "grad_norm": 0.0009690960869193077, "learning_rate": 2.5790166812993854e-05, "loss": 0.0, "step": 3309 }, { "epoch": 0.9686859818554288, "grad_norm": 0.2188994586467743, "learning_rate": 2.578285045361428e-05, "loss": 0.0009, "step": 3310 }, { "epoch": 0.9689786362306116, "grad_norm": 0.0027362063992768526, "learning_rate": 2.577553409423471e-05, "loss": 0.0, "step": 3311 }, { "epoch": 0.9692712906057945, "grad_norm": 0.0008724553044885397, "learning_rate": 2.5768217734855138e-05, "loss": 0.0, "step": 3312 }, { "epoch": 0.9695639449809774, "grad_norm": 0.009658461436629295, "learning_rate": 2.5760901375475566e-05, "loss": 0.0001, "step": 3313 }, { "epoch": 0.9698565993561604, "grad_norm": 0.00034940437762998044, "learning_rate": 2.5753585016095993e-05, "loss": 0.0, "step": 3314 }, { "epoch": 0.9701492537313433, "grad_norm": 0.0005051441839896142, "learning_rate": 2.574626865671642e-05, "loss": 0.0, "step": 3315 }, { "epoch": 0.9704419081065262, "grad_norm": 0.0007315681432373822, "learning_rate": 2.573895229733685e-05, "loss": 0.0, "step": 3316 }, { "epoch": 0.9707345624817091, "grad_norm": 0.0003318546514492482, "learning_rate": 2.573163593795727e-05, "loss": 0.0, "step": 3317 }, { "epoch": 0.971027216856892, "grad_norm": 0.0004913939046673477, "learning_rate": 2.57243195785777e-05, "loss": 0.0, "step": 3318 }, { "epoch": 0.971319871232075, "grad_norm": 0.00020777047029696405, "learning_rate": 2.5717003219198126e-05, "loss": 0.0, "step": 3319 }, { "epoch": 0.9716125256072579, "grad_norm": 0.0025297177489846945, "learning_rate": 2.5709686859818554e-05, "loss": 0.0, "step": 3320 }, { "epoch": 0.9719051799824407, "grad_norm": 0.013613739982247353, "learning_rate": 2.5702370500438982e-05, "loss": 0.0001, "step": 3321 }, { "epoch": 0.9721978343576236, "grad_norm": 0.0002099954435834661, "learning_rate": 2.569505414105941e-05, "loss": 0.0, "step": 3322 }, { "epoch": 0.9724904887328065, "grad_norm": 0.0019854209385812283, "learning_rate": 2.5687737781679838e-05, "loss": 0.0, "step": 3323 }, { "epoch": 0.9727831431079894, "grad_norm": 0.00042396143544465303, "learning_rate": 2.5680421422300266e-05, "loss": 0.0, "step": 3324 }, { "epoch": 0.9730757974831724, "grad_norm": 0.002329543698579073, "learning_rate": 2.5673105062920694e-05, "loss": 0.0, "step": 3325 }, { "epoch": 0.9733684518583553, "grad_norm": 0.0005398441571742296, "learning_rate": 2.5665788703541122e-05, "loss": 0.0, "step": 3326 }, { "epoch": 0.9736611062335382, "grad_norm": 0.0003080609312746674, "learning_rate": 2.5658472344161543e-05, "loss": 0.0, "step": 3327 }, { "epoch": 0.9739537606087211, "grad_norm": 0.00035853084409609437, "learning_rate": 2.565115598478197e-05, "loss": 0.0, "step": 3328 }, { "epoch": 0.974246414983904, "grad_norm": 0.0006352364434860647, "learning_rate": 2.56438396254024e-05, "loss": 0.0, "step": 3329 }, { "epoch": 0.974539069359087, "grad_norm": 0.2538038194179535, "learning_rate": 2.5636523266022827e-05, "loss": 0.0008, "step": 3330 }, { "epoch": 0.9748317237342699, "grad_norm": 0.0013529519783332944, "learning_rate": 2.5629206906643255e-05, "loss": 0.0, "step": 3331 }, { "epoch": 0.9751243781094527, "grad_norm": 0.0006466333288699389, "learning_rate": 2.5621890547263683e-05, "loss": 0.0, "step": 3332 }, { "epoch": 0.9754170324846356, "grad_norm": 0.0003267640422564, "learning_rate": 2.561457418788411e-05, "loss": 0.0, "step": 3333 }, { "epoch": 0.9757096868598185, "grad_norm": 0.00029956409707665443, "learning_rate": 2.560725782850454e-05, "loss": 0.0, "step": 3334 }, { "epoch": 0.9760023412350015, "grad_norm": 0.0002519940317142755, "learning_rate": 2.5599941469124967e-05, "loss": 0.0, "step": 3335 }, { "epoch": 0.9762949956101844, "grad_norm": 0.0004761736490763724, "learning_rate": 2.5592625109745395e-05, "loss": 0.0, "step": 3336 }, { "epoch": 0.9765876499853673, "grad_norm": 0.00024568208027631044, "learning_rate": 2.5585308750365823e-05, "loss": 0.0, "step": 3337 }, { "epoch": 0.9768803043605502, "grad_norm": 0.000524199684150517, "learning_rate": 2.5577992390986244e-05, "loss": 0.0, "step": 3338 }, { "epoch": 0.9771729587357331, "grad_norm": 0.0008691222174093127, "learning_rate": 2.5570676031606672e-05, "loss": 0.0, "step": 3339 }, { "epoch": 0.977465613110916, "grad_norm": 0.00055574846919626, "learning_rate": 2.55633596722271e-05, "loss": 0.0, "step": 3340 }, { "epoch": 0.977758267486099, "grad_norm": 0.0003735160280484706, "learning_rate": 2.5556043312847528e-05, "loss": 0.0, "step": 3341 }, { "epoch": 0.9780509218612818, "grad_norm": 0.0003160155611112714, "learning_rate": 2.5548726953467956e-05, "loss": 0.0, "step": 3342 }, { "epoch": 0.9783435762364647, "grad_norm": 0.03528832271695137, "learning_rate": 2.5541410594088384e-05, "loss": 0.0002, "step": 3343 }, { "epoch": 0.9786362306116476, "grad_norm": 0.0003516852739267051, "learning_rate": 2.553409423470881e-05, "loss": 0.0, "step": 3344 }, { "epoch": 0.9789288849868305, "grad_norm": 0.00026389810955151916, "learning_rate": 2.552677787532924e-05, "loss": 0.0, "step": 3345 }, { "epoch": 0.9792215393620135, "grad_norm": 0.0005689252284355462, "learning_rate": 2.5519461515949668e-05, "loss": 0.0, "step": 3346 }, { "epoch": 0.9795141937371964, "grad_norm": 0.00020987627794966102, "learning_rate": 2.5512145156570096e-05, "loss": 0.0, "step": 3347 }, { "epoch": 0.9798068481123793, "grad_norm": 0.0004520490183494985, "learning_rate": 2.5504828797190523e-05, "loss": 0.0, "step": 3348 }, { "epoch": 0.9800995024875622, "grad_norm": 0.00020526224398054183, "learning_rate": 2.5497512437810945e-05, "loss": 0.0, "step": 3349 }, { "epoch": 0.9803921568627451, "grad_norm": 0.0008702614577487111, "learning_rate": 2.5490196078431373e-05, "loss": 0.0, "step": 3350 }, { "epoch": 0.9806848112379281, "grad_norm": 0.00030936242546886206, "learning_rate": 2.54828797190518e-05, "loss": 0.0, "step": 3351 }, { "epoch": 0.9809774656131109, "grad_norm": 0.0038242791779339314, "learning_rate": 2.547556335967223e-05, "loss": 0.0, "step": 3352 }, { "epoch": 0.9812701199882938, "grad_norm": 0.0009497537394054234, "learning_rate": 2.5468247000292656e-05, "loss": 0.0, "step": 3353 }, { "epoch": 0.9815627743634767, "grad_norm": 0.0002302091015735641, "learning_rate": 2.5460930640913084e-05, "loss": 0.0, "step": 3354 }, { "epoch": 0.9818554287386596, "grad_norm": 0.009501870721578598, "learning_rate": 2.5453614281533512e-05, "loss": 0.0001, "step": 3355 }, { "epoch": 0.9821480831138426, "grad_norm": 0.0034651365131139755, "learning_rate": 2.544629792215394e-05, "loss": 0.0, "step": 3356 }, { "epoch": 0.9824407374890255, "grad_norm": 0.00045796119957230985, "learning_rate": 2.5438981562774368e-05, "loss": 0.0, "step": 3357 }, { "epoch": 0.9827333918642084, "grad_norm": 0.0004128154832869768, "learning_rate": 2.5431665203394793e-05, "loss": 0.0, "step": 3358 }, { "epoch": 0.9830260462393913, "grad_norm": 0.00048634555423632264, "learning_rate": 2.5424348844015217e-05, "loss": 0.0, "step": 3359 }, { "epoch": 0.9833187006145742, "grad_norm": 0.0010877094464376569, "learning_rate": 2.5417032484635645e-05, "loss": 0.0, "step": 3360 }, { "epoch": 0.983611354989757, "grad_norm": 0.00032809775439091027, "learning_rate": 2.5409716125256073e-05, "loss": 0.0, "step": 3361 }, { "epoch": 0.98390400936494, "grad_norm": 0.000183176773134619, "learning_rate": 2.54023997658765e-05, "loss": 0.0, "step": 3362 }, { "epoch": 0.9841966637401229, "grad_norm": 0.0011083828285336494, "learning_rate": 2.539508340649693e-05, "loss": 0.0, "step": 3363 }, { "epoch": 0.9844893181153058, "grad_norm": 0.00015416370297316462, "learning_rate": 2.5387767047117357e-05, "loss": 0.0, "step": 3364 }, { "epoch": 0.9847819724904887, "grad_norm": 0.00027531859814189374, "learning_rate": 2.5380450687737785e-05, "loss": 0.0, "step": 3365 }, { "epoch": 0.9850746268656716, "grad_norm": 0.00017930881585925817, "learning_rate": 2.537313432835821e-05, "loss": 0.0, "step": 3366 }, { "epoch": 0.9853672812408546, "grad_norm": 0.0003319485695101321, "learning_rate": 2.5365817968978638e-05, "loss": 0.0, "step": 3367 }, { "epoch": 0.9856599356160375, "grad_norm": 0.0010585540439933538, "learning_rate": 2.5358501609599066e-05, "loss": 0.0, "step": 3368 }, { "epoch": 0.9859525899912204, "grad_norm": 0.0002888018498197198, "learning_rate": 2.5351185250219493e-05, "loss": 0.0, "step": 3369 }, { "epoch": 0.9862452443664033, "grad_norm": 0.0015260048676282167, "learning_rate": 2.5343868890839918e-05, "loss": 0.0, "step": 3370 }, { "epoch": 0.9865378987415862, "grad_norm": 0.00014370719145517796, "learning_rate": 2.5336552531460346e-05, "loss": 0.0, "step": 3371 }, { "epoch": 0.9868305531167691, "grad_norm": 0.0002880328393075615, "learning_rate": 2.5329236172080774e-05, "loss": 0.0, "step": 3372 }, { "epoch": 0.987123207491952, "grad_norm": 0.0006448253407143056, "learning_rate": 2.5321919812701202e-05, "loss": 0.0, "step": 3373 }, { "epoch": 0.9874158618671349, "grad_norm": 0.00018244732927996665, "learning_rate": 2.5314603453321626e-05, "loss": 0.0, "step": 3374 }, { "epoch": 0.9877085162423178, "grad_norm": 0.00013968031271360815, "learning_rate": 2.5307287093942054e-05, "loss": 0.0, "step": 3375 }, { "epoch": 0.9880011706175007, "grad_norm": 0.00027053322992287576, "learning_rate": 2.5299970734562482e-05, "loss": 0.0, "step": 3376 }, { "epoch": 0.9882938249926836, "grad_norm": 1.9271124601364136, "learning_rate": 2.529265437518291e-05, "loss": 0.0028, "step": 3377 }, { "epoch": 0.9885864793678666, "grad_norm": 0.0356573611497879, "learning_rate": 2.5285338015803338e-05, "loss": 0.0002, "step": 3378 }, { "epoch": 0.9888791337430495, "grad_norm": 0.00020806578686460853, "learning_rate": 2.5278021656423766e-05, "loss": 0.0, "step": 3379 }, { "epoch": 0.9891717881182324, "grad_norm": 0.00011854932381538674, "learning_rate": 2.527070529704419e-05, "loss": 0.0, "step": 3380 }, { "epoch": 0.9894644424934153, "grad_norm": 15.745497703552246, "learning_rate": 2.526338893766462e-05, "loss": 0.0762, "step": 3381 }, { "epoch": 0.9897570968685981, "grad_norm": 0.0009089075610972941, "learning_rate": 2.5256072578285043e-05, "loss": 0.0, "step": 3382 }, { "epoch": 0.9900497512437811, "grad_norm": 0.000981273828074336, "learning_rate": 2.524875621890547e-05, "loss": 0.0, "step": 3383 }, { "epoch": 0.990342405618964, "grad_norm": 0.000403301470214501, "learning_rate": 2.52414398595259e-05, "loss": 0.0, "step": 3384 }, { "epoch": 0.9906350599941469, "grad_norm": 0.00026050262385979295, "learning_rate": 2.5234123500146327e-05, "loss": 0.0, "step": 3385 }, { "epoch": 0.9909277143693298, "grad_norm": 0.00023542552662547678, "learning_rate": 2.5226807140766755e-05, "loss": 0.0, "step": 3386 }, { "epoch": 0.9912203687445127, "grad_norm": 0.0013741771690547466, "learning_rate": 2.5219490781387183e-05, "loss": 0.0, "step": 3387 }, { "epoch": 0.9915130231196957, "grad_norm": 0.0006651601288467646, "learning_rate": 2.521217442200761e-05, "loss": 0.0, "step": 3388 }, { "epoch": 0.9918056774948786, "grad_norm": 0.12294548004865646, "learning_rate": 2.520485806262804e-05, "loss": 0.0005, "step": 3389 }, { "epoch": 0.9920983318700615, "grad_norm": 0.016826776787638664, "learning_rate": 2.5197541703248467e-05, "loss": 0.0001, "step": 3390 }, { "epoch": 0.9923909862452444, "grad_norm": 12.275745391845703, "learning_rate": 2.5190225343868888e-05, "loss": 0.1578, "step": 3391 }, { "epoch": 0.9926836406204272, "grad_norm": 0.0016533538000658154, "learning_rate": 2.5182908984489316e-05, "loss": 0.0, "step": 3392 }, { "epoch": 0.9929762949956101, "grad_norm": 10.942267417907715, "learning_rate": 2.5175592625109744e-05, "loss": 0.2113, "step": 3393 }, { "epoch": 0.9932689493707931, "grad_norm": 0.0032125513534992933, "learning_rate": 2.5168276265730172e-05, "loss": 0.0, "step": 3394 }, { "epoch": 0.993561603745976, "grad_norm": 0.0002497293462511152, "learning_rate": 2.51609599063506e-05, "loss": 0.0, "step": 3395 }, { "epoch": 0.9938542581211589, "grad_norm": 0.0044729625806212425, "learning_rate": 2.5153643546971028e-05, "loss": 0.0, "step": 3396 }, { "epoch": 0.9941469124963418, "grad_norm": 0.012294032610952854, "learning_rate": 2.5146327187591456e-05, "loss": 0.0001, "step": 3397 }, { "epoch": 0.9944395668715247, "grad_norm": 0.008075743913650513, "learning_rate": 2.5139010828211884e-05, "loss": 0.0, "step": 3398 }, { "epoch": 0.9947322212467077, "grad_norm": 0.0002726623206399381, "learning_rate": 2.513169446883231e-05, "loss": 0.0, "step": 3399 }, { "epoch": 0.9950248756218906, "grad_norm": 0.0036799306981265545, "learning_rate": 2.512437810945274e-05, "loss": 0.0, "step": 3400 }, { "epoch": 0.9953175299970735, "grad_norm": 0.0004843377973884344, "learning_rate": 2.5117061750073168e-05, "loss": 0.0, "step": 3401 }, { "epoch": 0.9956101843722563, "grad_norm": 0.2228137105703354, "learning_rate": 2.510974539069359e-05, "loss": 0.0008, "step": 3402 }, { "epoch": 0.9959028387474392, "grad_norm": 0.0007371075917035341, "learning_rate": 2.5102429031314017e-05, "loss": 0.0, "step": 3403 }, { "epoch": 0.9961954931226222, "grad_norm": 0.0014641970628872514, "learning_rate": 2.5095112671934445e-05, "loss": 0.0, "step": 3404 }, { "epoch": 0.9964881474978051, "grad_norm": 0.0005017340299673378, "learning_rate": 2.5087796312554873e-05, "loss": 0.0, "step": 3405 }, { "epoch": 0.996780801872988, "grad_norm": 1.8678332567214966, "learning_rate": 2.50804799531753e-05, "loss": 0.0023, "step": 3406 }, { "epoch": 0.9970734562481709, "grad_norm": 0.000548258947674185, "learning_rate": 2.507316359379573e-05, "loss": 0.0, "step": 3407 }, { "epoch": 0.9973661106233538, "grad_norm": 0.0020072218030691147, "learning_rate": 2.5065847234416156e-05, "loss": 0.0, "step": 3408 }, { "epoch": 0.9976587649985367, "grad_norm": 0.004497798625379801, "learning_rate": 2.5058530875036584e-05, "loss": 0.0001, "step": 3409 }, { "epoch": 0.9979514193737197, "grad_norm": 0.050592996180057526, "learning_rate": 2.5051214515657012e-05, "loss": 0.0003, "step": 3410 }, { "epoch": 0.9982440737489026, "grad_norm": 0.0006552187842316926, "learning_rate": 2.504389815627744e-05, "loss": 0.0, "step": 3411 }, { "epoch": 0.9985367281240854, "grad_norm": 11.620914459228516, "learning_rate": 2.503658179689786e-05, "loss": 0.158, "step": 3412 }, { "epoch": 0.9988293824992683, "grad_norm": 0.0039320336654782295, "learning_rate": 2.502926543751829e-05, "loss": 0.0001, "step": 3413 }, { "epoch": 0.9991220368744512, "grad_norm": 10.041006088256836, "learning_rate": 2.5021949078138717e-05, "loss": 0.016, "step": 3414 }, { "epoch": 0.9994146912496342, "grad_norm": 1.3903892040252686, "learning_rate": 2.5014632718759145e-05, "loss": 0.0032, "step": 3415 }, { "epoch": 0.9997073456248171, "grad_norm": 0.002409636974334717, "learning_rate": 2.5007316359379573e-05, "loss": 0.0, "step": 3416 }, { "epoch": 1.0, "grad_norm": 0.0005303791258484125, "learning_rate": 2.5e-05, "loss": 0.0, "step": 3417 }, { "epoch": 1.0, "eval_accuracy": 0.9995883757306331, "eval_f1": 0.9997716790721037, "eval_loss": 0.002137020230293274, "eval_precision": 0.9997260273972602, "eval_recall": 0.9998173349164308, "eval_runtime": 719.3196, "eval_samples_per_second": 16.887, "eval_steps_per_second": 1.057, "step": 3417 }, { "epoch": 1.0002926543751829, "grad_norm": 0.0005364635726436973, "learning_rate": 2.499268364062043e-05, "loss": 0.0, "step": 3418 }, { "epoch": 1.0005853087503658, "grad_norm": 0.000856778584420681, "learning_rate": 2.4985367281240857e-05, "loss": 0.0, "step": 3419 }, { "epoch": 1.0008779631255487, "grad_norm": 0.05647166445851326, "learning_rate": 2.497805092186128e-05, "loss": 0.0002, "step": 3420 }, { "epoch": 1.0011706175007316, "grad_norm": 0.002607455011457205, "learning_rate": 2.497073456248171e-05, "loss": 0.0, "step": 3421 }, { "epoch": 1.0014632718759144, "grad_norm": 0.0004365872300695628, "learning_rate": 2.4963418203102138e-05, "loss": 0.0, "step": 3422 }, { "epoch": 1.0017559262510976, "grad_norm": 2.8085110187530518, "learning_rate": 2.4956101843722566e-05, "loss": 0.0029, "step": 3423 }, { "epoch": 1.0020485806262804, "grad_norm": 0.001110740122385323, "learning_rate": 2.4948785484342993e-05, "loss": 0.0, "step": 3424 }, { "epoch": 1.0023412350014633, "grad_norm": 1.543617844581604, "learning_rate": 2.4941469124963418e-05, "loss": 0.0059, "step": 3425 }, { "epoch": 1.0026338893766462, "grad_norm": 0.0005496027297340333, "learning_rate": 2.4934152765583846e-05, "loss": 0.0, "step": 3426 }, { "epoch": 1.002926543751829, "grad_norm": 0.0006246971315704286, "learning_rate": 2.4926836406204274e-05, "loss": 0.0, "step": 3427 }, { "epoch": 1.003219198127012, "grad_norm": 0.001402894384227693, "learning_rate": 2.4919520046824702e-05, "loss": 0.0, "step": 3428 }, { "epoch": 1.0035118525021949, "grad_norm": 0.025198372080922127, "learning_rate": 2.491220368744513e-05, "loss": 0.0001, "step": 3429 }, { "epoch": 1.0038045068773778, "grad_norm": 0.001699879881925881, "learning_rate": 2.4904887328065558e-05, "loss": 0.0, "step": 3430 }, { "epoch": 1.0040971612525607, "grad_norm": 0.004124501720070839, "learning_rate": 2.4897570968685982e-05, "loss": 0.0, "step": 3431 }, { "epoch": 1.0043898156277435, "grad_norm": 0.0007139227818697691, "learning_rate": 2.489025460930641e-05, "loss": 0.0, "step": 3432 }, { "epoch": 1.0046824700029267, "grad_norm": 1.722924828529358, "learning_rate": 2.4882938249926838e-05, "loss": 0.0085, "step": 3433 }, { "epoch": 1.0049751243781095, "grad_norm": 0.0007747646304778755, "learning_rate": 2.4875621890547266e-05, "loss": 0.0, "step": 3434 }, { "epoch": 1.0052677787532924, "grad_norm": 0.0024586129002273083, "learning_rate": 2.4868305531167694e-05, "loss": 0.0, "step": 3435 }, { "epoch": 1.0055604331284753, "grad_norm": 0.00048653915291652083, "learning_rate": 2.486098917178812e-05, "loss": 0.0, "step": 3436 }, { "epoch": 1.0058530875036582, "grad_norm": 0.0017318056197836995, "learning_rate": 2.4853672812408547e-05, "loss": 0.0, "step": 3437 }, { "epoch": 1.006145741878841, "grad_norm": 0.024242956191301346, "learning_rate": 2.4846356453028975e-05, "loss": 0.0001, "step": 3438 }, { "epoch": 1.006438396254024, "grad_norm": 0.0004281763976905495, "learning_rate": 2.4839040093649403e-05, "loss": 0.0, "step": 3439 }, { "epoch": 1.0067310506292069, "grad_norm": 0.00036549000651575625, "learning_rate": 2.483172373426983e-05, "loss": 0.0, "step": 3440 }, { "epoch": 1.0070237050043898, "grad_norm": 0.0002772965235635638, "learning_rate": 2.4824407374890255e-05, "loss": 0.0, "step": 3441 }, { "epoch": 1.0073163593795726, "grad_norm": 0.00019915489247068763, "learning_rate": 2.4817091015510683e-05, "loss": 0.0, "step": 3442 }, { "epoch": 1.0076090137547555, "grad_norm": 0.0008135453681461513, "learning_rate": 2.480977465613111e-05, "loss": 0.0, "step": 3443 }, { "epoch": 1.0079016681299386, "grad_norm": 0.0006470750086009502, "learning_rate": 2.480245829675154e-05, "loss": 0.0, "step": 3444 }, { "epoch": 1.0081943225051215, "grad_norm": 0.00011369881394784898, "learning_rate": 2.4795141937371967e-05, "loss": 0.0, "step": 3445 }, { "epoch": 1.0084869768803044, "grad_norm": 0.0005042713601142168, "learning_rate": 2.4787825577992395e-05, "loss": 0.0, "step": 3446 }, { "epoch": 1.0087796312554873, "grad_norm": 0.00038000199128873646, "learning_rate": 2.478050921861282e-05, "loss": 0.0, "step": 3447 }, { "epoch": 1.0090722856306702, "grad_norm": 0.0016200197860598564, "learning_rate": 2.4773192859233247e-05, "loss": 0.0, "step": 3448 }, { "epoch": 1.009364940005853, "grad_norm": 3.1312618255615234, "learning_rate": 2.4765876499853675e-05, "loss": 0.3212, "step": 3449 }, { "epoch": 1.009657594381036, "grad_norm": 0.0004399172612465918, "learning_rate": 2.4758560140474103e-05, "loss": 0.0, "step": 3450 }, { "epoch": 1.0099502487562189, "grad_norm": 0.001580632757395506, "learning_rate": 2.475124378109453e-05, "loss": 0.0, "step": 3451 }, { "epoch": 1.0102429031314017, "grad_norm": 0.0017576804384589195, "learning_rate": 2.4743927421714956e-05, "loss": 0.0, "step": 3452 }, { "epoch": 1.0105355575065846, "grad_norm": 0.0011914470233023167, "learning_rate": 2.4736611062335384e-05, "loss": 0.0, "step": 3453 }, { "epoch": 1.0108282118817675, "grad_norm": 0.0014104278525337577, "learning_rate": 2.472929470295581e-05, "loss": 0.0, "step": 3454 }, { "epoch": 1.0111208662569506, "grad_norm": 0.0016562313539907336, "learning_rate": 2.472197834357624e-05, "loss": 0.0, "step": 3455 }, { "epoch": 1.0114135206321335, "grad_norm": 0.0008212103857658803, "learning_rate": 2.4714661984196668e-05, "loss": 0.0, "step": 3456 }, { "epoch": 1.0117061750073164, "grad_norm": 0.002704160986468196, "learning_rate": 2.4707345624817092e-05, "loss": 0.0001, "step": 3457 }, { "epoch": 1.0119988293824993, "grad_norm": 0.00468985503539443, "learning_rate": 2.470002926543752e-05, "loss": 0.0001, "step": 3458 }, { "epoch": 1.0122914837576822, "grad_norm": 0.0041338070295751095, "learning_rate": 2.4692712906057948e-05, "loss": 0.0001, "step": 3459 }, { "epoch": 1.012584138132865, "grad_norm": 0.0030151098035275936, "learning_rate": 2.4685396546678376e-05, "loss": 0.0001, "step": 3460 }, { "epoch": 1.012876792508048, "grad_norm": 0.007817713543772697, "learning_rate": 2.46780801872988e-05, "loss": 0.0001, "step": 3461 }, { "epoch": 1.0131694468832309, "grad_norm": 0.0026054515037685633, "learning_rate": 2.467076382791923e-05, "loss": 0.0001, "step": 3462 }, { "epoch": 1.0134621012584137, "grad_norm": 0.0018012281507253647, "learning_rate": 2.4663447468539656e-05, "loss": 0.0, "step": 3463 }, { "epoch": 1.0137547556335966, "grad_norm": 0.009095782414078712, "learning_rate": 2.4656131109160084e-05, "loss": 0.0002, "step": 3464 }, { "epoch": 1.0140474100087797, "grad_norm": 0.010534383356571198, "learning_rate": 2.464881474978051e-05, "loss": 0.0002, "step": 3465 }, { "epoch": 1.0143400643839626, "grad_norm": 0.002322286134585738, "learning_rate": 2.4641498390400937e-05, "loss": 0.0001, "step": 3466 }, { "epoch": 1.0146327187591455, "grad_norm": 0.007303439546376467, "learning_rate": 2.4634182031021365e-05, "loss": 0.0001, "step": 3467 }, { "epoch": 1.0149253731343284, "grad_norm": 0.010205356404185295, "learning_rate": 2.4626865671641793e-05, "loss": 0.0002, "step": 3468 }, { "epoch": 1.0152180275095113, "grad_norm": 0.00949253048747778, "learning_rate": 2.4619549312262217e-05, "loss": 0.0002, "step": 3469 }, { "epoch": 1.0155106818846942, "grad_norm": 0.0067353821359574795, "learning_rate": 2.4612232952882645e-05, "loss": 0.0002, "step": 3470 }, { "epoch": 1.015803336259877, "grad_norm": 0.012329000979661942, "learning_rate": 2.4604916593503073e-05, "loss": 0.0002, "step": 3471 }, { "epoch": 1.01609599063506, "grad_norm": 0.005563882179558277, "learning_rate": 2.45976002341235e-05, "loss": 0.0001, "step": 3472 }, { "epoch": 1.0163886450102428, "grad_norm": 0.016726041212677956, "learning_rate": 2.4590283874743926e-05, "loss": 0.0003, "step": 3473 }, { "epoch": 1.0166812993854257, "grad_norm": 8.889880180358887, "learning_rate": 2.4582967515364354e-05, "loss": 0.0333, "step": 3474 }, { "epoch": 1.0169739537606086, "grad_norm": 0.014499545097351074, "learning_rate": 2.457565115598478e-05, "loss": 0.0003, "step": 3475 }, { "epoch": 1.0172666081357917, "grad_norm": 0.005111265927553177, "learning_rate": 2.456833479660521e-05, "loss": 0.0001, "step": 3476 }, { "epoch": 1.0175592625109746, "grad_norm": 0.00333440606482327, "learning_rate": 2.4561018437225638e-05, "loss": 0.0001, "step": 3477 }, { "epoch": 1.0178519168861575, "grad_norm": 0.00515385065227747, "learning_rate": 2.4553702077846066e-05, "loss": 0.0001, "step": 3478 }, { "epoch": 1.0181445712613404, "grad_norm": 0.009151050820946693, "learning_rate": 2.454638571846649e-05, "loss": 0.0002, "step": 3479 }, { "epoch": 1.0184372256365233, "grad_norm": 0.010349135845899582, "learning_rate": 2.4539069359086918e-05, "loss": 0.0002, "step": 3480 }, { "epoch": 1.0187298800117062, "grad_norm": 0.0076150717213749886, "learning_rate": 2.4531752999707346e-05, "loss": 0.0001, "step": 3481 }, { "epoch": 1.019022534386889, "grad_norm": 0.007438007742166519, "learning_rate": 2.4524436640327774e-05, "loss": 0.0002, "step": 3482 }, { "epoch": 1.019315188762072, "grad_norm": 0.008855358697474003, "learning_rate": 2.4517120280948202e-05, "loss": 0.0002, "step": 3483 }, { "epoch": 1.0196078431372548, "grad_norm": 0.004839539993554354, "learning_rate": 2.4509803921568626e-05, "loss": 0.0001, "step": 3484 }, { "epoch": 1.0199004975124377, "grad_norm": 0.02208811230957508, "learning_rate": 2.4502487562189054e-05, "loss": 0.0001, "step": 3485 }, { "epoch": 1.0201931518876206, "grad_norm": 0.004208148457109928, "learning_rate": 2.4495171202809482e-05, "loss": 0.0001, "step": 3486 }, { "epoch": 1.0204858062628037, "grad_norm": 0.010331617668271065, "learning_rate": 2.448785484342991e-05, "loss": 0.0002, "step": 3487 }, { "epoch": 1.0207784606379866, "grad_norm": 0.005608166567981243, "learning_rate": 2.4480538484050338e-05, "loss": 0.0001, "step": 3488 }, { "epoch": 1.0210711150131695, "grad_norm": 9.352372169494629, "learning_rate": 2.4473222124670763e-05, "loss": 0.0126, "step": 3489 }, { "epoch": 1.0213637693883524, "grad_norm": 0.0057179625146090984, "learning_rate": 2.446590576529119e-05, "loss": 0.0001, "step": 3490 }, { "epoch": 1.0216564237635353, "grad_norm": 0.0024904771707952023, "learning_rate": 2.445858940591162e-05, "loss": 0.0001, "step": 3491 }, { "epoch": 1.0219490781387182, "grad_norm": 0.00714457593858242, "learning_rate": 2.4451273046532047e-05, "loss": 0.0002, "step": 3492 }, { "epoch": 1.022241732513901, "grad_norm": 0.002021880354732275, "learning_rate": 2.4443956687152475e-05, "loss": 0.0001, "step": 3493 }, { "epoch": 1.022534386889084, "grad_norm": 0.005838671233505011, "learning_rate": 2.44366403277729e-05, "loss": 0.0001, "step": 3494 }, { "epoch": 1.0228270412642668, "grad_norm": 0.003814258612692356, "learning_rate": 2.4429323968393327e-05, "loss": 0.0001, "step": 3495 }, { "epoch": 1.0231196956394497, "grad_norm": 0.0028571065049618483, "learning_rate": 2.4422007609013755e-05, "loss": 0.0001, "step": 3496 }, { "epoch": 1.0234123500146328, "grad_norm": 0.004640253726392984, "learning_rate": 2.4414691249634183e-05, "loss": 0.0001, "step": 3497 }, { "epoch": 1.0237050043898157, "grad_norm": 0.004929804243147373, "learning_rate": 2.440737489025461e-05, "loss": 0.0001, "step": 3498 }, { "epoch": 1.0239976587649986, "grad_norm": 0.00255428534001112, "learning_rate": 2.440005853087504e-05, "loss": 0.0001, "step": 3499 }, { "epoch": 1.0242903131401815, "grad_norm": 0.00356631469912827, "learning_rate": 2.4392742171495463e-05, "loss": 0.0001, "step": 3500 }, { "epoch": 1.0245829675153644, "grad_norm": 0.004246738739311695, "learning_rate": 2.438542581211589e-05, "loss": 0.0001, "step": 3501 }, { "epoch": 1.0248756218905473, "grad_norm": 0.0017620133003219962, "learning_rate": 2.437810945273632e-05, "loss": 0.0, "step": 3502 }, { "epoch": 1.0251682762657301, "grad_norm": 0.003564164973795414, "learning_rate": 2.4370793093356747e-05, "loss": 0.0001, "step": 3503 }, { "epoch": 1.025460930640913, "grad_norm": 0.00230700196698308, "learning_rate": 2.4363476733977175e-05, "loss": 0.0001, "step": 3504 }, { "epoch": 1.025753585016096, "grad_norm": 0.0015987782971933484, "learning_rate": 2.43561603745976e-05, "loss": 0.0, "step": 3505 }, { "epoch": 1.0260462393912788, "grad_norm": 0.0009958547307178378, "learning_rate": 2.4348844015218028e-05, "loss": 0.0, "step": 3506 }, { "epoch": 1.0263388937664617, "grad_norm": 0.0028785394970327616, "learning_rate": 2.4341527655838456e-05, "loss": 0.0001, "step": 3507 }, { "epoch": 1.0266315481416448, "grad_norm": 0.002653158036991954, "learning_rate": 2.4334211296458884e-05, "loss": 0.0001, "step": 3508 }, { "epoch": 1.0269242025168277, "grad_norm": 0.0009308578446507454, "learning_rate": 2.432689493707931e-05, "loss": 0.0, "step": 3509 }, { "epoch": 1.0272168568920106, "grad_norm": 0.004706493113189936, "learning_rate": 2.4319578577699736e-05, "loss": 0.0001, "step": 3510 }, { "epoch": 1.0275095112671935, "grad_norm": 0.002086702734231949, "learning_rate": 2.4312262218320164e-05, "loss": 0.0, "step": 3511 }, { "epoch": 1.0278021656423764, "grad_norm": 0.003446651855483651, "learning_rate": 2.4304945858940592e-05, "loss": 0.0001, "step": 3512 }, { "epoch": 1.0280948200175593, "grad_norm": 0.0022666268050670624, "learning_rate": 2.429762949956102e-05, "loss": 0.0001, "step": 3513 }, { "epoch": 1.0283874743927421, "grad_norm": 0.0011834276374429464, "learning_rate": 2.4290313140181448e-05, "loss": 0.0, "step": 3514 }, { "epoch": 1.028680128767925, "grad_norm": 0.0007315911934711039, "learning_rate": 2.4282996780801876e-05, "loss": 0.0, "step": 3515 }, { "epoch": 1.028972783143108, "grad_norm": 0.0026782916393131018, "learning_rate": 2.42756804214223e-05, "loss": 0.0001, "step": 3516 }, { "epoch": 1.0292654375182908, "grad_norm": 0.0027841634582728148, "learning_rate": 2.426836406204273e-05, "loss": 0.0001, "step": 3517 }, { "epoch": 1.029558091893474, "grad_norm": 0.001004671910777688, "learning_rate": 2.4261047702663156e-05, "loss": 0.0, "step": 3518 }, { "epoch": 1.0298507462686568, "grad_norm": 0.0021532170940190554, "learning_rate": 2.4253731343283584e-05, "loss": 0.0001, "step": 3519 }, { "epoch": 1.0301434006438397, "grad_norm": 0.0016680386615917087, "learning_rate": 2.4246414983904012e-05, "loss": 0.0, "step": 3520 }, { "epoch": 1.0304360550190226, "grad_norm": 0.004606564529240131, "learning_rate": 2.4239098624524437e-05, "loss": 0.0001, "step": 3521 }, { "epoch": 1.0307287093942055, "grad_norm": 0.0014813424786552787, "learning_rate": 2.4231782265144865e-05, "loss": 0.0, "step": 3522 }, { "epoch": 1.0310213637693884, "grad_norm": 0.0005702835042029619, "learning_rate": 2.4224465905765293e-05, "loss": 0.0, "step": 3523 }, { "epoch": 1.0313140181445712, "grad_norm": 0.002096495358273387, "learning_rate": 2.421714954638572e-05, "loss": 0.0, "step": 3524 }, { "epoch": 1.0316066725197541, "grad_norm": 0.0034722215496003628, "learning_rate": 2.420983318700615e-05, "loss": 0.0001, "step": 3525 }, { "epoch": 1.031899326894937, "grad_norm": 0.0027094169054180384, "learning_rate": 2.4202516827626573e-05, "loss": 0.0001, "step": 3526 }, { "epoch": 1.03219198127012, "grad_norm": 0.002960802521556616, "learning_rate": 2.4195200468247e-05, "loss": 0.0001, "step": 3527 }, { "epoch": 1.0324846356453028, "grad_norm": 0.0020208009518682957, "learning_rate": 2.418788410886743e-05, "loss": 0.0001, "step": 3528 }, { "epoch": 1.032777290020486, "grad_norm": 0.0008873249753378332, "learning_rate": 2.4180567749487857e-05, "loss": 0.0, "step": 3529 }, { "epoch": 1.0330699443956688, "grad_norm": 0.0014568967744708061, "learning_rate": 2.4173251390108285e-05, "loss": 0.0, "step": 3530 }, { "epoch": 1.0333625987708517, "grad_norm": 0.004252336453646421, "learning_rate": 2.4165935030728713e-05, "loss": 0.0001, "step": 3531 }, { "epoch": 1.0336552531460346, "grad_norm": 0.0038973314221948385, "learning_rate": 2.4158618671349138e-05, "loss": 0.0001, "step": 3532 }, { "epoch": 1.0339479075212175, "grad_norm": 0.0010255716042593122, "learning_rate": 2.4151302311969566e-05, "loss": 0.0, "step": 3533 }, { "epoch": 1.0342405618964003, "grad_norm": 0.0017912236507982016, "learning_rate": 2.4143985952589993e-05, "loss": 0.0, "step": 3534 }, { "epoch": 1.0345332162715832, "grad_norm": 0.0040057022124528885, "learning_rate": 2.413666959321042e-05, "loss": 0.0001, "step": 3535 }, { "epoch": 1.0348258706467661, "grad_norm": 0.001981346635147929, "learning_rate": 2.412935323383085e-05, "loss": 0.0, "step": 3536 }, { "epoch": 1.035118525021949, "grad_norm": 0.0009333692723885179, "learning_rate": 2.4122036874451274e-05, "loss": 0.0, "step": 3537 }, { "epoch": 1.035411179397132, "grad_norm": 0.0011883610859513283, "learning_rate": 2.4114720515071702e-05, "loss": 0.0, "step": 3538 }, { "epoch": 1.035703833772315, "grad_norm": 0.002273314166814089, "learning_rate": 2.410740415569213e-05, "loss": 0.0001, "step": 3539 }, { "epoch": 1.035996488147498, "grad_norm": 0.006086126435548067, "learning_rate": 2.4100087796312558e-05, "loss": 0.0001, "step": 3540 }, { "epoch": 1.0362891425226808, "grad_norm": 4.4004974365234375, "learning_rate": 2.4092771436932986e-05, "loss": 0.149, "step": 3541 }, { "epoch": 1.0365817968978637, "grad_norm": 0.008607176132500172, "learning_rate": 2.408545507755341e-05, "loss": 0.0002, "step": 3542 }, { "epoch": 1.0368744512730466, "grad_norm": 0.0027010750491172075, "learning_rate": 2.4078138718173838e-05, "loss": 0.0001, "step": 3543 }, { "epoch": 1.0371671056482294, "grad_norm": 0.0015393191715702415, "learning_rate": 2.4070822358794266e-05, "loss": 0.0, "step": 3544 }, { "epoch": 1.0374597600234123, "grad_norm": 0.0048315017484128475, "learning_rate": 2.4063505999414694e-05, "loss": 0.0001, "step": 3545 }, { "epoch": 1.0377524143985952, "grad_norm": 0.0029096445068717003, "learning_rate": 2.4056189640035122e-05, "loss": 0.0001, "step": 3546 }, { "epoch": 1.0380450687737781, "grad_norm": 0.1986430436372757, "learning_rate": 2.4048873280655547e-05, "loss": 0.0013, "step": 3547 }, { "epoch": 1.038337723148961, "grad_norm": 0.0035794368013739586, "learning_rate": 2.4041556921275975e-05, "loss": 0.0001, "step": 3548 }, { "epoch": 1.0386303775241439, "grad_norm": 0.01305379904806614, "learning_rate": 2.4034240561896403e-05, "loss": 0.0002, "step": 3549 }, { "epoch": 1.038923031899327, "grad_norm": 0.00715178856626153, "learning_rate": 2.402692420251683e-05, "loss": 0.0001, "step": 3550 }, { "epoch": 1.0392156862745099, "grad_norm": 0.0010849026730284095, "learning_rate": 2.401960784313726e-05, "loss": 0.0, "step": 3551 }, { "epoch": 1.0395083406496928, "grad_norm": 0.013471659272909164, "learning_rate": 2.4012291483757683e-05, "loss": 0.0003, "step": 3552 }, { "epoch": 1.0398009950248757, "grad_norm": 0.031217027455568314, "learning_rate": 2.400497512437811e-05, "loss": 0.0003, "step": 3553 }, { "epoch": 1.0400936494000586, "grad_norm": 0.005862652789801359, "learning_rate": 2.399765876499854e-05, "loss": 0.0001, "step": 3554 }, { "epoch": 1.0403863037752414, "grad_norm": 0.028307737782597542, "learning_rate": 2.3990342405618967e-05, "loss": 0.0004, "step": 3555 }, { "epoch": 1.0406789581504243, "grad_norm": 0.10708467662334442, "learning_rate": 2.398302604623939e-05, "loss": 0.001, "step": 3556 }, { "epoch": 1.0409716125256072, "grad_norm": 0.11359408497810364, "learning_rate": 2.397570968685982e-05, "loss": 0.001, "step": 3557 }, { "epoch": 1.04126426690079, "grad_norm": 0.021371543407440186, "learning_rate": 2.3968393327480247e-05, "loss": 0.0003, "step": 3558 }, { "epoch": 1.041556921275973, "grad_norm": 0.0009941854514181614, "learning_rate": 2.3961076968100675e-05, "loss": 0.0, "step": 3559 }, { "epoch": 1.0418495756511559, "grad_norm": 0.003969173412770033, "learning_rate": 2.39537606087211e-05, "loss": 0.0001, "step": 3560 }, { "epoch": 1.042142230026339, "grad_norm": 0.011651767417788506, "learning_rate": 2.3946444249341528e-05, "loss": 0.0002, "step": 3561 }, { "epoch": 1.0424348844015219, "grad_norm": 0.0031273127533495426, "learning_rate": 2.3939127889961956e-05, "loss": 0.0001, "step": 3562 }, { "epoch": 1.0427275387767048, "grad_norm": 0.0070342631079256535, "learning_rate": 2.3931811530582384e-05, "loss": 0.0001, "step": 3563 }, { "epoch": 1.0430201931518877, "grad_norm": 0.018413899466395378, "learning_rate": 2.3924495171202808e-05, "loss": 0.0002, "step": 3564 }, { "epoch": 1.0433128475270705, "grad_norm": 0.005958537105470896, "learning_rate": 2.3917178811823236e-05, "loss": 0.0001, "step": 3565 }, { "epoch": 1.0436055019022534, "grad_norm": 0.0010462752543389797, "learning_rate": 2.3909862452443664e-05, "loss": 0.0, "step": 3566 }, { "epoch": 1.0438981562774363, "grad_norm": 0.0037645921111106873, "learning_rate": 2.3902546093064092e-05, "loss": 0.0001, "step": 3567 }, { "epoch": 1.0441908106526192, "grad_norm": 0.0006369265611283481, "learning_rate": 2.389522973368452e-05, "loss": 0.0, "step": 3568 }, { "epoch": 1.044483465027802, "grad_norm": 0.004025363363325596, "learning_rate": 2.3887913374304945e-05, "loss": 0.0001, "step": 3569 }, { "epoch": 1.044776119402985, "grad_norm": 10.666226387023926, "learning_rate": 2.3880597014925373e-05, "loss": 0.081, "step": 3570 }, { "epoch": 1.0450687737781679, "grad_norm": 0.0009509926312603056, "learning_rate": 2.38732806555458e-05, "loss": 0.0, "step": 3571 }, { "epoch": 1.045361428153351, "grad_norm": 0.002926565706729889, "learning_rate": 2.386596429616623e-05, "loss": 0.0001, "step": 3572 }, { "epoch": 1.0456540825285339, "grad_norm": 0.0013789376243948936, "learning_rate": 2.3858647936786656e-05, "loss": 0.0, "step": 3573 }, { "epoch": 1.0459467369037168, "grad_norm": 0.0010121538070961833, "learning_rate": 2.385133157740708e-05, "loss": 0.0, "step": 3574 }, { "epoch": 1.0462393912788996, "grad_norm": 0.01967390812933445, "learning_rate": 2.384401521802751e-05, "loss": 0.0002, "step": 3575 }, { "epoch": 1.0465320456540825, "grad_norm": 0.001700015040114522, "learning_rate": 2.3836698858647937e-05, "loss": 0.0, "step": 3576 }, { "epoch": 1.0468247000292654, "grad_norm": 0.013245640322566032, "learning_rate": 2.3829382499268365e-05, "loss": 0.0001, "step": 3577 }, { "epoch": 1.0471173544044483, "grad_norm": 0.010698795318603516, "learning_rate": 2.3822066139888793e-05, "loss": 0.0002, "step": 3578 }, { "epoch": 1.0474100087796312, "grad_norm": 0.004378386773169041, "learning_rate": 2.3814749780509217e-05, "loss": 0.0001, "step": 3579 }, { "epoch": 1.047702663154814, "grad_norm": 0.0035175455268472433, "learning_rate": 2.3807433421129645e-05, "loss": 0.0001, "step": 3580 }, { "epoch": 1.047995317529997, "grad_norm": 0.0016487077809870243, "learning_rate": 2.3800117061750073e-05, "loss": 0.0, "step": 3581 }, { "epoch": 1.04828797190518, "grad_norm": 0.001347344950772822, "learning_rate": 2.37928007023705e-05, "loss": 0.0, "step": 3582 }, { "epoch": 1.048580626280363, "grad_norm": 0.007687774952501059, "learning_rate": 2.378548434299093e-05, "loss": 0.0001, "step": 3583 }, { "epoch": 1.0488732806555459, "grad_norm": 0.0017204730538651347, "learning_rate": 2.3778167983611357e-05, "loss": 0.0, "step": 3584 }, { "epoch": 1.0491659350307287, "grad_norm": 0.0008622069726698101, "learning_rate": 2.377085162423178e-05, "loss": 0.0, "step": 3585 }, { "epoch": 1.0494585894059116, "grad_norm": 0.001416791696101427, "learning_rate": 2.376353526485221e-05, "loss": 0.0, "step": 3586 }, { "epoch": 1.0497512437810945, "grad_norm": 0.0008721469203010201, "learning_rate": 2.3756218905472638e-05, "loss": 0.0, "step": 3587 }, { "epoch": 1.0500438981562774, "grad_norm": 0.2702424228191376, "learning_rate": 2.3748902546093066e-05, "loss": 0.0009, "step": 3588 }, { "epoch": 1.0503365525314603, "grad_norm": 0.014712399803102016, "learning_rate": 2.3741586186713493e-05, "loss": 0.0001, "step": 3589 }, { "epoch": 1.0506292069066432, "grad_norm": 0.000925264845136553, "learning_rate": 2.3734269827333918e-05, "loss": 0.0, "step": 3590 }, { "epoch": 1.050921861281826, "grad_norm": 0.0012234977912157774, "learning_rate": 2.3726953467954346e-05, "loss": 0.0, "step": 3591 }, { "epoch": 1.051214515657009, "grad_norm": 0.0011096167145296931, "learning_rate": 2.3719637108574774e-05, "loss": 0.0, "step": 3592 }, { "epoch": 1.051507170032192, "grad_norm": 0.002764485077932477, "learning_rate": 2.3712320749195202e-05, "loss": 0.0, "step": 3593 }, { "epoch": 1.051799824407375, "grad_norm": 0.0006492521497420967, "learning_rate": 2.370500438981563e-05, "loss": 0.0, "step": 3594 }, { "epoch": 1.0520924787825578, "grad_norm": 0.052430085837841034, "learning_rate": 2.3697688030436054e-05, "loss": 0.0002, "step": 3595 }, { "epoch": 1.0523851331577407, "grad_norm": 0.003093923907727003, "learning_rate": 2.3690371671056482e-05, "loss": 0.0, "step": 3596 }, { "epoch": 1.0526777875329236, "grad_norm": 0.0010897924657911062, "learning_rate": 2.368305531167691e-05, "loss": 0.0, "step": 3597 }, { "epoch": 1.0529704419081065, "grad_norm": 0.0005833354080095887, "learning_rate": 2.3675738952297338e-05, "loss": 0.0, "step": 3598 }, { "epoch": 1.0532630962832894, "grad_norm": 0.00032545908470638096, "learning_rate": 2.3668422592917766e-05, "loss": 0.0, "step": 3599 }, { "epoch": 1.0535557506584723, "grad_norm": 0.0004899411578662694, "learning_rate": 2.3661106233538194e-05, "loss": 0.0, "step": 3600 }, { "epoch": 1.0538484050336552, "grad_norm": 0.0058258599601686, "learning_rate": 2.365378987415862e-05, "loss": 0.0, "step": 3601 }, { "epoch": 1.054141059408838, "grad_norm": 0.0022187926806509495, "learning_rate": 2.3646473514779047e-05, "loss": 0.0, "step": 3602 }, { "epoch": 1.0544337137840212, "grad_norm": 0.0009017025586217642, "learning_rate": 2.3639157155399475e-05, "loss": 0.0, "step": 3603 }, { "epoch": 1.054726368159204, "grad_norm": 0.031748924404382706, "learning_rate": 2.3631840796019903e-05, "loss": 0.0002, "step": 3604 }, { "epoch": 1.055019022534387, "grad_norm": 0.002086823107674718, "learning_rate": 2.362452443664033e-05, "loss": 0.0, "step": 3605 }, { "epoch": 1.0553116769095698, "grad_norm": 0.001307449652813375, "learning_rate": 2.3617208077260755e-05, "loss": 0.0, "step": 3606 }, { "epoch": 1.0556043312847527, "grad_norm": 0.0016552209854125977, "learning_rate": 2.3609891717881183e-05, "loss": 0.0, "step": 3607 }, { "epoch": 1.0558969856599356, "grad_norm": 0.0003705853596329689, "learning_rate": 2.360257535850161e-05, "loss": 0.0, "step": 3608 }, { "epoch": 1.0561896400351185, "grad_norm": 0.0012484462931752205, "learning_rate": 2.359525899912204e-05, "loss": 0.0, "step": 3609 }, { "epoch": 1.0564822944103014, "grad_norm": 0.000867952941916883, "learning_rate": 2.3587942639742467e-05, "loss": 0.0, "step": 3610 }, { "epoch": 1.0567749487854843, "grad_norm": 0.0009844473097473383, "learning_rate": 2.358062628036289e-05, "loss": 0.0, "step": 3611 }, { "epoch": 1.0570676031606672, "grad_norm": 0.00044872480793856084, "learning_rate": 2.357330992098332e-05, "loss": 0.0, "step": 3612 }, { "epoch": 1.05736025753585, "grad_norm": 0.012851194478571415, "learning_rate": 2.3565993561603747e-05, "loss": 0.0001, "step": 3613 }, { "epoch": 1.0576529119110332, "grad_norm": 0.0010048080002889037, "learning_rate": 2.3558677202224175e-05, "loss": 0.0, "step": 3614 }, { "epoch": 1.057945566286216, "grad_norm": 0.0013905661180615425, "learning_rate": 2.3551360842844603e-05, "loss": 0.0, "step": 3615 }, { "epoch": 1.058238220661399, "grad_norm": 0.009411433711647987, "learning_rate": 2.3544044483465028e-05, "loss": 0.0001, "step": 3616 }, { "epoch": 1.0585308750365818, "grad_norm": 0.0010081817163154483, "learning_rate": 2.3536728124085456e-05, "loss": 0.0, "step": 3617 }, { "epoch": 1.0588235294117647, "grad_norm": 0.004782139789313078, "learning_rate": 2.3529411764705884e-05, "loss": 0.0001, "step": 3618 }, { "epoch": 1.0591161837869476, "grad_norm": 0.0008458578959107399, "learning_rate": 2.352209540532631e-05, "loss": 0.0, "step": 3619 }, { "epoch": 1.0594088381621305, "grad_norm": 0.0006553055136464536, "learning_rate": 2.351477904594674e-05, "loss": 0.0, "step": 3620 }, { "epoch": 1.0597014925373134, "grad_norm": 0.0006589387194253504, "learning_rate": 2.3507462686567168e-05, "loss": 0.0, "step": 3621 }, { "epoch": 1.0599941469124963, "grad_norm": 0.0006570523255504668, "learning_rate": 2.3500146327187592e-05, "loss": 0.0, "step": 3622 }, { "epoch": 1.0602868012876792, "grad_norm": 0.0013346909545361996, "learning_rate": 2.349282996780802e-05, "loss": 0.0, "step": 3623 }, { "epoch": 1.0605794556628623, "grad_norm": 0.0007223114371299744, "learning_rate": 2.3485513608428448e-05, "loss": 0.0, "step": 3624 }, { "epoch": 1.0608721100380452, "grad_norm": 0.00046515671419911087, "learning_rate": 2.3478197249048876e-05, "loss": 0.0, "step": 3625 }, { "epoch": 1.061164764413228, "grad_norm": 0.0006506069330498576, "learning_rate": 2.3470880889669304e-05, "loss": 0.0, "step": 3626 }, { "epoch": 1.061457418788411, "grad_norm": 0.0005674001877196133, "learning_rate": 2.346356453028973e-05, "loss": 0.0, "step": 3627 }, { "epoch": 1.0617500731635938, "grad_norm": 4.653077125549316, "learning_rate": 2.3456248170910156e-05, "loss": 0.0164, "step": 3628 }, { "epoch": 1.0620427275387767, "grad_norm": 0.000740343879442662, "learning_rate": 2.3448931811530584e-05, "loss": 0.0, "step": 3629 }, { "epoch": 1.0623353819139596, "grad_norm": 0.0011946418089792132, "learning_rate": 2.3441615452151012e-05, "loss": 0.0, "step": 3630 }, { "epoch": 1.0626280362891425, "grad_norm": 0.00047488027485087514, "learning_rate": 2.343429909277144e-05, "loss": 0.0, "step": 3631 }, { "epoch": 1.0629206906643254, "grad_norm": 0.0004698278789874166, "learning_rate": 2.3426982733391865e-05, "loss": 0.0, "step": 3632 }, { "epoch": 1.0632133450395083, "grad_norm": 0.003856950905174017, "learning_rate": 2.3419666374012293e-05, "loss": 0.0, "step": 3633 }, { "epoch": 1.0635059994146911, "grad_norm": 0.0016928977565839887, "learning_rate": 2.341235001463272e-05, "loss": 0.0, "step": 3634 }, { "epoch": 1.0637986537898743, "grad_norm": 0.006070285104215145, "learning_rate": 2.340503365525315e-05, "loss": 0.0001, "step": 3635 }, { "epoch": 1.0640913081650571, "grad_norm": 0.0767374038696289, "learning_rate": 2.3397717295873577e-05, "loss": 0.0002, "step": 3636 }, { "epoch": 1.06438396254024, "grad_norm": 0.01066882349550724, "learning_rate": 2.3390400936494005e-05, "loss": 0.0001, "step": 3637 }, { "epoch": 1.064676616915423, "grad_norm": 0.2522182762622833, "learning_rate": 2.338308457711443e-05, "loss": 0.0013, "step": 3638 }, { "epoch": 1.0649692712906058, "grad_norm": 0.4211108982563019, "learning_rate": 2.3375768217734857e-05, "loss": 0.0011, "step": 3639 }, { "epoch": 1.0652619256657887, "grad_norm": 0.06459060311317444, "learning_rate": 2.3368451858355285e-05, "loss": 0.0004, "step": 3640 }, { "epoch": 1.0655545800409716, "grad_norm": 4.673795223236084, "learning_rate": 2.3361135498975713e-05, "loss": 0.2605, "step": 3641 }, { "epoch": 1.0658472344161545, "grad_norm": 0.0006907483912073076, "learning_rate": 2.3353819139596138e-05, "loss": 0.0, "step": 3642 }, { "epoch": 1.0661398887913374, "grad_norm": 0.0042630466632544994, "learning_rate": 2.3346502780216565e-05, "loss": 0.0, "step": 3643 }, { "epoch": 1.0664325431665203, "grad_norm": 0.0007183166453614831, "learning_rate": 2.3339186420836993e-05, "loss": 0.0, "step": 3644 }, { "epoch": 1.0667251975417034, "grad_norm": 0.0004632271302398294, "learning_rate": 2.333187006145742e-05, "loss": 0.0, "step": 3645 }, { "epoch": 1.0670178519168863, "grad_norm": 0.0017585419118404388, "learning_rate": 2.3324553702077846e-05, "loss": 0.0, "step": 3646 }, { "epoch": 1.0673105062920691, "grad_norm": 0.0007347252685576677, "learning_rate": 2.3317237342698274e-05, "loss": 0.0, "step": 3647 }, { "epoch": 1.067603160667252, "grad_norm": 0.00169753294903785, "learning_rate": 2.3309920983318702e-05, "loss": 0.0, "step": 3648 }, { "epoch": 1.067895815042435, "grad_norm": 0.0028705827426165342, "learning_rate": 2.330260462393913e-05, "loss": 0.0001, "step": 3649 }, { "epoch": 1.0681884694176178, "grad_norm": 0.030678434297442436, "learning_rate": 2.3295288264559554e-05, "loss": 0.0002, "step": 3650 }, { "epoch": 1.0684811237928007, "grad_norm": 0.0030463761650025845, "learning_rate": 2.3287971905179982e-05, "loss": 0.0001, "step": 3651 }, { "epoch": 1.0687737781679836, "grad_norm": 0.0037456315476447344, "learning_rate": 2.328065554580041e-05, "loss": 0.0001, "step": 3652 }, { "epoch": 1.0690664325431665, "grad_norm": 0.0034187056589871645, "learning_rate": 2.3273339186420838e-05, "loss": 0.0001, "step": 3653 }, { "epoch": 1.0693590869183494, "grad_norm": 0.014863521791994572, "learning_rate": 2.3266022827041263e-05, "loss": 0.0002, "step": 3654 }, { "epoch": 1.0696517412935322, "grad_norm": 0.0028082518838346004, "learning_rate": 2.325870646766169e-05, "loss": 0.0001, "step": 3655 }, { "epoch": 1.0699443956687151, "grad_norm": 0.009496224112808704, "learning_rate": 2.325139010828212e-05, "loss": 0.0002, "step": 3656 }, { "epoch": 1.0702370500438982, "grad_norm": 0.0020644606556743383, "learning_rate": 2.3244073748902547e-05, "loss": 0.0, "step": 3657 }, { "epoch": 1.0705297044190811, "grad_norm": 0.009632423520088196, "learning_rate": 2.3236757389522975e-05, "loss": 0.0002, "step": 3658 }, { "epoch": 1.070822358794264, "grad_norm": 0.002899567363783717, "learning_rate": 2.32294410301434e-05, "loss": 0.0001, "step": 3659 }, { "epoch": 1.071115013169447, "grad_norm": 0.006019872613251209, "learning_rate": 2.3222124670763827e-05, "loss": 0.0001, "step": 3660 }, { "epoch": 1.0714076675446298, "grad_norm": 0.02681499533355236, "learning_rate": 2.3214808311384255e-05, "loss": 0.0003, "step": 3661 }, { "epoch": 1.0717003219198127, "grad_norm": 0.008631656877696514, "learning_rate": 2.3207491952004683e-05, "loss": 0.0002, "step": 3662 }, { "epoch": 1.0719929762949956, "grad_norm": 0.00794452615082264, "learning_rate": 2.320017559262511e-05, "loss": 0.0002, "step": 3663 }, { "epoch": 1.0722856306701785, "grad_norm": 0.24969637393951416, "learning_rate": 2.3192859233245536e-05, "loss": 0.0008, "step": 3664 }, { "epoch": 1.0725782850453613, "grad_norm": 0.003523320658132434, "learning_rate": 2.3185542873865963e-05, "loss": 0.0001, "step": 3665 }, { "epoch": 1.0728709394205442, "grad_norm": 0.2369241863489151, "learning_rate": 2.317822651448639e-05, "loss": 0.0011, "step": 3666 }, { "epoch": 1.0731635937957273, "grad_norm": 0.007222886197268963, "learning_rate": 2.317091015510682e-05, "loss": 0.0001, "step": 3667 }, { "epoch": 1.0734562481709102, "grad_norm": 0.007124431896954775, "learning_rate": 2.3163593795727247e-05, "loss": 0.0001, "step": 3668 }, { "epoch": 1.0737489025460931, "grad_norm": 0.005987787153571844, "learning_rate": 2.3156277436347675e-05, "loss": 0.0001, "step": 3669 }, { "epoch": 1.074041556921276, "grad_norm": 0.00803651474416256, "learning_rate": 2.31489610769681e-05, "loss": 0.0001, "step": 3670 }, { "epoch": 1.074334211296459, "grad_norm": 0.0025474862195551395, "learning_rate": 2.3141644717588528e-05, "loss": 0.0001, "step": 3671 }, { "epoch": 1.0746268656716418, "grad_norm": 0.0026336340233683586, "learning_rate": 2.3134328358208956e-05, "loss": 0.0001, "step": 3672 }, { "epoch": 1.0749195200468247, "grad_norm": 0.0007940291543491185, "learning_rate": 2.3127011998829384e-05, "loss": 0.0, "step": 3673 }, { "epoch": 1.0752121744220076, "grad_norm": 0.001279535237699747, "learning_rate": 2.311969563944981e-05, "loss": 0.0, "step": 3674 }, { "epoch": 1.0755048287971904, "grad_norm": 0.0010387222282588482, "learning_rate": 2.3112379280070236e-05, "loss": 0.0, "step": 3675 }, { "epoch": 1.0757974831723733, "grad_norm": 0.004831269383430481, "learning_rate": 2.3105062920690664e-05, "loss": 0.0001, "step": 3676 }, { "epoch": 1.0760901375475562, "grad_norm": 0.0038754655979573727, "learning_rate": 2.3097746561311092e-05, "loss": 0.0001, "step": 3677 }, { "epoch": 1.0763827919227393, "grad_norm": 0.0028973727021366358, "learning_rate": 2.309043020193152e-05, "loss": 0.0001, "step": 3678 }, { "epoch": 1.0766754462979222, "grad_norm": 0.0024733960162848234, "learning_rate": 2.3083113842551948e-05, "loss": 0.0001, "step": 3679 }, { "epoch": 1.076968100673105, "grad_norm": 0.002890933770686388, "learning_rate": 2.3075797483172373e-05, "loss": 0.0001, "step": 3680 }, { "epoch": 1.077260755048288, "grad_norm": 0.002433110261335969, "learning_rate": 2.30684811237928e-05, "loss": 0.0, "step": 3681 }, { "epoch": 1.0775534094234709, "grad_norm": 0.15135546028614044, "learning_rate": 2.306116476441323e-05, "loss": 0.0003, "step": 3682 }, { "epoch": 1.0778460637986538, "grad_norm": 0.0029673471581190825, "learning_rate": 2.3053848405033656e-05, "loss": 0.0001, "step": 3683 }, { "epoch": 1.0781387181738367, "grad_norm": 0.00278499280102551, "learning_rate": 2.3046532045654084e-05, "loss": 0.0001, "step": 3684 }, { "epoch": 1.0784313725490196, "grad_norm": 0.0024187383241951466, "learning_rate": 2.303921568627451e-05, "loss": 0.0, "step": 3685 }, { "epoch": 1.0787240269242024, "grad_norm": 0.0009933672845363617, "learning_rate": 2.3031899326894937e-05, "loss": 0.0, "step": 3686 }, { "epoch": 1.0790166812993853, "grad_norm": 0.0012278319336473942, "learning_rate": 2.3024582967515365e-05, "loss": 0.0, "step": 3687 }, { "epoch": 1.0793093356745684, "grad_norm": 0.01352784875780344, "learning_rate": 2.3017266608135793e-05, "loss": 0.0001, "step": 3688 }, { "epoch": 1.0796019900497513, "grad_norm": 0.003959336783736944, "learning_rate": 2.300995024875622e-05, "loss": 0.0001, "step": 3689 }, { "epoch": 1.0798946444249342, "grad_norm": 0.0008704708307050169, "learning_rate": 2.300263388937665e-05, "loss": 0.0, "step": 3690 }, { "epoch": 1.080187298800117, "grad_norm": 0.0008812810410745442, "learning_rate": 2.2995317529997073e-05, "loss": 0.0, "step": 3691 }, { "epoch": 1.0804799531753, "grad_norm": 0.007243090774863958, "learning_rate": 2.29880011706175e-05, "loss": 0.0001, "step": 3692 }, { "epoch": 1.0807726075504829, "grad_norm": 0.0025750931818038225, "learning_rate": 2.298068481123793e-05, "loss": 0.0001, "step": 3693 }, { "epoch": 1.0810652619256658, "grad_norm": 0.0019465017830953002, "learning_rate": 2.2973368451858357e-05, "loss": 0.0, "step": 3694 }, { "epoch": 1.0813579163008487, "grad_norm": 0.0012004871387034655, "learning_rate": 2.2966052092478785e-05, "loss": 0.0, "step": 3695 }, { "epoch": 1.0816505706760315, "grad_norm": 0.0010097112972289324, "learning_rate": 2.295873573309921e-05, "loss": 0.0, "step": 3696 }, { "epoch": 1.0819432250512144, "grad_norm": 0.0009873710805550218, "learning_rate": 2.2951419373719638e-05, "loss": 0.0, "step": 3697 }, { "epoch": 1.0822358794263973, "grad_norm": 0.0013015306321904063, "learning_rate": 2.2944103014340065e-05, "loss": 0.0, "step": 3698 }, { "epoch": 1.0825285338015804, "grad_norm": 0.0013580222148448229, "learning_rate": 2.2936786654960493e-05, "loss": 0.0, "step": 3699 }, { "epoch": 1.0828211881767633, "grad_norm": 0.00278102932497859, "learning_rate": 2.292947029558092e-05, "loss": 0.0001, "step": 3700 }, { "epoch": 1.0831138425519462, "grad_norm": 0.008593583479523659, "learning_rate": 2.2922153936201346e-05, "loss": 0.0001, "step": 3701 }, { "epoch": 1.083406496927129, "grad_norm": 0.0005224637570790946, "learning_rate": 2.2914837576821774e-05, "loss": 0.0, "step": 3702 }, { "epoch": 1.083699151302312, "grad_norm": 0.004331057891249657, "learning_rate": 2.2907521217442202e-05, "loss": 0.0001, "step": 3703 }, { "epoch": 1.0839918056774949, "grad_norm": 0.0006074766279198229, "learning_rate": 2.290020485806263e-05, "loss": 0.0, "step": 3704 }, { "epoch": 1.0842844600526778, "grad_norm": 0.0010289569618180394, "learning_rate": 2.2892888498683058e-05, "loss": 0.0, "step": 3705 }, { "epoch": 1.0845771144278606, "grad_norm": 0.0007409527315758169, "learning_rate": 2.2885572139303486e-05, "loss": 0.0, "step": 3706 }, { "epoch": 1.0848697688030435, "grad_norm": 0.0006891212542541325, "learning_rate": 2.287825577992391e-05, "loss": 0.0, "step": 3707 }, { "epoch": 1.0851624231782264, "grad_norm": 0.001396518899127841, "learning_rate": 2.2870939420544338e-05, "loss": 0.0, "step": 3708 }, { "epoch": 1.0854550775534095, "grad_norm": 0.001946881995536387, "learning_rate": 2.2863623061164766e-05, "loss": 0.0, "step": 3709 }, { "epoch": 1.0857477319285924, "grad_norm": 0.000842281908262521, "learning_rate": 2.2856306701785194e-05, "loss": 0.0, "step": 3710 }, { "epoch": 1.0860403863037753, "grad_norm": 0.0006059493171051145, "learning_rate": 2.2848990342405622e-05, "loss": 0.0, "step": 3711 }, { "epoch": 1.0863330406789582, "grad_norm": 0.003021880518645048, "learning_rate": 2.2841673983026047e-05, "loss": 0.0, "step": 3712 }, { "epoch": 1.086625695054141, "grad_norm": 0.0009339431999251246, "learning_rate": 2.2834357623646475e-05, "loss": 0.0, "step": 3713 }, { "epoch": 1.086918349429324, "grad_norm": 0.0011237096041440964, "learning_rate": 2.2827041264266903e-05, "loss": 0.0, "step": 3714 }, { "epoch": 1.0872110038045069, "grad_norm": 0.002793660154566169, "learning_rate": 2.281972490488733e-05, "loss": 0.0, "step": 3715 }, { "epoch": 1.0875036581796897, "grad_norm": 0.0008379679638892412, "learning_rate": 2.281240854550776e-05, "loss": 0.0, "step": 3716 }, { "epoch": 1.0877963125548726, "grad_norm": 0.0008086669258773327, "learning_rate": 2.2805092186128183e-05, "loss": 0.0, "step": 3717 }, { "epoch": 1.0880889669300555, "grad_norm": 0.004301237873733044, "learning_rate": 2.279777582674861e-05, "loss": 0.0, "step": 3718 }, { "epoch": 1.0883816213052384, "grad_norm": 0.0007910404237918556, "learning_rate": 2.279045946736904e-05, "loss": 0.0, "step": 3719 }, { "epoch": 1.0886742756804215, "grad_norm": 0.0005540227284654975, "learning_rate": 2.2783143107989467e-05, "loss": 0.0, "step": 3720 }, { "epoch": 1.0889669300556044, "grad_norm": 0.002769920974969864, "learning_rate": 2.2775826748609895e-05, "loss": 0.0, "step": 3721 }, { "epoch": 1.0892595844307873, "grad_norm": 0.0006901599117554724, "learning_rate": 2.2768510389230323e-05, "loss": 0.0, "step": 3722 }, { "epoch": 1.0895522388059702, "grad_norm": 0.016822081059217453, "learning_rate": 2.2761194029850747e-05, "loss": 0.0001, "step": 3723 }, { "epoch": 1.089844893181153, "grad_norm": 0.0010369161609560251, "learning_rate": 2.2753877670471175e-05, "loss": 0.0, "step": 3724 }, { "epoch": 1.090137547556336, "grad_norm": 0.0011231061071157455, "learning_rate": 2.2746561311091603e-05, "loss": 0.0, "step": 3725 }, { "epoch": 1.0904302019315188, "grad_norm": 0.001505909371189773, "learning_rate": 2.273924495171203e-05, "loss": 0.0, "step": 3726 }, { "epoch": 1.0907228563067017, "grad_norm": 0.0013153402833268046, "learning_rate": 2.273192859233246e-05, "loss": 0.0, "step": 3727 }, { "epoch": 1.0910155106818846, "grad_norm": 0.0010314127430319786, "learning_rate": 2.2724612232952884e-05, "loss": 0.0, "step": 3728 }, { "epoch": 1.0913081650570675, "grad_norm": 0.000518880202434957, "learning_rate": 2.271729587357331e-05, "loss": 0.0, "step": 3729 }, { "epoch": 1.0916008194322506, "grad_norm": 0.0004906303365714848, "learning_rate": 2.270997951419374e-05, "loss": 0.0, "step": 3730 }, { "epoch": 1.0918934738074335, "grad_norm": 0.00047414834261871874, "learning_rate": 2.2702663154814168e-05, "loss": 0.0, "step": 3731 }, { "epoch": 1.0921861281826164, "grad_norm": 0.0015571311814710498, "learning_rate": 2.2695346795434595e-05, "loss": 0.0, "step": 3732 }, { "epoch": 1.0924787825577993, "grad_norm": 0.015585982240736485, "learning_rate": 2.268803043605502e-05, "loss": 0.0001, "step": 3733 }, { "epoch": 1.0927714369329822, "grad_norm": 0.0012408023467287421, "learning_rate": 2.2680714076675448e-05, "loss": 0.0, "step": 3734 }, { "epoch": 1.093064091308165, "grad_norm": 0.0005492728087119758, "learning_rate": 2.2673397717295876e-05, "loss": 0.0, "step": 3735 }, { "epoch": 1.093356745683348, "grad_norm": 0.000611587893217802, "learning_rate": 2.2666081357916304e-05, "loss": 0.0, "step": 3736 }, { "epoch": 1.0936494000585308, "grad_norm": 0.0016189048765227199, "learning_rate": 2.265876499853673e-05, "loss": 0.0, "step": 3737 }, { "epoch": 1.0939420544337137, "grad_norm": 0.0011567751644179225, "learning_rate": 2.2651448639157156e-05, "loss": 0.0, "step": 3738 }, { "epoch": 1.0942347088088966, "grad_norm": 0.009464586153626442, "learning_rate": 2.2644132279777584e-05, "loss": 0.0001, "step": 3739 }, { "epoch": 1.0945273631840795, "grad_norm": 0.0015173718566074967, "learning_rate": 2.2636815920398012e-05, "loss": 0.0, "step": 3740 }, { "epoch": 1.0948200175592626, "grad_norm": 0.0005763802328146994, "learning_rate": 2.2629499561018437e-05, "loss": 0.0, "step": 3741 }, { "epoch": 1.0951126719344455, "grad_norm": 0.0008895478094927967, "learning_rate": 2.2622183201638865e-05, "loss": 0.0, "step": 3742 }, { "epoch": 1.0954053263096284, "grad_norm": 0.0006199186318553984, "learning_rate": 2.2614866842259293e-05, "loss": 0.0, "step": 3743 }, { "epoch": 1.0956979806848113, "grad_norm": 0.0002753791050054133, "learning_rate": 2.260755048287972e-05, "loss": 0.0, "step": 3744 }, { "epoch": 1.0959906350599942, "grad_norm": 0.000554372847545892, "learning_rate": 2.2600234123500145e-05, "loss": 0.0, "step": 3745 }, { "epoch": 1.096283289435177, "grad_norm": 0.0011892981128767133, "learning_rate": 2.2592917764120573e-05, "loss": 0.0, "step": 3746 }, { "epoch": 1.09657594381036, "grad_norm": 0.0006116424920037389, "learning_rate": 2.2585601404741e-05, "loss": 0.0, "step": 3747 }, { "epoch": 1.0968685981855428, "grad_norm": 0.004664978478103876, "learning_rate": 2.257828504536143e-05, "loss": 0.0, "step": 3748 }, { "epoch": 1.0971612525607257, "grad_norm": 0.0008806657278910279, "learning_rate": 2.2570968685981854e-05, "loss": 0.0, "step": 3749 }, { "epoch": 1.0974539069359086, "grad_norm": 0.00028525033849291503, "learning_rate": 2.256365232660228e-05, "loss": 0.0, "step": 3750 }, { "epoch": 1.0977465613110917, "grad_norm": 0.0006327142473310232, "learning_rate": 2.255633596722271e-05, "loss": 0.0, "step": 3751 }, { "epoch": 1.0980392156862746, "grad_norm": 0.00039784680120646954, "learning_rate": 2.2549019607843138e-05, "loss": 0.0, "step": 3752 }, { "epoch": 1.0983318700614575, "grad_norm": 0.0009211124270223081, "learning_rate": 2.2541703248463565e-05, "loss": 0.0, "step": 3753 }, { "epoch": 1.0986245244366404, "grad_norm": 0.00081194459926337, "learning_rate": 2.253438688908399e-05, "loss": 0.0, "step": 3754 }, { "epoch": 1.0989171788118233, "grad_norm": 0.0010225963778793812, "learning_rate": 2.2527070529704418e-05, "loss": 0.0, "step": 3755 }, { "epoch": 1.0992098331870062, "grad_norm": 0.0015989018138498068, "learning_rate": 2.2519754170324846e-05, "loss": 0.0, "step": 3756 }, { "epoch": 1.099502487562189, "grad_norm": 0.13136620819568634, "learning_rate": 2.2512437810945274e-05, "loss": 0.0004, "step": 3757 }, { "epoch": 1.099795141937372, "grad_norm": 0.009679006412625313, "learning_rate": 2.2505121451565702e-05, "loss": 0.0001, "step": 3758 }, { "epoch": 1.1000877963125548, "grad_norm": 0.00034133202279917896, "learning_rate": 2.249780509218613e-05, "loss": 0.0, "step": 3759 }, { "epoch": 1.1003804506877377, "grad_norm": 0.0007985789561644197, "learning_rate": 2.2490488732806554e-05, "loss": 0.0, "step": 3760 }, { "epoch": 1.1006731050629206, "grad_norm": 0.0003673519822768867, "learning_rate": 2.2483172373426982e-05, "loss": 0.0, "step": 3761 }, { "epoch": 1.1009657594381035, "grad_norm": 0.005583070684224367, "learning_rate": 2.247585601404741e-05, "loss": 0.0, "step": 3762 }, { "epoch": 1.1012584138132866, "grad_norm": 0.000653867784421891, "learning_rate": 2.2468539654667838e-05, "loss": 0.0, "step": 3763 }, { "epoch": 1.1015510681884695, "grad_norm": 0.000688259897287935, "learning_rate": 2.2461223295288266e-05, "loss": 0.0, "step": 3764 }, { "epoch": 1.1018437225636524, "grad_norm": 0.0013501718640327454, "learning_rate": 2.245390693590869e-05, "loss": 0.0, "step": 3765 }, { "epoch": 1.1021363769388353, "grad_norm": 0.0008501984993927181, "learning_rate": 2.244659057652912e-05, "loss": 0.0, "step": 3766 }, { "epoch": 1.1024290313140181, "grad_norm": 0.0009289407753385603, "learning_rate": 2.2439274217149547e-05, "loss": 0.0, "step": 3767 }, { "epoch": 1.102721685689201, "grad_norm": 0.00038086893619038165, "learning_rate": 2.2431957857769975e-05, "loss": 0.0, "step": 3768 }, { "epoch": 1.103014340064384, "grad_norm": 0.0015734551707282662, "learning_rate": 2.2424641498390403e-05, "loss": 0.0, "step": 3769 }, { "epoch": 1.1033069944395668, "grad_norm": 0.0005957810208201408, "learning_rate": 2.2417325139010827e-05, "loss": 0.0, "step": 3770 }, { "epoch": 1.1035996488147497, "grad_norm": 0.00042635598219931126, "learning_rate": 2.2410008779631255e-05, "loss": 0.0, "step": 3771 }, { "epoch": 1.1038923031899326, "grad_norm": 0.00046527470112778246, "learning_rate": 2.2402692420251683e-05, "loss": 0.0, "step": 3772 }, { "epoch": 1.1041849575651157, "grad_norm": 0.0016610355814918876, "learning_rate": 2.239537606087211e-05, "loss": 0.0, "step": 3773 }, { "epoch": 1.1044776119402986, "grad_norm": 0.0005930790212005377, "learning_rate": 2.238805970149254e-05, "loss": 0.0, "step": 3774 }, { "epoch": 1.1047702663154815, "grad_norm": 0.0014426594134420156, "learning_rate": 2.2380743342112967e-05, "loss": 0.0, "step": 3775 }, { "epoch": 1.1050629206906644, "grad_norm": 0.0018905469914898276, "learning_rate": 2.237342698273339e-05, "loss": 0.0, "step": 3776 }, { "epoch": 1.1053555750658473, "grad_norm": 0.005967998877167702, "learning_rate": 2.236611062335382e-05, "loss": 0.0001, "step": 3777 }, { "epoch": 1.1056482294410301, "grad_norm": 0.0006593017023988068, "learning_rate": 2.2358794263974247e-05, "loss": 0.0, "step": 3778 }, { "epoch": 1.105940883816213, "grad_norm": 0.00027490098727867007, "learning_rate": 2.2351477904594675e-05, "loss": 0.0, "step": 3779 }, { "epoch": 1.106233538191396, "grad_norm": 0.002138617681339383, "learning_rate": 2.2344161545215103e-05, "loss": 0.0, "step": 3780 }, { "epoch": 1.1065261925665788, "grad_norm": 0.0010131902527064085, "learning_rate": 2.2336845185835528e-05, "loss": 0.0, "step": 3781 }, { "epoch": 1.1068188469417617, "grad_norm": 0.006386586930602789, "learning_rate": 2.2329528826455956e-05, "loss": 0.0, "step": 3782 }, { "epoch": 1.1071115013169446, "grad_norm": 0.0010048957774415612, "learning_rate": 2.2322212467076384e-05, "loss": 0.0, "step": 3783 }, { "epoch": 1.1074041556921277, "grad_norm": 0.0010382654145359993, "learning_rate": 2.231489610769681e-05, "loss": 0.0, "step": 3784 }, { "epoch": 1.1076968100673106, "grad_norm": 0.001115762977860868, "learning_rate": 2.230757974831724e-05, "loss": 0.0, "step": 3785 }, { "epoch": 1.1079894644424935, "grad_norm": 0.00950796902179718, "learning_rate": 2.2300263388937664e-05, "loss": 0.0001, "step": 3786 }, { "epoch": 1.1082821188176764, "grad_norm": 0.0006844153394922614, "learning_rate": 2.2292947029558092e-05, "loss": 0.0, "step": 3787 }, { "epoch": 1.1085747731928592, "grad_norm": 0.001086718519218266, "learning_rate": 2.228563067017852e-05, "loss": 0.0, "step": 3788 }, { "epoch": 1.1088674275680421, "grad_norm": 0.0005012504407204688, "learning_rate": 2.2278314310798948e-05, "loss": 0.0, "step": 3789 }, { "epoch": 1.109160081943225, "grad_norm": 0.0007421081536449492, "learning_rate": 2.2270997951419376e-05, "loss": 0.0, "step": 3790 }, { "epoch": 1.109452736318408, "grad_norm": 0.0006756620132364333, "learning_rate": 2.2263681592039804e-05, "loss": 0.0, "step": 3791 }, { "epoch": 1.1097453906935908, "grad_norm": 0.0004992782487533987, "learning_rate": 2.225636523266023e-05, "loss": 0.0, "step": 3792 }, { "epoch": 1.1100380450687737, "grad_norm": 0.0009807657916098833, "learning_rate": 2.2249048873280656e-05, "loss": 0.0, "step": 3793 }, { "epoch": 1.1103306994439568, "grad_norm": 0.002366979606449604, "learning_rate": 2.2241732513901084e-05, "loss": 0.0, "step": 3794 }, { "epoch": 1.1106233538191397, "grad_norm": 0.00036062311846762896, "learning_rate": 2.2234416154521512e-05, "loss": 0.0, "step": 3795 }, { "epoch": 1.1109160081943226, "grad_norm": 0.00047910830471664667, "learning_rate": 2.222709979514194e-05, "loss": 0.0, "step": 3796 }, { "epoch": 1.1112086625695055, "grad_norm": 0.0006496130372397602, "learning_rate": 2.2219783435762365e-05, "loss": 0.0, "step": 3797 }, { "epoch": 1.1115013169446883, "grad_norm": 0.0006131701520644128, "learning_rate": 2.2212467076382793e-05, "loss": 0.0, "step": 3798 }, { "epoch": 1.1117939713198712, "grad_norm": 0.0005057764356024563, "learning_rate": 2.220515071700322e-05, "loss": 0.0, "step": 3799 }, { "epoch": 1.1120866256950541, "grad_norm": 0.0007563771214336157, "learning_rate": 2.219783435762365e-05, "loss": 0.0, "step": 3800 }, { "epoch": 1.112379280070237, "grad_norm": 3.374889612197876, "learning_rate": 2.2190517998244077e-05, "loss": 0.1949, "step": 3801 }, { "epoch": 1.11267193444542, "grad_norm": 0.0005034583737142384, "learning_rate": 2.21832016388645e-05, "loss": 0.0, "step": 3802 }, { "epoch": 1.1129645888206028, "grad_norm": 0.0008981447317637503, "learning_rate": 2.217588527948493e-05, "loss": 0.0, "step": 3803 }, { "epoch": 1.1132572431957857, "grad_norm": 0.00942637026309967, "learning_rate": 2.2168568920105357e-05, "loss": 0.0001, "step": 3804 }, { "epoch": 1.1135498975709688, "grad_norm": 0.07058790326118469, "learning_rate": 2.2161252560725785e-05, "loss": 0.0002, "step": 3805 }, { "epoch": 1.1138425519461517, "grad_norm": 0.0011174330720677972, "learning_rate": 2.2153936201346213e-05, "loss": 0.0, "step": 3806 }, { "epoch": 1.1141352063213346, "grad_norm": 0.030590757727622986, "learning_rate": 2.2146619841966638e-05, "loss": 0.0004, "step": 3807 }, { "epoch": 1.1144278606965174, "grad_norm": 0.014479140751063824, "learning_rate": 2.2139303482587065e-05, "loss": 0.0002, "step": 3808 }, { "epoch": 1.1147205150717003, "grad_norm": 1.097375750541687, "learning_rate": 2.2131987123207493e-05, "loss": 0.0029, "step": 3809 }, { "epoch": 1.1150131694468832, "grad_norm": 0.20258143544197083, "learning_rate": 2.212467076382792e-05, "loss": 0.0011, "step": 3810 }, { "epoch": 1.1153058238220661, "grad_norm": 0.01319414097815752, "learning_rate": 2.211735440444835e-05, "loss": 0.0002, "step": 3811 }, { "epoch": 1.115598478197249, "grad_norm": 0.001642485847696662, "learning_rate": 2.2110038045068777e-05, "loss": 0.0, "step": 3812 }, { "epoch": 1.1158911325724319, "grad_norm": 0.004467404447495937, "learning_rate": 2.2102721685689202e-05, "loss": 0.0001, "step": 3813 }, { "epoch": 1.1161837869476148, "grad_norm": 6.050930500030518, "learning_rate": 2.209540532630963e-05, "loss": 0.1099, "step": 3814 }, { "epoch": 1.1164764413227979, "grad_norm": 0.029940001666545868, "learning_rate": 2.2088088966930058e-05, "loss": 0.0004, "step": 3815 }, { "epoch": 1.1167690956979808, "grad_norm": 0.003221668303012848, "learning_rate": 2.2080772607550486e-05, "loss": 0.0001, "step": 3816 }, { "epoch": 1.1170617500731637, "grad_norm": 0.015286382287740707, "learning_rate": 2.2073456248170914e-05, "loss": 0.0002, "step": 3817 }, { "epoch": 1.1173544044483465, "grad_norm": 0.0132718151435256, "learning_rate": 2.2066139888791338e-05, "loss": 0.0002, "step": 3818 }, { "epoch": 1.1176470588235294, "grad_norm": 0.056354206055402756, "learning_rate": 2.2058823529411766e-05, "loss": 0.0004, "step": 3819 }, { "epoch": 1.1179397131987123, "grad_norm": 0.0911208763718605, "learning_rate": 2.2051507170032194e-05, "loss": 0.0011, "step": 3820 }, { "epoch": 1.1182323675738952, "grad_norm": 0.07263613492250443, "learning_rate": 2.2044190810652622e-05, "loss": 0.0006, "step": 3821 }, { "epoch": 1.118525021949078, "grad_norm": 2.2835283279418945, "learning_rate": 2.203687445127305e-05, "loss": 0.0163, "step": 3822 }, { "epoch": 1.118817676324261, "grad_norm": 0.05524302273988724, "learning_rate": 2.2029558091893475e-05, "loss": 0.0007, "step": 3823 }, { "epoch": 1.1191103306994439, "grad_norm": 0.0030861482955515385, "learning_rate": 2.2022241732513903e-05, "loss": 0.0, "step": 3824 }, { "epoch": 1.1194029850746268, "grad_norm": 0.0007699658162891865, "learning_rate": 2.201492537313433e-05, "loss": 0.0, "step": 3825 }, { "epoch": 1.1196956394498099, "grad_norm": 0.004165220074355602, "learning_rate": 2.200760901375476e-05, "loss": 0.0001, "step": 3826 }, { "epoch": 1.1199882938249928, "grad_norm": 0.0010617617517709732, "learning_rate": 2.2000292654375186e-05, "loss": 0.0, "step": 3827 }, { "epoch": 1.1202809482001757, "grad_norm": 0.000694679154548794, "learning_rate": 2.199297629499561e-05, "loss": 0.0, "step": 3828 }, { "epoch": 1.1205736025753585, "grad_norm": 0.0009077360155060887, "learning_rate": 2.198565993561604e-05, "loss": 0.0, "step": 3829 }, { "epoch": 1.1208662569505414, "grad_norm": 0.0010671500349417329, "learning_rate": 2.1978343576236467e-05, "loss": 0.0, "step": 3830 }, { "epoch": 1.1211589113257243, "grad_norm": 0.0006664483807981014, "learning_rate": 2.1971027216856895e-05, "loss": 0.0, "step": 3831 }, { "epoch": 1.1214515657009072, "grad_norm": 0.00031987507827579975, "learning_rate": 2.196371085747732e-05, "loss": 0.0, "step": 3832 }, { "epoch": 1.12174422007609, "grad_norm": 0.001221408136188984, "learning_rate": 2.1956394498097747e-05, "loss": 0.0, "step": 3833 }, { "epoch": 1.122036874451273, "grad_norm": 0.09421882778406143, "learning_rate": 2.1949078138718175e-05, "loss": 0.0004, "step": 3834 }, { "epoch": 1.1223295288264559, "grad_norm": 0.0010368202347308397, "learning_rate": 2.1941761779338603e-05, "loss": 0.0, "step": 3835 }, { "epoch": 1.122622183201639, "grad_norm": 0.0008518972317688167, "learning_rate": 2.1934445419959028e-05, "loss": 0.0, "step": 3836 }, { "epoch": 1.1229148375768219, "grad_norm": 0.0005097311805002391, "learning_rate": 2.1927129060579456e-05, "loss": 0.0, "step": 3837 }, { "epoch": 1.1232074919520048, "grad_norm": 0.0004974879557266831, "learning_rate": 2.1919812701199884e-05, "loss": 0.0, "step": 3838 }, { "epoch": 1.1235001463271876, "grad_norm": 0.0005729788681492209, "learning_rate": 2.191249634182031e-05, "loss": 0.0, "step": 3839 }, { "epoch": 1.1237928007023705, "grad_norm": 0.0005809550639241934, "learning_rate": 2.1905179982440736e-05, "loss": 0.0, "step": 3840 }, { "epoch": 1.1240854550775534, "grad_norm": 0.00042159974691458046, "learning_rate": 2.1897863623061164e-05, "loss": 0.0, "step": 3841 }, { "epoch": 1.1243781094527363, "grad_norm": 0.00027188719832338393, "learning_rate": 2.1890547263681592e-05, "loss": 0.0, "step": 3842 }, { "epoch": 1.1246707638279192, "grad_norm": 0.0004589363234117627, "learning_rate": 2.188323090430202e-05, "loss": 0.0, "step": 3843 }, { "epoch": 1.124963418203102, "grad_norm": 0.0006850520730949938, "learning_rate": 2.1875914544922448e-05, "loss": 0.0, "step": 3844 }, { "epoch": 1.125256072578285, "grad_norm": 0.0022457025479525328, "learning_rate": 2.1868598185542873e-05, "loss": 0.0, "step": 3845 }, { "epoch": 1.1255487269534679, "grad_norm": 0.0006316835060715675, "learning_rate": 2.18612818261633e-05, "loss": 0.0, "step": 3846 }, { "epoch": 1.1258413813286507, "grad_norm": 0.00030914306989870965, "learning_rate": 2.185396546678373e-05, "loss": 0.0, "step": 3847 }, { "epoch": 1.1261340357038339, "grad_norm": 0.00035717347054742277, "learning_rate": 2.1846649107404156e-05, "loss": 0.0, "step": 3848 }, { "epoch": 1.1264266900790167, "grad_norm": 0.0013712747022509575, "learning_rate": 2.1839332748024584e-05, "loss": 0.0, "step": 3849 }, { "epoch": 1.1267193444541996, "grad_norm": 0.0019398657605051994, "learning_rate": 2.183201638864501e-05, "loss": 0.0, "step": 3850 }, { "epoch": 1.1270119988293825, "grad_norm": 8.307303428649902, "learning_rate": 2.1824700029265437e-05, "loss": 0.0894, "step": 3851 }, { "epoch": 1.1273046532045654, "grad_norm": 3.9371161460876465, "learning_rate": 2.1817383669885865e-05, "loss": 0.0099, "step": 3852 }, { "epoch": 1.1275973075797483, "grad_norm": 0.000982138211838901, "learning_rate": 2.1810067310506293e-05, "loss": 0.0, "step": 3853 }, { "epoch": 1.1278899619549312, "grad_norm": 0.0011270848335698247, "learning_rate": 2.180275095112672e-05, "loss": 0.0, "step": 3854 }, { "epoch": 1.128182616330114, "grad_norm": 0.1403433382511139, "learning_rate": 2.1795434591747145e-05, "loss": 0.0003, "step": 3855 }, { "epoch": 1.128475270705297, "grad_norm": 0.014551272615790367, "learning_rate": 2.1788118232367573e-05, "loss": 0.0001, "step": 3856 }, { "epoch": 1.12876792508048, "grad_norm": 12.923527717590332, "learning_rate": 2.1780801872988e-05, "loss": 0.0838, "step": 3857 }, { "epoch": 1.129060579455663, "grad_norm": 0.000964752456638962, "learning_rate": 2.177348551360843e-05, "loss": 0.0, "step": 3858 }, { "epoch": 1.1293532338308458, "grad_norm": 0.0010205963626503944, "learning_rate": 2.1766169154228857e-05, "loss": 0.0, "step": 3859 }, { "epoch": 1.1296458882060287, "grad_norm": 0.0003431029326748103, "learning_rate": 2.1758852794849285e-05, "loss": 0.0, "step": 3860 }, { "epoch": 1.1299385425812116, "grad_norm": 0.00042271107668057084, "learning_rate": 2.175153643546971e-05, "loss": 0.0, "step": 3861 }, { "epoch": 1.1302311969563945, "grad_norm": 0.0009413016960024834, "learning_rate": 2.1744220076090138e-05, "loss": 0.0, "step": 3862 }, { "epoch": 1.1305238513315774, "grad_norm": 0.00035180142731405795, "learning_rate": 2.1736903716710565e-05, "loss": 0.0, "step": 3863 }, { "epoch": 1.1308165057067603, "grad_norm": 0.0008406572160311043, "learning_rate": 2.1729587357330993e-05, "loss": 0.0, "step": 3864 }, { "epoch": 1.1311091600819432, "grad_norm": 0.0013551930896937847, "learning_rate": 2.172227099795142e-05, "loss": 0.0, "step": 3865 }, { "epoch": 1.131401814457126, "grad_norm": 0.0004497410263866186, "learning_rate": 2.1714954638571846e-05, "loss": 0.0, "step": 3866 }, { "epoch": 1.131694468832309, "grad_norm": 0.00132949638646096, "learning_rate": 2.1707638279192274e-05, "loss": 0.0, "step": 3867 }, { "epoch": 1.1319871232074918, "grad_norm": 0.0037383574526757, "learning_rate": 2.1700321919812702e-05, "loss": 0.0001, "step": 3868 }, { "epoch": 1.132279777582675, "grad_norm": 0.001089840428903699, "learning_rate": 2.169300556043313e-05, "loss": 0.0, "step": 3869 }, { "epoch": 1.1325724319578578, "grad_norm": 0.0005344722303561866, "learning_rate": 2.1685689201053558e-05, "loss": 0.0, "step": 3870 }, { "epoch": 1.1328650863330407, "grad_norm": 0.001129949581809342, "learning_rate": 2.1678372841673982e-05, "loss": 0.0, "step": 3871 }, { "epoch": 1.1331577407082236, "grad_norm": 0.001591612584888935, "learning_rate": 2.167105648229441e-05, "loss": 0.0, "step": 3872 }, { "epoch": 1.1334503950834065, "grad_norm": 0.0008599033462814987, "learning_rate": 2.1663740122914838e-05, "loss": 0.0, "step": 3873 }, { "epoch": 1.1337430494585894, "grad_norm": 0.0004773321852553636, "learning_rate": 2.1656423763535266e-05, "loss": 0.0, "step": 3874 }, { "epoch": 1.1340357038337723, "grad_norm": 0.0018014417728409171, "learning_rate": 2.1649107404155694e-05, "loss": 0.0, "step": 3875 }, { "epoch": 1.1343283582089552, "grad_norm": 0.0020266417413949966, "learning_rate": 2.164179104477612e-05, "loss": 0.0, "step": 3876 }, { "epoch": 1.134621012584138, "grad_norm": 0.21160373091697693, "learning_rate": 2.1634474685396547e-05, "loss": 0.0011, "step": 3877 }, { "epoch": 1.1349136669593212, "grad_norm": 0.01681935228407383, "learning_rate": 2.1627158326016975e-05, "loss": 0.0001, "step": 3878 }, { "epoch": 1.135206321334504, "grad_norm": 0.0005730148404836655, "learning_rate": 2.1619841966637403e-05, "loss": 0.0, "step": 3879 }, { "epoch": 1.135498975709687, "grad_norm": 0.0013979710638523102, "learning_rate": 2.161252560725783e-05, "loss": 0.0, "step": 3880 }, { "epoch": 1.1357916300848698, "grad_norm": 0.0007042467477731407, "learning_rate": 2.160520924787826e-05, "loss": 0.0, "step": 3881 }, { "epoch": 1.1360842844600527, "grad_norm": 0.0012201687786728144, "learning_rate": 2.1597892888498683e-05, "loss": 0.0, "step": 3882 }, { "epoch": 1.1363769388352356, "grad_norm": 0.000890715979039669, "learning_rate": 2.159057652911911e-05, "loss": 0.0, "step": 3883 }, { "epoch": 1.1366695932104185, "grad_norm": 0.0024068302009254694, "learning_rate": 2.158326016973954e-05, "loss": 0.0, "step": 3884 }, { "epoch": 1.1369622475856014, "grad_norm": 0.0013551790034398437, "learning_rate": 2.1575943810359967e-05, "loss": 0.0, "step": 3885 }, { "epoch": 1.1372549019607843, "grad_norm": 0.055313825607299805, "learning_rate": 2.1568627450980395e-05, "loss": 0.0002, "step": 3886 }, { "epoch": 1.1375475563359672, "grad_norm": 0.0012204163940623403, "learning_rate": 2.156131109160082e-05, "loss": 0.0, "step": 3887 }, { "epoch": 1.13784021071115, "grad_norm": 0.0007513660821132362, "learning_rate": 2.1553994732221247e-05, "loss": 0.0, "step": 3888 }, { "epoch": 1.138132865086333, "grad_norm": 0.00091296446043998, "learning_rate": 2.1546678372841675e-05, "loss": 0.0, "step": 3889 }, { "epoch": 1.138425519461516, "grad_norm": 0.0008509410545229912, "learning_rate": 2.1539362013462103e-05, "loss": 0.0, "step": 3890 }, { "epoch": 1.138718173836699, "grad_norm": 0.0007758307037875056, "learning_rate": 2.153204565408253e-05, "loss": 0.0, "step": 3891 }, { "epoch": 1.1390108282118818, "grad_norm": 0.0006105030188336968, "learning_rate": 2.1524729294702956e-05, "loss": 0.0, "step": 3892 }, { "epoch": 1.1393034825870647, "grad_norm": 0.00399687048047781, "learning_rate": 2.1517412935323384e-05, "loss": 0.0, "step": 3893 }, { "epoch": 1.1395961369622476, "grad_norm": 0.6638000011444092, "learning_rate": 2.151009657594381e-05, "loss": 0.0009, "step": 3894 }, { "epoch": 1.1398887913374305, "grad_norm": 0.0012894651154056191, "learning_rate": 2.150278021656424e-05, "loss": 0.0, "step": 3895 }, { "epoch": 1.1401814457126134, "grad_norm": 0.0012685739202424884, "learning_rate": 2.1495463857184668e-05, "loss": 0.0, "step": 3896 }, { "epoch": 1.1404741000877963, "grad_norm": 0.00038186321035027504, "learning_rate": 2.1488147497805095e-05, "loss": 0.0, "step": 3897 }, { "epoch": 1.1407667544629791, "grad_norm": 0.005274210590869188, "learning_rate": 2.148083113842552e-05, "loss": 0.0001, "step": 3898 }, { "epoch": 1.141059408838162, "grad_norm": 0.0008092095959000289, "learning_rate": 2.1473514779045948e-05, "loss": 0.0, "step": 3899 }, { "epoch": 1.1413520632133451, "grad_norm": 0.002121126279234886, "learning_rate": 2.1466198419666376e-05, "loss": 0.0, "step": 3900 }, { "epoch": 1.141644717588528, "grad_norm": 0.00620256457477808, "learning_rate": 2.1458882060286804e-05, "loss": 0.0001, "step": 3901 }, { "epoch": 1.141937371963711, "grad_norm": 0.0012706911657005548, "learning_rate": 2.1451565700907232e-05, "loss": 0.0, "step": 3902 }, { "epoch": 1.1422300263388938, "grad_norm": 0.0016427603550255299, "learning_rate": 2.1444249341527656e-05, "loss": 0.0, "step": 3903 }, { "epoch": 1.1425226807140767, "grad_norm": 0.004213469102978706, "learning_rate": 2.1436932982148084e-05, "loss": 0.0001, "step": 3904 }, { "epoch": 1.1428153350892596, "grad_norm": 0.003851081943139434, "learning_rate": 2.1429616622768512e-05, "loss": 0.0001, "step": 3905 }, { "epoch": 1.1431079894644425, "grad_norm": 0.0013577681966125965, "learning_rate": 2.142230026338894e-05, "loss": 0.0, "step": 3906 }, { "epoch": 1.1434006438396254, "grad_norm": 0.0006001973524689674, "learning_rate": 2.1414983904009368e-05, "loss": 0.0, "step": 3907 }, { "epoch": 1.1436932982148083, "grad_norm": 0.002575696911662817, "learning_rate": 2.1407667544629793e-05, "loss": 0.0, "step": 3908 }, { "epoch": 1.1439859525899911, "grad_norm": 0.0007619388052262366, "learning_rate": 2.140035118525022e-05, "loss": 0.0, "step": 3909 }, { "epoch": 1.144278606965174, "grad_norm": 0.0007594061316922307, "learning_rate": 2.139303482587065e-05, "loss": 0.0, "step": 3910 }, { "epoch": 1.144571261340357, "grad_norm": 0.035657331347465515, "learning_rate": 2.1385718466491077e-05, "loss": 0.0002, "step": 3911 }, { "epoch": 1.14486391571554, "grad_norm": 0.0011063236743211746, "learning_rate": 2.1378402107111505e-05, "loss": 0.0, "step": 3912 }, { "epoch": 1.145156570090723, "grad_norm": 0.0025668165180832148, "learning_rate": 2.1371085747731933e-05, "loss": 0.0, "step": 3913 }, { "epoch": 1.1454492244659058, "grad_norm": 0.0062032961286604404, "learning_rate": 2.1363769388352357e-05, "loss": 0.0001, "step": 3914 }, { "epoch": 1.1457418788410887, "grad_norm": 0.01251330878585577, "learning_rate": 2.1356453028972785e-05, "loss": 0.0001, "step": 3915 }, { "epoch": 1.1460345332162716, "grad_norm": 0.0013093686429783702, "learning_rate": 2.1349136669593213e-05, "loss": 0.0, "step": 3916 }, { "epoch": 1.1463271875914545, "grad_norm": 0.0021693778689950705, "learning_rate": 2.134182031021364e-05, "loss": 0.0, "step": 3917 }, { "epoch": 1.1466198419666374, "grad_norm": 0.056038640439510345, "learning_rate": 2.1334503950834065e-05, "loss": 0.0003, "step": 3918 }, { "epoch": 1.1469124963418202, "grad_norm": 0.0013667010935023427, "learning_rate": 2.1327187591454493e-05, "loss": 0.0, "step": 3919 }, { "epoch": 1.1472051507170031, "grad_norm": 0.0008248764206655324, "learning_rate": 2.131987123207492e-05, "loss": 0.0, "step": 3920 }, { "epoch": 1.1474978050921862, "grad_norm": 0.0006004977622069418, "learning_rate": 2.131255487269535e-05, "loss": 0.0, "step": 3921 }, { "epoch": 1.1477904594673691, "grad_norm": 0.0019180545350536704, "learning_rate": 2.1305238513315774e-05, "loss": 0.0, "step": 3922 }, { "epoch": 1.148083113842552, "grad_norm": 0.0006675656768493354, "learning_rate": 2.1297922153936202e-05, "loss": 0.0, "step": 3923 }, { "epoch": 1.148375768217735, "grad_norm": 0.002919198013842106, "learning_rate": 2.129060579455663e-05, "loss": 0.0, "step": 3924 }, { "epoch": 1.1486684225929178, "grad_norm": 0.0006825899472460151, "learning_rate": 2.1283289435177058e-05, "loss": 0.0, "step": 3925 }, { "epoch": 1.1489610769681007, "grad_norm": 0.0016971147852018476, "learning_rate": 2.1275973075797486e-05, "loss": 0.0, "step": 3926 }, { "epoch": 1.1492537313432836, "grad_norm": 0.0005503272986970842, "learning_rate": 2.126865671641791e-05, "loss": 0.0, "step": 3927 }, { "epoch": 1.1495463857184665, "grad_norm": 0.00199718214571476, "learning_rate": 2.1261340357038338e-05, "loss": 0.0, "step": 3928 }, { "epoch": 1.1498390400936493, "grad_norm": 0.0035684986505657434, "learning_rate": 2.1254023997658766e-05, "loss": 0.0001, "step": 3929 }, { "epoch": 1.1501316944688322, "grad_norm": 0.0018766399007290602, "learning_rate": 2.1246707638279194e-05, "loss": 0.0, "step": 3930 }, { "epoch": 1.1504243488440151, "grad_norm": 0.001120618311688304, "learning_rate": 2.123939127889962e-05, "loss": 0.0, "step": 3931 }, { "epoch": 1.150717003219198, "grad_norm": 0.0007104809628799558, "learning_rate": 2.1232074919520047e-05, "loss": 0.0, "step": 3932 }, { "epoch": 1.1510096575943811, "grad_norm": 0.0005078423419035971, "learning_rate": 2.1224758560140475e-05, "loss": 0.0, "step": 3933 }, { "epoch": 1.151302311969564, "grad_norm": 0.0006699707009829581, "learning_rate": 2.1217442200760903e-05, "loss": 0.0, "step": 3934 }, { "epoch": 1.151594966344747, "grad_norm": 0.0008183489553630352, "learning_rate": 2.1210125841381327e-05, "loss": 0.0, "step": 3935 }, { "epoch": 1.1518876207199298, "grad_norm": 0.0010990456212311983, "learning_rate": 2.1202809482001755e-05, "loss": 0.0, "step": 3936 }, { "epoch": 1.1521802750951127, "grad_norm": 0.0018640455091372132, "learning_rate": 2.1195493122622183e-05, "loss": 0.0, "step": 3937 }, { "epoch": 1.1524729294702956, "grad_norm": 0.002578464802354574, "learning_rate": 2.118817676324261e-05, "loss": 0.0, "step": 3938 }, { "epoch": 1.1527655838454784, "grad_norm": 0.0022473332937806845, "learning_rate": 2.118086040386304e-05, "loss": 0.0, "step": 3939 }, { "epoch": 1.1530582382206613, "grad_norm": 0.014399176463484764, "learning_rate": 2.1173544044483463e-05, "loss": 0.0002, "step": 3940 }, { "epoch": 1.1533508925958442, "grad_norm": 0.0022750303614884615, "learning_rate": 2.116622768510389e-05, "loss": 0.0, "step": 3941 }, { "epoch": 1.1536435469710273, "grad_norm": 0.0017155127134174109, "learning_rate": 2.115891132572432e-05, "loss": 0.0, "step": 3942 }, { "epoch": 1.1539362013462102, "grad_norm": 0.0037041513714939356, "learning_rate": 2.1151594966344747e-05, "loss": 0.0, "step": 3943 }, { "epoch": 1.154228855721393, "grad_norm": 0.0008389271097257733, "learning_rate": 2.1144278606965175e-05, "loss": 0.0, "step": 3944 }, { "epoch": 1.154521510096576, "grad_norm": 0.0014524850994348526, "learning_rate": 2.11369622475856e-05, "loss": 0.0, "step": 3945 }, { "epoch": 1.1548141644717589, "grad_norm": 0.059136826545000076, "learning_rate": 2.1129645888206028e-05, "loss": 0.0003, "step": 3946 }, { "epoch": 1.1551068188469418, "grad_norm": 0.0016152020543813705, "learning_rate": 2.1122329528826456e-05, "loss": 0.0, "step": 3947 }, { "epoch": 1.1553994732221247, "grad_norm": 0.0002280518237967044, "learning_rate": 2.1115013169446884e-05, "loss": 0.0, "step": 3948 }, { "epoch": 1.1556921275973076, "grad_norm": 0.0015034673269838095, "learning_rate": 2.110769681006731e-05, "loss": 0.0, "step": 3949 }, { "epoch": 1.1559847819724904, "grad_norm": 0.003323787124827504, "learning_rate": 2.110038045068774e-05, "loss": 0.0001, "step": 3950 }, { "epoch": 1.1562774363476733, "grad_norm": 0.0010264019947499037, "learning_rate": 2.1093064091308164e-05, "loss": 0.0, "step": 3951 }, { "epoch": 1.1565700907228562, "grad_norm": 0.0008539935224689543, "learning_rate": 2.1085747731928592e-05, "loss": 0.0, "step": 3952 }, { "epoch": 1.156862745098039, "grad_norm": 0.0005931356572546065, "learning_rate": 2.107843137254902e-05, "loss": 0.0, "step": 3953 }, { "epoch": 1.1571553994732222, "grad_norm": 0.001304261269979179, "learning_rate": 2.1071115013169448e-05, "loss": 0.0, "step": 3954 }, { "epoch": 1.157448053848405, "grad_norm": 0.0005458810483105481, "learning_rate": 2.1063798653789876e-05, "loss": 0.0, "step": 3955 }, { "epoch": 1.157740708223588, "grad_norm": 0.0012579706963151693, "learning_rate": 2.10564822944103e-05, "loss": 0.0, "step": 3956 }, { "epoch": 1.1580333625987709, "grad_norm": 0.0019264259608462453, "learning_rate": 2.104916593503073e-05, "loss": 0.0, "step": 3957 }, { "epoch": 1.1583260169739538, "grad_norm": 0.0005996621330268681, "learning_rate": 2.1041849575651156e-05, "loss": 0.0, "step": 3958 }, { "epoch": 1.1586186713491367, "grad_norm": 0.000745131925214082, "learning_rate": 2.1034533216271584e-05, "loss": 0.0, "step": 3959 }, { "epoch": 1.1589113257243195, "grad_norm": 0.05781120806932449, "learning_rate": 2.1027216856892012e-05, "loss": 0.0003, "step": 3960 }, { "epoch": 1.1592039800995024, "grad_norm": 0.002012160373851657, "learning_rate": 2.1019900497512437e-05, "loss": 0.0, "step": 3961 }, { "epoch": 1.1594966344746853, "grad_norm": 0.007525267545133829, "learning_rate": 2.1012584138132865e-05, "loss": 0.0001, "step": 3962 }, { "epoch": 1.1597892888498684, "grad_norm": 0.0028305898886173964, "learning_rate": 2.1005267778753293e-05, "loss": 0.0, "step": 3963 }, { "epoch": 1.1600819432250513, "grad_norm": 0.004379513207823038, "learning_rate": 2.099795141937372e-05, "loss": 0.0, "step": 3964 }, { "epoch": 1.1603745976002342, "grad_norm": 0.0022841941099613905, "learning_rate": 2.099063505999415e-05, "loss": 0.0, "step": 3965 }, { "epoch": 1.160667251975417, "grad_norm": 0.00092789635527879, "learning_rate": 2.0983318700614577e-05, "loss": 0.0, "step": 3966 }, { "epoch": 1.1609599063506, "grad_norm": 0.0009375543450005352, "learning_rate": 2.0976002341235e-05, "loss": 0.0, "step": 3967 }, { "epoch": 1.1612525607257829, "grad_norm": 0.013091144151985645, "learning_rate": 2.096868598185543e-05, "loss": 0.0001, "step": 3968 }, { "epoch": 1.1615452151009658, "grad_norm": 0.00041092262836173177, "learning_rate": 2.0961369622475857e-05, "loss": 0.0, "step": 3969 }, { "epoch": 1.1618378694761486, "grad_norm": 0.00044362861081026495, "learning_rate": 2.0954053263096285e-05, "loss": 0.0, "step": 3970 }, { "epoch": 1.1621305238513315, "grad_norm": 0.0012614787556231022, "learning_rate": 2.0946736903716713e-05, "loss": 0.0, "step": 3971 }, { "epoch": 1.1624231782265144, "grad_norm": 0.0003866611805278808, "learning_rate": 2.0939420544337138e-05, "loss": 0.0, "step": 3972 }, { "epoch": 1.1627158326016973, "grad_norm": 0.0011134531814604998, "learning_rate": 2.0932104184957565e-05, "loss": 0.0, "step": 3973 }, { "epoch": 1.1630084869768802, "grad_norm": 0.0004822705523110926, "learning_rate": 2.0924787825577993e-05, "loss": 0.0, "step": 3974 }, { "epoch": 1.1633011413520633, "grad_norm": 0.00044631559285335243, "learning_rate": 2.091747146619842e-05, "loss": 0.0, "step": 3975 }, { "epoch": 1.1635937957272462, "grad_norm": 0.0002210295933764428, "learning_rate": 2.091015510681885e-05, "loss": 0.0, "step": 3976 }, { "epoch": 1.163886450102429, "grad_norm": 0.0007133566541597247, "learning_rate": 2.0902838747439274e-05, "loss": 0.0, "step": 3977 }, { "epoch": 1.164179104477612, "grad_norm": 0.003904839511960745, "learning_rate": 2.0895522388059702e-05, "loss": 0.0, "step": 3978 }, { "epoch": 1.1644717588527949, "grad_norm": 0.00019842377514578402, "learning_rate": 2.088820602868013e-05, "loss": 0.0, "step": 3979 }, { "epoch": 1.1647644132279777, "grad_norm": 0.0004171247419435531, "learning_rate": 2.0880889669300558e-05, "loss": 0.0, "step": 3980 }, { "epoch": 1.1650570676031606, "grad_norm": 0.0003570810076780617, "learning_rate": 2.0873573309920986e-05, "loss": 0.0, "step": 3981 }, { "epoch": 1.1653497219783435, "grad_norm": 0.00041155374492518604, "learning_rate": 2.0866256950541414e-05, "loss": 0.0, "step": 3982 }, { "epoch": 1.1656423763535264, "grad_norm": 0.00036599207669496536, "learning_rate": 2.0858940591161838e-05, "loss": 0.0, "step": 3983 }, { "epoch": 1.1659350307287093, "grad_norm": 0.0005415352643467486, "learning_rate": 2.0851624231782266e-05, "loss": 0.0, "step": 3984 }, { "epoch": 1.1662276851038924, "grad_norm": 0.00030228469404391944, "learning_rate": 2.0844307872402694e-05, "loss": 0.0, "step": 3985 }, { "epoch": 1.1665203394790753, "grad_norm": 0.0003409007331356406, "learning_rate": 2.0836991513023122e-05, "loss": 0.0, "step": 3986 }, { "epoch": 1.1668129938542582, "grad_norm": 0.000449575629318133, "learning_rate": 2.082967515364355e-05, "loss": 0.0, "step": 3987 }, { "epoch": 1.167105648229441, "grad_norm": 0.00036963477032259107, "learning_rate": 2.0822358794263975e-05, "loss": 0.0, "step": 3988 }, { "epoch": 1.167398302604624, "grad_norm": 0.0006797259557060897, "learning_rate": 2.0815042434884403e-05, "loss": 0.0, "step": 3989 }, { "epoch": 1.1676909569798068, "grad_norm": 0.0004133189213462174, "learning_rate": 2.080772607550483e-05, "loss": 0.0, "step": 3990 }, { "epoch": 1.1679836113549897, "grad_norm": 0.0008724583312869072, "learning_rate": 2.080040971612526e-05, "loss": 0.0, "step": 3991 }, { "epoch": 1.1682762657301726, "grad_norm": 0.00334708159789443, "learning_rate": 2.0793093356745686e-05, "loss": 0.0, "step": 3992 }, { "epoch": 1.1685689201053555, "grad_norm": 0.00032076481147669256, "learning_rate": 2.078577699736611e-05, "loss": 0.0, "step": 3993 }, { "epoch": 1.1688615744805384, "grad_norm": 0.00036713696317747235, "learning_rate": 2.077846063798654e-05, "loss": 0.0, "step": 3994 }, { "epoch": 1.1691542288557213, "grad_norm": 0.0003647505945991725, "learning_rate": 2.0771144278606967e-05, "loss": 0.0, "step": 3995 }, { "epoch": 1.1694468832309042, "grad_norm": 0.00010544279939495027, "learning_rate": 2.0763827919227395e-05, "loss": 0.0, "step": 3996 }, { "epoch": 1.1697395376060873, "grad_norm": 0.0005103484727442265, "learning_rate": 2.0756511559847823e-05, "loss": 0.0, "step": 3997 }, { "epoch": 1.1700321919812702, "grad_norm": 0.0012317753862589598, "learning_rate": 2.0749195200468247e-05, "loss": 0.0, "step": 3998 }, { "epoch": 1.170324846356453, "grad_norm": 0.006900831591337919, "learning_rate": 2.0741878841088675e-05, "loss": 0.0, "step": 3999 }, { "epoch": 1.170617500731636, "grad_norm": 0.0020533285569399595, "learning_rate": 2.0734562481709103e-05, "loss": 0.0, "step": 4000 }, { "epoch": 1.1709101551068188, "grad_norm": 0.0004798827867489308, "learning_rate": 2.072724612232953e-05, "loss": 0.0, "step": 4001 }, { "epoch": 1.1712028094820017, "grad_norm": 0.00036796287167817354, "learning_rate": 2.071992976294996e-05, "loss": 0.0, "step": 4002 }, { "epoch": 1.1714954638571846, "grad_norm": 0.020118188112974167, "learning_rate": 2.0712613403570387e-05, "loss": 0.0001, "step": 4003 }, { "epoch": 1.1717881182323675, "grad_norm": 0.0013013577554374933, "learning_rate": 2.070529704419081e-05, "loss": 0.0, "step": 4004 }, { "epoch": 1.1720807726075504, "grad_norm": 0.00038200916606001556, "learning_rate": 2.069798068481124e-05, "loss": 0.0, "step": 4005 }, { "epoch": 1.1723734269827335, "grad_norm": 0.0011500732507556677, "learning_rate": 2.0690664325431668e-05, "loss": 0.0, "step": 4006 }, { "epoch": 1.1726660813579164, "grad_norm": 0.00043047755025327206, "learning_rate": 2.0683347966052095e-05, "loss": 0.0, "step": 4007 }, { "epoch": 1.1729587357330993, "grad_norm": 0.0002854850608855486, "learning_rate": 2.0676031606672523e-05, "loss": 0.0, "step": 4008 }, { "epoch": 1.1732513901082822, "grad_norm": 0.0002904585562646389, "learning_rate": 2.0668715247292948e-05, "loss": 0.0, "step": 4009 }, { "epoch": 1.173544044483465, "grad_norm": 0.0018572751432657242, "learning_rate": 2.0661398887913376e-05, "loss": 0.0, "step": 4010 }, { "epoch": 1.173836698858648, "grad_norm": 0.00020035271882079542, "learning_rate": 2.0654082528533804e-05, "loss": 0.0, "step": 4011 }, { "epoch": 1.1741293532338308, "grad_norm": 0.0010458094766363502, "learning_rate": 2.0646766169154232e-05, "loss": 0.0, "step": 4012 }, { "epoch": 1.1744220076090137, "grad_norm": 0.000360166362952441, "learning_rate": 2.0639449809774656e-05, "loss": 0.0, "step": 4013 }, { "epoch": 1.1747146619841966, "grad_norm": 0.0004725328180938959, "learning_rate": 2.0632133450395084e-05, "loss": 0.0, "step": 4014 }, { "epoch": 1.1750073163593795, "grad_norm": 0.00040227134013548493, "learning_rate": 2.0624817091015512e-05, "loss": 0.0, "step": 4015 }, { "epoch": 1.1752999707345624, "grad_norm": 0.00047024450032040477, "learning_rate": 2.061750073163594e-05, "loss": 0.0, "step": 4016 }, { "epoch": 1.1755926251097453, "grad_norm": 0.0002049401227850467, "learning_rate": 2.0610184372256365e-05, "loss": 0.0, "step": 4017 }, { "epoch": 1.1758852794849284, "grad_norm": 0.00016430718824267387, "learning_rate": 2.0602868012876793e-05, "loss": 0.0, "step": 4018 }, { "epoch": 1.1761779338601113, "grad_norm": 0.00045258854515850544, "learning_rate": 2.059555165349722e-05, "loss": 0.0, "step": 4019 }, { "epoch": 1.1764705882352942, "grad_norm": 0.0007949948194436729, "learning_rate": 2.058823529411765e-05, "loss": 0.0, "step": 4020 }, { "epoch": 1.176763242610477, "grad_norm": 0.0004329327493906021, "learning_rate": 2.0580918934738073e-05, "loss": 0.0, "step": 4021 }, { "epoch": 1.17705589698566, "grad_norm": 0.000122263198136352, "learning_rate": 2.05736025753585e-05, "loss": 0.0, "step": 4022 }, { "epoch": 1.1773485513608428, "grad_norm": 0.00021170203399378806, "learning_rate": 2.056628621597893e-05, "loss": 0.0, "step": 4023 }, { "epoch": 1.1776412057360257, "grad_norm": 0.0002878759987652302, "learning_rate": 2.0558969856599357e-05, "loss": 0.0, "step": 4024 }, { "epoch": 1.1779338601112086, "grad_norm": 0.0004215857479721308, "learning_rate": 2.055165349721978e-05, "loss": 0.0, "step": 4025 }, { "epoch": 1.1782265144863915, "grad_norm": 0.00035516414209268987, "learning_rate": 2.054433713784021e-05, "loss": 0.0, "step": 4026 }, { "epoch": 1.1785191688615746, "grad_norm": 0.00024092184321489185, "learning_rate": 2.0537020778460638e-05, "loss": 0.0, "step": 4027 }, { "epoch": 1.1788118232367575, "grad_norm": 0.0005252889823168516, "learning_rate": 2.0529704419081065e-05, "loss": 0.0, "step": 4028 }, { "epoch": 1.1791044776119404, "grad_norm": 0.000901224440895021, "learning_rate": 2.0522388059701493e-05, "loss": 0.0, "step": 4029 }, { "epoch": 1.1793971319871233, "grad_norm": 0.0007788223447278142, "learning_rate": 2.0515071700321918e-05, "loss": 0.0, "step": 4030 }, { "epoch": 1.1796897863623061, "grad_norm": 0.005932623986154795, "learning_rate": 2.0507755340942346e-05, "loss": 0.0, "step": 4031 }, { "epoch": 1.179982440737489, "grad_norm": 0.12450405210256577, "learning_rate": 2.0500438981562774e-05, "loss": 0.0008, "step": 4032 }, { "epoch": 1.180275095112672, "grad_norm": 0.0016493074363097548, "learning_rate": 2.0493122622183202e-05, "loss": 0.0, "step": 4033 }, { "epoch": 1.1805677494878548, "grad_norm": 0.00031817733542993665, "learning_rate": 2.048580626280363e-05, "loss": 0.0, "step": 4034 }, { "epoch": 1.1808604038630377, "grad_norm": 0.000735086970962584, "learning_rate": 2.0478489903424058e-05, "loss": 0.0, "step": 4035 }, { "epoch": 1.1811530582382206, "grad_norm": 0.00015424641605932266, "learning_rate": 2.0471173544044482e-05, "loss": 0.0, "step": 4036 }, { "epoch": 1.1814457126134035, "grad_norm": 0.00034844077890738845, "learning_rate": 2.046385718466491e-05, "loss": 0.0, "step": 4037 }, { "epoch": 1.1817383669885864, "grad_norm": 0.0006114047137089074, "learning_rate": 2.0456540825285338e-05, "loss": 0.0, "step": 4038 }, { "epoch": 1.1820310213637695, "grad_norm": 0.0004173670313321054, "learning_rate": 2.0449224465905766e-05, "loss": 0.0, "step": 4039 }, { "epoch": 1.1823236757389524, "grad_norm": 0.00021460339485201985, "learning_rate": 2.0441908106526194e-05, "loss": 0.0, "step": 4040 }, { "epoch": 1.1826163301141353, "grad_norm": 0.00044810911640524864, "learning_rate": 2.043459174714662e-05, "loss": 0.0, "step": 4041 }, { "epoch": 1.1829089844893181, "grad_norm": 0.000367204484064132, "learning_rate": 2.0427275387767047e-05, "loss": 0.0, "step": 4042 }, { "epoch": 1.183201638864501, "grad_norm": 0.0007425369694828987, "learning_rate": 2.0419959028387475e-05, "loss": 0.0, "step": 4043 }, { "epoch": 1.183494293239684, "grad_norm": 0.00026489770971238613, "learning_rate": 2.0412642669007903e-05, "loss": 0.0, "step": 4044 }, { "epoch": 1.1837869476148668, "grad_norm": 0.00024482072331011295, "learning_rate": 2.040532630962833e-05, "loss": 0.0, "step": 4045 }, { "epoch": 1.1840796019900497, "grad_norm": 0.00015763095871079713, "learning_rate": 2.0398009950248755e-05, "loss": 0.0, "step": 4046 }, { "epoch": 1.1843722563652326, "grad_norm": 0.0009888801723718643, "learning_rate": 2.0390693590869183e-05, "loss": 0.0, "step": 4047 }, { "epoch": 1.1846649107404157, "grad_norm": 0.0003547268861439079, "learning_rate": 2.038337723148961e-05, "loss": 0.0, "step": 4048 }, { "epoch": 1.1849575651155986, "grad_norm": 0.0004106337728444487, "learning_rate": 2.037606087211004e-05, "loss": 0.0, "step": 4049 }, { "epoch": 1.1852502194907815, "grad_norm": 0.0001579160598339513, "learning_rate": 2.0368744512730467e-05, "loss": 0.0, "step": 4050 }, { "epoch": 1.1855428738659644, "grad_norm": 0.00015295147022698075, "learning_rate": 2.0361428153350895e-05, "loss": 0.0, "step": 4051 }, { "epoch": 1.1858355282411472, "grad_norm": 0.0012576906010508537, "learning_rate": 2.035411179397132e-05, "loss": 0.0, "step": 4052 }, { "epoch": 1.1861281826163301, "grad_norm": 0.0005667202058248222, "learning_rate": 2.0346795434591747e-05, "loss": 0.0, "step": 4053 }, { "epoch": 1.186420836991513, "grad_norm": 0.00020606222096830606, "learning_rate": 2.0339479075212175e-05, "loss": 0.0, "step": 4054 }, { "epoch": 1.186713491366696, "grad_norm": 0.0003121673362329602, "learning_rate": 2.0332162715832603e-05, "loss": 0.0, "step": 4055 }, { "epoch": 1.1870061457418788, "grad_norm": 0.00022469591931439936, "learning_rate": 2.032484635645303e-05, "loss": 0.0, "step": 4056 }, { "epoch": 1.1872988001170617, "grad_norm": 0.0002625453344080597, "learning_rate": 2.0317529997073456e-05, "loss": 0.0, "step": 4057 }, { "epoch": 1.1875914544922446, "grad_norm": 0.00021747525897808373, "learning_rate": 2.0310213637693884e-05, "loss": 0.0, "step": 4058 }, { "epoch": 1.1878841088674275, "grad_norm": 0.00032584808650426567, "learning_rate": 2.030289727831431e-05, "loss": 0.0, "step": 4059 }, { "epoch": 1.1881767632426106, "grad_norm": 0.00024117686552926898, "learning_rate": 2.029558091893474e-05, "loss": 0.0, "step": 4060 }, { "epoch": 1.1884694176177935, "grad_norm": 0.00013844919158145785, "learning_rate": 2.0288264559555167e-05, "loss": 0.0, "step": 4061 }, { "epoch": 1.1887620719929763, "grad_norm": 0.00015658863412681967, "learning_rate": 2.0280948200175592e-05, "loss": 0.0, "step": 4062 }, { "epoch": 1.1890547263681592, "grad_norm": 0.00014539046969730407, "learning_rate": 2.027363184079602e-05, "loss": 0.0, "step": 4063 }, { "epoch": 1.1893473807433421, "grad_norm": 0.00018449235358275473, "learning_rate": 2.0266315481416448e-05, "loss": 0.0, "step": 4064 }, { "epoch": 1.189640035118525, "grad_norm": 0.00014005535922478884, "learning_rate": 2.0258999122036876e-05, "loss": 0.0, "step": 4065 }, { "epoch": 1.189932689493708, "grad_norm": 0.0002201669994974509, "learning_rate": 2.0251682762657304e-05, "loss": 0.0, "step": 4066 }, { "epoch": 1.1902253438688908, "grad_norm": 0.000424054975155741, "learning_rate": 2.024436640327773e-05, "loss": 0.0, "step": 4067 }, { "epoch": 1.1905179982440737, "grad_norm": 0.00021554843988269567, "learning_rate": 2.0237050043898156e-05, "loss": 0.0, "step": 4068 }, { "epoch": 1.1908106526192568, "grad_norm": 0.0001550370652694255, "learning_rate": 2.0229733684518584e-05, "loss": 0.0, "step": 4069 }, { "epoch": 1.1911033069944397, "grad_norm": 0.0001866599777713418, "learning_rate": 2.0222417325139012e-05, "loss": 0.0, "step": 4070 }, { "epoch": 1.1913959613696226, "grad_norm": 0.00020945954020135105, "learning_rate": 2.021510096575944e-05, "loss": 0.0, "step": 4071 }, { "epoch": 1.1916886157448054, "grad_norm": 0.0008195735281333327, "learning_rate": 2.0207784606379868e-05, "loss": 0.0, "step": 4072 }, { "epoch": 1.1919812701199883, "grad_norm": 0.00025099836057052016, "learning_rate": 2.0200468247000293e-05, "loss": 0.0, "step": 4073 }, { "epoch": 1.1922739244951712, "grad_norm": 0.0002605269255582243, "learning_rate": 2.019315188762072e-05, "loss": 0.0, "step": 4074 }, { "epoch": 1.192566578870354, "grad_norm": 0.0009943390032276511, "learning_rate": 2.018583552824115e-05, "loss": 0.0, "step": 4075 }, { "epoch": 1.192859233245537, "grad_norm": 0.0001504563115304336, "learning_rate": 2.0178519168861577e-05, "loss": 0.0, "step": 4076 }, { "epoch": 1.1931518876207199, "grad_norm": 0.001579423202201724, "learning_rate": 2.0171202809482005e-05, "loss": 0.0, "step": 4077 }, { "epoch": 1.1934445419959028, "grad_norm": 0.00023885307018645108, "learning_rate": 2.016388645010243e-05, "loss": 0.0, "step": 4078 }, { "epoch": 1.1937371963710857, "grad_norm": 0.0003890085208695382, "learning_rate": 2.0156570090722857e-05, "loss": 0.0, "step": 4079 }, { "epoch": 1.1940298507462686, "grad_norm": 0.00026691873790696263, "learning_rate": 2.0149253731343285e-05, "loss": 0.0, "step": 4080 }, { "epoch": 1.1943225051214517, "grad_norm": 0.00038226062315516174, "learning_rate": 2.0141937371963713e-05, "loss": 0.0, "step": 4081 }, { "epoch": 1.1946151594966345, "grad_norm": 0.00014924677088856697, "learning_rate": 2.013462101258414e-05, "loss": 0.0, "step": 4082 }, { "epoch": 1.1949078138718174, "grad_norm": 10.570392608642578, "learning_rate": 2.0127304653204565e-05, "loss": 0.0159, "step": 4083 }, { "epoch": 1.1952004682470003, "grad_norm": 0.0016134226461872458, "learning_rate": 2.0119988293824993e-05, "loss": 0.0, "step": 4084 }, { "epoch": 1.1954931226221832, "grad_norm": 0.00032581633422523737, "learning_rate": 2.011267193444542e-05, "loss": 0.0, "step": 4085 }, { "epoch": 1.195785776997366, "grad_norm": 0.00027731049340218306, "learning_rate": 2.010535557506585e-05, "loss": 0.0, "step": 4086 }, { "epoch": 1.196078431372549, "grad_norm": 0.00011565641761990264, "learning_rate": 2.0098039215686277e-05, "loss": 0.0, "step": 4087 }, { "epoch": 1.1963710857477319, "grad_norm": 9.082938194274902, "learning_rate": 2.0090722856306705e-05, "loss": 0.0128, "step": 4088 }, { "epoch": 1.1966637401229148, "grad_norm": 0.04108380898833275, "learning_rate": 2.008340649692713e-05, "loss": 0.0001, "step": 4089 }, { "epoch": 1.1969563944980977, "grad_norm": 0.0002807502169162035, "learning_rate": 2.0076090137547558e-05, "loss": 0.0, "step": 4090 }, { "epoch": 1.1972490488732808, "grad_norm": 0.00023227729252539575, "learning_rate": 2.0068773778167986e-05, "loss": 0.0, "step": 4091 }, { "epoch": 1.1975417032484637, "grad_norm": 0.00022215255012270063, "learning_rate": 2.0061457418788414e-05, "loss": 0.0, "step": 4092 }, { "epoch": 1.1978343576236465, "grad_norm": 0.0011499938555061817, "learning_rate": 2.005414105940884e-05, "loss": 0.0, "step": 4093 }, { "epoch": 1.1981270119988294, "grad_norm": 0.0001239770499523729, "learning_rate": 2.0046824700029266e-05, "loss": 0.0, "step": 4094 }, { "epoch": 1.1984196663740123, "grad_norm": 0.0002783769741654396, "learning_rate": 2.0039508340649694e-05, "loss": 0.0, "step": 4095 }, { "epoch": 1.1987123207491952, "grad_norm": 0.00011884696868946776, "learning_rate": 2.0032191981270122e-05, "loss": 0.0, "step": 4096 }, { "epoch": 1.199004975124378, "grad_norm": 0.00015855680976528674, "learning_rate": 2.002487562189055e-05, "loss": 0.0, "step": 4097 }, { "epoch": 1.199297629499561, "grad_norm": 0.0002209139202022925, "learning_rate": 2.0017559262510978e-05, "loss": 0.0, "step": 4098 }, { "epoch": 1.1995902838747439, "grad_norm": 0.0003696352359838784, "learning_rate": 2.0010242903131403e-05, "loss": 0.0, "step": 4099 }, { "epoch": 1.1998829382499268, "grad_norm": 0.006352592725306749, "learning_rate": 2.000292654375183e-05, "loss": 0.0, "step": 4100 }, { "epoch": 1.2001755926251096, "grad_norm": 0.00024181559274438769, "learning_rate": 1.999561018437226e-05, "loss": 0.0, "step": 4101 }, { "epoch": 1.2004682470002925, "grad_norm": 0.00061303615802899, "learning_rate": 1.9988293824992686e-05, "loss": 0.0, "step": 4102 }, { "epoch": 1.2007609013754756, "grad_norm": 0.0003650170692708343, "learning_rate": 1.9980977465613114e-05, "loss": 0.0, "step": 4103 }, { "epoch": 1.2010535557506585, "grad_norm": 0.0004829903191421181, "learning_rate": 1.997366110623354e-05, "loss": 0.0, "step": 4104 }, { "epoch": 1.2013462101258414, "grad_norm": 0.0010053931036964059, "learning_rate": 1.9966344746853967e-05, "loss": 0.0, "step": 4105 }, { "epoch": 1.2016388645010243, "grad_norm": 0.0004990804591216147, "learning_rate": 1.9959028387474395e-05, "loss": 0.0, "step": 4106 }, { "epoch": 1.2019315188762072, "grad_norm": 0.0003033496323041618, "learning_rate": 1.9951712028094823e-05, "loss": 0.0, "step": 4107 }, { "epoch": 1.20222417325139, "grad_norm": 0.0024242354556918144, "learning_rate": 1.9944395668715247e-05, "loss": 0.0, "step": 4108 }, { "epoch": 1.202516827626573, "grad_norm": 0.00014314088912215084, "learning_rate": 1.9937079309335675e-05, "loss": 0.0, "step": 4109 }, { "epoch": 1.2028094820017559, "grad_norm": 0.00014785472012590617, "learning_rate": 1.9929762949956103e-05, "loss": 0.0, "step": 4110 }, { "epoch": 1.2031021363769387, "grad_norm": 0.0008265096694231033, "learning_rate": 1.992244659057653e-05, "loss": 0.0, "step": 4111 }, { "epoch": 1.2033947907521219, "grad_norm": 0.00018385302973911166, "learning_rate": 1.9915130231196956e-05, "loss": 0.0, "step": 4112 }, { "epoch": 1.2036874451273047, "grad_norm": 0.00033309354330413043, "learning_rate": 1.9907813871817384e-05, "loss": 0.0, "step": 4113 }, { "epoch": 1.2039800995024876, "grad_norm": 0.00032117695081979036, "learning_rate": 1.990049751243781e-05, "loss": 0.0, "step": 4114 }, { "epoch": 1.2042727538776705, "grad_norm": 8.890005111694336, "learning_rate": 1.989318115305824e-05, "loss": 0.0187, "step": 4115 }, { "epoch": 1.2045654082528534, "grad_norm": 0.0002898283419199288, "learning_rate": 1.9885864793678664e-05, "loss": 0.0, "step": 4116 }, { "epoch": 1.2048580626280363, "grad_norm": 0.00043106195516884327, "learning_rate": 1.9878548434299092e-05, "loss": 0.0, "step": 4117 }, { "epoch": 1.2051507170032192, "grad_norm": 0.0008336814353242517, "learning_rate": 1.987123207491952e-05, "loss": 0.0, "step": 4118 }, { "epoch": 1.205443371378402, "grad_norm": 0.0005825763219036162, "learning_rate": 1.9863915715539948e-05, "loss": 0.0, "step": 4119 }, { "epoch": 1.205736025753585, "grad_norm": 0.0008535462548024952, "learning_rate": 1.9856599356160376e-05, "loss": 0.0, "step": 4120 }, { "epoch": 1.2060286801287678, "grad_norm": 0.000628861365839839, "learning_rate": 1.98492829967808e-05, "loss": 0.0, "step": 4121 }, { "epoch": 1.2063213345039507, "grad_norm": 0.0032100165262818336, "learning_rate": 1.984196663740123e-05, "loss": 0.0, "step": 4122 }, { "epoch": 1.2066139888791336, "grad_norm": 0.0007732409867458045, "learning_rate": 1.9834650278021656e-05, "loss": 0.0, "step": 4123 }, { "epoch": 1.2069066432543167, "grad_norm": 0.00039209527312777936, "learning_rate": 1.9827333918642084e-05, "loss": 0.0, "step": 4124 }, { "epoch": 1.2071992976294996, "grad_norm": 0.005325721111148596, "learning_rate": 1.9820017559262512e-05, "loss": 0.0, "step": 4125 }, { "epoch": 1.2074919520046825, "grad_norm": 0.00978061929345131, "learning_rate": 1.9812701199882937e-05, "loss": 0.0, "step": 4126 }, { "epoch": 1.2077846063798654, "grad_norm": 0.0028644222766160965, "learning_rate": 1.9805384840503365e-05, "loss": 0.0, "step": 4127 }, { "epoch": 1.2080772607550483, "grad_norm": 0.0008429356385022402, "learning_rate": 1.9798068481123793e-05, "loss": 0.0, "step": 4128 }, { "epoch": 1.2083699151302312, "grad_norm": 0.0014908623415976763, "learning_rate": 1.979075212174422e-05, "loss": 0.0, "step": 4129 }, { "epoch": 1.208662569505414, "grad_norm": 0.02699602022767067, "learning_rate": 1.978343576236465e-05, "loss": 0.0001, "step": 4130 }, { "epoch": 1.208955223880597, "grad_norm": 0.023643648251891136, "learning_rate": 1.9776119402985073e-05, "loss": 0.0001, "step": 4131 }, { "epoch": 1.2092478782557798, "grad_norm": 0.0020143059082329273, "learning_rate": 1.97688030436055e-05, "loss": 0.0, "step": 4132 }, { "epoch": 1.209540532630963, "grad_norm": 0.002474347362294793, "learning_rate": 1.976148668422593e-05, "loss": 0.0, "step": 4133 }, { "epoch": 1.2098331870061458, "grad_norm": 0.002414839807897806, "learning_rate": 1.9754170324846357e-05, "loss": 0.0, "step": 4134 }, { "epoch": 1.2101258413813287, "grad_norm": 0.0027450169436633587, "learning_rate": 1.9746853965466785e-05, "loss": 0.0, "step": 4135 }, { "epoch": 1.2104184957565116, "grad_norm": 0.0032009691931307316, "learning_rate": 1.973953760608721e-05, "loss": 0.0, "step": 4136 }, { "epoch": 1.2107111501316945, "grad_norm": 0.023865224793553352, "learning_rate": 1.9732221246707638e-05, "loss": 0.0001, "step": 4137 }, { "epoch": 1.2110038045068774, "grad_norm": 13.271195411682129, "learning_rate": 1.9724904887328065e-05, "loss": 0.0412, "step": 4138 }, { "epoch": 1.2112964588820603, "grad_norm": 0.0007114256150089204, "learning_rate": 1.9717588527948493e-05, "loss": 0.0, "step": 4139 }, { "epoch": 1.2115891132572432, "grad_norm": 0.010572691448032856, "learning_rate": 1.971027216856892e-05, "loss": 0.0, "step": 4140 }, { "epoch": 1.211881767632426, "grad_norm": 0.00038259930443018675, "learning_rate": 1.970295580918935e-05, "loss": 0.0, "step": 4141 }, { "epoch": 1.212174422007609, "grad_norm": 0.0001456064055673778, "learning_rate": 1.9695639449809774e-05, "loss": 0.0, "step": 4142 }, { "epoch": 1.2124670763827918, "grad_norm": 0.0004648877074941993, "learning_rate": 1.9688323090430202e-05, "loss": 0.0, "step": 4143 }, { "epoch": 1.2127597307579747, "grad_norm": 0.0002797534398268908, "learning_rate": 1.968100673105063e-05, "loss": 0.0, "step": 4144 }, { "epoch": 1.2130523851331578, "grad_norm": 0.001196891302242875, "learning_rate": 1.9673690371671058e-05, "loss": 0.0, "step": 4145 }, { "epoch": 1.2133450395083407, "grad_norm": 0.00014675207785330713, "learning_rate": 1.9666374012291486e-05, "loss": 0.0, "step": 4146 }, { "epoch": 1.2136376938835236, "grad_norm": 0.0002081404672935605, "learning_rate": 1.965905765291191e-05, "loss": 0.0, "step": 4147 }, { "epoch": 1.2139303482587065, "grad_norm": 0.0002005124551942572, "learning_rate": 1.9651741293532338e-05, "loss": 0.0, "step": 4148 }, { "epoch": 1.2142230026338894, "grad_norm": 0.0003484161861706525, "learning_rate": 1.9644424934152766e-05, "loss": 0.0, "step": 4149 }, { "epoch": 1.2145156570090723, "grad_norm": 0.0001250431960215792, "learning_rate": 1.9637108574773194e-05, "loss": 0.0, "step": 4150 }, { "epoch": 1.2148083113842552, "grad_norm": 0.00020586712344083935, "learning_rate": 1.9629792215393622e-05, "loss": 0.0, "step": 4151 }, { "epoch": 1.215100965759438, "grad_norm": 0.0001760764280334115, "learning_rate": 1.9622475856014047e-05, "loss": 0.0, "step": 4152 }, { "epoch": 1.215393620134621, "grad_norm": 0.0002378870121901855, "learning_rate": 1.9615159496634475e-05, "loss": 0.0, "step": 4153 }, { "epoch": 1.215686274509804, "grad_norm": 0.00010399192979093641, "learning_rate": 1.9607843137254903e-05, "loss": 0.0, "step": 4154 }, { "epoch": 1.215978928884987, "grad_norm": 0.0003964437055401504, "learning_rate": 1.960052677787533e-05, "loss": 0.0, "step": 4155 }, { "epoch": 1.2162715832601698, "grad_norm": 0.0001294235698878765, "learning_rate": 1.959321041849576e-05, "loss": 0.0, "step": 4156 }, { "epoch": 1.2165642376353527, "grad_norm": 0.00016335082182195038, "learning_rate": 1.9585894059116186e-05, "loss": 0.0, "step": 4157 }, { "epoch": 1.2168568920105356, "grad_norm": 0.0001436687307432294, "learning_rate": 1.957857769973661e-05, "loss": 0.0, "step": 4158 }, { "epoch": 1.2171495463857185, "grad_norm": 9.348212915938348e-05, "learning_rate": 1.957126134035704e-05, "loss": 0.0, "step": 4159 }, { "epoch": 1.2174422007609014, "grad_norm": 6.595941522391513e-05, "learning_rate": 1.9563944980977467e-05, "loss": 0.0, "step": 4160 }, { "epoch": 1.2177348551360843, "grad_norm": 6.128421955509111e-05, "learning_rate": 1.9556628621597895e-05, "loss": 0.0, "step": 4161 }, { "epoch": 1.2180275095112671, "grad_norm": 0.07660602033138275, "learning_rate": 1.9549312262218323e-05, "loss": 0.0001, "step": 4162 }, { "epoch": 1.21832016388645, "grad_norm": 0.0002029917377512902, "learning_rate": 1.9541995902838747e-05, "loss": 0.0, "step": 4163 }, { "epoch": 1.218612818261633, "grad_norm": 0.0019446692895144224, "learning_rate": 1.9534679543459175e-05, "loss": 0.0, "step": 4164 }, { "epoch": 1.2189054726368158, "grad_norm": 9.471770317759365e-05, "learning_rate": 1.9527363184079603e-05, "loss": 0.0, "step": 4165 }, { "epoch": 1.219198127011999, "grad_norm": 0.0022286963649094105, "learning_rate": 1.952004682470003e-05, "loss": 0.0, "step": 4166 }, { "epoch": 1.2194907813871818, "grad_norm": 0.00011821759107988328, "learning_rate": 1.951273046532046e-05, "loss": 0.0, "step": 4167 }, { "epoch": 1.2197834357623647, "grad_norm": 0.0004870724806096405, "learning_rate": 1.9505414105940884e-05, "loss": 0.0, "step": 4168 }, { "epoch": 1.2200760901375476, "grad_norm": 0.00013188557932153344, "learning_rate": 1.949809774656131e-05, "loss": 0.0, "step": 4169 }, { "epoch": 1.2203687445127305, "grad_norm": 0.0004395339055918157, "learning_rate": 1.949078138718174e-05, "loss": 0.0, "step": 4170 }, { "epoch": 1.2206613988879134, "grad_norm": 9.655337635194883e-05, "learning_rate": 1.9483465027802167e-05, "loss": 0.0, "step": 4171 }, { "epoch": 1.2209540532630963, "grad_norm": 0.00012214704474899918, "learning_rate": 1.9476148668422595e-05, "loss": 0.0, "step": 4172 }, { "epoch": 1.2212467076382791, "grad_norm": 0.0002902006381191313, "learning_rate": 1.9468832309043023e-05, "loss": 0.0, "step": 4173 }, { "epoch": 1.221539362013462, "grad_norm": 0.00023098342353478074, "learning_rate": 1.9461515949663448e-05, "loss": 0.0, "step": 4174 }, { "epoch": 1.221832016388645, "grad_norm": 4.296055316925049, "learning_rate": 1.9454199590283876e-05, "loss": 0.0029, "step": 4175 }, { "epoch": 1.222124670763828, "grad_norm": 0.00022539144265465438, "learning_rate": 1.9446883230904304e-05, "loss": 0.0, "step": 4176 }, { "epoch": 1.222417325139011, "grad_norm": 0.0034128634724766016, "learning_rate": 1.9439566871524732e-05, "loss": 0.0, "step": 4177 }, { "epoch": 1.2227099795141938, "grad_norm": 0.0001060848226188682, "learning_rate": 1.943225051214516e-05, "loss": 0.0, "step": 4178 }, { "epoch": 1.2230026338893767, "grad_norm": 0.0001370605459669605, "learning_rate": 1.9424934152765584e-05, "loss": 0.0, "step": 4179 }, { "epoch": 1.2232952882645596, "grad_norm": 0.0001054197273333557, "learning_rate": 1.9417617793386012e-05, "loss": 0.0, "step": 4180 }, { "epoch": 1.2235879426397425, "grad_norm": 0.00028701237170025706, "learning_rate": 1.941030143400644e-05, "loss": 0.0, "step": 4181 }, { "epoch": 1.2238805970149254, "grad_norm": 0.001215559197589755, "learning_rate": 1.9402985074626868e-05, "loss": 0.0, "step": 4182 }, { "epoch": 1.2241732513901082, "grad_norm": 0.00038847370888106525, "learning_rate": 1.9395668715247296e-05, "loss": 0.0, "step": 4183 }, { "epoch": 1.2244659057652911, "grad_norm": 0.00020916743960697204, "learning_rate": 1.938835235586772e-05, "loss": 0.0, "step": 4184 }, { "epoch": 1.224758560140474, "grad_norm": 0.00012006520410068333, "learning_rate": 1.938103599648815e-05, "loss": 0.0, "step": 4185 }, { "epoch": 1.225051214515657, "grad_norm": 0.00022883755445946008, "learning_rate": 1.9373719637108577e-05, "loss": 0.0, "step": 4186 }, { "epoch": 1.2253438688908398, "grad_norm": 0.0009294861229136586, "learning_rate": 1.9366403277729005e-05, "loss": 0.0, "step": 4187 }, { "epoch": 1.225636523266023, "grad_norm": 0.0002540037385188043, "learning_rate": 1.9359086918349432e-05, "loss": 0.0, "step": 4188 }, { "epoch": 1.2259291776412058, "grad_norm": 0.00010973669122904539, "learning_rate": 1.935177055896986e-05, "loss": 0.0, "step": 4189 }, { "epoch": 1.2262218320163887, "grad_norm": 0.00018544113845564425, "learning_rate": 1.9344454199590285e-05, "loss": 0.0, "step": 4190 }, { "epoch": 1.2265144863915716, "grad_norm": 0.0005819756188429892, "learning_rate": 1.9337137840210713e-05, "loss": 0.0, "step": 4191 }, { "epoch": 1.2268071407667545, "grad_norm": 6.846505857538432e-05, "learning_rate": 1.932982148083114e-05, "loss": 0.0, "step": 4192 }, { "epoch": 1.2270997951419373, "grad_norm": 0.0001275314571103081, "learning_rate": 1.932250512145157e-05, "loss": 0.0, "step": 4193 }, { "epoch": 1.2273924495171202, "grad_norm": 0.016942564398050308, "learning_rate": 1.9315188762071997e-05, "loss": 0.0001, "step": 4194 }, { "epoch": 1.2276851038923031, "grad_norm": 0.0003085004282183945, "learning_rate": 1.930787240269242e-05, "loss": 0.0, "step": 4195 }, { "epoch": 1.227977758267486, "grad_norm": 0.00022445937793236226, "learning_rate": 1.930055604331285e-05, "loss": 0.0, "step": 4196 }, { "epoch": 1.2282704126426691, "grad_norm": 0.0006460981676355004, "learning_rate": 1.9293239683933277e-05, "loss": 0.0, "step": 4197 }, { "epoch": 1.228563067017852, "grad_norm": 3.556636095046997, "learning_rate": 1.9285923324553705e-05, "loss": 0.1548, "step": 4198 }, { "epoch": 1.228855721393035, "grad_norm": 0.238409623503685, "learning_rate": 1.927860696517413e-05, "loss": 0.0009, "step": 4199 }, { "epoch": 1.2291483757682178, "grad_norm": 0.10258849710226059, "learning_rate": 1.9271290605794558e-05, "loss": 0.0003, "step": 4200 }, { "epoch": 1.2294410301434007, "grad_norm": 0.00675250543281436, "learning_rate": 1.9263974246414986e-05, "loss": 0.0, "step": 4201 }, { "epoch": 1.2297336845185836, "grad_norm": 0.0019149247091263533, "learning_rate": 1.9256657887035414e-05, "loss": 0.0, "step": 4202 }, { "epoch": 1.2300263388937664, "grad_norm": 0.008474493399262428, "learning_rate": 1.9249341527655838e-05, "loss": 0.0001, "step": 4203 }, { "epoch": 1.2303189932689493, "grad_norm": 0.000252736295806244, "learning_rate": 1.9242025168276266e-05, "loss": 0.0, "step": 4204 }, { "epoch": 1.2306116476441322, "grad_norm": 0.0008034301572479308, "learning_rate": 1.9234708808896694e-05, "loss": 0.0, "step": 4205 }, { "epoch": 1.230904302019315, "grad_norm": 0.001213454408571124, "learning_rate": 1.9227392449517122e-05, "loss": 0.0, "step": 4206 }, { "epoch": 1.231196956394498, "grad_norm": 0.004696414805948734, "learning_rate": 1.9220076090137547e-05, "loss": 0.0001, "step": 4207 }, { "epoch": 1.2314896107696809, "grad_norm": 0.0035157673992216587, "learning_rate": 1.9212759730757975e-05, "loss": 0.0, "step": 4208 }, { "epoch": 1.231782265144864, "grad_norm": 0.0005110283964313567, "learning_rate": 1.9205443371378402e-05, "loss": 0.0, "step": 4209 }, { "epoch": 1.2320749195200469, "grad_norm": 0.9316847324371338, "learning_rate": 1.919812701199883e-05, "loss": 0.0044, "step": 4210 }, { "epoch": 1.2323675738952298, "grad_norm": 0.2705109417438507, "learning_rate": 1.9190810652619255e-05, "loss": 0.0007, "step": 4211 }, { "epoch": 1.2326602282704127, "grad_norm": 0.03438115492463112, "learning_rate": 1.9183494293239683e-05, "loss": 0.0003, "step": 4212 }, { "epoch": 1.2329528826455955, "grad_norm": 0.02816217578947544, "learning_rate": 1.917617793386011e-05, "loss": 0.0003, "step": 4213 }, { "epoch": 1.2332455370207784, "grad_norm": 0.002822326961904764, "learning_rate": 1.916886157448054e-05, "loss": 0.0, "step": 4214 }, { "epoch": 1.2335381913959613, "grad_norm": 0.011674858629703522, "learning_rate": 1.9161545215100967e-05, "loss": 0.0001, "step": 4215 }, { "epoch": 1.2338308457711442, "grad_norm": 0.0035582296550273895, "learning_rate": 1.915422885572139e-05, "loss": 0.0, "step": 4216 }, { "epoch": 1.234123500146327, "grad_norm": 8.236109715653583e-05, "learning_rate": 1.914691249634182e-05, "loss": 0.0, "step": 4217 }, { "epoch": 1.2344161545215102, "grad_norm": 0.00041809817776083946, "learning_rate": 1.9139596136962247e-05, "loss": 0.0, "step": 4218 }, { "epoch": 1.234708808896693, "grad_norm": 0.0004134566697757691, "learning_rate": 1.9132279777582675e-05, "loss": 0.0, "step": 4219 }, { "epoch": 1.235001463271876, "grad_norm": 0.00023026966664474458, "learning_rate": 1.9124963418203103e-05, "loss": 0.0, "step": 4220 }, { "epoch": 1.2352941176470589, "grad_norm": 0.00014988753537181765, "learning_rate": 1.9117647058823528e-05, "loss": 0.0, "step": 4221 }, { "epoch": 1.2355867720222418, "grad_norm": 0.0008121030987240374, "learning_rate": 1.9110330699443956e-05, "loss": 0.0, "step": 4222 }, { "epoch": 1.2358794263974247, "grad_norm": 0.00311463326215744, "learning_rate": 1.9103014340064384e-05, "loss": 0.0, "step": 4223 }, { "epoch": 1.2361720807726075, "grad_norm": 0.0025886602234095335, "learning_rate": 1.909569798068481e-05, "loss": 0.0, "step": 4224 }, { "epoch": 1.2364647351477904, "grad_norm": 0.0012801195261999965, "learning_rate": 1.908838162130524e-05, "loss": 0.0, "step": 4225 }, { "epoch": 1.2367573895229733, "grad_norm": 7.799321610946208e-05, "learning_rate": 1.9081065261925667e-05, "loss": 0.0, "step": 4226 }, { "epoch": 1.2370500438981562, "grad_norm": 0.00016507963300682604, "learning_rate": 1.9073748902546092e-05, "loss": 0.0, "step": 4227 }, { "epoch": 1.237342698273339, "grad_norm": 0.0007038047770038247, "learning_rate": 1.906643254316652e-05, "loss": 0.0, "step": 4228 }, { "epoch": 1.237635352648522, "grad_norm": 0.00018707582785282284, "learning_rate": 1.9059116183786948e-05, "loss": 0.0, "step": 4229 }, { "epoch": 1.237928007023705, "grad_norm": 9.05595370568335e-05, "learning_rate": 1.9051799824407376e-05, "loss": 0.0, "step": 4230 }, { "epoch": 1.238220661398888, "grad_norm": 3.0786476135253906, "learning_rate": 1.9044483465027804e-05, "loss": 0.2078, "step": 4231 }, { "epoch": 1.2385133157740709, "grad_norm": 0.0010520414216443896, "learning_rate": 1.903716710564823e-05, "loss": 0.0, "step": 4232 }, { "epoch": 1.2388059701492538, "grad_norm": 0.00023917222279123962, "learning_rate": 1.9029850746268656e-05, "loss": 0.0, "step": 4233 }, { "epoch": 1.2390986245244366, "grad_norm": 0.0007023094221949577, "learning_rate": 1.9022534386889084e-05, "loss": 0.0, "step": 4234 }, { "epoch": 1.2393912788996195, "grad_norm": 0.0011123003205284476, "learning_rate": 1.9015218027509512e-05, "loss": 0.0, "step": 4235 }, { "epoch": 1.2396839332748024, "grad_norm": 0.00146926857996732, "learning_rate": 1.900790166812994e-05, "loss": 0.0, "step": 4236 }, { "epoch": 1.2399765876499853, "grad_norm": 0.0022972996812313795, "learning_rate": 1.9000585308750365e-05, "loss": 0.0, "step": 4237 }, { "epoch": 1.2402692420251682, "grad_norm": 0.040048111230134964, "learning_rate": 1.8993268949370793e-05, "loss": 0.0004, "step": 4238 }, { "epoch": 1.2405618964003513, "grad_norm": 0.019376909360289574, "learning_rate": 1.898595258999122e-05, "loss": 0.0002, "step": 4239 }, { "epoch": 1.2408545507755342, "grad_norm": 0.01507840771228075, "learning_rate": 1.897863623061165e-05, "loss": 0.0001, "step": 4240 }, { "epoch": 1.241147205150717, "grad_norm": 0.04815857484936714, "learning_rate": 1.8971319871232077e-05, "loss": 0.0005, "step": 4241 }, { "epoch": 1.2414398595259, "grad_norm": 0.12103767693042755, "learning_rate": 1.8964003511852505e-05, "loss": 0.0018, "step": 4242 }, { "epoch": 1.2417325139010829, "grad_norm": 0.0030685067176818848, "learning_rate": 1.895668715247293e-05, "loss": 0.0, "step": 4243 }, { "epoch": 1.2420251682762657, "grad_norm": 0.2151612788438797, "learning_rate": 1.8949370793093357e-05, "loss": 0.0029, "step": 4244 }, { "epoch": 1.2423178226514486, "grad_norm": 0.010484547354280949, "learning_rate": 1.8942054433713785e-05, "loss": 0.0001, "step": 4245 }, { "epoch": 1.2426104770266315, "grad_norm": 0.004427577834576368, "learning_rate": 1.8934738074334213e-05, "loss": 0.0001, "step": 4246 }, { "epoch": 1.2429031314018144, "grad_norm": 0.003969315905123949, "learning_rate": 1.892742171495464e-05, "loss": 0.0, "step": 4247 }, { "epoch": 1.2431957857769973, "grad_norm": 0.018904047086834908, "learning_rate": 1.8920105355575065e-05, "loss": 0.0002, "step": 4248 }, { "epoch": 1.2434884401521802, "grad_norm": 0.017943337559700012, "learning_rate": 1.8912788996195493e-05, "loss": 0.0002, "step": 4249 }, { "epoch": 1.243781094527363, "grad_norm": 0.004366525448858738, "learning_rate": 1.890547263681592e-05, "loss": 0.0, "step": 4250 }, { "epoch": 1.2440737489025462, "grad_norm": 0.19857020676136017, "learning_rate": 1.889815627743635e-05, "loss": 0.0011, "step": 4251 }, { "epoch": 1.244366403277729, "grad_norm": 0.0003746624570339918, "learning_rate": 1.8890839918056777e-05, "loss": 0.0, "step": 4252 }, { "epoch": 1.244659057652912, "grad_norm": 0.012967000715434551, "learning_rate": 1.8883523558677202e-05, "loss": 0.0001, "step": 4253 }, { "epoch": 1.2449517120280948, "grad_norm": 0.008523451164364815, "learning_rate": 1.887620719929763e-05, "loss": 0.0001, "step": 4254 }, { "epoch": 1.2452443664032777, "grad_norm": 0.0003721773100551218, "learning_rate": 1.8868890839918058e-05, "loss": 0.0, "step": 4255 }, { "epoch": 1.2455370207784606, "grad_norm": 0.0015894857933744788, "learning_rate": 1.8861574480538486e-05, "loss": 0.0, "step": 4256 }, { "epoch": 1.2458296751536435, "grad_norm": 0.0010224776342511177, "learning_rate": 1.8854258121158914e-05, "loss": 0.0, "step": 4257 }, { "epoch": 1.2461223295288264, "grad_norm": 0.014933550730347633, "learning_rate": 1.8846941761779338e-05, "loss": 0.0002, "step": 4258 }, { "epoch": 1.2464149839040093, "grad_norm": 0.010465152561664581, "learning_rate": 1.8839625402399766e-05, "loss": 0.0001, "step": 4259 }, { "epoch": 1.2467076382791924, "grad_norm": 0.00037264273851178586, "learning_rate": 1.8832309043020194e-05, "loss": 0.0, "step": 4260 }, { "epoch": 1.2470002926543753, "grad_norm": 0.00139771425165236, "learning_rate": 1.8824992683640622e-05, "loss": 0.0, "step": 4261 }, { "epoch": 1.2472929470295582, "grad_norm": 0.0018496651900932193, "learning_rate": 1.881767632426105e-05, "loss": 0.0, "step": 4262 }, { "epoch": 1.247585601404741, "grad_norm": 0.0002926725137513131, "learning_rate": 1.8810359964881478e-05, "loss": 0.0, "step": 4263 }, { "epoch": 1.247878255779924, "grad_norm": 0.0026824823580682278, "learning_rate": 1.8803043605501902e-05, "loss": 0.0, "step": 4264 }, { "epoch": 1.2481709101551068, "grad_norm": 0.00018589686078485101, "learning_rate": 1.879572724612233e-05, "loss": 0.0, "step": 4265 }, { "epoch": 1.2484635645302897, "grad_norm": 0.00043787300819531083, "learning_rate": 1.878841088674276e-05, "loss": 0.0, "step": 4266 }, { "epoch": 1.2487562189054726, "grad_norm": 0.00044568435987457633, "learning_rate": 1.8781094527363186e-05, "loss": 0.0, "step": 4267 }, { "epoch": 1.2490488732806555, "grad_norm": 0.0006209853454492986, "learning_rate": 1.8773778167983614e-05, "loss": 0.0, "step": 4268 }, { "epoch": 1.2493415276558384, "grad_norm": 0.006390134803950787, "learning_rate": 1.876646180860404e-05, "loss": 0.0, "step": 4269 }, { "epoch": 1.2496341820310213, "grad_norm": 0.00033496366813778877, "learning_rate": 1.8759145449224467e-05, "loss": 0.0, "step": 4270 }, { "epoch": 1.2499268364062042, "grad_norm": 0.005502917338162661, "learning_rate": 1.8751829089844895e-05, "loss": 0.0, "step": 4271 }, { "epoch": 1.250219490781387, "grad_norm": 0.00027335871709510684, "learning_rate": 1.8744512730465323e-05, "loss": 0.0, "step": 4272 }, { "epoch": 1.2505121451565702, "grad_norm": 0.00015072182577569038, "learning_rate": 1.873719637108575e-05, "loss": 0.0, "step": 4273 }, { "epoch": 1.250804799531753, "grad_norm": 0.0002814159670379013, "learning_rate": 1.8729880011706175e-05, "loss": 0.0, "step": 4274 }, { "epoch": 1.251097453906936, "grad_norm": 0.0004379993479233235, "learning_rate": 1.8722563652326603e-05, "loss": 0.0, "step": 4275 }, { "epoch": 1.2513901082821188, "grad_norm": 0.003155686892569065, "learning_rate": 1.871524729294703e-05, "loss": 0.0, "step": 4276 }, { "epoch": 1.2516827626573017, "grad_norm": 0.00023797780158929527, "learning_rate": 1.870793093356746e-05, "loss": 0.0, "step": 4277 }, { "epoch": 1.2519754170324846, "grad_norm": 0.040031664073467255, "learning_rate": 1.8700614574187887e-05, "loss": 0.0001, "step": 4278 }, { "epoch": 1.2522680714076675, "grad_norm": 0.0011026357533410192, "learning_rate": 1.8693298214808315e-05, "loss": 0.0, "step": 4279 }, { "epoch": 1.2525607257828504, "grad_norm": 0.006923212204128504, "learning_rate": 1.868598185542874e-05, "loss": 0.0001, "step": 4280 }, { "epoch": 1.2528533801580335, "grad_norm": 0.00014879163063596934, "learning_rate": 1.8678665496049167e-05, "loss": 0.0, "step": 4281 }, { "epoch": 1.2531460345332164, "grad_norm": 0.0005386594566516578, "learning_rate": 1.8671349136669595e-05, "loss": 0.0, "step": 4282 }, { "epoch": 1.2534386889083993, "grad_norm": 0.011392862536013126, "learning_rate": 1.8664032777290023e-05, "loss": 0.0001, "step": 4283 }, { "epoch": 1.2537313432835822, "grad_norm": 0.00036759424256160855, "learning_rate": 1.865671641791045e-05, "loss": 0.0, "step": 4284 }, { "epoch": 1.254023997658765, "grad_norm": 0.06291080266237259, "learning_rate": 1.8649400058530876e-05, "loss": 0.0008, "step": 4285 }, { "epoch": 1.254316652033948, "grad_norm": 0.00039730401476845145, "learning_rate": 1.8642083699151304e-05, "loss": 0.0, "step": 4286 }, { "epoch": 1.2546093064091308, "grad_norm": 0.00015699613140895963, "learning_rate": 1.8634767339771732e-05, "loss": 0.0, "step": 4287 }, { "epoch": 1.2549019607843137, "grad_norm": 39.24689865112305, "learning_rate": 1.862745098039216e-05, "loss": 0.0722, "step": 4288 }, { "epoch": 1.2551946151594966, "grad_norm": 0.0009274079347960651, "learning_rate": 1.8620134621012584e-05, "loss": 0.0, "step": 4289 }, { "epoch": 1.2554872695346795, "grad_norm": 0.00017085866420529783, "learning_rate": 1.8612818261633012e-05, "loss": 0.0, "step": 4290 }, { "epoch": 1.2557799239098624, "grad_norm": 0.00016269812476821244, "learning_rate": 1.860550190225344e-05, "loss": 0.0, "step": 4291 }, { "epoch": 1.2560725782850453, "grad_norm": 0.002036924008280039, "learning_rate": 1.8598185542873868e-05, "loss": 0.0, "step": 4292 }, { "epoch": 1.2563652326602281, "grad_norm": 0.003428952069953084, "learning_rate": 1.8590869183494293e-05, "loss": 0.0, "step": 4293 }, { "epoch": 1.2566578870354113, "grad_norm": 0.04832572862505913, "learning_rate": 1.858355282411472e-05, "loss": 0.0006, "step": 4294 }, { "epoch": 1.2569505414105941, "grad_norm": 0.04637840390205383, "learning_rate": 1.857623646473515e-05, "loss": 0.0006, "step": 4295 }, { "epoch": 1.257243195785777, "grad_norm": 0.00366193731315434, "learning_rate": 1.8568920105355577e-05, "loss": 0.0, "step": 4296 }, { "epoch": 1.25753585016096, "grad_norm": 0.00028218806255608797, "learning_rate": 1.8561603745976e-05, "loss": 0.0, "step": 4297 }, { "epoch": 1.2578285045361428, "grad_norm": 0.00024327139544766396, "learning_rate": 1.855428738659643e-05, "loss": 0.0, "step": 4298 }, { "epoch": 1.2581211589113257, "grad_norm": 0.0015721708768978715, "learning_rate": 1.8546971027216857e-05, "loss": 0.0, "step": 4299 }, { "epoch": 1.2584138132865086, "grad_norm": 0.0009468572679907084, "learning_rate": 1.8539654667837285e-05, "loss": 0.0, "step": 4300 }, { "epoch": 1.2587064676616915, "grad_norm": 0.31393736600875854, "learning_rate": 1.853233830845771e-05, "loss": 0.0007, "step": 4301 }, { "epoch": 1.2589991220368746, "grad_norm": 0.0014417548663914204, "learning_rate": 1.8525021949078137e-05, "loss": 0.0, "step": 4302 }, { "epoch": 1.2592917764120575, "grad_norm": 0.0002797488123178482, "learning_rate": 1.8517705589698565e-05, "loss": 0.0, "step": 4303 }, { "epoch": 1.2595844307872404, "grad_norm": 0.00036022544372826815, "learning_rate": 1.8510389230318993e-05, "loss": 0.0, "step": 4304 }, { "epoch": 1.2598770851624232, "grad_norm": 0.004282816778868437, "learning_rate": 1.850307287093942e-05, "loss": 0.0, "step": 4305 }, { "epoch": 1.2601697395376061, "grad_norm": 0.011342636309564114, "learning_rate": 1.8495756511559846e-05, "loss": 0.0001, "step": 4306 }, { "epoch": 1.260462393912789, "grad_norm": 0.001547664636746049, "learning_rate": 1.8488440152180274e-05, "loss": 0.0, "step": 4307 }, { "epoch": 1.260755048287972, "grad_norm": 0.0006469974759966135, "learning_rate": 1.8481123792800702e-05, "loss": 0.0, "step": 4308 }, { "epoch": 1.2610477026631548, "grad_norm": 0.0004033749573864043, "learning_rate": 1.847380743342113e-05, "loss": 0.0, "step": 4309 }, { "epoch": 1.2613403570383377, "grad_norm": 0.0019031958654522896, "learning_rate": 1.8466491074041558e-05, "loss": 0.0, "step": 4310 }, { "epoch": 1.2616330114135206, "grad_norm": 0.0014144045999273658, "learning_rate": 1.8459174714661986e-05, "loss": 0.0, "step": 4311 }, { "epoch": 1.2619256657887035, "grad_norm": 0.007924694567918777, "learning_rate": 1.845185835528241e-05, "loss": 0.0001, "step": 4312 }, { "epoch": 1.2622183201638864, "grad_norm": 0.006693069823086262, "learning_rate": 1.8444541995902838e-05, "loss": 0.0001, "step": 4313 }, { "epoch": 1.2625109745390692, "grad_norm": 0.00043680029921233654, "learning_rate": 1.8437225636523266e-05, "loss": 0.0, "step": 4314 }, { "epoch": 1.2628036289142521, "grad_norm": 0.0006893217214383185, "learning_rate": 1.8429909277143694e-05, "loss": 0.0, "step": 4315 }, { "epoch": 1.2630962832894352, "grad_norm": 0.0004322109743952751, "learning_rate": 1.8422592917764122e-05, "loss": 0.0, "step": 4316 }, { "epoch": 1.2633889376646181, "grad_norm": 0.00439478037878871, "learning_rate": 1.8415276558384547e-05, "loss": 0.0001, "step": 4317 }, { "epoch": 1.263681592039801, "grad_norm": 0.0003082916955463588, "learning_rate": 1.8407960199004975e-05, "loss": 0.0, "step": 4318 }, { "epoch": 1.263974246414984, "grad_norm": 0.001095048850402236, "learning_rate": 1.8400643839625402e-05, "loss": 0.0, "step": 4319 }, { "epoch": 1.2642669007901668, "grad_norm": 0.0002884860441554338, "learning_rate": 1.839332748024583e-05, "loss": 0.0, "step": 4320 }, { "epoch": 1.2645595551653497, "grad_norm": 0.0017246073111891747, "learning_rate": 1.838601112086626e-05, "loss": 0.0, "step": 4321 }, { "epoch": 1.2648522095405326, "grad_norm": 0.00025704840663820505, "learning_rate": 1.8378694761486683e-05, "loss": 0.0, "step": 4322 }, { "epoch": 1.2651448639157157, "grad_norm": 0.0005988952470943332, "learning_rate": 1.837137840210711e-05, "loss": 0.0, "step": 4323 }, { "epoch": 1.2654375182908986, "grad_norm": 0.0017043894622474909, "learning_rate": 1.836406204272754e-05, "loss": 0.0, "step": 4324 }, { "epoch": 1.2657301726660815, "grad_norm": 0.0003185214300174266, "learning_rate": 1.8356745683347967e-05, "loss": 0.0, "step": 4325 }, { "epoch": 1.2660228270412643, "grad_norm": 0.0003728196315933019, "learning_rate": 1.8349429323968395e-05, "loss": 0.0, "step": 4326 }, { "epoch": 1.2663154814164472, "grad_norm": 0.002830952638760209, "learning_rate": 1.834211296458882e-05, "loss": 0.0, "step": 4327 }, { "epoch": 1.2666081357916301, "grad_norm": 0.00018264648679178208, "learning_rate": 1.8334796605209247e-05, "loss": 0.0, "step": 4328 }, { "epoch": 1.266900790166813, "grad_norm": 0.00030146719655022025, "learning_rate": 1.8327480245829675e-05, "loss": 0.0, "step": 4329 }, { "epoch": 1.267193444541996, "grad_norm": 0.0001685014140093699, "learning_rate": 1.8320163886450103e-05, "loss": 0.0, "step": 4330 }, { "epoch": 1.2674860989171788, "grad_norm": 0.0008038796368055046, "learning_rate": 1.831284752707053e-05, "loss": 0.0, "step": 4331 }, { "epoch": 1.2677787532923617, "grad_norm": 0.0002496697998140007, "learning_rate": 1.830553116769096e-05, "loss": 0.0, "step": 4332 }, { "epoch": 1.2680714076675446, "grad_norm": 0.0002453435445204377, "learning_rate": 1.8298214808311384e-05, "loss": 0.0, "step": 4333 }, { "epoch": 1.2683640620427274, "grad_norm": 0.00045765130198560655, "learning_rate": 1.829089844893181e-05, "loss": 0.0, "step": 4334 }, { "epoch": 1.2686567164179103, "grad_norm": 0.001018995069898665, "learning_rate": 1.828358208955224e-05, "loss": 0.0, "step": 4335 }, { "epoch": 1.2689493707930932, "grad_norm": 0.00016092466830741614, "learning_rate": 1.8276265730172667e-05, "loss": 0.0, "step": 4336 }, { "epoch": 1.2692420251682763, "grad_norm": 0.00014827775885351002, "learning_rate": 1.8268949370793095e-05, "loss": 0.0, "step": 4337 }, { "epoch": 1.2695346795434592, "grad_norm": 0.0003899074799846858, "learning_rate": 1.826163301141352e-05, "loss": 0.0, "step": 4338 }, { "epoch": 1.269827333918642, "grad_norm": 0.00010926521645160392, "learning_rate": 1.8254316652033948e-05, "loss": 0.0, "step": 4339 }, { "epoch": 1.270119988293825, "grad_norm": 0.0010218808893114328, "learning_rate": 1.8247000292654376e-05, "loss": 0.0, "step": 4340 }, { "epoch": 1.2704126426690079, "grad_norm": 0.0007877651951275766, "learning_rate": 1.8239683933274804e-05, "loss": 0.0, "step": 4341 }, { "epoch": 1.2707052970441908, "grad_norm": 0.00018021459982264787, "learning_rate": 1.8232367573895232e-05, "loss": 0.0, "step": 4342 }, { "epoch": 1.2709979514193737, "grad_norm": 0.0012451495276764035, "learning_rate": 1.8225051214515656e-05, "loss": 0.0, "step": 4343 }, { "epoch": 1.2712906057945565, "grad_norm": 0.0005864020204171538, "learning_rate": 1.8217734855136084e-05, "loss": 0.0, "step": 4344 }, { "epoch": 1.2715832601697397, "grad_norm": 0.00013048344408161938, "learning_rate": 1.8210418495756512e-05, "loss": 0.0, "step": 4345 }, { "epoch": 1.2718759145449225, "grad_norm": 0.0015791425248607993, "learning_rate": 1.820310213637694e-05, "loss": 0.0, "step": 4346 }, { "epoch": 1.2721685689201054, "grad_norm": 0.00019306503236293793, "learning_rate": 1.8195785776997368e-05, "loss": 0.0, "step": 4347 }, { "epoch": 1.2724612232952883, "grad_norm": 0.00011835879558930174, "learning_rate": 1.8188469417617796e-05, "loss": 0.0, "step": 4348 }, { "epoch": 1.2727538776704712, "grad_norm": 0.0002314967568963766, "learning_rate": 1.818115305823822e-05, "loss": 0.0, "step": 4349 }, { "epoch": 1.273046532045654, "grad_norm": 0.00022952862491365522, "learning_rate": 1.817383669885865e-05, "loss": 0.0, "step": 4350 }, { "epoch": 1.273339186420837, "grad_norm": 2.1702842712402344, "learning_rate": 1.8166520339479077e-05, "loss": 0.0146, "step": 4351 }, { "epoch": 1.2736318407960199, "grad_norm": 0.0002519426343496889, "learning_rate": 1.8159203980099505e-05, "loss": 0.0, "step": 4352 }, { "epoch": 1.2739244951712028, "grad_norm": 0.00180053838994354, "learning_rate": 1.8151887620719932e-05, "loss": 0.0, "step": 4353 }, { "epoch": 1.2742171495463857, "grad_norm": 0.0031657288782298565, "learning_rate": 1.8144571261340357e-05, "loss": 0.0, "step": 4354 }, { "epoch": 1.2745098039215685, "grad_norm": 0.00036263116635382175, "learning_rate": 1.8137254901960785e-05, "loss": 0.0, "step": 4355 }, { "epoch": 1.2748024582967514, "grad_norm": 0.00032865680987015367, "learning_rate": 1.8129938542581213e-05, "loss": 0.0, "step": 4356 }, { "epoch": 1.2750951126719343, "grad_norm": 0.0009084104094654322, "learning_rate": 1.812262218320164e-05, "loss": 0.0, "step": 4357 }, { "epoch": 1.2753877670471174, "grad_norm": 8.174698829650879, "learning_rate": 1.811530582382207e-05, "loss": 0.2658, "step": 4358 }, { "epoch": 1.2756804214223003, "grad_norm": 0.0010587909491732717, "learning_rate": 1.8107989464442493e-05, "loss": 0.0, "step": 4359 }, { "epoch": 1.2759730757974832, "grad_norm": 0.0005852478789165616, "learning_rate": 1.810067310506292e-05, "loss": 0.0, "step": 4360 }, { "epoch": 1.276265730172666, "grad_norm": 0.0003181801876053214, "learning_rate": 1.809335674568335e-05, "loss": 0.0, "step": 4361 }, { "epoch": 1.276558384547849, "grad_norm": 0.00035449196002446115, "learning_rate": 1.8086040386303777e-05, "loss": 0.0, "step": 4362 }, { "epoch": 1.2768510389230319, "grad_norm": 0.06469757109880447, "learning_rate": 1.8078724026924205e-05, "loss": 0.0004, "step": 4363 }, { "epoch": 1.2771436932982148, "grad_norm": 0.000493619532790035, "learning_rate": 1.8071407667544633e-05, "loss": 0.0, "step": 4364 }, { "epoch": 1.2774363476733976, "grad_norm": 0.0003512586699798703, "learning_rate": 1.8064091308165058e-05, "loss": 0.0, "step": 4365 }, { "epoch": 1.2777290020485808, "grad_norm": 0.0003507339279167354, "learning_rate": 1.8056774948785486e-05, "loss": 0.0, "step": 4366 }, { "epoch": 1.2780216564237636, "grad_norm": 0.00018073168757837266, "learning_rate": 1.8049458589405914e-05, "loss": 0.0, "step": 4367 }, { "epoch": 1.2783143107989465, "grad_norm": 0.00024586240760982037, "learning_rate": 1.804214223002634e-05, "loss": 0.0, "step": 4368 }, { "epoch": 1.2786069651741294, "grad_norm": 0.0006003909511491656, "learning_rate": 1.803482587064677e-05, "loss": 0.0, "step": 4369 }, { "epoch": 1.2788996195493123, "grad_norm": 0.00019972145673818886, "learning_rate": 1.8027509511267194e-05, "loss": 0.0, "step": 4370 }, { "epoch": 1.2791922739244952, "grad_norm": 0.0001580975076649338, "learning_rate": 1.8020193151887622e-05, "loss": 0.0, "step": 4371 }, { "epoch": 1.279484928299678, "grad_norm": 0.00012921317829750478, "learning_rate": 1.801287679250805e-05, "loss": 0.0, "step": 4372 }, { "epoch": 1.279777582674861, "grad_norm": 0.0006235188920982182, "learning_rate": 1.8005560433128478e-05, "loss": 0.0, "step": 4373 }, { "epoch": 1.2800702370500439, "grad_norm": 0.00019048931426368654, "learning_rate": 1.7998244073748906e-05, "loss": 0.0, "step": 4374 }, { "epoch": 1.2803628914252267, "grad_norm": 9.730968304211274e-05, "learning_rate": 1.799092771436933e-05, "loss": 0.0, "step": 4375 }, { "epoch": 1.2806555458004096, "grad_norm": 0.00012557368609122932, "learning_rate": 1.798361135498976e-05, "loss": 0.0, "step": 4376 }, { "epoch": 1.2809482001755925, "grad_norm": 0.003751950105652213, "learning_rate": 1.7976294995610186e-05, "loss": 0.0, "step": 4377 }, { "epoch": 1.2812408545507754, "grad_norm": 0.00039506788016296923, "learning_rate": 1.7968978636230614e-05, "loss": 0.0, "step": 4378 }, { "epoch": 1.2815335089259585, "grad_norm": 0.00354623980820179, "learning_rate": 1.7961662276851042e-05, "loss": 0.0, "step": 4379 }, { "epoch": 1.2818261633011414, "grad_norm": 0.0002384902909398079, "learning_rate": 1.7954345917471467e-05, "loss": 0.0, "step": 4380 }, { "epoch": 1.2821188176763243, "grad_norm": 0.0029882891103625298, "learning_rate": 1.7947029558091895e-05, "loss": 0.0, "step": 4381 }, { "epoch": 1.2824114720515072, "grad_norm": 0.0005152951343916357, "learning_rate": 1.7939713198712323e-05, "loss": 0.0, "step": 4382 }, { "epoch": 1.28270412642669, "grad_norm": 0.03205489739775658, "learning_rate": 1.793239683933275e-05, "loss": 0.0004, "step": 4383 }, { "epoch": 1.282996780801873, "grad_norm": 0.006373089738190174, "learning_rate": 1.7925080479953175e-05, "loss": 0.0, "step": 4384 }, { "epoch": 1.2832894351770558, "grad_norm": 0.0009934831177815795, "learning_rate": 1.7917764120573603e-05, "loss": 0.0, "step": 4385 }, { "epoch": 1.2835820895522387, "grad_norm": 0.00041518089710734785, "learning_rate": 1.791044776119403e-05, "loss": 0.0, "step": 4386 }, { "epoch": 1.2838747439274218, "grad_norm": 0.00018887739861384034, "learning_rate": 1.790313140181446e-05, "loss": 0.0, "step": 4387 }, { "epoch": 1.2841673983026047, "grad_norm": 0.00019374901603441685, "learning_rate": 1.7895815042434884e-05, "loss": 0.0, "step": 4388 }, { "epoch": 1.2844600526777876, "grad_norm": 0.0006210590363480151, "learning_rate": 1.788849868305531e-05, "loss": 0.0, "step": 4389 }, { "epoch": 1.2847527070529705, "grad_norm": 0.00012727176363114268, "learning_rate": 1.788118232367574e-05, "loss": 0.0, "step": 4390 }, { "epoch": 1.2850453614281534, "grad_norm": 0.003193584969267249, "learning_rate": 1.7873865964296167e-05, "loss": 0.0, "step": 4391 }, { "epoch": 1.2853380158033363, "grad_norm": 0.00022801656450610608, "learning_rate": 1.7866549604916592e-05, "loss": 0.0, "step": 4392 }, { "epoch": 1.2856306701785192, "grad_norm": 0.004470504354685545, "learning_rate": 1.785923324553702e-05, "loss": 0.0, "step": 4393 }, { "epoch": 1.285923324553702, "grad_norm": 0.00029001777875237167, "learning_rate": 1.7851916886157448e-05, "loss": 0.0, "step": 4394 }, { "epoch": 1.286215978928885, "grad_norm": 0.0005296553717926145, "learning_rate": 1.7844600526777876e-05, "loss": 0.0, "step": 4395 }, { "epoch": 1.2865086333040678, "grad_norm": 6.382421997841448e-05, "learning_rate": 1.78372841673983e-05, "loss": 0.0, "step": 4396 }, { "epoch": 1.2868012876792507, "grad_norm": 0.0020240354351699352, "learning_rate": 1.782996780801873e-05, "loss": 0.0, "step": 4397 }, { "epoch": 1.2870939420544336, "grad_norm": 0.003384276293218136, "learning_rate": 1.7822651448639156e-05, "loss": 0.0, "step": 4398 }, { "epoch": 1.2873865964296165, "grad_norm": 0.04222164303064346, "learning_rate": 1.7815335089259584e-05, "loss": 0.0002, "step": 4399 }, { "epoch": 1.2876792508047996, "grad_norm": 1.5526560544967651, "learning_rate": 1.7808018729880012e-05, "loss": 0.0169, "step": 4400 }, { "epoch": 1.2879719051799825, "grad_norm": 0.00047081487718969584, "learning_rate": 1.780070237050044e-05, "loss": 0.0, "step": 4401 }, { "epoch": 1.2882645595551654, "grad_norm": 0.0005067844176664948, "learning_rate": 1.7793386011120865e-05, "loss": 0.0, "step": 4402 }, { "epoch": 1.2885572139303483, "grad_norm": 0.00012542780314106494, "learning_rate": 1.7786069651741293e-05, "loss": 0.0, "step": 4403 }, { "epoch": 1.2888498683055312, "grad_norm": 9.943839540937915e-05, "learning_rate": 1.777875329236172e-05, "loss": 0.0, "step": 4404 }, { "epoch": 1.289142522680714, "grad_norm": 7.087265968322754, "learning_rate": 1.777143693298215e-05, "loss": 0.2037, "step": 4405 }, { "epoch": 1.289435177055897, "grad_norm": 0.00020516323274932802, "learning_rate": 1.7764120573602577e-05, "loss": 0.0, "step": 4406 }, { "epoch": 1.2897278314310798, "grad_norm": 0.0007501508225686848, "learning_rate": 1.7756804214223e-05, "loss": 0.0, "step": 4407 }, { "epoch": 1.290020485806263, "grad_norm": 0.0001918547204695642, "learning_rate": 1.774948785484343e-05, "loss": 0.0, "step": 4408 }, { "epoch": 1.2903131401814458, "grad_norm": 0.0006179132033139467, "learning_rate": 1.7742171495463857e-05, "loss": 0.0, "step": 4409 }, { "epoch": 1.2906057945566287, "grad_norm": 0.00027921059518121183, "learning_rate": 1.7734855136084285e-05, "loss": 0.0, "step": 4410 }, { "epoch": 1.2908984489318116, "grad_norm": 6.703787221340463e-05, "learning_rate": 1.7727538776704713e-05, "loss": 0.0, "step": 4411 }, { "epoch": 1.2911911033069945, "grad_norm": 0.00042909561307169497, "learning_rate": 1.7720222417325137e-05, "loss": 0.0, "step": 4412 }, { "epoch": 1.2914837576821774, "grad_norm": 0.0004101863887626678, "learning_rate": 1.7712906057945565e-05, "loss": 0.0, "step": 4413 }, { "epoch": 1.2917764120573603, "grad_norm": 0.0006459795404225588, "learning_rate": 1.7705589698565993e-05, "loss": 0.0, "step": 4414 }, { "epoch": 1.2920690664325432, "grad_norm": 0.0002578320854809135, "learning_rate": 1.769827333918642e-05, "loss": 0.0, "step": 4415 }, { "epoch": 1.292361720807726, "grad_norm": 0.0011339564807713032, "learning_rate": 1.769095697980685e-05, "loss": 0.0, "step": 4416 }, { "epoch": 1.292654375182909, "grad_norm": 0.015644166618585587, "learning_rate": 1.7683640620427277e-05, "loss": 0.0001, "step": 4417 }, { "epoch": 1.2929470295580918, "grad_norm": 0.0011464759008958936, "learning_rate": 1.7676324261047702e-05, "loss": 0.0, "step": 4418 }, { "epoch": 1.2932396839332747, "grad_norm": 0.0006816753302700818, "learning_rate": 1.766900790166813e-05, "loss": 0.0, "step": 4419 }, { "epoch": 1.2935323383084576, "grad_norm": 11.788400650024414, "learning_rate": 1.7661691542288558e-05, "loss": 0.0448, "step": 4420 }, { "epoch": 1.2938249926836405, "grad_norm": 0.014803797006607056, "learning_rate": 1.7654375182908986e-05, "loss": 0.0001, "step": 4421 }, { "epoch": 1.2941176470588236, "grad_norm": 0.00040439984877593815, "learning_rate": 1.7647058823529414e-05, "loss": 0.0, "step": 4422 }, { "epoch": 1.2944103014340065, "grad_norm": 9.866422653198242, "learning_rate": 1.7639742464149838e-05, "loss": 0.0453, "step": 4423 }, { "epoch": 1.2947029558091894, "grad_norm": 0.04085865989327431, "learning_rate": 1.7632426104770266e-05, "loss": 0.0002, "step": 4424 }, { "epoch": 1.2949956101843723, "grad_norm": 0.006987076718360186, "learning_rate": 1.7625109745390694e-05, "loss": 0.0001, "step": 4425 }, { "epoch": 1.2952882645595551, "grad_norm": 0.01172039844095707, "learning_rate": 1.7617793386011122e-05, "loss": 0.0001, "step": 4426 }, { "epoch": 1.295580918934738, "grad_norm": 0.0010197163792327046, "learning_rate": 1.761047702663155e-05, "loss": 0.0, "step": 4427 }, { "epoch": 1.295873573309921, "grad_norm": 0.0006449169595725834, "learning_rate": 1.7603160667251975e-05, "loss": 0.0, "step": 4428 }, { "epoch": 1.2961662276851038, "grad_norm": 0.011789483949542046, "learning_rate": 1.7595844307872402e-05, "loss": 0.0001, "step": 4429 }, { "epoch": 1.296458882060287, "grad_norm": 0.001827823929488659, "learning_rate": 1.758852794849283e-05, "loss": 0.0, "step": 4430 }, { "epoch": 1.2967515364354698, "grad_norm": 0.00024653668515384197, "learning_rate": 1.758121158911326e-05, "loss": 0.0, "step": 4431 }, { "epoch": 1.2970441908106527, "grad_norm": 0.03542637452483177, "learning_rate": 1.7573895229733686e-05, "loss": 0.0002, "step": 4432 }, { "epoch": 1.2973368451858356, "grad_norm": 0.04266812279820442, "learning_rate": 1.7566578870354114e-05, "loss": 0.0002, "step": 4433 }, { "epoch": 1.2976294995610185, "grad_norm": 0.007048277650028467, "learning_rate": 1.755926251097454e-05, "loss": 0.0001, "step": 4434 }, { "epoch": 1.2979221539362014, "grad_norm": 0.007927828468382359, "learning_rate": 1.7551946151594967e-05, "loss": 0.0001, "step": 4435 }, { "epoch": 1.2982148083113842, "grad_norm": 0.004960792139172554, "learning_rate": 1.7544629792215395e-05, "loss": 0.0, "step": 4436 }, { "epoch": 1.2985074626865671, "grad_norm": 0.02081185020506382, "learning_rate": 1.7537313432835823e-05, "loss": 0.0002, "step": 4437 }, { "epoch": 1.29880011706175, "grad_norm": 0.06372889876365662, "learning_rate": 1.752999707345625e-05, "loss": 0.0003, "step": 4438 }, { "epoch": 1.299092771436933, "grad_norm": 0.0006737270741723478, "learning_rate": 1.7522680714076675e-05, "loss": 0.0, "step": 4439 }, { "epoch": 1.2993854258121158, "grad_norm": 0.012304473668336868, "learning_rate": 1.7515364354697103e-05, "loss": 0.0001, "step": 4440 }, { "epoch": 1.2996780801872987, "grad_norm": 0.06632810086011887, "learning_rate": 1.750804799531753e-05, "loss": 0.0003, "step": 4441 }, { "epoch": 1.2999707345624816, "grad_norm": 0.0013256085803732276, "learning_rate": 1.750073163593796e-05, "loss": 0.0, "step": 4442 }, { "epoch": 1.3002633889376647, "grad_norm": 0.006500875577330589, "learning_rate": 1.7493415276558387e-05, "loss": 0.0001, "step": 4443 }, { "epoch": 1.3005560433128476, "grad_norm": 0.00017142921569757164, "learning_rate": 1.748609891717881e-05, "loss": 0.0, "step": 4444 }, { "epoch": 1.3008486976880305, "grad_norm": 0.00018845274462364614, "learning_rate": 1.747878255779924e-05, "loss": 0.0, "step": 4445 }, { "epoch": 1.3011413520632134, "grad_norm": 0.00017048002337105572, "learning_rate": 1.7471466198419667e-05, "loss": 0.0, "step": 4446 }, { "epoch": 1.3014340064383962, "grad_norm": 0.00018671387806534767, "learning_rate": 1.7464149839040095e-05, "loss": 0.0, "step": 4447 }, { "epoch": 1.3017266608135791, "grad_norm": 0.001127854106016457, "learning_rate": 1.7456833479660523e-05, "loss": 0.0, "step": 4448 }, { "epoch": 1.302019315188762, "grad_norm": 0.5655648112297058, "learning_rate": 1.7449517120280948e-05, "loss": 0.003, "step": 4449 }, { "epoch": 1.302311969563945, "grad_norm": 0.0004359325102996081, "learning_rate": 1.7442200760901376e-05, "loss": 0.0, "step": 4450 }, { "epoch": 1.302604623939128, "grad_norm": 5.662459373474121, "learning_rate": 1.7434884401521804e-05, "loss": 0.0189, "step": 4451 }, { "epoch": 1.302897278314311, "grad_norm": 0.029793256893754005, "learning_rate": 1.7427568042142232e-05, "loss": 0.0002, "step": 4452 }, { "epoch": 1.3031899326894938, "grad_norm": 0.0007705074967816472, "learning_rate": 1.742025168276266e-05, "loss": 0.0, "step": 4453 }, { "epoch": 1.3034825870646767, "grad_norm": 0.000265125825535506, "learning_rate": 1.7412935323383088e-05, "loss": 0.0, "step": 4454 }, { "epoch": 1.3037752414398596, "grad_norm": 0.1741778403520584, "learning_rate": 1.7405618964003512e-05, "loss": 0.0006, "step": 4455 }, { "epoch": 1.3040678958150425, "grad_norm": 0.006701210513710976, "learning_rate": 1.739830260462394e-05, "loss": 0.0001, "step": 4456 }, { "epoch": 1.3043605501902253, "grad_norm": 0.07441215962171555, "learning_rate": 1.7390986245244368e-05, "loss": 0.0003, "step": 4457 }, { "epoch": 1.3046532045654082, "grad_norm": 0.0002838452346622944, "learning_rate": 1.7383669885864796e-05, "loss": 0.0, "step": 4458 }, { "epoch": 1.3049458589405911, "grad_norm": 0.000586601672694087, "learning_rate": 1.7376353526485224e-05, "loss": 0.0, "step": 4459 }, { "epoch": 1.305238513315774, "grad_norm": 0.0004960218211635947, "learning_rate": 1.736903716710565e-05, "loss": 0.0, "step": 4460 }, { "epoch": 1.305531167690957, "grad_norm": 0.0006264621042646468, "learning_rate": 1.7361720807726077e-05, "loss": 0.0, "step": 4461 }, { "epoch": 1.3058238220661398, "grad_norm": 0.0017462418181821704, "learning_rate": 1.7354404448346505e-05, "loss": 0.0, "step": 4462 }, { "epoch": 1.3061164764413227, "grad_norm": 0.0009365348960272968, "learning_rate": 1.7347088088966932e-05, "loss": 0.0, "step": 4463 }, { "epoch": 1.3064091308165058, "grad_norm": 0.0006757163209840655, "learning_rate": 1.733977172958736e-05, "loss": 0.0, "step": 4464 }, { "epoch": 1.3067017851916887, "grad_norm": 0.0011337717296555638, "learning_rate": 1.7332455370207785e-05, "loss": 0.0, "step": 4465 }, { "epoch": 1.3069944395668716, "grad_norm": 0.0008460487588308752, "learning_rate": 1.7325139010828213e-05, "loss": 0.0, "step": 4466 }, { "epoch": 1.3072870939420544, "grad_norm": 0.0015477667329832911, "learning_rate": 1.731782265144864e-05, "loss": 0.0, "step": 4467 }, { "epoch": 1.3075797483172373, "grad_norm": 0.004709722939878702, "learning_rate": 1.731050629206907e-05, "loss": 0.0001, "step": 4468 }, { "epoch": 1.3078724026924202, "grad_norm": 0.0004866288509219885, "learning_rate": 1.7303189932689497e-05, "loss": 0.0, "step": 4469 }, { "epoch": 1.308165057067603, "grad_norm": 0.003755094949156046, "learning_rate": 1.7295873573309925e-05, "loss": 0.0, "step": 4470 }, { "epoch": 1.308457711442786, "grad_norm": 0.1101662740111351, "learning_rate": 1.728855721393035e-05, "loss": 0.0003, "step": 4471 }, { "epoch": 1.308750365817969, "grad_norm": 0.0001950179139385, "learning_rate": 1.7281240854550777e-05, "loss": 0.0, "step": 4472 }, { "epoch": 1.309043020193152, "grad_norm": 0.005900123622268438, "learning_rate": 1.7273924495171205e-05, "loss": 0.0001, "step": 4473 }, { "epoch": 1.3093356745683349, "grad_norm": 0.005997837986797094, "learning_rate": 1.7266608135791633e-05, "loss": 0.0001, "step": 4474 }, { "epoch": 1.3096283289435178, "grad_norm": 0.00015216268366202712, "learning_rate": 1.7259291776412058e-05, "loss": 0.0, "step": 4475 }, { "epoch": 1.3099209833187007, "grad_norm": 0.6806633472442627, "learning_rate": 1.7251975417032486e-05, "loss": 0.0016, "step": 4476 }, { "epoch": 1.3102136376938835, "grad_norm": 0.007385977543890476, "learning_rate": 1.7244659057652914e-05, "loss": 0.0001, "step": 4477 }, { "epoch": 1.3105062920690664, "grad_norm": 0.001072707585990429, "learning_rate": 1.723734269827334e-05, "loss": 0.0, "step": 4478 }, { "epoch": 1.3107989464442493, "grad_norm": 0.0015393621288239956, "learning_rate": 1.7230026338893766e-05, "loss": 0.0, "step": 4479 }, { "epoch": 1.3110916008194322, "grad_norm": 0.0010753574315458536, "learning_rate": 1.7222709979514194e-05, "loss": 0.0, "step": 4480 }, { "epoch": 1.311384255194615, "grad_norm": 0.00011825386172858998, "learning_rate": 1.7215393620134622e-05, "loss": 0.0, "step": 4481 }, { "epoch": 1.311676909569798, "grad_norm": 0.0003599435440264642, "learning_rate": 1.720807726075505e-05, "loss": 0.0, "step": 4482 }, { "epoch": 1.3119695639449809, "grad_norm": 0.0006308953743427992, "learning_rate": 1.7200760901375475e-05, "loss": 0.0, "step": 4483 }, { "epoch": 1.3122622183201638, "grad_norm": 0.002207934157922864, "learning_rate": 1.7193444541995902e-05, "loss": 0.0, "step": 4484 }, { "epoch": 1.3125548726953469, "grad_norm": 0.00015759652887936682, "learning_rate": 1.718612818261633e-05, "loss": 0.0, "step": 4485 }, { "epoch": 1.3128475270705298, "grad_norm": 0.0001810126268537715, "learning_rate": 1.717881182323676e-05, "loss": 0.0, "step": 4486 }, { "epoch": 1.3131401814457127, "grad_norm": 9.173175811767578, "learning_rate": 1.7171495463857183e-05, "loss": 0.1658, "step": 4487 }, { "epoch": 1.3134328358208955, "grad_norm": 0.00019536117906682193, "learning_rate": 1.716417910447761e-05, "loss": 0.0, "step": 4488 }, { "epoch": 1.3137254901960784, "grad_norm": 0.00016868358943611383, "learning_rate": 1.715686274509804e-05, "loss": 0.0, "step": 4489 }, { "epoch": 1.3140181445712613, "grad_norm": 0.0006458146963268518, "learning_rate": 1.7149546385718467e-05, "loss": 0.0, "step": 4490 }, { "epoch": 1.3143107989464442, "grad_norm": 0.00034699728712439537, "learning_rate": 1.7142230026338895e-05, "loss": 0.0, "step": 4491 }, { "epoch": 1.314603453321627, "grad_norm": 0.011271150782704353, "learning_rate": 1.713491366695932e-05, "loss": 0.0001, "step": 4492 }, { "epoch": 1.3148961076968102, "grad_norm": 0.0006695784977637231, "learning_rate": 1.7127597307579747e-05, "loss": 0.0, "step": 4493 }, { "epoch": 1.315188762071993, "grad_norm": 0.001160523621365428, "learning_rate": 1.7120280948200175e-05, "loss": 0.0, "step": 4494 }, { "epoch": 1.315481416447176, "grad_norm": 0.0007655543158762157, "learning_rate": 1.7112964588820603e-05, "loss": 0.0, "step": 4495 }, { "epoch": 1.3157740708223589, "grad_norm": 11.203448295593262, "learning_rate": 1.710564822944103e-05, "loss": 0.037, "step": 4496 }, { "epoch": 1.3160667251975418, "grad_norm": 0.00023826721007935703, "learning_rate": 1.7098331870061456e-05, "loss": 0.0, "step": 4497 }, { "epoch": 1.3163593795727246, "grad_norm": 0.0005640920717269182, "learning_rate": 1.7091015510681884e-05, "loss": 0.0, "step": 4498 }, { "epoch": 1.3166520339479075, "grad_norm": 0.00016525565297342837, "learning_rate": 1.708369915130231e-05, "loss": 0.0, "step": 4499 }, { "epoch": 1.3169446883230904, "grad_norm": 0.0005659122252836823, "learning_rate": 1.707638279192274e-05, "loss": 0.0, "step": 4500 }, { "epoch": 1.3172373426982733, "grad_norm": 0.00027133754338137805, "learning_rate": 1.7069066432543167e-05, "loss": 0.0, "step": 4501 }, { "epoch": 1.3175299970734562, "grad_norm": 0.0008626925409771502, "learning_rate": 1.7061750073163595e-05, "loss": 0.0, "step": 4502 }, { "epoch": 1.317822651448639, "grad_norm": 0.0005522826686501503, "learning_rate": 1.705443371378402e-05, "loss": 0.0, "step": 4503 }, { "epoch": 1.318115305823822, "grad_norm": 0.00013968671555630863, "learning_rate": 1.7047117354404448e-05, "loss": 0.0, "step": 4504 }, { "epoch": 1.3184079601990049, "grad_norm": 0.00545914564281702, "learning_rate": 1.7039800995024876e-05, "loss": 0.0, "step": 4505 }, { "epoch": 1.3187006145741877, "grad_norm": 0.0023491347674280405, "learning_rate": 1.7032484635645304e-05, "loss": 0.0, "step": 4506 }, { "epoch": 1.3189932689493709, "grad_norm": 0.00017365990788675845, "learning_rate": 1.7025168276265732e-05, "loss": 0.0, "step": 4507 }, { "epoch": 1.3192859233245537, "grad_norm": 0.0003467008355073631, "learning_rate": 1.7017851916886156e-05, "loss": 0.0, "step": 4508 }, { "epoch": 1.3195785776997366, "grad_norm": 0.0010194872738793492, "learning_rate": 1.7010535557506584e-05, "loss": 0.0, "step": 4509 }, { "epoch": 1.3198712320749195, "grad_norm": 0.00044863088987767696, "learning_rate": 1.7003219198127012e-05, "loss": 0.0, "step": 4510 }, { "epoch": 1.3201638864501024, "grad_norm": 0.0003877152921631932, "learning_rate": 1.699590283874744e-05, "loss": 0.0, "step": 4511 }, { "epoch": 1.3204565408252853, "grad_norm": 0.0001742753229336813, "learning_rate": 1.6988586479367868e-05, "loss": 0.0, "step": 4512 }, { "epoch": 1.3207491952004682, "grad_norm": 0.00042221200419589877, "learning_rate": 1.6981270119988293e-05, "loss": 0.0, "step": 4513 }, { "epoch": 1.3210418495756513, "grad_norm": 0.0014971998753026128, "learning_rate": 1.697395376060872e-05, "loss": 0.0, "step": 4514 }, { "epoch": 1.3213345039508342, "grad_norm": 0.0009239388746209443, "learning_rate": 1.696663740122915e-05, "loss": 0.0, "step": 4515 }, { "epoch": 1.321627158326017, "grad_norm": 0.0002854466438293457, "learning_rate": 1.6959321041849577e-05, "loss": 0.0, "step": 4516 }, { "epoch": 1.3219198127012, "grad_norm": 0.00314055266790092, "learning_rate": 1.6952004682470005e-05, "loss": 0.0, "step": 4517 }, { "epoch": 1.3222124670763828, "grad_norm": 0.00027303065871819854, "learning_rate": 1.694468832309043e-05, "loss": 0.0, "step": 4518 }, { "epoch": 1.3225051214515657, "grad_norm": 0.0009044178295880556, "learning_rate": 1.6937371963710857e-05, "loss": 0.0, "step": 4519 }, { "epoch": 1.3227977758267486, "grad_norm": 0.0003398100088816136, "learning_rate": 1.6930055604331285e-05, "loss": 0.0, "step": 4520 }, { "epoch": 1.3230904302019315, "grad_norm": 0.0007959523354656994, "learning_rate": 1.6922739244951713e-05, "loss": 0.0, "step": 4521 }, { "epoch": 1.3233830845771144, "grad_norm": 0.00034519078326411545, "learning_rate": 1.691542288557214e-05, "loss": 0.0, "step": 4522 }, { "epoch": 1.3236757389522973, "grad_norm": 0.0001793931587599218, "learning_rate": 1.690810652619257e-05, "loss": 0.0, "step": 4523 }, { "epoch": 1.3239683933274802, "grad_norm": 0.00026595251983962953, "learning_rate": 1.6900790166812993e-05, "loss": 0.0, "step": 4524 }, { "epoch": 1.324261047702663, "grad_norm": 0.00012598829925991595, "learning_rate": 1.689347380743342e-05, "loss": 0.0, "step": 4525 }, { "epoch": 1.324553702077846, "grad_norm": 0.00086337880929932, "learning_rate": 1.688615744805385e-05, "loss": 0.0, "step": 4526 }, { "epoch": 1.3248463564530288, "grad_norm": 0.0017857812345027924, "learning_rate": 1.6878841088674277e-05, "loss": 0.0, "step": 4527 }, { "epoch": 1.325139010828212, "grad_norm": 0.000812956306617707, "learning_rate": 1.6871524729294705e-05, "loss": 0.0, "step": 4528 }, { "epoch": 1.3254316652033948, "grad_norm": 0.001117186970077455, "learning_rate": 1.686420836991513e-05, "loss": 0.0, "step": 4529 }, { "epoch": 1.3257243195785777, "grad_norm": 0.0001848274696385488, "learning_rate": 1.6856892010535558e-05, "loss": 0.0, "step": 4530 }, { "epoch": 1.3260169739537606, "grad_norm": 0.0008892636396922171, "learning_rate": 1.6849575651155986e-05, "loss": 0.0, "step": 4531 }, { "epoch": 1.3263096283289435, "grad_norm": 0.0011231438256800175, "learning_rate": 1.6842259291776414e-05, "loss": 0.0, "step": 4532 }, { "epoch": 1.3266022827041264, "grad_norm": 0.0030756371561437845, "learning_rate": 1.683494293239684e-05, "loss": 0.0, "step": 4533 }, { "epoch": 1.3268949370793093, "grad_norm": 0.00010932189616141841, "learning_rate": 1.6827626573017266e-05, "loss": 0.0, "step": 4534 }, { "epoch": 1.3271875914544922, "grad_norm": 0.0002901269472204149, "learning_rate": 1.6820310213637694e-05, "loss": 0.0, "step": 4535 }, { "epoch": 1.3274802458296753, "grad_norm": 0.0020996062085032463, "learning_rate": 1.6812993854258122e-05, "loss": 0.0, "step": 4536 }, { "epoch": 1.3277729002048582, "grad_norm": 0.0007693211082369089, "learning_rate": 1.680567749487855e-05, "loss": 0.0, "step": 4537 }, { "epoch": 1.328065554580041, "grad_norm": 0.00048650274402461946, "learning_rate": 1.6798361135498978e-05, "loss": 0.0, "step": 4538 }, { "epoch": 1.328358208955224, "grad_norm": 0.0013929984997957945, "learning_rate": 1.6791044776119406e-05, "loss": 0.0, "step": 4539 }, { "epoch": 1.3286508633304068, "grad_norm": 0.00036329045542515814, "learning_rate": 1.678372841673983e-05, "loss": 0.0, "step": 4540 }, { "epoch": 1.3289435177055897, "grad_norm": 0.0011186026968061924, "learning_rate": 1.677641205736026e-05, "loss": 0.0, "step": 4541 }, { "epoch": 1.3292361720807726, "grad_norm": 0.0006345548899844289, "learning_rate": 1.6769095697980686e-05, "loss": 0.0, "step": 4542 }, { "epoch": 1.3295288264559555, "grad_norm": 0.0022853300906717777, "learning_rate": 1.6761779338601114e-05, "loss": 0.0, "step": 4543 }, { "epoch": 1.3298214808311384, "grad_norm": 0.0003682018432300538, "learning_rate": 1.6754462979221542e-05, "loss": 0.0, "step": 4544 }, { "epoch": 1.3301141352063213, "grad_norm": 0.001419796491973102, "learning_rate": 1.6747146619841967e-05, "loss": 0.0, "step": 4545 }, { "epoch": 1.3304067895815042, "grad_norm": 0.0007176802610047162, "learning_rate": 1.6739830260462395e-05, "loss": 0.0, "step": 4546 }, { "epoch": 1.330699443956687, "grad_norm": 0.007843074388802052, "learning_rate": 1.6732513901082823e-05, "loss": 0.0001, "step": 4547 }, { "epoch": 1.33099209833187, "grad_norm": 0.00015430124767590314, "learning_rate": 1.672519754170325e-05, "loss": 0.0, "step": 4548 }, { "epoch": 1.331284752707053, "grad_norm": 0.0009809049079194665, "learning_rate": 1.671788118232368e-05, "loss": 0.0, "step": 4549 }, { "epoch": 1.331577407082236, "grad_norm": 0.0005149840144440532, "learning_rate": 1.6710564822944103e-05, "loss": 0.0, "step": 4550 }, { "epoch": 1.3318700614574188, "grad_norm": 0.0008740427438169718, "learning_rate": 1.670324846356453e-05, "loss": 0.0, "step": 4551 }, { "epoch": 1.3321627158326017, "grad_norm": 0.00028420190210454166, "learning_rate": 1.669593210418496e-05, "loss": 0.0, "step": 4552 }, { "epoch": 1.3324553702077846, "grad_norm": 0.000594905752222985, "learning_rate": 1.6688615744805387e-05, "loss": 0.0, "step": 4553 }, { "epoch": 1.3327480245829675, "grad_norm": 0.025138122960925102, "learning_rate": 1.6681299385425815e-05, "loss": 0.0001, "step": 4554 }, { "epoch": 1.3330406789581504, "grad_norm": 0.0018326579593122005, "learning_rate": 1.6673983026046243e-05, "loss": 0.0, "step": 4555 }, { "epoch": 1.3333333333333333, "grad_norm": 0.0002547812182456255, "learning_rate": 1.6666666666666667e-05, "loss": 0.0, "step": 4556 }, { "epoch": 1.3336259877085164, "grad_norm": 0.00041645910823717713, "learning_rate": 1.6659350307287095e-05, "loss": 0.0, "step": 4557 }, { "epoch": 1.3339186420836993, "grad_norm": 0.0005207156063988805, "learning_rate": 1.6652033947907523e-05, "loss": 0.0, "step": 4558 }, { "epoch": 1.3342112964588821, "grad_norm": 0.01128105353564024, "learning_rate": 1.664471758852795e-05, "loss": 0.0001, "step": 4559 }, { "epoch": 1.334503950834065, "grad_norm": 0.0005704149371013045, "learning_rate": 1.663740122914838e-05, "loss": 0.0, "step": 4560 }, { "epoch": 1.334796605209248, "grad_norm": 0.004358152858912945, "learning_rate": 1.6630084869768804e-05, "loss": 0.0, "step": 4561 }, { "epoch": 1.3350892595844308, "grad_norm": 0.0003605438978411257, "learning_rate": 1.6622768510389232e-05, "loss": 0.0, "step": 4562 }, { "epoch": 1.3353819139596137, "grad_norm": 0.00022131146397441626, "learning_rate": 1.661545215100966e-05, "loss": 0.0, "step": 4563 }, { "epoch": 1.3356745683347966, "grad_norm": 0.00019689615874085575, "learning_rate": 1.6608135791630088e-05, "loss": 0.0, "step": 4564 }, { "epoch": 1.3359672227099795, "grad_norm": 0.00034774577943608165, "learning_rate": 1.6600819432250516e-05, "loss": 0.0, "step": 4565 }, { "epoch": 1.3362598770851624, "grad_norm": 0.0001815056020859629, "learning_rate": 1.659350307287094e-05, "loss": 0.0, "step": 4566 }, { "epoch": 1.3365525314603453, "grad_norm": 0.00022513096337206662, "learning_rate": 1.6586186713491368e-05, "loss": 0.0, "step": 4567 }, { "epoch": 1.3368451858355281, "grad_norm": 0.005451194941997528, "learning_rate": 1.6578870354111796e-05, "loss": 0.0, "step": 4568 }, { "epoch": 1.337137840210711, "grad_norm": 2.7392542362213135, "learning_rate": 1.6571553994732224e-05, "loss": 0.2519, "step": 4569 }, { "epoch": 1.3374304945858941, "grad_norm": 0.0004196388181298971, "learning_rate": 1.656423763535265e-05, "loss": 0.0, "step": 4570 }, { "epoch": 1.337723148961077, "grad_norm": 0.0005751379649154842, "learning_rate": 1.6556921275973077e-05, "loss": 0.0, "step": 4571 }, { "epoch": 1.33801580333626, "grad_norm": 0.0018512074602767825, "learning_rate": 1.6549604916593505e-05, "loss": 0.0, "step": 4572 }, { "epoch": 1.3383084577114428, "grad_norm": 0.0012926749186590314, "learning_rate": 1.6542288557213932e-05, "loss": 0.0, "step": 4573 }, { "epoch": 1.3386011120866257, "grad_norm": 0.004534748382866383, "learning_rate": 1.6534972197834357e-05, "loss": 0.0001, "step": 4574 }, { "epoch": 1.3388937664618086, "grad_norm": 0.0014039897359907627, "learning_rate": 1.6527655838454785e-05, "loss": 0.0, "step": 4575 }, { "epoch": 1.3391864208369915, "grad_norm": 0.002373710973188281, "learning_rate": 1.6520339479075213e-05, "loss": 0.0, "step": 4576 }, { "epoch": 1.3394790752121744, "grad_norm": 3.6035537719726562, "learning_rate": 1.651302311969564e-05, "loss": 0.0175, "step": 4577 }, { "epoch": 1.3397717295873575, "grad_norm": 0.6810202598571777, "learning_rate": 1.6505706760316065e-05, "loss": 0.0031, "step": 4578 }, { "epoch": 1.3400643839625404, "grad_norm": 0.02525263838469982, "learning_rate": 1.6498390400936493e-05, "loss": 0.0003, "step": 4579 }, { "epoch": 1.3403570383377232, "grad_norm": 0.015978388488292694, "learning_rate": 1.649107404155692e-05, "loss": 0.0002, "step": 4580 }, { "epoch": 1.3406496927129061, "grad_norm": 0.018606310710310936, "learning_rate": 1.648375768217735e-05, "loss": 0.0002, "step": 4581 }, { "epoch": 1.340942347088089, "grad_norm": 0.0010876130545511842, "learning_rate": 1.6476441322797774e-05, "loss": 0.0, "step": 4582 }, { "epoch": 1.341235001463272, "grad_norm": 0.0028584327083081007, "learning_rate": 1.6469124963418202e-05, "loss": 0.0, "step": 4583 }, { "epoch": 1.3415276558384548, "grad_norm": 0.003823688719421625, "learning_rate": 1.646180860403863e-05, "loss": 0.0001, "step": 4584 }, { "epoch": 1.3418203102136377, "grad_norm": 0.0016548007261008024, "learning_rate": 1.6454492244659058e-05, "loss": 0.0, "step": 4585 }, { "epoch": 1.3421129645888206, "grad_norm": 0.0022606924176216125, "learning_rate": 1.6447175885279486e-05, "loss": 0.0, "step": 4586 }, { "epoch": 1.3424056189640035, "grad_norm": 0.5410664081573486, "learning_rate": 1.643985952589991e-05, "loss": 0.0023, "step": 4587 }, { "epoch": 1.3426982733391863, "grad_norm": 0.0010367278009653091, "learning_rate": 1.6432543166520338e-05, "loss": 0.0, "step": 4588 }, { "epoch": 1.3429909277143692, "grad_norm": 0.0008697782177478075, "learning_rate": 1.6425226807140766e-05, "loss": 0.0, "step": 4589 }, { "epoch": 1.3432835820895521, "grad_norm": 0.0026082817930728197, "learning_rate": 1.6417910447761194e-05, "loss": 0.0001, "step": 4590 }, { "epoch": 1.3435762364647352, "grad_norm": 0.007427962962538004, "learning_rate": 1.6410594088381622e-05, "loss": 0.0001, "step": 4591 }, { "epoch": 1.3438688908399181, "grad_norm": 0.0020755648147314787, "learning_rate": 1.640327772900205e-05, "loss": 0.0, "step": 4592 }, { "epoch": 1.344161545215101, "grad_norm": 3.129110813140869, "learning_rate": 1.6395961369622475e-05, "loss": 0.021, "step": 4593 }, { "epoch": 1.344454199590284, "grad_norm": 0.004509671591222286, "learning_rate": 1.6388645010242902e-05, "loss": 0.0001, "step": 4594 }, { "epoch": 1.3447468539654668, "grad_norm": 0.1679302603006363, "learning_rate": 1.638132865086333e-05, "loss": 0.0005, "step": 4595 }, { "epoch": 1.3450395083406497, "grad_norm": 0.008735943585634232, "learning_rate": 1.637401229148376e-05, "loss": 0.0001, "step": 4596 }, { "epoch": 1.3453321627158326, "grad_norm": 0.010910887271165848, "learning_rate": 1.6366695932104186e-05, "loss": 0.0001, "step": 4597 }, { "epoch": 1.3456248170910154, "grad_norm": 0.007489155046641827, "learning_rate": 1.635937957272461e-05, "loss": 0.0001, "step": 4598 }, { "epoch": 1.3459174714661986, "grad_norm": 0.01282794401049614, "learning_rate": 1.635206321334504e-05, "loss": 0.0002, "step": 4599 }, { "epoch": 1.3462101258413814, "grad_norm": 0.0005722651840187609, "learning_rate": 1.6344746853965467e-05, "loss": 0.0, "step": 4600 }, { "epoch": 1.3465027802165643, "grad_norm": 0.02161412313580513, "learning_rate": 1.6337430494585895e-05, "loss": 0.0003, "step": 4601 }, { "epoch": 1.3467954345917472, "grad_norm": 0.03608215972781181, "learning_rate": 1.6330114135206323e-05, "loss": 0.0005, "step": 4602 }, { "epoch": 1.34708808896693, "grad_norm": 5.9469399275258183e-05, "learning_rate": 1.6322797775826747e-05, "loss": 0.0, "step": 4603 }, { "epoch": 1.347380743342113, "grad_norm": 0.00882419478148222, "learning_rate": 1.6315481416447175e-05, "loss": 0.0001, "step": 4604 }, { "epoch": 1.3476733977172959, "grad_norm": 0.20794746279716492, "learning_rate": 1.6308165057067603e-05, "loss": 0.0016, "step": 4605 }, { "epoch": 1.3479660520924788, "grad_norm": 0.0021146859508007765, "learning_rate": 1.630084869768803e-05, "loss": 0.0, "step": 4606 }, { "epoch": 1.3482587064676617, "grad_norm": 0.2945975959300995, "learning_rate": 1.629353233830846e-05, "loss": 0.0027, "step": 4607 }, { "epoch": 1.3485513608428445, "grad_norm": 0.004604853689670563, "learning_rate": 1.6286215978928887e-05, "loss": 0.0001, "step": 4608 }, { "epoch": 1.3488440152180274, "grad_norm": 0.021960347890853882, "learning_rate": 1.627889961954931e-05, "loss": 0.0003, "step": 4609 }, { "epoch": 1.3491366695932103, "grad_norm": 0.003494089236482978, "learning_rate": 1.627158326016974e-05, "loss": 0.0001, "step": 4610 }, { "epoch": 1.3494293239683932, "grad_norm": 0.017365049570798874, "learning_rate": 1.6264266900790167e-05, "loss": 0.0002, "step": 4611 }, { "epoch": 1.349721978343576, "grad_norm": 0.0007535493350587785, "learning_rate": 1.6256950541410595e-05, "loss": 0.0, "step": 4612 }, { "epoch": 1.3500146327187592, "grad_norm": 0.00664918078109622, "learning_rate": 1.6249634182031023e-05, "loss": 0.0001, "step": 4613 }, { "epoch": 1.350307287093942, "grad_norm": 0.8967113494873047, "learning_rate": 1.6242317822651448e-05, "loss": 0.0034, "step": 4614 }, { "epoch": 1.350599941469125, "grad_norm": 0.01985401101410389, "learning_rate": 1.6235001463271876e-05, "loss": 0.0002, "step": 4615 }, { "epoch": 1.3508925958443079, "grad_norm": 0.0015297238714993, "learning_rate": 1.6227685103892304e-05, "loss": 0.0, "step": 4616 }, { "epoch": 1.3511852502194908, "grad_norm": 0.0006844381568953395, "learning_rate": 1.6220368744512732e-05, "loss": 0.0, "step": 4617 }, { "epoch": 1.3514779045946737, "grad_norm": 0.00511653209105134, "learning_rate": 1.621305238513316e-05, "loss": 0.0, "step": 4618 }, { "epoch": 1.3517705589698565, "grad_norm": 0.004954398609697819, "learning_rate": 1.6205736025753584e-05, "loss": 0.0, "step": 4619 }, { "epoch": 1.3520632133450394, "grad_norm": 0.00022442862973548472, "learning_rate": 1.6198419666374012e-05, "loss": 0.0, "step": 4620 }, { "epoch": 1.3523558677202225, "grad_norm": 0.00047761958558112383, "learning_rate": 1.619110330699444e-05, "loss": 0.0, "step": 4621 }, { "epoch": 1.3526485220954054, "grad_norm": 0.027839884161949158, "learning_rate": 1.6183786947614868e-05, "loss": 0.0001, "step": 4622 }, { "epoch": 1.3529411764705883, "grad_norm": 0.00015593717398587614, "learning_rate": 1.6176470588235296e-05, "loss": 0.0, "step": 4623 }, { "epoch": 1.3532338308457712, "grad_norm": 0.0009893402457237244, "learning_rate": 1.6169154228855724e-05, "loss": 0.0, "step": 4624 }, { "epoch": 1.353526485220954, "grad_norm": 0.0024944068863987923, "learning_rate": 1.616183786947615e-05, "loss": 0.0, "step": 4625 }, { "epoch": 1.353819139596137, "grad_norm": 0.00030253714066930115, "learning_rate": 1.6154521510096577e-05, "loss": 0.0, "step": 4626 }, { "epoch": 1.3541117939713199, "grad_norm": 0.0011133461957797408, "learning_rate": 1.6147205150717005e-05, "loss": 0.0, "step": 4627 }, { "epoch": 1.3544044483465028, "grad_norm": 0.0009970914106816053, "learning_rate": 1.6139888791337432e-05, "loss": 0.0, "step": 4628 }, { "epoch": 1.3546971027216856, "grad_norm": 0.0009626855025999248, "learning_rate": 1.613257243195786e-05, "loss": 0.0, "step": 4629 }, { "epoch": 1.3549897570968685, "grad_norm": 0.0006123323692008853, "learning_rate": 1.6125256072578285e-05, "loss": 0.0, "step": 4630 }, { "epoch": 1.3552824114720514, "grad_norm": 0.010217566974461079, "learning_rate": 1.6117939713198713e-05, "loss": 0.0, "step": 4631 }, { "epoch": 1.3555750658472343, "grad_norm": 0.004811934195458889, "learning_rate": 1.611062335381914e-05, "loss": 0.0, "step": 4632 }, { "epoch": 1.3558677202224172, "grad_norm": 0.002419382566586137, "learning_rate": 1.610330699443957e-05, "loss": 0.0, "step": 4633 }, { "epoch": 1.3561603745976003, "grad_norm": 0.04514714330434799, "learning_rate": 1.6095990635059997e-05, "loss": 0.0002, "step": 4634 }, { "epoch": 1.3564530289727832, "grad_norm": 0.00044169966713525355, "learning_rate": 1.608867427568042e-05, "loss": 0.0, "step": 4635 }, { "epoch": 1.356745683347966, "grad_norm": 0.001194591517560184, "learning_rate": 1.608135791630085e-05, "loss": 0.0, "step": 4636 }, { "epoch": 1.357038337723149, "grad_norm": 2.3717167377471924, "learning_rate": 1.6074041556921277e-05, "loss": 0.0027, "step": 4637 }, { "epoch": 1.3573309920983319, "grad_norm": 0.0013330380897969007, "learning_rate": 1.6066725197541705e-05, "loss": 0.0, "step": 4638 }, { "epoch": 1.3576236464735147, "grad_norm": 0.00037112022982910275, "learning_rate": 1.6059408838162133e-05, "loss": 0.0, "step": 4639 }, { "epoch": 1.3579163008486976, "grad_norm": 0.00014045827265363187, "learning_rate": 1.605209247878256e-05, "loss": 0.0, "step": 4640 }, { "epoch": 1.3582089552238805, "grad_norm": 2.3475875854492188, "learning_rate": 1.6044776119402986e-05, "loss": 0.1643, "step": 4641 }, { "epoch": 1.3585016095990636, "grad_norm": 0.0006882725283503532, "learning_rate": 1.6037459760023414e-05, "loss": 0.0, "step": 4642 }, { "epoch": 1.3587942639742465, "grad_norm": 0.0011248913360759616, "learning_rate": 1.603014340064384e-05, "loss": 0.0, "step": 4643 }, { "epoch": 1.3590869183494294, "grad_norm": 0.0014706410001963377, "learning_rate": 1.602282704126427e-05, "loss": 0.0, "step": 4644 }, { "epoch": 1.3593795727246123, "grad_norm": 0.016344530507922173, "learning_rate": 1.6015510681884697e-05, "loss": 0.0001, "step": 4645 }, { "epoch": 1.3596722270997952, "grad_norm": 0.0013378773583099246, "learning_rate": 1.6008194322505122e-05, "loss": 0.0, "step": 4646 }, { "epoch": 1.359964881474978, "grad_norm": 0.0006376465316861868, "learning_rate": 1.600087796312555e-05, "loss": 0.0, "step": 4647 }, { "epoch": 1.360257535850161, "grad_norm": 0.0018161243060603738, "learning_rate": 1.5993561603745978e-05, "loss": 0.0, "step": 4648 }, { "epoch": 1.3605501902253438, "grad_norm": 0.9918575286865234, "learning_rate": 1.5986245244366406e-05, "loss": 0.0012, "step": 4649 }, { "epoch": 1.3608428446005267, "grad_norm": 0.0007200206164270639, "learning_rate": 1.5978928884986834e-05, "loss": 0.0, "step": 4650 }, { "epoch": 1.3611354989757096, "grad_norm": 0.03315776214003563, "learning_rate": 1.597161252560726e-05, "loss": 0.0003, "step": 4651 }, { "epoch": 1.3614281533508925, "grad_norm": 0.013876295648515224, "learning_rate": 1.5964296166227686e-05, "loss": 0.0002, "step": 4652 }, { "epoch": 1.3617208077260754, "grad_norm": 0.02295016311109066, "learning_rate": 1.5956979806848114e-05, "loss": 0.0002, "step": 4653 }, { "epoch": 1.3620134621012583, "grad_norm": 4.472840309143066, "learning_rate": 1.5949663447468542e-05, "loss": 0.2316, "step": 4654 }, { "epoch": 1.3623061164764414, "grad_norm": 2.750107765197754, "learning_rate": 1.594234708808897e-05, "loss": 0.0357, "step": 4655 }, { "epoch": 1.3625987708516243, "grad_norm": 0.0004565751878544688, "learning_rate": 1.5935030728709395e-05, "loss": 0.0, "step": 4656 }, { "epoch": 1.3628914252268072, "grad_norm": 0.0007538439822383225, "learning_rate": 1.5927714369329823e-05, "loss": 0.0, "step": 4657 }, { "epoch": 1.36318407960199, "grad_norm": 0.0005997567204758525, "learning_rate": 1.592039800995025e-05, "loss": 0.0, "step": 4658 }, { "epoch": 1.363476733977173, "grad_norm": 0.0017027143621817231, "learning_rate": 1.591308165057068e-05, "loss": 0.0, "step": 4659 }, { "epoch": 1.3637693883523558, "grad_norm": 0.04539995267987251, "learning_rate": 1.5905765291191103e-05, "loss": 0.0005, "step": 4660 }, { "epoch": 1.3640620427275387, "grad_norm": 0.0005709021934308112, "learning_rate": 1.589844893181153e-05, "loss": 0.0, "step": 4661 }, { "epoch": 1.3643546971027216, "grad_norm": 0.0025214501656591892, "learning_rate": 1.589113257243196e-05, "loss": 0.0, "step": 4662 }, { "epoch": 1.3646473514779047, "grad_norm": 0.0008173706009984016, "learning_rate": 1.5883816213052387e-05, "loss": 0.0, "step": 4663 }, { "epoch": 1.3649400058530876, "grad_norm": 0.0025939266197383404, "learning_rate": 1.587649985367281e-05, "loss": 0.0001, "step": 4664 }, { "epoch": 1.3652326602282705, "grad_norm": 0.0337810143828392, "learning_rate": 1.586918349429324e-05, "loss": 0.0003, "step": 4665 }, { "epoch": 1.3655253146034534, "grad_norm": 0.0031624024268239737, "learning_rate": 1.5861867134913667e-05, "loss": 0.0001, "step": 4666 }, { "epoch": 1.3658179689786363, "grad_norm": 0.0024290585424751043, "learning_rate": 1.5854550775534095e-05, "loss": 0.0, "step": 4667 }, { "epoch": 1.3661106233538192, "grad_norm": 0.0005581422010436654, "learning_rate": 1.584723441615452e-05, "loss": 0.0, "step": 4668 }, { "epoch": 1.366403277729002, "grad_norm": 0.0010313043603673577, "learning_rate": 1.5839918056774948e-05, "loss": 0.0, "step": 4669 }, { "epoch": 1.366695932104185, "grad_norm": 0.059076376259326935, "learning_rate": 1.5832601697395376e-05, "loss": 0.0003, "step": 4670 }, { "epoch": 1.3669885864793678, "grad_norm": 0.004700097721070051, "learning_rate": 1.5825285338015804e-05, "loss": 0.0001, "step": 4671 }, { "epoch": 1.3672812408545507, "grad_norm": 0.004779118578881025, "learning_rate": 1.581796897863623e-05, "loss": 0.0001, "step": 4672 }, { "epoch": 1.3675738952297336, "grad_norm": 0.002353015821427107, "learning_rate": 1.5810652619256656e-05, "loss": 0.0, "step": 4673 }, { "epoch": 1.3678665496049165, "grad_norm": 5.330374717712402, "learning_rate": 1.5803336259877084e-05, "loss": 0.235, "step": 4674 }, { "epoch": 1.3681592039800994, "grad_norm": 4.413346767425537, "learning_rate": 1.5796019900497512e-05, "loss": 0.256, "step": 4675 }, { "epoch": 1.3684518583552825, "grad_norm": 0.0033523140009492636, "learning_rate": 1.578870354111794e-05, "loss": 0.0001, "step": 4676 }, { "epoch": 1.3687445127304654, "grad_norm": 0.0006485573248937726, "learning_rate": 1.5781387181738368e-05, "loss": 0.0, "step": 4677 }, { "epoch": 1.3690371671056483, "grad_norm": 0.003252198686823249, "learning_rate": 1.5774070822358793e-05, "loss": 0.0001, "step": 4678 }, { "epoch": 1.3693298214808312, "grad_norm": 0.0008407050045207143, "learning_rate": 1.576675446297922e-05, "loss": 0.0, "step": 4679 }, { "epoch": 1.369622475856014, "grad_norm": 0.001850403961725533, "learning_rate": 1.575943810359965e-05, "loss": 0.0, "step": 4680 }, { "epoch": 1.369915130231197, "grad_norm": 0.16275891661643982, "learning_rate": 1.5752121744220077e-05, "loss": 0.0009, "step": 4681 }, { "epoch": 1.3702077846063798, "grad_norm": 0.29024508595466614, "learning_rate": 1.5744805384840504e-05, "loss": 0.0015, "step": 4682 }, { "epoch": 1.3705004389815627, "grad_norm": 0.8833554983139038, "learning_rate": 1.573748902546093e-05, "loss": 0.0039, "step": 4683 }, { "epoch": 1.3707930933567458, "grad_norm": 0.003219273639842868, "learning_rate": 1.5730172666081357e-05, "loss": 0.0001, "step": 4684 }, { "epoch": 1.3710857477319287, "grad_norm": 0.005362699273973703, "learning_rate": 1.5722856306701785e-05, "loss": 0.0001, "step": 4685 }, { "epoch": 1.3713784021071116, "grad_norm": 0.008708633482456207, "learning_rate": 1.5715539947322213e-05, "loss": 0.0002, "step": 4686 }, { "epoch": 1.3716710564822945, "grad_norm": 0.018540022894740105, "learning_rate": 1.570822358794264e-05, "loss": 0.0002, "step": 4687 }, { "epoch": 1.3719637108574774, "grad_norm": 0.007677059154957533, "learning_rate": 1.5700907228563065e-05, "loss": 0.0001, "step": 4688 }, { "epoch": 1.3722563652326603, "grad_norm": 0.0029118761885911226, "learning_rate": 1.5693590869183493e-05, "loss": 0.0001, "step": 4689 }, { "epoch": 1.3725490196078431, "grad_norm": 0.005615610629320145, "learning_rate": 1.568627450980392e-05, "loss": 0.0001, "step": 4690 }, { "epoch": 1.372841673983026, "grad_norm": 0.00372922420501709, "learning_rate": 1.567895815042435e-05, "loss": 0.0001, "step": 4691 }, { "epoch": 1.373134328358209, "grad_norm": 0.0014681870816275477, "learning_rate": 1.5671641791044777e-05, "loss": 0.0, "step": 4692 }, { "epoch": 1.3734269827333918, "grad_norm": 0.0036305561661720276, "learning_rate": 1.5664325431665205e-05, "loss": 0.0001, "step": 4693 }, { "epoch": 1.3737196371085747, "grad_norm": 0.003226341214030981, "learning_rate": 1.565700907228563e-05, "loss": 0.0001, "step": 4694 }, { "epoch": 1.3740122914837576, "grad_norm": 0.0033313813619315624, "learning_rate": 1.5649692712906058e-05, "loss": 0.0001, "step": 4695 }, { "epoch": 1.3743049458589405, "grad_norm": 0.002889020601287484, "learning_rate": 1.5642376353526486e-05, "loss": 0.0001, "step": 4696 }, { "epoch": 1.3745976002341234, "grad_norm": 0.001413546153344214, "learning_rate": 1.5635059994146914e-05, "loss": 0.0, "step": 4697 }, { "epoch": 1.3748902546093065, "grad_norm": 0.02051861211657524, "learning_rate": 1.562774363476734e-05, "loss": 0.0002, "step": 4698 }, { "epoch": 1.3751829089844894, "grad_norm": 0.02124558761715889, "learning_rate": 1.5620427275387766e-05, "loss": 0.0002, "step": 4699 }, { "epoch": 1.3754755633596722, "grad_norm": 0.042467426508665085, "learning_rate": 1.5613110916008194e-05, "loss": 0.0004, "step": 4700 }, { "epoch": 1.3757682177348551, "grad_norm": 1.1788674592971802, "learning_rate": 1.5605794556628622e-05, "loss": 0.0036, "step": 4701 }, { "epoch": 1.376060872110038, "grad_norm": 0.005982064642012119, "learning_rate": 1.559847819724905e-05, "loss": 0.0001, "step": 4702 }, { "epoch": 1.376353526485221, "grad_norm": 0.003018517279997468, "learning_rate": 1.5591161837869478e-05, "loss": 0.0001, "step": 4703 }, { "epoch": 1.3766461808604038, "grad_norm": 0.0016903309151530266, "learning_rate": 1.5583845478489902e-05, "loss": 0.0, "step": 4704 }, { "epoch": 1.3769388352355867, "grad_norm": 0.0051790825091302395, "learning_rate": 1.557652911911033e-05, "loss": 0.0001, "step": 4705 }, { "epoch": 1.3772314896107698, "grad_norm": 0.0019127910491079092, "learning_rate": 1.556921275973076e-05, "loss": 0.0, "step": 4706 }, { "epoch": 1.3775241439859527, "grad_norm": 0.0031322145368903875, "learning_rate": 1.5561896400351186e-05, "loss": 0.0001, "step": 4707 }, { "epoch": 1.3778167983611356, "grad_norm": 0.004797694273293018, "learning_rate": 1.5554580040971614e-05, "loss": 0.0001, "step": 4708 }, { "epoch": 1.3781094527363185, "grad_norm": 0.0022557724732905626, "learning_rate": 1.554726368159204e-05, "loss": 0.0001, "step": 4709 }, { "epoch": 1.3784021071115014, "grad_norm": 0.0006947844522073865, "learning_rate": 1.5539947322212467e-05, "loss": 0.0, "step": 4710 }, { "epoch": 1.3786947614866842, "grad_norm": 0.003478395752608776, "learning_rate": 1.5532630962832895e-05, "loss": 0.0001, "step": 4711 }, { "epoch": 1.3789874158618671, "grad_norm": 0.00031348259653896093, "learning_rate": 1.5525314603453323e-05, "loss": 0.0, "step": 4712 }, { "epoch": 1.37928007023705, "grad_norm": 0.0031743720173835754, "learning_rate": 1.551799824407375e-05, "loss": 0.0001, "step": 4713 }, { "epoch": 1.379572724612233, "grad_norm": 0.0017899590311571956, "learning_rate": 1.551068188469418e-05, "loss": 0.0, "step": 4714 }, { "epoch": 1.3798653789874158, "grad_norm": 0.0030690422281622887, "learning_rate": 1.5503365525314603e-05, "loss": 0.0, "step": 4715 }, { "epoch": 1.3801580333625987, "grad_norm": 0.007692730985581875, "learning_rate": 1.549604916593503e-05, "loss": 0.0001, "step": 4716 }, { "epoch": 1.3804506877377816, "grad_norm": 0.04855996370315552, "learning_rate": 1.548873280655546e-05, "loss": 0.0004, "step": 4717 }, { "epoch": 1.3807433421129645, "grad_norm": 0.007183206267654896, "learning_rate": 1.5481416447175887e-05, "loss": 0.0001, "step": 4718 }, { "epoch": 1.3810359964881476, "grad_norm": 0.001024767872877419, "learning_rate": 1.5474100087796315e-05, "loss": 0.0, "step": 4719 }, { "epoch": 1.3813286508633305, "grad_norm": 0.003089958569034934, "learning_rate": 1.546678372841674e-05, "loss": 0.0001, "step": 4720 }, { "epoch": 1.3816213052385133, "grad_norm": 0.010393207892775536, "learning_rate": 1.5459467369037167e-05, "loss": 0.0001, "step": 4721 }, { "epoch": 1.3819139596136962, "grad_norm": 0.03264763206243515, "learning_rate": 1.5452151009657595e-05, "loss": 0.0003, "step": 4722 }, { "epoch": 1.3822066139888791, "grad_norm": 0.0018647081451490521, "learning_rate": 1.5444834650278023e-05, "loss": 0.0001, "step": 4723 }, { "epoch": 1.382499268364062, "grad_norm": 0.001332324231043458, "learning_rate": 1.543751829089845e-05, "loss": 0.0, "step": 4724 }, { "epoch": 1.382791922739245, "grad_norm": 0.015286540612578392, "learning_rate": 1.5430201931518876e-05, "loss": 0.0001, "step": 4725 }, { "epoch": 1.3830845771144278, "grad_norm": 0.0017781606875360012, "learning_rate": 1.5422885572139304e-05, "loss": 0.0, "step": 4726 }, { "epoch": 1.383377231489611, "grad_norm": 0.0014070915058255196, "learning_rate": 1.5415569212759732e-05, "loss": 0.0, "step": 4727 }, { "epoch": 1.3836698858647938, "grad_norm": 0.0006733147893100977, "learning_rate": 1.540825285338016e-05, "loss": 0.0, "step": 4728 }, { "epoch": 1.3839625402399767, "grad_norm": 0.001550506567582488, "learning_rate": 1.5400936494000588e-05, "loss": 0.0, "step": 4729 }, { "epoch": 1.3842551946151596, "grad_norm": 0.0058020418509840965, "learning_rate": 1.5393620134621016e-05, "loss": 0.0001, "step": 4730 }, { "epoch": 1.3845478489903424, "grad_norm": 0.005312574096024036, "learning_rate": 1.538630377524144e-05, "loss": 0.0001, "step": 4731 }, { "epoch": 1.3848405033655253, "grad_norm": 0.003337213536724448, "learning_rate": 1.5378987415861868e-05, "loss": 0.0, "step": 4732 }, { "epoch": 1.3851331577407082, "grad_norm": 0.0022060826886445284, "learning_rate": 1.5371671056482296e-05, "loss": 0.0, "step": 4733 }, { "epoch": 1.385425812115891, "grad_norm": 0.0006479641306214035, "learning_rate": 1.5364354697102724e-05, "loss": 0.0, "step": 4734 }, { "epoch": 1.385718466491074, "grad_norm": 0.0009419500129297376, "learning_rate": 1.5357038337723152e-05, "loss": 0.0, "step": 4735 }, { "epoch": 1.3860111208662569, "grad_norm": 0.0012897298438474536, "learning_rate": 1.5349721978343577e-05, "loss": 0.0, "step": 4736 }, { "epoch": 1.3863037752414398, "grad_norm": 0.0025886131916195154, "learning_rate": 1.5342405618964004e-05, "loss": 0.0001, "step": 4737 }, { "epoch": 1.3865964296166227, "grad_norm": 0.0006920943851582706, "learning_rate": 1.5335089259584432e-05, "loss": 0.0, "step": 4738 }, { "epoch": 1.3868890839918055, "grad_norm": 0.0004685150342993438, "learning_rate": 1.532777290020486e-05, "loss": 0.0, "step": 4739 }, { "epoch": 1.3871817383669887, "grad_norm": 0.000487115845317021, "learning_rate": 1.532045654082529e-05, "loss": 0.0, "step": 4740 }, { "epoch": 1.3874743927421715, "grad_norm": 0.00047260106657631695, "learning_rate": 1.5313140181445713e-05, "loss": 0.0, "step": 4741 }, { "epoch": 1.3877670471173544, "grad_norm": 0.008047452196478844, "learning_rate": 1.530582382206614e-05, "loss": 0.0001, "step": 4742 }, { "epoch": 1.3880597014925373, "grad_norm": 0.02765766903758049, "learning_rate": 1.529850746268657e-05, "loss": 0.0002, "step": 4743 }, { "epoch": 1.3883523558677202, "grad_norm": 0.0014313700376078486, "learning_rate": 1.5291191103306997e-05, "loss": 0.0, "step": 4744 }, { "epoch": 1.388645010242903, "grad_norm": 0.21554401516914368, "learning_rate": 1.5283874743927425e-05, "loss": 0.0008, "step": 4745 }, { "epoch": 1.388937664618086, "grad_norm": 0.002505704527720809, "learning_rate": 1.5276558384547853e-05, "loss": 0.0, "step": 4746 }, { "epoch": 1.3892303189932689, "grad_norm": 0.0006845356547273695, "learning_rate": 1.5269242025168277e-05, "loss": 0.0, "step": 4747 }, { "epoch": 1.389522973368452, "grad_norm": 0.0009476763661950827, "learning_rate": 1.5261925665788705e-05, "loss": 0.0, "step": 4748 }, { "epoch": 1.3898156277436349, "grad_norm": 0.019107108935713768, "learning_rate": 1.5254609306409131e-05, "loss": 0.0002, "step": 4749 }, { "epoch": 1.3901082821188178, "grad_norm": 0.0009891375666484237, "learning_rate": 1.524729294702956e-05, "loss": 0.0, "step": 4750 }, { "epoch": 1.3904009364940006, "grad_norm": 0.0008806857513263822, "learning_rate": 1.5239976587649987e-05, "loss": 0.0, "step": 4751 }, { "epoch": 1.3906935908691835, "grad_norm": 0.0010386556386947632, "learning_rate": 1.5232660228270412e-05, "loss": 0.0, "step": 4752 }, { "epoch": 1.3909862452443664, "grad_norm": 0.001391564728692174, "learning_rate": 1.522534386889084e-05, "loss": 0.0, "step": 4753 }, { "epoch": 1.3912788996195493, "grad_norm": 0.00067763717379421, "learning_rate": 1.5218027509511268e-05, "loss": 0.0, "step": 4754 }, { "epoch": 1.3915715539947322, "grad_norm": 0.000680471770465374, "learning_rate": 1.5210711150131696e-05, "loss": 0.0, "step": 4755 }, { "epoch": 1.391864208369915, "grad_norm": 0.001424195826984942, "learning_rate": 1.5203394790752124e-05, "loss": 0.0, "step": 4756 }, { "epoch": 1.392156862745098, "grad_norm": 0.0004938715137541294, "learning_rate": 1.5196078431372548e-05, "loss": 0.0, "step": 4757 }, { "epoch": 1.3924495171202809, "grad_norm": 0.0009264610707759857, "learning_rate": 1.5188762071992976e-05, "loss": 0.0, "step": 4758 }, { "epoch": 1.3927421714954638, "grad_norm": 0.024234339594841003, "learning_rate": 1.5181445712613404e-05, "loss": 0.0002, "step": 4759 }, { "epoch": 1.3930348258706466, "grad_norm": 0.0010981353698298335, "learning_rate": 1.5174129353233832e-05, "loss": 0.0, "step": 4760 }, { "epoch": 1.3933274802458298, "grad_norm": 0.0005907249869778752, "learning_rate": 1.516681299385426e-05, "loss": 0.0, "step": 4761 }, { "epoch": 1.3936201346210126, "grad_norm": 25.285842895507812, "learning_rate": 1.5159496634474688e-05, "loss": 0.0793, "step": 4762 }, { "epoch": 1.3939127889961955, "grad_norm": 0.0005344810779206455, "learning_rate": 1.5152180275095113e-05, "loss": 0.0, "step": 4763 }, { "epoch": 1.3942054433713784, "grad_norm": 0.003299710340797901, "learning_rate": 1.514486391571554e-05, "loss": 0.0001, "step": 4764 }, { "epoch": 1.3944980977465613, "grad_norm": 0.0008219130686484277, "learning_rate": 1.5137547556335968e-05, "loss": 0.0, "step": 4765 }, { "epoch": 1.3947907521217442, "grad_norm": 0.013707981444895267, "learning_rate": 1.5130231196956396e-05, "loss": 0.0001, "step": 4766 }, { "epoch": 1.395083406496927, "grad_norm": 0.0005613876855932176, "learning_rate": 1.5122914837576824e-05, "loss": 0.0, "step": 4767 }, { "epoch": 1.39537606087211, "grad_norm": 0.0008589632343500853, "learning_rate": 1.5115598478197249e-05, "loss": 0.0, "step": 4768 }, { "epoch": 1.395668715247293, "grad_norm": 0.0007482717046514153, "learning_rate": 1.5108282118817677e-05, "loss": 0.0, "step": 4769 }, { "epoch": 1.395961369622476, "grad_norm": 0.0007141092792153358, "learning_rate": 1.5100965759438105e-05, "loss": 0.0, "step": 4770 }, { "epoch": 1.3962540239976589, "grad_norm": 0.0016803006874397397, "learning_rate": 1.5093649400058533e-05, "loss": 0.0, "step": 4771 }, { "epoch": 1.3965466783728417, "grad_norm": 0.0020187615882605314, "learning_rate": 1.508633304067896e-05, "loss": 0.0, "step": 4772 }, { "epoch": 1.3968393327480246, "grad_norm": 0.00683267368003726, "learning_rate": 1.5079016681299385e-05, "loss": 0.0001, "step": 4773 }, { "epoch": 1.3971319871232075, "grad_norm": 0.00163374247495085, "learning_rate": 1.5071700321919813e-05, "loss": 0.0, "step": 4774 }, { "epoch": 1.3974246414983904, "grad_norm": 0.001296279369853437, "learning_rate": 1.5064383962540241e-05, "loss": 0.0, "step": 4775 }, { "epoch": 1.3977172958735733, "grad_norm": 0.013661934062838554, "learning_rate": 1.5057067603160669e-05, "loss": 0.0002, "step": 4776 }, { "epoch": 1.3980099502487562, "grad_norm": 0.053470246493816376, "learning_rate": 1.5049751243781095e-05, "loss": 0.0002, "step": 4777 }, { "epoch": 1.398302604623939, "grad_norm": 0.0011320695048198104, "learning_rate": 1.5042434884401522e-05, "loss": 0.0, "step": 4778 }, { "epoch": 1.398595258999122, "grad_norm": 0.0019740501884371042, "learning_rate": 1.503511852502195e-05, "loss": 0.0, "step": 4779 }, { "epoch": 1.3988879133743048, "grad_norm": 0.0005951537750661373, "learning_rate": 1.5027802165642378e-05, "loss": 0.0, "step": 4780 }, { "epoch": 1.3991805677494877, "grad_norm": 0.0020884210243821144, "learning_rate": 1.5020485806262804e-05, "loss": 0.0, "step": 4781 }, { "epoch": 1.3994732221246708, "grad_norm": 0.0016150816809386015, "learning_rate": 1.5013169446883232e-05, "loss": 0.0, "step": 4782 }, { "epoch": 1.3997658764998537, "grad_norm": 0.0014983267756178975, "learning_rate": 1.500585308750366e-05, "loss": 0.0, "step": 4783 }, { "epoch": 1.4000585308750366, "grad_norm": 0.004859476815909147, "learning_rate": 1.4998536728124086e-05, "loss": 0.0001, "step": 4784 }, { "epoch": 1.4003511852502195, "grad_norm": 0.0011176099069416523, "learning_rate": 1.4991220368744512e-05, "loss": 0.0, "step": 4785 }, { "epoch": 1.4006438396254024, "grad_norm": 0.003072738880291581, "learning_rate": 1.498390400936494e-05, "loss": 0.0001, "step": 4786 }, { "epoch": 1.4009364940005853, "grad_norm": 0.0038070150185376406, "learning_rate": 1.4976587649985368e-05, "loss": 0.0001, "step": 4787 }, { "epoch": 1.4012291483757682, "grad_norm": 0.0016997898928821087, "learning_rate": 1.4969271290605796e-05, "loss": 0.0, "step": 4788 }, { "epoch": 1.401521802750951, "grad_norm": 0.0010296344989910722, "learning_rate": 1.496195493122622e-05, "loss": 0.0, "step": 4789 }, { "epoch": 1.4018144571261342, "grad_norm": 0.00441362801939249, "learning_rate": 1.4954638571846649e-05, "loss": 0.0001, "step": 4790 }, { "epoch": 1.402107111501317, "grad_norm": 0.0014113447396084666, "learning_rate": 1.4947322212467077e-05, "loss": 0.0, "step": 4791 }, { "epoch": 1.4023997658765, "grad_norm": 0.008393810130655766, "learning_rate": 1.4940005853087504e-05, "loss": 0.0001, "step": 4792 }, { "epoch": 1.4026924202516828, "grad_norm": 10.538448333740234, "learning_rate": 1.4932689493707932e-05, "loss": 0.0358, "step": 4793 }, { "epoch": 1.4029850746268657, "grad_norm": 0.0042242067866027355, "learning_rate": 1.4925373134328357e-05, "loss": 0.0001, "step": 4794 }, { "epoch": 1.4032777290020486, "grad_norm": 0.0030644184444099665, "learning_rate": 1.4918056774948785e-05, "loss": 0.0, "step": 4795 }, { "epoch": 1.4035703833772315, "grad_norm": 0.0134981544688344, "learning_rate": 1.4910740415569213e-05, "loss": 0.0001, "step": 4796 }, { "epoch": 1.4038630377524144, "grad_norm": 0.0011584991589188576, "learning_rate": 1.4903424056189641e-05, "loss": 0.0, "step": 4797 }, { "epoch": 1.4041556921275973, "grad_norm": 0.0006236496847122908, "learning_rate": 1.4896107696810069e-05, "loss": 0.0, "step": 4798 }, { "epoch": 1.4044483465027802, "grad_norm": 0.0005023694830015302, "learning_rate": 1.4888791337430497e-05, "loss": 0.0, "step": 4799 }, { "epoch": 1.404741000877963, "grad_norm": 0.0009039943688549101, "learning_rate": 1.4881474978050921e-05, "loss": 0.0, "step": 4800 }, { "epoch": 1.405033655253146, "grad_norm": 0.0020111447665840387, "learning_rate": 1.487415861867135e-05, "loss": 0.0, "step": 4801 }, { "epoch": 1.4053263096283288, "grad_norm": 0.001803457853384316, "learning_rate": 1.4866842259291777e-05, "loss": 0.0, "step": 4802 }, { "epoch": 1.4056189640035117, "grad_norm": 0.0007646935409866273, "learning_rate": 1.4859525899912205e-05, "loss": 0.0, "step": 4803 }, { "epoch": 1.4059116183786948, "grad_norm": 0.0012778372038155794, "learning_rate": 1.4852209540532633e-05, "loss": 0.0, "step": 4804 }, { "epoch": 1.4062042727538777, "grad_norm": 0.0005517517565749586, "learning_rate": 1.4844893181153058e-05, "loss": 0.0, "step": 4805 }, { "epoch": 1.4064969271290606, "grad_norm": 0.004605107940733433, "learning_rate": 1.4837576821773486e-05, "loss": 0.0001, "step": 4806 }, { "epoch": 1.4067895815042435, "grad_norm": 0.0006094065029174089, "learning_rate": 1.4830260462393914e-05, "loss": 0.0, "step": 4807 }, { "epoch": 1.4070822358794264, "grad_norm": 0.0008657328435219824, "learning_rate": 1.4822944103014342e-05, "loss": 0.0, "step": 4808 }, { "epoch": 1.4073748902546093, "grad_norm": 0.002067999681457877, "learning_rate": 1.481562774363477e-05, "loss": 0.0, "step": 4809 }, { "epoch": 1.4076675446297922, "grad_norm": 0.0011860699160024524, "learning_rate": 1.4808311384255194e-05, "loss": 0.0, "step": 4810 }, { "epoch": 1.407960199004975, "grad_norm": 0.00043056573485955596, "learning_rate": 1.4800995024875622e-05, "loss": 0.0, "step": 4811 }, { "epoch": 1.4082528533801582, "grad_norm": 0.0012745216954499483, "learning_rate": 1.479367866549605e-05, "loss": 0.0, "step": 4812 }, { "epoch": 1.408545507755341, "grad_norm": 0.0004913328448310494, "learning_rate": 1.4786362306116478e-05, "loss": 0.0, "step": 4813 }, { "epoch": 1.408838162130524, "grad_norm": 0.0008683112100698054, "learning_rate": 1.4779045946736906e-05, "loss": 0.0, "step": 4814 }, { "epoch": 1.4091308165057068, "grad_norm": 0.0005154652753844857, "learning_rate": 1.4771729587357334e-05, "loss": 0.0, "step": 4815 }, { "epoch": 1.4094234708808897, "grad_norm": 0.0005342514486983418, "learning_rate": 1.4764413227977758e-05, "loss": 0.0, "step": 4816 }, { "epoch": 1.4097161252560726, "grad_norm": 0.0017309903632849455, "learning_rate": 1.4757096868598186e-05, "loss": 0.0, "step": 4817 }, { "epoch": 1.4100087796312555, "grad_norm": 0.0008248227532021701, "learning_rate": 1.4749780509218614e-05, "loss": 0.0, "step": 4818 }, { "epoch": 1.4103014340064384, "grad_norm": 0.000505580217577517, "learning_rate": 1.4742464149839042e-05, "loss": 0.0, "step": 4819 }, { "epoch": 1.4105940883816213, "grad_norm": 0.0016366797499358654, "learning_rate": 1.4735147790459468e-05, "loss": 0.0, "step": 4820 }, { "epoch": 1.4108867427568041, "grad_norm": 0.0010418167803436518, "learning_rate": 1.4727831431079895e-05, "loss": 0.0, "step": 4821 }, { "epoch": 1.411179397131987, "grad_norm": 0.0002579023130238056, "learning_rate": 1.4720515071700323e-05, "loss": 0.0, "step": 4822 }, { "epoch": 1.41147205150717, "grad_norm": 0.00028388010105118155, "learning_rate": 1.471319871232075e-05, "loss": 0.0, "step": 4823 }, { "epoch": 1.4117647058823528, "grad_norm": 0.0003754272765945643, "learning_rate": 1.4705882352941177e-05, "loss": 0.0, "step": 4824 }, { "epoch": 1.412057360257536, "grad_norm": 0.007686973083764315, "learning_rate": 1.4698565993561605e-05, "loss": 0.0001, "step": 4825 }, { "epoch": 1.4123500146327188, "grad_norm": 0.0014296626904979348, "learning_rate": 1.4691249634182031e-05, "loss": 0.0, "step": 4826 }, { "epoch": 1.4126426690079017, "grad_norm": 0.000988753279671073, "learning_rate": 1.4683933274802459e-05, "loss": 0.0, "step": 4827 }, { "epoch": 1.4129353233830846, "grad_norm": 0.005212455987930298, "learning_rate": 1.4676616915422885e-05, "loss": 0.0001, "step": 4828 }, { "epoch": 1.4132279777582675, "grad_norm": 0.0006819405243732035, "learning_rate": 1.4669300556043313e-05, "loss": 0.0, "step": 4829 }, { "epoch": 1.4135206321334504, "grad_norm": 0.0005956888780929148, "learning_rate": 1.4661984196663741e-05, "loss": 0.0, "step": 4830 }, { "epoch": 1.4138132865086332, "grad_norm": 0.0010589334415271878, "learning_rate": 1.4654667837284169e-05, "loss": 0.0, "step": 4831 }, { "epoch": 1.4141059408838161, "grad_norm": 0.0024124777410179377, "learning_rate": 1.4647351477904594e-05, "loss": 0.0, "step": 4832 }, { "epoch": 1.4143985952589992, "grad_norm": 0.0024167487863451242, "learning_rate": 1.4640035118525022e-05, "loss": 0.0, "step": 4833 }, { "epoch": 1.4146912496341821, "grad_norm": 0.001746480236761272, "learning_rate": 1.463271875914545e-05, "loss": 0.0, "step": 4834 }, { "epoch": 1.414983904009365, "grad_norm": 0.0018156811129301786, "learning_rate": 1.4625402399765878e-05, "loss": 0.0, "step": 4835 }, { "epoch": 1.415276558384548, "grad_norm": 0.0006777496309950948, "learning_rate": 1.4618086040386306e-05, "loss": 0.0, "step": 4836 }, { "epoch": 1.4155692127597308, "grad_norm": 0.0018828504253178835, "learning_rate": 1.461076968100673e-05, "loss": 0.0, "step": 4837 }, { "epoch": 1.4158618671349137, "grad_norm": 0.0006251346203498542, "learning_rate": 1.4603453321627158e-05, "loss": 0.0, "step": 4838 }, { "epoch": 1.4161545215100966, "grad_norm": 0.0006089691305533051, "learning_rate": 1.4596136962247586e-05, "loss": 0.0, "step": 4839 }, { "epoch": 1.4164471758852795, "grad_norm": 0.0014087685849517584, "learning_rate": 1.4588820602868014e-05, "loss": 0.0, "step": 4840 }, { "epoch": 1.4167398302604624, "grad_norm": 0.0006932232645340264, "learning_rate": 1.4581504243488442e-05, "loss": 0.0, "step": 4841 }, { "epoch": 1.4170324846356452, "grad_norm": 0.0017885544802993536, "learning_rate": 1.4574187884108866e-05, "loss": 0.0, "step": 4842 }, { "epoch": 1.4173251390108281, "grad_norm": 0.00011074745270889252, "learning_rate": 1.4566871524729294e-05, "loss": 0.0, "step": 4843 }, { "epoch": 1.417617793386011, "grad_norm": 0.0006461992743425071, "learning_rate": 1.4559555165349722e-05, "loss": 0.0, "step": 4844 }, { "epoch": 1.417910447761194, "grad_norm": 0.002514176769182086, "learning_rate": 1.455223880597015e-05, "loss": 0.0001, "step": 4845 }, { "epoch": 1.418203102136377, "grad_norm": 0.0006553163984790444, "learning_rate": 1.4544922446590578e-05, "loss": 0.0, "step": 4846 }, { "epoch": 1.41849575651156, "grad_norm": 0.0013165060663595796, "learning_rate": 1.4537606087211003e-05, "loss": 0.0, "step": 4847 }, { "epoch": 1.4187884108867428, "grad_norm": 0.0023327763192355633, "learning_rate": 1.453028972783143e-05, "loss": 0.0, "step": 4848 }, { "epoch": 1.4190810652619257, "grad_norm": 0.0013811510289087892, "learning_rate": 1.4522973368451859e-05, "loss": 0.0, "step": 4849 }, { "epoch": 1.4193737196371086, "grad_norm": 0.00045944369048811495, "learning_rate": 1.4515657009072287e-05, "loss": 0.0, "step": 4850 }, { "epoch": 1.4196663740122915, "grad_norm": 0.0009314783965237439, "learning_rate": 1.4508340649692715e-05, "loss": 0.0, "step": 4851 }, { "epoch": 1.4199590283874743, "grad_norm": 0.00024277539341710508, "learning_rate": 1.4501024290313143e-05, "loss": 0.0, "step": 4852 }, { "epoch": 1.4202516827626572, "grad_norm": 0.0004942430532537401, "learning_rate": 1.4493707930933567e-05, "loss": 0.0, "step": 4853 }, { "epoch": 1.4205443371378403, "grad_norm": 0.0028814657125622034, "learning_rate": 1.4486391571553995e-05, "loss": 0.0, "step": 4854 }, { "epoch": 1.4208369915130232, "grad_norm": 0.001684304908849299, "learning_rate": 1.4479075212174423e-05, "loss": 0.0, "step": 4855 }, { "epoch": 1.4211296458882061, "grad_norm": 0.0008666531648486853, "learning_rate": 1.4471758852794851e-05, "loss": 0.0, "step": 4856 }, { "epoch": 1.421422300263389, "grad_norm": 0.0023896319326013327, "learning_rate": 1.4464442493415279e-05, "loss": 0.0, "step": 4857 }, { "epoch": 1.421714954638572, "grad_norm": 0.0006379640544764698, "learning_rate": 1.4457126134035703e-05, "loss": 0.0, "step": 4858 }, { "epoch": 1.4220076090137548, "grad_norm": 0.0034662364050745964, "learning_rate": 1.4449809774656131e-05, "loss": 0.0001, "step": 4859 }, { "epoch": 1.4223002633889377, "grad_norm": 0.40443992614746094, "learning_rate": 1.444249341527656e-05, "loss": 0.0012, "step": 4860 }, { "epoch": 1.4225929177641206, "grad_norm": 0.0009623516816645861, "learning_rate": 1.4435177055896987e-05, "loss": 0.0, "step": 4861 }, { "epoch": 1.4228855721393034, "grad_norm": 0.004101148806512356, "learning_rate": 1.4427860696517415e-05, "loss": 0.0, "step": 4862 }, { "epoch": 1.4231782265144863, "grad_norm": 0.0006218489143066108, "learning_rate": 1.442054433713784e-05, "loss": 0.0, "step": 4863 }, { "epoch": 1.4234708808896692, "grad_norm": 0.0009097387082874775, "learning_rate": 1.4413227977758268e-05, "loss": 0.0, "step": 4864 }, { "epoch": 1.423763535264852, "grad_norm": 0.0008629619260318577, "learning_rate": 1.4405911618378696e-05, "loss": 0.0, "step": 4865 }, { "epoch": 1.424056189640035, "grad_norm": 0.0007062812801450491, "learning_rate": 1.4398595258999124e-05, "loss": 0.0, "step": 4866 }, { "epoch": 1.424348844015218, "grad_norm": 0.0006604838999919593, "learning_rate": 1.439127889961955e-05, "loss": 0.0, "step": 4867 }, { "epoch": 1.424641498390401, "grad_norm": 0.0003423292946536094, "learning_rate": 1.4383962540239978e-05, "loss": 0.0, "step": 4868 }, { "epoch": 1.4249341527655839, "grad_norm": 0.0016571956221014261, "learning_rate": 1.4376646180860404e-05, "loss": 0.0, "step": 4869 }, { "epoch": 1.4252268071407668, "grad_norm": 0.0017341498751193285, "learning_rate": 1.4369329821480832e-05, "loss": 0.0, "step": 4870 }, { "epoch": 1.4255194615159497, "grad_norm": 0.0006102113402448595, "learning_rate": 1.4362013462101258e-05, "loss": 0.0, "step": 4871 }, { "epoch": 1.4258121158911325, "grad_norm": 0.0016994901234284043, "learning_rate": 1.4354697102721686e-05, "loss": 0.0, "step": 4872 }, { "epoch": 1.4261047702663154, "grad_norm": 0.0011881024111062288, "learning_rate": 1.4347380743342114e-05, "loss": 0.0, "step": 4873 }, { "epoch": 1.4263974246414983, "grad_norm": 0.0010435834992676973, "learning_rate": 1.434006438396254e-05, "loss": 0.0, "step": 4874 }, { "epoch": 1.4266900790166814, "grad_norm": 0.007706243544816971, "learning_rate": 1.4332748024582968e-05, "loss": 0.0001, "step": 4875 }, { "epoch": 1.4269827333918643, "grad_norm": 0.0002755485475063324, "learning_rate": 1.4325431665203395e-05, "loss": 0.0, "step": 4876 }, { "epoch": 1.4272753877670472, "grad_norm": 0.004249555990099907, "learning_rate": 1.4318115305823823e-05, "loss": 0.0001, "step": 4877 }, { "epoch": 1.42756804214223, "grad_norm": 0.000787175667937845, "learning_rate": 1.431079894644425e-05, "loss": 0.0, "step": 4878 }, { "epoch": 1.427860696517413, "grad_norm": 0.0003150795819237828, "learning_rate": 1.4303482587064677e-05, "loss": 0.0, "step": 4879 }, { "epoch": 1.4281533508925959, "grad_norm": 0.0005350884748622775, "learning_rate": 1.4296166227685103e-05, "loss": 0.0, "step": 4880 }, { "epoch": 1.4284460052677788, "grad_norm": 0.006955491844564676, "learning_rate": 1.4288849868305531e-05, "loss": 0.0001, "step": 4881 }, { "epoch": 1.4287386596429617, "grad_norm": 0.015475178137421608, "learning_rate": 1.4281533508925959e-05, "loss": 0.0001, "step": 4882 }, { "epoch": 1.4290313140181445, "grad_norm": 0.0012212616857141256, "learning_rate": 1.4274217149546387e-05, "loss": 0.0, "step": 4883 }, { "epoch": 1.4293239683933274, "grad_norm": 0.0003490689559839666, "learning_rate": 1.4266900790166815e-05, "loss": 0.0, "step": 4884 }, { "epoch": 1.4296166227685103, "grad_norm": 0.00045466903247870505, "learning_rate": 1.425958443078724e-05, "loss": 0.0, "step": 4885 }, { "epoch": 1.4299092771436932, "grad_norm": 0.0018789108144119382, "learning_rate": 1.4252268071407667e-05, "loss": 0.0, "step": 4886 }, { "epoch": 1.430201931518876, "grad_norm": 0.0007043906371109188, "learning_rate": 1.4244951712028095e-05, "loss": 0.0, "step": 4887 }, { "epoch": 1.430494585894059, "grad_norm": 0.0014209231594577432, "learning_rate": 1.4237635352648523e-05, "loss": 0.0, "step": 4888 }, { "epoch": 1.430787240269242, "grad_norm": 0.00036013106000609696, "learning_rate": 1.4230318993268951e-05, "loss": 0.0, "step": 4889 }, { "epoch": 1.431079894644425, "grad_norm": 0.000885677698533982, "learning_rate": 1.4223002633889376e-05, "loss": 0.0, "step": 4890 }, { "epoch": 1.4313725490196079, "grad_norm": 0.0004048089904244989, "learning_rate": 1.4215686274509804e-05, "loss": 0.0, "step": 4891 }, { "epoch": 1.4316652033947908, "grad_norm": 0.000629786285571754, "learning_rate": 1.4208369915130232e-05, "loss": 0.0, "step": 4892 }, { "epoch": 1.4319578577699736, "grad_norm": 0.0004508458368945867, "learning_rate": 1.420105355575066e-05, "loss": 0.0, "step": 4893 }, { "epoch": 1.4322505121451565, "grad_norm": 0.0013702167198061943, "learning_rate": 1.4193737196371088e-05, "loss": 0.0, "step": 4894 }, { "epoch": 1.4325431665203394, "grad_norm": 0.00038688714266754687, "learning_rate": 1.4186420836991512e-05, "loss": 0.0, "step": 4895 }, { "epoch": 1.4328358208955223, "grad_norm": 0.0003965978976339102, "learning_rate": 1.417910447761194e-05, "loss": 0.0, "step": 4896 }, { "epoch": 1.4331284752707054, "grad_norm": 0.001525850617326796, "learning_rate": 1.4171788118232368e-05, "loss": 0.0, "step": 4897 }, { "epoch": 1.4334211296458883, "grad_norm": 0.00031529253465123475, "learning_rate": 1.4164471758852796e-05, "loss": 0.0, "step": 4898 }, { "epoch": 1.4337137840210712, "grad_norm": 0.0014511430636048317, "learning_rate": 1.4157155399473224e-05, "loss": 0.0, "step": 4899 }, { "epoch": 1.434006438396254, "grad_norm": 0.003197466256096959, "learning_rate": 1.4149839040093649e-05, "loss": 0.0, "step": 4900 }, { "epoch": 1.434299092771437, "grad_norm": 0.00047766315401531756, "learning_rate": 1.4142522680714077e-05, "loss": 0.0, "step": 4901 }, { "epoch": 1.4345917471466199, "grad_norm": 0.00045633772970177233, "learning_rate": 1.4135206321334504e-05, "loss": 0.0, "step": 4902 }, { "epoch": 1.4348844015218027, "grad_norm": 0.0004014969163108617, "learning_rate": 1.4127889961954932e-05, "loss": 0.0, "step": 4903 }, { "epoch": 1.4351770558969856, "grad_norm": 0.0007565372507087886, "learning_rate": 1.412057360257536e-05, "loss": 0.0, "step": 4904 }, { "epoch": 1.4354697102721685, "grad_norm": 0.0011360982898622751, "learning_rate": 1.4113257243195788e-05, "loss": 0.0, "step": 4905 }, { "epoch": 1.4357623646473514, "grad_norm": 0.0017992169596254826, "learning_rate": 1.4105940883816213e-05, "loss": 0.0, "step": 4906 }, { "epoch": 1.4360550190225343, "grad_norm": 0.00038683050661347806, "learning_rate": 1.409862452443664e-05, "loss": 0.0, "step": 4907 }, { "epoch": 1.4363476733977172, "grad_norm": 0.001485635293647647, "learning_rate": 1.4091308165057069e-05, "loss": 0.0, "step": 4908 }, { "epoch": 1.4366403277729, "grad_norm": 0.0007270164205692708, "learning_rate": 1.4083991805677497e-05, "loss": 0.0, "step": 4909 }, { "epoch": 1.4369329821480832, "grad_norm": 0.0008282930357381701, "learning_rate": 1.4076675446297925e-05, "loss": 0.0, "step": 4910 }, { "epoch": 1.437225636523266, "grad_norm": 17.508834838867188, "learning_rate": 1.406935908691835e-05, "loss": 0.0674, "step": 4911 }, { "epoch": 1.437518290898449, "grad_norm": 0.0011270351242274046, "learning_rate": 1.4062042727538777e-05, "loss": 0.0, "step": 4912 }, { "epoch": 1.4378109452736318, "grad_norm": 0.0009964543860405684, "learning_rate": 1.4054726368159205e-05, "loss": 0.0, "step": 4913 }, { "epoch": 1.4381035996488147, "grad_norm": 0.0013905660016462207, "learning_rate": 1.4047410008779633e-05, "loss": 0.0, "step": 4914 }, { "epoch": 1.4383962540239976, "grad_norm": 0.00012497082934714854, "learning_rate": 1.404009364940006e-05, "loss": 0.0, "step": 4915 }, { "epoch": 1.4386889083991805, "grad_norm": 0.19590729475021362, "learning_rate": 1.4032777290020486e-05, "loss": 0.0005, "step": 4916 }, { "epoch": 1.4389815627743634, "grad_norm": 0.0005345430108718574, "learning_rate": 1.4025460930640914e-05, "loss": 0.0, "step": 4917 }, { "epoch": 1.4392742171495465, "grad_norm": 0.0005475014913827181, "learning_rate": 1.4018144571261342e-05, "loss": 0.0, "step": 4918 }, { "epoch": 1.4395668715247294, "grad_norm": 0.0006744549027644098, "learning_rate": 1.4010828211881768e-05, "loss": 0.0, "step": 4919 }, { "epoch": 1.4398595258999123, "grad_norm": 0.0010098711354658008, "learning_rate": 1.4003511852502196e-05, "loss": 0.0, "step": 4920 }, { "epoch": 1.4401521802750952, "grad_norm": 0.00028649295563809574, "learning_rate": 1.3996195493122624e-05, "loss": 0.0, "step": 4921 }, { "epoch": 1.440444834650278, "grad_norm": 5.341841220855713, "learning_rate": 1.398887913374305e-05, "loss": 0.0083, "step": 4922 }, { "epoch": 1.440737489025461, "grad_norm": 0.0038020676001906395, "learning_rate": 1.3981562774363476e-05, "loss": 0.0001, "step": 4923 }, { "epoch": 1.4410301434006438, "grad_norm": 0.0007207989692687988, "learning_rate": 1.3974246414983904e-05, "loss": 0.0, "step": 4924 }, { "epoch": 1.4413227977758267, "grad_norm": 0.0017718099988996983, "learning_rate": 1.3966930055604332e-05, "loss": 0.0, "step": 4925 }, { "epoch": 1.4416154521510096, "grad_norm": 0.0026354538276791573, "learning_rate": 1.395961369622476e-05, "loss": 0.0, "step": 4926 }, { "epoch": 1.4419081065261925, "grad_norm": 0.0006366702727973461, "learning_rate": 1.3952297336845185e-05, "loss": 0.0, "step": 4927 }, { "epoch": 1.4422007609013754, "grad_norm": 0.0007797402213327587, "learning_rate": 1.3944980977465613e-05, "loss": 0.0, "step": 4928 }, { "epoch": 1.4424934152765583, "grad_norm": 0.0004057021869812161, "learning_rate": 1.393766461808604e-05, "loss": 0.0, "step": 4929 }, { "epoch": 1.4427860696517412, "grad_norm": 0.0004022572538815439, "learning_rate": 1.3930348258706468e-05, "loss": 0.0, "step": 4930 }, { "epoch": 1.4430787240269243, "grad_norm": 0.0011776091996580362, "learning_rate": 1.3923031899326896e-05, "loss": 0.0, "step": 4931 }, { "epoch": 1.4433713784021072, "grad_norm": 0.0002888218150474131, "learning_rate": 1.3915715539947321e-05, "loss": 0.0, "step": 4932 }, { "epoch": 1.44366403277729, "grad_norm": 0.0005171523662284017, "learning_rate": 1.3908399180567749e-05, "loss": 0.0, "step": 4933 }, { "epoch": 1.443956687152473, "grad_norm": 0.0003763338609132916, "learning_rate": 1.3901082821188177e-05, "loss": 0.0, "step": 4934 }, { "epoch": 1.4442493415276558, "grad_norm": 0.0005641872994601727, "learning_rate": 1.3893766461808605e-05, "loss": 0.0, "step": 4935 }, { "epoch": 1.4445419959028387, "grad_norm": 0.00030219266773201525, "learning_rate": 1.3886450102429033e-05, "loss": 0.0, "step": 4936 }, { "epoch": 1.4448346502780216, "grad_norm": 0.000511305988766253, "learning_rate": 1.387913374304946e-05, "loss": 0.0, "step": 4937 }, { "epoch": 1.4451273046532045, "grad_norm": 0.00015900484868325293, "learning_rate": 1.3871817383669885e-05, "loss": 0.0, "step": 4938 }, { "epoch": 1.4454199590283876, "grad_norm": 0.0038649272173643112, "learning_rate": 1.3864501024290313e-05, "loss": 0.0, "step": 4939 }, { "epoch": 1.4457126134035705, "grad_norm": 0.0005638069706037641, "learning_rate": 1.3857184664910741e-05, "loss": 0.0, "step": 4940 }, { "epoch": 1.4460052677787534, "grad_norm": 0.00047475972678512335, "learning_rate": 1.3849868305531169e-05, "loss": 0.0, "step": 4941 }, { "epoch": 1.4462979221539363, "grad_norm": 0.00030017804238013923, "learning_rate": 1.3842551946151597e-05, "loss": 0.0, "step": 4942 }, { "epoch": 1.4465905765291192, "grad_norm": 0.000523193331900984, "learning_rate": 1.3835235586772022e-05, "loss": 0.0, "step": 4943 }, { "epoch": 1.446883230904302, "grad_norm": 0.0013426643563434482, "learning_rate": 1.382791922739245e-05, "loss": 0.0, "step": 4944 }, { "epoch": 1.447175885279485, "grad_norm": 0.001148558221757412, "learning_rate": 1.3820602868012878e-05, "loss": 0.0, "step": 4945 }, { "epoch": 1.4474685396546678, "grad_norm": 0.0019947707187384367, "learning_rate": 1.3813286508633305e-05, "loss": 0.0, "step": 4946 }, { "epoch": 1.4477611940298507, "grad_norm": 0.0031954795122146606, "learning_rate": 1.3805970149253733e-05, "loss": 0.0, "step": 4947 }, { "epoch": 1.4480538484050336, "grad_norm": 0.002721356926485896, "learning_rate": 1.3798653789874158e-05, "loss": 0.0, "step": 4948 }, { "epoch": 1.4483465027802165, "grad_norm": 0.0005469535244628787, "learning_rate": 1.3791337430494586e-05, "loss": 0.0, "step": 4949 }, { "epoch": 1.4486391571553994, "grad_norm": 0.0012882015435025096, "learning_rate": 1.3784021071115014e-05, "loss": 0.0, "step": 4950 }, { "epoch": 1.4489318115305823, "grad_norm": 0.0010478420881554484, "learning_rate": 1.3776704711735442e-05, "loss": 0.0, "step": 4951 }, { "epoch": 1.4492244659057654, "grad_norm": 0.002568156225606799, "learning_rate": 1.376938835235587e-05, "loss": 0.0, "step": 4952 }, { "epoch": 1.4495171202809483, "grad_norm": 15.102858543395996, "learning_rate": 1.3762071992976298e-05, "loss": 0.1145, "step": 4953 }, { "epoch": 1.4498097746561311, "grad_norm": 0.006282886490225792, "learning_rate": 1.3754755633596722e-05, "loss": 0.0001, "step": 4954 }, { "epoch": 1.450102429031314, "grad_norm": 0.0005572250229306519, "learning_rate": 1.374743927421715e-05, "loss": 0.0, "step": 4955 }, { "epoch": 1.450395083406497, "grad_norm": 0.000361787126166746, "learning_rate": 1.3740122914837578e-05, "loss": 0.0, "step": 4956 }, { "epoch": 1.4506877377816798, "grad_norm": 0.00026979981339536607, "learning_rate": 1.3732806555458006e-05, "loss": 0.0, "step": 4957 }, { "epoch": 1.4509803921568627, "grad_norm": 0.0007593963528051972, "learning_rate": 1.3725490196078432e-05, "loss": 0.0, "step": 4958 }, { "epoch": 1.4512730465320456, "grad_norm": 0.0003041128220502287, "learning_rate": 1.3718173836698859e-05, "loss": 0.0, "step": 4959 }, { "epoch": 1.4515657009072287, "grad_norm": 0.00046027437201701105, "learning_rate": 1.3710857477319287e-05, "loss": 0.0, "step": 4960 }, { "epoch": 1.4518583552824116, "grad_norm": 0.0025288264732807875, "learning_rate": 1.3703541117939715e-05, "loss": 0.0, "step": 4961 }, { "epoch": 1.4521510096575945, "grad_norm": 0.00045756640611216426, "learning_rate": 1.369622475856014e-05, "loss": 0.0, "step": 4962 }, { "epoch": 1.4524436640327774, "grad_norm": 0.0004389963869471103, "learning_rate": 1.3688908399180569e-05, "loss": 0.0, "step": 4963 }, { "epoch": 1.4527363184079602, "grad_norm": 0.0001569542509969324, "learning_rate": 1.3681592039800995e-05, "loss": 0.0, "step": 4964 }, { "epoch": 1.4530289727831431, "grad_norm": 0.002118258038535714, "learning_rate": 1.3674275680421423e-05, "loss": 0.0, "step": 4965 }, { "epoch": 1.453321627158326, "grad_norm": 0.0007403275230899453, "learning_rate": 1.366695932104185e-05, "loss": 0.0, "step": 4966 }, { "epoch": 1.453614281533509, "grad_norm": 0.0005932514322921634, "learning_rate": 1.3659642961662277e-05, "loss": 0.0, "step": 4967 }, { "epoch": 1.4539069359086918, "grad_norm": 0.000845657370518893, "learning_rate": 1.3652326602282705e-05, "loss": 0.0, "step": 4968 }, { "epoch": 1.4541995902838747, "grad_norm": 0.004051250871270895, "learning_rate": 1.3645010242903131e-05, "loss": 0.0, "step": 4969 }, { "epoch": 1.4544922446590576, "grad_norm": 0.0012087548384442925, "learning_rate": 1.3637693883523558e-05, "loss": 0.0, "step": 4970 }, { "epoch": 1.4547848990342405, "grad_norm": 0.2617930471897125, "learning_rate": 1.3630377524143986e-05, "loss": 0.0006, "step": 4971 }, { "epoch": 1.4550775534094234, "grad_norm": 0.0018058578716591, "learning_rate": 1.3623061164764414e-05, "loss": 0.0, "step": 4972 }, { "epoch": 1.4553702077846065, "grad_norm": 0.00019489476107992232, "learning_rate": 1.3615744805384842e-05, "loss": 0.0, "step": 4973 }, { "epoch": 1.4556628621597894, "grad_norm": 0.00017870304873213172, "learning_rate": 1.360842844600527e-05, "loss": 0.0, "step": 4974 }, { "epoch": 1.4559555165349722, "grad_norm": 0.009403640404343605, "learning_rate": 1.3601112086625694e-05, "loss": 0.0001, "step": 4975 }, { "epoch": 1.4562481709101551, "grad_norm": 0.0010502643417567015, "learning_rate": 1.3593795727246122e-05, "loss": 0.0, "step": 4976 }, { "epoch": 1.456540825285338, "grad_norm": 0.0006351264892145991, "learning_rate": 1.358647936786655e-05, "loss": 0.0, "step": 4977 }, { "epoch": 1.456833479660521, "grad_norm": 0.00040663781692273915, "learning_rate": 1.3579163008486978e-05, "loss": 0.0, "step": 4978 }, { "epoch": 1.4571261340357038, "grad_norm": 0.00038158282404765487, "learning_rate": 1.3571846649107406e-05, "loss": 0.0, "step": 4979 }, { "epoch": 1.4574187884108867, "grad_norm": 0.00010390720126451924, "learning_rate": 1.356453028972783e-05, "loss": 0.0, "step": 4980 }, { "epoch": 1.4577114427860698, "grad_norm": 0.000412240216974169, "learning_rate": 1.3557213930348258e-05, "loss": 0.0, "step": 4981 }, { "epoch": 1.4580040971612527, "grad_norm": 0.000625427404884249, "learning_rate": 1.3549897570968686e-05, "loss": 0.0, "step": 4982 }, { "epoch": 1.4582967515364356, "grad_norm": 0.0014911566395312548, "learning_rate": 1.3542581211589114e-05, "loss": 0.0, "step": 4983 }, { "epoch": 1.4585894059116185, "grad_norm": 0.0005219570593908429, "learning_rate": 1.3535264852209542e-05, "loss": 0.0, "step": 4984 }, { "epoch": 1.4588820602868013, "grad_norm": 0.005115315783768892, "learning_rate": 1.3527948492829967e-05, "loss": 0.0001, "step": 4985 }, { "epoch": 1.4591747146619842, "grad_norm": 0.0006503094919025898, "learning_rate": 1.3520632133450395e-05, "loss": 0.0, "step": 4986 }, { "epoch": 1.4594673690371671, "grad_norm": 0.0003797007957473397, "learning_rate": 1.3513315774070823e-05, "loss": 0.0, "step": 4987 }, { "epoch": 1.45976002341235, "grad_norm": 0.0003924908523913473, "learning_rate": 1.350599941469125e-05, "loss": 0.0, "step": 4988 }, { "epoch": 1.460052677787533, "grad_norm": 0.00021261059737298638, "learning_rate": 1.3498683055311679e-05, "loss": 0.0, "step": 4989 }, { "epoch": 1.4603453321627158, "grad_norm": 0.0020935856737196445, "learning_rate": 1.3491366695932107e-05, "loss": 0.0, "step": 4990 }, { "epoch": 1.4606379865378987, "grad_norm": 0.0005655603599734604, "learning_rate": 1.3484050336552531e-05, "loss": 0.0, "step": 4991 }, { "epoch": 1.4609306409130816, "grad_norm": 0.0003155624435748905, "learning_rate": 1.3476733977172959e-05, "loss": 0.0, "step": 4992 }, { "epoch": 1.4612232952882644, "grad_norm": 0.0008528819889761508, "learning_rate": 1.3469417617793387e-05, "loss": 0.0, "step": 4993 }, { "epoch": 1.4615159496634473, "grad_norm": 0.0002827745920512825, "learning_rate": 1.3462101258413815e-05, "loss": 0.0, "step": 4994 }, { "epoch": 1.4618086040386304, "grad_norm": 0.0006649006390944123, "learning_rate": 1.3454784899034243e-05, "loss": 0.0, "step": 4995 }, { "epoch": 1.4621012584138133, "grad_norm": 0.0003897503484040499, "learning_rate": 1.3447468539654667e-05, "loss": 0.0, "step": 4996 }, { "epoch": 1.4623939127889962, "grad_norm": 0.0005316457245498896, "learning_rate": 1.3440152180275095e-05, "loss": 0.0, "step": 4997 }, { "epoch": 1.462686567164179, "grad_norm": 0.00031724441214464605, "learning_rate": 1.3432835820895523e-05, "loss": 0.0, "step": 4998 }, { "epoch": 1.462979221539362, "grad_norm": 0.0014708181843161583, "learning_rate": 1.3425519461515951e-05, "loss": 0.0, "step": 4999 }, { "epoch": 1.4632718759145449, "grad_norm": 0.0015603512292727828, "learning_rate": 1.341820310213638e-05, "loss": 0.0, "step": 5000 }, { "epoch": 1.4635645302897278, "grad_norm": 0.0002041055413428694, "learning_rate": 1.3410886742756804e-05, "loss": 0.0, "step": 5001 }, { "epoch": 1.4638571846649107, "grad_norm": 0.010169253684580326, "learning_rate": 1.3403570383377232e-05, "loss": 0.0001, "step": 5002 }, { "epoch": 1.4641498390400938, "grad_norm": 0.0003600012860260904, "learning_rate": 1.339625402399766e-05, "loss": 0.0, "step": 5003 }, { "epoch": 1.4644424934152767, "grad_norm": 0.00545900035649538, "learning_rate": 1.3388937664618088e-05, "loss": 0.0001, "step": 5004 }, { "epoch": 1.4647351477904595, "grad_norm": 0.00021998106967657804, "learning_rate": 1.3381621305238516e-05, "loss": 0.0, "step": 5005 }, { "epoch": 1.4650278021656424, "grad_norm": 0.0014308503596112132, "learning_rate": 1.3374304945858942e-05, "loss": 0.0, "step": 5006 }, { "epoch": 1.4653204565408253, "grad_norm": 0.0008946347516030073, "learning_rate": 1.3366988586479368e-05, "loss": 0.0, "step": 5007 }, { "epoch": 1.4656131109160082, "grad_norm": 0.0009333545458503067, "learning_rate": 1.3359672227099796e-05, "loss": 0.0, "step": 5008 }, { "epoch": 1.465905765291191, "grad_norm": 0.0006259658839553595, "learning_rate": 1.3352355867720224e-05, "loss": 0.0, "step": 5009 }, { "epoch": 1.466198419666374, "grad_norm": 0.00019977972260676324, "learning_rate": 1.334503950834065e-05, "loss": 0.0, "step": 5010 }, { "epoch": 1.4664910740415569, "grad_norm": 0.0003582122444640845, "learning_rate": 1.3337723148961078e-05, "loss": 0.0, "step": 5011 }, { "epoch": 1.4667837284167398, "grad_norm": 0.0016777211567386985, "learning_rate": 1.3330406789581504e-05, "loss": 0.0, "step": 5012 }, { "epoch": 1.4670763827919227, "grad_norm": 0.0008100473205558956, "learning_rate": 1.3323090430201932e-05, "loss": 0.0, "step": 5013 }, { "epoch": 1.4673690371671055, "grad_norm": 0.00014631861995439976, "learning_rate": 1.3315774070822359e-05, "loss": 0.0, "step": 5014 }, { "epoch": 1.4676616915422884, "grad_norm": 0.0002567546034697443, "learning_rate": 1.3308457711442787e-05, "loss": 0.0, "step": 5015 }, { "epoch": 1.4679543459174715, "grad_norm": 9.500472515355796e-05, "learning_rate": 1.3301141352063215e-05, "loss": 0.0, "step": 5016 }, { "epoch": 1.4682470002926544, "grad_norm": 0.0003111510304734111, "learning_rate": 1.329382499268364e-05, "loss": 0.0, "step": 5017 }, { "epoch": 1.4685396546678373, "grad_norm": 0.001199898892082274, "learning_rate": 1.3286508633304067e-05, "loss": 0.0, "step": 5018 }, { "epoch": 1.4688323090430202, "grad_norm": 0.00024551950627937913, "learning_rate": 1.3279192273924495e-05, "loss": 0.0, "step": 5019 }, { "epoch": 1.469124963418203, "grad_norm": 0.00043802321306429803, "learning_rate": 1.3271875914544923e-05, "loss": 0.0, "step": 5020 }, { "epoch": 1.469417617793386, "grad_norm": 0.0006880344590172172, "learning_rate": 1.3264559555165351e-05, "loss": 0.0, "step": 5021 }, { "epoch": 1.4697102721685689, "grad_norm": 0.00022182802786119282, "learning_rate": 1.3257243195785779e-05, "loss": 0.0, "step": 5022 }, { "epoch": 1.4700029265437518, "grad_norm": 0.000889520684722811, "learning_rate": 1.3249926836406203e-05, "loss": 0.0, "step": 5023 }, { "epoch": 1.4702955809189349, "grad_norm": 0.0015817388193681836, "learning_rate": 1.3242610477026631e-05, "loss": 0.0, "step": 5024 }, { "epoch": 1.4705882352941178, "grad_norm": 0.0004126756975892931, "learning_rate": 1.323529411764706e-05, "loss": 0.0, "step": 5025 }, { "epoch": 1.4708808896693006, "grad_norm": 0.0004018440959043801, "learning_rate": 1.3227977758267487e-05, "loss": 0.0, "step": 5026 }, { "epoch": 1.4711735440444835, "grad_norm": 3.947730779647827, "learning_rate": 1.3220661398887915e-05, "loss": 0.0052, "step": 5027 }, { "epoch": 1.4714661984196664, "grad_norm": 0.00024372065672650933, "learning_rate": 1.321334503950834e-05, "loss": 0.0, "step": 5028 }, { "epoch": 1.4717588527948493, "grad_norm": 0.00025667098816484213, "learning_rate": 1.3206028680128768e-05, "loss": 0.0, "step": 5029 }, { "epoch": 1.4720515071700322, "grad_norm": 0.0001714712561806664, "learning_rate": 1.3198712320749196e-05, "loss": 0.0, "step": 5030 }, { "epoch": 1.472344161545215, "grad_norm": 0.0007231322815641761, "learning_rate": 1.3191395961369624e-05, "loss": 0.0, "step": 5031 }, { "epoch": 1.472636815920398, "grad_norm": 3.6601428985595703, "learning_rate": 1.3184079601990052e-05, "loss": 0.1682, "step": 5032 }, { "epoch": 1.4729294702955809, "grad_norm": 0.0038921311497688293, "learning_rate": 1.3176763242610476e-05, "loss": 0.0, "step": 5033 }, { "epoch": 1.4732221246707637, "grad_norm": 0.00017758534522727132, "learning_rate": 1.3169446883230904e-05, "loss": 0.0, "step": 5034 }, { "epoch": 1.4735147790459466, "grad_norm": 0.001061857445165515, "learning_rate": 1.3162130523851332e-05, "loss": 0.0, "step": 5035 }, { "epoch": 1.4738074334211295, "grad_norm": 11.057812690734863, "learning_rate": 1.315481416447176e-05, "loss": 0.1485, "step": 5036 }, { "epoch": 1.4741000877963126, "grad_norm": 0.0003560293698683381, "learning_rate": 1.3147497805092188e-05, "loss": 0.0, "step": 5037 }, { "epoch": 1.4743927421714955, "grad_norm": 0.0003879110736306757, "learning_rate": 1.3140181445712613e-05, "loss": 0.0, "step": 5038 }, { "epoch": 1.4746853965466784, "grad_norm": 0.0008185947081074119, "learning_rate": 1.313286508633304e-05, "loss": 0.0, "step": 5039 }, { "epoch": 1.4749780509218613, "grad_norm": 0.0019864842761307955, "learning_rate": 1.3125548726953468e-05, "loss": 0.0, "step": 5040 }, { "epoch": 1.4752707052970442, "grad_norm": 0.0014067594893276691, "learning_rate": 1.3118232367573896e-05, "loss": 0.0, "step": 5041 }, { "epoch": 1.475563359672227, "grad_norm": 0.0010090309660881758, "learning_rate": 1.3110916008194324e-05, "loss": 0.0, "step": 5042 }, { "epoch": 1.47585601404741, "grad_norm": 0.00034172378946095705, "learning_rate": 1.3103599648814752e-05, "loss": 0.0, "step": 5043 }, { "epoch": 1.4761486684225928, "grad_norm": 0.001443683635443449, "learning_rate": 1.3096283289435177e-05, "loss": 0.0, "step": 5044 }, { "epoch": 1.476441322797776, "grad_norm": 0.0004092727031093091, "learning_rate": 1.3088966930055605e-05, "loss": 0.0, "step": 5045 }, { "epoch": 1.4767339771729588, "grad_norm": 0.013661233708262444, "learning_rate": 1.3081650570676033e-05, "loss": 0.0001, "step": 5046 }, { "epoch": 1.4770266315481417, "grad_norm": 0.0003617628535721451, "learning_rate": 1.307433421129646e-05, "loss": 0.0, "step": 5047 }, { "epoch": 1.4773192859233246, "grad_norm": 0.000713932910002768, "learning_rate": 1.3067017851916889e-05, "loss": 0.0, "step": 5048 }, { "epoch": 1.4776119402985075, "grad_norm": 0.0007373035768978298, "learning_rate": 1.3059701492537313e-05, "loss": 0.0, "step": 5049 }, { "epoch": 1.4779045946736904, "grad_norm": 0.0002988100459333509, "learning_rate": 1.3052385133157741e-05, "loss": 0.0, "step": 5050 }, { "epoch": 1.4781972490488733, "grad_norm": 0.0005745989619754255, "learning_rate": 1.3045068773778169e-05, "loss": 0.0, "step": 5051 }, { "epoch": 1.4784899034240562, "grad_norm": 0.0006586909294128418, "learning_rate": 1.3037752414398597e-05, "loss": 0.0, "step": 5052 }, { "epoch": 1.478782557799239, "grad_norm": 0.01472164411097765, "learning_rate": 1.3030436055019023e-05, "loss": 0.0002, "step": 5053 }, { "epoch": 1.479075212174422, "grad_norm": 0.02453051321208477, "learning_rate": 1.302311969563945e-05, "loss": 0.0002, "step": 5054 }, { "epoch": 1.4793678665496048, "grad_norm": 0.0015729361912235618, "learning_rate": 1.3015803336259878e-05, "loss": 0.0, "step": 5055 }, { "epoch": 1.4796605209247877, "grad_norm": 0.00018458921113051474, "learning_rate": 1.3008486976880305e-05, "loss": 0.0, "step": 5056 }, { "epoch": 1.4799531752999706, "grad_norm": 0.001480237115174532, "learning_rate": 1.3001170617500732e-05, "loss": 0.0, "step": 5057 }, { "epoch": 1.4802458296751537, "grad_norm": 0.000548430485650897, "learning_rate": 1.299385425812116e-05, "loss": 0.0, "step": 5058 }, { "epoch": 1.4805384840503366, "grad_norm": 0.0007473621517419815, "learning_rate": 1.2986537898741588e-05, "loss": 0.0, "step": 5059 }, { "epoch": 1.4808311384255195, "grad_norm": 0.0016481553902849555, "learning_rate": 1.2979221539362014e-05, "loss": 0.0, "step": 5060 }, { "epoch": 1.4811237928007024, "grad_norm": 0.0009578980389051139, "learning_rate": 1.297190517998244e-05, "loss": 0.0, "step": 5061 }, { "epoch": 1.4814164471758853, "grad_norm": 0.0005476105725392699, "learning_rate": 1.2964588820602868e-05, "loss": 0.0, "step": 5062 }, { "epoch": 1.4817091015510682, "grad_norm": 0.012019267305731773, "learning_rate": 1.2957272461223296e-05, "loss": 0.0001, "step": 5063 }, { "epoch": 1.482001755926251, "grad_norm": 0.0003678315260913223, "learning_rate": 1.2949956101843724e-05, "loss": 0.0, "step": 5064 }, { "epoch": 1.482294410301434, "grad_norm": 0.00038286292692646384, "learning_rate": 1.2942639742464149e-05, "loss": 0.0, "step": 5065 }, { "epoch": 1.482587064676617, "grad_norm": 0.004511554725468159, "learning_rate": 1.2935323383084577e-05, "loss": 0.0001, "step": 5066 }, { "epoch": 1.4828797190518, "grad_norm": 0.0002436785725876689, "learning_rate": 1.2928007023705004e-05, "loss": 0.0, "step": 5067 }, { "epoch": 1.4831723734269828, "grad_norm": 0.00030211356352083385, "learning_rate": 1.2920690664325432e-05, "loss": 0.0, "step": 5068 }, { "epoch": 1.4834650278021657, "grad_norm": 0.0012050060322508216, "learning_rate": 1.291337430494586e-05, "loss": 0.0, "step": 5069 }, { "epoch": 1.4837576821773486, "grad_norm": 0.0007688667974434793, "learning_rate": 1.2906057945566285e-05, "loss": 0.0, "step": 5070 }, { "epoch": 1.4840503365525315, "grad_norm": 0.00024990117526613176, "learning_rate": 1.2898741586186713e-05, "loss": 0.0, "step": 5071 }, { "epoch": 1.4843429909277144, "grad_norm": 0.00033757698838599026, "learning_rate": 1.289142522680714e-05, "loss": 0.0, "step": 5072 }, { "epoch": 1.4846356453028973, "grad_norm": 0.0011487255105748773, "learning_rate": 1.2884108867427569e-05, "loss": 0.0, "step": 5073 }, { "epoch": 1.4849282996780802, "grad_norm": 0.0008633338147774339, "learning_rate": 1.2876792508047997e-05, "loss": 0.0, "step": 5074 }, { "epoch": 1.485220954053263, "grad_norm": 0.0006356406374834478, "learning_rate": 1.2869476148668425e-05, "loss": 0.0, "step": 5075 }, { "epoch": 1.485513608428446, "grad_norm": 0.0013006513472646475, "learning_rate": 1.286215978928885e-05, "loss": 0.0, "step": 5076 }, { "epoch": 1.4858062628036288, "grad_norm": 0.32902756333351135, "learning_rate": 1.2854843429909277e-05, "loss": 0.0013, "step": 5077 }, { "epoch": 1.4860989171788117, "grad_norm": 0.0004978632205165923, "learning_rate": 1.2847527070529705e-05, "loss": 0.0, "step": 5078 }, { "epoch": 1.4863915715539946, "grad_norm": 0.00033494128729216754, "learning_rate": 1.2840210711150133e-05, "loss": 0.0, "step": 5079 }, { "epoch": 1.4866842259291777, "grad_norm": 0.00036529218778014183, "learning_rate": 1.2832894351770561e-05, "loss": 0.0, "step": 5080 }, { "epoch": 1.4869768803043606, "grad_norm": 0.0002138021809514612, "learning_rate": 1.2825577992390986e-05, "loss": 0.0, "step": 5081 }, { "epoch": 1.4872695346795435, "grad_norm": 0.0006728937150910497, "learning_rate": 1.2818261633011414e-05, "loss": 0.0, "step": 5082 }, { "epoch": 1.4875621890547264, "grad_norm": 0.00042946042958647013, "learning_rate": 1.2810945273631842e-05, "loss": 0.0, "step": 5083 }, { "epoch": 1.4878548434299093, "grad_norm": 0.0011009355075657368, "learning_rate": 1.280362891425227e-05, "loss": 0.0, "step": 5084 }, { "epoch": 1.4881474978050921, "grad_norm": 0.00038497481727972627, "learning_rate": 1.2796312554872697e-05, "loss": 0.0, "step": 5085 }, { "epoch": 1.488440152180275, "grad_norm": 0.00034325546585023403, "learning_rate": 1.2788996195493122e-05, "loss": 0.0, "step": 5086 }, { "epoch": 1.488732806555458, "grad_norm": 0.0006645070970989764, "learning_rate": 1.278167983611355e-05, "loss": 0.0, "step": 5087 }, { "epoch": 1.489025460930641, "grad_norm": 0.007781506050378084, "learning_rate": 1.2774363476733978e-05, "loss": 0.0001, "step": 5088 }, { "epoch": 1.489318115305824, "grad_norm": 0.0004191446350887418, "learning_rate": 1.2767047117354406e-05, "loss": 0.0, "step": 5089 }, { "epoch": 1.4896107696810068, "grad_norm": 0.0004313608515076339, "learning_rate": 1.2759730757974834e-05, "loss": 0.0, "step": 5090 }, { "epoch": 1.4899034240561897, "grad_norm": 0.8468864560127258, "learning_rate": 1.2752414398595262e-05, "loss": 0.0048, "step": 5091 }, { "epoch": 1.4901960784313726, "grad_norm": 0.009637207724153996, "learning_rate": 1.2745098039215686e-05, "loss": 0.0001, "step": 5092 }, { "epoch": 1.4904887328065555, "grad_norm": 0.001110792625695467, "learning_rate": 1.2737781679836114e-05, "loss": 0.0, "step": 5093 }, { "epoch": 1.4907813871817384, "grad_norm": 0.041818760335445404, "learning_rate": 1.2730465320456542e-05, "loss": 0.0002, "step": 5094 }, { "epoch": 1.4910740415569212, "grad_norm": 0.0006818793481215835, "learning_rate": 1.272314896107697e-05, "loss": 0.0, "step": 5095 }, { "epoch": 1.4913666959321041, "grad_norm": 0.03327636420726776, "learning_rate": 1.2715832601697396e-05, "loss": 0.0001, "step": 5096 }, { "epoch": 1.491659350307287, "grad_norm": 0.0002938769175671041, "learning_rate": 1.2708516242317823e-05, "loss": 0.0, "step": 5097 }, { "epoch": 1.49195200468247, "grad_norm": 0.0005069047911092639, "learning_rate": 1.270119988293825e-05, "loss": 0.0, "step": 5098 }, { "epoch": 1.4922446590576528, "grad_norm": 0.000463327975012362, "learning_rate": 1.2693883523558679e-05, "loss": 0.0, "step": 5099 }, { "epoch": 1.4925373134328357, "grad_norm": 0.0002462085976731032, "learning_rate": 1.2686567164179105e-05, "loss": 0.0, "step": 5100 }, { "epoch": 1.4928299678080188, "grad_norm": 0.0003037136630155146, "learning_rate": 1.2679250804799533e-05, "loss": 0.0, "step": 5101 }, { "epoch": 1.4931226221832017, "grad_norm": 0.001337929628789425, "learning_rate": 1.2671934445419959e-05, "loss": 0.0, "step": 5102 }, { "epoch": 1.4934152765583846, "grad_norm": 0.0003057431895285845, "learning_rate": 1.2664618086040387e-05, "loss": 0.0, "step": 5103 }, { "epoch": 1.4937079309335675, "grad_norm": 0.005425651557743549, "learning_rate": 1.2657301726660813e-05, "loss": 0.0001, "step": 5104 }, { "epoch": 1.4940005853087504, "grad_norm": 0.039733052253723145, "learning_rate": 1.2649985367281241e-05, "loss": 0.0003, "step": 5105 }, { "epoch": 1.4942932396839332, "grad_norm": 0.00035593679058365524, "learning_rate": 1.2642669007901669e-05, "loss": 0.0, "step": 5106 }, { "epoch": 1.4945858940591161, "grad_norm": 0.00027309643337503076, "learning_rate": 1.2635352648522095e-05, "loss": 0.0, "step": 5107 }, { "epoch": 1.494878548434299, "grad_norm": 0.0004261991707608104, "learning_rate": 1.2628036289142522e-05, "loss": 0.0, "step": 5108 }, { "epoch": 1.4951712028094821, "grad_norm": 0.00017297992599196732, "learning_rate": 1.262071992976295e-05, "loss": 0.0, "step": 5109 }, { "epoch": 1.495463857184665, "grad_norm": 0.00018700190412346274, "learning_rate": 1.2613403570383378e-05, "loss": 0.0, "step": 5110 }, { "epoch": 1.495756511559848, "grad_norm": 0.00016983953537419438, "learning_rate": 1.2606087211003805e-05, "loss": 0.0, "step": 5111 }, { "epoch": 1.4960491659350308, "grad_norm": 0.00023661194427404553, "learning_rate": 1.2598770851624233e-05, "loss": 0.0, "step": 5112 }, { "epoch": 1.4963418203102137, "grad_norm": 0.0859820768237114, "learning_rate": 1.2591454492244658e-05, "loss": 0.0003, "step": 5113 }, { "epoch": 1.4966344746853966, "grad_norm": 0.0002156875270884484, "learning_rate": 1.2584138132865086e-05, "loss": 0.0, "step": 5114 }, { "epoch": 1.4969271290605795, "grad_norm": 0.0038348534144461155, "learning_rate": 1.2576821773485514e-05, "loss": 0.0, "step": 5115 }, { "epoch": 1.4972197834357623, "grad_norm": 0.0002400731318630278, "learning_rate": 1.2569505414105942e-05, "loss": 0.0, "step": 5116 }, { "epoch": 1.4975124378109452, "grad_norm": 0.005611362401396036, "learning_rate": 1.256218905472637e-05, "loss": 0.0001, "step": 5117 }, { "epoch": 1.4978050921861281, "grad_norm": 0.00036208340316079557, "learning_rate": 1.2554872695346794e-05, "loss": 0.0, "step": 5118 }, { "epoch": 1.498097746561311, "grad_norm": 0.00026920289383269846, "learning_rate": 1.2547556335967222e-05, "loss": 0.0, "step": 5119 }, { "epoch": 1.498390400936494, "grad_norm": 0.0069849989376962185, "learning_rate": 1.254023997658765e-05, "loss": 0.0001, "step": 5120 }, { "epoch": 1.4986830553116768, "grad_norm": 0.006348796654492617, "learning_rate": 1.2532923617208078e-05, "loss": 0.0001, "step": 5121 }, { "epoch": 1.49897570968686, "grad_norm": 0.0009372648200951517, "learning_rate": 1.2525607257828506e-05, "loss": 0.0, "step": 5122 }, { "epoch": 1.4992683640620428, "grad_norm": 0.00017370175919495523, "learning_rate": 1.251829089844893e-05, "loss": 0.0, "step": 5123 }, { "epoch": 1.4995610184372257, "grad_norm": 0.0004743621975649148, "learning_rate": 1.2510974539069359e-05, "loss": 0.0, "step": 5124 }, { "epoch": 1.4998536728124086, "grad_norm": 0.00016216834774240851, "learning_rate": 1.2503658179689787e-05, "loss": 0.0, "step": 5125 }, { "epoch": 1.5001463271875914, "grad_norm": 0.00036036837263964117, "learning_rate": 1.2496341820310215e-05, "loss": 0.0, "step": 5126 }, { "epoch": 1.5004389815627743, "grad_norm": 0.0001896850735647604, "learning_rate": 1.248902546093064e-05, "loss": 0.0, "step": 5127 }, { "epoch": 1.5007316359379572, "grad_norm": 0.0015876274555921555, "learning_rate": 1.2481709101551069e-05, "loss": 0.0, "step": 5128 }, { "epoch": 1.5010242903131403, "grad_norm": 0.0003233393654227257, "learning_rate": 1.2474392742171497e-05, "loss": 0.0, "step": 5129 }, { "epoch": 1.5013169446883232, "grad_norm": 0.00015400140546262264, "learning_rate": 1.2467076382791923e-05, "loss": 0.0, "step": 5130 }, { "epoch": 1.501609599063506, "grad_norm": 0.011688701808452606, "learning_rate": 1.2459760023412351e-05, "loss": 0.0001, "step": 5131 }, { "epoch": 1.501902253438689, "grad_norm": 0.0010759357828646898, "learning_rate": 1.2452443664032779e-05, "loss": 0.0, "step": 5132 }, { "epoch": 1.5021949078138719, "grad_norm": 0.0020588075276464224, "learning_rate": 1.2445127304653205e-05, "loss": 0.0, "step": 5133 }, { "epoch": 1.5024875621890548, "grad_norm": 0.00021067510533612221, "learning_rate": 1.2437810945273633e-05, "loss": 0.0, "step": 5134 }, { "epoch": 1.5027802165642377, "grad_norm": 0.0002671003749128431, "learning_rate": 1.243049458589406e-05, "loss": 0.0, "step": 5135 }, { "epoch": 1.5030728709394205, "grad_norm": 0.0004259385459590703, "learning_rate": 1.2423178226514487e-05, "loss": 0.0, "step": 5136 }, { "epoch": 1.5033655253146034, "grad_norm": 0.00041439401684328914, "learning_rate": 1.2415861867134915e-05, "loss": 0.0, "step": 5137 }, { "epoch": 1.5036581796897863, "grad_norm": 0.00020446009875740856, "learning_rate": 1.2408545507755342e-05, "loss": 0.0, "step": 5138 }, { "epoch": 1.5039508340649692, "grad_norm": 0.00014487757289316505, "learning_rate": 1.240122914837577e-05, "loss": 0.0, "step": 5139 }, { "epoch": 1.504243488440152, "grad_norm": 0.0002552311634644866, "learning_rate": 1.2393912788996197e-05, "loss": 0.0, "step": 5140 }, { "epoch": 1.504536142815335, "grad_norm": 0.0020015928894281387, "learning_rate": 1.2386596429616624e-05, "loss": 0.0, "step": 5141 }, { "epoch": 1.5048287971905179, "grad_norm": 0.000605784181971103, "learning_rate": 1.2379280070237052e-05, "loss": 0.0, "step": 5142 }, { "epoch": 1.5051214515657008, "grad_norm": 0.0010445835068821907, "learning_rate": 1.2371963710857478e-05, "loss": 0.0, "step": 5143 }, { "epoch": 1.5054141059408837, "grad_norm": 0.00021796950022689998, "learning_rate": 1.2364647351477906e-05, "loss": 0.0, "step": 5144 }, { "epoch": 1.5057067603160668, "grad_norm": 0.0011622559977695346, "learning_rate": 1.2357330992098334e-05, "loss": 0.0, "step": 5145 }, { "epoch": 1.5059994146912496, "grad_norm": 0.00017326697707176208, "learning_rate": 1.235001463271876e-05, "loss": 0.0, "step": 5146 }, { "epoch": 1.5062920690664325, "grad_norm": 0.0017791668651625514, "learning_rate": 1.2342698273339188e-05, "loss": 0.0, "step": 5147 }, { "epoch": 1.5065847234416154, "grad_norm": 0.0007580950041301548, "learning_rate": 1.2335381913959614e-05, "loss": 0.0, "step": 5148 }, { "epoch": 1.5068773778167983, "grad_norm": 0.00043160910718142986, "learning_rate": 1.2328065554580042e-05, "loss": 0.0, "step": 5149 }, { "epoch": 1.5071700321919814, "grad_norm": 0.0001991214812733233, "learning_rate": 1.2320749195200468e-05, "loss": 0.0, "step": 5150 }, { "epoch": 1.5074626865671643, "grad_norm": 0.0004408336244523525, "learning_rate": 1.2313432835820896e-05, "loss": 0.0, "step": 5151 }, { "epoch": 1.5077553409423472, "grad_norm": 0.0007374389097094536, "learning_rate": 1.2306116476441323e-05, "loss": 0.0, "step": 5152 }, { "epoch": 1.50804799531753, "grad_norm": 0.00024802814004942775, "learning_rate": 1.229880011706175e-05, "loss": 0.0, "step": 5153 }, { "epoch": 1.508340649692713, "grad_norm": 0.00018486542103346437, "learning_rate": 1.2291483757682177e-05, "loss": 0.0, "step": 5154 }, { "epoch": 1.5086333040678959, "grad_norm": 0.0001799042511265725, "learning_rate": 1.2284167398302605e-05, "loss": 0.0, "step": 5155 }, { "epoch": 1.5089259584430788, "grad_norm": 0.00019250593322794884, "learning_rate": 1.2276851038923033e-05, "loss": 0.0, "step": 5156 }, { "epoch": 1.5092186128182616, "grad_norm": 0.00026157504180446267, "learning_rate": 1.2269534679543459e-05, "loss": 0.0, "step": 5157 }, { "epoch": 1.5095112671934445, "grad_norm": 0.00043231903691776097, "learning_rate": 1.2262218320163887e-05, "loss": 0.0, "step": 5158 }, { "epoch": 1.5098039215686274, "grad_norm": 0.0003085647476837039, "learning_rate": 1.2254901960784313e-05, "loss": 0.0, "step": 5159 }, { "epoch": 1.5100965759438103, "grad_norm": 0.00012914616672787815, "learning_rate": 1.2247585601404741e-05, "loss": 0.0, "step": 5160 }, { "epoch": 1.5103892303189932, "grad_norm": 0.0002531928475946188, "learning_rate": 1.2240269242025169e-05, "loss": 0.0, "step": 5161 }, { "epoch": 1.510681884694176, "grad_norm": 0.00022002340119797736, "learning_rate": 1.2232952882645595e-05, "loss": 0.0, "step": 5162 }, { "epoch": 1.510974539069359, "grad_norm": 0.00036131092929281294, "learning_rate": 1.2225636523266023e-05, "loss": 0.0, "step": 5163 }, { "epoch": 1.5112671934445419, "grad_norm": 0.0001618622336536646, "learning_rate": 1.221832016388645e-05, "loss": 0.0, "step": 5164 }, { "epoch": 1.5115598478197247, "grad_norm": 0.000462544645415619, "learning_rate": 1.2211003804506878e-05, "loss": 0.0, "step": 5165 }, { "epoch": 1.5118525021949079, "grad_norm": 0.13465426862239838, "learning_rate": 1.2203687445127305e-05, "loss": 0.0006, "step": 5166 }, { "epoch": 1.5121451565700907, "grad_norm": 0.0018989065429195762, "learning_rate": 1.2196371085747732e-05, "loss": 0.0, "step": 5167 }, { "epoch": 1.5124378109452736, "grad_norm": 0.00041230389615520835, "learning_rate": 1.218905472636816e-05, "loss": 0.0, "step": 5168 }, { "epoch": 1.5127304653204565, "grad_norm": 5.563209924730472e-05, "learning_rate": 1.2181738366988588e-05, "loss": 0.0, "step": 5169 }, { "epoch": 1.5130231196956394, "grad_norm": 0.0004752097011078149, "learning_rate": 1.2174422007609014e-05, "loss": 0.0, "step": 5170 }, { "epoch": 1.5133157740708225, "grad_norm": 0.014747914858162403, "learning_rate": 1.2167105648229442e-05, "loss": 0.0001, "step": 5171 }, { "epoch": 1.5136084284460054, "grad_norm": 0.00024430773919448256, "learning_rate": 1.2159789288849868e-05, "loss": 0.0, "step": 5172 }, { "epoch": 1.5139010828211883, "grad_norm": 0.00029366539092734456, "learning_rate": 1.2152472929470296e-05, "loss": 0.0, "step": 5173 }, { "epoch": 1.5141937371963712, "grad_norm": 0.6517511606216431, "learning_rate": 1.2145156570090724e-05, "loss": 0.0012, "step": 5174 }, { "epoch": 1.514486391571554, "grad_norm": 0.00038376718293875456, "learning_rate": 1.213784021071115e-05, "loss": 0.0, "step": 5175 }, { "epoch": 1.514779045946737, "grad_norm": 0.00014857873611617833, "learning_rate": 1.2130523851331578e-05, "loss": 0.0, "step": 5176 }, { "epoch": 1.5150717003219198, "grad_norm": 0.0004696003161370754, "learning_rate": 1.2123207491952006e-05, "loss": 0.0, "step": 5177 }, { "epoch": 1.5153643546971027, "grad_norm": 0.0009346023434773088, "learning_rate": 1.2115891132572432e-05, "loss": 0.0, "step": 5178 }, { "epoch": 1.5156570090722856, "grad_norm": 0.008241580799221992, "learning_rate": 1.210857477319286e-05, "loss": 0.0001, "step": 5179 }, { "epoch": 1.5159496634474685, "grad_norm": 0.00023253982362803072, "learning_rate": 1.2101258413813287e-05, "loss": 0.0, "step": 5180 }, { "epoch": 1.5162423178226514, "grad_norm": 0.00026602420257404447, "learning_rate": 1.2093942054433715e-05, "loss": 0.0, "step": 5181 }, { "epoch": 1.5165349721978343, "grad_norm": 0.0006170397973619401, "learning_rate": 1.2086625695054143e-05, "loss": 0.0, "step": 5182 }, { "epoch": 1.5168276265730172, "grad_norm": 0.0007347619393840432, "learning_rate": 1.2079309335674569e-05, "loss": 0.0, "step": 5183 }, { "epoch": 1.5171202809482, "grad_norm": 0.00021642430510837585, "learning_rate": 1.2071992976294997e-05, "loss": 0.0, "step": 5184 }, { "epoch": 1.517412935323383, "grad_norm": 0.00039059240953065455, "learning_rate": 1.2064676616915425e-05, "loss": 0.0, "step": 5185 }, { "epoch": 1.5177055896985658, "grad_norm": 0.00016414794663432986, "learning_rate": 1.2057360257535851e-05, "loss": 0.0, "step": 5186 }, { "epoch": 1.517998244073749, "grad_norm": 0.0010327588533982635, "learning_rate": 1.2050043898156279e-05, "loss": 0.0, "step": 5187 }, { "epoch": 1.5182908984489318, "grad_norm": 0.0007639078539796174, "learning_rate": 1.2042727538776705e-05, "loss": 0.0, "step": 5188 }, { "epoch": 1.5185835528241147, "grad_norm": 0.0015009710332378745, "learning_rate": 1.2035411179397133e-05, "loss": 0.0, "step": 5189 }, { "epoch": 1.5188762071992976, "grad_norm": 0.0005584223545156419, "learning_rate": 1.2028094820017561e-05, "loss": 0.0, "step": 5190 }, { "epoch": 1.5191688615744805, "grad_norm": 0.0007484898087568581, "learning_rate": 1.2020778460637987e-05, "loss": 0.0, "step": 5191 }, { "epoch": 1.5194615159496636, "grad_norm": 0.000180575909325853, "learning_rate": 1.2013462101258415e-05, "loss": 0.0, "step": 5192 }, { "epoch": 1.5197541703248465, "grad_norm": 0.00027021754067391157, "learning_rate": 1.2006145741878842e-05, "loss": 0.0, "step": 5193 }, { "epoch": 1.5200468247000294, "grad_norm": 0.000288851821096614, "learning_rate": 1.199882938249927e-05, "loss": 0.0, "step": 5194 }, { "epoch": 1.5203394790752123, "grad_norm": 0.0009474614635109901, "learning_rate": 1.1991513023119696e-05, "loss": 0.0, "step": 5195 }, { "epoch": 1.5206321334503952, "grad_norm": 0.0007197274244390428, "learning_rate": 1.1984196663740124e-05, "loss": 0.0, "step": 5196 }, { "epoch": 1.520924787825578, "grad_norm": 0.0004081302904523909, "learning_rate": 1.197688030436055e-05, "loss": 0.0, "step": 5197 }, { "epoch": 1.521217442200761, "grad_norm": 0.0008590701618231833, "learning_rate": 1.1969563944980978e-05, "loss": 0.0, "step": 5198 }, { "epoch": 1.5215100965759438, "grad_norm": 11.854995727539062, "learning_rate": 1.1962247585601404e-05, "loss": 0.1868, "step": 5199 }, { "epoch": 1.5218027509511267, "grad_norm": 0.0003525600768625736, "learning_rate": 1.1954931226221832e-05, "loss": 0.0, "step": 5200 }, { "epoch": 1.5220954053263096, "grad_norm": 0.0009801547275856137, "learning_rate": 1.194761486684226e-05, "loss": 0.0, "step": 5201 }, { "epoch": 1.5223880597014925, "grad_norm": 0.00014240208838600665, "learning_rate": 1.1940298507462686e-05, "loss": 0.0, "step": 5202 }, { "epoch": 1.5226807140766754, "grad_norm": 0.0006725612911395729, "learning_rate": 1.1932982148083114e-05, "loss": 0.0, "step": 5203 }, { "epoch": 1.5229733684518583, "grad_norm": 0.00021944883337710053, "learning_rate": 1.192566578870354e-05, "loss": 0.0, "step": 5204 }, { "epoch": 1.5232660228270412, "grad_norm": 0.00021907799236942083, "learning_rate": 1.1918349429323968e-05, "loss": 0.0, "step": 5205 }, { "epoch": 1.523558677202224, "grad_norm": 0.001510326866991818, "learning_rate": 1.1911033069944396e-05, "loss": 0.0, "step": 5206 }, { "epoch": 1.523851331577407, "grad_norm": 6.531454710057005e-05, "learning_rate": 1.1903716710564823e-05, "loss": 0.0, "step": 5207 }, { "epoch": 1.52414398595259, "grad_norm": 0.000475716165965423, "learning_rate": 1.189640035118525e-05, "loss": 0.0, "step": 5208 }, { "epoch": 1.524436640327773, "grad_norm": 0.0007988332072272897, "learning_rate": 1.1889083991805679e-05, "loss": 0.0, "step": 5209 }, { "epoch": 1.5247292947029558, "grad_norm": 0.0004497581103350967, "learning_rate": 1.1881767632426105e-05, "loss": 0.0, "step": 5210 }, { "epoch": 1.5250219490781387, "grad_norm": 0.00024964759359136224, "learning_rate": 1.1874451273046533e-05, "loss": 0.0, "step": 5211 }, { "epoch": 1.5253146034533216, "grad_norm": 0.0013601906830444932, "learning_rate": 1.1867134913666959e-05, "loss": 0.0, "step": 5212 }, { "epoch": 1.5256072578285045, "grad_norm": 0.0006484125624410808, "learning_rate": 1.1859818554287387e-05, "loss": 0.0, "step": 5213 }, { "epoch": 1.5258999122036876, "grad_norm": 0.0006421684520319104, "learning_rate": 1.1852502194907815e-05, "loss": 0.0, "step": 5214 }, { "epoch": 1.5261925665788705, "grad_norm": 0.00012406571477185935, "learning_rate": 1.1845185835528241e-05, "loss": 0.0, "step": 5215 }, { "epoch": 1.5264852209540534, "grad_norm": 0.00018810993060469627, "learning_rate": 1.1837869476148669e-05, "loss": 0.0, "step": 5216 }, { "epoch": 1.5267778753292363, "grad_norm": 0.00022501441708300263, "learning_rate": 1.1830553116769097e-05, "loss": 0.0, "step": 5217 }, { "epoch": 1.5270705297044191, "grad_norm": 0.00022160427761264145, "learning_rate": 1.1823236757389523e-05, "loss": 0.0, "step": 5218 }, { "epoch": 1.527363184079602, "grad_norm": 0.0007525791297666728, "learning_rate": 1.1815920398009951e-05, "loss": 0.0, "step": 5219 }, { "epoch": 1.527655838454785, "grad_norm": 0.001195007935166359, "learning_rate": 1.1808604038630378e-05, "loss": 0.0, "step": 5220 }, { "epoch": 1.5279484928299678, "grad_norm": 0.00014784677478019148, "learning_rate": 1.1801287679250805e-05, "loss": 0.0, "step": 5221 }, { "epoch": 1.5282411472051507, "grad_norm": 0.0003311718173790723, "learning_rate": 1.1793971319871233e-05, "loss": 0.0, "step": 5222 }, { "epoch": 1.5285338015803336, "grad_norm": 0.012646566145122051, "learning_rate": 1.178665496049166e-05, "loss": 0.0001, "step": 5223 }, { "epoch": 1.5288264559555165, "grad_norm": 0.0003175755846314132, "learning_rate": 1.1779338601112088e-05, "loss": 0.0, "step": 5224 }, { "epoch": 1.5291191103306994, "grad_norm": 0.0007361461757682264, "learning_rate": 1.1772022241732514e-05, "loss": 0.0, "step": 5225 }, { "epoch": 1.5294117647058822, "grad_norm": 0.002891580108553171, "learning_rate": 1.1764705882352942e-05, "loss": 0.0, "step": 5226 }, { "epoch": 1.5297044190810651, "grad_norm": 0.01717713475227356, "learning_rate": 1.175738952297337e-05, "loss": 0.0001, "step": 5227 }, { "epoch": 1.529997073456248, "grad_norm": 0.0002141928707715124, "learning_rate": 1.1750073163593796e-05, "loss": 0.0, "step": 5228 }, { "epoch": 1.5302897278314311, "grad_norm": 0.00033819713280536234, "learning_rate": 1.1742756804214224e-05, "loss": 0.0, "step": 5229 }, { "epoch": 1.530582382206614, "grad_norm": 0.00022239873942453414, "learning_rate": 1.1735440444834652e-05, "loss": 0.0, "step": 5230 }, { "epoch": 1.530875036581797, "grad_norm": 0.00019714840163942426, "learning_rate": 1.1728124085455078e-05, "loss": 0.0, "step": 5231 }, { "epoch": 1.5311676909569798, "grad_norm": 0.0002605865884106606, "learning_rate": 1.1720807726075506e-05, "loss": 0.0, "step": 5232 }, { "epoch": 1.5314603453321627, "grad_norm": 0.00030124778277240694, "learning_rate": 1.1713491366695932e-05, "loss": 0.0, "step": 5233 }, { "epoch": 1.5317529997073456, "grad_norm": 0.0002326490357518196, "learning_rate": 1.170617500731636e-05, "loss": 0.0, "step": 5234 }, { "epoch": 1.5320456540825287, "grad_norm": 0.0002671336114872247, "learning_rate": 1.1698858647936788e-05, "loss": 0.0, "step": 5235 }, { "epoch": 1.5323383084577116, "grad_norm": 7.774143887218088e-05, "learning_rate": 1.1691542288557215e-05, "loss": 0.0, "step": 5236 }, { "epoch": 1.5326309628328945, "grad_norm": 0.000155685018398799, "learning_rate": 1.1684225929177643e-05, "loss": 0.0, "step": 5237 }, { "epoch": 1.5329236172080773, "grad_norm": 0.0005528471665456891, "learning_rate": 1.1676909569798069e-05, "loss": 0.0, "step": 5238 }, { "epoch": 1.5332162715832602, "grad_norm": 0.0002706719096750021, "learning_rate": 1.1669593210418497e-05, "loss": 0.0, "step": 5239 }, { "epoch": 1.5335089259584431, "grad_norm": 0.0028580750804394484, "learning_rate": 1.1662276851038923e-05, "loss": 0.0, "step": 5240 }, { "epoch": 1.533801580333626, "grad_norm": 0.0005850521847605705, "learning_rate": 1.1654960491659351e-05, "loss": 0.0, "step": 5241 }, { "epoch": 1.534094234708809, "grad_norm": 0.00022836266725789756, "learning_rate": 1.1647644132279777e-05, "loss": 0.0, "step": 5242 }, { "epoch": 1.5343868890839918, "grad_norm": 0.0002294753649039194, "learning_rate": 1.1640327772900205e-05, "loss": 0.0, "step": 5243 }, { "epoch": 1.5346795434591747, "grad_norm": 0.0001081616137525998, "learning_rate": 1.1633011413520631e-05, "loss": 0.0, "step": 5244 }, { "epoch": 1.5349721978343576, "grad_norm": 0.00021684737293981016, "learning_rate": 1.162569505414106e-05, "loss": 0.0, "step": 5245 }, { "epoch": 1.5352648522095405, "grad_norm": 0.15254947543144226, "learning_rate": 1.1618378694761487e-05, "loss": 0.0005, "step": 5246 }, { "epoch": 1.5355575065847233, "grad_norm": 0.0001880263298517093, "learning_rate": 1.1611062335381914e-05, "loss": 0.0, "step": 5247 }, { "epoch": 1.5358501609599062, "grad_norm": 0.00023818780027795583, "learning_rate": 1.1603745976002341e-05, "loss": 0.0, "step": 5248 }, { "epoch": 1.5361428153350891, "grad_norm": 0.0008488367311656475, "learning_rate": 1.1596429616622768e-05, "loss": 0.0, "step": 5249 }, { "epoch": 1.536435469710272, "grad_norm": 0.00025859667221084237, "learning_rate": 1.1589113257243196e-05, "loss": 0.0, "step": 5250 }, { "epoch": 1.5367281240854551, "grad_norm": 0.0006804241565987468, "learning_rate": 1.1581796897863624e-05, "loss": 0.0, "step": 5251 }, { "epoch": 1.537020778460638, "grad_norm": 0.0003907028876710683, "learning_rate": 1.157448053848405e-05, "loss": 0.0, "step": 5252 }, { "epoch": 1.537313432835821, "grad_norm": 0.0006528544472530484, "learning_rate": 1.1567164179104478e-05, "loss": 0.0, "step": 5253 }, { "epoch": 1.5376060872110038, "grad_norm": 0.0007995760533958673, "learning_rate": 1.1559847819724906e-05, "loss": 0.0, "step": 5254 }, { "epoch": 1.5378987415861867, "grad_norm": 0.00013504792877938598, "learning_rate": 1.1552531460345332e-05, "loss": 0.0, "step": 5255 }, { "epoch": 1.5381913959613698, "grad_norm": 0.0011998468544334173, "learning_rate": 1.154521510096576e-05, "loss": 0.0, "step": 5256 }, { "epoch": 1.5384840503365527, "grad_norm": 0.00048505334416404366, "learning_rate": 1.1537898741586186e-05, "loss": 0.0, "step": 5257 }, { "epoch": 1.5387767047117356, "grad_norm": 0.0005926922312937677, "learning_rate": 1.1530582382206614e-05, "loss": 0.0, "step": 5258 }, { "epoch": 1.5390693590869184, "grad_norm": 0.0023627367336302996, "learning_rate": 1.1523266022827042e-05, "loss": 0.0, "step": 5259 }, { "epoch": 1.5393620134621013, "grad_norm": 0.00027779172523878515, "learning_rate": 1.1515949663447468e-05, "loss": 0.0, "step": 5260 }, { "epoch": 1.5396546678372842, "grad_norm": 0.0007822969928383827, "learning_rate": 1.1508633304067896e-05, "loss": 0.0, "step": 5261 }, { "epoch": 1.539947322212467, "grad_norm": 7.334064483642578, "learning_rate": 1.1501316944688324e-05, "loss": 0.1488, "step": 5262 }, { "epoch": 1.54023997658765, "grad_norm": 0.0007610148168168962, "learning_rate": 1.149400058530875e-05, "loss": 0.0, "step": 5263 }, { "epoch": 1.5405326309628329, "grad_norm": 0.00011353415175108239, "learning_rate": 1.1486684225929179e-05, "loss": 0.0, "step": 5264 }, { "epoch": 1.5408252853380158, "grad_norm": 0.0006136444280855358, "learning_rate": 1.1479367866549605e-05, "loss": 0.0, "step": 5265 }, { "epoch": 1.5411179397131987, "grad_norm": 0.00028312645736150444, "learning_rate": 1.1472051507170033e-05, "loss": 0.0, "step": 5266 }, { "epoch": 1.5414105940883815, "grad_norm": 0.0005532561335712671, "learning_rate": 1.146473514779046e-05, "loss": 0.0, "step": 5267 }, { "epoch": 1.5417032484635644, "grad_norm": 0.0004004383517894894, "learning_rate": 1.1457418788410887e-05, "loss": 0.0, "step": 5268 }, { "epoch": 1.5419959028387473, "grad_norm": 0.001884422730654478, "learning_rate": 1.1450102429031315e-05, "loss": 0.0, "step": 5269 }, { "epoch": 1.5422885572139302, "grad_norm": 0.0054294029250741005, "learning_rate": 1.1442786069651743e-05, "loss": 0.0001, "step": 5270 }, { "epoch": 1.542581211589113, "grad_norm": 0.0014466885477304459, "learning_rate": 1.1435469710272169e-05, "loss": 0.0, "step": 5271 }, { "epoch": 1.5428738659642962, "grad_norm": 0.000630267895758152, "learning_rate": 1.1428153350892597e-05, "loss": 0.0, "step": 5272 }, { "epoch": 1.543166520339479, "grad_norm": 0.004547810181975365, "learning_rate": 1.1420836991513023e-05, "loss": 0.0, "step": 5273 }, { "epoch": 1.543459174714662, "grad_norm": 0.00033015169901773334, "learning_rate": 1.1413520632133451e-05, "loss": 0.0, "step": 5274 }, { "epoch": 1.5437518290898449, "grad_norm": 0.0033606109209358692, "learning_rate": 1.140620427275388e-05, "loss": 0.0, "step": 5275 }, { "epoch": 1.5440444834650278, "grad_norm": 0.0004756157286465168, "learning_rate": 1.1398887913374305e-05, "loss": 0.0, "step": 5276 }, { "epoch": 1.5443371378402109, "grad_norm": 0.0032487944699823856, "learning_rate": 1.1391571553994733e-05, "loss": 0.0, "step": 5277 }, { "epoch": 1.5446297922153938, "grad_norm": 0.00032525573624297976, "learning_rate": 1.1384255194615161e-05, "loss": 0.0, "step": 5278 }, { "epoch": 1.5449224465905766, "grad_norm": 0.002686867257580161, "learning_rate": 1.1376938835235588e-05, "loss": 0.0, "step": 5279 }, { "epoch": 1.5452151009657595, "grad_norm": 0.0040970370173454285, "learning_rate": 1.1369622475856016e-05, "loss": 0.0001, "step": 5280 }, { "epoch": 1.5455077553409424, "grad_norm": 0.0005278911557979882, "learning_rate": 1.1362306116476442e-05, "loss": 0.0, "step": 5281 }, { "epoch": 1.5458004097161253, "grad_norm": 0.0020878189243376255, "learning_rate": 1.135498975709687e-05, "loss": 0.0, "step": 5282 }, { "epoch": 1.5460930640913082, "grad_norm": 0.0007402309565804899, "learning_rate": 1.1347673397717298e-05, "loss": 0.0, "step": 5283 }, { "epoch": 1.546385718466491, "grad_norm": 0.1317036747932434, "learning_rate": 1.1340357038337724e-05, "loss": 0.0005, "step": 5284 }, { "epoch": 1.546678372841674, "grad_norm": 0.001256045768968761, "learning_rate": 1.1333040678958152e-05, "loss": 0.0, "step": 5285 }, { "epoch": 1.5469710272168569, "grad_norm": 0.0002911437186412513, "learning_rate": 1.1325724319578578e-05, "loss": 0.0, "step": 5286 }, { "epoch": 1.5472636815920398, "grad_norm": 0.000973232788965106, "learning_rate": 1.1318407960199006e-05, "loss": 0.0, "step": 5287 }, { "epoch": 1.5475563359672226, "grad_norm": 0.0005813579773530364, "learning_rate": 1.1311091600819432e-05, "loss": 0.0, "step": 5288 }, { "epoch": 1.5478489903424055, "grad_norm": 0.0004719037970062345, "learning_rate": 1.130377524143986e-05, "loss": 0.0, "step": 5289 }, { "epoch": 1.5481416447175884, "grad_norm": 0.0004831487312912941, "learning_rate": 1.1296458882060287e-05, "loss": 0.0, "step": 5290 }, { "epoch": 1.5484342990927713, "grad_norm": 0.00042136141564697027, "learning_rate": 1.1289142522680715e-05, "loss": 0.0, "step": 5291 }, { "epoch": 1.5487269534679542, "grad_norm": 0.001426449278369546, "learning_rate": 1.128182616330114e-05, "loss": 0.0, "step": 5292 }, { "epoch": 1.5490196078431373, "grad_norm": 0.0014369417913258076, "learning_rate": 1.1274509803921569e-05, "loss": 0.0, "step": 5293 }, { "epoch": 1.5493122622183202, "grad_norm": 0.0026862751692533493, "learning_rate": 1.1267193444541995e-05, "loss": 0.0, "step": 5294 }, { "epoch": 1.549604916593503, "grad_norm": 0.001535497372969985, "learning_rate": 1.1259877085162423e-05, "loss": 0.0, "step": 5295 }, { "epoch": 1.549897570968686, "grad_norm": 0.06716033816337585, "learning_rate": 1.1252560725782851e-05, "loss": 0.0004, "step": 5296 }, { "epoch": 1.5501902253438689, "grad_norm": 0.0018338051158934832, "learning_rate": 1.1245244366403277e-05, "loss": 0.0, "step": 5297 }, { "epoch": 1.550482879719052, "grad_norm": 0.009203138761222363, "learning_rate": 1.1237928007023705e-05, "loss": 0.0001, "step": 5298 }, { "epoch": 1.5507755340942349, "grad_norm": 0.0012893445091322064, "learning_rate": 1.1230611647644133e-05, "loss": 0.0, "step": 5299 }, { "epoch": 1.5510681884694177, "grad_norm": 0.00018410121265333146, "learning_rate": 1.122329528826456e-05, "loss": 0.0, "step": 5300 }, { "epoch": 1.5513608428446006, "grad_norm": 22.063335418701172, "learning_rate": 1.1215978928884987e-05, "loss": 0.1073, "step": 5301 }, { "epoch": 1.5516534972197835, "grad_norm": 0.00047670266940258443, "learning_rate": 1.1208662569505414e-05, "loss": 0.0, "step": 5302 }, { "epoch": 1.5519461515949664, "grad_norm": 3.2325854301452637, "learning_rate": 1.1201346210125841e-05, "loss": 0.2646, "step": 5303 }, { "epoch": 1.5522388059701493, "grad_norm": 0.002112477319315076, "learning_rate": 1.119402985074627e-05, "loss": 0.0, "step": 5304 }, { "epoch": 1.5525314603453322, "grad_norm": 0.0007211135234683752, "learning_rate": 1.1186713491366696e-05, "loss": 0.0, "step": 5305 }, { "epoch": 1.552824114720515, "grad_norm": 0.000910220667719841, "learning_rate": 1.1179397131987124e-05, "loss": 0.0, "step": 5306 }, { "epoch": 1.553116769095698, "grad_norm": 0.08611762523651123, "learning_rate": 1.1172080772607552e-05, "loss": 0.0004, "step": 5307 }, { "epoch": 1.5534094234708808, "grad_norm": 0.2647046148777008, "learning_rate": 1.1164764413227978e-05, "loss": 0.0009, "step": 5308 }, { "epoch": 1.5537020778460637, "grad_norm": 16.1777400970459, "learning_rate": 1.1157448053848406e-05, "loss": 0.0437, "step": 5309 }, { "epoch": 1.5539947322212466, "grad_norm": 0.000800949230324477, "learning_rate": 1.1150131694468832e-05, "loss": 0.0, "step": 5310 }, { "epoch": 1.5542873865964295, "grad_norm": 0.0012882291339337826, "learning_rate": 1.114281533508926e-05, "loss": 0.0, "step": 5311 }, { "epoch": 1.5545800409716124, "grad_norm": 0.00023692568356636912, "learning_rate": 1.1135498975709688e-05, "loss": 0.0, "step": 5312 }, { "epoch": 1.5548726953467953, "grad_norm": 0.0003705483686644584, "learning_rate": 1.1128182616330114e-05, "loss": 0.0, "step": 5313 }, { "epoch": 1.5551653497219784, "grad_norm": 0.1628727912902832, "learning_rate": 1.1120866256950542e-05, "loss": 0.0008, "step": 5314 }, { "epoch": 1.5554580040971613, "grad_norm": 0.0014205491170287132, "learning_rate": 1.111354989757097e-05, "loss": 0.0, "step": 5315 }, { "epoch": 1.5557506584723442, "grad_norm": 0.0007606131257489324, "learning_rate": 1.1106233538191396e-05, "loss": 0.0, "step": 5316 }, { "epoch": 1.556043312847527, "grad_norm": 0.41100841760635376, "learning_rate": 1.1098917178811824e-05, "loss": 0.0016, "step": 5317 }, { "epoch": 1.55633596722271, "grad_norm": 0.003020121483132243, "learning_rate": 1.109160081943225e-05, "loss": 0.0001, "step": 5318 }, { "epoch": 1.5566286215978928, "grad_norm": 0.0013891629641875625, "learning_rate": 1.1084284460052679e-05, "loss": 0.0, "step": 5319 }, { "epoch": 1.556921275973076, "grad_norm": 0.0006036332342773676, "learning_rate": 1.1076968100673106e-05, "loss": 0.0, "step": 5320 }, { "epoch": 1.5572139303482588, "grad_norm": 0.0015644223894923925, "learning_rate": 1.1069651741293533e-05, "loss": 0.0, "step": 5321 }, { "epoch": 1.5575065847234417, "grad_norm": 0.02502756007015705, "learning_rate": 1.106233538191396e-05, "loss": 0.0002, "step": 5322 }, { "epoch": 1.5577992390986246, "grad_norm": 0.0007970908773131669, "learning_rate": 1.1055019022534389e-05, "loss": 0.0, "step": 5323 }, { "epoch": 1.5580918934738075, "grad_norm": 0.000541463028639555, "learning_rate": 1.1047702663154815e-05, "loss": 0.0, "step": 5324 }, { "epoch": 1.5583845478489904, "grad_norm": 0.0011445097625255585, "learning_rate": 1.1040386303775243e-05, "loss": 0.0, "step": 5325 }, { "epoch": 1.5586772022241733, "grad_norm": 0.006099475547671318, "learning_rate": 1.1033069944395669e-05, "loss": 0.0001, "step": 5326 }, { "epoch": 1.5589698565993562, "grad_norm": 0.009510427713394165, "learning_rate": 1.1025753585016097e-05, "loss": 0.0001, "step": 5327 }, { "epoch": 1.559262510974539, "grad_norm": 0.002174089662730694, "learning_rate": 1.1018437225636525e-05, "loss": 0.0, "step": 5328 }, { "epoch": 1.559555165349722, "grad_norm": 0.0037814818788319826, "learning_rate": 1.1011120866256951e-05, "loss": 0.0001, "step": 5329 }, { "epoch": 1.5598478197249048, "grad_norm": 0.0018479940481483936, "learning_rate": 1.100380450687738e-05, "loss": 0.0, "step": 5330 }, { "epoch": 1.5601404741000877, "grad_norm": 0.0019330501090735197, "learning_rate": 1.0996488147497805e-05, "loss": 0.0, "step": 5331 }, { "epoch": 1.5604331284752706, "grad_norm": 0.029701590538024902, "learning_rate": 1.0989171788118233e-05, "loss": 0.0002, "step": 5332 }, { "epoch": 1.5607257828504535, "grad_norm": 0.0005212106043472886, "learning_rate": 1.098185542873866e-05, "loss": 0.0, "step": 5333 }, { "epoch": 1.5610184372256364, "grad_norm": 0.0018254711758345366, "learning_rate": 1.0974539069359088e-05, "loss": 0.0, "step": 5334 }, { "epoch": 1.5613110916008193, "grad_norm": 0.0021136414725333452, "learning_rate": 1.0967222709979514e-05, "loss": 0.0, "step": 5335 }, { "epoch": 1.5616037459760024, "grad_norm": 0.0023741601034998894, "learning_rate": 1.0959906350599942e-05, "loss": 0.0001, "step": 5336 }, { "epoch": 1.5618964003511853, "grad_norm": 0.011876046657562256, "learning_rate": 1.0952589991220368e-05, "loss": 0.0001, "step": 5337 }, { "epoch": 1.5621890547263682, "grad_norm": 0.0011572998482733965, "learning_rate": 1.0945273631840796e-05, "loss": 0.0, "step": 5338 }, { "epoch": 1.562481709101551, "grad_norm": 0.012459277175366879, "learning_rate": 1.0937957272461224e-05, "loss": 0.0001, "step": 5339 }, { "epoch": 1.562774363476734, "grad_norm": 0.01071714423596859, "learning_rate": 1.093064091308165e-05, "loss": 0.0001, "step": 5340 }, { "epoch": 1.563067017851917, "grad_norm": 0.0024051477666944265, "learning_rate": 1.0923324553702078e-05, "loss": 0.0, "step": 5341 }, { "epoch": 1.5633596722271, "grad_norm": 0.00044596398947760463, "learning_rate": 1.0916008194322504e-05, "loss": 0.0, "step": 5342 }, { "epoch": 1.5636523266022828, "grad_norm": 0.00155029003508389, "learning_rate": 1.0908691834942932e-05, "loss": 0.0, "step": 5343 }, { "epoch": 1.5639449809774657, "grad_norm": 0.0009954465785995126, "learning_rate": 1.090137547556336e-05, "loss": 0.0, "step": 5344 }, { "epoch": 1.5642376353526486, "grad_norm": 0.0013960065552964807, "learning_rate": 1.0894059116183787e-05, "loss": 0.0, "step": 5345 }, { "epoch": 1.5645302897278315, "grad_norm": 0.0006535080610774457, "learning_rate": 1.0886742756804215e-05, "loss": 0.0, "step": 5346 }, { "epoch": 1.5648229441030144, "grad_norm": 0.0018199041951447725, "learning_rate": 1.0879426397424643e-05, "loss": 0.0, "step": 5347 }, { "epoch": 1.5651155984781973, "grad_norm": 0.007253793999552727, "learning_rate": 1.0872110038045069e-05, "loss": 0.0001, "step": 5348 }, { "epoch": 1.5654082528533801, "grad_norm": 0.004021256230771542, "learning_rate": 1.0864793678665497e-05, "loss": 0.0, "step": 5349 }, { "epoch": 1.565700907228563, "grad_norm": 0.002721029333770275, "learning_rate": 1.0857477319285923e-05, "loss": 0.0001, "step": 5350 }, { "epoch": 1.565993561603746, "grad_norm": 0.000878484221175313, "learning_rate": 1.0850160959906351e-05, "loss": 0.0, "step": 5351 }, { "epoch": 1.5662862159789288, "grad_norm": 0.0012332138139754534, "learning_rate": 1.0842844600526779e-05, "loss": 0.0, "step": 5352 }, { "epoch": 1.5665788703541117, "grad_norm": 0.0008330278797075152, "learning_rate": 1.0835528241147205e-05, "loss": 0.0, "step": 5353 }, { "epoch": 1.5668715247292946, "grad_norm": 0.0025928113609552383, "learning_rate": 1.0828211881767633e-05, "loss": 0.0001, "step": 5354 }, { "epoch": 1.5671641791044775, "grad_norm": 0.0011688877129927278, "learning_rate": 1.082089552238806e-05, "loss": 0.0, "step": 5355 }, { "epoch": 1.5674568334796604, "grad_norm": 0.0014505028957501054, "learning_rate": 1.0813579163008487e-05, "loss": 0.0, "step": 5356 }, { "epoch": 1.5677494878548435, "grad_norm": 0.0016532718436792493, "learning_rate": 1.0806262803628915e-05, "loss": 0.0, "step": 5357 }, { "epoch": 1.5680421422300264, "grad_norm": 0.007025901693850756, "learning_rate": 1.0798946444249341e-05, "loss": 0.0001, "step": 5358 }, { "epoch": 1.5683347966052092, "grad_norm": 0.001686119707301259, "learning_rate": 1.079163008486977e-05, "loss": 0.0, "step": 5359 }, { "epoch": 1.5686274509803921, "grad_norm": 0.0013476323802024126, "learning_rate": 1.0784313725490197e-05, "loss": 0.0, "step": 5360 }, { "epoch": 1.568920105355575, "grad_norm": 0.0018852063221856952, "learning_rate": 1.0776997366110624e-05, "loss": 0.0001, "step": 5361 }, { "epoch": 1.5692127597307581, "grad_norm": 0.0031956795137375593, "learning_rate": 1.0769681006731052e-05, "loss": 0.0001, "step": 5362 }, { "epoch": 1.569505414105941, "grad_norm": 0.0007793364929966629, "learning_rate": 1.0762364647351478e-05, "loss": 0.0, "step": 5363 }, { "epoch": 1.569798068481124, "grad_norm": 0.004618383478373289, "learning_rate": 1.0755048287971906e-05, "loss": 0.0001, "step": 5364 }, { "epoch": 1.5700907228563068, "grad_norm": 0.04225985333323479, "learning_rate": 1.0747731928592334e-05, "loss": 0.0002, "step": 5365 }, { "epoch": 1.5703833772314897, "grad_norm": 0.000692862959112972, "learning_rate": 1.074041556921276e-05, "loss": 0.0, "step": 5366 }, { "epoch": 1.5706760316066726, "grad_norm": 0.0008593127131462097, "learning_rate": 1.0733099209833188e-05, "loss": 0.0, "step": 5367 }, { "epoch": 1.5709686859818555, "grad_norm": 0.001766446977853775, "learning_rate": 1.0725782850453616e-05, "loss": 0.0, "step": 5368 }, { "epoch": 1.5712613403570383, "grad_norm": 0.0005354926688596606, "learning_rate": 1.0718466491074042e-05, "loss": 0.0, "step": 5369 }, { "epoch": 1.5715539947322212, "grad_norm": 0.0015676389448344707, "learning_rate": 1.071115013169447e-05, "loss": 0.0, "step": 5370 }, { "epoch": 1.5718466491074041, "grad_norm": 0.0008060939144343138, "learning_rate": 1.0703833772314896e-05, "loss": 0.0, "step": 5371 }, { "epoch": 1.572139303482587, "grad_norm": 0.0022210762836039066, "learning_rate": 1.0696517412935324e-05, "loss": 0.0, "step": 5372 }, { "epoch": 1.57243195785777, "grad_norm": 0.0008389541762880981, "learning_rate": 1.0689201053555752e-05, "loss": 0.0, "step": 5373 }, { "epoch": 1.5727246122329528, "grad_norm": 0.0008097761892713606, "learning_rate": 1.0681884694176179e-05, "loss": 0.0, "step": 5374 }, { "epoch": 1.5730172666081357, "grad_norm": 0.00037051280378364027, "learning_rate": 1.0674568334796606e-05, "loss": 0.0, "step": 5375 }, { "epoch": 1.5733099209833186, "grad_norm": 0.0004782450560014695, "learning_rate": 1.0667251975417033e-05, "loss": 0.0, "step": 5376 }, { "epoch": 1.5736025753585015, "grad_norm": 0.0011488194577395916, "learning_rate": 1.065993561603746e-05, "loss": 0.0, "step": 5377 }, { "epoch": 1.5738952297336846, "grad_norm": 0.0013666459126397967, "learning_rate": 1.0652619256657887e-05, "loss": 0.0, "step": 5378 }, { "epoch": 1.5741878841088675, "grad_norm": 0.000563440378755331, "learning_rate": 1.0645302897278315e-05, "loss": 0.0, "step": 5379 }, { "epoch": 1.5744805384840503, "grad_norm": 0.013980953954160213, "learning_rate": 1.0637986537898743e-05, "loss": 0.0001, "step": 5380 }, { "epoch": 1.5747731928592332, "grad_norm": 0.0008600152796134353, "learning_rate": 1.0630670178519169e-05, "loss": 0.0, "step": 5381 }, { "epoch": 1.5750658472344161, "grad_norm": 0.0011449077865108848, "learning_rate": 1.0623353819139597e-05, "loss": 0.0, "step": 5382 }, { "epoch": 1.5753585016095992, "grad_norm": 0.0003296992217656225, "learning_rate": 1.0616037459760023e-05, "loss": 0.0, "step": 5383 }, { "epoch": 1.5756511559847821, "grad_norm": 0.0017691970570012927, "learning_rate": 1.0608721100380451e-05, "loss": 0.0, "step": 5384 }, { "epoch": 1.575943810359965, "grad_norm": 0.0010965262772515416, "learning_rate": 1.0601404741000878e-05, "loss": 0.0, "step": 5385 }, { "epoch": 1.576236464735148, "grad_norm": 0.0016302278963848948, "learning_rate": 1.0594088381621305e-05, "loss": 0.0, "step": 5386 }, { "epoch": 1.5765291191103308, "grad_norm": 0.0007256264798343182, "learning_rate": 1.0586772022241732e-05, "loss": 0.0, "step": 5387 }, { "epoch": 1.5768217734855137, "grad_norm": 0.0013906812528148293, "learning_rate": 1.057945566286216e-05, "loss": 0.0, "step": 5388 }, { "epoch": 1.5771144278606966, "grad_norm": 0.013793290592730045, "learning_rate": 1.0572139303482588e-05, "loss": 0.0001, "step": 5389 }, { "epoch": 1.5774070822358794, "grad_norm": 0.0006986415246501565, "learning_rate": 1.0564822944103014e-05, "loss": 0.0, "step": 5390 }, { "epoch": 1.5776997366110623, "grad_norm": 3.0798137187957764, "learning_rate": 1.0557506584723442e-05, "loss": 0.2331, "step": 5391 }, { "epoch": 1.5779923909862452, "grad_norm": 0.0024522338062524796, "learning_rate": 1.055019022534387e-05, "loss": 0.0, "step": 5392 }, { "epoch": 1.578285045361428, "grad_norm": 0.0007080997456796467, "learning_rate": 1.0542873865964296e-05, "loss": 0.0, "step": 5393 }, { "epoch": 1.578577699736611, "grad_norm": 0.0005693368148058653, "learning_rate": 1.0535557506584724e-05, "loss": 0.0, "step": 5394 }, { "epoch": 1.5788703541117939, "grad_norm": 0.0035647971089929342, "learning_rate": 1.052824114720515e-05, "loss": 0.0001, "step": 5395 }, { "epoch": 1.5791630084869768, "grad_norm": 0.0051018777303397655, "learning_rate": 1.0520924787825578e-05, "loss": 0.0001, "step": 5396 }, { "epoch": 1.5794556628621597, "grad_norm": 0.01608877442777157, "learning_rate": 1.0513608428446006e-05, "loss": 0.0003, "step": 5397 }, { "epoch": 1.5797483172373425, "grad_norm": 0.024239428341388702, "learning_rate": 1.0506292069066432e-05, "loss": 0.0003, "step": 5398 }, { "epoch": 1.5800409716125257, "grad_norm": 0.023922836408019066, "learning_rate": 1.049897570968686e-05, "loss": 0.0004, "step": 5399 }, { "epoch": 1.5803336259877085, "grad_norm": 0.040479667484760284, "learning_rate": 1.0491659350307288e-05, "loss": 0.0007, "step": 5400 }, { "epoch": 1.5806262803628914, "grad_norm": 0.026322608813643456, "learning_rate": 1.0484342990927715e-05, "loss": 0.0005, "step": 5401 }, { "epoch": 1.5809189347380743, "grad_norm": 0.016565097495913506, "learning_rate": 1.0477026631548143e-05, "loss": 0.0003, "step": 5402 }, { "epoch": 1.5812115891132572, "grad_norm": 0.014207043685019016, "learning_rate": 1.0469710272168569e-05, "loss": 0.0002, "step": 5403 }, { "epoch": 1.58150424348844, "grad_norm": 4.747661590576172, "learning_rate": 1.0462393912788997e-05, "loss": 0.1737, "step": 5404 }, { "epoch": 1.5817968978636232, "grad_norm": 0.03960366174578667, "learning_rate": 1.0455077553409425e-05, "loss": 0.0007, "step": 5405 }, { "epoch": 1.582089552238806, "grad_norm": 0.053216978907585144, "learning_rate": 1.0447761194029851e-05, "loss": 0.001, "step": 5406 }, { "epoch": 1.582382206613989, "grad_norm": 0.01265775691717863, "learning_rate": 1.0440444834650279e-05, "loss": 0.0002, "step": 5407 }, { "epoch": 1.5826748609891719, "grad_norm": 0.01505838893353939, "learning_rate": 1.0433128475270707e-05, "loss": 0.0003, "step": 5408 }, { "epoch": 1.5829675153643548, "grad_norm": 0.01124452706426382, "learning_rate": 1.0425812115891133e-05, "loss": 0.0003, "step": 5409 }, { "epoch": 1.5832601697395376, "grad_norm": 0.015584814362227917, "learning_rate": 1.0418495756511561e-05, "loss": 0.0003, "step": 5410 }, { "epoch": 1.5835528241147205, "grad_norm": 0.01889246329665184, "learning_rate": 1.0411179397131987e-05, "loss": 0.0004, "step": 5411 }, { "epoch": 1.5838454784899034, "grad_norm": 0.0218829195946455, "learning_rate": 1.0403863037752415e-05, "loss": 0.0005, "step": 5412 }, { "epoch": 1.5841381328650863, "grad_norm": 0.005514830816537142, "learning_rate": 1.0396546678372843e-05, "loss": 0.0001, "step": 5413 }, { "epoch": 1.5844307872402692, "grad_norm": 0.36543434858322144, "learning_rate": 1.038923031899327e-05, "loss": 0.0017, "step": 5414 }, { "epoch": 1.584723441615452, "grad_norm": 0.00626228516921401, "learning_rate": 1.0381913959613697e-05, "loss": 0.0001, "step": 5415 }, { "epoch": 1.585016095990635, "grad_norm": 0.007346426136791706, "learning_rate": 1.0374597600234124e-05, "loss": 0.0001, "step": 5416 }, { "epoch": 1.5853087503658179, "grad_norm": 0.0025608576834201813, "learning_rate": 1.0367281240854552e-05, "loss": 0.0001, "step": 5417 }, { "epoch": 1.5856014047410008, "grad_norm": 0.0082298768684268, "learning_rate": 1.035996488147498e-05, "loss": 0.0002, "step": 5418 }, { "epoch": 1.5858940591161836, "grad_norm": 0.004710091277956963, "learning_rate": 1.0352648522095406e-05, "loss": 0.0001, "step": 5419 }, { "epoch": 1.5861867134913668, "grad_norm": 0.007210554555058479, "learning_rate": 1.0345332162715834e-05, "loss": 0.0002, "step": 5420 }, { "epoch": 1.5864793678665496, "grad_norm": 0.010958317667245865, "learning_rate": 1.0338015803336262e-05, "loss": 0.0003, "step": 5421 }, { "epoch": 1.5867720222417325, "grad_norm": 0.011359517462551594, "learning_rate": 1.0330699443956688e-05, "loss": 0.0002, "step": 5422 }, { "epoch": 1.5870646766169154, "grad_norm": 0.003848788794130087, "learning_rate": 1.0323383084577116e-05, "loss": 0.0001, "step": 5423 }, { "epoch": 1.5873573309920983, "grad_norm": 0.012078369036316872, "learning_rate": 1.0316066725197542e-05, "loss": 0.0002, "step": 5424 }, { "epoch": 1.5876499853672812, "grad_norm": 0.007848396897315979, "learning_rate": 1.030875036581797e-05, "loss": 0.0002, "step": 5425 }, { "epoch": 1.5879426397424643, "grad_norm": 0.009114718064665794, "learning_rate": 1.0301434006438396e-05, "loss": 0.0002, "step": 5426 }, { "epoch": 1.5882352941176472, "grad_norm": 0.001855672337114811, "learning_rate": 1.0294117647058824e-05, "loss": 0.0, "step": 5427 }, { "epoch": 1.58852794849283, "grad_norm": 0.0008037255611270666, "learning_rate": 1.028680128767925e-05, "loss": 0.0, "step": 5428 }, { "epoch": 1.588820602868013, "grad_norm": 0.004607463255524635, "learning_rate": 1.0279484928299679e-05, "loss": 0.0001, "step": 5429 }, { "epoch": 1.5891132572431959, "grad_norm": 0.02064778096973896, "learning_rate": 1.0272168568920105e-05, "loss": 0.0003, "step": 5430 }, { "epoch": 1.5894059116183787, "grad_norm": 0.007519340142607689, "learning_rate": 1.0264852209540533e-05, "loss": 0.0002, "step": 5431 }, { "epoch": 1.5896985659935616, "grad_norm": 0.004054190590977669, "learning_rate": 1.0257535850160959e-05, "loss": 0.0001, "step": 5432 }, { "epoch": 1.5899912203687445, "grad_norm": 0.007159278728067875, "learning_rate": 1.0250219490781387e-05, "loss": 0.0001, "step": 5433 }, { "epoch": 1.5902838747439274, "grad_norm": 0.005060483701527119, "learning_rate": 1.0242903131401815e-05, "loss": 0.0001, "step": 5434 }, { "epoch": 1.5905765291191103, "grad_norm": 0.004195989575237036, "learning_rate": 1.0235586772022241e-05, "loss": 0.0001, "step": 5435 }, { "epoch": 1.5908691834942932, "grad_norm": 0.004533226601779461, "learning_rate": 1.0228270412642669e-05, "loss": 0.0001, "step": 5436 }, { "epoch": 1.591161837869476, "grad_norm": 0.00588589021936059, "learning_rate": 1.0220954053263097e-05, "loss": 0.0001, "step": 5437 }, { "epoch": 1.591454492244659, "grad_norm": 0.004369951784610748, "learning_rate": 1.0213637693883523e-05, "loss": 0.0001, "step": 5438 }, { "epoch": 1.5917471466198418, "grad_norm": 0.020003627985715866, "learning_rate": 1.0206321334503951e-05, "loss": 0.0003, "step": 5439 }, { "epoch": 1.5920398009950247, "grad_norm": 0.004071712959557772, "learning_rate": 1.0199004975124378e-05, "loss": 0.0001, "step": 5440 }, { "epoch": 1.5923324553702076, "grad_norm": 0.005775037687271833, "learning_rate": 1.0191688615744805e-05, "loss": 0.0002, "step": 5441 }, { "epoch": 1.5926251097453907, "grad_norm": 0.0015077510615810752, "learning_rate": 1.0184372256365233e-05, "loss": 0.0, "step": 5442 }, { "epoch": 1.5929177641205736, "grad_norm": 0.005627945531159639, "learning_rate": 1.017705589698566e-05, "loss": 0.0001, "step": 5443 }, { "epoch": 1.5932104184957565, "grad_norm": 0.3537636697292328, "learning_rate": 1.0169739537606088e-05, "loss": 0.0017, "step": 5444 }, { "epoch": 1.5935030728709394, "grad_norm": 0.021307745948433876, "learning_rate": 1.0162423178226516e-05, "loss": 0.0003, "step": 5445 }, { "epoch": 1.5937957272461223, "grad_norm": 0.004862764850258827, "learning_rate": 1.0155106818846942e-05, "loss": 0.0001, "step": 5446 }, { "epoch": 1.5940883816213054, "grad_norm": 0.001179985934868455, "learning_rate": 1.014779045946737e-05, "loss": 0.0, "step": 5447 }, { "epoch": 1.5943810359964883, "grad_norm": 0.004859209526330233, "learning_rate": 1.0140474100087796e-05, "loss": 0.0001, "step": 5448 }, { "epoch": 1.5946736903716712, "grad_norm": 0.0020784807857125998, "learning_rate": 1.0133157740708224e-05, "loss": 0.0, "step": 5449 }, { "epoch": 1.594966344746854, "grad_norm": 0.005648862570524216, "learning_rate": 1.0125841381328652e-05, "loss": 0.0001, "step": 5450 }, { "epoch": 1.595258999122037, "grad_norm": 0.0021079182624816895, "learning_rate": 1.0118525021949078e-05, "loss": 0.0, "step": 5451 }, { "epoch": 1.5955516534972198, "grad_norm": 0.0028216817881911993, "learning_rate": 1.0111208662569506e-05, "loss": 0.0001, "step": 5452 }, { "epoch": 1.5958443078724027, "grad_norm": 0.002231398830190301, "learning_rate": 1.0103892303189934e-05, "loss": 0.0, "step": 5453 }, { "epoch": 1.5961369622475856, "grad_norm": 0.0026541994884610176, "learning_rate": 1.009657594381036e-05, "loss": 0.0001, "step": 5454 }, { "epoch": 1.5964296166227685, "grad_norm": 11.862154006958008, "learning_rate": 1.0089259584430788e-05, "loss": 0.1116, "step": 5455 }, { "epoch": 1.5967222709979514, "grad_norm": 0.0055429027415812016, "learning_rate": 1.0081943225051215e-05, "loss": 0.0001, "step": 5456 }, { "epoch": 1.5970149253731343, "grad_norm": 0.015475308522582054, "learning_rate": 1.0074626865671643e-05, "loss": 0.0002, "step": 5457 }, { "epoch": 1.5973075797483172, "grad_norm": 0.006429288536310196, "learning_rate": 1.006731050629207e-05, "loss": 0.0001, "step": 5458 }, { "epoch": 1.5976002341235, "grad_norm": 0.0036033045034855604, "learning_rate": 1.0059994146912497e-05, "loss": 0.0001, "step": 5459 }, { "epoch": 1.597892888498683, "grad_norm": 0.006906921975314617, "learning_rate": 1.0052677787532925e-05, "loss": 0.0001, "step": 5460 }, { "epoch": 1.5981855428738658, "grad_norm": 0.0003840086574200541, "learning_rate": 1.0045361428153353e-05, "loss": 0.0, "step": 5461 }, { "epoch": 1.5984781972490487, "grad_norm": 0.0009076825808733702, "learning_rate": 1.0038045068773779e-05, "loss": 0.0, "step": 5462 }, { "epoch": 1.5987708516242318, "grad_norm": 0.0018735651392489672, "learning_rate": 1.0030728709394207e-05, "loss": 0.0001, "step": 5463 }, { "epoch": 1.5990635059994147, "grad_norm": 0.004747231025248766, "learning_rate": 1.0023412350014633e-05, "loss": 0.0, "step": 5464 }, { "epoch": 1.5993561603745976, "grad_norm": 0.007176742423325777, "learning_rate": 1.0016095990635061e-05, "loss": 0.0001, "step": 5465 }, { "epoch": 1.5996488147497805, "grad_norm": 0.004211073741316795, "learning_rate": 1.0008779631255489e-05, "loss": 0.0001, "step": 5466 }, { "epoch": 1.5999414691249634, "grad_norm": 0.004269284196197987, "learning_rate": 1.0001463271875915e-05, "loss": 0.0001, "step": 5467 }, { "epoch": 1.6002341235001465, "grad_norm": 0.0071305325254797935, "learning_rate": 9.994146912496343e-06, "loss": 0.0001, "step": 5468 }, { "epoch": 1.6005267778753294, "grad_norm": 0.01426814403384924, "learning_rate": 9.98683055311677e-06, "loss": 0.0002, "step": 5469 }, { "epoch": 1.6008194322505123, "grad_norm": 0.0011637783609330654, "learning_rate": 9.979514193737197e-06, "loss": 0.0, "step": 5470 }, { "epoch": 1.6011120866256952, "grad_norm": 0.0064765228889882565, "learning_rate": 9.972197834357624e-06, "loss": 0.0001, "step": 5471 }, { "epoch": 1.601404741000878, "grad_norm": 0.005298473406583071, "learning_rate": 9.964881474978052e-06, "loss": 0.0001, "step": 5472 }, { "epoch": 1.601697395376061, "grad_norm": 0.6955668926239014, "learning_rate": 9.957565115598478e-06, "loss": 0.002, "step": 5473 }, { "epoch": 1.6019900497512438, "grad_norm": 0.0010658545652404428, "learning_rate": 9.950248756218906e-06, "loss": 0.0, "step": 5474 }, { "epoch": 1.6022827041264267, "grad_norm": 0.0014335220912471414, "learning_rate": 9.942932396839332e-06, "loss": 0.0, "step": 5475 }, { "epoch": 1.6025753585016096, "grad_norm": 0.0029462689999490976, "learning_rate": 9.93561603745976e-06, "loss": 0.0001, "step": 5476 }, { "epoch": 1.6028680128767925, "grad_norm": 0.0032651119399815798, "learning_rate": 9.928299678080188e-06, "loss": 0.0001, "step": 5477 }, { "epoch": 1.6031606672519754, "grad_norm": 0.004083502572029829, "learning_rate": 9.920983318700614e-06, "loss": 0.0001, "step": 5478 }, { "epoch": 1.6034533216271583, "grad_norm": 0.0063823312520980835, "learning_rate": 9.913666959321042e-06, "loss": 0.0002, "step": 5479 }, { "epoch": 1.6037459760023411, "grad_norm": 0.0018457062542438507, "learning_rate": 9.906350599941468e-06, "loss": 0.0, "step": 5480 }, { "epoch": 1.604038630377524, "grad_norm": 0.0017032860778272152, "learning_rate": 9.899034240561896e-06, "loss": 0.0, "step": 5481 }, { "epoch": 1.604331284752707, "grad_norm": 0.0014460444217547774, "learning_rate": 9.891717881182324e-06, "loss": 0.0, "step": 5482 }, { "epoch": 1.6046239391278898, "grad_norm": 0.002544417278841138, "learning_rate": 9.88440152180275e-06, "loss": 0.0001, "step": 5483 }, { "epoch": 1.604916593503073, "grad_norm": 0.003711249679327011, "learning_rate": 9.877085162423179e-06, "loss": 0.0001, "step": 5484 }, { "epoch": 1.6052092478782558, "grad_norm": 0.0012813995126634836, "learning_rate": 9.869768803043605e-06, "loss": 0.0, "step": 5485 }, { "epoch": 1.6055019022534387, "grad_norm": 0.01935703307390213, "learning_rate": 9.862452443664033e-06, "loss": 0.0001, "step": 5486 }, { "epoch": 1.6057945566286216, "grad_norm": 0.003095586085692048, "learning_rate": 9.85513608428446e-06, "loss": 0.0, "step": 5487 }, { "epoch": 1.6060872110038045, "grad_norm": 0.0031561930663883686, "learning_rate": 9.847819724904887e-06, "loss": 0.0001, "step": 5488 }, { "epoch": 1.6063798653789876, "grad_norm": 1.153779149055481, "learning_rate": 9.840503365525315e-06, "loss": 0.0061, "step": 5489 }, { "epoch": 1.6066725197541705, "grad_norm": 0.0033556653652340174, "learning_rate": 9.833187006145743e-06, "loss": 0.0001, "step": 5490 }, { "epoch": 1.6069651741293534, "grad_norm": 0.0040247077122330666, "learning_rate": 9.825870646766169e-06, "loss": 0.0001, "step": 5491 }, { "epoch": 1.6072578285045362, "grad_norm": 0.005303109996020794, "learning_rate": 9.818554287386597e-06, "loss": 0.0001, "step": 5492 }, { "epoch": 1.6075504828797191, "grad_norm": 0.0048234895803034306, "learning_rate": 9.811237928007023e-06, "loss": 0.0001, "step": 5493 }, { "epoch": 1.607843137254902, "grad_norm": 0.0017658272990956903, "learning_rate": 9.803921568627451e-06, "loss": 0.0, "step": 5494 }, { "epoch": 1.608135791630085, "grad_norm": 0.0017520791152492166, "learning_rate": 9.79660520924788e-06, "loss": 0.0, "step": 5495 }, { "epoch": 1.6084284460052678, "grad_norm": 0.0030433055944740772, "learning_rate": 9.789288849868305e-06, "loss": 0.0, "step": 5496 }, { "epoch": 1.6087211003804507, "grad_norm": 0.0010577259818091989, "learning_rate": 9.781972490488733e-06, "loss": 0.0, "step": 5497 }, { "epoch": 1.6090137547556336, "grad_norm": 0.0006441577570512891, "learning_rate": 9.774656131109161e-06, "loss": 0.0, "step": 5498 }, { "epoch": 1.6093064091308165, "grad_norm": 0.0019147149287164211, "learning_rate": 9.767339771729588e-06, "loss": 0.0, "step": 5499 }, { "epoch": 1.6095990635059994, "grad_norm": 0.002758044982329011, "learning_rate": 9.760023412350016e-06, "loss": 0.0, "step": 5500 }, { "epoch": 1.6098917178811822, "grad_norm": 0.0017483183182775974, "learning_rate": 9.752707052970442e-06, "loss": 0.0, "step": 5501 }, { "epoch": 1.6101843722563651, "grad_norm": 0.07355238497257233, "learning_rate": 9.74539069359087e-06, "loss": 0.0004, "step": 5502 }, { "epoch": 1.610477026631548, "grad_norm": 0.004786691628396511, "learning_rate": 9.738074334211298e-06, "loss": 0.0, "step": 5503 }, { "epoch": 1.610769681006731, "grad_norm": 0.0004528155841398984, "learning_rate": 9.730757974831724e-06, "loss": 0.0, "step": 5504 }, { "epoch": 1.611062335381914, "grad_norm": 0.0020629502832889557, "learning_rate": 9.723441615452152e-06, "loss": 0.0, "step": 5505 }, { "epoch": 1.611354989757097, "grad_norm": 0.0018566844519227743, "learning_rate": 9.71612525607258e-06, "loss": 0.0, "step": 5506 }, { "epoch": 1.6116476441322798, "grad_norm": 0.0012614359147846699, "learning_rate": 9.708808896693006e-06, "loss": 0.0, "step": 5507 }, { "epoch": 1.6119402985074627, "grad_norm": 0.009515652433037758, "learning_rate": 9.701492537313434e-06, "loss": 0.0001, "step": 5508 }, { "epoch": 1.6122329528826456, "grad_norm": 0.001026952755637467, "learning_rate": 9.69417617793386e-06, "loss": 0.0, "step": 5509 }, { "epoch": 1.6125256072578285, "grad_norm": 0.0011891749454662204, "learning_rate": 9.686859818554288e-06, "loss": 0.0, "step": 5510 }, { "epoch": 1.6128182616330116, "grad_norm": 0.003185735549777746, "learning_rate": 9.679543459174716e-06, "loss": 0.0, "step": 5511 }, { "epoch": 1.6131109160081945, "grad_norm": 0.0013077668845653534, "learning_rate": 9.672227099795142e-06, "loss": 0.0, "step": 5512 }, { "epoch": 1.6134035703833773, "grad_norm": 0.0011455434141680598, "learning_rate": 9.66491074041557e-06, "loss": 0.0, "step": 5513 }, { "epoch": 1.6136962247585602, "grad_norm": 0.0009633139125071466, "learning_rate": 9.657594381035998e-06, "loss": 0.0, "step": 5514 }, { "epoch": 1.6139888791337431, "grad_norm": 0.005264429375529289, "learning_rate": 9.650278021656425e-06, "loss": 0.0, "step": 5515 }, { "epoch": 1.614281533508926, "grad_norm": 0.008792583830654621, "learning_rate": 9.642961662276853e-06, "loss": 0.0001, "step": 5516 }, { "epoch": 1.614574187884109, "grad_norm": 0.0020714502315968275, "learning_rate": 9.635645302897279e-06, "loss": 0.0, "step": 5517 }, { "epoch": 1.6148668422592918, "grad_norm": 0.0029987585730850697, "learning_rate": 9.628328943517707e-06, "loss": 0.0, "step": 5518 }, { "epoch": 1.6151594966344747, "grad_norm": 0.0023104194551706314, "learning_rate": 9.621012584138133e-06, "loss": 0.0, "step": 5519 }, { "epoch": 1.6154521510096576, "grad_norm": 0.008713692426681519, "learning_rate": 9.613696224758561e-06, "loss": 0.0001, "step": 5520 }, { "epoch": 1.6157448053848404, "grad_norm": 0.000451488682301715, "learning_rate": 9.606379865378987e-06, "loss": 0.0, "step": 5521 }, { "epoch": 1.6160374597600233, "grad_norm": 0.010159490630030632, "learning_rate": 9.599063505999415e-06, "loss": 0.0001, "step": 5522 }, { "epoch": 1.6163301141352062, "grad_norm": 0.0007553183240815997, "learning_rate": 9.591747146619841e-06, "loss": 0.0, "step": 5523 }, { "epoch": 1.616622768510389, "grad_norm": 0.513761579990387, "learning_rate": 9.58443078724027e-06, "loss": 0.0037, "step": 5524 }, { "epoch": 1.616915422885572, "grad_norm": 0.0022291538771241903, "learning_rate": 9.577114427860696e-06, "loss": 0.0, "step": 5525 }, { "epoch": 1.6172080772607549, "grad_norm": 0.10149343311786652, "learning_rate": 9.569798068481124e-06, "loss": 0.0003, "step": 5526 }, { "epoch": 1.617500731635938, "grad_norm": 0.0024327621795237064, "learning_rate": 9.562481709101552e-06, "loss": 0.0, "step": 5527 }, { "epoch": 1.6177933860111209, "grad_norm": 0.0017709648236632347, "learning_rate": 9.555165349721978e-06, "loss": 0.0, "step": 5528 }, { "epoch": 1.6180860403863038, "grad_norm": 0.0004216205852571875, "learning_rate": 9.547848990342406e-06, "loss": 0.0, "step": 5529 }, { "epoch": 1.6183786947614867, "grad_norm": 0.06876334547996521, "learning_rate": 9.540532630962834e-06, "loss": 0.0004, "step": 5530 }, { "epoch": 1.6186713491366695, "grad_norm": 0.002134463516995311, "learning_rate": 9.53321627158326e-06, "loss": 0.0, "step": 5531 }, { "epoch": 1.6189640035118527, "grad_norm": 0.008550962433218956, "learning_rate": 9.525899912203688e-06, "loss": 0.0001, "step": 5532 }, { "epoch": 1.6192566578870355, "grad_norm": 0.0006103214109316468, "learning_rate": 9.518583552824114e-06, "loss": 0.0, "step": 5533 }, { "epoch": 1.6195493122622184, "grad_norm": 0.0005587683990597725, "learning_rate": 9.511267193444542e-06, "loss": 0.0, "step": 5534 }, { "epoch": 1.6198419666374013, "grad_norm": 0.0008965490851551294, "learning_rate": 9.50395083406497e-06, "loss": 0.0, "step": 5535 }, { "epoch": 1.6201346210125842, "grad_norm": 0.0025804329197853804, "learning_rate": 9.496634474685396e-06, "loss": 0.0, "step": 5536 }, { "epoch": 1.620427275387767, "grad_norm": 0.0016825495986267924, "learning_rate": 9.489318115305824e-06, "loss": 0.0, "step": 5537 }, { "epoch": 1.62071992976295, "grad_norm": 0.001002712408080697, "learning_rate": 9.482001755926252e-06, "loss": 0.0, "step": 5538 }, { "epoch": 1.6210125841381329, "grad_norm": 0.0002484250580891967, "learning_rate": 9.474685396546679e-06, "loss": 0.0, "step": 5539 }, { "epoch": 1.6213052385133158, "grad_norm": 0.0004409311804920435, "learning_rate": 9.467369037167106e-06, "loss": 0.0, "step": 5540 }, { "epoch": 1.6215978928884986, "grad_norm": 0.0027556063141673803, "learning_rate": 9.460052677787533e-06, "loss": 0.0, "step": 5541 }, { "epoch": 1.6218905472636815, "grad_norm": 0.0011143895098939538, "learning_rate": 9.45273631840796e-06, "loss": 0.0, "step": 5542 }, { "epoch": 1.6221832016388644, "grad_norm": 0.003866976359859109, "learning_rate": 9.445419959028389e-06, "loss": 0.0, "step": 5543 }, { "epoch": 1.6224758560140473, "grad_norm": 0.001845609163865447, "learning_rate": 9.438103599648815e-06, "loss": 0.0, "step": 5544 }, { "epoch": 1.6227685103892302, "grad_norm": 0.0027286650147289038, "learning_rate": 9.430787240269243e-06, "loss": 0.0001, "step": 5545 }, { "epoch": 1.623061164764413, "grad_norm": 0.0007557672797702253, "learning_rate": 9.423470880889669e-06, "loss": 0.0, "step": 5546 }, { "epoch": 1.623353819139596, "grad_norm": 0.0007909720879979432, "learning_rate": 9.416154521510097e-06, "loss": 0.0, "step": 5547 }, { "epoch": 1.623646473514779, "grad_norm": 0.0005950201884843409, "learning_rate": 9.408838162130525e-06, "loss": 0.0, "step": 5548 }, { "epoch": 1.623939127889962, "grad_norm": 0.0003656030457932502, "learning_rate": 9.401521802750951e-06, "loss": 0.0, "step": 5549 }, { "epoch": 1.6242317822651449, "grad_norm": 0.0009229332208633423, "learning_rate": 9.39420544337138e-06, "loss": 0.0, "step": 5550 }, { "epoch": 1.6245244366403278, "grad_norm": 0.0010801558382809162, "learning_rate": 9.386889083991807e-06, "loss": 0.0, "step": 5551 }, { "epoch": 1.6248170910155106, "grad_norm": 0.005564079154282808, "learning_rate": 9.379572724612233e-06, "loss": 0.0001, "step": 5552 }, { "epoch": 1.6251097453906937, "grad_norm": 0.0021888986229896545, "learning_rate": 9.372256365232661e-06, "loss": 0.0, "step": 5553 }, { "epoch": 1.6254023997658766, "grad_norm": 0.001076598186045885, "learning_rate": 9.364940005853088e-06, "loss": 0.0, "step": 5554 }, { "epoch": 1.6256950541410595, "grad_norm": 0.00043485217611305416, "learning_rate": 9.357623646473516e-06, "loss": 0.0, "step": 5555 }, { "epoch": 1.6259877085162424, "grad_norm": 0.00046464326442219317, "learning_rate": 9.350307287093944e-06, "loss": 0.0, "step": 5556 }, { "epoch": 1.6262803628914253, "grad_norm": 0.00032036672928370535, "learning_rate": 9.34299092771437e-06, "loss": 0.0, "step": 5557 }, { "epoch": 1.6265730172666082, "grad_norm": 0.0007653280044905841, "learning_rate": 9.335674568334798e-06, "loss": 0.0, "step": 5558 }, { "epoch": 1.626865671641791, "grad_norm": 0.0007247430039569736, "learning_rate": 9.328358208955226e-06, "loss": 0.0, "step": 5559 }, { "epoch": 1.627158326016974, "grad_norm": 0.000609236944001168, "learning_rate": 9.321041849575652e-06, "loss": 0.0, "step": 5560 }, { "epoch": 1.6274509803921569, "grad_norm": 3.0711312294006348, "learning_rate": 9.31372549019608e-06, "loss": 0.0043, "step": 5561 }, { "epoch": 1.6277436347673397, "grad_norm": 0.0011066512670367956, "learning_rate": 9.306409130816506e-06, "loss": 0.0, "step": 5562 }, { "epoch": 1.6280362891425226, "grad_norm": 0.0012488741194829345, "learning_rate": 9.299092771436934e-06, "loss": 0.0, "step": 5563 }, { "epoch": 1.6283289435177055, "grad_norm": 0.0015597431920468807, "learning_rate": 9.29177641205736e-06, "loss": 0.0, "step": 5564 }, { "epoch": 1.6286215978928884, "grad_norm": 0.00021354974887799472, "learning_rate": 9.284460052677788e-06, "loss": 0.0, "step": 5565 }, { "epoch": 1.6289142522680713, "grad_norm": 0.001786278560757637, "learning_rate": 9.277143693298215e-06, "loss": 0.0, "step": 5566 }, { "epoch": 1.6292069066432542, "grad_norm": 4.217692852020264, "learning_rate": 9.269827333918642e-06, "loss": 0.1724, "step": 5567 }, { "epoch": 1.629499561018437, "grad_norm": 0.014737540856003761, "learning_rate": 9.262510974539069e-06, "loss": 0.0001, "step": 5568 }, { "epoch": 1.6297922153936202, "grad_norm": 0.031021222472190857, "learning_rate": 9.255194615159497e-06, "loss": 0.0002, "step": 5569 }, { "epoch": 1.630084869768803, "grad_norm": 0.002510768361389637, "learning_rate": 9.247878255779923e-06, "loss": 0.0, "step": 5570 }, { "epoch": 1.630377524143986, "grad_norm": 0.012060116045176983, "learning_rate": 9.240561896400351e-06, "loss": 0.0001, "step": 5571 }, { "epoch": 1.6306701785191688, "grad_norm": 0.00570687185972929, "learning_rate": 9.233245537020779e-06, "loss": 0.0001, "step": 5572 }, { "epoch": 1.6309628328943517, "grad_norm": 0.0003556690935511142, "learning_rate": 9.225929177641205e-06, "loss": 0.0, "step": 5573 }, { "epoch": 1.6312554872695348, "grad_norm": 26.108539581298828, "learning_rate": 9.218612818261633e-06, "loss": 0.0918, "step": 5574 }, { "epoch": 1.6315481416447177, "grad_norm": 0.005321469157934189, "learning_rate": 9.211296458882061e-06, "loss": 0.0001, "step": 5575 }, { "epoch": 1.6318407960199006, "grad_norm": 0.0020039896480739117, "learning_rate": 9.203980099502487e-06, "loss": 0.0, "step": 5576 }, { "epoch": 1.6321334503950835, "grad_norm": 0.01536587718874216, "learning_rate": 9.196663740122915e-06, "loss": 0.0001, "step": 5577 }, { "epoch": 1.6324261047702664, "grad_norm": 0.005312270950525999, "learning_rate": 9.189347380743341e-06, "loss": 0.0001, "step": 5578 }, { "epoch": 1.6327187591454493, "grad_norm": 0.004251805599778891, "learning_rate": 9.18203102136377e-06, "loss": 0.0001, "step": 5579 }, { "epoch": 1.6330114135206322, "grad_norm": 0.05280857905745506, "learning_rate": 9.174714661984197e-06, "loss": 0.0003, "step": 5580 }, { "epoch": 1.633304067895815, "grad_norm": 0.0005378371570259333, "learning_rate": 9.167398302604624e-06, "loss": 0.0, "step": 5581 }, { "epoch": 1.633596722270998, "grad_norm": 0.14982321858406067, "learning_rate": 9.160081943225052e-06, "loss": 0.0015, "step": 5582 }, { "epoch": 1.6338893766461808, "grad_norm": 0.10131068527698517, "learning_rate": 9.15276558384548e-06, "loss": 0.0006, "step": 5583 }, { "epoch": 1.6341820310213637, "grad_norm": 0.011011369526386261, "learning_rate": 9.145449224465906e-06, "loss": 0.0001, "step": 5584 }, { "epoch": 1.6344746853965466, "grad_norm": 0.004358216188848019, "learning_rate": 9.138132865086334e-06, "loss": 0.0001, "step": 5585 }, { "epoch": 1.6347673397717295, "grad_norm": 0.004245396703481674, "learning_rate": 9.13081650570676e-06, "loss": 0.0001, "step": 5586 }, { "epoch": 1.6350599941469124, "grad_norm": 0.021265236660838127, "learning_rate": 9.123500146327188e-06, "loss": 0.0002, "step": 5587 }, { "epoch": 1.6353526485220953, "grad_norm": 0.004796027671545744, "learning_rate": 9.116183786947616e-06, "loss": 0.0001, "step": 5588 }, { "epoch": 1.6356453028972782, "grad_norm": 0.0007314679096452892, "learning_rate": 9.108867427568042e-06, "loss": 0.0, "step": 5589 }, { "epoch": 1.6359379572724613, "grad_norm": 0.008435488678514957, "learning_rate": 9.10155106818847e-06, "loss": 0.0001, "step": 5590 }, { "epoch": 1.6362306116476442, "grad_norm": 0.0021220894996076822, "learning_rate": 9.094234708808898e-06, "loss": 0.0, "step": 5591 }, { "epoch": 1.636523266022827, "grad_norm": 0.011719580739736557, "learning_rate": 9.086918349429324e-06, "loss": 0.0002, "step": 5592 }, { "epoch": 1.63681592039801, "grad_norm": 0.0012470482615754008, "learning_rate": 9.079601990049752e-06, "loss": 0.0, "step": 5593 }, { "epoch": 1.6371085747731928, "grad_norm": 6.902370929718018, "learning_rate": 9.072285630670179e-06, "loss": 0.0652, "step": 5594 }, { "epoch": 1.6374012291483757, "grad_norm": 0.11666715145111084, "learning_rate": 9.064969271290606e-06, "loss": 0.0018, "step": 5595 }, { "epoch": 1.6376938835235588, "grad_norm": 0.15053029358386993, "learning_rate": 9.057652911911034e-06, "loss": 0.0009, "step": 5596 }, { "epoch": 1.6379865378987417, "grad_norm": 0.0008611659286543727, "learning_rate": 9.05033655253146e-06, "loss": 0.0, "step": 5597 }, { "epoch": 1.6382791922739246, "grad_norm": 0.0003323563141748309, "learning_rate": 9.043020193151889e-06, "loss": 0.0, "step": 5598 }, { "epoch": 1.6385718466491075, "grad_norm": 19.84071922302246, "learning_rate": 9.035703833772317e-06, "loss": 0.0436, "step": 5599 }, { "epoch": 1.6388645010242904, "grad_norm": 0.0016030750703066587, "learning_rate": 9.028387474392743e-06, "loss": 0.0, "step": 5600 }, { "epoch": 1.6391571553994733, "grad_norm": 0.0007422008784487844, "learning_rate": 9.02107111501317e-06, "loss": 0.0, "step": 5601 }, { "epoch": 1.6394498097746562, "grad_norm": 0.0137802017852664, "learning_rate": 9.013754755633597e-06, "loss": 0.0001, "step": 5602 }, { "epoch": 1.639742464149839, "grad_norm": 0.004875142592936754, "learning_rate": 9.006438396254025e-06, "loss": 0.0001, "step": 5603 }, { "epoch": 1.640035118525022, "grad_norm": 0.017864851281046867, "learning_rate": 8.999122036874453e-06, "loss": 0.0001, "step": 5604 }, { "epoch": 1.6403277729002048, "grad_norm": 0.0029294146224856377, "learning_rate": 8.99180567749488e-06, "loss": 0.0, "step": 5605 }, { "epoch": 1.6406204272753877, "grad_norm": 0.05114160105586052, "learning_rate": 8.984489318115307e-06, "loss": 0.0001, "step": 5606 }, { "epoch": 1.6409130816505706, "grad_norm": 0.001088925520889461, "learning_rate": 8.977172958735733e-06, "loss": 0.0, "step": 5607 }, { "epoch": 1.6412057360257535, "grad_norm": 0.0010532621527090669, "learning_rate": 8.969856599356161e-06, "loss": 0.0, "step": 5608 }, { "epoch": 1.6414983904009364, "grad_norm": 0.012787350453436375, "learning_rate": 8.962540239976588e-06, "loss": 0.0002, "step": 5609 }, { "epoch": 1.6417910447761193, "grad_norm": 0.0012421078281477094, "learning_rate": 8.955223880597016e-06, "loss": 0.0, "step": 5610 }, { "epoch": 1.6420836991513024, "grad_norm": 0.04169453680515289, "learning_rate": 8.947907521217442e-06, "loss": 0.0005, "step": 5611 }, { "epoch": 1.6423763535264853, "grad_norm": 0.21967889368534088, "learning_rate": 8.94059116183787e-06, "loss": 0.0008, "step": 5612 }, { "epoch": 1.6426690079016681, "grad_norm": 0.00621822802349925, "learning_rate": 8.933274802458296e-06, "loss": 0.0001, "step": 5613 }, { "epoch": 1.642961662276851, "grad_norm": 0.0066894544288516045, "learning_rate": 8.925958443078724e-06, "loss": 0.0001, "step": 5614 }, { "epoch": 1.643254316652034, "grad_norm": 0.0031689521856606007, "learning_rate": 8.91864208369915e-06, "loss": 0.0, "step": 5615 }, { "epoch": 1.6435469710272168, "grad_norm": 0.0013706308091059327, "learning_rate": 8.911325724319578e-06, "loss": 0.0, "step": 5616 }, { "epoch": 1.6438396254024, "grad_norm": 0.0011935265501961112, "learning_rate": 8.904009364940006e-06, "loss": 0.0, "step": 5617 }, { "epoch": 1.6441322797775828, "grad_norm": 0.0029069141019135714, "learning_rate": 8.896693005560432e-06, "loss": 0.0, "step": 5618 }, { "epoch": 1.6444249341527657, "grad_norm": 0.0016296259127557278, "learning_rate": 8.88937664618086e-06, "loss": 0.0, "step": 5619 }, { "epoch": 1.6447175885279486, "grad_norm": 0.0017259359592571855, "learning_rate": 8.882060286801288e-06, "loss": 0.0, "step": 5620 }, { "epoch": 1.6450102429031315, "grad_norm": 0.001674041268415749, "learning_rate": 8.874743927421715e-06, "loss": 0.0, "step": 5621 }, { "epoch": 1.6453028972783144, "grad_norm": 0.004651952069252729, "learning_rate": 8.867427568042142e-06, "loss": 0.0001, "step": 5622 }, { "epoch": 1.6455955516534972, "grad_norm": 0.0013309124624356627, "learning_rate": 8.860111208662569e-06, "loss": 0.0, "step": 5623 }, { "epoch": 1.6458882060286801, "grad_norm": 0.0045665837824344635, "learning_rate": 8.852794849282997e-06, "loss": 0.0001, "step": 5624 }, { "epoch": 1.646180860403863, "grad_norm": 0.0037553836591541767, "learning_rate": 8.845478489903425e-06, "loss": 0.0, "step": 5625 }, { "epoch": 1.646473514779046, "grad_norm": 0.011135270819067955, "learning_rate": 8.838162130523851e-06, "loss": 0.0001, "step": 5626 }, { "epoch": 1.6467661691542288, "grad_norm": 0.010377340018749237, "learning_rate": 8.830845771144279e-06, "loss": 0.0001, "step": 5627 }, { "epoch": 1.6470588235294117, "grad_norm": 0.002636183286085725, "learning_rate": 8.823529411764707e-06, "loss": 0.0001, "step": 5628 }, { "epoch": 1.6473514779045946, "grad_norm": 13.157976150512695, "learning_rate": 8.816213052385133e-06, "loss": 0.069, "step": 5629 }, { "epoch": 1.6476441322797775, "grad_norm": 9.887460708618164, "learning_rate": 8.808896693005561e-06, "loss": 0.0368, "step": 5630 }, { "epoch": 1.6479367866549604, "grad_norm": 0.010155647993087769, "learning_rate": 8.801580333625987e-06, "loss": 0.0001, "step": 5631 }, { "epoch": 1.6482294410301432, "grad_norm": 0.0005502697313204408, "learning_rate": 8.794263974246415e-06, "loss": 0.0, "step": 5632 }, { "epoch": 1.6485220954053263, "grad_norm": 0.0068548452109098434, "learning_rate": 8.786947614866843e-06, "loss": 0.0001, "step": 5633 }, { "epoch": 1.6488147497805092, "grad_norm": 0.0030745361000299454, "learning_rate": 8.77963125548727e-06, "loss": 0.0, "step": 5634 }, { "epoch": 1.6491074041556921, "grad_norm": 0.00037383261951617897, "learning_rate": 8.772314896107697e-06, "loss": 0.0, "step": 5635 }, { "epoch": 1.649400058530875, "grad_norm": 0.0018102648900821805, "learning_rate": 8.764998536728125e-06, "loss": 0.0, "step": 5636 }, { "epoch": 1.649692712906058, "grad_norm": 0.0008046173606999218, "learning_rate": 8.757682177348552e-06, "loss": 0.0, "step": 5637 }, { "epoch": 1.649985367281241, "grad_norm": 0.0004981788224540651, "learning_rate": 8.75036581796898e-06, "loss": 0.0, "step": 5638 }, { "epoch": 1.650278021656424, "grad_norm": 0.0017273883568122983, "learning_rate": 8.743049458589406e-06, "loss": 0.0, "step": 5639 }, { "epoch": 1.6505706760316068, "grad_norm": 0.0026937955990433693, "learning_rate": 8.735733099209834e-06, "loss": 0.0, "step": 5640 }, { "epoch": 1.6508633304067897, "grad_norm": 0.0070621841587126255, "learning_rate": 8.728416739830262e-06, "loss": 0.0001, "step": 5641 }, { "epoch": 1.6511559847819726, "grad_norm": 0.0021231193095445633, "learning_rate": 8.721100380450688e-06, "loss": 0.0, "step": 5642 }, { "epoch": 1.6514486391571555, "grad_norm": 0.0005490577896125615, "learning_rate": 8.713784021071116e-06, "loss": 0.0, "step": 5643 }, { "epoch": 1.6517412935323383, "grad_norm": 0.00028466907679103315, "learning_rate": 8.706467661691544e-06, "loss": 0.0, "step": 5644 }, { "epoch": 1.6520339479075212, "grad_norm": 0.0001460868224967271, "learning_rate": 8.69915130231197e-06, "loss": 0.0, "step": 5645 }, { "epoch": 1.6523266022827041, "grad_norm": 0.0008332178695127368, "learning_rate": 8.691834942932398e-06, "loss": 0.0, "step": 5646 }, { "epoch": 1.652619256657887, "grad_norm": 0.001206650398671627, "learning_rate": 8.684518583552824e-06, "loss": 0.0, "step": 5647 }, { "epoch": 1.65291191103307, "grad_norm": 0.002128488617017865, "learning_rate": 8.677202224173252e-06, "loss": 0.0, "step": 5648 }, { "epoch": 1.6532045654082528, "grad_norm": 0.0008632836979813874, "learning_rate": 8.66988586479368e-06, "loss": 0.0, "step": 5649 }, { "epoch": 1.6534972197834357, "grad_norm": 0.000147968516102992, "learning_rate": 8.662569505414106e-06, "loss": 0.0, "step": 5650 }, { "epoch": 1.6537898741586186, "grad_norm": 22.177335739135742, "learning_rate": 8.655253146034534e-06, "loss": 0.0928, "step": 5651 }, { "epoch": 1.6540825285338014, "grad_norm": 0.0005284177605062723, "learning_rate": 8.647936786654962e-06, "loss": 0.0, "step": 5652 }, { "epoch": 1.6543751829089843, "grad_norm": 0.0003988874377682805, "learning_rate": 8.640620427275389e-06, "loss": 0.0, "step": 5653 }, { "epoch": 1.6546678372841674, "grad_norm": 0.001079960959032178, "learning_rate": 8.633304067895817e-06, "loss": 0.0, "step": 5654 }, { "epoch": 1.6549604916593503, "grad_norm": 0.0009095671703107655, "learning_rate": 8.625987708516243e-06, "loss": 0.0, "step": 5655 }, { "epoch": 1.6552531460345332, "grad_norm": 0.00140785810071975, "learning_rate": 8.61867134913667e-06, "loss": 0.0, "step": 5656 }, { "epoch": 1.655545800409716, "grad_norm": 0.002098365221172571, "learning_rate": 8.611354989757097e-06, "loss": 0.0, "step": 5657 }, { "epoch": 1.655838454784899, "grad_norm": 0.0018707603449001908, "learning_rate": 8.604038630377525e-06, "loss": 0.0, "step": 5658 }, { "epoch": 1.656131109160082, "grad_norm": 0.002513097133487463, "learning_rate": 8.596722270997951e-06, "loss": 0.0, "step": 5659 }, { "epoch": 1.656423763535265, "grad_norm": 0.00026852835435420275, "learning_rate": 8.58940591161838e-06, "loss": 0.0, "step": 5660 }, { "epoch": 1.6567164179104479, "grad_norm": 0.00040217710193246603, "learning_rate": 8.582089552238805e-06, "loss": 0.0, "step": 5661 }, { "epoch": 1.6570090722856308, "grad_norm": 0.0013781053712591529, "learning_rate": 8.574773192859233e-06, "loss": 0.0, "step": 5662 }, { "epoch": 1.6573017266608137, "grad_norm": 10.875997543334961, "learning_rate": 8.56745683347966e-06, "loss": 0.0121, "step": 5663 }, { "epoch": 1.6575943810359965, "grad_norm": 0.0004421852936502546, "learning_rate": 8.560140474100088e-06, "loss": 0.0, "step": 5664 }, { "epoch": 1.6578870354111794, "grad_norm": 0.003944370895624161, "learning_rate": 8.552824114720516e-06, "loss": 0.0001, "step": 5665 }, { "epoch": 1.6581796897863623, "grad_norm": 0.0005194034310989082, "learning_rate": 8.545507755340942e-06, "loss": 0.0, "step": 5666 }, { "epoch": 1.6584723441615452, "grad_norm": 0.000154358524014242, "learning_rate": 8.53819139596137e-06, "loss": 0.0, "step": 5667 }, { "epoch": 1.658764998536728, "grad_norm": 2.193861484527588, "learning_rate": 8.530875036581798e-06, "loss": 0.0058, "step": 5668 }, { "epoch": 1.659057652911911, "grad_norm": 0.00018022813310381025, "learning_rate": 8.523558677202224e-06, "loss": 0.0, "step": 5669 }, { "epoch": 1.6593503072870939, "grad_norm": 0.0004651829949580133, "learning_rate": 8.516242317822652e-06, "loss": 0.0, "step": 5670 }, { "epoch": 1.6596429616622768, "grad_norm": 0.0011934270150959492, "learning_rate": 8.508925958443078e-06, "loss": 0.0, "step": 5671 }, { "epoch": 1.6599356160374596, "grad_norm": 0.0004671476490329951, "learning_rate": 8.501609599063506e-06, "loss": 0.0, "step": 5672 }, { "epoch": 1.6602282704126425, "grad_norm": 0.0004122898099012673, "learning_rate": 8.494293239683934e-06, "loss": 0.0, "step": 5673 }, { "epoch": 1.6605209247878254, "grad_norm": 0.0004934952594339848, "learning_rate": 8.48697688030436e-06, "loss": 0.0, "step": 5674 }, { "epoch": 1.6608135791630085, "grad_norm": 0.0037849058862775564, "learning_rate": 8.479660520924788e-06, "loss": 0.0001, "step": 5675 }, { "epoch": 1.6611062335381914, "grad_norm": 0.0011096809757873416, "learning_rate": 8.472344161545215e-06, "loss": 0.0, "step": 5676 }, { "epoch": 1.6613988879133743, "grad_norm": 0.0008264588541351259, "learning_rate": 8.465027802165642e-06, "loss": 0.0, "step": 5677 }, { "epoch": 1.6616915422885572, "grad_norm": 0.0005098911351524293, "learning_rate": 8.45771144278607e-06, "loss": 0.0, "step": 5678 }, { "epoch": 1.66198419666374, "grad_norm": 0.0015530316159129143, "learning_rate": 8.450395083406497e-06, "loss": 0.0, "step": 5679 }, { "epoch": 1.6622768510389232, "grad_norm": 0.001707770861685276, "learning_rate": 8.443078724026925e-06, "loss": 0.0, "step": 5680 }, { "epoch": 1.662569505414106, "grad_norm": 0.000391901470720768, "learning_rate": 8.435762364647353e-06, "loss": 0.0, "step": 5681 }, { "epoch": 1.662862159789289, "grad_norm": 0.00044289889046922326, "learning_rate": 8.428446005267779e-06, "loss": 0.0, "step": 5682 }, { "epoch": 1.6631548141644719, "grad_norm": 0.0008527369936928153, "learning_rate": 8.421129645888207e-06, "loss": 0.0, "step": 5683 }, { "epoch": 1.6634474685396547, "grad_norm": 0.00022882982739247382, "learning_rate": 8.413813286508633e-06, "loss": 0.0, "step": 5684 }, { "epoch": 1.6637401229148376, "grad_norm": 0.0015376824885606766, "learning_rate": 8.406496927129061e-06, "loss": 0.0, "step": 5685 }, { "epoch": 1.6640327772900205, "grad_norm": 0.006001113913953304, "learning_rate": 8.399180567749489e-06, "loss": 0.0, "step": 5686 }, { "epoch": 1.6643254316652034, "grad_norm": 0.000480707676615566, "learning_rate": 8.391864208369915e-06, "loss": 0.0, "step": 5687 }, { "epoch": 1.6646180860403863, "grad_norm": 0.0010196049697697163, "learning_rate": 8.384547848990343e-06, "loss": 0.0, "step": 5688 }, { "epoch": 1.6649107404155692, "grad_norm": 0.0002625976630952209, "learning_rate": 8.377231489610771e-06, "loss": 0.0, "step": 5689 }, { "epoch": 1.665203394790752, "grad_norm": 0.0020561853889375925, "learning_rate": 8.369915130231197e-06, "loss": 0.0, "step": 5690 }, { "epoch": 1.665496049165935, "grad_norm": 0.004877160768955946, "learning_rate": 8.362598770851625e-06, "loss": 0.0, "step": 5691 }, { "epoch": 1.6657887035411179, "grad_norm": 0.0002567913325037807, "learning_rate": 8.355282411472052e-06, "loss": 0.0, "step": 5692 }, { "epoch": 1.6660813579163007, "grad_norm": 0.0006398832774721086, "learning_rate": 8.34796605209248e-06, "loss": 0.0, "step": 5693 }, { "epoch": 1.6663740122914836, "grad_norm": 0.0006991418777033687, "learning_rate": 8.340649692712907e-06, "loss": 0.0, "step": 5694 }, { "epoch": 1.6666666666666665, "grad_norm": 0.0005723349750041962, "learning_rate": 8.333333333333334e-06, "loss": 0.0, "step": 5695 }, { "epoch": 1.6669593210418496, "grad_norm": 0.000251155550358817, "learning_rate": 8.326016973953762e-06, "loss": 0.0, "step": 5696 }, { "epoch": 1.6672519754170325, "grad_norm": 0.007094892673194408, "learning_rate": 8.31870061457419e-06, "loss": 0.0001, "step": 5697 }, { "epoch": 1.6675446297922154, "grad_norm": 0.0020860317163169384, "learning_rate": 8.311384255194616e-06, "loss": 0.0, "step": 5698 }, { "epoch": 1.6678372841673983, "grad_norm": 0.0011908210581168532, "learning_rate": 8.304067895815044e-06, "loss": 0.0, "step": 5699 }, { "epoch": 1.6681299385425812, "grad_norm": 0.0003415268729440868, "learning_rate": 8.29675153643547e-06, "loss": 0.0, "step": 5700 }, { "epoch": 1.668422592917764, "grad_norm": 0.0010934275342151523, "learning_rate": 8.289435177055898e-06, "loss": 0.0, "step": 5701 }, { "epoch": 1.6687152472929472, "grad_norm": 0.00020715589926112443, "learning_rate": 8.282118817676324e-06, "loss": 0.0, "step": 5702 }, { "epoch": 1.66900790166813, "grad_norm": 0.0003347251040395349, "learning_rate": 8.274802458296752e-06, "loss": 0.0, "step": 5703 }, { "epoch": 1.669300556043313, "grad_norm": 0.0010050522396340966, "learning_rate": 8.267486098917179e-06, "loss": 0.0, "step": 5704 }, { "epoch": 1.6695932104184958, "grad_norm": 0.0010485193924978375, "learning_rate": 8.260169739537606e-06, "loss": 0.0, "step": 5705 }, { "epoch": 1.6698858647936787, "grad_norm": 0.0003816711832769215, "learning_rate": 8.252853380158033e-06, "loss": 0.0, "step": 5706 }, { "epoch": 1.6701785191688616, "grad_norm": 0.00030886335298419, "learning_rate": 8.24553702077846e-06, "loss": 0.0, "step": 5707 }, { "epoch": 1.6704711735440445, "grad_norm": 0.0003605361853260547, "learning_rate": 8.238220661398887e-06, "loss": 0.0, "step": 5708 }, { "epoch": 1.6707638279192274, "grad_norm": 0.000228889737627469, "learning_rate": 8.230904302019315e-06, "loss": 0.0, "step": 5709 }, { "epoch": 1.6710564822944103, "grad_norm": 0.0003481085877865553, "learning_rate": 8.223587942639743e-06, "loss": 0.0, "step": 5710 }, { "epoch": 1.6713491366695932, "grad_norm": 0.0017270563403144479, "learning_rate": 8.216271583260169e-06, "loss": 0.0, "step": 5711 }, { "epoch": 1.671641791044776, "grad_norm": 0.0026479167863726616, "learning_rate": 8.208955223880597e-06, "loss": 0.0, "step": 5712 }, { "epoch": 1.671934445419959, "grad_norm": 0.00040890229865908623, "learning_rate": 8.201638864501025e-06, "loss": 0.0, "step": 5713 }, { "epoch": 1.6722270997951418, "grad_norm": 0.0006078731385059655, "learning_rate": 8.194322505121451e-06, "loss": 0.0, "step": 5714 }, { "epoch": 1.6725197541703247, "grad_norm": 0.0004306174232624471, "learning_rate": 8.18700614574188e-06, "loss": 0.0, "step": 5715 }, { "epoch": 1.6728124085455076, "grad_norm": 0.0004859520122408867, "learning_rate": 8.179689786362305e-06, "loss": 0.0, "step": 5716 }, { "epoch": 1.6731050629206905, "grad_norm": 0.0002467651211190969, "learning_rate": 8.172373426982733e-06, "loss": 0.0, "step": 5717 }, { "epoch": 1.6733977172958736, "grad_norm": 0.00070102111203596, "learning_rate": 8.165057067603161e-06, "loss": 0.0, "step": 5718 }, { "epoch": 1.6736903716710565, "grad_norm": 0.0001780585735104978, "learning_rate": 8.157740708223588e-06, "loss": 0.0, "step": 5719 }, { "epoch": 1.6739830260462394, "grad_norm": 3.630791425704956, "learning_rate": 8.150424348844016e-06, "loss": 0.1884, "step": 5720 }, { "epoch": 1.6742756804214223, "grad_norm": 0.0020441662054508924, "learning_rate": 8.143107989464443e-06, "loss": 0.0, "step": 5721 }, { "epoch": 1.6745683347966052, "grad_norm": 0.001493821619078517, "learning_rate": 8.13579163008487e-06, "loss": 0.0, "step": 5722 }, { "epoch": 1.6748609891717883, "grad_norm": 0.0010016279993578792, "learning_rate": 8.128475270705298e-06, "loss": 0.0, "step": 5723 }, { "epoch": 1.6751536435469712, "grad_norm": 0.00042762173688970506, "learning_rate": 8.121158911325724e-06, "loss": 0.0, "step": 5724 }, { "epoch": 1.675446297922154, "grad_norm": 0.00023038113431539387, "learning_rate": 8.113842551946152e-06, "loss": 0.0, "step": 5725 }, { "epoch": 1.675738952297337, "grad_norm": 0.0010480210185050964, "learning_rate": 8.10652619256658e-06, "loss": 0.0, "step": 5726 }, { "epoch": 1.6760316066725198, "grad_norm": 0.000737055903300643, "learning_rate": 8.099209833187006e-06, "loss": 0.0, "step": 5727 }, { "epoch": 1.6763242610477027, "grad_norm": 0.0007961266674101353, "learning_rate": 8.091893473807434e-06, "loss": 0.0, "step": 5728 }, { "epoch": 1.6766169154228856, "grad_norm": 0.0014897778164595366, "learning_rate": 8.084577114427862e-06, "loss": 0.0, "step": 5729 }, { "epoch": 1.6769095697980685, "grad_norm": 0.003211492905393243, "learning_rate": 8.077260755048288e-06, "loss": 0.0001, "step": 5730 }, { "epoch": 1.6772022241732514, "grad_norm": 0.0006992241251282394, "learning_rate": 8.069944395668716e-06, "loss": 0.0, "step": 5731 }, { "epoch": 1.6774948785484343, "grad_norm": 0.0003750199975911528, "learning_rate": 8.062628036289142e-06, "loss": 0.0, "step": 5732 }, { "epoch": 1.6777875329236172, "grad_norm": 0.0006417161202989519, "learning_rate": 8.05531167690957e-06, "loss": 0.0, "step": 5733 }, { "epoch": 1.6780801872988, "grad_norm": 0.0015260449144989252, "learning_rate": 8.047995317529998e-06, "loss": 0.0, "step": 5734 }, { "epoch": 1.678372841673983, "grad_norm": 0.0005805394030176103, "learning_rate": 8.040678958150425e-06, "loss": 0.0, "step": 5735 }, { "epoch": 1.6786654960491658, "grad_norm": 0.001112207886762917, "learning_rate": 8.033362598770853e-06, "loss": 0.0, "step": 5736 }, { "epoch": 1.6789581504243487, "grad_norm": 0.002090575871989131, "learning_rate": 8.02604623939128e-06, "loss": 0.0, "step": 5737 }, { "epoch": 1.6792508047995316, "grad_norm": 0.0011604102328419685, "learning_rate": 8.018729880011707e-06, "loss": 0.0, "step": 5738 }, { "epoch": 1.6795434591747147, "grad_norm": 0.0008536601671949029, "learning_rate": 8.011413520632135e-06, "loss": 0.0, "step": 5739 }, { "epoch": 1.6798361135498976, "grad_norm": 0.004838922992348671, "learning_rate": 8.004097161252561e-06, "loss": 0.0001, "step": 5740 }, { "epoch": 1.6801287679250805, "grad_norm": 0.0020388218108564615, "learning_rate": 7.996780801872989e-06, "loss": 0.0, "step": 5741 }, { "epoch": 1.6804214223002634, "grad_norm": 0.015307688154280186, "learning_rate": 7.989464442493417e-06, "loss": 0.0002, "step": 5742 }, { "epoch": 1.6807140766754463, "grad_norm": 0.00044861010974273086, "learning_rate": 7.982148083113843e-06, "loss": 0.0, "step": 5743 }, { "epoch": 1.6810067310506294, "grad_norm": 0.006845089606940746, "learning_rate": 7.974831723734271e-06, "loss": 0.0001, "step": 5744 }, { "epoch": 1.6812993854258123, "grad_norm": 0.004504820331931114, "learning_rate": 7.967515364354697e-06, "loss": 0.0001, "step": 5745 }, { "epoch": 1.6815920398009951, "grad_norm": 0.001339981216005981, "learning_rate": 7.960199004975125e-06, "loss": 0.0, "step": 5746 }, { "epoch": 1.681884694176178, "grad_norm": 0.002657198579981923, "learning_rate": 7.952882645595552e-06, "loss": 0.0001, "step": 5747 }, { "epoch": 1.682177348551361, "grad_norm": 0.0030452387873083353, "learning_rate": 7.94556628621598e-06, "loss": 0.0001, "step": 5748 }, { "epoch": 1.6824700029265438, "grad_norm": 0.002118358388543129, "learning_rate": 7.938249926836406e-06, "loss": 0.0, "step": 5749 }, { "epoch": 1.6827626573017267, "grad_norm": 0.006794402375817299, "learning_rate": 7.930933567456834e-06, "loss": 0.0001, "step": 5750 }, { "epoch": 1.6830553116769096, "grad_norm": 0.0026812430005520582, "learning_rate": 7.92361720807726e-06, "loss": 0.0001, "step": 5751 }, { "epoch": 1.6833479660520925, "grad_norm": 0.004444151651114225, "learning_rate": 7.916300848697688e-06, "loss": 0.0001, "step": 5752 }, { "epoch": 1.6836406204272754, "grad_norm": 0.003096078522503376, "learning_rate": 7.908984489318114e-06, "loss": 0.0001, "step": 5753 }, { "epoch": 1.6839332748024582, "grad_norm": 0.0019110736902803183, "learning_rate": 7.901668129938542e-06, "loss": 0.0, "step": 5754 }, { "epoch": 1.6842259291776411, "grad_norm": 0.005232947412878275, "learning_rate": 7.89435177055897e-06, "loss": 0.0001, "step": 5755 }, { "epoch": 1.684518583552824, "grad_norm": 0.003011050634086132, "learning_rate": 7.887035411179396e-06, "loss": 0.0001, "step": 5756 }, { "epoch": 1.684811237928007, "grad_norm": 0.0004928670823574066, "learning_rate": 7.879719051799824e-06, "loss": 0.0, "step": 5757 }, { "epoch": 1.6851038923031898, "grad_norm": 0.006043503992259502, "learning_rate": 7.872402692420252e-06, "loss": 0.0001, "step": 5758 }, { "epoch": 1.6853965466783727, "grad_norm": 0.0013670605840161443, "learning_rate": 7.865086333040679e-06, "loss": 0.0, "step": 5759 }, { "epoch": 1.6856892010535558, "grad_norm": 0.0008556185639463365, "learning_rate": 7.857769973661106e-06, "loss": 0.0, "step": 5760 }, { "epoch": 1.6859818554287387, "grad_norm": 0.0012822890421375632, "learning_rate": 7.850453614281533e-06, "loss": 0.0, "step": 5761 }, { "epoch": 1.6862745098039216, "grad_norm": 0.0002606217167340219, "learning_rate": 7.84313725490196e-06, "loss": 0.0, "step": 5762 }, { "epoch": 1.6865671641791045, "grad_norm": 0.0017107477178797126, "learning_rate": 7.835820895522389e-06, "loss": 0.0, "step": 5763 }, { "epoch": 1.6868598185542873, "grad_norm": 0.0019767291378229856, "learning_rate": 7.828504536142815e-06, "loss": 0.0, "step": 5764 }, { "epoch": 1.6871524729294705, "grad_norm": 10.906447410583496, "learning_rate": 7.821188176763243e-06, "loss": 0.0728, "step": 5765 }, { "epoch": 1.6874451273046533, "grad_norm": 0.0011748663382604718, "learning_rate": 7.81387181738367e-06, "loss": 0.0, "step": 5766 }, { "epoch": 1.6877377816798362, "grad_norm": 0.08097922801971436, "learning_rate": 7.806555458004097e-06, "loss": 0.0006, "step": 5767 }, { "epoch": 1.6880304360550191, "grad_norm": 0.00511398958042264, "learning_rate": 7.799239098624525e-06, "loss": 0.0001, "step": 5768 }, { "epoch": 1.688323090430202, "grad_norm": 0.006898779422044754, "learning_rate": 7.791922739244951e-06, "loss": 0.0001, "step": 5769 }, { "epoch": 1.688615744805385, "grad_norm": 0.0006506079225800931, "learning_rate": 7.78460637986538e-06, "loss": 0.0, "step": 5770 }, { "epoch": 1.6889083991805678, "grad_norm": 0.005674232728779316, "learning_rate": 7.777290020485807e-06, "loss": 0.0001, "step": 5771 }, { "epoch": 1.6892010535557507, "grad_norm": 0.0003180634812451899, "learning_rate": 7.769973661106233e-06, "loss": 0.0, "step": 5772 }, { "epoch": 1.6894937079309336, "grad_norm": 0.023299410939216614, "learning_rate": 7.762657301726661e-06, "loss": 0.0002, "step": 5773 }, { "epoch": 1.6897863623061165, "grad_norm": 0.0024151585530489683, "learning_rate": 7.75534094234709e-06, "loss": 0.0, "step": 5774 }, { "epoch": 1.6900790166812993, "grad_norm": 0.0006375240045599639, "learning_rate": 7.748024582967516e-06, "loss": 0.0, "step": 5775 }, { "epoch": 1.6903716710564822, "grad_norm": 0.005888013169169426, "learning_rate": 7.740708223587943e-06, "loss": 0.0001, "step": 5776 }, { "epoch": 1.6906643254316651, "grad_norm": 0.002546227304264903, "learning_rate": 7.73339186420837e-06, "loss": 0.0, "step": 5777 }, { "epoch": 1.690956979806848, "grad_norm": 0.0007193618221208453, "learning_rate": 7.726075504828798e-06, "loss": 0.0, "step": 5778 }, { "epoch": 1.691249634182031, "grad_norm": 0.006669328082352877, "learning_rate": 7.718759145449226e-06, "loss": 0.0001, "step": 5779 }, { "epoch": 1.6915422885572138, "grad_norm": 0.0009613312431611121, "learning_rate": 7.711442786069652e-06, "loss": 0.0, "step": 5780 }, { "epoch": 1.691834942932397, "grad_norm": 0.003605912672355771, "learning_rate": 7.70412642669008e-06, "loss": 0.0001, "step": 5781 }, { "epoch": 1.6921275973075798, "grad_norm": 0.0015000062994658947, "learning_rate": 7.696810067310508e-06, "loss": 0.0, "step": 5782 }, { "epoch": 1.6924202516827627, "grad_norm": 0.0006242544623091817, "learning_rate": 7.689493707930934e-06, "loss": 0.0, "step": 5783 }, { "epoch": 1.6927129060579456, "grad_norm": 0.007943959906697273, "learning_rate": 7.682177348551362e-06, "loss": 0.0001, "step": 5784 }, { "epoch": 1.6930055604331284, "grad_norm": 9.961484465748072e-05, "learning_rate": 7.674860989171788e-06, "loss": 0.0, "step": 5785 }, { "epoch": 1.6932982148083113, "grad_norm": 0.0051424140110611916, "learning_rate": 7.667544629792216e-06, "loss": 0.0, "step": 5786 }, { "epoch": 1.6935908691834944, "grad_norm": 0.004370726179331541, "learning_rate": 7.660228270412644e-06, "loss": 0.0001, "step": 5787 }, { "epoch": 1.6938835235586773, "grad_norm": 0.001413586433045566, "learning_rate": 7.65291191103307e-06, "loss": 0.0, "step": 5788 }, { "epoch": 1.6941761779338602, "grad_norm": 0.001110206125304103, "learning_rate": 7.645595551653498e-06, "loss": 0.0, "step": 5789 }, { "epoch": 1.694468832309043, "grad_norm": 0.0006187248509377241, "learning_rate": 7.638279192273926e-06, "loss": 0.0, "step": 5790 }, { "epoch": 1.694761486684226, "grad_norm": 0.001334872213192284, "learning_rate": 7.630962832894353e-06, "loss": 0.0, "step": 5791 }, { "epoch": 1.6950541410594089, "grad_norm": 0.0008332266588695347, "learning_rate": 7.62364647351478e-06, "loss": 0.0, "step": 5792 }, { "epoch": 1.6953467954345918, "grad_norm": 0.0010400940664112568, "learning_rate": 7.616330114135206e-06, "loss": 0.0, "step": 5793 }, { "epoch": 1.6956394498097747, "grad_norm": 0.0016752583906054497, "learning_rate": 7.609013754755634e-06, "loss": 0.0, "step": 5794 }, { "epoch": 1.6959321041849575, "grad_norm": 0.0008904021815396845, "learning_rate": 7.601697395376062e-06, "loss": 0.0, "step": 5795 }, { "epoch": 1.6962247585601404, "grad_norm": 0.005862629506736994, "learning_rate": 7.594381035996488e-06, "loss": 0.0001, "step": 5796 }, { "epoch": 1.6965174129353233, "grad_norm": 0.0006948685040697455, "learning_rate": 7.587064676616916e-06, "loss": 0.0, "step": 5797 }, { "epoch": 1.6968100673105062, "grad_norm": 1.873349666595459, "learning_rate": 7.579748317237344e-06, "loss": 0.0037, "step": 5798 }, { "epoch": 1.697102721685689, "grad_norm": 0.00048663359484635293, "learning_rate": 7.57243195785777e-06, "loss": 0.0, "step": 5799 }, { "epoch": 1.697395376060872, "grad_norm": 0.00143390370067209, "learning_rate": 7.565115598478198e-06, "loss": 0.0, "step": 5800 }, { "epoch": 1.6976880304360549, "grad_norm": 0.0016824619378894567, "learning_rate": 7.5577992390986245e-06, "loss": 0.0, "step": 5801 }, { "epoch": 1.6979806848112378, "grad_norm": 0.0010202049743384123, "learning_rate": 7.550482879719052e-06, "loss": 0.0, "step": 5802 }, { "epoch": 1.6982733391864209, "grad_norm": 0.001123300869949162, "learning_rate": 7.54316652033948e-06, "loss": 0.0, "step": 5803 }, { "epoch": 1.6985659935616038, "grad_norm": 0.0006333779892884195, "learning_rate": 7.535850160959907e-06, "loss": 0.0, "step": 5804 }, { "epoch": 1.6988586479367866, "grad_norm": 0.0012238698545843363, "learning_rate": 7.5285338015803346e-06, "loss": 0.0, "step": 5805 }, { "epoch": 1.6991513023119695, "grad_norm": 0.00036312537849880755, "learning_rate": 7.521217442200761e-06, "loss": 0.0, "step": 5806 }, { "epoch": 1.6994439566871524, "grad_norm": 0.0009994838619604707, "learning_rate": 7.513901082821189e-06, "loss": 0.0, "step": 5807 }, { "epoch": 1.6997366110623355, "grad_norm": 0.0009392634965479374, "learning_rate": 7.506584723441616e-06, "loss": 0.0, "step": 5808 }, { "epoch": 1.7000292654375184, "grad_norm": 0.0006655640318058431, "learning_rate": 7.499268364062043e-06, "loss": 0.0, "step": 5809 }, { "epoch": 1.7003219198127013, "grad_norm": 0.0019335742108523846, "learning_rate": 7.49195200468247e-06, "loss": 0.0, "step": 5810 }, { "epoch": 1.7006145741878842, "grad_norm": 0.004660231526941061, "learning_rate": 7.484635645302898e-06, "loss": 0.0001, "step": 5811 }, { "epoch": 1.700907228563067, "grad_norm": 0.001245697378180921, "learning_rate": 7.477319285923324e-06, "loss": 0.0, "step": 5812 }, { "epoch": 1.70119988293825, "grad_norm": 0.00019114658061880618, "learning_rate": 7.470002926543752e-06, "loss": 0.0, "step": 5813 }, { "epoch": 1.7014925373134329, "grad_norm": 0.0001273701636819169, "learning_rate": 7.4626865671641785e-06, "loss": 0.0, "step": 5814 }, { "epoch": 1.7017851916886158, "grad_norm": 0.0007166353752836585, "learning_rate": 7.4553702077846065e-06, "loss": 0.0, "step": 5815 }, { "epoch": 1.7020778460637986, "grad_norm": 0.022403433918952942, "learning_rate": 7.448053848405034e-06, "loss": 0.0002, "step": 5816 }, { "epoch": 1.7023705004389815, "grad_norm": 0.0005216248100623488, "learning_rate": 7.440737489025461e-06, "loss": 0.0, "step": 5817 }, { "epoch": 1.7026631548141644, "grad_norm": 0.00046261277748271823, "learning_rate": 7.433421129645889e-06, "loss": 0.0, "step": 5818 }, { "epoch": 1.7029558091893473, "grad_norm": 0.0011683704797178507, "learning_rate": 7.4261047702663166e-06, "loss": 0.0, "step": 5819 }, { "epoch": 1.7032484635645302, "grad_norm": 0.003403689945116639, "learning_rate": 7.418788410886743e-06, "loss": 0.0001, "step": 5820 }, { "epoch": 1.703541117939713, "grad_norm": 0.0008349033887498081, "learning_rate": 7.411472051507171e-06, "loss": 0.0, "step": 5821 }, { "epoch": 1.703833772314896, "grad_norm": 0.0022660107351839542, "learning_rate": 7.404155692127597e-06, "loss": 0.0, "step": 5822 }, { "epoch": 1.7041264266900789, "grad_norm": 0.0004610086034517735, "learning_rate": 7.396839332748025e-06, "loss": 0.0, "step": 5823 }, { "epoch": 1.704419081065262, "grad_norm": 0.0004957149503752589, "learning_rate": 7.389522973368453e-06, "loss": 0.0, "step": 5824 }, { "epoch": 1.7047117354404449, "grad_norm": 0.0004168192099314183, "learning_rate": 7.382206613988879e-06, "loss": 0.0, "step": 5825 }, { "epoch": 1.7050043898156277, "grad_norm": 0.00333635276183486, "learning_rate": 7.374890254609307e-06, "loss": 0.0, "step": 5826 }, { "epoch": 1.7052970441908106, "grad_norm": 0.014504407532513142, "learning_rate": 7.367573895229734e-06, "loss": 0.0001, "step": 5827 }, { "epoch": 1.7055896985659935, "grad_norm": 0.0007202645647339523, "learning_rate": 7.360257535850161e-06, "loss": 0.0, "step": 5828 }, { "epoch": 1.7058823529411766, "grad_norm": 0.00040720164543017745, "learning_rate": 7.3529411764705884e-06, "loss": 0.0, "step": 5829 }, { "epoch": 1.7061750073163595, "grad_norm": 4.333349704742432, "learning_rate": 7.3456248170910155e-06, "loss": 0.298, "step": 5830 }, { "epoch": 1.7064676616915424, "grad_norm": 0.008811557665467262, "learning_rate": 7.338308457711443e-06, "loss": 0.0001, "step": 5831 }, { "epoch": 1.7067603160667253, "grad_norm": 0.0010597541695460677, "learning_rate": 7.330992098331871e-06, "loss": 0.0, "step": 5832 }, { "epoch": 1.7070529704419082, "grad_norm": 0.0005229754606261849, "learning_rate": 7.323675738952297e-06, "loss": 0.0, "step": 5833 }, { "epoch": 1.707345624817091, "grad_norm": 0.0004036907048430294, "learning_rate": 7.316359379572725e-06, "loss": 0.0, "step": 5834 }, { "epoch": 1.707638279192274, "grad_norm": 0.0015681361546739936, "learning_rate": 7.309043020193153e-06, "loss": 0.0, "step": 5835 }, { "epoch": 1.7079309335674568, "grad_norm": 0.0007967501296661794, "learning_rate": 7.301726660813579e-06, "loss": 0.0, "step": 5836 }, { "epoch": 1.7082235879426397, "grad_norm": 0.0037929262034595013, "learning_rate": 7.294410301434007e-06, "loss": 0.0, "step": 5837 }, { "epoch": 1.7085162423178226, "grad_norm": 0.0007488639676012099, "learning_rate": 7.287093942054433e-06, "loss": 0.0, "step": 5838 }, { "epoch": 1.7088088966930055, "grad_norm": 0.0005727543029934168, "learning_rate": 7.279777582674861e-06, "loss": 0.0, "step": 5839 }, { "epoch": 1.7091015510681884, "grad_norm": 0.0005464969435706735, "learning_rate": 7.272461223295289e-06, "loss": 0.0, "step": 5840 }, { "epoch": 1.7093942054433713, "grad_norm": 0.0005111905629746616, "learning_rate": 7.265144863915715e-06, "loss": 0.0, "step": 5841 }, { "epoch": 1.7096868598185542, "grad_norm": 0.005028417333960533, "learning_rate": 7.257828504536143e-06, "loss": 0.0001, "step": 5842 }, { "epoch": 1.709979514193737, "grad_norm": 0.0034411484375596046, "learning_rate": 7.250512145156571e-06, "loss": 0.0001, "step": 5843 }, { "epoch": 1.71027216856892, "grad_norm": 0.027304381132125854, "learning_rate": 7.2431957857769975e-06, "loss": 0.0001, "step": 5844 }, { "epoch": 1.710564822944103, "grad_norm": 0.000758185051381588, "learning_rate": 7.2358794263974255e-06, "loss": 0.0, "step": 5845 }, { "epoch": 1.710857477319286, "grad_norm": 0.0034596100449562073, "learning_rate": 7.228563067017852e-06, "loss": 0.0001, "step": 5846 }, { "epoch": 1.7111501316944688, "grad_norm": 0.0017440527444705367, "learning_rate": 7.22124670763828e-06, "loss": 0.0, "step": 5847 }, { "epoch": 1.7114427860696517, "grad_norm": 0.00359273049980402, "learning_rate": 7.213930348258708e-06, "loss": 0.0001, "step": 5848 }, { "epoch": 1.7117354404448346, "grad_norm": 0.002152426866814494, "learning_rate": 7.206613988879134e-06, "loss": 0.0, "step": 5849 }, { "epoch": 1.7120280948200177, "grad_norm": 2.712299108505249, "learning_rate": 7.199297629499562e-06, "loss": 0.1384, "step": 5850 }, { "epoch": 1.7123207491952006, "grad_norm": 0.0023311313707381487, "learning_rate": 7.191981270119989e-06, "loss": 0.0, "step": 5851 }, { "epoch": 1.7126134035703835, "grad_norm": 0.01575295813381672, "learning_rate": 7.184664910740416e-06, "loss": 0.0003, "step": 5852 }, { "epoch": 1.7129060579455664, "grad_norm": 0.002103086095303297, "learning_rate": 7.177348551360843e-06, "loss": 0.0, "step": 5853 }, { "epoch": 1.7131987123207493, "grad_norm": 0.004746034741401672, "learning_rate": 7.17003219198127e-06, "loss": 0.0001, "step": 5854 }, { "epoch": 1.7134913666959322, "grad_norm": 0.009943942539393902, "learning_rate": 7.162715832601697e-06, "loss": 0.0001, "step": 5855 }, { "epoch": 1.713784021071115, "grad_norm": 0.008769072592258453, "learning_rate": 7.155399473222125e-06, "loss": 0.0001, "step": 5856 }, { "epoch": 1.714076675446298, "grad_norm": 0.03581611439585686, "learning_rate": 7.1480831138425516e-06, "loss": 0.0005, "step": 5857 }, { "epoch": 1.7143693298214808, "grad_norm": 0.007830784656107426, "learning_rate": 7.1407667544629795e-06, "loss": 0.0001, "step": 5858 }, { "epoch": 1.7146619841966637, "grad_norm": 0.012227417901158333, "learning_rate": 7.1334503950834075e-06, "loss": 0.0002, "step": 5859 }, { "epoch": 1.7149546385718466, "grad_norm": 0.013806314207613468, "learning_rate": 7.126134035703834e-06, "loss": 0.0002, "step": 5860 }, { "epoch": 1.7152472929470295, "grad_norm": 0.005988314747810364, "learning_rate": 7.118817676324262e-06, "loss": 0.0001, "step": 5861 }, { "epoch": 1.7155399473222124, "grad_norm": 0.00835619680583477, "learning_rate": 7.111501316944688e-06, "loss": 0.0001, "step": 5862 }, { "epoch": 1.7158326016973953, "grad_norm": 0.1585269570350647, "learning_rate": 7.104184957565116e-06, "loss": 0.0017, "step": 5863 }, { "epoch": 1.7161252560725782, "grad_norm": 0.00829259678721428, "learning_rate": 7.096868598185544e-06, "loss": 0.0001, "step": 5864 }, { "epoch": 1.716417910447761, "grad_norm": 0.024783294647932053, "learning_rate": 7.08955223880597e-06, "loss": 0.0004, "step": 5865 }, { "epoch": 1.7167105648229442, "grad_norm": 0.012745466083288193, "learning_rate": 7.082235879426398e-06, "loss": 0.0002, "step": 5866 }, { "epoch": 1.717003219198127, "grad_norm": 0.010452482849359512, "learning_rate": 7.074919520046824e-06, "loss": 0.0001, "step": 5867 }, { "epoch": 1.71729587357331, "grad_norm": 0.020907893776893616, "learning_rate": 7.067603160667252e-06, "loss": 0.0003, "step": 5868 }, { "epoch": 1.7175885279484928, "grad_norm": 0.007186241913586855, "learning_rate": 7.06028680128768e-06, "loss": 0.0001, "step": 5869 }, { "epoch": 1.7178811823236757, "grad_norm": 0.011177362874150276, "learning_rate": 7.0529704419081064e-06, "loss": 0.0001, "step": 5870 }, { "epoch": 1.7181738366988588, "grad_norm": 0.06881462782621384, "learning_rate": 7.045654082528534e-06, "loss": 0.0009, "step": 5871 }, { "epoch": 1.7184664910740417, "grad_norm": 0.025284336879849434, "learning_rate": 7.038337723148962e-06, "loss": 0.0004, "step": 5872 }, { "epoch": 1.7187591454492246, "grad_norm": 0.0042735980823636055, "learning_rate": 7.031021363769389e-06, "loss": 0.0001, "step": 5873 }, { "epoch": 1.7190517998244075, "grad_norm": 0.004070568364113569, "learning_rate": 7.0237050043898166e-06, "loss": 0.0001, "step": 5874 }, { "epoch": 1.7193444541995904, "grad_norm": 0.01514324638992548, "learning_rate": 7.016388645010243e-06, "loss": 0.0002, "step": 5875 }, { "epoch": 1.7196371085747733, "grad_norm": 0.01873566210269928, "learning_rate": 7.009072285630671e-06, "loss": 0.0002, "step": 5876 }, { "epoch": 1.7199297629499561, "grad_norm": 0.02724928967654705, "learning_rate": 7.001755926251098e-06, "loss": 0.0004, "step": 5877 }, { "epoch": 1.720222417325139, "grad_norm": 0.046195484697818756, "learning_rate": 6.994439566871525e-06, "loss": 0.0005, "step": 5878 }, { "epoch": 1.720515071700322, "grad_norm": 0.47184377908706665, "learning_rate": 6.987123207491952e-06, "loss": 0.0064, "step": 5879 }, { "epoch": 1.7208077260755048, "grad_norm": 0.022266795858740807, "learning_rate": 6.97980684811238e-06, "loss": 0.0003, "step": 5880 }, { "epoch": 1.7211003804506877, "grad_norm": 0.01935538277029991, "learning_rate": 6.972490488732806e-06, "loss": 0.0003, "step": 5881 }, { "epoch": 1.7213930348258706, "grad_norm": 0.004155569709837437, "learning_rate": 6.965174129353234e-06, "loss": 0.0001, "step": 5882 }, { "epoch": 1.7216856892010535, "grad_norm": 0.004147347528487444, "learning_rate": 6.9578577699736605e-06, "loss": 0.0001, "step": 5883 }, { "epoch": 1.7219783435762364, "grad_norm": 0.00926827359944582, "learning_rate": 6.9505414105940884e-06, "loss": 0.0001, "step": 5884 }, { "epoch": 1.7222709979514192, "grad_norm": 0.013978640548884869, "learning_rate": 6.943225051214516e-06, "loss": 0.0003, "step": 5885 }, { "epoch": 1.7225636523266021, "grad_norm": 0.0015110508538782597, "learning_rate": 6.935908691834943e-06, "loss": 0.0, "step": 5886 }, { "epoch": 1.7228563067017852, "grad_norm": 0.0021568655502051115, "learning_rate": 6.928592332455371e-06, "loss": 0.0, "step": 5887 }, { "epoch": 1.7231489610769681, "grad_norm": 0.002588055795058608, "learning_rate": 6.9212759730757985e-06, "loss": 0.0, "step": 5888 }, { "epoch": 1.723441615452151, "grad_norm": 0.003095940686762333, "learning_rate": 6.913959613696225e-06, "loss": 0.0001, "step": 5889 }, { "epoch": 1.723734269827334, "grad_norm": 0.0008062701090238988, "learning_rate": 6.906643254316653e-06, "loss": 0.0, "step": 5890 }, { "epoch": 1.7240269242025168, "grad_norm": 0.0026606128085404634, "learning_rate": 6.899326894937079e-06, "loss": 0.0001, "step": 5891 }, { "epoch": 1.7243195785776997, "grad_norm": 0.0003834792005363852, "learning_rate": 6.892010535557507e-06, "loss": 0.0, "step": 5892 }, { "epoch": 1.7246122329528828, "grad_norm": 0.004114055074751377, "learning_rate": 6.884694176177935e-06, "loss": 0.0001, "step": 5893 }, { "epoch": 1.7249048873280657, "grad_norm": 0.0007469934644177556, "learning_rate": 6.877377816798361e-06, "loss": 0.0, "step": 5894 }, { "epoch": 1.7251975417032486, "grad_norm": 0.0007380041643045843, "learning_rate": 6.870061457418789e-06, "loss": 0.0, "step": 5895 }, { "epoch": 1.7254901960784315, "grad_norm": 0.0008967572939582169, "learning_rate": 6.862745098039216e-06, "loss": 0.0, "step": 5896 }, { "epoch": 1.7257828504536143, "grad_norm": 0.000646058761049062, "learning_rate": 6.855428738659643e-06, "loss": 0.0, "step": 5897 }, { "epoch": 1.7260755048287972, "grad_norm": 0.0005116090178489685, "learning_rate": 6.84811237928007e-06, "loss": 0.0, "step": 5898 }, { "epoch": 1.7263681592039801, "grad_norm": 0.003847175743430853, "learning_rate": 6.8407960199004975e-06, "loss": 0.0001, "step": 5899 }, { "epoch": 1.726660813579163, "grad_norm": 0.0018641845090314746, "learning_rate": 6.833479660520925e-06, "loss": 0.0, "step": 5900 }, { "epoch": 1.726953467954346, "grad_norm": 0.0018206120003014803, "learning_rate": 6.826163301141353e-06, "loss": 0.0, "step": 5901 }, { "epoch": 1.7272461223295288, "grad_norm": 0.0013743533054366708, "learning_rate": 6.818846941761779e-06, "loss": 0.0, "step": 5902 }, { "epoch": 1.7275387767047117, "grad_norm": 0.0014163124142214656, "learning_rate": 6.811530582382207e-06, "loss": 0.0, "step": 5903 }, { "epoch": 1.7278314310798946, "grad_norm": 0.0029564399737864733, "learning_rate": 6.804214223002635e-06, "loss": 0.0, "step": 5904 }, { "epoch": 1.7281240854550775, "grad_norm": 0.001355087966658175, "learning_rate": 6.796897863623061e-06, "loss": 0.0, "step": 5905 }, { "epoch": 1.7284167398302603, "grad_norm": 0.00047083597746677697, "learning_rate": 6.789581504243489e-06, "loss": 0.0, "step": 5906 }, { "epoch": 1.7287093942054432, "grad_norm": 0.0031178754288703203, "learning_rate": 6.782265144863915e-06, "loss": 0.0001, "step": 5907 }, { "epoch": 1.7290020485806261, "grad_norm": 0.0011868373258039355, "learning_rate": 6.774948785484343e-06, "loss": 0.0, "step": 5908 }, { "epoch": 1.7292947029558092, "grad_norm": 0.003698865883052349, "learning_rate": 6.767632426104771e-06, "loss": 0.0001, "step": 5909 }, { "epoch": 1.7295873573309921, "grad_norm": 0.0058182161301374435, "learning_rate": 6.760316066725197e-06, "loss": 0.0001, "step": 5910 }, { "epoch": 1.729880011706175, "grad_norm": 0.00122758187353611, "learning_rate": 6.752999707345625e-06, "loss": 0.0, "step": 5911 }, { "epoch": 1.730172666081358, "grad_norm": 0.0046470691449940205, "learning_rate": 6.745683347966053e-06, "loss": 0.0001, "step": 5912 }, { "epoch": 1.7304653204565408, "grad_norm": 0.000916247081477195, "learning_rate": 6.7383669885864795e-06, "loss": 0.0, "step": 5913 }, { "epoch": 1.7307579748317239, "grad_norm": 0.008241435512900352, "learning_rate": 6.7310506292069075e-06, "loss": 0.0001, "step": 5914 }, { "epoch": 1.7310506292069068, "grad_norm": 0.0019211042672395706, "learning_rate": 6.723734269827334e-06, "loss": 0.0, "step": 5915 }, { "epoch": 1.7313432835820897, "grad_norm": 0.0039780582301318645, "learning_rate": 6.716417910447762e-06, "loss": 0.0001, "step": 5916 }, { "epoch": 1.7316359379572726, "grad_norm": 0.0008696449222043157, "learning_rate": 6.70910155106819e-06, "loss": 0.0, "step": 5917 }, { "epoch": 1.7319285923324554, "grad_norm": 0.004171712789684534, "learning_rate": 6.701785191688616e-06, "loss": 0.0001, "step": 5918 }, { "epoch": 1.7322212467076383, "grad_norm": 0.000716442649718374, "learning_rate": 6.694468832309044e-06, "loss": 0.0, "step": 5919 }, { "epoch": 1.7325139010828212, "grad_norm": 0.0023232263047248125, "learning_rate": 6.687152472929471e-06, "loss": 0.0, "step": 5920 }, { "epoch": 1.732806555458004, "grad_norm": 0.0019374418770894408, "learning_rate": 6.679836113549898e-06, "loss": 0.0, "step": 5921 }, { "epoch": 1.733099209833187, "grad_norm": 0.001732157776132226, "learning_rate": 6.672519754170325e-06, "loss": 0.0, "step": 5922 }, { "epoch": 1.7333918642083699, "grad_norm": 0.0008150420617312193, "learning_rate": 6.665203394790752e-06, "loss": 0.0, "step": 5923 }, { "epoch": 1.7336845185835528, "grad_norm": 0.000821317604277283, "learning_rate": 6.657887035411179e-06, "loss": 0.0, "step": 5924 }, { "epoch": 1.7339771729587357, "grad_norm": 0.0010837716981768608, "learning_rate": 6.650570676031607e-06, "loss": 0.0, "step": 5925 }, { "epoch": 1.7342698273339185, "grad_norm": 0.0008237934089265764, "learning_rate": 6.6432543166520335e-06, "loss": 0.0, "step": 5926 }, { "epoch": 1.7345624817091014, "grad_norm": 0.000993865542113781, "learning_rate": 6.6359379572724615e-06, "loss": 0.0, "step": 5927 }, { "epoch": 1.7348551360842843, "grad_norm": 0.0005446638097055256, "learning_rate": 6.6286215978928894e-06, "loss": 0.0, "step": 5928 }, { "epoch": 1.7351477904594672, "grad_norm": 0.00519817229360342, "learning_rate": 6.621305238513316e-06, "loss": 0.0001, "step": 5929 }, { "epoch": 1.7354404448346503, "grad_norm": 0.0009001668076962233, "learning_rate": 6.613988879133744e-06, "loss": 0.0, "step": 5930 }, { "epoch": 1.7357330992098332, "grad_norm": 0.0030989614315330982, "learning_rate": 6.60667251975417e-06, "loss": 0.0001, "step": 5931 }, { "epoch": 1.736025753585016, "grad_norm": 0.0009251827141270041, "learning_rate": 6.599356160374598e-06, "loss": 0.0, "step": 5932 }, { "epoch": 1.736318407960199, "grad_norm": 0.0012371689081192017, "learning_rate": 6.592039800995026e-06, "loss": 0.0, "step": 5933 }, { "epoch": 1.7366110623353819, "grad_norm": 0.0036600898019969463, "learning_rate": 6.584723441615452e-06, "loss": 0.0001, "step": 5934 }, { "epoch": 1.736903716710565, "grad_norm": 0.0002120627905242145, "learning_rate": 6.57740708223588e-06, "loss": 0.0, "step": 5935 }, { "epoch": 1.7371963710857479, "grad_norm": 0.002093351213261485, "learning_rate": 6.570090722856306e-06, "loss": 0.0, "step": 5936 }, { "epoch": 1.7374890254609308, "grad_norm": 0.0016823607729747891, "learning_rate": 6.562774363476734e-06, "loss": 0.0, "step": 5937 }, { "epoch": 1.7377816798361136, "grad_norm": 0.0007766742492094636, "learning_rate": 6.555458004097162e-06, "loss": 0.0, "step": 5938 }, { "epoch": 1.7380743342112965, "grad_norm": 0.0010464844526723027, "learning_rate": 6.5481416447175884e-06, "loss": 0.0, "step": 5939 }, { "epoch": 1.7383669885864794, "grad_norm": 0.005947027821093798, "learning_rate": 6.540825285338016e-06, "loss": 0.0001, "step": 5940 }, { "epoch": 1.7386596429616623, "grad_norm": 0.000575781858060509, "learning_rate": 6.533508925958444e-06, "loss": 0.0, "step": 5941 }, { "epoch": 1.7389522973368452, "grad_norm": 0.0007926999824121594, "learning_rate": 6.526192566578871e-06, "loss": 0.0, "step": 5942 }, { "epoch": 1.739244951712028, "grad_norm": 0.0013060495257377625, "learning_rate": 6.5188762071992985e-06, "loss": 0.0, "step": 5943 }, { "epoch": 1.739537606087211, "grad_norm": 0.0033319902140647173, "learning_rate": 6.511559847819725e-06, "loss": 0.0001, "step": 5944 }, { "epoch": 1.7398302604623939, "grad_norm": 0.015158601105213165, "learning_rate": 6.504243488440153e-06, "loss": 0.0001, "step": 5945 }, { "epoch": 1.7401229148375768, "grad_norm": 0.0009031699155457318, "learning_rate": 6.49692712906058e-06, "loss": 0.0, "step": 5946 }, { "epoch": 1.7404155692127596, "grad_norm": 0.0016654676292091608, "learning_rate": 6.489610769681007e-06, "loss": 0.0, "step": 5947 }, { "epoch": 1.7407082235879425, "grad_norm": 0.0007055064779706299, "learning_rate": 6.482294410301434e-06, "loss": 0.0, "step": 5948 }, { "epoch": 1.7410008779631254, "grad_norm": 0.0005536130629479885, "learning_rate": 6.474978050921862e-06, "loss": 0.0, "step": 5949 }, { "epoch": 1.7412935323383083, "grad_norm": 0.0008756743045523763, "learning_rate": 6.467661691542288e-06, "loss": 0.0, "step": 5950 }, { "epoch": 1.7415861867134914, "grad_norm": 0.00022419232118409127, "learning_rate": 6.460345332162716e-06, "loss": 0.0, "step": 5951 }, { "epoch": 1.7418788410886743, "grad_norm": 0.001122329500503838, "learning_rate": 6.4530289727831425e-06, "loss": 0.0, "step": 5952 }, { "epoch": 1.7421714954638572, "grad_norm": 0.0014706572983413935, "learning_rate": 6.44571261340357e-06, "loss": 0.0, "step": 5953 }, { "epoch": 1.74246414983904, "grad_norm": 0.0018914340762421489, "learning_rate": 6.438396254023998e-06, "loss": 0.0, "step": 5954 }, { "epoch": 1.742756804214223, "grad_norm": 0.0010767308995127678, "learning_rate": 6.431079894644425e-06, "loss": 0.0, "step": 5955 }, { "epoch": 1.743049458589406, "grad_norm": 0.0013149684527888894, "learning_rate": 6.4237635352648526e-06, "loss": 0.0, "step": 5956 }, { "epoch": 1.743342112964589, "grad_norm": 0.004895191639661789, "learning_rate": 6.4164471758852805e-06, "loss": 0.0001, "step": 5957 }, { "epoch": 1.7436347673397719, "grad_norm": 0.0004786603385582566, "learning_rate": 6.409130816505707e-06, "loss": 0.0, "step": 5958 }, { "epoch": 1.7439274217149547, "grad_norm": 0.0014774624723941088, "learning_rate": 6.401814457126135e-06, "loss": 0.0, "step": 5959 }, { "epoch": 1.7442200760901376, "grad_norm": 0.10711546987295151, "learning_rate": 6.394498097746561e-06, "loss": 0.0009, "step": 5960 }, { "epoch": 1.7445127304653205, "grad_norm": 0.001412463141605258, "learning_rate": 6.387181738366989e-06, "loss": 0.0, "step": 5961 }, { "epoch": 1.7448053848405034, "grad_norm": 0.0007251347415149212, "learning_rate": 6.379865378987417e-06, "loss": 0.0, "step": 5962 }, { "epoch": 1.7450980392156863, "grad_norm": 0.0013520853826776147, "learning_rate": 6.372549019607843e-06, "loss": 0.0, "step": 5963 }, { "epoch": 1.7453906935908692, "grad_norm": 0.0023613956291228533, "learning_rate": 6.365232660228271e-06, "loss": 0.0001, "step": 5964 }, { "epoch": 1.745683347966052, "grad_norm": 0.0010315789841115475, "learning_rate": 6.357916300848698e-06, "loss": 0.0, "step": 5965 }, { "epoch": 1.745976002341235, "grad_norm": 0.00086397078121081, "learning_rate": 6.350599941469125e-06, "loss": 0.0, "step": 5966 }, { "epoch": 1.7462686567164178, "grad_norm": 0.0010542155941948295, "learning_rate": 6.343283582089552e-06, "loss": 0.0, "step": 5967 }, { "epoch": 1.7465613110916007, "grad_norm": 0.04119636490941048, "learning_rate": 6.3359672227099795e-06, "loss": 0.0002, "step": 5968 }, { "epoch": 1.7468539654667836, "grad_norm": 0.0012462715385481715, "learning_rate": 6.328650863330407e-06, "loss": 0.0, "step": 5969 }, { "epoch": 1.7471466198419665, "grad_norm": 0.00031741769635118544, "learning_rate": 6.3213345039508346e-06, "loss": 0.0, "step": 5970 }, { "epoch": 1.7474392742171494, "grad_norm": 0.0022417607251554728, "learning_rate": 6.314018144571261e-06, "loss": 0.0, "step": 5971 }, { "epoch": 1.7477319285923325, "grad_norm": 0.0006855145911686122, "learning_rate": 6.306701785191689e-06, "loss": 0.0, "step": 5972 }, { "epoch": 1.7480245829675154, "grad_norm": 0.01104377955198288, "learning_rate": 6.299385425812117e-06, "loss": 0.0002, "step": 5973 }, { "epoch": 1.7483172373426983, "grad_norm": 0.0005324244848452508, "learning_rate": 6.292069066432543e-06, "loss": 0.0, "step": 5974 }, { "epoch": 1.7486098917178812, "grad_norm": 0.0008127905894070864, "learning_rate": 6.284752707052971e-06, "loss": 0.0, "step": 5975 }, { "epoch": 1.748902546093064, "grad_norm": 0.0008344686939381063, "learning_rate": 6.277436347673397e-06, "loss": 0.0, "step": 5976 }, { "epoch": 1.749195200468247, "grad_norm": 0.0004917003680020571, "learning_rate": 6.270119988293825e-06, "loss": 0.0, "step": 5977 }, { "epoch": 1.74948785484343, "grad_norm": 0.0009864646708592772, "learning_rate": 6.262803628914253e-06, "loss": 0.0, "step": 5978 }, { "epoch": 1.749780509218613, "grad_norm": 0.0019904731307178736, "learning_rate": 6.255487269534679e-06, "loss": 0.0, "step": 5979 }, { "epoch": 1.7500731635937958, "grad_norm": 0.006802576594054699, "learning_rate": 6.248170910155107e-06, "loss": 0.0001, "step": 5980 }, { "epoch": 1.7503658179689787, "grad_norm": 0.000930826470721513, "learning_rate": 6.240854550775534e-06, "loss": 0.0, "step": 5981 }, { "epoch": 1.7506584723441616, "grad_norm": 0.0014126853784546256, "learning_rate": 6.2335381913959615e-06, "loss": 0.0, "step": 5982 }, { "epoch": 1.7509511267193445, "grad_norm": 0.0048756287433207035, "learning_rate": 6.2262218320163894e-06, "loss": 0.0, "step": 5983 }, { "epoch": 1.7512437810945274, "grad_norm": 0.0008603222668170929, "learning_rate": 6.2189054726368165e-06, "loss": 0.0, "step": 5984 }, { "epoch": 1.7515364354697103, "grad_norm": 0.006025762762874365, "learning_rate": 6.211589113257244e-06, "loss": 0.0001, "step": 5985 }, { "epoch": 1.7518290898448932, "grad_norm": 0.0022837838623672724, "learning_rate": 6.204272753877671e-06, "loss": 0.0, "step": 5986 }, { "epoch": 1.752121744220076, "grad_norm": 0.0006897895946167409, "learning_rate": 6.196956394498099e-06, "loss": 0.0, "step": 5987 }, { "epoch": 1.752414398595259, "grad_norm": 0.0007595543283969164, "learning_rate": 6.189640035118526e-06, "loss": 0.0, "step": 5988 }, { "epoch": 1.7527070529704418, "grad_norm": 0.0010641933185979724, "learning_rate": 6.182323675738953e-06, "loss": 0.0, "step": 5989 }, { "epoch": 1.7529997073456247, "grad_norm": 0.000681478064507246, "learning_rate": 6.17500731635938e-06, "loss": 0.0, "step": 5990 }, { "epoch": 1.7532923617208076, "grad_norm": 0.0002897112863138318, "learning_rate": 6.167690956979807e-06, "loss": 0.0, "step": 5991 }, { "epoch": 1.7535850160959905, "grad_norm": 0.0010328495409339666, "learning_rate": 6.160374597600234e-06, "loss": 0.0, "step": 5992 }, { "epoch": 1.7538776704711734, "grad_norm": 0.0010134896729141474, "learning_rate": 6.153058238220661e-06, "loss": 0.0, "step": 5993 }, { "epoch": 1.7541703248463565, "grad_norm": 0.006686047185212374, "learning_rate": 6.1457418788410884e-06, "loss": 0.0001, "step": 5994 }, { "epoch": 1.7544629792215394, "grad_norm": 0.0012931345263496041, "learning_rate": 6.138425519461516e-06, "loss": 0.0, "step": 5995 }, { "epoch": 1.7547556335967223, "grad_norm": 0.0007904611411504447, "learning_rate": 6.1311091600819435e-06, "loss": 0.0, "step": 5996 }, { "epoch": 1.7550482879719052, "grad_norm": 0.0005322484066709876, "learning_rate": 6.123792800702371e-06, "loss": 0.0, "step": 5997 }, { "epoch": 1.755340942347088, "grad_norm": 0.0006279582157731056, "learning_rate": 6.116476441322798e-06, "loss": 0.0, "step": 5998 }, { "epoch": 1.7556335967222712, "grad_norm": 0.0011968952603638172, "learning_rate": 6.109160081943225e-06, "loss": 0.0, "step": 5999 }, { "epoch": 1.755926251097454, "grad_norm": 0.011229063384234905, "learning_rate": 6.101843722563653e-06, "loss": 0.0001, "step": 6000 }, { "epoch": 1.756218905472637, "grad_norm": 0.0007652883650735021, "learning_rate": 6.09452736318408e-06, "loss": 0.0, "step": 6001 }, { "epoch": 1.7565115598478198, "grad_norm": 0.0005294579896144569, "learning_rate": 6.087211003804507e-06, "loss": 0.0, "step": 6002 }, { "epoch": 1.7568042142230027, "grad_norm": 0.0006569406250491738, "learning_rate": 6.079894644424934e-06, "loss": 0.0, "step": 6003 }, { "epoch": 1.7570968685981856, "grad_norm": 0.007408336736261845, "learning_rate": 6.072578285045362e-06, "loss": 0.0001, "step": 6004 }, { "epoch": 1.7573895229733685, "grad_norm": 0.0008690960239619017, "learning_rate": 6.065261925665789e-06, "loss": 0.0, "step": 6005 }, { "epoch": 1.7576821773485514, "grad_norm": 0.0007068330887705088, "learning_rate": 6.057945566286216e-06, "loss": 0.0, "step": 6006 }, { "epoch": 1.7579748317237343, "grad_norm": 0.005987859331071377, "learning_rate": 6.050629206906643e-06, "loss": 0.0001, "step": 6007 }, { "epoch": 1.7582674860989171, "grad_norm": 0.0012825940502807498, "learning_rate": 6.043312847527071e-06, "loss": 0.0, "step": 6008 }, { "epoch": 1.7585601404741, "grad_norm": 0.0008381876978091896, "learning_rate": 6.035996488147498e-06, "loss": 0.0, "step": 6009 }, { "epoch": 1.758852794849283, "grad_norm": 0.0004752900858875364, "learning_rate": 6.0286801287679255e-06, "loss": 0.0, "step": 6010 }, { "epoch": 1.7591454492244658, "grad_norm": 0.0021880939602851868, "learning_rate": 6.0213637693883526e-06, "loss": 0.0, "step": 6011 }, { "epoch": 1.7594381035996487, "grad_norm": 0.000659592158626765, "learning_rate": 6.0140474100087805e-06, "loss": 0.0, "step": 6012 }, { "epoch": 1.7597307579748316, "grad_norm": 0.001379102817736566, "learning_rate": 6.006731050629208e-06, "loss": 0.0, "step": 6013 }, { "epoch": 1.7600234123500145, "grad_norm": 0.0027575218118727207, "learning_rate": 5.999414691249635e-06, "loss": 0.0, "step": 6014 }, { "epoch": 1.7603160667251976, "grad_norm": 0.0003732953919097781, "learning_rate": 5.992098331870062e-06, "loss": 0.0, "step": 6015 }, { "epoch": 1.7606087211003805, "grad_norm": 0.0004398910969030112, "learning_rate": 5.984781972490489e-06, "loss": 0.0, "step": 6016 }, { "epoch": 1.7609013754755634, "grad_norm": 0.0027367311995476484, "learning_rate": 5.977465613110916e-06, "loss": 0.0, "step": 6017 }, { "epoch": 1.7611940298507462, "grad_norm": 0.0009312551701441407, "learning_rate": 5.970149253731343e-06, "loss": 0.0, "step": 6018 }, { "epoch": 1.7614866842259291, "grad_norm": 0.00025712751084938645, "learning_rate": 5.96283289435177e-06, "loss": 0.0, "step": 6019 }, { "epoch": 1.7617793386011122, "grad_norm": 0.0007998314686119556, "learning_rate": 5.955516534972198e-06, "loss": 0.0, "step": 6020 }, { "epoch": 1.7620719929762951, "grad_norm": 1.298336148262024, "learning_rate": 5.948200175592625e-06, "loss": 0.0054, "step": 6021 }, { "epoch": 1.762364647351478, "grad_norm": 0.0007662497810088098, "learning_rate": 5.940883816213052e-06, "loss": 0.0, "step": 6022 }, { "epoch": 1.762657301726661, "grad_norm": 0.00021636112069245428, "learning_rate": 5.9335674568334795e-06, "loss": 0.0, "step": 6023 }, { "epoch": 1.7629499561018438, "grad_norm": 0.0003535682044457644, "learning_rate": 5.9262510974539075e-06, "loss": 0.0, "step": 6024 }, { "epoch": 1.7632426104770267, "grad_norm": 0.0011517098173499107, "learning_rate": 5.9189347380743346e-06, "loss": 0.0, "step": 6025 }, { "epoch": 1.7635352648522096, "grad_norm": 0.0010904577793553472, "learning_rate": 5.911618378694762e-06, "loss": 0.0, "step": 6026 }, { "epoch": 1.7638279192273925, "grad_norm": 0.0008237168658524752, "learning_rate": 5.904302019315189e-06, "loss": 0.0, "step": 6027 }, { "epoch": 1.7641205736025753, "grad_norm": 0.0005817033816128969, "learning_rate": 5.896985659935617e-06, "loss": 0.0, "step": 6028 }, { "epoch": 1.7644132279777582, "grad_norm": 0.0004842921916861087, "learning_rate": 5.889669300556044e-06, "loss": 0.0, "step": 6029 }, { "epoch": 1.7647058823529411, "grad_norm": 0.0006301872199401259, "learning_rate": 5.882352941176471e-06, "loss": 0.0, "step": 6030 }, { "epoch": 1.764998536728124, "grad_norm": 0.005268681328743696, "learning_rate": 5.875036581796898e-06, "loss": 0.0001, "step": 6031 }, { "epoch": 1.765291191103307, "grad_norm": 0.001128269243054092, "learning_rate": 5.867720222417326e-06, "loss": 0.0, "step": 6032 }, { "epoch": 1.7655838454784898, "grad_norm": 0.0004982168320566416, "learning_rate": 5.860403863037753e-06, "loss": 0.0, "step": 6033 }, { "epoch": 1.7658764998536727, "grad_norm": 0.0007496103644371033, "learning_rate": 5.85308750365818e-06, "loss": 0.0, "step": 6034 }, { "epoch": 1.7661691542288556, "grad_norm": 0.0005136104300618172, "learning_rate": 5.845771144278607e-06, "loss": 0.0, "step": 6035 }, { "epoch": 1.7664618086040387, "grad_norm": 0.0015976725844666362, "learning_rate": 5.838454784899034e-06, "loss": 0.0, "step": 6036 }, { "epoch": 1.7667544629792216, "grad_norm": 0.0012290143640711904, "learning_rate": 5.8311384255194615e-06, "loss": 0.0, "step": 6037 }, { "epoch": 1.7670471173544045, "grad_norm": 0.00034109456464648247, "learning_rate": 5.823822066139889e-06, "loss": 0.0, "step": 6038 }, { "epoch": 1.7673397717295873, "grad_norm": 0.020967284217476845, "learning_rate": 5.816505706760316e-06, "loss": 0.0002, "step": 6039 }, { "epoch": 1.7676324261047702, "grad_norm": 0.002137015573680401, "learning_rate": 5.809189347380744e-06, "loss": 0.0, "step": 6040 }, { "epoch": 1.7679250804799533, "grad_norm": 0.003661647904664278, "learning_rate": 5.801872988001171e-06, "loss": 0.0001, "step": 6041 }, { "epoch": 1.7682177348551362, "grad_norm": 0.010518140159547329, "learning_rate": 5.794556628621598e-06, "loss": 0.0001, "step": 6042 }, { "epoch": 1.7685103892303191, "grad_norm": 0.0016498948680236936, "learning_rate": 5.787240269242025e-06, "loss": 0.0, "step": 6043 }, { "epoch": 1.768803043605502, "grad_norm": 0.000788530393037945, "learning_rate": 5.779923909862453e-06, "loss": 0.0, "step": 6044 }, { "epoch": 1.769095697980685, "grad_norm": 0.0015095038106665015, "learning_rate": 5.77260755048288e-06, "loss": 0.0, "step": 6045 }, { "epoch": 1.7693883523558678, "grad_norm": 0.0013496472965925932, "learning_rate": 5.765291191103307e-06, "loss": 0.0, "step": 6046 }, { "epoch": 1.7696810067310507, "grad_norm": 1.13015878200531, "learning_rate": 5.757974831723734e-06, "loss": 0.0057, "step": 6047 }, { "epoch": 1.7699736611062336, "grad_norm": 0.36021360754966736, "learning_rate": 5.750658472344162e-06, "loss": 0.0013, "step": 6048 }, { "epoch": 1.7702663154814164, "grad_norm": 0.0015636446187272668, "learning_rate": 5.743342112964589e-06, "loss": 0.0, "step": 6049 }, { "epoch": 1.7705589698565993, "grad_norm": 0.0010189537424594164, "learning_rate": 5.736025753585016e-06, "loss": 0.0, "step": 6050 }, { "epoch": 1.7708516242317822, "grad_norm": 0.020609412342309952, "learning_rate": 5.7287093942054435e-06, "loss": 0.0002, "step": 6051 }, { "epoch": 1.771144278606965, "grad_norm": 0.0023420306388288736, "learning_rate": 5.7213930348258714e-06, "loss": 0.0, "step": 6052 }, { "epoch": 1.771436932982148, "grad_norm": 0.0005672802799381316, "learning_rate": 5.7140766754462985e-06, "loss": 0.0, "step": 6053 }, { "epoch": 1.7717295873573309, "grad_norm": 0.0013113165041431785, "learning_rate": 5.706760316066726e-06, "loss": 0.0, "step": 6054 }, { "epoch": 1.7720222417325138, "grad_norm": 0.004139709286391735, "learning_rate": 5.699443956687153e-06, "loss": 0.0001, "step": 6055 }, { "epoch": 1.7723148961076967, "grad_norm": 0.0018005968304350972, "learning_rate": 5.692127597307581e-06, "loss": 0.0, "step": 6056 }, { "epoch": 1.7726075504828798, "grad_norm": 0.0013284042943269014, "learning_rate": 5.684811237928008e-06, "loss": 0.0, "step": 6057 }, { "epoch": 1.7729002048580627, "grad_norm": 0.00022370753868017346, "learning_rate": 5.677494878548435e-06, "loss": 0.0, "step": 6058 }, { "epoch": 1.7731928592332455, "grad_norm": 0.00017913279589265585, "learning_rate": 5.670178519168862e-06, "loss": 0.0, "step": 6059 }, { "epoch": 1.7734855136084284, "grad_norm": 0.000413614819990471, "learning_rate": 5.662862159789289e-06, "loss": 0.0, "step": 6060 }, { "epoch": 1.7737781679836113, "grad_norm": 0.0003607451217249036, "learning_rate": 5.655545800409716e-06, "loss": 0.0, "step": 6061 }, { "epoch": 1.7740708223587944, "grad_norm": 0.00033977828570641577, "learning_rate": 5.648229441030143e-06, "loss": 0.0, "step": 6062 }, { "epoch": 1.7743634767339773, "grad_norm": 0.0004045834648422897, "learning_rate": 5.64091308165057e-06, "loss": 0.0, "step": 6063 }, { "epoch": 1.7746561311091602, "grad_norm": 0.0006311695324257016, "learning_rate": 5.6335967222709975e-06, "loss": 0.0, "step": 6064 }, { "epoch": 1.774948785484343, "grad_norm": 0.011869017034769058, "learning_rate": 5.6262803628914255e-06, "loss": 0.0001, "step": 6065 }, { "epoch": 1.775241439859526, "grad_norm": 0.003849781583994627, "learning_rate": 5.6189640035118526e-06, "loss": 0.0001, "step": 6066 }, { "epoch": 1.7755340942347089, "grad_norm": 0.008825070224702358, "learning_rate": 5.61164764413228e-06, "loss": 0.0001, "step": 6067 }, { "epoch": 1.7758267486098918, "grad_norm": 0.0005195000558160245, "learning_rate": 5.604331284752707e-06, "loss": 0.0, "step": 6068 }, { "epoch": 1.7761194029850746, "grad_norm": 0.0002385463158134371, "learning_rate": 5.597014925373135e-06, "loss": 0.0, "step": 6069 }, { "epoch": 1.7764120573602575, "grad_norm": 0.0003511181566864252, "learning_rate": 5.589698565993562e-06, "loss": 0.0, "step": 6070 }, { "epoch": 1.7767047117354404, "grad_norm": 0.000668007880449295, "learning_rate": 5.582382206613989e-06, "loss": 0.0, "step": 6071 }, { "epoch": 1.7769973661106233, "grad_norm": 0.001052291365340352, "learning_rate": 5.575065847234416e-06, "loss": 0.0, "step": 6072 }, { "epoch": 1.7772900204858062, "grad_norm": 0.0008030377794057131, "learning_rate": 5.567749487854844e-06, "loss": 0.0, "step": 6073 }, { "epoch": 1.777582674860989, "grad_norm": 0.0005521881976164877, "learning_rate": 5.560433128475271e-06, "loss": 0.0, "step": 6074 }, { "epoch": 1.777875329236172, "grad_norm": 0.0007169344462454319, "learning_rate": 5.553116769095698e-06, "loss": 0.0, "step": 6075 }, { "epoch": 1.7781679836113549, "grad_norm": 0.0018445259192958474, "learning_rate": 5.545800409716125e-06, "loss": 0.0, "step": 6076 }, { "epoch": 1.7784606379865378, "grad_norm": 0.009776068851351738, "learning_rate": 5.538484050336553e-06, "loss": 0.0001, "step": 6077 }, { "epoch": 1.7787532923617209, "grad_norm": 0.00043877839925698936, "learning_rate": 5.53116769095698e-06, "loss": 0.0, "step": 6078 }, { "epoch": 1.7790459467369037, "grad_norm": 0.0015850827330723405, "learning_rate": 5.5238513315774074e-06, "loss": 0.0, "step": 6079 }, { "epoch": 1.7793386011120866, "grad_norm": 13.653970718383789, "learning_rate": 5.5165349721978346e-06, "loss": 0.0361, "step": 6080 }, { "epoch": 1.7796312554872695, "grad_norm": 0.00023248694196809083, "learning_rate": 5.5092186128182625e-06, "loss": 0.0, "step": 6081 }, { "epoch": 1.7799239098624524, "grad_norm": 0.0009998355526477098, "learning_rate": 5.50190225343869e-06, "loss": 0.0, "step": 6082 }, { "epoch": 1.7802165642376353, "grad_norm": 0.0004500342474784702, "learning_rate": 5.494585894059117e-06, "loss": 0.0, "step": 6083 }, { "epoch": 1.7805092186128184, "grad_norm": 0.0004393111157696694, "learning_rate": 5.487269534679544e-06, "loss": 0.0, "step": 6084 }, { "epoch": 1.7808018729880013, "grad_norm": 0.0019799573346972466, "learning_rate": 5.479953175299971e-06, "loss": 0.0, "step": 6085 }, { "epoch": 1.7810945273631842, "grad_norm": 0.000697512470651418, "learning_rate": 5.472636815920398e-06, "loss": 0.0, "step": 6086 }, { "epoch": 1.781387181738367, "grad_norm": 0.000314964447170496, "learning_rate": 5.465320456540825e-06, "loss": 0.0, "step": 6087 }, { "epoch": 1.78167983611355, "grad_norm": 0.006685046944767237, "learning_rate": 5.458004097161252e-06, "loss": 0.0001, "step": 6088 }, { "epoch": 1.7819724904887329, "grad_norm": 0.0002795507898554206, "learning_rate": 5.45068773778168e-06, "loss": 0.0, "step": 6089 }, { "epoch": 1.7822651448639157, "grad_norm": 0.0012665147660300136, "learning_rate": 5.443371378402107e-06, "loss": 0.0, "step": 6090 }, { "epoch": 1.7825577992390986, "grad_norm": 0.00019367047934792936, "learning_rate": 5.436055019022534e-06, "loss": 0.0, "step": 6091 }, { "epoch": 1.7828504536142815, "grad_norm": 0.0001833774003898725, "learning_rate": 5.4287386596429615e-06, "loss": 0.0, "step": 6092 }, { "epoch": 1.7831431079894644, "grad_norm": 0.00014645690680481493, "learning_rate": 5.4214223002633894e-06, "loss": 0.0, "step": 6093 }, { "epoch": 1.7834357623646473, "grad_norm": 0.0003003651218023151, "learning_rate": 5.4141059408838165e-06, "loss": 0.0, "step": 6094 }, { "epoch": 1.7837284167398302, "grad_norm": 0.0004113362228963524, "learning_rate": 5.406789581504244e-06, "loss": 0.0, "step": 6095 }, { "epoch": 1.784021071115013, "grad_norm": 0.00032383357756771147, "learning_rate": 5.399473222124671e-06, "loss": 0.0, "step": 6096 }, { "epoch": 1.784313725490196, "grad_norm": 0.001158069702796638, "learning_rate": 5.392156862745099e-06, "loss": 0.0, "step": 6097 }, { "epoch": 1.7846063798653788, "grad_norm": 0.0012851745123043656, "learning_rate": 5.384840503365526e-06, "loss": 0.0, "step": 6098 }, { "epoch": 1.7848990342405617, "grad_norm": 0.00016551860608160496, "learning_rate": 5.377524143985953e-06, "loss": 0.0, "step": 6099 }, { "epoch": 1.7851916886157448, "grad_norm": 0.000494194682687521, "learning_rate": 5.37020778460638e-06, "loss": 0.0, "step": 6100 }, { "epoch": 1.7854843429909277, "grad_norm": 0.0003518579760566354, "learning_rate": 5.362891425226808e-06, "loss": 0.0, "step": 6101 }, { "epoch": 1.7857769973661106, "grad_norm": 0.0005160426953807473, "learning_rate": 5.355575065847235e-06, "loss": 0.0, "step": 6102 }, { "epoch": 1.7860696517412935, "grad_norm": 0.02960193157196045, "learning_rate": 5.348258706467662e-06, "loss": 0.0001, "step": 6103 }, { "epoch": 1.7863623061164764, "grad_norm": 0.0008992599323391914, "learning_rate": 5.340942347088089e-06, "loss": 0.0, "step": 6104 }, { "epoch": 1.7866549604916595, "grad_norm": 0.001342964475043118, "learning_rate": 5.333625987708516e-06, "loss": 0.0, "step": 6105 }, { "epoch": 1.7869476148668424, "grad_norm": 0.000340066704666242, "learning_rate": 5.3263096283289435e-06, "loss": 0.0, "step": 6106 }, { "epoch": 1.7872402692420253, "grad_norm": 0.0016259491676464677, "learning_rate": 5.318993268949371e-06, "loss": 0.0, "step": 6107 }, { "epoch": 1.7875329236172082, "grad_norm": 9.170470730168745e-05, "learning_rate": 5.3116769095697985e-06, "loss": 0.0, "step": 6108 }, { "epoch": 1.787825577992391, "grad_norm": 0.00018735171761363745, "learning_rate": 5.304360550190226e-06, "loss": 0.0, "step": 6109 }, { "epoch": 1.788118232367574, "grad_norm": 0.00010125528933713213, "learning_rate": 5.297044190810653e-06, "loss": 0.0, "step": 6110 }, { "epoch": 1.7884108867427568, "grad_norm": 0.00017634555115364492, "learning_rate": 5.28972783143108e-06, "loss": 0.0, "step": 6111 }, { "epoch": 1.7887035411179397, "grad_norm": 0.0003152966091874987, "learning_rate": 5.282411472051507e-06, "loss": 0.0, "step": 6112 }, { "epoch": 1.7889961954931226, "grad_norm": 8.96536948857829e-05, "learning_rate": 5.275095112671935e-06, "loss": 0.0, "step": 6113 }, { "epoch": 1.7892888498683055, "grad_norm": 0.0004922590451315045, "learning_rate": 5.267778753292362e-06, "loss": 0.0, "step": 6114 }, { "epoch": 1.7895815042434884, "grad_norm": 0.0004588527954183519, "learning_rate": 5.260462393912789e-06, "loss": 0.0, "step": 6115 }, { "epoch": 1.7898741586186713, "grad_norm": 0.0004414925933815539, "learning_rate": 5.253146034533216e-06, "loss": 0.0, "step": 6116 }, { "epoch": 1.7901668129938542, "grad_norm": 0.0002697826421353966, "learning_rate": 5.245829675153644e-06, "loss": 0.0, "step": 6117 }, { "epoch": 1.790459467369037, "grad_norm": 0.0016554900212213397, "learning_rate": 5.238513315774071e-06, "loss": 0.0, "step": 6118 }, { "epoch": 1.79075212174422, "grad_norm": 0.0006203555967658758, "learning_rate": 5.231196956394498e-06, "loss": 0.0, "step": 6119 }, { "epoch": 1.7910447761194028, "grad_norm": 0.0002844484697561711, "learning_rate": 5.2238805970149255e-06, "loss": 0.0, "step": 6120 }, { "epoch": 1.791337430494586, "grad_norm": 0.0003177575417794287, "learning_rate": 5.216564237635353e-06, "loss": 0.0, "step": 6121 }, { "epoch": 1.7916300848697688, "grad_norm": 0.00031799066346138716, "learning_rate": 5.2092478782557805e-06, "loss": 0.0, "step": 6122 }, { "epoch": 1.7919227392449517, "grad_norm": 0.0006663068779744208, "learning_rate": 5.201931518876208e-06, "loss": 0.0, "step": 6123 }, { "epoch": 1.7922153936201346, "grad_norm": 0.001612933585420251, "learning_rate": 5.194615159496635e-06, "loss": 0.0, "step": 6124 }, { "epoch": 1.7925080479953175, "grad_norm": 0.02046128176152706, "learning_rate": 5.187298800117062e-06, "loss": 0.0001, "step": 6125 }, { "epoch": 1.7928007023705006, "grad_norm": 0.00048213082482106984, "learning_rate": 5.17998244073749e-06, "loss": 0.0, "step": 6126 }, { "epoch": 1.7930933567456835, "grad_norm": 0.003854131791740656, "learning_rate": 5.172666081357917e-06, "loss": 0.0, "step": 6127 }, { "epoch": 1.7933860111208664, "grad_norm": 0.0006036604754626751, "learning_rate": 5.165349721978344e-06, "loss": 0.0, "step": 6128 }, { "epoch": 1.7936786654960493, "grad_norm": 0.0008485932485200465, "learning_rate": 5.158033362598771e-06, "loss": 0.0, "step": 6129 }, { "epoch": 1.7939713198712322, "grad_norm": 0.0005633328692056239, "learning_rate": 5.150717003219198e-06, "loss": 0.0, "step": 6130 }, { "epoch": 1.794263974246415, "grad_norm": 0.0004222339775878936, "learning_rate": 5.143400643839625e-06, "loss": 0.0, "step": 6131 }, { "epoch": 1.794556628621598, "grad_norm": 0.0005146698094904423, "learning_rate": 5.136084284460052e-06, "loss": 0.0, "step": 6132 }, { "epoch": 1.7948492829967808, "grad_norm": 0.0013351863017305732, "learning_rate": 5.1287679250804795e-06, "loss": 0.0, "step": 6133 }, { "epoch": 1.7951419373719637, "grad_norm": 0.0003425734757911414, "learning_rate": 5.1214515657009074e-06, "loss": 0.0, "step": 6134 }, { "epoch": 1.7954345917471466, "grad_norm": 0.001160002313554287, "learning_rate": 5.1141352063213345e-06, "loss": 0.0, "step": 6135 }, { "epoch": 1.7957272461223295, "grad_norm": 0.00015163766511250287, "learning_rate": 5.106818846941762e-06, "loss": 0.0, "step": 6136 }, { "epoch": 1.7960199004975124, "grad_norm": 0.000965707004070282, "learning_rate": 5.099502487562189e-06, "loss": 0.0, "step": 6137 }, { "epoch": 1.7963125548726953, "grad_norm": 0.00031617077183909714, "learning_rate": 5.092186128182617e-06, "loss": 0.0, "step": 6138 }, { "epoch": 1.7966052092478781, "grad_norm": 0.0001316484558628872, "learning_rate": 5.084869768803044e-06, "loss": 0.0, "step": 6139 }, { "epoch": 1.796897863623061, "grad_norm": 0.00207359972409904, "learning_rate": 5.077553409423471e-06, "loss": 0.0, "step": 6140 }, { "epoch": 1.797190517998244, "grad_norm": 0.0006536368746310472, "learning_rate": 5.070237050043898e-06, "loss": 0.0, "step": 6141 }, { "epoch": 1.797483172373427, "grad_norm": 0.0003556523297447711, "learning_rate": 5.062920690664326e-06, "loss": 0.0, "step": 6142 }, { "epoch": 1.79777582674861, "grad_norm": 0.012084085494279861, "learning_rate": 5.055604331284753e-06, "loss": 0.0001, "step": 6143 }, { "epoch": 1.7980684811237928, "grad_norm": 0.00036497731343843043, "learning_rate": 5.04828797190518e-06, "loss": 0.0, "step": 6144 }, { "epoch": 1.7983611354989757, "grad_norm": 0.002084647072479129, "learning_rate": 5.040971612525607e-06, "loss": 0.0, "step": 6145 }, { "epoch": 1.7986537898741586, "grad_norm": 0.0014732355484738946, "learning_rate": 5.033655253146035e-06, "loss": 0.0, "step": 6146 }, { "epoch": 1.7989464442493417, "grad_norm": 0.0006228667916730046, "learning_rate": 5.026338893766462e-06, "loss": 0.0, "step": 6147 }, { "epoch": 1.7992390986245246, "grad_norm": 0.0002631722891237587, "learning_rate": 5.0190225343868894e-06, "loss": 0.0, "step": 6148 }, { "epoch": 1.7995317529997075, "grad_norm": 0.00019564389367587864, "learning_rate": 5.0117061750073165e-06, "loss": 0.0, "step": 6149 }, { "epoch": 1.7998244073748904, "grad_norm": 0.9656554460525513, "learning_rate": 5.0043898156277445e-06, "loss": 0.0066, "step": 6150 }, { "epoch": 1.8001170617500732, "grad_norm": 0.001417526276782155, "learning_rate": 4.997073456248172e-06, "loss": 0.0, "step": 6151 }, { "epoch": 1.8004097161252561, "grad_norm": 0.00033955316757783294, "learning_rate": 4.989757096868599e-06, "loss": 0.0, "step": 6152 }, { "epoch": 1.800702370500439, "grad_norm": 0.0006844167364761233, "learning_rate": 4.982440737489026e-06, "loss": 0.0, "step": 6153 }, { "epoch": 1.800995024875622, "grad_norm": 0.0009767315350472927, "learning_rate": 4.975124378109453e-06, "loss": 0.0, "step": 6154 }, { "epoch": 1.8012876792508048, "grad_norm": 0.0001971491874428466, "learning_rate": 4.96780801872988e-06, "loss": 0.0, "step": 6155 }, { "epoch": 1.8015803336259877, "grad_norm": 0.00021163126803003252, "learning_rate": 4.960491659350307e-06, "loss": 0.0, "step": 6156 }, { "epoch": 1.8018729880011706, "grad_norm": 0.003267401596531272, "learning_rate": 4.953175299970734e-06, "loss": 0.0, "step": 6157 }, { "epoch": 1.8021656423763535, "grad_norm": 0.0008824951364658773, "learning_rate": 4.945858940591162e-06, "loss": 0.0, "step": 6158 }, { "epoch": 1.8024582967515363, "grad_norm": 3.2411012649536133, "learning_rate": 4.938542581211589e-06, "loss": 0.1711, "step": 6159 }, { "epoch": 1.8027509511267192, "grad_norm": 0.00016725769091863185, "learning_rate": 4.931226221832016e-06, "loss": 0.0, "step": 6160 }, { "epoch": 1.8030436055019021, "grad_norm": 0.00017268993542529643, "learning_rate": 4.9239098624524435e-06, "loss": 0.0, "step": 6161 }, { "epoch": 1.803336259877085, "grad_norm": 0.001485522836446762, "learning_rate": 4.916593503072871e-06, "loss": 0.0, "step": 6162 }, { "epoch": 1.8036289142522681, "grad_norm": 0.011133319698274136, "learning_rate": 4.9092771436932985e-06, "loss": 0.0001, "step": 6163 }, { "epoch": 1.803921568627451, "grad_norm": 0.0012351942714303732, "learning_rate": 4.901960784313726e-06, "loss": 0.0, "step": 6164 }, { "epoch": 1.804214223002634, "grad_norm": 0.02805311419069767, "learning_rate": 4.894644424934153e-06, "loss": 0.0001, "step": 6165 }, { "epoch": 1.8045068773778168, "grad_norm": 0.0007728440104983747, "learning_rate": 4.887328065554581e-06, "loss": 0.0, "step": 6166 }, { "epoch": 1.8047995317529997, "grad_norm": 0.0007231071358546615, "learning_rate": 4.880011706175008e-06, "loss": 0.0, "step": 6167 }, { "epoch": 1.8050921861281826, "grad_norm": 0.0025570434518158436, "learning_rate": 4.872695346795435e-06, "loss": 0.0001, "step": 6168 }, { "epoch": 1.8053848405033657, "grad_norm": 0.0006787261227145791, "learning_rate": 4.865378987415862e-06, "loss": 0.0, "step": 6169 }, { "epoch": 1.8056774948785486, "grad_norm": 0.0006139308679848909, "learning_rate": 4.85806262803629e-06, "loss": 0.0, "step": 6170 }, { "epoch": 1.8059701492537314, "grad_norm": 0.0007231313502416015, "learning_rate": 4.850746268656717e-06, "loss": 0.0, "step": 6171 }, { "epoch": 1.8062628036289143, "grad_norm": 0.0019035454606637359, "learning_rate": 4.843429909277144e-06, "loss": 0.0, "step": 6172 }, { "epoch": 1.8065554580040972, "grad_norm": 0.00047850527334958315, "learning_rate": 4.836113549897571e-06, "loss": 0.0, "step": 6173 }, { "epoch": 1.8068481123792801, "grad_norm": 0.00034121458884328604, "learning_rate": 4.828797190517999e-06, "loss": 0.0, "step": 6174 }, { "epoch": 1.807140766754463, "grad_norm": 0.01851416565477848, "learning_rate": 4.821480831138426e-06, "loss": 0.0002, "step": 6175 }, { "epoch": 1.807433421129646, "grad_norm": 0.00022216422075871378, "learning_rate": 4.814164471758853e-06, "loss": 0.0, "step": 6176 }, { "epoch": 1.8077260755048288, "grad_norm": 0.0003145903756376356, "learning_rate": 4.8068481123792805e-06, "loss": 0.0, "step": 6177 }, { "epoch": 1.8080187298800117, "grad_norm": 0.0008986227912828326, "learning_rate": 4.799531752999708e-06, "loss": 0.0, "step": 6178 }, { "epoch": 1.8083113842551946, "grad_norm": 0.0014955041697248816, "learning_rate": 4.792215393620135e-06, "loss": 0.0, "step": 6179 }, { "epoch": 1.8086040386303774, "grad_norm": 0.0005902801640331745, "learning_rate": 4.784899034240562e-06, "loss": 0.0, "step": 6180 }, { "epoch": 1.8088966930055603, "grad_norm": 0.0005175388068892062, "learning_rate": 4.777582674860989e-06, "loss": 0.0, "step": 6181 }, { "epoch": 1.8091893473807432, "grad_norm": 0.0007320493459701538, "learning_rate": 4.770266315481417e-06, "loss": 0.0, "step": 6182 }, { "epoch": 1.809482001755926, "grad_norm": 0.0008106476161628962, "learning_rate": 4.762949956101844e-06, "loss": 0.0, "step": 6183 }, { "epoch": 1.809774656131109, "grad_norm": 0.0032793600112199783, "learning_rate": 4.755633596722271e-06, "loss": 0.0, "step": 6184 }, { "epoch": 1.810067310506292, "grad_norm": 0.00037016900023445487, "learning_rate": 4.748317237342698e-06, "loss": 0.0, "step": 6185 }, { "epoch": 1.810359964881475, "grad_norm": 0.0010186518775299191, "learning_rate": 4.741000877963126e-06, "loss": 0.0, "step": 6186 }, { "epoch": 1.8106526192566579, "grad_norm": 0.001128872623667121, "learning_rate": 4.733684518583553e-06, "loss": 0.0, "step": 6187 }, { "epoch": 1.8109452736318408, "grad_norm": 0.0027060238644480705, "learning_rate": 4.72636815920398e-06, "loss": 0.0, "step": 6188 }, { "epoch": 1.8112379280070237, "grad_norm": 0.0008564601885154843, "learning_rate": 4.7190517998244074e-06, "loss": 0.0, "step": 6189 }, { "epoch": 1.8115305823822068, "grad_norm": 0.004802064970135689, "learning_rate": 4.7117354404448345e-06, "loss": 0.0001, "step": 6190 }, { "epoch": 1.8118232367573897, "grad_norm": 0.0005869761225767434, "learning_rate": 4.7044190810652625e-06, "loss": 0.0, "step": 6191 }, { "epoch": 1.8121158911325725, "grad_norm": 0.00042698491597548127, "learning_rate": 4.69710272168569e-06, "loss": 0.0, "step": 6192 }, { "epoch": 1.8124085455077554, "grad_norm": 0.0010814317502081394, "learning_rate": 4.689786362306117e-06, "loss": 0.0, "step": 6193 }, { "epoch": 1.8127011998829383, "grad_norm": 0.000888126203790307, "learning_rate": 4.682470002926544e-06, "loss": 0.0, "step": 6194 }, { "epoch": 1.8129938542581212, "grad_norm": 0.00033576504210941494, "learning_rate": 4.675153643546972e-06, "loss": 0.0, "step": 6195 }, { "epoch": 1.813286508633304, "grad_norm": 0.00024507715716026723, "learning_rate": 4.667837284167399e-06, "loss": 0.0, "step": 6196 }, { "epoch": 1.813579163008487, "grad_norm": 0.000262171815847978, "learning_rate": 4.660520924787826e-06, "loss": 0.0, "step": 6197 }, { "epoch": 1.8138718173836699, "grad_norm": 0.00517128873616457, "learning_rate": 4.653204565408253e-06, "loss": 0.0, "step": 6198 }, { "epoch": 1.8141644717588528, "grad_norm": 0.00016362473252229393, "learning_rate": 4.64588820602868e-06, "loss": 0.0, "step": 6199 }, { "epoch": 1.8144571261340356, "grad_norm": 0.0005344537785276771, "learning_rate": 4.638571846649107e-06, "loss": 0.0, "step": 6200 }, { "epoch": 1.8147497805092185, "grad_norm": 0.01572296768426895, "learning_rate": 4.631255487269534e-06, "loss": 0.0001, "step": 6201 }, { "epoch": 1.8150424348844014, "grad_norm": 0.0012269177241250873, "learning_rate": 4.6239391278899615e-06, "loss": 0.0, "step": 6202 }, { "epoch": 1.8153350892595843, "grad_norm": 0.00042726314859464765, "learning_rate": 4.6166227685103894e-06, "loss": 0.0, "step": 6203 }, { "epoch": 1.8156277436347672, "grad_norm": 0.0005853824550285935, "learning_rate": 4.6093064091308165e-06, "loss": 0.0, "step": 6204 }, { "epoch": 1.81592039800995, "grad_norm": 0.0008499994874000549, "learning_rate": 4.601990049751244e-06, "loss": 0.0, "step": 6205 }, { "epoch": 1.8162130523851332, "grad_norm": 0.0009012450464069843, "learning_rate": 4.594673690371671e-06, "loss": 0.0, "step": 6206 }, { "epoch": 1.816505706760316, "grad_norm": 0.00019460823386907578, "learning_rate": 4.587357330992099e-06, "loss": 0.0, "step": 6207 }, { "epoch": 1.816798361135499, "grad_norm": 0.0010536338668316603, "learning_rate": 4.580040971612526e-06, "loss": 0.0, "step": 6208 }, { "epoch": 1.8170910155106819, "grad_norm": 0.0004098449135199189, "learning_rate": 4.572724612232953e-06, "loss": 0.0, "step": 6209 }, { "epoch": 1.8173836698858647, "grad_norm": 0.0020609141793102026, "learning_rate": 4.56540825285338e-06, "loss": 0.0, "step": 6210 }, { "epoch": 1.8176763242610479, "grad_norm": 0.0004708924680016935, "learning_rate": 4.558091893473808e-06, "loss": 0.0, "step": 6211 }, { "epoch": 1.8179689786362307, "grad_norm": 0.00048562430310994387, "learning_rate": 4.550775534094235e-06, "loss": 0.0, "step": 6212 }, { "epoch": 1.8182616330114136, "grad_norm": 0.00024222530191764235, "learning_rate": 4.543459174714662e-06, "loss": 0.0, "step": 6213 }, { "epoch": 1.8185542873865965, "grad_norm": 0.0023239348083734512, "learning_rate": 4.536142815335089e-06, "loss": 0.0, "step": 6214 }, { "epoch": 1.8188469417617794, "grad_norm": 0.0003518529119901359, "learning_rate": 4.528826455955517e-06, "loss": 0.0, "step": 6215 }, { "epoch": 1.8191395961369623, "grad_norm": 0.001817098120227456, "learning_rate": 4.521510096575944e-06, "loss": 0.0, "step": 6216 }, { "epoch": 1.8194322505121452, "grad_norm": 0.0005887098959647119, "learning_rate": 4.514193737196371e-06, "loss": 0.0, "step": 6217 }, { "epoch": 1.819724904887328, "grad_norm": 0.003914620261639357, "learning_rate": 4.5068773778167985e-06, "loss": 0.0001, "step": 6218 }, { "epoch": 1.820017559262511, "grad_norm": 0.0035934834741055965, "learning_rate": 4.4995610184372265e-06, "loss": 0.0, "step": 6219 }, { "epoch": 1.8203102136376939, "grad_norm": 0.0004365076601970941, "learning_rate": 4.4922446590576536e-06, "loss": 0.0, "step": 6220 }, { "epoch": 1.8206028680128767, "grad_norm": 0.00044464407255873084, "learning_rate": 4.484928299678081e-06, "loss": 0.0, "step": 6221 }, { "epoch": 1.8208955223880596, "grad_norm": 0.0006023648893460631, "learning_rate": 4.477611940298508e-06, "loss": 0.0, "step": 6222 }, { "epoch": 1.8211881767632425, "grad_norm": 0.01346750371158123, "learning_rate": 4.470295580918935e-06, "loss": 0.0001, "step": 6223 }, { "epoch": 1.8214808311384254, "grad_norm": 0.00067456642864272, "learning_rate": 4.462979221539362e-06, "loss": 0.0, "step": 6224 }, { "epoch": 1.8217734855136083, "grad_norm": 0.0004930745344609022, "learning_rate": 4.455662862159789e-06, "loss": 0.0, "step": 6225 }, { "epoch": 1.8220661398887912, "grad_norm": 0.0006139373872429132, "learning_rate": 4.448346502780216e-06, "loss": 0.0, "step": 6226 }, { "epoch": 1.8223587942639743, "grad_norm": 0.000917270896025002, "learning_rate": 4.441030143400644e-06, "loss": 0.0, "step": 6227 }, { "epoch": 1.8226514486391572, "grad_norm": 0.00100823980756104, "learning_rate": 4.433713784021071e-06, "loss": 0.0, "step": 6228 }, { "epoch": 1.82294410301434, "grad_norm": 0.0018652203725650907, "learning_rate": 4.426397424641498e-06, "loss": 0.0, "step": 6229 }, { "epoch": 1.823236757389523, "grad_norm": 0.0015184390358626842, "learning_rate": 4.4190810652619255e-06, "loss": 0.0, "step": 6230 }, { "epoch": 1.8235294117647058, "grad_norm": 0.0004591936303768307, "learning_rate": 4.411764705882353e-06, "loss": 0.0, "step": 6231 }, { "epoch": 1.823822066139889, "grad_norm": 0.0018184625077992678, "learning_rate": 4.4044483465027805e-06, "loss": 0.0, "step": 6232 }, { "epoch": 1.8241147205150718, "grad_norm": 0.001610770937986672, "learning_rate": 4.397131987123208e-06, "loss": 0.0, "step": 6233 }, { "epoch": 1.8244073748902547, "grad_norm": 0.0005263009225018322, "learning_rate": 4.389815627743635e-06, "loss": 0.0, "step": 6234 }, { "epoch": 1.8247000292654376, "grad_norm": 0.0009672120795585215, "learning_rate": 4.382499268364063e-06, "loss": 0.0, "step": 6235 }, { "epoch": 1.8249926836406205, "grad_norm": 0.0011546674650162458, "learning_rate": 4.37518290898449e-06, "loss": 0.0, "step": 6236 }, { "epoch": 1.8252853380158034, "grad_norm": 0.0345073826611042, "learning_rate": 4.367866549604917e-06, "loss": 0.0002, "step": 6237 }, { "epoch": 1.8255779923909863, "grad_norm": 0.001559751806780696, "learning_rate": 4.360550190225344e-06, "loss": 0.0, "step": 6238 }, { "epoch": 1.8258706467661692, "grad_norm": 0.0009903997415676713, "learning_rate": 4.353233830845772e-06, "loss": 0.0, "step": 6239 }, { "epoch": 1.826163301141352, "grad_norm": 0.0005342111689969897, "learning_rate": 4.345917471466199e-06, "loss": 0.0, "step": 6240 }, { "epoch": 1.826455955516535, "grad_norm": 0.00039438612293452024, "learning_rate": 4.338601112086626e-06, "loss": 0.0, "step": 6241 }, { "epoch": 1.8267486098917178, "grad_norm": 0.0005523980362340808, "learning_rate": 4.331284752707053e-06, "loss": 0.0, "step": 6242 }, { "epoch": 1.8270412642669007, "grad_norm": 0.0013923441292718053, "learning_rate": 4.323968393327481e-06, "loss": 0.0, "step": 6243 }, { "epoch": 1.8273339186420836, "grad_norm": 0.0006046186317689717, "learning_rate": 4.316652033947908e-06, "loss": 0.0, "step": 6244 }, { "epoch": 1.8276265730172665, "grad_norm": 0.005008961074054241, "learning_rate": 4.309335674568335e-06, "loss": 0.0001, "step": 6245 }, { "epoch": 1.8279192273924494, "grad_norm": 0.0005424632108770311, "learning_rate": 4.3020193151887625e-06, "loss": 0.0, "step": 6246 }, { "epoch": 1.8282118817676323, "grad_norm": 0.0008885089191608131, "learning_rate": 4.29470295580919e-06, "loss": 0.0, "step": 6247 }, { "epoch": 1.8285045361428154, "grad_norm": 0.0007215089281089604, "learning_rate": 4.287386596429617e-06, "loss": 0.0, "step": 6248 }, { "epoch": 1.8287971905179983, "grad_norm": 0.0020264852792024612, "learning_rate": 4.280070237050044e-06, "loss": 0.0, "step": 6249 }, { "epoch": 1.8290898448931812, "grad_norm": 0.0008645343477837741, "learning_rate": 4.272753877670471e-06, "loss": 0.0, "step": 6250 }, { "epoch": 1.829382499268364, "grad_norm": 0.00043376587564125657, "learning_rate": 4.265437518290899e-06, "loss": 0.0, "step": 6251 }, { "epoch": 1.829675153643547, "grad_norm": 0.0011695638531818986, "learning_rate": 4.258121158911326e-06, "loss": 0.0, "step": 6252 }, { "epoch": 1.8299678080187298, "grad_norm": 0.00021304836263880134, "learning_rate": 4.250804799531753e-06, "loss": 0.0, "step": 6253 }, { "epoch": 1.830260462393913, "grad_norm": 0.0005459172534756362, "learning_rate": 4.24348844015218e-06, "loss": 0.0, "step": 6254 }, { "epoch": 1.8305531167690958, "grad_norm": 0.0005447863368317485, "learning_rate": 4.236172080772607e-06, "loss": 0.0, "step": 6255 }, { "epoch": 1.8308457711442787, "grad_norm": 0.0007374795968644321, "learning_rate": 4.228855721393035e-06, "loss": 0.0, "step": 6256 }, { "epoch": 1.8311384255194616, "grad_norm": 0.004098342731595039, "learning_rate": 4.221539362013462e-06, "loss": 0.0001, "step": 6257 }, { "epoch": 1.8314310798946445, "grad_norm": 0.00016194079944398254, "learning_rate": 4.2142230026338894e-06, "loss": 0.0, "step": 6258 }, { "epoch": 1.8317237342698274, "grad_norm": 0.0026144194416701794, "learning_rate": 4.2069066432543165e-06, "loss": 0.0, "step": 6259 }, { "epoch": 1.8320163886450103, "grad_norm": 1.628932237625122, "learning_rate": 4.1995902838747445e-06, "loss": 0.0105, "step": 6260 }, { "epoch": 1.8323090430201932, "grad_norm": 0.00035455342731438577, "learning_rate": 4.192273924495172e-06, "loss": 0.0, "step": 6261 }, { "epoch": 1.832601697395376, "grad_norm": 0.0003120841574855149, "learning_rate": 4.184957565115599e-06, "loss": 0.0, "step": 6262 }, { "epoch": 1.832894351770559, "grad_norm": 0.001973384525626898, "learning_rate": 4.177641205736026e-06, "loss": 0.0, "step": 6263 }, { "epoch": 1.8331870061457418, "grad_norm": 0.00034848632640205324, "learning_rate": 4.170324846356454e-06, "loss": 0.0, "step": 6264 }, { "epoch": 1.8334796605209247, "grad_norm": 0.0014513269998133183, "learning_rate": 4.163008486976881e-06, "loss": 0.0, "step": 6265 }, { "epoch": 1.8337723148961076, "grad_norm": 0.0016633718041703105, "learning_rate": 4.155692127597308e-06, "loss": 0.0, "step": 6266 }, { "epoch": 1.8340649692712905, "grad_norm": 0.0005357158370316029, "learning_rate": 4.148375768217735e-06, "loss": 0.0, "step": 6267 }, { "epoch": 1.8343576236464734, "grad_norm": 0.0016632014885544777, "learning_rate": 4.141059408838162e-06, "loss": 0.0, "step": 6268 }, { "epoch": 1.8346502780216565, "grad_norm": 0.0007505984976887703, "learning_rate": 4.133743049458589e-06, "loss": 0.0, "step": 6269 }, { "epoch": 1.8349429323968394, "grad_norm": 0.00036485426244325936, "learning_rate": 4.126426690079016e-06, "loss": 0.0, "step": 6270 }, { "epoch": 1.8352355867720223, "grad_norm": 0.006879924796521664, "learning_rate": 4.1191103306994435e-06, "loss": 0.0001, "step": 6271 }, { "epoch": 1.8355282411472051, "grad_norm": 0.0015742400428280234, "learning_rate": 4.111793971319871e-06, "loss": 0.0, "step": 6272 }, { "epoch": 1.835820895522388, "grad_norm": 0.0005280297482386231, "learning_rate": 4.1044776119402985e-06, "loss": 0.0, "step": 6273 }, { "epoch": 1.836113549897571, "grad_norm": 0.0004075948672834784, "learning_rate": 4.097161252560726e-06, "loss": 0.0, "step": 6274 }, { "epoch": 1.836406204272754, "grad_norm": 0.0004716550756711513, "learning_rate": 4.089844893181153e-06, "loss": 0.0, "step": 6275 }, { "epoch": 1.836698858647937, "grad_norm": 0.008090038783848286, "learning_rate": 4.082528533801581e-06, "loss": 0.0001, "step": 6276 }, { "epoch": 1.8369915130231198, "grad_norm": 0.00023774802684783936, "learning_rate": 4.075212174422008e-06, "loss": 0.0, "step": 6277 }, { "epoch": 1.8372841673983027, "grad_norm": 0.002482503652572632, "learning_rate": 4.067895815042435e-06, "loss": 0.0, "step": 6278 }, { "epoch": 1.8375768217734856, "grad_norm": 0.0006763905403204262, "learning_rate": 4.060579455662862e-06, "loss": 0.0, "step": 6279 }, { "epoch": 1.8378694761486685, "grad_norm": 0.00540641276165843, "learning_rate": 4.05326309628329e-06, "loss": 0.0001, "step": 6280 }, { "epoch": 1.8381621305238514, "grad_norm": 0.0001323170290561393, "learning_rate": 4.045946736903717e-06, "loss": 0.0, "step": 6281 }, { "epoch": 1.8384547848990342, "grad_norm": 0.0015577462036162615, "learning_rate": 4.038630377524144e-06, "loss": 0.0, "step": 6282 }, { "epoch": 1.8387474392742171, "grad_norm": 0.0008135398966260254, "learning_rate": 4.031314018144571e-06, "loss": 0.0, "step": 6283 }, { "epoch": 1.8390400936494, "grad_norm": 0.0004831902333535254, "learning_rate": 4.023997658764999e-06, "loss": 0.0, "step": 6284 }, { "epoch": 1.839332748024583, "grad_norm": 0.003544786712154746, "learning_rate": 4.016681299385426e-06, "loss": 0.0, "step": 6285 }, { "epoch": 1.8396254023997658, "grad_norm": 0.0009445445029996336, "learning_rate": 4.009364940005853e-06, "loss": 0.0, "step": 6286 }, { "epoch": 1.8399180567749487, "grad_norm": 0.0035348988603800535, "learning_rate": 4.0020485806262805e-06, "loss": 0.0, "step": 6287 }, { "epoch": 1.8402107111501316, "grad_norm": 0.0016584821278229356, "learning_rate": 3.9947322212467085e-06, "loss": 0.0, "step": 6288 }, { "epoch": 1.8405033655253145, "grad_norm": 0.00039671818376518786, "learning_rate": 3.9874158618671356e-06, "loss": 0.0, "step": 6289 }, { "epoch": 1.8407960199004973, "grad_norm": 0.0004341302264947444, "learning_rate": 3.980099502487563e-06, "loss": 0.0, "step": 6290 }, { "epoch": 1.8410886742756805, "grad_norm": 0.002159208757802844, "learning_rate": 3.97278314310799e-06, "loss": 0.0, "step": 6291 }, { "epoch": 1.8413813286508633, "grad_norm": 0.0012557889567688107, "learning_rate": 3.965466783728417e-06, "loss": 0.0, "step": 6292 }, { "epoch": 1.8416739830260462, "grad_norm": 0.0004266448086127639, "learning_rate": 3.958150424348844e-06, "loss": 0.0, "step": 6293 }, { "epoch": 1.8419666374012291, "grad_norm": 0.00016384995251428336, "learning_rate": 3.950834064969271e-06, "loss": 0.0, "step": 6294 }, { "epoch": 1.842259291776412, "grad_norm": 0.00015025155153125525, "learning_rate": 3.943517705589698e-06, "loss": 0.0, "step": 6295 }, { "epoch": 1.8425519461515951, "grad_norm": 9.3340482711792, "learning_rate": 3.936201346210126e-06, "loss": 0.1112, "step": 6296 }, { "epoch": 1.842844600526778, "grad_norm": 0.0023394993040710688, "learning_rate": 3.928884986830553e-06, "loss": 0.0, "step": 6297 }, { "epoch": 1.843137254901961, "grad_norm": 0.000579311337787658, "learning_rate": 3.92156862745098e-06, "loss": 0.0, "step": 6298 }, { "epoch": 1.8434299092771438, "grad_norm": 0.0004900764324702322, "learning_rate": 3.9142522680714074e-06, "loss": 0.0, "step": 6299 }, { "epoch": 1.8437225636523267, "grad_norm": 0.0003009681240655482, "learning_rate": 3.906935908691835e-06, "loss": 0.0, "step": 6300 }, { "epoch": 1.8440152180275096, "grad_norm": 0.0026180697605013847, "learning_rate": 3.8996195493122625e-06, "loss": 0.0, "step": 6301 }, { "epoch": 1.8443078724026924, "grad_norm": 0.004241331480443478, "learning_rate": 3.89230318993269e-06, "loss": 0.0001, "step": 6302 }, { "epoch": 1.8446005267778753, "grad_norm": 0.0010259848786517978, "learning_rate": 3.884986830553117e-06, "loss": 0.0, "step": 6303 }, { "epoch": 1.8448931811530582, "grad_norm": 23.828861236572266, "learning_rate": 3.877670471173545e-06, "loss": 0.0556, "step": 6304 }, { "epoch": 1.8451858355282411, "grad_norm": 0.00034048352972604334, "learning_rate": 3.870354111793972e-06, "loss": 0.0, "step": 6305 }, { "epoch": 1.845478489903424, "grad_norm": 0.0010296351974830031, "learning_rate": 3.863037752414399e-06, "loss": 0.0, "step": 6306 }, { "epoch": 1.845771144278607, "grad_norm": 0.0013108521234244108, "learning_rate": 3.855721393034826e-06, "loss": 0.0, "step": 6307 }, { "epoch": 1.8460637986537898, "grad_norm": 0.0008955459343269467, "learning_rate": 3.848405033655254e-06, "loss": 0.0, "step": 6308 }, { "epoch": 1.8463564530289727, "grad_norm": 0.000490850827191025, "learning_rate": 3.841088674275681e-06, "loss": 0.0, "step": 6309 }, { "epoch": 1.8466491074041556, "grad_norm": 0.0005940706469118595, "learning_rate": 3.833772314896108e-06, "loss": 0.0, "step": 6310 }, { "epoch": 1.8469417617793384, "grad_norm": 0.0010513426968827844, "learning_rate": 3.826455955516535e-06, "loss": 0.0, "step": 6311 }, { "epoch": 1.8472344161545216, "grad_norm": 0.0002777844783850014, "learning_rate": 3.819139596136963e-06, "loss": 0.0, "step": 6312 }, { "epoch": 1.8475270705297044, "grad_norm": 0.0004163272387813777, "learning_rate": 3.81182323675739e-06, "loss": 0.0, "step": 6313 }, { "epoch": 1.8478197249048873, "grad_norm": 0.008874928578734398, "learning_rate": 3.804506877377817e-06, "loss": 0.0001, "step": 6314 }, { "epoch": 1.8481123792800702, "grad_norm": 0.0009299430530518293, "learning_rate": 3.797190517998244e-06, "loss": 0.0, "step": 6315 }, { "epoch": 1.848405033655253, "grad_norm": 0.0005185367190279067, "learning_rate": 3.789874158618672e-06, "loss": 0.0, "step": 6316 }, { "epoch": 1.8486976880304362, "grad_norm": 0.0009174785809591413, "learning_rate": 3.782557799239099e-06, "loss": 0.0, "step": 6317 }, { "epoch": 1.848990342405619, "grad_norm": 0.0004378536541480571, "learning_rate": 3.775241439859526e-06, "loss": 0.0, "step": 6318 }, { "epoch": 1.849282996780802, "grad_norm": 0.000477397843496874, "learning_rate": 3.7679250804799533e-06, "loss": 0.0, "step": 6319 }, { "epoch": 1.8495756511559849, "grad_norm": 0.002084932057186961, "learning_rate": 3.7606087211003804e-06, "loss": 0.0, "step": 6320 }, { "epoch": 1.8498683055311678, "grad_norm": 0.00024068816856015474, "learning_rate": 3.753292361720808e-06, "loss": 0.0, "step": 6321 }, { "epoch": 1.8501609599063507, "grad_norm": 0.002064603613689542, "learning_rate": 3.745976002341235e-06, "loss": 0.0, "step": 6322 }, { "epoch": 1.8504536142815335, "grad_norm": 8.975202945293859e-05, "learning_rate": 3.738659642961662e-06, "loss": 0.0, "step": 6323 }, { "epoch": 1.8507462686567164, "grad_norm": 0.000657002383377403, "learning_rate": 3.7313432835820893e-06, "loss": 0.0, "step": 6324 }, { "epoch": 1.8510389230318993, "grad_norm": 0.0009384411969222128, "learning_rate": 3.724026924202517e-06, "loss": 0.0, "step": 6325 }, { "epoch": 1.8513315774070822, "grad_norm": 0.00032740531605668366, "learning_rate": 3.7167105648229443e-06, "loss": 0.0, "step": 6326 }, { "epoch": 1.851624231782265, "grad_norm": 7.135482883313671e-05, "learning_rate": 3.7093942054433714e-06, "loss": 0.0, "step": 6327 }, { "epoch": 1.851916886157448, "grad_norm": 0.0014389071147888899, "learning_rate": 3.7020778460637985e-06, "loss": 0.0, "step": 6328 }, { "epoch": 1.8522095405326309, "grad_norm": 0.00794234685599804, "learning_rate": 3.6947614866842265e-06, "loss": 0.0, "step": 6329 }, { "epoch": 1.8525021949078138, "grad_norm": 0.0008657536236569285, "learning_rate": 3.6874451273046536e-06, "loss": 0.0, "step": 6330 }, { "epoch": 1.8527948492829966, "grad_norm": 0.0003155196609441191, "learning_rate": 3.6801287679250807e-06, "loss": 0.0, "step": 6331 }, { "epoch": 1.8530875036581795, "grad_norm": 0.0004522484668996185, "learning_rate": 3.6728124085455078e-06, "loss": 0.0, "step": 6332 }, { "epoch": 1.8533801580333626, "grad_norm": 0.0005152150406502187, "learning_rate": 3.6654960491659353e-06, "loss": 0.0, "step": 6333 }, { "epoch": 1.8536728124085455, "grad_norm": 0.0018022602889686823, "learning_rate": 3.6581796897863624e-06, "loss": 0.0, "step": 6334 }, { "epoch": 1.8539654667837284, "grad_norm": 0.0011118188267573714, "learning_rate": 3.6508633304067895e-06, "loss": 0.0, "step": 6335 }, { "epoch": 1.8542581211589113, "grad_norm": 0.10409577935934067, "learning_rate": 3.6435469710272166e-06, "loss": 0.0004, "step": 6336 }, { "epoch": 1.8545507755340942, "grad_norm": 0.0006436710827983916, "learning_rate": 3.6362306116476446e-06, "loss": 0.0, "step": 6337 }, { "epoch": 1.8548434299092773, "grad_norm": 0.0019154376350343227, "learning_rate": 3.6289142522680717e-06, "loss": 0.0, "step": 6338 }, { "epoch": 1.8551360842844602, "grad_norm": 0.0009707739809527993, "learning_rate": 3.6215978928884988e-06, "loss": 0.0, "step": 6339 }, { "epoch": 1.855428738659643, "grad_norm": 0.002355606062337756, "learning_rate": 3.614281533508926e-06, "loss": 0.0, "step": 6340 }, { "epoch": 1.855721393034826, "grad_norm": 0.0009587014792487025, "learning_rate": 3.606965174129354e-06, "loss": 0.0, "step": 6341 }, { "epoch": 1.8560140474100089, "grad_norm": 0.0001421875786036253, "learning_rate": 3.599648814749781e-06, "loss": 0.0, "step": 6342 }, { "epoch": 1.8563067017851917, "grad_norm": 0.0029366235248744488, "learning_rate": 3.592332455370208e-06, "loss": 0.0, "step": 6343 }, { "epoch": 1.8565993561603746, "grad_norm": 0.00022222657571546733, "learning_rate": 3.585016095990635e-06, "loss": 0.0, "step": 6344 }, { "epoch": 1.8568920105355575, "grad_norm": 0.0001300495641771704, "learning_rate": 3.5776997366110627e-06, "loss": 0.0, "step": 6345 }, { "epoch": 1.8571846649107404, "grad_norm": 0.00048738211626186967, "learning_rate": 3.5703833772314898e-06, "loss": 0.0, "step": 6346 }, { "epoch": 1.8574773192859233, "grad_norm": 0.0005268138484098017, "learning_rate": 3.563067017851917e-06, "loss": 0.0, "step": 6347 }, { "epoch": 1.8577699736611062, "grad_norm": 0.0016816152492538095, "learning_rate": 3.555750658472344e-06, "loss": 0.0, "step": 6348 }, { "epoch": 1.858062628036289, "grad_norm": 0.0009939387673512101, "learning_rate": 3.548434299092772e-06, "loss": 0.0, "step": 6349 }, { "epoch": 1.858355282411472, "grad_norm": 0.0012612274149432778, "learning_rate": 3.541117939713199e-06, "loss": 0.0, "step": 6350 }, { "epoch": 1.8586479367866549, "grad_norm": 0.020176591351628304, "learning_rate": 3.533801580333626e-06, "loss": 0.0001, "step": 6351 }, { "epoch": 1.8589405911618377, "grad_norm": 0.0012354014907032251, "learning_rate": 3.5264852209540532e-06, "loss": 0.0, "step": 6352 }, { "epoch": 1.8592332455370206, "grad_norm": 0.0006875548860989511, "learning_rate": 3.519168861574481e-06, "loss": 0.0, "step": 6353 }, { "epoch": 1.8595258999122037, "grad_norm": 0.00011707415251294151, "learning_rate": 3.5118525021949083e-06, "loss": 0.0, "step": 6354 }, { "epoch": 1.8598185542873866, "grad_norm": 0.00039305599057115614, "learning_rate": 3.5045361428153354e-06, "loss": 0.0, "step": 6355 }, { "epoch": 1.8601112086625695, "grad_norm": 0.019264813512563705, "learning_rate": 3.4972197834357625e-06, "loss": 0.0003, "step": 6356 }, { "epoch": 1.8604038630377524, "grad_norm": 0.0008811648585833609, "learning_rate": 3.48990342405619e-06, "loss": 0.0, "step": 6357 }, { "epoch": 1.8606965174129353, "grad_norm": 0.00041657642577774823, "learning_rate": 3.482587064676617e-06, "loss": 0.0, "step": 6358 }, { "epoch": 1.8609891717881182, "grad_norm": 0.0003095753490924835, "learning_rate": 3.4752707052970442e-06, "loss": 0.0, "step": 6359 }, { "epoch": 1.8612818261633013, "grad_norm": 0.0004555534978862852, "learning_rate": 3.4679543459174713e-06, "loss": 0.0, "step": 6360 }, { "epoch": 1.8615744805384842, "grad_norm": 0.0005622628377750516, "learning_rate": 3.4606379865378993e-06, "loss": 0.0, "step": 6361 }, { "epoch": 1.861867134913667, "grad_norm": 0.0009711002348922193, "learning_rate": 3.4533216271583264e-06, "loss": 0.0, "step": 6362 }, { "epoch": 1.86215978928885, "grad_norm": 0.00030452609644271433, "learning_rate": 3.4460052677787535e-06, "loss": 0.0, "step": 6363 }, { "epoch": 1.8624524436640328, "grad_norm": 0.0008452314650639892, "learning_rate": 3.4386889083991806e-06, "loss": 0.0, "step": 6364 }, { "epoch": 1.8627450980392157, "grad_norm": 0.000371145608369261, "learning_rate": 3.431372549019608e-06, "loss": 0.0, "step": 6365 }, { "epoch": 1.8630377524143986, "grad_norm": 0.00022421056928578764, "learning_rate": 3.424056189640035e-06, "loss": 0.0, "step": 6366 }, { "epoch": 1.8633304067895815, "grad_norm": 0.001361420494504273, "learning_rate": 3.4167398302604623e-06, "loss": 0.0, "step": 6367 }, { "epoch": 1.8636230611647644, "grad_norm": 0.0004811399267055094, "learning_rate": 3.4094234708808894e-06, "loss": 0.0, "step": 6368 }, { "epoch": 1.8639157155399473, "grad_norm": 0.0016553618479520082, "learning_rate": 3.4021071115013174e-06, "loss": 0.0, "step": 6369 }, { "epoch": 1.8642083699151302, "grad_norm": 0.0007354956469498575, "learning_rate": 3.3947907521217445e-06, "loss": 0.0, "step": 6370 }, { "epoch": 1.864501024290313, "grad_norm": 0.0010148129658773541, "learning_rate": 3.3874743927421716e-06, "loss": 0.0, "step": 6371 }, { "epoch": 1.864793678665496, "grad_norm": 0.0008505440782755613, "learning_rate": 3.3801580333625987e-06, "loss": 0.0, "step": 6372 }, { "epoch": 1.8650863330406788, "grad_norm": 0.004304022528231144, "learning_rate": 3.3728416739830266e-06, "loss": 0.0001, "step": 6373 }, { "epoch": 1.8653789874158617, "grad_norm": 0.0010357032297179103, "learning_rate": 3.3655253146034537e-06, "loss": 0.0, "step": 6374 }, { "epoch": 1.8656716417910446, "grad_norm": 0.0010642132256180048, "learning_rate": 3.358208955223881e-06, "loss": 0.0, "step": 6375 }, { "epoch": 1.8659642961662277, "grad_norm": 0.00038587921881116927, "learning_rate": 3.350892595844308e-06, "loss": 0.0, "step": 6376 }, { "epoch": 1.8662569505414106, "grad_norm": 0.0003773169592022896, "learning_rate": 3.3435762364647355e-06, "loss": 0.0, "step": 6377 }, { "epoch": 1.8665496049165935, "grad_norm": 0.000283686415059492, "learning_rate": 3.3362598770851626e-06, "loss": 0.0, "step": 6378 }, { "epoch": 1.8668422592917764, "grad_norm": 0.00034093321301043034, "learning_rate": 3.3289435177055897e-06, "loss": 0.0, "step": 6379 }, { "epoch": 1.8671349136669593, "grad_norm": 0.0007197193335741758, "learning_rate": 3.3216271583260168e-06, "loss": 0.0, "step": 6380 }, { "epoch": 1.8674275680421424, "grad_norm": 0.0006039079744368792, "learning_rate": 3.3143107989464447e-06, "loss": 0.0, "step": 6381 }, { "epoch": 1.8677202224173253, "grad_norm": 0.0003744322166312486, "learning_rate": 3.306994439566872e-06, "loss": 0.0, "step": 6382 }, { "epoch": 1.8680128767925082, "grad_norm": 0.0007173375342972577, "learning_rate": 3.299678080187299e-06, "loss": 0.0, "step": 6383 }, { "epoch": 1.868305531167691, "grad_norm": 0.00014455945347435772, "learning_rate": 3.292361720807726e-06, "loss": 0.0, "step": 6384 }, { "epoch": 1.868598185542874, "grad_norm": 0.0036249211989343166, "learning_rate": 3.285045361428153e-06, "loss": 0.0, "step": 6385 }, { "epoch": 1.8688908399180568, "grad_norm": 0.012913602404296398, "learning_rate": 3.277729002048581e-06, "loss": 0.0002, "step": 6386 }, { "epoch": 1.8691834942932397, "grad_norm": 0.00016423001943621784, "learning_rate": 3.270412642669008e-06, "loss": 0.0, "step": 6387 }, { "epoch": 1.8694761486684226, "grad_norm": 0.0016108466079458594, "learning_rate": 3.2630962832894353e-06, "loss": 0.0, "step": 6388 }, { "epoch": 1.8697688030436055, "grad_norm": 0.0030950899235904217, "learning_rate": 3.2557799239098624e-06, "loss": 0.0, "step": 6389 }, { "epoch": 1.8700614574187884, "grad_norm": 0.0007110501755960286, "learning_rate": 3.24846356453029e-06, "loss": 0.0, "step": 6390 }, { "epoch": 1.8703541117939713, "grad_norm": 0.0028555227909237146, "learning_rate": 3.241147205150717e-06, "loss": 0.0, "step": 6391 }, { "epoch": 1.8706467661691542, "grad_norm": 0.000410253502195701, "learning_rate": 3.233830845771144e-06, "loss": 0.0, "step": 6392 }, { "epoch": 1.870939420544337, "grad_norm": 0.0005777594051323831, "learning_rate": 3.2265144863915712e-06, "loss": 0.0, "step": 6393 }, { "epoch": 1.87123207491952, "grad_norm": 0.007163824513554573, "learning_rate": 3.219198127011999e-06, "loss": 0.0001, "step": 6394 }, { "epoch": 1.8715247292947028, "grad_norm": 0.0002955606614705175, "learning_rate": 3.2118817676324263e-06, "loss": 0.0, "step": 6395 }, { "epoch": 1.8718173836698857, "grad_norm": 0.0015786330914124846, "learning_rate": 3.2045654082528534e-06, "loss": 0.0, "step": 6396 }, { "epoch": 1.8721100380450688, "grad_norm": 0.0005388298304751515, "learning_rate": 3.1972490488732805e-06, "loss": 0.0, "step": 6397 }, { "epoch": 1.8724026924202517, "grad_norm": 0.0013606016291305423, "learning_rate": 3.1899326894937084e-06, "loss": 0.0, "step": 6398 }, { "epoch": 1.8726953467954346, "grad_norm": 4.769907474517822, "learning_rate": 3.1826163301141355e-06, "loss": 0.165, "step": 6399 }, { "epoch": 1.8729880011706175, "grad_norm": 0.0002351369766984135, "learning_rate": 3.1752999707345627e-06, "loss": 0.0, "step": 6400 }, { "epoch": 1.8732806555458004, "grad_norm": 0.0036255517043173313, "learning_rate": 3.1679836113549898e-06, "loss": 0.0, "step": 6401 }, { "epoch": 1.8735733099209835, "grad_norm": 0.013011470437049866, "learning_rate": 3.1606672519754173e-06, "loss": 0.0002, "step": 6402 }, { "epoch": 1.8738659642961664, "grad_norm": 0.024230200797319412, "learning_rate": 3.1533508925958444e-06, "loss": 0.0003, "step": 6403 }, { "epoch": 1.8741586186713493, "grad_norm": 0.0005433835322037339, "learning_rate": 3.1460345332162715e-06, "loss": 0.0, "step": 6404 }, { "epoch": 1.8744512730465321, "grad_norm": 0.019165195524692535, "learning_rate": 3.1387181738366986e-06, "loss": 0.0003, "step": 6405 }, { "epoch": 1.874743927421715, "grad_norm": 0.000899212434887886, "learning_rate": 3.1314018144571265e-06, "loss": 0.0, "step": 6406 }, { "epoch": 1.875036581796898, "grad_norm": 0.0005501322448253632, "learning_rate": 3.1240854550775536e-06, "loss": 0.0, "step": 6407 }, { "epoch": 1.8753292361720808, "grad_norm": 0.007282521575689316, "learning_rate": 3.1167690956979807e-06, "loss": 0.0001, "step": 6408 }, { "epoch": 1.8756218905472637, "grad_norm": 0.00667223334312439, "learning_rate": 3.1094527363184083e-06, "loss": 0.0001, "step": 6409 }, { "epoch": 1.8759145449224466, "grad_norm": 0.0007393379928544164, "learning_rate": 3.1021363769388354e-06, "loss": 0.0, "step": 6410 }, { "epoch": 1.8762071992976295, "grad_norm": 0.0019803973846137524, "learning_rate": 3.094820017559263e-06, "loss": 0.0, "step": 6411 }, { "epoch": 1.8764998536728124, "grad_norm": 0.0018379775574430823, "learning_rate": 3.08750365817969e-06, "loss": 0.0, "step": 6412 }, { "epoch": 1.8767925080479952, "grad_norm": 0.0005812466260977089, "learning_rate": 3.080187298800117e-06, "loss": 0.0, "step": 6413 }, { "epoch": 1.8770851624231781, "grad_norm": 0.0013607299188151956, "learning_rate": 3.0728709394205442e-06, "loss": 0.0, "step": 6414 }, { "epoch": 1.877377816798361, "grad_norm": 0.0012550321407616138, "learning_rate": 3.0655545800409717e-06, "loss": 0.0, "step": 6415 }, { "epoch": 1.877670471173544, "grad_norm": 0.0002778058697003871, "learning_rate": 3.058238220661399e-06, "loss": 0.0, "step": 6416 }, { "epoch": 1.8779631255487268, "grad_norm": 0.00119539606384933, "learning_rate": 3.0509218612818264e-06, "loss": 0.0, "step": 6417 }, { "epoch": 1.87825577992391, "grad_norm": 0.003676731837913394, "learning_rate": 3.0436055019022535e-06, "loss": 0.0, "step": 6418 }, { "epoch": 1.8785484342990928, "grad_norm": 0.00016842203331179917, "learning_rate": 3.036289142522681e-06, "loss": 0.0, "step": 6419 }, { "epoch": 1.8788410886742757, "grad_norm": 0.008006864227354527, "learning_rate": 3.028972783143108e-06, "loss": 0.0001, "step": 6420 }, { "epoch": 1.8791337430494586, "grad_norm": 0.001742623164318502, "learning_rate": 3.0216564237635356e-06, "loss": 0.0, "step": 6421 }, { "epoch": 1.8794263974246415, "grad_norm": 0.17839489877223969, "learning_rate": 3.0143400643839627e-06, "loss": 0.0004, "step": 6422 }, { "epoch": 1.8797190517998246, "grad_norm": 0.002378784818574786, "learning_rate": 3.0070237050043903e-06, "loss": 0.0, "step": 6423 }, { "epoch": 1.8800117061750075, "grad_norm": 0.00027628379757516086, "learning_rate": 2.9997073456248174e-06, "loss": 0.0, "step": 6424 }, { "epoch": 1.8803043605501903, "grad_norm": 0.000949666544329375, "learning_rate": 2.9923909862452445e-06, "loss": 0.0, "step": 6425 }, { "epoch": 1.8805970149253732, "grad_norm": 0.0007470758864656091, "learning_rate": 2.9850746268656716e-06, "loss": 0.0, "step": 6426 }, { "epoch": 1.8808896693005561, "grad_norm": 0.009189325384795666, "learning_rate": 2.977758267486099e-06, "loss": 0.0001, "step": 6427 }, { "epoch": 1.881182323675739, "grad_norm": 9.509653318673372e-05, "learning_rate": 2.970441908106526e-06, "loss": 0.0, "step": 6428 }, { "epoch": 1.881474978050922, "grad_norm": 0.003322751959785819, "learning_rate": 2.9631255487269537e-06, "loss": 0.0, "step": 6429 }, { "epoch": 1.8817676324261048, "grad_norm": 0.0018158421153202653, "learning_rate": 2.955809189347381e-06, "loss": 0.0, "step": 6430 }, { "epoch": 1.8820602868012877, "grad_norm": 0.0018216572934761643, "learning_rate": 2.9484928299678084e-06, "loss": 0.0, "step": 6431 }, { "epoch": 1.8823529411764706, "grad_norm": 0.003540392266586423, "learning_rate": 2.9411764705882355e-06, "loss": 0.0, "step": 6432 }, { "epoch": 1.8826455955516535, "grad_norm": 0.003678417531773448, "learning_rate": 2.933860111208663e-06, "loss": 0.0001, "step": 6433 }, { "epoch": 1.8829382499268363, "grad_norm": 0.0003505812492221594, "learning_rate": 2.92654375182909e-06, "loss": 0.0, "step": 6434 }, { "epoch": 1.8832309043020192, "grad_norm": 0.0007867827080190182, "learning_rate": 2.919227392449517e-06, "loss": 0.0, "step": 6435 }, { "epoch": 1.8835235586772021, "grad_norm": 0.0008522221469320357, "learning_rate": 2.9119110330699443e-06, "loss": 0.0, "step": 6436 }, { "epoch": 1.883816213052385, "grad_norm": 0.0005372047307901084, "learning_rate": 2.904594673690372e-06, "loss": 0.0, "step": 6437 }, { "epoch": 1.884108867427568, "grad_norm": 0.003420688910409808, "learning_rate": 2.897278314310799e-06, "loss": 0.0001, "step": 6438 }, { "epoch": 1.884401521802751, "grad_norm": 0.004475302062928677, "learning_rate": 2.8899619549312265e-06, "loss": 0.0001, "step": 6439 }, { "epoch": 1.8846941761779339, "grad_norm": 0.0016268702456727624, "learning_rate": 2.8826455955516536e-06, "loss": 0.0, "step": 6440 }, { "epoch": 1.8849868305531168, "grad_norm": 0.00023823804804123938, "learning_rate": 2.875329236172081e-06, "loss": 0.0, "step": 6441 }, { "epoch": 1.8852794849282997, "grad_norm": 0.0006432412192225456, "learning_rate": 2.868012876792508e-06, "loss": 0.0, "step": 6442 }, { "epoch": 1.8855721393034826, "grad_norm": 0.0009195173042826355, "learning_rate": 2.8606965174129357e-06, "loss": 0.0, "step": 6443 }, { "epoch": 1.8858647936786654, "grad_norm": 0.0014948223251849413, "learning_rate": 2.853380158033363e-06, "loss": 0.0, "step": 6444 }, { "epoch": 1.8861574480538486, "grad_norm": 0.001303711673244834, "learning_rate": 2.8460637986537903e-06, "loss": 0.0, "step": 6445 }, { "epoch": 1.8864501024290314, "grad_norm": 0.009742318652570248, "learning_rate": 2.8387474392742174e-06, "loss": 0.0001, "step": 6446 }, { "epoch": 1.8867427568042143, "grad_norm": 0.0009691920131444931, "learning_rate": 2.8314310798946445e-06, "loss": 0.0, "step": 6447 }, { "epoch": 1.8870354111793972, "grad_norm": 0.0013085316168144345, "learning_rate": 2.8241147205150717e-06, "loss": 0.0, "step": 6448 }, { "epoch": 1.88732806555458, "grad_norm": 0.0013028222601860762, "learning_rate": 2.8167983611354988e-06, "loss": 0.0, "step": 6449 }, { "epoch": 1.887620719929763, "grad_norm": 0.000381801015464589, "learning_rate": 2.8094820017559263e-06, "loss": 0.0, "step": 6450 }, { "epoch": 1.8879133743049459, "grad_norm": 0.003299749456346035, "learning_rate": 2.8021656423763534e-06, "loss": 0.0, "step": 6451 }, { "epoch": 1.8882060286801288, "grad_norm": 0.0018888454651460052, "learning_rate": 2.794849282996781e-06, "loss": 0.0, "step": 6452 }, { "epoch": 1.8884986830553117, "grad_norm": 0.0009297673823311925, "learning_rate": 2.787532923617208e-06, "loss": 0.0, "step": 6453 }, { "epoch": 1.8887913374304945, "grad_norm": 0.0006277947686612606, "learning_rate": 2.7802165642376355e-06, "loss": 0.0, "step": 6454 }, { "epoch": 1.8890839918056774, "grad_norm": 0.0011863817926496267, "learning_rate": 2.7729002048580626e-06, "loss": 0.0, "step": 6455 }, { "epoch": 1.8893766461808603, "grad_norm": 0.0010690029012039304, "learning_rate": 2.76558384547849e-06, "loss": 0.0, "step": 6456 }, { "epoch": 1.8896693005560432, "grad_norm": 0.008763744495809078, "learning_rate": 2.7582674860989173e-06, "loss": 0.0001, "step": 6457 }, { "epoch": 1.889961954931226, "grad_norm": 0.00045829196460545063, "learning_rate": 2.750951126719345e-06, "loss": 0.0, "step": 6458 }, { "epoch": 1.890254609306409, "grad_norm": 0.0005661585601046681, "learning_rate": 2.743634767339772e-06, "loss": 0.0, "step": 6459 }, { "epoch": 1.890547263681592, "grad_norm": 0.0005291930865496397, "learning_rate": 2.736318407960199e-06, "loss": 0.0, "step": 6460 }, { "epoch": 1.890839918056775, "grad_norm": 0.0003165259840898216, "learning_rate": 2.729002048580626e-06, "loss": 0.0, "step": 6461 }, { "epoch": 1.8911325724319579, "grad_norm": 0.0029839524067938328, "learning_rate": 2.7216856892010536e-06, "loss": 0.0, "step": 6462 }, { "epoch": 1.8914252268071408, "grad_norm": 0.0020246263593435287, "learning_rate": 2.7143693298214807e-06, "loss": 0.0, "step": 6463 }, { "epoch": 1.8917178811823236, "grad_norm": 0.015400498174130917, "learning_rate": 2.7070529704419083e-06, "loss": 0.0002, "step": 6464 }, { "epoch": 1.8920105355575065, "grad_norm": 0.0014981385320425034, "learning_rate": 2.6997366110623354e-06, "loss": 0.0, "step": 6465 }, { "epoch": 1.8923031899326896, "grad_norm": 0.002146840561181307, "learning_rate": 2.692420251682763e-06, "loss": 0.0, "step": 6466 }, { "epoch": 1.8925958443078725, "grad_norm": 0.0004235255182720721, "learning_rate": 2.68510389230319e-06, "loss": 0.0, "step": 6467 }, { "epoch": 1.8928884986830554, "grad_norm": 0.0003561250341590494, "learning_rate": 2.6777875329236175e-06, "loss": 0.0, "step": 6468 }, { "epoch": 1.8931811530582383, "grad_norm": 0.0002480603870935738, "learning_rate": 2.6704711735440446e-06, "loss": 0.0, "step": 6469 }, { "epoch": 1.8934738074334212, "grad_norm": 0.0002996289695147425, "learning_rate": 2.6631548141644717e-06, "loss": 0.0, "step": 6470 }, { "epoch": 1.893766461808604, "grad_norm": 0.0016276652459055185, "learning_rate": 2.6558384547848993e-06, "loss": 0.0, "step": 6471 }, { "epoch": 1.894059116183787, "grad_norm": 0.00033184816129505634, "learning_rate": 2.6485220954053264e-06, "loss": 0.0, "step": 6472 }, { "epoch": 1.8943517705589699, "grad_norm": 0.0006256209453567863, "learning_rate": 2.6412057360257535e-06, "loss": 0.0, "step": 6473 }, { "epoch": 1.8946444249341527, "grad_norm": 0.0066751097328960896, "learning_rate": 2.633889376646181e-06, "loss": 0.0001, "step": 6474 }, { "epoch": 1.8949370793093356, "grad_norm": 0.002083531813696027, "learning_rate": 2.626573017266608e-06, "loss": 0.0, "step": 6475 }, { "epoch": 1.8952297336845185, "grad_norm": 0.004366927780210972, "learning_rate": 2.6192566578870356e-06, "loss": 0.0001, "step": 6476 }, { "epoch": 1.8955223880597014, "grad_norm": 1.804510235786438, "learning_rate": 2.6119402985074627e-06, "loss": 0.0036, "step": 6477 }, { "epoch": 1.8958150424348843, "grad_norm": 0.0002599820145405829, "learning_rate": 2.6046239391278903e-06, "loss": 0.0, "step": 6478 }, { "epoch": 1.8961076968100672, "grad_norm": 0.0017510734032839537, "learning_rate": 2.5973075797483174e-06, "loss": 0.0, "step": 6479 }, { "epoch": 1.89640035118525, "grad_norm": 0.0003297017829027027, "learning_rate": 2.589991220368745e-06, "loss": 0.0, "step": 6480 }, { "epoch": 1.896693005560433, "grad_norm": 0.004637278616428375, "learning_rate": 2.582674860989172e-06, "loss": 0.0001, "step": 6481 }, { "epoch": 1.896985659935616, "grad_norm": 0.00019260791304986924, "learning_rate": 2.575358501609599e-06, "loss": 0.0, "step": 6482 }, { "epoch": 1.897278314310799, "grad_norm": 0.0009068685467354953, "learning_rate": 2.568042142230026e-06, "loss": 0.0, "step": 6483 }, { "epoch": 1.8975709686859819, "grad_norm": 0.00039480082341469824, "learning_rate": 2.5607257828504537e-06, "loss": 0.0, "step": 6484 }, { "epoch": 1.8978636230611647, "grad_norm": 0.00045794202014803886, "learning_rate": 2.553409423470881e-06, "loss": 0.0, "step": 6485 }, { "epoch": 1.8981562774363476, "grad_norm": 0.00048717408208176494, "learning_rate": 2.5460930640913084e-06, "loss": 0.0, "step": 6486 }, { "epoch": 1.8984489318115307, "grad_norm": 0.00021634685981553048, "learning_rate": 2.5387767047117355e-06, "loss": 0.0, "step": 6487 }, { "epoch": 1.8987415861867136, "grad_norm": 0.0002700884360820055, "learning_rate": 2.531460345332163e-06, "loss": 0.0, "step": 6488 }, { "epoch": 1.8990342405618965, "grad_norm": 0.004484069999307394, "learning_rate": 2.52414398595259e-06, "loss": 0.0001, "step": 6489 }, { "epoch": 1.8993268949370794, "grad_norm": 0.00015893818635959178, "learning_rate": 2.5168276265730176e-06, "loss": 0.0, "step": 6490 }, { "epoch": 1.8996195493122623, "grad_norm": 0.001963461982086301, "learning_rate": 2.5095112671934447e-06, "loss": 0.0, "step": 6491 }, { "epoch": 1.8999122036874452, "grad_norm": 0.0012448845664039254, "learning_rate": 2.5021949078138722e-06, "loss": 0.0, "step": 6492 }, { "epoch": 1.900204858062628, "grad_norm": 0.030520202592015266, "learning_rate": 2.4948785484342993e-06, "loss": 0.0001, "step": 6493 }, { "epoch": 1.900497512437811, "grad_norm": 0.00040364472079090774, "learning_rate": 2.4875621890547264e-06, "loss": 0.0, "step": 6494 }, { "epoch": 1.9007901668129938, "grad_norm": 0.002049447502940893, "learning_rate": 2.4802458296751536e-06, "loss": 0.0, "step": 6495 }, { "epoch": 1.9010828211881767, "grad_norm": 0.0020897185895591974, "learning_rate": 2.472929470295581e-06, "loss": 0.0, "step": 6496 }, { "epoch": 1.9013754755633596, "grad_norm": 0.00046888107317499816, "learning_rate": 2.465613110916008e-06, "loss": 0.0, "step": 6497 }, { "epoch": 1.9016681299385425, "grad_norm": 0.0004110477748326957, "learning_rate": 2.4582967515364357e-06, "loss": 0.0, "step": 6498 }, { "epoch": 1.9019607843137254, "grad_norm": 0.0018982577603310347, "learning_rate": 2.450980392156863e-06, "loss": 0.0, "step": 6499 }, { "epoch": 1.9022534386889083, "grad_norm": 0.0007911332650110126, "learning_rate": 2.4436640327772903e-06, "loss": 0.0, "step": 6500 }, { "epoch": 1.9025460930640912, "grad_norm": 0.00031156413024291396, "learning_rate": 2.4363476733977174e-06, "loss": 0.0, "step": 6501 }, { "epoch": 1.902838747439274, "grad_norm": 0.0020903449039906263, "learning_rate": 2.429031314018145e-06, "loss": 0.0, "step": 6502 }, { "epoch": 1.9031314018144572, "grad_norm": 0.0002454284986015409, "learning_rate": 2.421714954638572e-06, "loss": 0.0, "step": 6503 }, { "epoch": 1.90342405618964, "grad_norm": 0.0014006092678755522, "learning_rate": 2.4143985952589996e-06, "loss": 0.0, "step": 6504 }, { "epoch": 1.903716710564823, "grad_norm": 0.001830007415264845, "learning_rate": 2.4070822358794267e-06, "loss": 0.0, "step": 6505 }, { "epoch": 1.9040093649400058, "grad_norm": 0.00019994727335870266, "learning_rate": 2.399765876499854e-06, "loss": 0.0, "step": 6506 }, { "epoch": 1.9043020193151887, "grad_norm": 0.0018469392089173198, "learning_rate": 2.392449517120281e-06, "loss": 0.0, "step": 6507 }, { "epoch": 1.9045946736903718, "grad_norm": 0.0019941171631217003, "learning_rate": 2.3851331577407084e-06, "loss": 0.0, "step": 6508 }, { "epoch": 1.9048873280655547, "grad_norm": 0.00025979202473536134, "learning_rate": 2.3778167983611355e-06, "loss": 0.0, "step": 6509 }, { "epoch": 1.9051799824407376, "grad_norm": 0.0006788742612116039, "learning_rate": 2.370500438981563e-06, "loss": 0.0, "step": 6510 }, { "epoch": 1.9054726368159205, "grad_norm": 0.0010134984040632844, "learning_rate": 2.36318407960199e-06, "loss": 0.0, "step": 6511 }, { "epoch": 1.9057652911911034, "grad_norm": 0.00046694010961800814, "learning_rate": 2.3558677202224173e-06, "loss": 0.0, "step": 6512 }, { "epoch": 1.9060579455662863, "grad_norm": 0.004774912726134062, "learning_rate": 2.348551360842845e-06, "loss": 0.0, "step": 6513 }, { "epoch": 1.9063505999414692, "grad_norm": 0.0028041843324899673, "learning_rate": 2.341235001463272e-06, "loss": 0.0, "step": 6514 }, { "epoch": 1.906643254316652, "grad_norm": 0.003318385686725378, "learning_rate": 2.3339186420836994e-06, "loss": 0.0, "step": 6515 }, { "epoch": 1.906935908691835, "grad_norm": 0.00046362687135115266, "learning_rate": 2.3266022827041265e-06, "loss": 0.0, "step": 6516 }, { "epoch": 1.9072285630670178, "grad_norm": 0.0013521397486329079, "learning_rate": 2.3192859233245536e-06, "loss": 0.0, "step": 6517 }, { "epoch": 1.9075212174422007, "grad_norm": 0.00040896268910728395, "learning_rate": 2.3119695639449807e-06, "loss": 0.0, "step": 6518 }, { "epoch": 1.9078138718173836, "grad_norm": 0.0002475320943631232, "learning_rate": 2.3046532045654083e-06, "loss": 0.0, "step": 6519 }, { "epoch": 1.9081065261925665, "grad_norm": 0.004071303643286228, "learning_rate": 2.2973368451858354e-06, "loss": 0.0001, "step": 6520 }, { "epoch": 1.9083991805677494, "grad_norm": 0.00022741260181646794, "learning_rate": 2.290020485806263e-06, "loss": 0.0, "step": 6521 }, { "epoch": 1.9086918349429323, "grad_norm": 0.0004800940223503858, "learning_rate": 2.28270412642669e-06, "loss": 0.0, "step": 6522 }, { "epoch": 1.9089844893181152, "grad_norm": 0.0002673115814104676, "learning_rate": 2.2753877670471175e-06, "loss": 0.0, "step": 6523 }, { "epoch": 1.9092771436932983, "grad_norm": 0.0007175962091423571, "learning_rate": 2.2680714076675446e-06, "loss": 0.0, "step": 6524 }, { "epoch": 1.9095697980684812, "grad_norm": 0.00019210069149266928, "learning_rate": 2.260755048287972e-06, "loss": 0.0, "step": 6525 }, { "epoch": 1.909862452443664, "grad_norm": 0.0005344034289009869, "learning_rate": 2.2534386889083993e-06, "loss": 0.0, "step": 6526 }, { "epoch": 1.910155106818847, "grad_norm": 0.000837336468975991, "learning_rate": 2.2461223295288268e-06, "loss": 0.0, "step": 6527 }, { "epoch": 1.9104477611940298, "grad_norm": 0.00025415114942006767, "learning_rate": 2.238805970149254e-06, "loss": 0.0, "step": 6528 }, { "epoch": 1.910740415569213, "grad_norm": 0.00047166517470031977, "learning_rate": 2.231489610769681e-06, "loss": 0.0, "step": 6529 }, { "epoch": 1.9110330699443958, "grad_norm": 0.00017651126836426556, "learning_rate": 2.224173251390108e-06, "loss": 0.0, "step": 6530 }, { "epoch": 1.9113257243195787, "grad_norm": 0.00020520172256510705, "learning_rate": 2.2168568920105356e-06, "loss": 0.0, "step": 6531 }, { "epoch": 1.9116183786947616, "grad_norm": 0.00011977095709880814, "learning_rate": 2.2095405326309627e-06, "loss": 0.0, "step": 6532 }, { "epoch": 1.9119110330699445, "grad_norm": 0.00024520300212316215, "learning_rate": 2.2022241732513903e-06, "loss": 0.0, "step": 6533 }, { "epoch": 1.9122036874451274, "grad_norm": 0.00042339987703599036, "learning_rate": 2.1949078138718174e-06, "loss": 0.0, "step": 6534 }, { "epoch": 1.9124963418203103, "grad_norm": 0.0014089974574744701, "learning_rate": 2.187591454492245e-06, "loss": 0.0, "step": 6535 }, { "epoch": 1.9127889961954931, "grad_norm": 0.0005294946604408324, "learning_rate": 2.180275095112672e-06, "loss": 0.0, "step": 6536 }, { "epoch": 1.913081650570676, "grad_norm": 0.0004275265382602811, "learning_rate": 2.1729587357330995e-06, "loss": 0.0, "step": 6537 }, { "epoch": 1.913374304945859, "grad_norm": 0.002226543379947543, "learning_rate": 2.1656423763535266e-06, "loss": 0.0, "step": 6538 }, { "epoch": 1.9136669593210418, "grad_norm": 0.00032171490602195263, "learning_rate": 2.158326016973954e-06, "loss": 0.0, "step": 6539 }, { "epoch": 1.9139596136962247, "grad_norm": 0.0063609592616558075, "learning_rate": 2.1510096575943812e-06, "loss": 0.0001, "step": 6540 }, { "epoch": 1.9142522680714076, "grad_norm": 0.0011484756832942367, "learning_rate": 2.1436932982148083e-06, "loss": 0.0, "step": 6541 }, { "epoch": 1.9145449224465905, "grad_norm": 0.0010393839329481125, "learning_rate": 2.1363769388352355e-06, "loss": 0.0, "step": 6542 }, { "epoch": 1.9148375768217734, "grad_norm": 0.001812055124901235, "learning_rate": 2.129060579455663e-06, "loss": 0.0, "step": 6543 }, { "epoch": 1.9151302311969562, "grad_norm": 0.0005704189534299076, "learning_rate": 2.12174422007609e-06, "loss": 0.0, "step": 6544 }, { "epoch": 1.9154228855721394, "grad_norm": 0.0007563745020888746, "learning_rate": 2.1144278606965176e-06, "loss": 0.0, "step": 6545 }, { "epoch": 1.9157155399473222, "grad_norm": 0.0011137282708659768, "learning_rate": 2.1071115013169447e-06, "loss": 0.0, "step": 6546 }, { "epoch": 1.9160081943225051, "grad_norm": 0.00035710533848032355, "learning_rate": 2.0997951419373722e-06, "loss": 0.0, "step": 6547 }, { "epoch": 1.916300848697688, "grad_norm": 0.00036031228955835104, "learning_rate": 2.0924787825577993e-06, "loss": 0.0, "step": 6548 }, { "epoch": 1.916593503072871, "grad_norm": 0.00026160478591918945, "learning_rate": 2.085162423178227e-06, "loss": 0.0, "step": 6549 }, { "epoch": 1.9168861574480538, "grad_norm": 0.005153276491910219, "learning_rate": 2.077846063798654e-06, "loss": 0.0001, "step": 6550 }, { "epoch": 1.917178811823237, "grad_norm": 0.0007959470385685563, "learning_rate": 2.070529704419081e-06, "loss": 0.0, "step": 6551 }, { "epoch": 1.9174714661984198, "grad_norm": 0.0016937382752075791, "learning_rate": 2.063213345039508e-06, "loss": 0.0, "step": 6552 }, { "epoch": 1.9177641205736027, "grad_norm": 0.0024497276172041893, "learning_rate": 2.0558969856599357e-06, "loss": 0.0, "step": 6553 }, { "epoch": 1.9180567749487856, "grad_norm": 0.0009276737691834569, "learning_rate": 2.048580626280363e-06, "loss": 0.0, "step": 6554 }, { "epoch": 1.9183494293239685, "grad_norm": 0.0032618502154946327, "learning_rate": 2.0412642669007903e-06, "loss": 0.0, "step": 6555 }, { "epoch": 1.9186420836991513, "grad_norm": 0.0019429664826020598, "learning_rate": 2.0339479075212174e-06, "loss": 0.0, "step": 6556 }, { "epoch": 1.9189347380743342, "grad_norm": 0.0017567048780620098, "learning_rate": 2.026631548141645e-06, "loss": 0.0, "step": 6557 }, { "epoch": 1.9192273924495171, "grad_norm": 0.006915146019309759, "learning_rate": 2.019315188762072e-06, "loss": 0.0001, "step": 6558 }, { "epoch": 1.9195200468247, "grad_norm": 0.0016446765512228012, "learning_rate": 2.0119988293824996e-06, "loss": 0.0, "step": 6559 }, { "epoch": 1.919812701199883, "grad_norm": 0.0006018587737344205, "learning_rate": 2.0046824700029267e-06, "loss": 0.0, "step": 6560 }, { "epoch": 1.9201053555750658, "grad_norm": 0.0001805188221624121, "learning_rate": 1.9973661106233542e-06, "loss": 0.0, "step": 6561 }, { "epoch": 1.9203980099502487, "grad_norm": 0.00033795545459724963, "learning_rate": 1.9900497512437813e-06, "loss": 0.0, "step": 6562 }, { "epoch": 1.9206906643254316, "grad_norm": 0.0005303092184476554, "learning_rate": 1.9827333918642084e-06, "loss": 0.0, "step": 6563 }, { "epoch": 1.9209833187006145, "grad_norm": 0.0004896099562756717, "learning_rate": 1.9754170324846355e-06, "loss": 0.0, "step": 6564 }, { "epoch": 1.9212759730757973, "grad_norm": 0.002939594676718116, "learning_rate": 1.968100673105063e-06, "loss": 0.0, "step": 6565 }, { "epoch": 1.9215686274509802, "grad_norm": 0.0007181447581388056, "learning_rate": 1.96078431372549e-06, "loss": 0.0, "step": 6566 }, { "epoch": 1.9218612818261633, "grad_norm": 0.0001777529832907021, "learning_rate": 1.9534679543459177e-06, "loss": 0.0, "step": 6567 }, { "epoch": 1.9221539362013462, "grad_norm": 0.0002563889720477164, "learning_rate": 1.946151594966345e-06, "loss": 0.0, "step": 6568 }, { "epoch": 1.9224465905765291, "grad_norm": 0.004232883453369141, "learning_rate": 1.9388352355867723e-06, "loss": 0.0, "step": 6569 }, { "epoch": 1.922739244951712, "grad_norm": 0.001293240231461823, "learning_rate": 1.9315188762071994e-06, "loss": 0.0, "step": 6570 }, { "epoch": 1.923031899326895, "grad_norm": 0.00044473825255408883, "learning_rate": 1.924202516827627e-06, "loss": 0.0, "step": 6571 }, { "epoch": 1.923324553702078, "grad_norm": 0.002846464514732361, "learning_rate": 1.916886157448054e-06, "loss": 0.0, "step": 6572 }, { "epoch": 1.9236172080772609, "grad_norm": 0.0003088650992140174, "learning_rate": 1.9095697980684816e-06, "loss": 0.0, "step": 6573 }, { "epoch": 1.9239098624524438, "grad_norm": 0.0007258460973389447, "learning_rate": 1.9022534386889085e-06, "loss": 0.0, "step": 6574 }, { "epoch": 1.9242025168276267, "grad_norm": 0.001201051753014326, "learning_rate": 1.894937079309336e-06, "loss": 0.0, "step": 6575 }, { "epoch": 1.9244951712028096, "grad_norm": 0.00023510832397732884, "learning_rate": 1.887620719929763e-06, "loss": 0.0, "step": 6576 }, { "epoch": 1.9247878255779924, "grad_norm": 0.0005937555688433349, "learning_rate": 1.8803043605501902e-06, "loss": 0.0, "step": 6577 }, { "epoch": 1.9250804799531753, "grad_norm": 0.00188451015856117, "learning_rate": 1.8729880011706175e-06, "loss": 0.0, "step": 6578 }, { "epoch": 1.9253731343283582, "grad_norm": 0.0023857697378844023, "learning_rate": 1.8656716417910446e-06, "loss": 0.0, "step": 6579 }, { "epoch": 1.925665788703541, "grad_norm": 0.0008184052421711385, "learning_rate": 1.8583552824114722e-06, "loss": 0.0, "step": 6580 }, { "epoch": 1.925958443078724, "grad_norm": 0.001048028003424406, "learning_rate": 1.8510389230318993e-06, "loss": 0.0, "step": 6581 }, { "epoch": 1.9262510974539069, "grad_norm": 0.003968420904129744, "learning_rate": 1.8437225636523268e-06, "loss": 0.0001, "step": 6582 }, { "epoch": 1.9265437518290898, "grad_norm": 0.0005019573145546019, "learning_rate": 1.8364062042727539e-06, "loss": 0.0, "step": 6583 }, { "epoch": 1.9268364062042727, "grad_norm": 0.00038237779517658055, "learning_rate": 1.8290898448931812e-06, "loss": 0.0, "step": 6584 }, { "epoch": 1.9271290605794555, "grad_norm": 5.985738754272461, "learning_rate": 1.8217734855136083e-06, "loss": 0.0265, "step": 6585 }, { "epoch": 1.9274217149546384, "grad_norm": 0.000331999734044075, "learning_rate": 1.8144571261340358e-06, "loss": 0.0, "step": 6586 }, { "epoch": 1.9277143693298213, "grad_norm": 0.0014998256228864193, "learning_rate": 1.807140766754463e-06, "loss": 0.0, "step": 6587 }, { "epoch": 1.9280070237050044, "grad_norm": 0.00024889782071113586, "learning_rate": 1.7998244073748905e-06, "loss": 0.0, "step": 6588 }, { "epoch": 1.9282996780801873, "grad_norm": 0.01695532537996769, "learning_rate": 1.7925080479953176e-06, "loss": 0.0001, "step": 6589 }, { "epoch": 1.9285923324553702, "grad_norm": 0.03542960807681084, "learning_rate": 1.7851916886157449e-06, "loss": 0.0001, "step": 6590 }, { "epoch": 1.928884986830553, "grad_norm": 0.00017672083049546927, "learning_rate": 1.777875329236172e-06, "loss": 0.0, "step": 6591 }, { "epoch": 1.929177641205736, "grad_norm": 0.023258518427610397, "learning_rate": 1.7705589698565995e-06, "loss": 0.0002, "step": 6592 }, { "epoch": 1.929470295580919, "grad_norm": 0.00031844244222156703, "learning_rate": 1.7632426104770266e-06, "loss": 0.0, "step": 6593 }, { "epoch": 1.929762949956102, "grad_norm": 0.0007790586678311229, "learning_rate": 1.7559262510974541e-06, "loss": 0.0, "step": 6594 }, { "epoch": 1.9300556043312849, "grad_norm": 0.0001478357007727027, "learning_rate": 1.7486098917178812e-06, "loss": 0.0, "step": 6595 }, { "epoch": 1.9303482587064678, "grad_norm": 0.00033634313149377704, "learning_rate": 1.7412935323383086e-06, "loss": 0.0, "step": 6596 }, { "epoch": 1.9306409130816506, "grad_norm": 0.01702825166285038, "learning_rate": 1.7339771729587357e-06, "loss": 0.0003, "step": 6597 }, { "epoch": 1.9309335674568335, "grad_norm": 0.008284145966172218, "learning_rate": 1.7266608135791632e-06, "loss": 0.0001, "step": 6598 }, { "epoch": 1.9312262218320164, "grad_norm": 0.0009149824036285281, "learning_rate": 1.7193444541995903e-06, "loss": 0.0, "step": 6599 }, { "epoch": 1.9315188762071993, "grad_norm": 0.0010282084112986922, "learning_rate": 1.7120280948200176e-06, "loss": 0.0, "step": 6600 }, { "epoch": 1.9318115305823822, "grad_norm": 0.00043911809916608036, "learning_rate": 1.7047117354404447e-06, "loss": 0.0, "step": 6601 }, { "epoch": 1.932104184957565, "grad_norm": 0.0004219369147904217, "learning_rate": 1.6973953760608722e-06, "loss": 0.0, "step": 6602 }, { "epoch": 1.932396839332748, "grad_norm": 0.0002833109174389392, "learning_rate": 1.6900790166812993e-06, "loss": 0.0, "step": 6603 }, { "epoch": 1.9326894937079309, "grad_norm": 0.000787951226811856, "learning_rate": 1.6827626573017269e-06, "loss": 0.0, "step": 6604 }, { "epoch": 1.9329821480831137, "grad_norm": 0.0002686723310034722, "learning_rate": 1.675446297922154e-06, "loss": 0.0, "step": 6605 }, { "epoch": 1.9332748024582966, "grad_norm": 0.0009234495228156447, "learning_rate": 1.6681299385425813e-06, "loss": 0.0, "step": 6606 }, { "epoch": 1.9335674568334795, "grad_norm": 0.0074235666543245316, "learning_rate": 1.6608135791630084e-06, "loss": 0.0001, "step": 6607 }, { "epoch": 1.9338601112086624, "grad_norm": 0.0007379117305390537, "learning_rate": 1.653497219783436e-06, "loss": 0.0, "step": 6608 }, { "epoch": 1.9341527655838455, "grad_norm": 0.0002182548923883587, "learning_rate": 1.646180860403863e-06, "loss": 0.0, "step": 6609 }, { "epoch": 1.9344454199590284, "grad_norm": 0.00016724392480682582, "learning_rate": 1.6388645010242905e-06, "loss": 0.0, "step": 6610 }, { "epoch": 1.9347380743342113, "grad_norm": 0.0007237906102091074, "learning_rate": 1.6315481416447176e-06, "loss": 0.0, "step": 6611 }, { "epoch": 1.9350307287093942, "grad_norm": 0.0016593494219705462, "learning_rate": 1.624231782265145e-06, "loss": 0.0, "step": 6612 }, { "epoch": 1.935323383084577, "grad_norm": 0.0003152602876070887, "learning_rate": 1.616915422885572e-06, "loss": 0.0, "step": 6613 }, { "epoch": 1.9356160374597602, "grad_norm": 0.00036291356082074344, "learning_rate": 1.6095990635059996e-06, "loss": 0.0, "step": 6614 }, { "epoch": 1.935908691834943, "grad_norm": 0.00020301327458582819, "learning_rate": 1.6022827041264267e-06, "loss": 0.0, "step": 6615 }, { "epoch": 1.936201346210126, "grad_norm": 0.001788858906365931, "learning_rate": 1.5949663447468542e-06, "loss": 0.0, "step": 6616 }, { "epoch": 1.9364940005853089, "grad_norm": 0.0008270929683931172, "learning_rate": 1.5876499853672813e-06, "loss": 0.0, "step": 6617 }, { "epoch": 1.9367866549604917, "grad_norm": 0.00048240655451081693, "learning_rate": 1.5803336259877086e-06, "loss": 0.0, "step": 6618 }, { "epoch": 1.9370793093356746, "grad_norm": 0.0003302799887023866, "learning_rate": 1.5730172666081357e-06, "loss": 0.0, "step": 6619 }, { "epoch": 1.9373719637108575, "grad_norm": 0.001860711956396699, "learning_rate": 1.5657009072285633e-06, "loss": 0.0, "step": 6620 }, { "epoch": 1.9376646180860404, "grad_norm": 0.0005062110722064972, "learning_rate": 1.5583845478489904e-06, "loss": 0.0, "step": 6621 }, { "epoch": 1.9379572724612233, "grad_norm": 0.0015211127465590835, "learning_rate": 1.5510681884694177e-06, "loss": 0.0, "step": 6622 }, { "epoch": 1.9382499268364062, "grad_norm": 0.0007924987585283816, "learning_rate": 1.543751829089845e-06, "loss": 0.0, "step": 6623 }, { "epoch": 1.938542581211589, "grad_norm": 0.001091635087504983, "learning_rate": 1.5364354697102721e-06, "loss": 0.0, "step": 6624 }, { "epoch": 1.938835235586772, "grad_norm": 0.000785691081546247, "learning_rate": 1.5291191103306994e-06, "loss": 0.0, "step": 6625 }, { "epoch": 1.9391278899619548, "grad_norm": 0.00026529538445174694, "learning_rate": 1.5218027509511267e-06, "loss": 0.0, "step": 6626 }, { "epoch": 1.9394205443371377, "grad_norm": 0.000439536408521235, "learning_rate": 1.514486391571554e-06, "loss": 0.0, "step": 6627 }, { "epoch": 1.9397131987123206, "grad_norm": 0.0004655602388083935, "learning_rate": 1.5071700321919814e-06, "loss": 0.0, "step": 6628 }, { "epoch": 1.9400058530875035, "grad_norm": 0.0006081527099013329, "learning_rate": 1.4998536728124087e-06, "loss": 0.0, "step": 6629 }, { "epoch": 1.9402985074626866, "grad_norm": 0.000489642086904496, "learning_rate": 1.4925373134328358e-06, "loss": 0.0, "step": 6630 }, { "epoch": 1.9405911618378695, "grad_norm": 0.0007188718882389367, "learning_rate": 1.485220954053263e-06, "loss": 0.0, "step": 6631 }, { "epoch": 1.9408838162130524, "grad_norm": 0.0002691959962248802, "learning_rate": 1.4779045946736904e-06, "loss": 0.0, "step": 6632 }, { "epoch": 1.9411764705882353, "grad_norm": 0.0006269677542150021, "learning_rate": 1.4705882352941177e-06, "loss": 0.0, "step": 6633 }, { "epoch": 1.9414691249634182, "grad_norm": 0.0014246383216232061, "learning_rate": 1.463271875914545e-06, "loss": 0.0, "step": 6634 }, { "epoch": 1.941761779338601, "grad_norm": 0.006769080180674791, "learning_rate": 1.4559555165349721e-06, "loss": 0.0001, "step": 6635 }, { "epoch": 1.9420544337137842, "grad_norm": 0.00024840165860950947, "learning_rate": 1.4486391571553995e-06, "loss": 0.0, "step": 6636 }, { "epoch": 1.942347088088967, "grad_norm": 0.00021002003632020205, "learning_rate": 1.4413227977758268e-06, "loss": 0.0, "step": 6637 }, { "epoch": 1.94263974246415, "grad_norm": 0.00024155636492650956, "learning_rate": 1.434006438396254e-06, "loss": 0.0, "step": 6638 }, { "epoch": 1.9429323968393328, "grad_norm": 0.0002558117266744375, "learning_rate": 1.4266900790166814e-06, "loss": 0.0, "step": 6639 }, { "epoch": 1.9432250512145157, "grad_norm": 0.00030932450317777693, "learning_rate": 1.4193737196371087e-06, "loss": 0.0, "step": 6640 }, { "epoch": 1.9435177055896986, "grad_norm": 0.0008397055207751691, "learning_rate": 1.4120573602575358e-06, "loss": 0.0, "step": 6641 }, { "epoch": 1.9438103599648815, "grad_norm": 0.0004229200421832502, "learning_rate": 1.4047410008779631e-06, "loss": 0.0, "step": 6642 }, { "epoch": 1.9441030143400644, "grad_norm": 0.004902367480099201, "learning_rate": 1.3974246414983905e-06, "loss": 0.0001, "step": 6643 }, { "epoch": 1.9443956687152473, "grad_norm": 0.0009248115820810199, "learning_rate": 1.3901082821188178e-06, "loss": 0.0, "step": 6644 }, { "epoch": 1.9446883230904302, "grad_norm": 4.626027584075928, "learning_rate": 1.382791922739245e-06, "loss": 0.2806, "step": 6645 }, { "epoch": 1.944980977465613, "grad_norm": 0.0015937514835968614, "learning_rate": 1.3754755633596724e-06, "loss": 0.0, "step": 6646 }, { "epoch": 1.945273631840796, "grad_norm": 0.0029478934593498707, "learning_rate": 1.3681592039800995e-06, "loss": 0.0, "step": 6647 }, { "epoch": 1.9455662862159788, "grad_norm": 0.30252787470817566, "learning_rate": 1.3608428446005268e-06, "loss": 0.0008, "step": 6648 }, { "epoch": 1.9458589405911617, "grad_norm": 0.0003999358450528234, "learning_rate": 1.3535264852209541e-06, "loss": 0.0, "step": 6649 }, { "epoch": 1.9461515949663446, "grad_norm": 0.0003845870669465512, "learning_rate": 1.3462101258413814e-06, "loss": 0.0, "step": 6650 }, { "epoch": 1.9464442493415277, "grad_norm": 0.00020116023370064795, "learning_rate": 1.3388937664618088e-06, "loss": 0.0, "step": 6651 }, { "epoch": 1.9467369037167106, "grad_norm": 0.0011880359379574656, "learning_rate": 1.3315774070822359e-06, "loss": 0.0, "step": 6652 }, { "epoch": 1.9470295580918935, "grad_norm": 0.0002459873794578016, "learning_rate": 1.3242610477026632e-06, "loss": 0.0, "step": 6653 }, { "epoch": 1.9473222124670764, "grad_norm": 0.008838320150971413, "learning_rate": 1.3169446883230905e-06, "loss": 0.0001, "step": 6654 }, { "epoch": 1.9476148668422593, "grad_norm": 0.000532762031070888, "learning_rate": 1.3096283289435178e-06, "loss": 0.0, "step": 6655 }, { "epoch": 1.9479075212174422, "grad_norm": 0.003082372946664691, "learning_rate": 1.3023119695639451e-06, "loss": 0.0, "step": 6656 }, { "epoch": 1.9482001755926253, "grad_norm": 0.0002804934047162533, "learning_rate": 1.2949956101843724e-06, "loss": 0.0, "step": 6657 }, { "epoch": 1.9484928299678081, "grad_norm": 0.001282552140764892, "learning_rate": 1.2876792508047995e-06, "loss": 0.0, "step": 6658 }, { "epoch": 1.948785484342991, "grad_norm": 0.0002798543428070843, "learning_rate": 1.2803628914252269e-06, "loss": 0.0, "step": 6659 }, { "epoch": 1.949078138718174, "grad_norm": 0.0009536589495837688, "learning_rate": 1.2730465320456542e-06, "loss": 0.0, "step": 6660 }, { "epoch": 1.9493707930933568, "grad_norm": 0.0001576505455886945, "learning_rate": 1.2657301726660815e-06, "loss": 0.0, "step": 6661 }, { "epoch": 1.9496634474685397, "grad_norm": 0.0032433916348963976, "learning_rate": 1.2584138132865088e-06, "loss": 0.0001, "step": 6662 }, { "epoch": 1.9499561018437226, "grad_norm": 0.03013523668050766, "learning_rate": 1.2510974539069361e-06, "loss": 0.0002, "step": 6663 }, { "epoch": 1.9502487562189055, "grad_norm": 0.001954001607373357, "learning_rate": 1.2437810945273632e-06, "loss": 0.0, "step": 6664 }, { "epoch": 1.9505414105940884, "grad_norm": 0.00017482234397903085, "learning_rate": 1.2364647351477905e-06, "loss": 0.0, "step": 6665 }, { "epoch": 1.9508340649692713, "grad_norm": 0.00079634680878371, "learning_rate": 1.2291483757682179e-06, "loss": 0.0, "step": 6666 }, { "epoch": 1.9511267193444541, "grad_norm": 0.0003813347721006721, "learning_rate": 1.2218320163886452e-06, "loss": 0.0, "step": 6667 }, { "epoch": 1.951419373719637, "grad_norm": 0.0006567532545886934, "learning_rate": 1.2145156570090725e-06, "loss": 0.0, "step": 6668 }, { "epoch": 1.95171202809482, "grad_norm": 0.0008832120802253485, "learning_rate": 1.2071992976294998e-06, "loss": 0.0, "step": 6669 }, { "epoch": 1.9520046824700028, "grad_norm": 0.0001882561919046566, "learning_rate": 1.199882938249927e-06, "loss": 0.0, "step": 6670 }, { "epoch": 1.9522973368451857, "grad_norm": 0.0003287769795861095, "learning_rate": 1.1925665788703542e-06, "loss": 0.0, "step": 6671 }, { "epoch": 1.9525899912203686, "grad_norm": 0.0007390181999653578, "learning_rate": 1.1852502194907815e-06, "loss": 0.0, "step": 6672 }, { "epoch": 1.9528826455955517, "grad_norm": 0.0011151289800181985, "learning_rate": 1.1779338601112086e-06, "loss": 0.0, "step": 6673 }, { "epoch": 1.9531752999707346, "grad_norm": 0.0009459562716074288, "learning_rate": 1.170617500731636e-06, "loss": 0.0, "step": 6674 }, { "epoch": 1.9534679543459175, "grad_norm": 0.00026561840786598623, "learning_rate": 1.1633011413520633e-06, "loss": 0.0, "step": 6675 }, { "epoch": 1.9537606087211004, "grad_norm": 0.007528937421739101, "learning_rate": 1.1559847819724904e-06, "loss": 0.0001, "step": 6676 }, { "epoch": 1.9540532630962832, "grad_norm": 0.0004000811604782939, "learning_rate": 1.1486684225929177e-06, "loss": 0.0, "step": 6677 }, { "epoch": 1.9543459174714664, "grad_norm": 0.0017375907627865672, "learning_rate": 1.141352063213345e-06, "loss": 0.0, "step": 6678 }, { "epoch": 1.9546385718466492, "grad_norm": 0.005005417391657829, "learning_rate": 1.1340357038337723e-06, "loss": 0.0001, "step": 6679 }, { "epoch": 1.9549312262218321, "grad_norm": 0.00020254511036910117, "learning_rate": 1.1267193444541996e-06, "loss": 0.0, "step": 6680 }, { "epoch": 1.955223880597015, "grad_norm": 0.0009266235865652561, "learning_rate": 1.119402985074627e-06, "loss": 0.0, "step": 6681 }, { "epoch": 1.955516534972198, "grad_norm": 0.001342368428595364, "learning_rate": 1.112086625695054e-06, "loss": 0.0, "step": 6682 }, { "epoch": 1.9558091893473808, "grad_norm": 0.00018644121882971376, "learning_rate": 1.1047702663154814e-06, "loss": 0.0, "step": 6683 }, { "epoch": 1.9561018437225637, "grad_norm": 0.00033372986945323646, "learning_rate": 1.0974539069359087e-06, "loss": 0.0, "step": 6684 }, { "epoch": 1.9563944980977466, "grad_norm": 0.001495027681812644, "learning_rate": 1.090137547556336e-06, "loss": 0.0, "step": 6685 }, { "epoch": 1.9566871524729295, "grad_norm": 0.0002885348512791097, "learning_rate": 1.0828211881767633e-06, "loss": 0.0, "step": 6686 }, { "epoch": 1.9569798068481123, "grad_norm": 0.017791662365198135, "learning_rate": 1.0755048287971906e-06, "loss": 0.0001, "step": 6687 }, { "epoch": 1.9572724612232952, "grad_norm": 0.00032072325120680034, "learning_rate": 1.0681884694176177e-06, "loss": 0.0, "step": 6688 }, { "epoch": 1.9575651155984781, "grad_norm": 0.0002882403787225485, "learning_rate": 1.060872110038045e-06, "loss": 0.0, "step": 6689 }, { "epoch": 1.957857769973661, "grad_norm": 0.00029905178234912455, "learning_rate": 1.0535557506584724e-06, "loss": 0.0, "step": 6690 }, { "epoch": 1.958150424348844, "grad_norm": 18.095945358276367, "learning_rate": 1.0462393912788997e-06, "loss": 0.0775, "step": 6691 }, { "epoch": 1.9584430787240268, "grad_norm": 0.0052270409651100636, "learning_rate": 1.038923031899327e-06, "loss": 0.0001, "step": 6692 }, { "epoch": 1.9587357330992097, "grad_norm": 0.0042382171377539635, "learning_rate": 1.031606672519754e-06, "loss": 0.0001, "step": 6693 }, { "epoch": 1.9590283874743928, "grad_norm": 0.0034267029259353876, "learning_rate": 1.0242903131401814e-06, "loss": 0.0001, "step": 6694 }, { "epoch": 1.9593210418495757, "grad_norm": 0.0046326094307005405, "learning_rate": 1.0169739537606087e-06, "loss": 0.0001, "step": 6695 }, { "epoch": 1.9596136962247586, "grad_norm": 0.000998358242213726, "learning_rate": 1.009657594381036e-06, "loss": 0.0, "step": 6696 }, { "epoch": 1.9599063505999414, "grad_norm": 0.0005337099428288639, "learning_rate": 1.0023412350014633e-06, "loss": 0.0, "step": 6697 }, { "epoch": 1.9601990049751243, "grad_norm": 0.00036399663076736033, "learning_rate": 9.950248756218907e-07, "loss": 0.0, "step": 6698 }, { "epoch": 1.9604916593503074, "grad_norm": 0.0004057335900142789, "learning_rate": 9.877085162423178e-07, "loss": 0.0, "step": 6699 }, { "epoch": 1.9607843137254903, "grad_norm": 0.001365336705930531, "learning_rate": 9.80392156862745e-07, "loss": 0.0, "step": 6700 }, { "epoch": 1.9610769681006732, "grad_norm": 0.009753764607012272, "learning_rate": 9.730757974831724e-07, "loss": 0.0001, "step": 6701 }, { "epoch": 1.9613696224758561, "grad_norm": 0.0016042344504967332, "learning_rate": 9.657594381035997e-07, "loss": 0.0, "step": 6702 }, { "epoch": 1.961662276851039, "grad_norm": 0.00023163444711826742, "learning_rate": 9.58443078724027e-07, "loss": 0.0, "step": 6703 }, { "epoch": 1.9619549312262219, "grad_norm": 0.0005272638518363237, "learning_rate": 9.511267193444542e-07, "loss": 0.0, "step": 6704 }, { "epoch": 1.9622475856014048, "grad_norm": 0.002219425980001688, "learning_rate": 9.438103599648816e-07, "loss": 0.0, "step": 6705 }, { "epoch": 1.9625402399765877, "grad_norm": 0.0005287674139253795, "learning_rate": 9.364940005853088e-07, "loss": 0.0, "step": 6706 }, { "epoch": 1.9628328943517706, "grad_norm": 0.0002533525403123349, "learning_rate": 9.291776412057361e-07, "loss": 0.0, "step": 6707 }, { "epoch": 1.9631255487269534, "grad_norm": 0.001309822197072208, "learning_rate": 9.218612818261634e-07, "loss": 0.0, "step": 6708 }, { "epoch": 1.9634182031021363, "grad_norm": 0.0002741070347838104, "learning_rate": 9.145449224465906e-07, "loss": 0.0, "step": 6709 }, { "epoch": 1.9637108574773192, "grad_norm": 0.00017839217616710812, "learning_rate": 9.072285630670179e-07, "loss": 0.0, "step": 6710 }, { "epoch": 1.964003511852502, "grad_norm": 0.00026237950078211725, "learning_rate": 8.999122036874452e-07, "loss": 0.0, "step": 6711 }, { "epoch": 1.964296166227685, "grad_norm": 0.0003299491945654154, "learning_rate": 8.925958443078724e-07, "loss": 0.0, "step": 6712 }, { "epoch": 1.9645888206028679, "grad_norm": 0.000435072899563238, "learning_rate": 8.852794849282998e-07, "loss": 0.0, "step": 6713 }, { "epoch": 1.9648814749780508, "grad_norm": 0.0001929103018483147, "learning_rate": 8.779631255487271e-07, "loss": 0.0, "step": 6714 }, { "epoch": 1.9651741293532339, "grad_norm": 0.0003146830713376403, "learning_rate": 8.706467661691543e-07, "loss": 0.0, "step": 6715 }, { "epoch": 1.9654667837284168, "grad_norm": 0.00029584948788397014, "learning_rate": 8.633304067895816e-07, "loss": 0.0, "step": 6716 }, { "epoch": 1.9657594381035997, "grad_norm": 0.0007161550456658006, "learning_rate": 8.560140474100088e-07, "loss": 0.0, "step": 6717 }, { "epoch": 1.9660520924787825, "grad_norm": 0.0007010952103883028, "learning_rate": 8.486976880304361e-07, "loss": 0.0, "step": 6718 }, { "epoch": 1.9663447468539654, "grad_norm": 0.00037265176069922745, "learning_rate": 8.413813286508634e-07, "loss": 0.0, "step": 6719 }, { "epoch": 1.9666374012291485, "grad_norm": 0.002495676511898637, "learning_rate": 8.340649692712906e-07, "loss": 0.0, "step": 6720 }, { "epoch": 1.9669300556043314, "grad_norm": 0.0006068368093110621, "learning_rate": 8.26748609891718e-07, "loss": 0.0, "step": 6721 }, { "epoch": 1.9672227099795143, "grad_norm": 0.0013754927786067128, "learning_rate": 8.194322505121453e-07, "loss": 0.0, "step": 6722 }, { "epoch": 1.9675153643546972, "grad_norm": 0.00012472060916479677, "learning_rate": 8.121158911325725e-07, "loss": 0.0, "step": 6723 }, { "epoch": 1.96780801872988, "grad_norm": 0.0019130449509248137, "learning_rate": 8.047995317529998e-07, "loss": 0.0, "step": 6724 }, { "epoch": 1.968100673105063, "grad_norm": 0.0007002189522609115, "learning_rate": 7.974831723734271e-07, "loss": 0.0, "step": 6725 }, { "epoch": 1.9683933274802459, "grad_norm": 0.00025963722146116197, "learning_rate": 7.901668129938543e-07, "loss": 0.0, "step": 6726 }, { "epoch": 1.9686859818554288, "grad_norm": 0.00016337841225322336, "learning_rate": 7.828504536142816e-07, "loss": 0.0, "step": 6727 }, { "epoch": 1.9689786362306116, "grad_norm": 0.00019893913122359663, "learning_rate": 7.755340942347088e-07, "loss": 0.0, "step": 6728 }, { "epoch": 1.9692712906057945, "grad_norm": 0.001312097767367959, "learning_rate": 7.682177348551361e-07, "loss": 0.0, "step": 6729 }, { "epoch": 1.9695639449809774, "grad_norm": 0.003005587961524725, "learning_rate": 7.609013754755634e-07, "loss": 0.0, "step": 6730 }, { "epoch": 1.9698565993561603, "grad_norm": 0.0006958172889426351, "learning_rate": 7.535850160959907e-07, "loss": 0.0, "step": 6731 }, { "epoch": 1.9701492537313432, "grad_norm": 0.001711481250822544, "learning_rate": 7.462686567164179e-07, "loss": 0.0, "step": 6732 }, { "epoch": 1.970441908106526, "grad_norm": 0.0006918639992363751, "learning_rate": 7.389522973368452e-07, "loss": 0.0, "step": 6733 }, { "epoch": 1.970734562481709, "grad_norm": 0.00021657461184076965, "learning_rate": 7.316359379572725e-07, "loss": 0.0, "step": 6734 }, { "epoch": 1.9710272168568919, "grad_norm": 0.004396005999296904, "learning_rate": 7.243195785776997e-07, "loss": 0.0001, "step": 6735 }, { "epoch": 1.971319871232075, "grad_norm": 0.014619813300669193, "learning_rate": 7.17003219198127e-07, "loss": 0.0002, "step": 6736 }, { "epoch": 1.9716125256072579, "grad_norm": 0.005002989899367094, "learning_rate": 7.096868598185544e-07, "loss": 0.0001, "step": 6737 }, { "epoch": 1.9719051799824407, "grad_norm": 0.000817713444121182, "learning_rate": 7.023705004389816e-07, "loss": 0.0, "step": 6738 }, { "epoch": 1.9721978343576236, "grad_norm": 0.0016037857858464122, "learning_rate": 6.950541410594089e-07, "loss": 0.0, "step": 6739 }, { "epoch": 1.9724904887328065, "grad_norm": 0.0007369687082245946, "learning_rate": 6.877377816798362e-07, "loss": 0.0, "step": 6740 }, { "epoch": 1.9727831431079894, "grad_norm": 0.0002680577745195478, "learning_rate": 6.804214223002634e-07, "loss": 0.0, "step": 6741 }, { "epoch": 1.9730757974831725, "grad_norm": 0.00020591943757608533, "learning_rate": 6.731050629206907e-07, "loss": 0.0, "step": 6742 }, { "epoch": 1.9733684518583554, "grad_norm": 0.0026272779796272516, "learning_rate": 6.657887035411179e-07, "loss": 0.0, "step": 6743 }, { "epoch": 1.9736611062335383, "grad_norm": 0.0003438132116571069, "learning_rate": 6.584723441615452e-07, "loss": 0.0, "step": 6744 }, { "epoch": 1.9739537606087212, "grad_norm": 0.0003406984906177968, "learning_rate": 6.511559847819726e-07, "loss": 0.0, "step": 6745 }, { "epoch": 1.974246414983904, "grad_norm": 0.004853005520999432, "learning_rate": 6.438396254023998e-07, "loss": 0.0001, "step": 6746 }, { "epoch": 1.974539069359087, "grad_norm": 0.00017146035679616034, "learning_rate": 6.365232660228271e-07, "loss": 0.0, "step": 6747 }, { "epoch": 1.9748317237342699, "grad_norm": 0.0004404619103297591, "learning_rate": 6.292069066432544e-07, "loss": 0.0, "step": 6748 }, { "epoch": 1.9751243781094527, "grad_norm": 0.0004179801617283374, "learning_rate": 6.218905472636816e-07, "loss": 0.0, "step": 6749 }, { "epoch": 1.9754170324846356, "grad_norm": 0.00242697075009346, "learning_rate": 6.145741878841089e-07, "loss": 0.0, "step": 6750 }, { "epoch": 1.9757096868598185, "grad_norm": 0.00041327610961161554, "learning_rate": 6.072578285045362e-07, "loss": 0.0, "step": 6751 }, { "epoch": 1.9760023412350014, "grad_norm": 0.0005576847470365465, "learning_rate": 5.999414691249635e-07, "loss": 0.0, "step": 6752 }, { "epoch": 1.9762949956101843, "grad_norm": 0.0003334633365739137, "learning_rate": 5.926251097453908e-07, "loss": 0.0, "step": 6753 }, { "epoch": 1.9765876499853672, "grad_norm": 0.0013462539063766599, "learning_rate": 5.85308750365818e-07, "loss": 0.0, "step": 6754 }, { "epoch": 1.97688030436055, "grad_norm": 0.0004790348175447434, "learning_rate": 5.779923909862452e-07, "loss": 0.0, "step": 6755 }, { "epoch": 1.977172958735733, "grad_norm": 0.0011190996738150716, "learning_rate": 5.706760316066725e-07, "loss": 0.0, "step": 6756 }, { "epoch": 1.9774656131109158, "grad_norm": 0.0007272576913237572, "learning_rate": 5.633596722270998e-07, "loss": 0.0, "step": 6757 }, { "epoch": 1.977758267486099, "grad_norm": 0.0003873982059303671, "learning_rate": 5.56043312847527e-07, "loss": 0.0, "step": 6758 }, { "epoch": 1.9780509218612818, "grad_norm": 0.0011222549946978688, "learning_rate": 5.487269534679543e-07, "loss": 0.0, "step": 6759 }, { "epoch": 1.9783435762364647, "grad_norm": 0.003980504814535379, "learning_rate": 5.414105940883817e-07, "loss": 0.0001, "step": 6760 }, { "epoch": 1.9786362306116476, "grad_norm": 0.0041741845197975636, "learning_rate": 5.340942347088089e-07, "loss": 0.0001, "step": 6761 }, { "epoch": 1.9789288849868305, "grad_norm": 0.0011988022597506642, "learning_rate": 5.267778753292362e-07, "loss": 0.0, "step": 6762 }, { "epoch": 1.9792215393620136, "grad_norm": 0.0013455471489578485, "learning_rate": 5.194615159496635e-07, "loss": 0.0, "step": 6763 }, { "epoch": 1.9795141937371965, "grad_norm": 0.0007949606515467167, "learning_rate": 5.121451565700907e-07, "loss": 0.0, "step": 6764 }, { "epoch": 1.9798068481123794, "grad_norm": 0.0003572139248717576, "learning_rate": 5.04828797190518e-07, "loss": 0.0, "step": 6765 }, { "epoch": 1.9800995024875623, "grad_norm": 0.002746115205809474, "learning_rate": 4.975124378109453e-07, "loss": 0.0, "step": 6766 }, { "epoch": 1.9803921568627452, "grad_norm": 0.4750521183013916, "learning_rate": 4.901960784313725e-07, "loss": 0.0012, "step": 6767 }, { "epoch": 1.980684811237928, "grad_norm": 0.0010382290929555893, "learning_rate": 4.828797190517999e-07, "loss": 0.0, "step": 6768 }, { "epoch": 1.980977465613111, "grad_norm": 0.00034128170227631927, "learning_rate": 4.755633596722271e-07, "loss": 0.0, "step": 6769 }, { "epoch": 1.9812701199882938, "grad_norm": 0.0018365723080933094, "learning_rate": 4.682470002926544e-07, "loss": 0.0, "step": 6770 }, { "epoch": 1.9815627743634767, "grad_norm": 0.0009086875361390412, "learning_rate": 4.609306409130817e-07, "loss": 0.0, "step": 6771 }, { "epoch": 1.9818554287386596, "grad_norm": 0.00021227910474408418, "learning_rate": 4.5361428153350896e-07, "loss": 0.0, "step": 6772 }, { "epoch": 1.9821480831138425, "grad_norm": 0.003428248455747962, "learning_rate": 4.462979221539362e-07, "loss": 0.0, "step": 6773 }, { "epoch": 1.9824407374890254, "grad_norm": 0.0003043974284082651, "learning_rate": 4.3898156277436353e-07, "loss": 0.0, "step": 6774 }, { "epoch": 1.9827333918642083, "grad_norm": 0.0013239759718999267, "learning_rate": 4.316652033947908e-07, "loss": 0.0, "step": 6775 }, { "epoch": 1.9830260462393912, "grad_norm": 0.0003209889691788703, "learning_rate": 4.2434884401521806e-07, "loss": 0.0, "step": 6776 }, { "epoch": 1.983318700614574, "grad_norm": 0.002172942040488124, "learning_rate": 4.170324846356453e-07, "loss": 0.0, "step": 6777 }, { "epoch": 1.983611354989757, "grad_norm": 0.0012680566869676113, "learning_rate": 4.0971612525607264e-07, "loss": 0.0, "step": 6778 }, { "epoch": 1.98390400936494, "grad_norm": 0.0007207993185147643, "learning_rate": 4.023997658764999e-07, "loss": 0.0, "step": 6779 }, { "epoch": 1.984196663740123, "grad_norm": 0.0004271386715117842, "learning_rate": 3.9508340649692716e-07, "loss": 0.0, "step": 6780 }, { "epoch": 1.9844893181153058, "grad_norm": 0.0005705247749574482, "learning_rate": 3.877670471173544e-07, "loss": 0.0, "step": 6781 }, { "epoch": 1.9847819724904887, "grad_norm": 0.001100093242712319, "learning_rate": 3.804506877377817e-07, "loss": 0.0, "step": 6782 }, { "epoch": 1.9850746268656716, "grad_norm": 0.0004704415041487664, "learning_rate": 3.7313432835820895e-07, "loss": 0.0, "step": 6783 }, { "epoch": 1.9853672812408547, "grad_norm": 0.00019834673730656505, "learning_rate": 3.6581796897863626e-07, "loss": 0.0, "step": 6784 }, { "epoch": 1.9856599356160376, "grad_norm": 0.004286408890038729, "learning_rate": 3.585016095990635e-07, "loss": 0.0001, "step": 6785 }, { "epoch": 1.9859525899912205, "grad_norm": 0.00397590221837163, "learning_rate": 3.511852502194908e-07, "loss": 0.0, "step": 6786 }, { "epoch": 1.9862452443664034, "grad_norm": 0.0005816129269078374, "learning_rate": 3.438688908399181e-07, "loss": 0.0, "step": 6787 }, { "epoch": 1.9865378987415863, "grad_norm": 0.00028890985413454473, "learning_rate": 3.3655253146034536e-07, "loss": 0.0, "step": 6788 }, { "epoch": 1.9868305531167691, "grad_norm": 0.0007584394188597798, "learning_rate": 3.292361720807726e-07, "loss": 0.0, "step": 6789 }, { "epoch": 1.987123207491952, "grad_norm": 0.00016148912254720926, "learning_rate": 3.219198127011999e-07, "loss": 0.0, "step": 6790 }, { "epoch": 1.987415861867135, "grad_norm": 0.0012093628756701946, "learning_rate": 3.146034533216272e-07, "loss": 0.0, "step": 6791 }, { "epoch": 1.9877085162423178, "grad_norm": 0.000516733038239181, "learning_rate": 3.0728709394205446e-07, "loss": 0.0, "step": 6792 }, { "epoch": 1.9880011706175007, "grad_norm": 0.0004513988096732646, "learning_rate": 2.999707345624817e-07, "loss": 0.0, "step": 6793 }, { "epoch": 1.9882938249926836, "grad_norm": 0.00036689036642201245, "learning_rate": 2.92654375182909e-07, "loss": 0.0, "step": 6794 }, { "epoch": 1.9885864793678665, "grad_norm": 0.004359518177807331, "learning_rate": 2.8533801580333625e-07, "loss": 0.0001, "step": 6795 }, { "epoch": 1.9888791337430494, "grad_norm": 0.0005313651636242867, "learning_rate": 2.780216564237635e-07, "loss": 0.0, "step": 6796 }, { "epoch": 1.9891717881182323, "grad_norm": 0.002708370564505458, "learning_rate": 2.7070529704419083e-07, "loss": 0.0, "step": 6797 }, { "epoch": 1.9894644424934151, "grad_norm": 0.0008690576069056988, "learning_rate": 2.633889376646181e-07, "loss": 0.0, "step": 6798 }, { "epoch": 1.989757096868598, "grad_norm": 0.0009381992858834565, "learning_rate": 2.5607257828504535e-07, "loss": 0.0, "step": 6799 }, { "epoch": 1.9900497512437811, "grad_norm": 0.0011495305225253105, "learning_rate": 2.4875621890547267e-07, "loss": 0.0, "step": 6800 }, { "epoch": 1.990342405618964, "grad_norm": 0.0016372364480048418, "learning_rate": 2.4143985952589993e-07, "loss": 0.0, "step": 6801 }, { "epoch": 1.990635059994147, "grad_norm": 0.000637992168776691, "learning_rate": 2.341235001463272e-07, "loss": 0.0, "step": 6802 }, { "epoch": 1.9909277143693298, "grad_norm": 0.008570825681090355, "learning_rate": 2.2680714076675448e-07, "loss": 0.0001, "step": 6803 }, { "epoch": 1.9912203687445127, "grad_norm": 0.000408270483603701, "learning_rate": 2.1949078138718177e-07, "loss": 0.0, "step": 6804 }, { "epoch": 1.9915130231196958, "grad_norm": 0.004543962422758341, "learning_rate": 2.1217442200760903e-07, "loss": 0.0001, "step": 6805 }, { "epoch": 1.9918056774948787, "grad_norm": 0.007146683521568775, "learning_rate": 2.0485806262803632e-07, "loss": 0.0, "step": 6806 }, { "epoch": 1.9920983318700616, "grad_norm": 0.0007193053606897593, "learning_rate": 1.9754170324846358e-07, "loss": 0.0, "step": 6807 }, { "epoch": 1.9923909862452445, "grad_norm": 0.0036593666300177574, "learning_rate": 1.9022534386889084e-07, "loss": 0.0001, "step": 6808 }, { "epoch": 1.9926836406204274, "grad_norm": 0.00022914950386621058, "learning_rate": 1.8290898448931813e-07, "loss": 0.0, "step": 6809 }, { "epoch": 1.9929762949956102, "grad_norm": 0.06791463494300842, "learning_rate": 1.755926251097454e-07, "loss": 0.0004, "step": 6810 }, { "epoch": 1.9932689493707931, "grad_norm": 0.0003776339872274548, "learning_rate": 1.6827626573017268e-07, "loss": 0.0, "step": 6811 }, { "epoch": 1.993561603745976, "grad_norm": 0.0002613313845358789, "learning_rate": 1.6095990635059994e-07, "loss": 0.0, "step": 6812 }, { "epoch": 1.993854258121159, "grad_norm": 0.0006130607798695564, "learning_rate": 1.5364354697102723e-07, "loss": 0.0, "step": 6813 }, { "epoch": 1.9941469124963418, "grad_norm": 0.0002516958920750767, "learning_rate": 1.463271875914545e-07, "loss": 0.0, "step": 6814 }, { "epoch": 1.9944395668715247, "grad_norm": 0.007760388310998678, "learning_rate": 1.3901082821188176e-07, "loss": 0.0001, "step": 6815 }, { "epoch": 1.9947322212467076, "grad_norm": 0.00039384610136039555, "learning_rate": 1.3169446883230904e-07, "loss": 0.0, "step": 6816 }, { "epoch": 1.9950248756218905, "grad_norm": 0.0006702264072373509, "learning_rate": 1.2437810945273633e-07, "loss": 0.0, "step": 6817 }, { "epoch": 1.9953175299970733, "grad_norm": 0.000267669529421255, "learning_rate": 1.170617500731636e-07, "loss": 0.0, "step": 6818 }, { "epoch": 1.9956101843722562, "grad_norm": 0.0061849369667470455, "learning_rate": 1.0974539069359088e-07, "loss": 0.0001, "step": 6819 }, { "epoch": 1.9959028387474391, "grad_norm": 0.007272869814187288, "learning_rate": 1.0242903131401816e-07, "loss": 0.0, "step": 6820 }, { "epoch": 1.9961954931226222, "grad_norm": 0.003008312778547406, "learning_rate": 9.511267193444542e-08, "loss": 0.0, "step": 6821 }, { "epoch": 1.9964881474978051, "grad_norm": 0.000999949057586491, "learning_rate": 8.77963125548727e-08, "loss": 0.0, "step": 6822 }, { "epoch": 1.996780801872988, "grad_norm": 0.00047217021347023547, "learning_rate": 8.047995317529997e-08, "loss": 0.0, "step": 6823 }, { "epoch": 1.997073456248171, "grad_norm": 0.0005122284637764096, "learning_rate": 7.316359379572725e-08, "loss": 0.0, "step": 6824 }, { "epoch": 1.9973661106233538, "grad_norm": 0.06620123982429504, "learning_rate": 6.584723441615452e-08, "loss": 0.0003, "step": 6825 }, { "epoch": 1.9976587649985367, "grad_norm": 0.0002497498644515872, "learning_rate": 5.85308750365818e-08, "loss": 0.0, "step": 6826 }, { "epoch": 1.9979514193737198, "grad_norm": 0.0007749227224849164, "learning_rate": 5.121451565700908e-08, "loss": 0.0, "step": 6827 }, { "epoch": 1.9982440737489027, "grad_norm": 0.0003250261361245066, "learning_rate": 4.389815627743635e-08, "loss": 0.0, "step": 6828 }, { "epoch": 1.9985367281240856, "grad_norm": 0.000477856578072533, "learning_rate": 3.6581796897863623e-08, "loss": 0.0, "step": 6829 }, { "epoch": 1.9988293824992684, "grad_norm": 0.0005425257841125131, "learning_rate": 2.92654375182909e-08, "loss": 0.0, "step": 6830 }, { "epoch": 1.9991220368744513, "grad_norm": 0.0002844630216713995, "learning_rate": 2.1949078138718174e-08, "loss": 0.0, "step": 6831 }, { "epoch": 1.9994146912496342, "grad_norm": 0.0004807655932381749, "learning_rate": 1.463271875914545e-08, "loss": 0.0, "step": 6832 }, { "epoch": 1.9997073456248171, "grad_norm": 0.0002281743654748425, "learning_rate": 7.316359379572725e-09, "loss": 0.0, "step": 6833 }, { "epoch": 2.0, "grad_norm": 4.235203596181236e-05, "learning_rate": 0.0, "loss": 0.0, "step": 6834 } ], "logging_steps": 1, "max_steps": 6834, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.481922672034893e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }