{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0003621220351258, "eval_steps": 346, "global_step": 1381, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007242440702516748, "grad_norm": 0.22644081711769104, "learning_rate": 2e-05, "loss": 10.3782, "step": 1 }, { "epoch": 0.0014484881405033496, "grad_norm": 0.21413658559322357, "learning_rate": 4e-05, "loss": 10.378, "step": 2 }, { "epoch": 0.0021727322107550242, "grad_norm": 0.20930856466293335, "learning_rate": 6e-05, "loss": 10.3801, "step": 3 }, { "epoch": 0.002896976281006699, "grad_norm": 0.21580834686756134, "learning_rate": 8e-05, "loss": 10.3772, "step": 4 }, { "epoch": 0.003621220351258374, "grad_norm": 0.22320739924907684, "learning_rate": 0.0001, "loss": 10.3771, "step": 5 }, { "epoch": 0.0043454644215100485, "grad_norm": 0.20463065803050995, "learning_rate": 0.00012, "loss": 10.3765, "step": 6 }, { "epoch": 0.005069708491761723, "grad_norm": 0.2269536256790161, "learning_rate": 0.00014, "loss": 10.374, "step": 7 }, { "epoch": 0.005793952562013398, "grad_norm": 0.23299536108970642, "learning_rate": 0.00016, "loss": 10.3733, "step": 8 }, { "epoch": 0.006518196632265073, "grad_norm": 0.2280343770980835, "learning_rate": 0.00018, "loss": 10.3716, "step": 9 }, { "epoch": 0.007242440702516748, "grad_norm": 0.23425506055355072, "learning_rate": 0.0002, "loss": 10.371, "step": 10 }, { "epoch": 0.007966684772768424, "grad_norm": 0.26600638031959534, "learning_rate": 0.00019999973746050225, "loss": 10.3668, "step": 11 }, { "epoch": 0.008690928843020097, "grad_norm": 0.2833026051521301, "learning_rate": 0.00019999894984338746, "loss": 10.3668, "step": 12 }, { "epoch": 0.009415172913271772, "grad_norm": 0.31404921412467957, "learning_rate": 0.00019999763715279132, "loss": 10.3583, "step": 13 }, { "epoch": 0.010139416983523447, "grad_norm": 0.3399479389190674, "learning_rate": 0.00019999579939560644, "loss": 10.3628, "step": 14 }, { "epoch": 0.010863661053775122, "grad_norm": 0.3927531838417053, "learning_rate": 0.00019999343658148253, "loss": 10.3534, "step": 15 }, { "epoch": 0.011587905124026797, "grad_norm": 0.43685317039489746, "learning_rate": 0.00019999054872282622, "loss": 10.3498, "step": 16 }, { "epoch": 0.012312149194278471, "grad_norm": 0.5002313852310181, "learning_rate": 0.00019998713583480103, "loss": 10.3404, "step": 17 }, { "epoch": 0.013036393264530146, "grad_norm": 0.5821593999862671, "learning_rate": 0.00019998319793532735, "loss": 10.3355, "step": 18 }, { "epoch": 0.013760637334781821, "grad_norm": 0.6062823534011841, "learning_rate": 0.00019997873504508222, "loss": 10.329, "step": 19 }, { "epoch": 0.014484881405033496, "grad_norm": 0.6338623762130737, "learning_rate": 0.0001999737471874994, "loss": 10.3277, "step": 20 }, { "epoch": 0.015209125475285171, "grad_norm": 0.7111278176307678, "learning_rate": 0.00019996823438876902, "loss": 10.3189, "step": 21 }, { "epoch": 0.015933369545536848, "grad_norm": 0.7820473313331604, "learning_rate": 0.00019996219667783765, "loss": 10.3128, "step": 22 }, { "epoch": 0.01665761361578852, "grad_norm": 0.8882144093513489, "learning_rate": 0.00019995563408640806, "loss": 10.3005, "step": 23 }, { "epoch": 0.017381857686040194, "grad_norm": 0.9162967205047607, "learning_rate": 0.00019994854664893906, "loss": 10.2918, "step": 24 }, { "epoch": 0.01810610175629187, "grad_norm": 0.9656468033790588, "learning_rate": 0.00019994093440264522, "loss": 10.277, "step": 25 }, { "epoch": 0.018830345826543544, "grad_norm": 0.9979098439216614, "learning_rate": 0.00019993279738749687, "loss": 10.2707, "step": 26 }, { "epoch": 0.01955458989679522, "grad_norm": 0.9871697425842285, "learning_rate": 0.00019992413564621985, "loss": 10.2556, "step": 27 }, { "epoch": 0.020278833967046894, "grad_norm": 1.0156805515289307, "learning_rate": 0.00019991494922429504, "loss": 10.2469, "step": 28 }, { "epoch": 0.02100307803729857, "grad_norm": 0.9702807068824768, "learning_rate": 0.00019990523816995848, "loss": 10.2315, "step": 29 }, { "epoch": 0.021727322107550243, "grad_norm": 0.9247342348098755, "learning_rate": 0.0001998950025342008, "loss": 10.2134, "step": 30 }, { "epoch": 0.022451566177801918, "grad_norm": 0.8712878227233887, "learning_rate": 0.00019988424237076728, "loss": 10.1968, "step": 31 }, { "epoch": 0.023175810248053593, "grad_norm": 0.8503078818321228, "learning_rate": 0.0001998729577361572, "loss": 10.1812, "step": 32 }, { "epoch": 0.023900054318305268, "grad_norm": 0.8318528532981873, "learning_rate": 0.0001998611486896238, "loss": 10.1752, "step": 33 }, { "epoch": 0.024624298388556943, "grad_norm": 0.7345334887504578, "learning_rate": 0.00019984881529317393, "loss": 10.1601, "step": 34 }, { "epoch": 0.025348542458808618, "grad_norm": 0.7379999160766602, "learning_rate": 0.0001998359576115677, "loss": 10.1499, "step": 35 }, { "epoch": 0.026072786529060293, "grad_norm": 0.674487829208374, "learning_rate": 0.00019982257571231804, "loss": 10.1309, "step": 36 }, { "epoch": 0.026797030599311968, "grad_norm": 0.6472688317298889, "learning_rate": 0.00019980866966569054, "loss": 10.1182, "step": 37 }, { "epoch": 0.027521274669563642, "grad_norm": 0.6024581789970398, "learning_rate": 0.00019979423954470286, "loss": 10.1074, "step": 38 }, { "epoch": 0.028245518739815317, "grad_norm": 0.610586404800415, "learning_rate": 0.0001997792854251246, "loss": 10.0958, "step": 39 }, { "epoch": 0.028969762810066992, "grad_norm": 0.5543527007102966, "learning_rate": 0.00019976380738547666, "loss": 10.0964, "step": 40 }, { "epoch": 0.029694006880318667, "grad_norm": 0.5993051528930664, "learning_rate": 0.000199747805507031, "loss": 10.0835, "step": 41 }, { "epoch": 0.030418250950570342, "grad_norm": 0.5518122911453247, "learning_rate": 0.0001997312798738101, "loss": 10.0662, "step": 42 }, { "epoch": 0.031142495020822017, "grad_norm": 0.5521525144577026, "learning_rate": 0.00019971423057258664, "loss": 10.0451, "step": 43 }, { "epoch": 0.031866739091073695, "grad_norm": 0.5692468285560608, "learning_rate": 0.00019969665769288284, "loss": 10.0466, "step": 44 }, { "epoch": 0.03259098316132537, "grad_norm": 0.5686005353927612, "learning_rate": 0.00019967856132697027, "loss": 10.0256, "step": 45 }, { "epoch": 0.03331522723157704, "grad_norm": 0.5743457078933716, "learning_rate": 0.00019965994156986912, "loss": 10.0049, "step": 46 }, { "epoch": 0.034039471301828716, "grad_norm": 0.5410902500152588, "learning_rate": 0.0001996407985193478, "loss": 10.0077, "step": 47 }, { "epoch": 0.03476371537208039, "grad_norm": 0.5433526635169983, "learning_rate": 0.0001996211322759225, "loss": 9.9934, "step": 48 }, { "epoch": 0.035487959442332066, "grad_norm": 0.5521161556243896, "learning_rate": 0.00019960094294285647, "loss": 9.9753, "step": 49 }, { "epoch": 0.03621220351258374, "grad_norm": 0.6015803217887878, "learning_rate": 0.00019958023062615973, "loss": 9.976, "step": 50 }, { "epoch": 0.036936447582835416, "grad_norm": 0.5433859825134277, "learning_rate": 0.00019955899543458824, "loss": 9.9617, "step": 51 }, { "epoch": 0.03766069165308709, "grad_norm": 0.5231461524963379, "learning_rate": 0.00019953723747964355, "loss": 9.9525, "step": 52 }, { "epoch": 0.038384935723338766, "grad_norm": 0.5396632552146912, "learning_rate": 0.00019951495687557213, "loss": 9.9383, "step": 53 }, { "epoch": 0.03910917979359044, "grad_norm": 0.5196948051452637, "learning_rate": 0.00019949215373936475, "loss": 9.928, "step": 54 }, { "epoch": 0.039833423863842116, "grad_norm": 0.4962172210216522, "learning_rate": 0.00019946882819075587, "loss": 9.9235, "step": 55 }, { "epoch": 0.04055766793409379, "grad_norm": 0.5424382090568542, "learning_rate": 0.00019944498035222305, "loss": 9.9033, "step": 56 }, { "epoch": 0.041281912004345465, "grad_norm": 0.5209097266197205, "learning_rate": 0.00019942061034898626, "loss": 9.902, "step": 57 }, { "epoch": 0.04200615607459714, "grad_norm": 0.4981401562690735, "learning_rate": 0.00019939571830900735, "loss": 9.8884, "step": 58 }, { "epoch": 0.042730400144848815, "grad_norm": 0.4946765899658203, "learning_rate": 0.0001993703043629891, "loss": 9.8841, "step": 59 }, { "epoch": 0.04345464421510049, "grad_norm": 0.526029646396637, "learning_rate": 0.00019934436864437485, "loss": 9.8607, "step": 60 }, { "epoch": 0.044178888285352165, "grad_norm": 0.528313159942627, "learning_rate": 0.0001993179112893476, "loss": 9.8549, "step": 61 }, { "epoch": 0.044903132355603836, "grad_norm": 0.5022369623184204, "learning_rate": 0.00019929093243682938, "loss": 9.83, "step": 62 }, { "epoch": 0.045627376425855515, "grad_norm": 0.5330261588096619, "learning_rate": 0.00019926343222848042, "loss": 9.8244, "step": 63 }, { "epoch": 0.046351620496107186, "grad_norm": 0.5101606249809265, "learning_rate": 0.0001992354108086986, "loss": 9.8145, "step": 64 }, { "epoch": 0.047075864566358865, "grad_norm": 0.5162172317504883, "learning_rate": 0.0001992068683246185, "loss": 9.8123, "step": 65 }, { "epoch": 0.047800108636610536, "grad_norm": 0.5125104784965515, "learning_rate": 0.0001991778049261107, "loss": 9.7947, "step": 66 }, { "epoch": 0.048524352706862214, "grad_norm": 0.5619991421699524, "learning_rate": 0.00019914822076578097, "loss": 9.773, "step": 67 }, { "epoch": 0.049248596777113886, "grad_norm": 0.5185785889625549, "learning_rate": 0.0001991181159989696, "loss": 9.7618, "step": 68 }, { "epoch": 0.049972840847365564, "grad_norm": 0.4937543272972107, "learning_rate": 0.0001990874907837503, "loss": 9.7543, "step": 69 }, { "epoch": 0.050697084917617236, "grad_norm": 0.5016326904296875, "learning_rate": 0.00019905634528092972, "loss": 9.7509, "step": 70 }, { "epoch": 0.051421328987868914, "grad_norm": 0.5426475405693054, "learning_rate": 0.0001990246796540463, "loss": 9.7287, "step": 71 }, { "epoch": 0.052145573058120585, "grad_norm": 0.5001341700553894, "learning_rate": 0.00019899249406936964, "loss": 9.7159, "step": 72 }, { "epoch": 0.052869817128372264, "grad_norm": 0.5133237242698669, "learning_rate": 0.00019895978869589946, "loss": 9.7129, "step": 73 }, { "epoch": 0.053594061198623935, "grad_norm": 0.5022423267364502, "learning_rate": 0.00019892656370536482, "loss": 9.6986, "step": 74 }, { "epoch": 0.054318305268875614, "grad_norm": 0.5012578368186951, "learning_rate": 0.0001988928192722231, "loss": 9.6968, "step": 75 }, { "epoch": 0.055042549339127285, "grad_norm": 0.5184592008590698, "learning_rate": 0.00019885855557365937, "loss": 9.6754, "step": 76 }, { "epoch": 0.05576679340937896, "grad_norm": 0.49060916900634766, "learning_rate": 0.000198823772789585, "loss": 9.6662, "step": 77 }, { "epoch": 0.056491037479630635, "grad_norm": 0.5203105211257935, "learning_rate": 0.0001987884711026371, "loss": 9.6577, "step": 78 }, { "epoch": 0.05721528154988231, "grad_norm": 0.5055034160614014, "learning_rate": 0.00019875265069817743, "loss": 9.6392, "step": 79 }, { "epoch": 0.057939525620133984, "grad_norm": 0.5237613320350647, "learning_rate": 0.00019871631176429145, "loss": 9.6415, "step": 80 }, { "epoch": 0.05866376969038566, "grad_norm": 0.5136992931365967, "learning_rate": 0.0001986794544917872, "loss": 9.6265, "step": 81 }, { "epoch": 0.059388013760637334, "grad_norm": 0.5193312168121338, "learning_rate": 0.00019864207907419447, "loss": 9.6138, "step": 82 }, { "epoch": 0.06011225783088901, "grad_norm": 0.5376698970794678, "learning_rate": 0.0001986041857077638, "loss": 9.6057, "step": 83 }, { "epoch": 0.060836501901140684, "grad_norm": 0.52781081199646, "learning_rate": 0.00019856577459146526, "loss": 9.5839, "step": 84 }, { "epoch": 0.06156074597139236, "grad_norm": 0.5407407879829407, "learning_rate": 0.00019852684592698756, "loss": 9.5891, "step": 85 }, { "epoch": 0.062284990041644034, "grad_norm": 0.5036855936050415, "learning_rate": 0.0001984873999187369, "loss": 9.5722, "step": 86 }, { "epoch": 0.06300923411189571, "grad_norm": 0.5385954976081848, "learning_rate": 0.00019844743677383604, "loss": 9.5572, "step": 87 }, { "epoch": 0.06373347818214739, "grad_norm": 0.5041705965995789, "learning_rate": 0.00019840695670212302, "loss": 9.5528, "step": 88 }, { "epoch": 0.06445772225239906, "grad_norm": 0.5461394190788269, "learning_rate": 0.00019836595991615022, "loss": 9.5181, "step": 89 }, { "epoch": 0.06518196632265073, "grad_norm": 0.5438934564590454, "learning_rate": 0.00019832444663118315, "loss": 9.5303, "step": 90 }, { "epoch": 0.06590621039290241, "grad_norm": 0.5632613301277161, "learning_rate": 0.00019828241706519934, "loss": 9.5119, "step": 91 }, { "epoch": 0.06663045446315408, "grad_norm": 0.4781850278377533, "learning_rate": 0.0001982398714388872, "loss": 9.5208, "step": 92 }, { "epoch": 0.06735469853340575, "grad_norm": 0.5311410427093506, "learning_rate": 0.00019819680997564492, "loss": 9.5111, "step": 93 }, { "epoch": 0.06807894260365743, "grad_norm": 0.5310368537902832, "learning_rate": 0.00019815323290157916, "loss": 9.4963, "step": 94 }, { "epoch": 0.06880318667390911, "grad_norm": 0.5059214234352112, "learning_rate": 0.000198109140445504, "loss": 9.4878, "step": 95 }, { "epoch": 0.06952743074416078, "grad_norm": 0.517311155796051, "learning_rate": 0.00019806453283893963, "loss": 9.478, "step": 96 }, { "epoch": 0.07025167481441245, "grad_norm": 0.5341803431510925, "learning_rate": 0.00019801941031611126, "loss": 9.4307, "step": 97 }, { "epoch": 0.07097591888466413, "grad_norm": 0.5468002557754517, "learning_rate": 0.0001979737731139478, "loss": 9.462, "step": 98 }, { "epoch": 0.07170016295491581, "grad_norm": 0.5668586492538452, "learning_rate": 0.00019792762147208056, "loss": 9.4286, "step": 99 }, { "epoch": 0.07242440702516748, "grad_norm": 0.6626901626586914, "learning_rate": 0.00019788095563284217, "loss": 9.4239, "step": 100 }, { "epoch": 0.07314865109541915, "grad_norm": 0.5568472743034363, "learning_rate": 0.00019783377584126508, "loss": 9.4093, "step": 101 }, { "epoch": 0.07387289516567083, "grad_norm": 0.5283701419830322, "learning_rate": 0.00019778608234508055, "loss": 9.4273, "step": 102 }, { "epoch": 0.07459713923592251, "grad_norm": 0.5079399943351746, "learning_rate": 0.00019773787539471705, "loss": 9.3889, "step": 103 }, { "epoch": 0.07532138330617417, "grad_norm": 0.5189304351806641, "learning_rate": 0.00019768915524329917, "loss": 9.3884, "step": 104 }, { "epoch": 0.07604562737642585, "grad_norm": 0.504858136177063, "learning_rate": 0.00019763992214664615, "loss": 9.4027, "step": 105 }, { "epoch": 0.07676987144667753, "grad_norm": 0.5144493579864502, "learning_rate": 0.00019759017636327073, "loss": 9.3547, "step": 106 }, { "epoch": 0.07749411551692921, "grad_norm": 0.49943238496780396, "learning_rate": 0.0001975399181543775, "loss": 9.3584, "step": 107 }, { "epoch": 0.07821835958718087, "grad_norm": 0.5128799080848694, "learning_rate": 0.0001974891477838618, "loss": 9.3521, "step": 108 }, { "epoch": 0.07894260365743255, "grad_norm": 0.5011847019195557, "learning_rate": 0.00019743786551830813, "loss": 9.3468, "step": 109 }, { "epoch": 0.07966684772768423, "grad_norm": 0.5017261505126953, "learning_rate": 0.00019738607162698895, "loss": 9.3335, "step": 110 }, { "epoch": 0.08039109179793591, "grad_norm": 0.5072489976882935, "learning_rate": 0.00019733376638186308, "loss": 9.3045, "step": 111 }, { "epoch": 0.08111533586818757, "grad_norm": 0.5118771195411682, "learning_rate": 0.00019728095005757434, "loss": 9.3186, "step": 112 }, { "epoch": 0.08183957993843925, "grad_norm": 0.5291309356689453, "learning_rate": 0.0001972276229314502, "loss": 9.3077, "step": 113 }, { "epoch": 0.08256382400869093, "grad_norm": 0.49927669763565063, "learning_rate": 0.00019717378528350023, "loss": 9.2945, "step": 114 }, { "epoch": 0.08328806807894261, "grad_norm": 0.5379682183265686, "learning_rate": 0.00019711943739641452, "loss": 9.2844, "step": 115 }, { "epoch": 0.08401231214919427, "grad_norm": 0.5146782398223877, "learning_rate": 0.00019706457955556247, "loss": 9.2589, "step": 116 }, { "epoch": 0.08473655621944595, "grad_norm": 0.5160855054855347, "learning_rate": 0.0001970092120489911, "loss": 9.2522, "step": 117 }, { "epoch": 0.08546080028969763, "grad_norm": 0.5257864594459534, "learning_rate": 0.0001969533351674235, "loss": 9.2622, "step": 118 }, { "epoch": 0.08618504435994931, "grad_norm": 0.4924103617668152, "learning_rate": 0.00019689694920425746, "loss": 9.2382, "step": 119 }, { "epoch": 0.08690928843020097, "grad_norm": 0.5249960422515869, "learning_rate": 0.00019684005445556383, "loss": 9.2173, "step": 120 }, { "epoch": 0.08763353250045265, "grad_norm": 0.5177789330482483, "learning_rate": 0.000196782651220085, "loss": 9.2171, "step": 121 }, { "epoch": 0.08835777657070433, "grad_norm": 0.5228692889213562, "learning_rate": 0.0001967247397992333, "loss": 9.2168, "step": 122 }, { "epoch": 0.08908202064095601, "grad_norm": 0.5426861047744751, "learning_rate": 0.00019666632049708942, "loss": 9.1786, "step": 123 }, { "epoch": 0.08980626471120767, "grad_norm": 0.5005182027816772, "learning_rate": 0.00019660739362040082, "loss": 9.1994, "step": 124 }, { "epoch": 0.09053050878145935, "grad_norm": 0.516734778881073, "learning_rate": 0.00019654795947858023, "loss": 9.172, "step": 125 }, { "epoch": 0.09125475285171103, "grad_norm": 0.5049043297767639, "learning_rate": 0.00019648801838370377, "loss": 9.1714, "step": 126 }, { "epoch": 0.09197899692196271, "grad_norm": 0.5162491202354431, "learning_rate": 0.00019642757065050956, "loss": 9.149, "step": 127 }, { "epoch": 0.09270324099221437, "grad_norm": 0.49220365285873413, "learning_rate": 0.000196366616596396, "loss": 9.164, "step": 128 }, { "epoch": 0.09342748506246605, "grad_norm": 0.508711576461792, "learning_rate": 0.00019630515654141996, "loss": 9.1311, "step": 129 }, { "epoch": 0.09415172913271773, "grad_norm": 0.5290361642837524, "learning_rate": 0.0001962431908082953, "loss": 9.1079, "step": 130 }, { "epoch": 0.09487597320296941, "grad_norm": 0.5396744608879089, "learning_rate": 0.00019618071972239107, "loss": 9.115, "step": 131 }, { "epoch": 0.09560021727322107, "grad_norm": 0.49481114745140076, "learning_rate": 0.0001961177436117298, "loss": 9.1326, "step": 132 }, { "epoch": 0.09632446134347275, "grad_norm": 0.4906870126724243, "learning_rate": 0.0001960542628069859, "loss": 9.1192, "step": 133 }, { "epoch": 0.09704870541372443, "grad_norm": 0.5055575370788574, "learning_rate": 0.00019599027764148367, "loss": 9.1099, "step": 134 }, { "epoch": 0.0977729494839761, "grad_norm": 0.5195087790489197, "learning_rate": 0.00019592578845119575, "loss": 9.0824, "step": 135 }, { "epoch": 0.09849719355422777, "grad_norm": 0.48597005009651184, "learning_rate": 0.0001958607955747414, "loss": 9.0973, "step": 136 }, { "epoch": 0.09922143762447945, "grad_norm": 0.5294395089149475, "learning_rate": 0.0001957952993533845, "loss": 9.0725, "step": 137 }, { "epoch": 0.09994568169473113, "grad_norm": 0.5333034992218018, "learning_rate": 0.00019572930013103202, "loss": 9.0177, "step": 138 }, { "epoch": 0.10066992576498279, "grad_norm": 0.5079728960990906, "learning_rate": 0.00019566279825423196, "loss": 9.0231, "step": 139 }, { "epoch": 0.10139416983523447, "grad_norm": 0.5101923942565918, "learning_rate": 0.00019559579407217172, "loss": 9.0478, "step": 140 }, { "epoch": 0.10211841390548615, "grad_norm": 0.5175127387046814, "learning_rate": 0.0001955282879366762, "loss": 9.0333, "step": 141 }, { "epoch": 0.10284265797573783, "grad_norm": 0.5011202692985535, "learning_rate": 0.00019546028020220595, "loss": 9.0051, "step": 142 }, { "epoch": 0.10356690204598949, "grad_norm": 0.5366819500923157, "learning_rate": 0.00019539177122585523, "loss": 9.014, "step": 143 }, { "epoch": 0.10429114611624117, "grad_norm": 0.5233102440834045, "learning_rate": 0.00019532276136735038, "loss": 8.9821, "step": 144 }, { "epoch": 0.10501539018649285, "grad_norm": 0.5242369771003723, "learning_rate": 0.00019525325098904757, "loss": 9.0231, "step": 145 }, { "epoch": 0.10573963425674453, "grad_norm": 0.5241793394088745, "learning_rate": 0.00019518324045593132, "loss": 8.9769, "step": 146 }, { "epoch": 0.10646387832699619, "grad_norm": 0.5393111109733582, "learning_rate": 0.0001951127301356121, "loss": 9.0189, "step": 147 }, { "epoch": 0.10718812239724787, "grad_norm": 0.5141487717628479, "learning_rate": 0.00019504172039832492, "loss": 8.9846, "step": 148 }, { "epoch": 0.10791236646749955, "grad_norm": 0.5347380042076111, "learning_rate": 0.00019497021161692687, "loss": 8.9623, "step": 149 }, { "epoch": 0.10863661053775123, "grad_norm": 0.5859756469726562, "learning_rate": 0.00019489820416689565, "loss": 8.9612, "step": 150 }, { "epoch": 0.10936085460800289, "grad_norm": 0.5630178451538086, "learning_rate": 0.0001948256984263272, "loss": 8.9316, "step": 151 }, { "epoch": 0.11008509867825457, "grad_norm": 0.5126703381538391, "learning_rate": 0.00019475269477593394, "loss": 8.9421, "step": 152 }, { "epoch": 0.11080934274850625, "grad_norm": 0.5410084128379822, "learning_rate": 0.0001946791935990427, "loss": 8.9093, "step": 153 }, { "epoch": 0.11153358681875793, "grad_norm": 0.500529408454895, "learning_rate": 0.00019460519528159275, "loss": 8.9258, "step": 154 }, { "epoch": 0.11225783088900959, "grad_norm": 0.505597710609436, "learning_rate": 0.00019453070021213366, "loss": 8.9067, "step": 155 }, { "epoch": 0.11298207495926127, "grad_norm": 0.5078559517860413, "learning_rate": 0.00019445570878182342, "loss": 8.8985, "step": 156 }, { "epoch": 0.11370631902951295, "grad_norm": 0.48962968587875366, "learning_rate": 0.0001943802213844263, "loss": 8.8889, "step": 157 }, { "epoch": 0.11443056309976463, "grad_norm": 0.521984338760376, "learning_rate": 0.00019430423841631074, "loss": 8.8726, "step": 158 }, { "epoch": 0.11515480717001629, "grad_norm": 0.510870635509491, "learning_rate": 0.00019422776027644737, "loss": 8.8657, "step": 159 }, { "epoch": 0.11587905124026797, "grad_norm": 0.48674073815345764, "learning_rate": 0.0001941507873664068, "loss": 8.8745, "step": 160 }, { "epoch": 0.11660329531051965, "grad_norm": 0.5165760517120361, "learning_rate": 0.00019407332009035769, "loss": 8.8614, "step": 161 }, { "epoch": 0.11732753938077133, "grad_norm": 0.5049470067024231, "learning_rate": 0.00019399535885506432, "loss": 8.8781, "step": 162 }, { "epoch": 0.11805178345102299, "grad_norm": 0.49627047777175903, "learning_rate": 0.00019391690406988485, "loss": 8.832, "step": 163 }, { "epoch": 0.11877602752127467, "grad_norm": 0.5152491331100464, "learning_rate": 0.00019383795614676886, "loss": 8.852, "step": 164 }, { "epoch": 0.11950027159152635, "grad_norm": 0.49169403314590454, "learning_rate": 0.00019375851550025529, "loss": 8.8621, "step": 165 }, { "epoch": 0.12022451566177803, "grad_norm": 0.5024704337120056, "learning_rate": 0.00019367858254747028, "loss": 8.8057, "step": 166 }, { "epoch": 0.12094875973202969, "grad_norm": 0.5297830104827881, "learning_rate": 0.00019359815770812503, "loss": 8.7918, "step": 167 }, { "epoch": 0.12167300380228137, "grad_norm": 0.5009456276893616, "learning_rate": 0.00019351724140451344, "loss": 8.795, "step": 168 }, { "epoch": 0.12239724787253305, "grad_norm": 0.49791526794433594, "learning_rate": 0.00019343583406151004, "loss": 8.7908, "step": 169 }, { "epoch": 0.12312149194278472, "grad_norm": 0.4869917035102844, "learning_rate": 0.00019335393610656767, "loss": 8.8117, "step": 170 }, { "epoch": 0.12384573601303639, "grad_norm": 0.4747772216796875, "learning_rate": 0.00019327154796971527, "loss": 8.7738, "step": 171 }, { "epoch": 0.12456998008328807, "grad_norm": 0.49815577268600464, "learning_rate": 0.0001931886700835557, "loss": 8.7769, "step": 172 }, { "epoch": 0.12529422415353975, "grad_norm": 0.4809548556804657, "learning_rate": 0.00019310530288326329, "loss": 8.7769, "step": 173 }, { "epoch": 0.12601846822379142, "grad_norm": 0.519578218460083, "learning_rate": 0.00019302144680658173, "loss": 8.7652, "step": 174 }, { "epoch": 0.1267427122940431, "grad_norm": 0.49192696809768677, "learning_rate": 0.0001929371022938216, "loss": 8.7399, "step": 175 }, { "epoch": 0.12746695636429478, "grad_norm": 0.5109142661094666, "learning_rate": 0.00019285226978785832, "loss": 8.7374, "step": 176 }, { "epoch": 0.12819120043454643, "grad_norm": 0.48110830783843994, "learning_rate": 0.00019276694973412948, "loss": 8.7524, "step": 177 }, { "epoch": 0.1289154445047981, "grad_norm": 0.511137843132019, "learning_rate": 0.0001926811425806328, "loss": 8.7089, "step": 178 }, { "epoch": 0.1296396885750498, "grad_norm": 0.49463972449302673, "learning_rate": 0.00019259484877792358, "loss": 8.7089, "step": 179 }, { "epoch": 0.13036393264530147, "grad_norm": 0.4988570213317871, "learning_rate": 0.00019250806877911249, "loss": 8.7283, "step": 180 }, { "epoch": 0.13108817671555315, "grad_norm": 0.48762744665145874, "learning_rate": 0.00019242080303986305, "loss": 8.6905, "step": 181 }, { "epoch": 0.13181242078580482, "grad_norm": 0.5100088119506836, "learning_rate": 0.00019233305201838937, "loss": 8.6756, "step": 182 }, { "epoch": 0.1325366648560565, "grad_norm": 0.502967119216919, "learning_rate": 0.00019224481617545358, "loss": 8.6831, "step": 183 }, { "epoch": 0.13326090892630815, "grad_norm": 0.4762354791164398, "learning_rate": 0.00019215609597436362, "loss": 8.6856, "step": 184 }, { "epoch": 0.13398515299655983, "grad_norm": 0.5262447595596313, "learning_rate": 0.00019206689188097054, "loss": 8.6393, "step": 185 }, { "epoch": 0.1347093970668115, "grad_norm": 0.5070264339447021, "learning_rate": 0.00019197720436366637, "loss": 8.628, "step": 186 }, { "epoch": 0.1354336411370632, "grad_norm": 0.4977290630340576, "learning_rate": 0.00019188703389338142, "loss": 8.6412, "step": 187 }, { "epoch": 0.13615788520731487, "grad_norm": 0.5031901001930237, "learning_rate": 0.00019179638094358187, "loss": 8.6122, "step": 188 }, { "epoch": 0.13688212927756654, "grad_norm": 0.5076611638069153, "learning_rate": 0.00019170524599026732, "loss": 8.6475, "step": 189 }, { "epoch": 0.13760637334781822, "grad_norm": 0.5410529971122742, "learning_rate": 0.00019161362951196825, "loss": 8.6091, "step": 190 }, { "epoch": 0.1383306174180699, "grad_norm": 0.5248331427574158, "learning_rate": 0.0001915215319897436, "loss": 8.6022, "step": 191 }, { "epoch": 0.13905486148832155, "grad_norm": 0.5243874788284302, "learning_rate": 0.00019142895390717804, "loss": 8.6125, "step": 192 }, { "epoch": 0.13977910555857323, "grad_norm": 0.597634494304657, "learning_rate": 0.0001913358957503797, "loss": 8.6093, "step": 193 }, { "epoch": 0.1405033496288249, "grad_norm": 0.528044581413269, "learning_rate": 0.0001912423580079774, "loss": 8.6003, "step": 194 }, { "epoch": 0.1412275936990766, "grad_norm": 0.5066649913787842, "learning_rate": 0.00019114834117111814, "loss": 8.5842, "step": 195 }, { "epoch": 0.14195183776932827, "grad_norm": 0.49779024720191956, "learning_rate": 0.00019105384573346463, "loss": 8.6286, "step": 196 }, { "epoch": 0.14267608183957994, "grad_norm": 0.5297601222991943, "learning_rate": 0.00019095887219119256, "loss": 8.5814, "step": 197 }, { "epoch": 0.14340032590983162, "grad_norm": 0.5315724015235901, "learning_rate": 0.000190863421042988, "loss": 8.5826, "step": 198 }, { "epoch": 0.1441245699800833, "grad_norm": 0.5855052471160889, "learning_rate": 0.00019076749279004496, "loss": 8.5369, "step": 199 }, { "epoch": 0.14484881405033495, "grad_norm": 0.5880944728851318, "learning_rate": 0.0001906710879360625, "loss": 8.5681, "step": 200 }, { "epoch": 0.14557305812058663, "grad_norm": 0.5085429549217224, "learning_rate": 0.00019057420698724223, "loss": 8.5739, "step": 201 }, { "epoch": 0.1462973021908383, "grad_norm": 0.541529655456543, "learning_rate": 0.00019047685045228569, "loss": 8.5405, "step": 202 }, { "epoch": 0.14702154626108999, "grad_norm": 0.5351887941360474, "learning_rate": 0.0001903790188423916, "loss": 8.5307, "step": 203 }, { "epoch": 0.14774579033134166, "grad_norm": 0.4913345277309418, "learning_rate": 0.00019028071267125323, "loss": 8.5303, "step": 204 }, { "epoch": 0.14847003440159334, "grad_norm": 0.476534903049469, "learning_rate": 0.0001901819324550556, "loss": 8.546, "step": 205 }, { "epoch": 0.14919427847184502, "grad_norm": 0.511883020401001, "learning_rate": 0.00019008267871247286, "loss": 8.523, "step": 206 }, { "epoch": 0.1499185225420967, "grad_norm": 0.5081936717033386, "learning_rate": 0.0001899829519646656, "loss": 8.5286, "step": 207 }, { "epoch": 0.15064276661234835, "grad_norm": 0.4755452275276184, "learning_rate": 0.000189882752735278, "loss": 8.5464, "step": 208 }, { "epoch": 0.15136701068260003, "grad_norm": 0.5027110576629639, "learning_rate": 0.0001897820815504352, "loss": 8.5272, "step": 209 }, { "epoch": 0.1520912547528517, "grad_norm": 0.5083355903625488, "learning_rate": 0.0001896809389387404, "loss": 8.4798, "step": 210 }, { "epoch": 0.15281549882310339, "grad_norm": 0.5499453544616699, "learning_rate": 0.00018957932543127226, "loss": 8.5331, "step": 211 }, { "epoch": 0.15353974289335506, "grad_norm": 0.51893150806427, "learning_rate": 0.00018947724156158192, "loss": 8.4911, "step": 212 }, { "epoch": 0.15426398696360674, "grad_norm": 0.5075845718383789, "learning_rate": 0.00018937468786569034, "loss": 8.4956, "step": 213 }, { "epoch": 0.15498823103385842, "grad_norm": 0.5007838606834412, "learning_rate": 0.00018927166488208548, "loss": 8.4777, "step": 214 }, { "epoch": 0.1557124751041101, "grad_norm": 0.5123993158340454, "learning_rate": 0.00018916817315171934, "loss": 8.4535, "step": 215 }, { "epoch": 0.15643671917436175, "grad_norm": 0.5103585124015808, "learning_rate": 0.00018906421321800528, "loss": 8.4328, "step": 216 }, { "epoch": 0.15716096324461343, "grad_norm": 0.49165982007980347, "learning_rate": 0.00018895978562681506, "loss": 8.4659, "step": 217 }, { "epoch": 0.1578852073148651, "grad_norm": 0.5067943930625916, "learning_rate": 0.00018885489092647606, "loss": 8.4503, "step": 218 }, { "epoch": 0.15860945138511678, "grad_norm": 0.5133797526359558, "learning_rate": 0.0001887495296677683, "loss": 8.4473, "step": 219 }, { "epoch": 0.15933369545536846, "grad_norm": 0.4692786633968353, "learning_rate": 0.0001886437024039216, "loss": 8.4552, "step": 220 }, { "epoch": 0.16005793952562014, "grad_norm": 0.4994044005870819, "learning_rate": 0.00018853740969061272, "loss": 8.4293, "step": 221 }, { "epoch": 0.16078218359587182, "grad_norm": 0.5162525773048401, "learning_rate": 0.00018843065208596236, "loss": 8.3714, "step": 222 }, { "epoch": 0.16150642766612347, "grad_norm": 0.48502400517463684, "learning_rate": 0.00018832343015053228, "loss": 8.4212, "step": 223 }, { "epoch": 0.16223067173637515, "grad_norm": 0.48316872119903564, "learning_rate": 0.00018821574444732235, "loss": 8.3908, "step": 224 }, { "epoch": 0.16295491580662683, "grad_norm": 0.5301130414009094, "learning_rate": 0.0001881075955417676, "loss": 8.4019, "step": 225 }, { "epoch": 0.1636791598768785, "grad_norm": 0.5196858048439026, "learning_rate": 0.0001879989840017351, "loss": 8.3669, "step": 226 }, { "epoch": 0.16440340394713018, "grad_norm": 0.4746645390987396, "learning_rate": 0.0001878899103975214, "loss": 8.3958, "step": 227 }, { "epoch": 0.16512764801738186, "grad_norm": 0.4931572675704956, "learning_rate": 0.000187780375301849, "loss": 8.3942, "step": 228 }, { "epoch": 0.16585189208763354, "grad_norm": 0.5066322684288025, "learning_rate": 0.00018767037928986367, "loss": 8.3708, "step": 229 }, { "epoch": 0.16657613615788522, "grad_norm": 0.46926993131637573, "learning_rate": 0.00018755992293913135, "loss": 8.3315, "step": 230 }, { "epoch": 0.16730038022813687, "grad_norm": 0.4935818314552307, "learning_rate": 0.00018744900682963523, "loss": 8.3768, "step": 231 }, { "epoch": 0.16802462429838855, "grad_norm": 0.5058098435401917, "learning_rate": 0.0001873376315437724, "loss": 8.332, "step": 232 }, { "epoch": 0.16874886836864023, "grad_norm": 0.4851613938808441, "learning_rate": 0.00018722579766635117, "loss": 8.3578, "step": 233 }, { "epoch": 0.1694731124388919, "grad_norm": 0.4681190550327301, "learning_rate": 0.00018711350578458767, "loss": 8.3757, "step": 234 }, { "epoch": 0.17019735650914358, "grad_norm": 0.48669615387916565, "learning_rate": 0.00018700075648810303, "loss": 8.3425, "step": 235 }, { "epoch": 0.17092160057939526, "grad_norm": 0.4987237751483917, "learning_rate": 0.00018688755036892012, "loss": 8.3175, "step": 236 }, { "epoch": 0.17164584464964694, "grad_norm": 0.4954805076122284, "learning_rate": 0.0001867738880214605, "loss": 8.3055, "step": 237 }, { "epoch": 0.17237008871989862, "grad_norm": 0.5057269930839539, "learning_rate": 0.00018665977004254125, "loss": 8.3266, "step": 238 }, { "epoch": 0.17309433279015027, "grad_norm": 0.5142120122909546, "learning_rate": 0.00018654519703137191, "loss": 8.2739, "step": 239 }, { "epoch": 0.17381857686040195, "grad_norm": 0.49817487597465515, "learning_rate": 0.00018643016958955135, "loss": 8.292, "step": 240 }, { "epoch": 0.17454282093065362, "grad_norm": 0.49923181533813477, "learning_rate": 0.00018631468832106446, "loss": 8.2962, "step": 241 }, { "epoch": 0.1752670650009053, "grad_norm": 0.5132392048835754, "learning_rate": 0.00018619875383227912, "loss": 8.3461, "step": 242 }, { "epoch": 0.17599130907115698, "grad_norm": 0.5324509739875793, "learning_rate": 0.000186082366731943, "loss": 8.2242, "step": 243 }, { "epoch": 0.17671555314140866, "grad_norm": 0.5011392831802368, "learning_rate": 0.0001859655276311803, "loss": 8.2918, "step": 244 }, { "epoch": 0.17743979721166034, "grad_norm": 0.5092015862464905, "learning_rate": 0.0001858482371434886, "loss": 8.2892, "step": 245 }, { "epoch": 0.17816404128191202, "grad_norm": 0.531007707118988, "learning_rate": 0.00018573049588473564, "loss": 8.2796, "step": 246 }, { "epoch": 0.17888828535216367, "grad_norm": 0.5116980075836182, "learning_rate": 0.00018561230447315604, "loss": 8.2792, "step": 247 }, { "epoch": 0.17961252942241535, "grad_norm": 0.5366231203079224, "learning_rate": 0.000185493663529348, "loss": 8.2506, "step": 248 }, { "epoch": 0.18033677349266702, "grad_norm": 0.595176637172699, "learning_rate": 0.0001853745736762703, "loss": 8.2615, "step": 249 }, { "epoch": 0.1810610175629187, "grad_norm": 0.5762497186660767, "learning_rate": 0.0001852550355392387, "loss": 8.2754, "step": 250 }, { "epoch": 0.18178526163317038, "grad_norm": 0.5076980590820312, "learning_rate": 0.00018513504974592283, "loss": 8.2569, "step": 251 }, { "epoch": 0.18250950570342206, "grad_norm": 0.5133912563323975, "learning_rate": 0.0001850146169263429, "loss": 8.2293, "step": 252 }, { "epoch": 0.18323374977367374, "grad_norm": 0.5173690915107727, "learning_rate": 0.00018489373771286637, "loss": 8.2451, "step": 253 }, { "epoch": 0.18395799384392542, "grad_norm": 0.5147438645362854, "learning_rate": 0.00018477241274020458, "loss": 8.1972, "step": 254 }, { "epoch": 0.18468223791417707, "grad_norm": 0.48920467495918274, "learning_rate": 0.00018465064264540945, "loss": 8.2376, "step": 255 }, { "epoch": 0.18540648198442874, "grad_norm": 0.4959597587585449, "learning_rate": 0.00018452842806787026, "loss": 8.2552, "step": 256 }, { "epoch": 0.18613072605468042, "grad_norm": 0.48315390944480896, "learning_rate": 0.00018440576964930998, "loss": 8.2317, "step": 257 }, { "epoch": 0.1868549701249321, "grad_norm": 0.5170494914054871, "learning_rate": 0.00018428266803378226, "loss": 8.2036, "step": 258 }, { "epoch": 0.18757921419518378, "grad_norm": 0.4896693229675293, "learning_rate": 0.00018415912386766781, "loss": 8.2285, "step": 259 }, { "epoch": 0.18830345826543546, "grad_norm": 0.5025380849838257, "learning_rate": 0.00018403513779967115, "loss": 8.2247, "step": 260 }, { "epoch": 0.18902770233568714, "grad_norm": 0.4876105785369873, "learning_rate": 0.000183910710480817, "loss": 8.2347, "step": 261 }, { "epoch": 0.18975194640593882, "grad_norm": 0.4978480041027069, "learning_rate": 0.00018378584256444712, "loss": 8.1992, "step": 262 }, { "epoch": 0.19047619047619047, "grad_norm": 0.47433796525001526, "learning_rate": 0.00018366053470621668, "loss": 8.2212, "step": 263 }, { "epoch": 0.19120043454644214, "grad_norm": 0.4850807785987854, "learning_rate": 0.00018353478756409096, "loss": 8.2226, "step": 264 }, { "epoch": 0.19192467861669382, "grad_norm": 0.5015024542808533, "learning_rate": 0.00018340860179834177, "loss": 8.1905, "step": 265 }, { "epoch": 0.1926489226869455, "grad_norm": 0.5145589113235474, "learning_rate": 0.00018328197807154407, "loss": 8.1793, "step": 266 }, { "epoch": 0.19337316675719718, "grad_norm": 0.48222506046295166, "learning_rate": 0.00018315491704857246, "loss": 8.1716, "step": 267 }, { "epoch": 0.19409741082744886, "grad_norm": 0.5012249946594238, "learning_rate": 0.00018302741939659763, "loss": 8.1869, "step": 268 }, { "epoch": 0.19482165489770054, "grad_norm": 0.494550496339798, "learning_rate": 0.00018289948578508307, "loss": 8.1733, "step": 269 }, { "epoch": 0.1955458989679522, "grad_norm": 0.51914381980896, "learning_rate": 0.00018277111688578122, "loss": 8.1836, "step": 270 }, { "epoch": 0.19627014303820386, "grad_norm": 0.5137649774551392, "learning_rate": 0.00018264231337273022, "loss": 8.1376, "step": 271 }, { "epoch": 0.19699438710845554, "grad_norm": 0.4887925982475281, "learning_rate": 0.0001825130759222503, "loss": 8.1645, "step": 272 }, { "epoch": 0.19771863117870722, "grad_norm": 0.48226258158683777, "learning_rate": 0.0001823834052129401, "loss": 8.1425, "step": 273 }, { "epoch": 0.1984428752489589, "grad_norm": 0.4919883906841278, "learning_rate": 0.00018225330192567335, "loss": 8.1622, "step": 274 }, { "epoch": 0.19916711931921058, "grad_norm": 0.49928978085517883, "learning_rate": 0.00018212276674359508, "loss": 8.1414, "step": 275 }, { "epoch": 0.19989136338946226, "grad_norm": 0.5267078280448914, "learning_rate": 0.00018199180035211805, "loss": 8.0985, "step": 276 }, { "epoch": 0.20061560745971393, "grad_norm": 0.5004377365112305, "learning_rate": 0.0001818604034389193, "loss": 8.1365, "step": 277 }, { "epoch": 0.20133985152996559, "grad_norm": 0.5070396661758423, "learning_rate": 0.00018172857669393645, "loss": 8.1672, "step": 278 }, { "epoch": 0.20206409560021726, "grad_norm": 0.49480506777763367, "learning_rate": 0.000181596320809364, "loss": 8.1374, "step": 279 }, { "epoch": 0.20278833967046894, "grad_norm": 0.4668210446834564, "learning_rate": 0.0001814636364796499, "loss": 8.1812, "step": 280 }, { "epoch": 0.20351258374072062, "grad_norm": 0.49613460898399353, "learning_rate": 0.00018133052440149163, "loss": 8.1357, "step": 281 }, { "epoch": 0.2042368278109723, "grad_norm": 0.5054727792739868, "learning_rate": 0.00018119698527383274, "loss": 8.0969, "step": 282 }, { "epoch": 0.20496107188122398, "grad_norm": 0.49716633558273315, "learning_rate": 0.0001810630197978592, "loss": 8.1176, "step": 283 }, { "epoch": 0.20568531595147566, "grad_norm": 0.491693377494812, "learning_rate": 0.00018092862867699557, "loss": 8.118, "step": 284 }, { "epoch": 0.20640956002172733, "grad_norm": 0.498813658952713, "learning_rate": 0.00018079381261690134, "loss": 8.0726, "step": 285 }, { "epoch": 0.20713380409197898, "grad_norm": 0.5227881669998169, "learning_rate": 0.00018065857232546736, "loss": 8.0672, "step": 286 }, { "epoch": 0.20785804816223066, "grad_norm": 0.4753732979297638, "learning_rate": 0.00018052290851281204, "loss": 8.06, "step": 287 }, { "epoch": 0.20858229223248234, "grad_norm": 0.510330080986023, "learning_rate": 0.0001803868218912775, "loss": 8.0716, "step": 288 }, { "epoch": 0.20930653630273402, "grad_norm": 0.5118574500083923, "learning_rate": 0.000180250313175426, "loss": 8.1125, "step": 289 }, { "epoch": 0.2100307803729857, "grad_norm": 0.5137168169021606, "learning_rate": 0.00018011338308203623, "loss": 8.0358, "step": 290 }, { "epoch": 0.21075502444323738, "grad_norm": 0.48737525939941406, "learning_rate": 0.00017997603233009922, "loss": 8.1156, "step": 291 }, { "epoch": 0.21147926851348905, "grad_norm": 0.5204469561576843, "learning_rate": 0.00017983826164081503, "loss": 8.0484, "step": 292 }, { "epoch": 0.21220351258374073, "grad_norm": 0.4880678951740265, "learning_rate": 0.00017970007173758856, "loss": 8.0651, "step": 293 }, { "epoch": 0.21292775665399238, "grad_norm": 0.47253793478012085, "learning_rate": 0.00017956146334602595, "loss": 8.0648, "step": 294 }, { "epoch": 0.21365200072424406, "grad_norm": 0.5147983431816101, "learning_rate": 0.00017942243719393076, "loss": 8.0625, "step": 295 }, { "epoch": 0.21437624479449574, "grad_norm": 0.5061482191085815, "learning_rate": 0.00017928299401130012, "loss": 8.059, "step": 296 }, { "epoch": 0.21510048886474742, "grad_norm": 0.5227460265159607, "learning_rate": 0.00017914313453032093, "loss": 8.0338, "step": 297 }, { "epoch": 0.2158247329349991, "grad_norm": 0.5411704182624817, "learning_rate": 0.00017900285948536587, "loss": 8.0671, "step": 298 }, { "epoch": 0.21654897700525078, "grad_norm": 0.5224537253379822, "learning_rate": 0.00017886216961298981, "loss": 8.0801, "step": 299 }, { "epoch": 0.21727322107550245, "grad_norm": 0.5319260954856873, "learning_rate": 0.00017872106565192567, "loss": 8.1099, "step": 300 }, { "epoch": 0.21799746514575413, "grad_norm": 0.5229703783988953, "learning_rate": 0.00017857954834308074, "loss": 8.0345, "step": 301 }, { "epoch": 0.21872170921600578, "grad_norm": 0.4948280453681946, "learning_rate": 0.0001784376184295327, "loss": 8.0936, "step": 302 }, { "epoch": 0.21944595328625746, "grad_norm": 0.4888227581977844, "learning_rate": 0.00017829527665652562, "loss": 8.062, "step": 303 }, { "epoch": 0.22017019735650914, "grad_norm": 0.49134767055511475, "learning_rate": 0.00017815252377146638, "loss": 8.0392, "step": 304 }, { "epoch": 0.22089444142676082, "grad_norm": 0.483167827129364, "learning_rate": 0.0001780093605239203, "loss": 8.07, "step": 305 }, { "epoch": 0.2216186854970125, "grad_norm": 0.4626474380493164, "learning_rate": 0.00017786578766560758, "loss": 8.0982, "step": 306 }, { "epoch": 0.22234292956726417, "grad_norm": 0.4981623888015747, "learning_rate": 0.0001777218059503991, "loss": 8.0233, "step": 307 }, { "epoch": 0.22306717363751585, "grad_norm": 0.5132870674133301, "learning_rate": 0.00017757741613431263, "loss": 8.0564, "step": 308 }, { "epoch": 0.22379141770776753, "grad_norm": 0.4659659266471863, "learning_rate": 0.00017743261897550875, "loss": 8.0599, "step": 309 }, { "epoch": 0.22451566177801918, "grad_norm": 0.48992919921875, "learning_rate": 0.00017728741523428696, "loss": 8.0318, "step": 310 }, { "epoch": 0.22523990584827086, "grad_norm": 0.465867817401886, "learning_rate": 0.00017714180567308157, "loss": 8.042, "step": 311 }, { "epoch": 0.22596414991852254, "grad_norm": 0.48665499687194824, "learning_rate": 0.0001769957910564578, "loss": 8.0519, "step": 312 }, { "epoch": 0.22668839398877422, "grad_norm": 0.47514137625694275, "learning_rate": 0.00017684937215110778, "loss": 8.0537, "step": 313 }, { "epoch": 0.2274126380590259, "grad_norm": 0.5170972943305969, "learning_rate": 0.00017670254972584638, "loss": 8.0094, "step": 314 }, { "epoch": 0.22813688212927757, "grad_norm": 0.5160325765609741, "learning_rate": 0.0001765553245516073, "loss": 8.021, "step": 315 }, { "epoch": 0.22886112619952925, "grad_norm": 0.48066890239715576, "learning_rate": 0.00017640769740143904, "loss": 8.0412, "step": 316 }, { "epoch": 0.2295853702697809, "grad_norm": 0.4954223334789276, "learning_rate": 0.00017625966905050077, "loss": 8.0338, "step": 317 }, { "epoch": 0.23030961434003258, "grad_norm": 0.5118348002433777, "learning_rate": 0.00017611124027605825, "loss": 8.0008, "step": 318 }, { "epoch": 0.23103385841028426, "grad_norm": 0.5256980657577515, "learning_rate": 0.00017596241185747978, "loss": 7.9931, "step": 319 }, { "epoch": 0.23175810248053594, "grad_norm": 0.48749062418937683, "learning_rate": 0.00017581318457623218, "loss": 7.9715, "step": 320 }, { "epoch": 0.23248234655078762, "grad_norm": 0.5072448253631592, "learning_rate": 0.0001756635592158765, "loss": 8.011, "step": 321 }, { "epoch": 0.2332065906210393, "grad_norm": 0.49068352580070496, "learning_rate": 0.00017551353656206412, "loss": 8.0219, "step": 322 }, { "epoch": 0.23393083469129097, "grad_norm": 0.47556236386299133, "learning_rate": 0.00017536311740253243, "loss": 8.0213, "step": 323 }, { "epoch": 0.23465507876154265, "grad_norm": 0.5052993297576904, "learning_rate": 0.0001752123025271009, "loss": 7.9925, "step": 324 }, { "epoch": 0.2353793228317943, "grad_norm": 0.49751394987106323, "learning_rate": 0.00017506109272766673, "loss": 7.9627, "step": 325 }, { "epoch": 0.23610356690204598, "grad_norm": 0.5014733672142029, "learning_rate": 0.00017490948879820084, "loss": 7.9567, "step": 326 }, { "epoch": 0.23682781097229766, "grad_norm": 0.49920758605003357, "learning_rate": 0.0001747574915347436, "loss": 8.0418, "step": 327 }, { "epoch": 0.23755205504254934, "grad_norm": 0.5061244368553162, "learning_rate": 0.00017460510173540072, "loss": 7.9813, "step": 328 }, { "epoch": 0.23827629911280102, "grad_norm": 0.5036886930465698, "learning_rate": 0.00017445232020033902, "loss": 7.9537, "step": 329 }, { "epoch": 0.2390005431830527, "grad_norm": 0.48648810386657715, "learning_rate": 0.00017429914773178228, "loss": 7.9614, "step": 330 }, { "epoch": 0.23972478725330437, "grad_norm": 0.47009527683258057, "learning_rate": 0.00017414558513400693, "loss": 8.0103, "step": 331 }, { "epoch": 0.24044903132355605, "grad_norm": 0.4699738919734955, "learning_rate": 0.00017399163321333793, "loss": 7.9483, "step": 332 }, { "epoch": 0.2411732753938077, "grad_norm": 0.4880366623401642, "learning_rate": 0.00017383729277814446, "loss": 8.0285, "step": 333 }, { "epoch": 0.24189751946405938, "grad_norm": 0.4812169373035431, "learning_rate": 0.00017368256463883578, "loss": 8.0094, "step": 334 }, { "epoch": 0.24262176353431106, "grad_norm": 0.5153531432151794, "learning_rate": 0.00017352744960785676, "loss": 7.9488, "step": 335 }, { "epoch": 0.24334600760456274, "grad_norm": 0.4880661368370056, "learning_rate": 0.0001733719484996839, "loss": 7.9457, "step": 336 }, { "epoch": 0.24407025167481441, "grad_norm": 0.5033547282218933, "learning_rate": 0.00017321606213082088, "loss": 7.9726, "step": 337 }, { "epoch": 0.2447944957450661, "grad_norm": 0.4773882329463959, "learning_rate": 0.0001730597913197942, "loss": 8.0068, "step": 338 }, { "epoch": 0.24551873981531777, "grad_norm": 0.48063740134239197, "learning_rate": 0.00017290313688714915, "loss": 7.9865, "step": 339 }, { "epoch": 0.24624298388556945, "grad_norm": 0.5227491855621338, "learning_rate": 0.00017274609965544523, "loss": 7.8864, "step": 340 }, { "epoch": 0.2469672279558211, "grad_norm": 0.490004301071167, "learning_rate": 0.00017258868044925195, "loss": 7.9176, "step": 341 }, { "epoch": 0.24769147202607278, "grad_norm": 0.49324241280555725, "learning_rate": 0.0001724308800951445, "loss": 7.9429, "step": 342 }, { "epoch": 0.24841571609632446, "grad_norm": 0.5013875961303711, "learning_rate": 0.00017227269942169936, "loss": 8.0167, "step": 343 }, { "epoch": 0.24913996016657614, "grad_norm": 0.49397408962249756, "learning_rate": 0.00017211413925949005, "loss": 7.9341, "step": 344 }, { "epoch": 0.2498642042368278, "grad_norm": 0.48179903626441956, "learning_rate": 0.00017195520044108268, "loss": 7.9138, "step": 345 }, { "epoch": 0.2505884483070795, "grad_norm": 0.5134261846542358, "learning_rate": 0.00017179588380103163, "loss": 7.9132, "step": 346 }, { "epoch": 0.2505884483070795, "eval_loss": 7.9483160972595215, "eval_runtime": 5.1008, "eval_samples_per_second": 228.001, "eval_steps_per_second": 114.099, "step": 346 }, { "epoch": 0.25131269237733117, "grad_norm": 0.5437692403793335, "learning_rate": 0.00017163619017587504, "loss": 7.9127, "step": 347 }, { "epoch": 0.25203693644758285, "grad_norm": 0.5261752605438232, "learning_rate": 0.00017147612040413065, "loss": 7.9209, "step": 348 }, { "epoch": 0.2527611805178345, "grad_norm": 0.5796931385993958, "learning_rate": 0.0001713156753262912, "loss": 7.8785, "step": 349 }, { "epoch": 0.2534854245880862, "grad_norm": 0.6015821099281311, "learning_rate": 0.00017115485578482006, "loss": 8.0264, "step": 350 }, { "epoch": 0.2542096686583379, "grad_norm": 0.5010411739349365, "learning_rate": 0.00017099366262414694, "loss": 7.9666, "step": 351 }, { "epoch": 0.25493391272858956, "grad_norm": 0.4945250451564789, "learning_rate": 0.00017083209669066317, "loss": 7.9885, "step": 352 }, { "epoch": 0.2556581567988412, "grad_norm": 0.5084425210952759, "learning_rate": 0.0001706701588327176, "loss": 7.9407, "step": 353 }, { "epoch": 0.25638240086909286, "grad_norm": 0.47225329279899597, "learning_rate": 0.0001705078499006119, "loss": 7.9456, "step": 354 }, { "epoch": 0.25710664493934454, "grad_norm": 0.5019710659980774, "learning_rate": 0.00017034517074659617, "loss": 7.9546, "step": 355 }, { "epoch": 0.2578308890095962, "grad_norm": 0.49253225326538086, "learning_rate": 0.00017018212222486446, "loss": 7.9919, "step": 356 }, { "epoch": 0.2585551330798479, "grad_norm": 0.4921097457408905, "learning_rate": 0.0001700187051915503, "loss": 7.9367, "step": 357 }, { "epoch": 0.2592793771500996, "grad_norm": 0.4596784710884094, "learning_rate": 0.00016985492050472227, "loss": 7.9454, "step": 358 }, { "epoch": 0.26000362122035126, "grad_norm": 0.48740360140800476, "learning_rate": 0.00016969076902437932, "loss": 7.9385, "step": 359 }, { "epoch": 0.26072786529060293, "grad_norm": 0.4537501931190491, "learning_rate": 0.00016952625161244638, "loss": 7.9553, "step": 360 }, { "epoch": 0.2614521093608546, "grad_norm": 0.5013776421546936, "learning_rate": 0.00016936136913276982, "loss": 7.9313, "step": 361 }, { "epoch": 0.2621763534311063, "grad_norm": 0.49748286604881287, "learning_rate": 0.00016919612245111295, "loss": 7.9337, "step": 362 }, { "epoch": 0.26290059750135797, "grad_norm": 0.4852049648761749, "learning_rate": 0.0001690305124351514, "loss": 7.9322, "step": 363 }, { "epoch": 0.26362484157160965, "grad_norm": 0.4738268554210663, "learning_rate": 0.0001688645399544685, "loss": 7.9062, "step": 364 }, { "epoch": 0.2643490856418613, "grad_norm": 0.5000738501548767, "learning_rate": 0.00016869820588055095, "loss": 7.9441, "step": 365 }, { "epoch": 0.265073329712113, "grad_norm": 0.4679807424545288, "learning_rate": 0.00016853151108678398, "loss": 7.9791, "step": 366 }, { "epoch": 0.2657975737823647, "grad_norm": 0.4647904336452484, "learning_rate": 0.00016836445644844697, "loss": 7.9007, "step": 367 }, { "epoch": 0.2665218178526163, "grad_norm": 0.491827130317688, "learning_rate": 0.00016819704284270874, "loss": 7.9301, "step": 368 }, { "epoch": 0.267246061922868, "grad_norm": 0.4953727126121521, "learning_rate": 0.000168029271148623, "loss": 7.8832, "step": 369 }, { "epoch": 0.26797030599311966, "grad_norm": 0.5065143704414368, "learning_rate": 0.0001678611422471236, "loss": 7.8904, "step": 370 }, { "epoch": 0.26869455006337134, "grad_norm": 0.49114301800727844, "learning_rate": 0.00016769265702102018, "loss": 7.8696, "step": 371 }, { "epoch": 0.269418794133623, "grad_norm": 0.4655497670173645, "learning_rate": 0.00016752381635499317, "loss": 7.9315, "step": 372 }, { "epoch": 0.2701430382038747, "grad_norm": 0.4854629933834076, "learning_rate": 0.0001673546211355895, "loss": 7.8832, "step": 373 }, { "epoch": 0.2708672822741264, "grad_norm": 0.4579346776008606, "learning_rate": 0.0001671850722512178, "loss": 7.8855, "step": 374 }, { "epoch": 0.27159152634437805, "grad_norm": 0.4732455909252167, "learning_rate": 0.00016701517059214348, "loss": 7.9345, "step": 375 }, { "epoch": 0.27231577041462973, "grad_norm": 0.4654392600059509, "learning_rate": 0.00016684491705048457, "loss": 7.9374, "step": 376 }, { "epoch": 0.2730400144848814, "grad_norm": 0.4884020686149597, "learning_rate": 0.0001666743125202067, "loss": 7.9046, "step": 377 }, { "epoch": 0.2737642585551331, "grad_norm": 0.4872707724571228, "learning_rate": 0.00016650335789711833, "loss": 7.9233, "step": 378 }, { "epoch": 0.27448850262538477, "grad_norm": 0.47144293785095215, "learning_rate": 0.0001663320540788663, "loss": 7.9096, "step": 379 }, { "epoch": 0.27521274669563645, "grad_norm": 0.45968198776245117, "learning_rate": 0.00016616040196493103, "loss": 7.8761, "step": 380 }, { "epoch": 0.2759369907658881, "grad_norm": 0.47557854652404785, "learning_rate": 0.00016598840245662166, "loss": 7.8756, "step": 381 }, { "epoch": 0.2766612348361398, "grad_norm": 0.45728030800819397, "learning_rate": 0.0001658160564570715, "loss": 7.9193, "step": 382 }, { "epoch": 0.2773854789063915, "grad_norm": 0.5167350172996521, "learning_rate": 0.0001656433648712332, "loss": 7.8849, "step": 383 }, { "epoch": 0.2781097229766431, "grad_norm": 0.4883110523223877, "learning_rate": 0.00016547032860587398, "loss": 7.8263, "step": 384 }, { "epoch": 0.2788339670468948, "grad_norm": 0.47416940331459045, "learning_rate": 0.00016529694856957098, "loss": 7.9048, "step": 385 }, { "epoch": 0.27955821111714646, "grad_norm": 0.4862803816795349, "learning_rate": 0.0001651232256727063, "loss": 7.8855, "step": 386 }, { "epoch": 0.28028245518739814, "grad_norm": 0.4723484516143799, "learning_rate": 0.0001649491608274624, "loss": 7.9657, "step": 387 }, { "epoch": 0.2810066992576498, "grad_norm": 0.4739847779273987, "learning_rate": 0.00016477475494781717, "loss": 7.8765, "step": 388 }, { "epoch": 0.2817309433279015, "grad_norm": 0.4680808484554291, "learning_rate": 0.00016460000894953934, "loss": 7.9019, "step": 389 }, { "epoch": 0.2824551873981532, "grad_norm": 0.5003647804260254, "learning_rate": 0.00016442492375018343, "loss": 7.839, "step": 390 }, { "epoch": 0.28317943146840485, "grad_norm": 0.5076538324356079, "learning_rate": 0.00016424950026908497, "loss": 7.8935, "step": 391 }, { "epoch": 0.28390367553865653, "grad_norm": 0.47889307141304016, "learning_rate": 0.0001640737394273559, "loss": 7.9099, "step": 392 }, { "epoch": 0.2846279196089082, "grad_norm": 0.48014241456985474, "learning_rate": 0.0001638976421478794, "loss": 7.9593, "step": 393 }, { "epoch": 0.2853521636791599, "grad_norm": 0.48349729180336, "learning_rate": 0.00016372120935530536, "loss": 7.8625, "step": 394 }, { "epoch": 0.28607640774941157, "grad_norm": 0.4960671067237854, "learning_rate": 0.00016354444197604529, "loss": 7.8534, "step": 395 }, { "epoch": 0.28680065181966324, "grad_norm": 0.515612006187439, "learning_rate": 0.00016336734093826756, "loss": 7.872, "step": 396 }, { "epoch": 0.2875248958899149, "grad_norm": 0.49004271626472473, "learning_rate": 0.00016318990717189256, "loss": 7.9025, "step": 397 }, { "epoch": 0.2882491399601666, "grad_norm": 0.5171812176704407, "learning_rate": 0.00016301214160858768, "loss": 7.8026, "step": 398 }, { "epoch": 0.2889733840304182, "grad_norm": 0.5030158162117004, "learning_rate": 0.00016283404518176257, "loss": 7.9481, "step": 399 }, { "epoch": 0.2896976281006699, "grad_norm": 0.5945943593978882, "learning_rate": 0.0001626556188265642, "loss": 7.9135, "step": 400 }, { "epoch": 0.2904218721709216, "grad_norm": 0.5005214810371399, "learning_rate": 0.00016247686347987183, "loss": 7.9069, "step": 401 }, { "epoch": 0.29114611624117326, "grad_norm": 0.4819411635398865, "learning_rate": 0.00016229778008029224, "loss": 7.8972, "step": 402 }, { "epoch": 0.29187036031142494, "grad_norm": 0.48914363980293274, "learning_rate": 0.00016211836956815477, "loss": 7.9025, "step": 403 }, { "epoch": 0.2925946043816766, "grad_norm": 0.4711272418498993, "learning_rate": 0.00016193863288550638, "loss": 7.923, "step": 404 }, { "epoch": 0.2933188484519283, "grad_norm": 0.4771362841129303, "learning_rate": 0.00016175857097610653, "loss": 7.8727, "step": 405 }, { "epoch": 0.29404309252217997, "grad_norm": 0.4811650216579437, "learning_rate": 0.00016157818478542254, "loss": 7.8895, "step": 406 }, { "epoch": 0.29476733659243165, "grad_norm": 0.49894389510154724, "learning_rate": 0.00016139747526062442, "loss": 7.8914, "step": 407 }, { "epoch": 0.29549158066268333, "grad_norm": 0.48443737626075745, "learning_rate": 0.00016121644335057993, "loss": 7.8962, "step": 408 }, { "epoch": 0.296215824732935, "grad_norm": 0.4693881869316101, "learning_rate": 0.00016103509000584958, "loss": 7.8654, "step": 409 }, { "epoch": 0.2969400688031867, "grad_norm": 0.4889278709888458, "learning_rate": 0.0001608534161786817, "loss": 7.8896, "step": 410 }, { "epoch": 0.29766431287343836, "grad_norm": 0.48877543210983276, "learning_rate": 0.0001606714228230074, "loss": 7.9108, "step": 411 }, { "epoch": 0.29838855694369004, "grad_norm": 0.46180957555770874, "learning_rate": 0.00016048911089443558, "loss": 7.9667, "step": 412 }, { "epoch": 0.2991128010139417, "grad_norm": 0.466511994600296, "learning_rate": 0.00016030648135024786, "loss": 7.9088, "step": 413 }, { "epoch": 0.2998370450841934, "grad_norm": 0.4758537709712982, "learning_rate": 0.00016012353514939363, "loss": 7.8952, "step": 414 }, { "epoch": 0.300561289154445, "grad_norm": 0.5132808089256287, "learning_rate": 0.00015994027325248492, "loss": 7.8671, "step": 415 }, { "epoch": 0.3012855332246967, "grad_norm": 0.45275020599365234, "learning_rate": 0.00015975669662179152, "loss": 7.8962, "step": 416 }, { "epoch": 0.3020097772949484, "grad_norm": 0.5065222382545471, "learning_rate": 0.00015957280622123574, "loss": 7.8559, "step": 417 }, { "epoch": 0.30273402136520006, "grad_norm": 0.4880678057670593, "learning_rate": 0.00015938860301638742, "loss": 7.8497, "step": 418 }, { "epoch": 0.30345826543545174, "grad_norm": 0.4863564968109131, "learning_rate": 0.0001592040879744589, "loss": 7.8586, "step": 419 }, { "epoch": 0.3041825095057034, "grad_norm": 0.49322959780693054, "learning_rate": 0.0001590192620643, "loss": 7.8449, "step": 420 }, { "epoch": 0.3049067535759551, "grad_norm": 0.4878312349319458, "learning_rate": 0.00015883412625639263, "loss": 7.8653, "step": 421 }, { "epoch": 0.30563099764620677, "grad_norm": 0.5146703720092773, "learning_rate": 0.00015864868152284608, "loss": 7.9, "step": 422 }, { "epoch": 0.30635524171645845, "grad_norm": 0.46097350120544434, "learning_rate": 0.00015846292883739171, "loss": 7.8997, "step": 423 }, { "epoch": 0.3070794857867101, "grad_norm": 0.48234981298446655, "learning_rate": 0.00015827686917537783, "loss": 7.9008, "step": 424 }, { "epoch": 0.3078037298569618, "grad_norm": 0.49214646220207214, "learning_rate": 0.00015809050351376467, "loss": 7.9094, "step": 425 }, { "epoch": 0.3085279739272135, "grad_norm": 0.4872777462005615, "learning_rate": 0.00015790383283111913, "loss": 7.8567, "step": 426 }, { "epoch": 0.30925221799746516, "grad_norm": 0.4729660451412201, "learning_rate": 0.00015771685810760978, "loss": 7.8577, "step": 427 }, { "epoch": 0.30997646206771684, "grad_norm": 0.4986751973628998, "learning_rate": 0.00015752958032500165, "loss": 7.8491, "step": 428 }, { "epoch": 0.3107007061379685, "grad_norm": 0.4702273905277252, "learning_rate": 0.000157342000466651, "loss": 7.8647, "step": 429 }, { "epoch": 0.3114249502082202, "grad_norm": 0.48451560735702515, "learning_rate": 0.0001571541195175003, "loss": 7.8365, "step": 430 }, { "epoch": 0.3121491942784718, "grad_norm": 0.4823257029056549, "learning_rate": 0.0001569659384640729, "loss": 7.8654, "step": 431 }, { "epoch": 0.3128734383487235, "grad_norm": 0.4614291191101074, "learning_rate": 0.00015677745829446803, "loss": 7.8945, "step": 432 }, { "epoch": 0.3135976824189752, "grad_norm": 0.47423475980758667, "learning_rate": 0.00015658867999835546, "loss": 7.8436, "step": 433 }, { "epoch": 0.31432192648922685, "grad_norm": 0.486380398273468, "learning_rate": 0.00015639960456697037, "loss": 7.836, "step": 434 }, { "epoch": 0.31504617055947853, "grad_norm": 0.4982565641403198, "learning_rate": 0.00015621023299310812, "loss": 7.8526, "step": 435 }, { "epoch": 0.3157704146297302, "grad_norm": 0.4835852086544037, "learning_rate": 0.00015602056627111907, "loss": 7.831, "step": 436 }, { "epoch": 0.3164946586999819, "grad_norm": 0.508026659488678, "learning_rate": 0.0001558306053969034, "loss": 7.837, "step": 437 }, { "epoch": 0.31721890277023357, "grad_norm": 0.5414742231369019, "learning_rate": 0.00015564035136790566, "loss": 7.8466, "step": 438 }, { "epoch": 0.31794314684048525, "grad_norm": 0.5005372762680054, "learning_rate": 0.00015544980518310988, "loss": 7.8585, "step": 439 }, { "epoch": 0.3186673909107369, "grad_norm": 0.485451877117157, "learning_rate": 0.00015525896784303398, "loss": 7.8713, "step": 440 }, { "epoch": 0.3193916349809886, "grad_norm": 0.4814086854457855, "learning_rate": 0.0001550678403497248, "loss": 7.8185, "step": 441 }, { "epoch": 0.3201158790512403, "grad_norm": 0.4870661497116089, "learning_rate": 0.00015487642370675265, "loss": 7.8919, "step": 442 }, { "epoch": 0.32084012312149196, "grad_norm": 0.530167281627655, "learning_rate": 0.00015468471891920613, "loss": 7.8137, "step": 443 }, { "epoch": 0.32156436719174364, "grad_norm": 0.4842655658721924, "learning_rate": 0.0001544927269936868, "loss": 7.7832, "step": 444 }, { "epoch": 0.3222886112619953, "grad_norm": 0.47289198637008667, "learning_rate": 0.0001543004489383039, "loss": 7.8456, "step": 445 }, { "epoch": 0.32301285533224694, "grad_norm": 0.5019034147262573, "learning_rate": 0.00015410788576266916, "loss": 7.8209, "step": 446 }, { "epoch": 0.3237370994024986, "grad_norm": 0.5903533697128296, "learning_rate": 0.00015391503847789136, "loss": 7.9144, "step": 447 }, { "epoch": 0.3244613434727503, "grad_norm": 0.5589230060577393, "learning_rate": 0.00015372190809657106, "loss": 7.7157, "step": 448 }, { "epoch": 0.325185587543002, "grad_norm": 0.5412530899047852, "learning_rate": 0.00015352849563279536, "loss": 7.7944, "step": 449 }, { "epoch": 0.32590983161325365, "grad_norm": 0.6175811290740967, "learning_rate": 0.00015333480210213244, "loss": 7.8087, "step": 450 }, { "epoch": 0.32663407568350533, "grad_norm": 0.504089891910553, "learning_rate": 0.0001531408285216264, "loss": 7.8893, "step": 451 }, { "epoch": 0.327358319753757, "grad_norm": 0.5161882638931274, "learning_rate": 0.00015294657590979172, "loss": 7.8682, "step": 452 }, { "epoch": 0.3280825638240087, "grad_norm": 0.4699746072292328, "learning_rate": 0.0001527520452866081, "loss": 7.8638, "step": 453 }, { "epoch": 0.32880680789426037, "grad_norm": 0.48906633257865906, "learning_rate": 0.00015255723767351495, "loss": 7.8385, "step": 454 }, { "epoch": 0.32953105196451205, "grad_norm": 0.48731744289398193, "learning_rate": 0.00015236215409340616, "loss": 7.8139, "step": 455 }, { "epoch": 0.3302552960347637, "grad_norm": 0.4950321614742279, "learning_rate": 0.0001521667955706246, "loss": 7.8719, "step": 456 }, { "epoch": 0.3309795401050154, "grad_norm": 0.4786604046821594, "learning_rate": 0.00015197116313095683, "loss": 7.8451, "step": 457 }, { "epoch": 0.3317037841752671, "grad_norm": 0.46545907855033875, "learning_rate": 0.00015177525780162775, "loss": 7.8802, "step": 458 }, { "epoch": 0.33242802824551876, "grad_norm": 0.4908168315887451, "learning_rate": 0.00015157908061129508, "loss": 7.8576, "step": 459 }, { "epoch": 0.33315227231577044, "grad_norm": 0.5020480751991272, "learning_rate": 0.00015138263259004402, "loss": 7.8606, "step": 460 }, { "epoch": 0.3338765163860221, "grad_norm": 0.48662593960762024, "learning_rate": 0.00015118591476938188, "loss": 7.8735, "step": 461 }, { "epoch": 0.33460076045627374, "grad_norm": 0.46343091130256653, "learning_rate": 0.00015098892818223258, "loss": 7.9559, "step": 462 }, { "epoch": 0.3353250045265254, "grad_norm": 0.467752069234848, "learning_rate": 0.0001507916738629314, "loss": 7.8742, "step": 463 }, { "epoch": 0.3360492485967771, "grad_norm": 0.4827868342399597, "learning_rate": 0.00015059415284721924, "loss": 7.8779, "step": 464 }, { "epoch": 0.3367734926670288, "grad_norm": 0.4806783199310303, "learning_rate": 0.00015039636617223754, "loss": 7.8949, "step": 465 }, { "epoch": 0.33749773673728045, "grad_norm": 0.4673584997653961, "learning_rate": 0.00015019831487652255, "loss": 7.8997, "step": 466 }, { "epoch": 0.33822198080753213, "grad_norm": 0.5170037746429443, "learning_rate": 0.00015000000000000001, "loss": 7.8233, "step": 467 }, { "epoch": 0.3389462248777838, "grad_norm": 0.5105298757553101, "learning_rate": 0.00014980142258397972, "loss": 7.8252, "step": 468 }, { "epoch": 0.3396704689480355, "grad_norm": 0.476662814617157, "learning_rate": 0.00014960258367114997, "loss": 7.843, "step": 469 }, { "epoch": 0.34039471301828716, "grad_norm": 0.49604159593582153, "learning_rate": 0.0001494034843055721, "loss": 7.8556, "step": 470 }, { "epoch": 0.34111895708853884, "grad_norm": 0.48032739758491516, "learning_rate": 0.00014920412553267508, "loss": 7.8577, "step": 471 }, { "epoch": 0.3418432011587905, "grad_norm": 0.49723532795906067, "learning_rate": 0.00014900450839924994, "loss": 7.864, "step": 472 }, { "epoch": 0.3425674452290422, "grad_norm": 0.47292062640190125, "learning_rate": 0.00014880463395344434, "loss": 7.8405, "step": 473 }, { "epoch": 0.3432916892992939, "grad_norm": 0.49656134843826294, "learning_rate": 0.00014860450324475703, "loss": 7.832, "step": 474 }, { "epoch": 0.34401593336954556, "grad_norm": 0.4985675811767578, "learning_rate": 0.0001484041173240323, "loss": 7.8131, "step": 475 }, { "epoch": 0.34474017743979724, "grad_norm": 0.4732464551925659, "learning_rate": 0.0001482034772434545, "loss": 7.8819, "step": 476 }, { "epoch": 0.3454644215100489, "grad_norm": 0.4878876507282257, "learning_rate": 0.00014800258405654257, "loss": 7.8256, "step": 477 }, { "epoch": 0.34618866558030054, "grad_norm": 0.49212250113487244, "learning_rate": 0.00014780143881814442, "loss": 7.816, "step": 478 }, { "epoch": 0.3469129096505522, "grad_norm": 0.4787651598453522, "learning_rate": 0.00014760004258443151, "loss": 7.856, "step": 479 }, { "epoch": 0.3476371537208039, "grad_norm": 0.48412755131721497, "learning_rate": 0.00014739839641289313, "loss": 7.8497, "step": 480 }, { "epoch": 0.34836139779105557, "grad_norm": 0.4971597492694855, "learning_rate": 0.00014719650136233096, "loss": 7.8446, "step": 481 }, { "epoch": 0.34908564186130725, "grad_norm": 0.4982038736343384, "learning_rate": 0.00014699435849285352, "loss": 7.7839, "step": 482 }, { "epoch": 0.34980988593155893, "grad_norm": 0.5108397603034973, "learning_rate": 0.00014679196886587052, "loss": 7.7931, "step": 483 }, { "epoch": 0.3505341300018106, "grad_norm": 0.4721708297729492, "learning_rate": 0.00014658933354408743, "loss": 7.8607, "step": 484 }, { "epoch": 0.3512583740720623, "grad_norm": 0.4693983495235443, "learning_rate": 0.0001463864535914997, "loss": 7.8773, "step": 485 }, { "epoch": 0.35198261814231396, "grad_norm": 0.5026690363883972, "learning_rate": 0.00014618333007338744, "loss": 7.8628, "step": 486 }, { "epoch": 0.35270686221256564, "grad_norm": 0.48804745078086853, "learning_rate": 0.00014597996405630947, "loss": 7.84, "step": 487 }, { "epoch": 0.3534311062828173, "grad_norm": 0.4530211389064789, "learning_rate": 0.0001457763566080981, "loss": 7.859, "step": 488 }, { "epoch": 0.354155350353069, "grad_norm": 0.48494988679885864, "learning_rate": 0.0001455725087978533, "loss": 7.7981, "step": 489 }, { "epoch": 0.3548795944233207, "grad_norm": 0.4591243267059326, "learning_rate": 0.00014536842169593703, "loss": 7.8371, "step": 490 }, { "epoch": 0.35560383849357236, "grad_norm": 0.49914512038230896, "learning_rate": 0.00014516409637396787, "loss": 7.829, "step": 491 }, { "epoch": 0.35632808256382403, "grad_norm": 0.48346269130706787, "learning_rate": 0.00014495953390481506, "loss": 7.7837, "step": 492 }, { "epoch": 0.35705232663407566, "grad_norm": 0.4839797019958496, "learning_rate": 0.00014475473536259325, "loss": 7.8779, "step": 493 }, { "epoch": 0.35777657070432733, "grad_norm": 0.4998239576816559, "learning_rate": 0.00014454970182265655, "loss": 7.8213, "step": 494 }, { "epoch": 0.358500814774579, "grad_norm": 0.5087825059890747, "learning_rate": 0.000144344434361593, "loss": 7.8142, "step": 495 }, { "epoch": 0.3592250588448307, "grad_norm": 0.47627493739128113, "learning_rate": 0.00014413893405721895, "loss": 7.8951, "step": 496 }, { "epoch": 0.35994930291508237, "grad_norm": 0.5072402954101562, "learning_rate": 0.0001439332019885733, "loss": 7.8125, "step": 497 }, { "epoch": 0.36067354698533405, "grad_norm": 0.5031032562255859, "learning_rate": 0.0001437272392359119, "loss": 7.8159, "step": 498 }, { "epoch": 0.3613977910555857, "grad_norm": 0.5236200094223022, "learning_rate": 0.000143521046880702, "loss": 7.8519, "step": 499 }, { "epoch": 0.3621220351258374, "grad_norm": 0.5609799027442932, "learning_rate": 0.00014331462600561626, "loss": 7.8537, "step": 500 }, { "epoch": 0.3628462791960891, "grad_norm": 0.4694886803627014, "learning_rate": 0.0001431079776945274, "loss": 7.9079, "step": 501 }, { "epoch": 0.36357052326634076, "grad_norm": 0.5094586610794067, "learning_rate": 0.00014290110303250225, "loss": 7.8228, "step": 502 }, { "epoch": 0.36429476733659244, "grad_norm": 0.4992700219154358, "learning_rate": 0.00014269400310579623, "loss": 7.8663, "step": 503 }, { "epoch": 0.3650190114068441, "grad_norm": 0.4908367395401001, "learning_rate": 0.00014248667900184752, "loss": 7.8586, "step": 504 }, { "epoch": 0.3657432554770958, "grad_norm": 0.46805062890052795, "learning_rate": 0.00014227913180927152, "loss": 7.8644, "step": 505 }, { "epoch": 0.3664674995473475, "grad_norm": 0.49678292870521545, "learning_rate": 0.00014207136261785484, "loss": 7.8759, "step": 506 }, { "epoch": 0.36719174361759915, "grad_norm": 0.46577930450439453, "learning_rate": 0.00014186337251854994, "loss": 7.8668, "step": 507 }, { "epoch": 0.36791598768785083, "grad_norm": 0.4829874038696289, "learning_rate": 0.00014165516260346913, "loss": 7.8516, "step": 508 }, { "epoch": 0.36864023175810245, "grad_norm": 0.4831247925758362, "learning_rate": 0.00014144673396587892, "loss": 7.8148, "step": 509 }, { "epoch": 0.36936447582835413, "grad_norm": 0.49740880727767944, "learning_rate": 0.00014123808770019432, "loss": 7.864, "step": 510 }, { "epoch": 0.3700887198986058, "grad_norm": 0.4826888144016266, "learning_rate": 0.00014102922490197308, "loss": 7.887, "step": 511 }, { "epoch": 0.3708129639688575, "grad_norm": 0.4531131684780121, "learning_rate": 0.0001408201466679098, "loss": 7.901, "step": 512 }, { "epoch": 0.37153720803910917, "grad_norm": 0.4634702801704407, "learning_rate": 0.00014061085409583043, "loss": 7.9295, "step": 513 }, { "epoch": 0.37226145210936085, "grad_norm": 0.48286962509155273, "learning_rate": 0.0001404013482846863, "loss": 7.8609, "step": 514 }, { "epoch": 0.3729856961796125, "grad_norm": 0.49540573358535767, "learning_rate": 0.00014019163033454843, "loss": 7.8966, "step": 515 }, { "epoch": 0.3737099402498642, "grad_norm": 0.471544474363327, "learning_rate": 0.00013998170134660169, "loss": 7.9148, "step": 516 }, { "epoch": 0.3744341843201159, "grad_norm": 0.4708135426044464, "learning_rate": 0.0001397715624231391, "loss": 7.882, "step": 517 }, { "epoch": 0.37515842839036756, "grad_norm": 0.5138322710990906, "learning_rate": 0.0001395612146675561, "loss": 7.8323, "step": 518 }, { "epoch": 0.37588267246061924, "grad_norm": 0.5133697390556335, "learning_rate": 0.00013935065918434445, "loss": 7.867, "step": 519 }, { "epoch": 0.3766069165308709, "grad_norm": 0.44749224185943604, "learning_rate": 0.00013913989707908683, "loss": 7.8333, "step": 520 }, { "epoch": 0.3773311606011226, "grad_norm": 0.47725266218185425, "learning_rate": 0.00013892892945845077, "loss": 7.8102, "step": 521 }, { "epoch": 0.3780554046713743, "grad_norm": 0.4651695787906647, "learning_rate": 0.00013871775743018293, "loss": 7.9104, "step": 522 }, { "epoch": 0.37877964874162595, "grad_norm": 0.4996732771396637, "learning_rate": 0.0001385063821031033, "loss": 7.8928, "step": 523 }, { "epoch": 0.37950389281187763, "grad_norm": 0.465753436088562, "learning_rate": 0.00013829480458709927, "loss": 7.8479, "step": 524 }, { "epoch": 0.38022813688212925, "grad_norm": 0.4831571877002716, "learning_rate": 0.00013808302599312, "loss": 7.8302, "step": 525 }, { "epoch": 0.38095238095238093, "grad_norm": 0.513978123664856, "learning_rate": 0.0001378710474331704, "loss": 7.8111, "step": 526 }, { "epoch": 0.3816766250226326, "grad_norm": 0.44832077622413635, "learning_rate": 0.00013765887002030529, "loss": 7.9085, "step": 527 }, { "epoch": 0.3824008690928843, "grad_norm": 0.46730518341064453, "learning_rate": 0.00013744649486862378, "loss": 7.8403, "step": 528 }, { "epoch": 0.38312511316313597, "grad_norm": 0.47098594903945923, "learning_rate": 0.00013723392309326316, "loss": 7.8301, "step": 529 }, { "epoch": 0.38384935723338764, "grad_norm": 0.46994248032569885, "learning_rate": 0.00013702115581039313, "loss": 7.8654, "step": 530 }, { "epoch": 0.3845736013036393, "grad_norm": 0.46916016936302185, "learning_rate": 0.00013680819413721003, "loss": 7.8193, "step": 531 }, { "epoch": 0.385297845373891, "grad_norm": 0.48833534121513367, "learning_rate": 0.00013659503919193089, "loss": 7.7667, "step": 532 }, { "epoch": 0.3860220894441427, "grad_norm": 0.444770485162735, "learning_rate": 0.00013638169209378756, "loss": 7.8987, "step": 533 }, { "epoch": 0.38674633351439436, "grad_norm": 0.46796807646751404, "learning_rate": 0.00013616815396302081, "loss": 7.8328, "step": 534 }, { "epoch": 0.38747057758464604, "grad_norm": 0.49306872487068176, "learning_rate": 0.00013595442592087453, "loss": 7.8562, "step": 535 }, { "epoch": 0.3881948216548977, "grad_norm": 0.45947471261024475, "learning_rate": 0.00013574050908958976, "loss": 7.8148, "step": 536 }, { "epoch": 0.3889190657251494, "grad_norm": 0.48289933800697327, "learning_rate": 0.00013552640459239888, "loss": 7.8374, "step": 537 }, { "epoch": 0.38964330979540107, "grad_norm": 0.49065014719963074, "learning_rate": 0.00013531211355351962, "loss": 7.782, "step": 538 }, { "epoch": 0.39036755386565275, "grad_norm": 0.48593437671661377, "learning_rate": 0.00013509763709814923, "loss": 7.8095, "step": 539 }, { "epoch": 0.3910917979359044, "grad_norm": 0.4995581805706024, "learning_rate": 0.00013488297635245848, "loss": 7.8081, "step": 540 }, { "epoch": 0.39181604200615605, "grad_norm": 0.4646143317222595, "learning_rate": 0.0001346681324435859, "loss": 7.9134, "step": 541 }, { "epoch": 0.39254028607640773, "grad_norm": 0.49974748492240906, "learning_rate": 0.00013445310649963169, "loss": 7.8236, "step": 542 }, { "epoch": 0.3932645301466594, "grad_norm": 0.4993489384651184, "learning_rate": 0.00013423789964965194, "loss": 7.8141, "step": 543 }, { "epoch": 0.3939887742169111, "grad_norm": 0.4687751829624176, "learning_rate": 0.00013402251302365264, "loss": 7.8152, "step": 544 }, { "epoch": 0.39471301828716276, "grad_norm": 0.47237464785575867, "learning_rate": 0.00013380694775258367, "loss": 7.8618, "step": 545 }, { "epoch": 0.39543726235741444, "grad_norm": 0.4689088463783264, "learning_rate": 0.00013359120496833304, "loss": 7.897, "step": 546 }, { "epoch": 0.3961615064276661, "grad_norm": 0.4774571657180786, "learning_rate": 0.00013337528580372078, "loss": 7.8143, "step": 547 }, { "epoch": 0.3968857504979178, "grad_norm": 0.49935322999954224, "learning_rate": 0.00013315919139249307, "loss": 7.8004, "step": 548 }, { "epoch": 0.3976099945681695, "grad_norm": 0.5764620304107666, "learning_rate": 0.00013294292286931627, "loss": 7.8003, "step": 549 }, { "epoch": 0.39833423863842116, "grad_norm": 0.5829343795776367, "learning_rate": 0.00013272648136977092, "loss": 7.771, "step": 550 }, { "epoch": 0.39905848270867283, "grad_norm": 0.49663594365119934, "learning_rate": 0.00013250986803034598, "loss": 7.8275, "step": 551 }, { "epoch": 0.3997827267789245, "grad_norm": 0.4658512473106384, "learning_rate": 0.0001322930839884325, "loss": 7.8529, "step": 552 }, { "epoch": 0.4005069708491762, "grad_norm": 0.47856324911117554, "learning_rate": 0.000132076130382318, "loss": 7.8692, "step": 553 }, { "epoch": 0.40123121491942787, "grad_norm": 0.5036735534667969, "learning_rate": 0.00013185900835118025, "loss": 7.8315, "step": 554 }, { "epoch": 0.40195545898967955, "grad_norm": 0.4810890853404999, "learning_rate": 0.00013164171903508153, "loss": 7.8672, "step": 555 }, { "epoch": 0.40267970305993117, "grad_norm": 0.5102052092552185, "learning_rate": 0.00013142426357496225, "loss": 7.8202, "step": 556 }, { "epoch": 0.40340394713018285, "grad_norm": 0.4854986071586609, "learning_rate": 0.0001312066431126355, "loss": 7.8389, "step": 557 }, { "epoch": 0.40412819120043453, "grad_norm": 0.4611469805240631, "learning_rate": 0.0001309888587907805, "loss": 7.8226, "step": 558 }, { "epoch": 0.4048524352706862, "grad_norm": 0.49772679805755615, "learning_rate": 0.00013077091175293706, "loss": 7.8254, "step": 559 }, { "epoch": 0.4055766793409379, "grad_norm": 0.47026780247688293, "learning_rate": 0.00013055280314349928, "loss": 7.8621, "step": 560 }, { "epoch": 0.40630092341118956, "grad_norm": 0.4732389748096466, "learning_rate": 0.00013033453410770963, "loss": 7.877, "step": 561 }, { "epoch": 0.40702516748144124, "grad_norm": 0.47972285747528076, "learning_rate": 0.000130116105791653, "loss": 7.8747, "step": 562 }, { "epoch": 0.4077494115516929, "grad_norm": 0.48406916856765747, "learning_rate": 0.0001298975193422506, "loss": 7.8502, "step": 563 }, { "epoch": 0.4084736556219446, "grad_norm": 0.47579431533813477, "learning_rate": 0.0001296787759072539, "loss": 7.8294, "step": 564 }, { "epoch": 0.4091978996921963, "grad_norm": 0.48034512996673584, "learning_rate": 0.0001294598766352388, "loss": 7.8261, "step": 565 }, { "epoch": 0.40992214376244795, "grad_norm": 0.44753938913345337, "learning_rate": 0.00012924082267559939, "loss": 7.8905, "step": 566 }, { "epoch": 0.41064638783269963, "grad_norm": 0.4587608575820923, "learning_rate": 0.00012902161517854197, "loss": 7.8683, "step": 567 }, { "epoch": 0.4113706319029513, "grad_norm": 0.47434911131858826, "learning_rate": 0.00012880225529507912, "loss": 7.8635, "step": 568 }, { "epoch": 0.412094875973203, "grad_norm": 0.4594435691833496, "learning_rate": 0.00012858274417702344, "loss": 7.8942, "step": 569 }, { "epoch": 0.41281912004345467, "grad_norm": 0.4879729747772217, "learning_rate": 0.00012836308297698175, "loss": 7.8623, "step": 570 }, { "epoch": 0.41354336411370635, "grad_norm": 0.5010982155799866, "learning_rate": 0.00012814327284834886, "loss": 7.8099, "step": 571 }, { "epoch": 0.41426760818395797, "grad_norm": 0.47567158937454224, "learning_rate": 0.00012792331494530158, "loss": 7.8321, "step": 572 }, { "epoch": 0.41499185225420965, "grad_norm": 0.47433096170425415, "learning_rate": 0.00012770321042279264, "loss": 7.8615, "step": 573 }, { "epoch": 0.4157160963244613, "grad_norm": 0.47150593996047974, "learning_rate": 0.00012748296043654472, "loss": 7.8348, "step": 574 }, { "epoch": 0.416440340394713, "grad_norm": 0.43587684631347656, "learning_rate": 0.0001272625661430442, "loss": 7.8704, "step": 575 }, { "epoch": 0.4171645844649647, "grad_norm": 0.4814154803752899, "learning_rate": 0.00012704202869953521, "loss": 7.8232, "step": 576 }, { "epoch": 0.41788882853521636, "grad_norm": 0.4678634703159332, "learning_rate": 0.00012682134926401354, "loss": 7.8351, "step": 577 }, { "epoch": 0.41861307260546804, "grad_norm": 0.48871251940727234, "learning_rate": 0.00012660052899522058, "loss": 7.7851, "step": 578 }, { "epoch": 0.4193373166757197, "grad_norm": 0.48269665241241455, "learning_rate": 0.00012637956905263718, "loss": 7.8597, "step": 579 }, { "epoch": 0.4200615607459714, "grad_norm": 0.46834123134613037, "learning_rate": 0.00012615847059647752, "loss": 7.8509, "step": 580 }, { "epoch": 0.4207858048162231, "grad_norm": 0.47934436798095703, "learning_rate": 0.00012593723478768323, "loss": 7.8651, "step": 581 }, { "epoch": 0.42151004888647475, "grad_norm": 0.45488452911376953, "learning_rate": 0.00012571586278791705, "loss": 7.8629, "step": 582 }, { "epoch": 0.42223429295672643, "grad_norm": 0.4674306809902191, "learning_rate": 0.00012549435575955683, "loss": 7.8591, "step": 583 }, { "epoch": 0.4229585370269781, "grad_norm": 0.47342684864997864, "learning_rate": 0.0001252727148656895, "loss": 7.7871, "step": 584 }, { "epoch": 0.4236827810972298, "grad_norm": 0.49287477135658264, "learning_rate": 0.0001250509412701048, "loss": 7.8044, "step": 585 }, { "epoch": 0.42440702516748147, "grad_norm": 0.47948122024536133, "learning_rate": 0.00012482903613728928, "loss": 7.872, "step": 586 }, { "epoch": 0.4251312692377331, "grad_norm": 0.46859344840049744, "learning_rate": 0.00012460700063242027, "loss": 7.8114, "step": 587 }, { "epoch": 0.42585551330798477, "grad_norm": 0.49053072929382324, "learning_rate": 0.00012438483592135948, "loss": 7.7944, "step": 588 }, { "epoch": 0.42657975737823645, "grad_norm": 0.5255662202835083, "learning_rate": 0.00012416254317064714, "loss": 7.7864, "step": 589 }, { "epoch": 0.4273040014484881, "grad_norm": 0.5337214469909668, "learning_rate": 0.00012394012354749584, "loss": 7.8267, "step": 590 }, { "epoch": 0.4280282455187398, "grad_norm": 0.48263901472091675, "learning_rate": 0.0001237175782197843, "loss": 7.7757, "step": 591 }, { "epoch": 0.4287524895889915, "grad_norm": 0.468227356672287, "learning_rate": 0.00012349490835605127, "loss": 7.8892, "step": 592 }, { "epoch": 0.42947673365924316, "grad_norm": 0.4746133089065552, "learning_rate": 0.00012327211512548945, "loss": 7.8251, "step": 593 }, { "epoch": 0.43020097772949484, "grad_norm": 0.4931454062461853, "learning_rate": 0.00012304919969793928, "loss": 7.7892, "step": 594 }, { "epoch": 0.4309252217997465, "grad_norm": 0.44330355525016785, "learning_rate": 0.00012282616324388283, "loss": 7.8673, "step": 595 }, { "epoch": 0.4316494658699982, "grad_norm": 0.5240752100944519, "learning_rate": 0.00012260300693443777, "loss": 7.8191, "step": 596 }, { "epoch": 0.4323737099402499, "grad_norm": 0.5226469039916992, "learning_rate": 0.00012237973194135086, "loss": 7.799, "step": 597 }, { "epoch": 0.43309795401050155, "grad_norm": 0.5419146418571472, "learning_rate": 0.00012215633943699232, "loss": 7.753, "step": 598 }, { "epoch": 0.43382219808075323, "grad_norm": 0.5095767974853516, "learning_rate": 0.00012193283059434918, "loss": 7.899, "step": 599 }, { "epoch": 0.4345464421510049, "grad_norm": 0.5662295818328857, "learning_rate": 0.0001217092065870195, "loss": 7.8183, "step": 600 }, { "epoch": 0.4352706862212566, "grad_norm": 0.46471965312957764, "learning_rate": 0.00012148546858920591, "loss": 7.8282, "step": 601 }, { "epoch": 0.43599493029150826, "grad_norm": 0.5106796622276306, "learning_rate": 0.00012126161777570967, "loss": 7.8318, "step": 602 }, { "epoch": 0.4367191743617599, "grad_norm": 0.48581328988075256, "learning_rate": 0.00012103765532192437, "loss": 7.9178, "step": 603 }, { "epoch": 0.43744341843201157, "grad_norm": 0.4774625301361084, "learning_rate": 0.00012081358240382983, "loss": 7.861, "step": 604 }, { "epoch": 0.43816766250226324, "grad_norm": 0.4969928562641144, "learning_rate": 0.00012058940019798588, "loss": 7.8667, "step": 605 }, { "epoch": 0.4388919065725149, "grad_norm": 0.4804958403110504, "learning_rate": 0.00012036510988152618, "loss": 7.8884, "step": 606 }, { "epoch": 0.4396161506427666, "grad_norm": 0.4955686330795288, "learning_rate": 0.0001201407126321521, "loss": 7.8276, "step": 607 }, { "epoch": 0.4403403947130183, "grad_norm": 0.49440160393714905, "learning_rate": 0.00011991620962812638, "loss": 7.8653, "step": 608 }, { "epoch": 0.44106463878326996, "grad_norm": 0.49569594860076904, "learning_rate": 0.00011969160204826721, "loss": 7.8461, "step": 609 }, { "epoch": 0.44178888285352164, "grad_norm": 0.4727310240268707, "learning_rate": 0.00011946689107194182, "loss": 7.8543, "step": 610 }, { "epoch": 0.4425131269237733, "grad_norm": 0.4789380729198456, "learning_rate": 0.00011924207787906032, "loss": 7.8264, "step": 611 }, { "epoch": 0.443237370994025, "grad_norm": 0.4611952006816864, "learning_rate": 0.00011901716365006956, "loss": 7.868, "step": 612 }, { "epoch": 0.44396161506427667, "grad_norm": 0.4959096312522888, "learning_rate": 0.00011879214956594693, "loss": 7.8141, "step": 613 }, { "epoch": 0.44468585913452835, "grad_norm": 0.4807751178741455, "learning_rate": 0.00011856703680819414, "loss": 7.8085, "step": 614 }, { "epoch": 0.44541010320478003, "grad_norm": 0.4968527555465698, "learning_rate": 0.00011834182655883097, "loss": 7.8014, "step": 615 }, { "epoch": 0.4461343472750317, "grad_norm": 0.4878495931625366, "learning_rate": 0.00011811652000038915, "loss": 7.7984, "step": 616 }, { "epoch": 0.4468585913452834, "grad_norm": 0.4903671443462372, "learning_rate": 0.00011789111831590606, "loss": 7.862, "step": 617 }, { "epoch": 0.44758283541553506, "grad_norm": 0.4804092347621918, "learning_rate": 0.00011766562268891864, "loss": 7.798, "step": 618 }, { "epoch": 0.4483070794857867, "grad_norm": 0.5215314626693726, "learning_rate": 0.00011744003430345705, "loss": 7.7989, "step": 619 }, { "epoch": 0.44903132355603836, "grad_norm": 0.49652373790740967, "learning_rate": 0.00011721435434403849, "loss": 7.8358, "step": 620 }, { "epoch": 0.44975556762629004, "grad_norm": 0.4879864454269409, "learning_rate": 0.00011698858399566109, "loss": 7.8349, "step": 621 }, { "epoch": 0.4504798116965417, "grad_norm": 0.4709030091762543, "learning_rate": 0.00011676272444379748, "loss": 7.7974, "step": 622 }, { "epoch": 0.4512040557667934, "grad_norm": 0.47246500849723816, "learning_rate": 0.00011653677687438874, "loss": 7.8776, "step": 623 }, { "epoch": 0.4519282998370451, "grad_norm": 0.46343374252319336, "learning_rate": 0.00011631074247383808, "loss": 7.8622, "step": 624 }, { "epoch": 0.45265254390729676, "grad_norm": 0.4682675898075104, "learning_rate": 0.00011608462242900471, "loss": 7.8559, "step": 625 }, { "epoch": 0.45337678797754843, "grad_norm": 0.4654393494129181, "learning_rate": 0.00011585841792719741, "loss": 7.8423, "step": 626 }, { "epoch": 0.4541010320478001, "grad_norm": 0.44032153487205505, "learning_rate": 0.00011563213015616856, "loss": 7.8894, "step": 627 }, { "epoch": 0.4548252761180518, "grad_norm": 0.4499657452106476, "learning_rate": 0.0001154057603041077, "loss": 7.8771, "step": 628 }, { "epoch": 0.45554952018830347, "grad_norm": 0.4855179190635681, "learning_rate": 0.0001151793095596354, "loss": 7.8613, "step": 629 }, { "epoch": 0.45627376425855515, "grad_norm": 0.4786494970321655, "learning_rate": 0.00011495277911179694, "loss": 7.8229, "step": 630 }, { "epoch": 0.4569980083288068, "grad_norm": 0.48533812165260315, "learning_rate": 0.00011472617015005609, "loss": 7.8304, "step": 631 }, { "epoch": 0.4577222523990585, "grad_norm": 0.4566459059715271, "learning_rate": 0.00011449948386428894, "loss": 7.8429, "step": 632 }, { "epoch": 0.4584464964693102, "grad_norm": 0.4726155400276184, "learning_rate": 0.00011427272144477757, "loss": 7.7996, "step": 633 }, { "epoch": 0.4591707405395618, "grad_norm": 0.44978711009025574, "learning_rate": 0.0001140458840822038, "loss": 7.8547, "step": 634 }, { "epoch": 0.4598949846098135, "grad_norm": 0.503386378288269, "learning_rate": 0.00011381897296764296, "loss": 7.8874, "step": 635 }, { "epoch": 0.46061922868006516, "grad_norm": 0.4718452990055084, "learning_rate": 0.0001135919892925577, "loss": 7.7894, "step": 636 }, { "epoch": 0.46134347275031684, "grad_norm": 0.47960880398750305, "learning_rate": 0.00011336493424879158, "loss": 7.8652, "step": 637 }, { "epoch": 0.4620677168205685, "grad_norm": 0.48030349612236023, "learning_rate": 0.00011313780902856294, "loss": 7.8594, "step": 638 }, { "epoch": 0.4627919608908202, "grad_norm": 0.448631227016449, "learning_rate": 0.00011291061482445861, "loss": 7.9241, "step": 639 }, { "epoch": 0.4635162049610719, "grad_norm": 0.5188454985618591, "learning_rate": 0.00011268335282942765, "loss": 7.7786, "step": 640 }, { "epoch": 0.46424044903132355, "grad_norm": 0.47939813137054443, "learning_rate": 0.00011245602423677506, "loss": 7.8136, "step": 641 }, { "epoch": 0.46496469310157523, "grad_norm": 0.5121312737464905, "learning_rate": 0.00011222863024015551, "loss": 7.91, "step": 642 }, { "epoch": 0.4656889371718269, "grad_norm": 0.49285969138145447, "learning_rate": 0.00011200117203356715, "loss": 7.8379, "step": 643 }, { "epoch": 0.4664131812420786, "grad_norm": 0.4866684079170227, "learning_rate": 0.0001117736508113452, "loss": 7.8206, "step": 644 }, { "epoch": 0.46713742531233027, "grad_norm": 0.46585652232170105, "learning_rate": 0.00011154606776815587, "loss": 7.8355, "step": 645 }, { "epoch": 0.46786166938258195, "grad_norm": 0.49136561155319214, "learning_rate": 0.00011131842409898982, "loss": 7.8392, "step": 646 }, { "epoch": 0.4685859134528336, "grad_norm": 0.4672096073627472, "learning_rate": 0.00011109072099915625, "loss": 7.8878, "step": 647 }, { "epoch": 0.4693101575230853, "grad_norm": 0.5394557118415833, "learning_rate": 0.00011086295966427622, "loss": 7.819, "step": 648 }, { "epoch": 0.470034401593337, "grad_norm": 0.4872957766056061, "learning_rate": 0.00011063514129027672, "loss": 7.8399, "step": 649 }, { "epoch": 0.4707586456635886, "grad_norm": 0.5272302627563477, "learning_rate": 0.00011040726707338416, "loss": 7.9755, "step": 650 }, { "epoch": 0.4714828897338403, "grad_norm": 0.4649381935596466, "learning_rate": 0.00011017933821011819, "loss": 7.8441, "step": 651 }, { "epoch": 0.47220713380409196, "grad_norm": 0.4975007474422455, "learning_rate": 0.0001099513558972854, "loss": 7.8123, "step": 652 }, { "epoch": 0.47293137787434364, "grad_norm": 0.4799993336200714, "learning_rate": 0.000109723321331973, "loss": 7.8462, "step": 653 }, { "epoch": 0.4736556219445953, "grad_norm": 0.46020984649658203, "learning_rate": 0.00010949523571154266, "loss": 7.8492, "step": 654 }, { "epoch": 0.474379866014847, "grad_norm": 0.48827457427978516, "learning_rate": 0.00010926710023362398, "loss": 7.8295, "step": 655 }, { "epoch": 0.4751041100850987, "grad_norm": 0.509655773639679, "learning_rate": 0.0001090389160961085, "loss": 7.8165, "step": 656 }, { "epoch": 0.47582835415535035, "grad_norm": 0.48283475637435913, "learning_rate": 0.00010881068449714315, "loss": 7.8341, "step": 657 }, { "epoch": 0.47655259822560203, "grad_norm": 0.4844430088996887, "learning_rate": 0.00010858240663512416, "loss": 7.8634, "step": 658 }, { "epoch": 0.4772768422958537, "grad_norm": 0.4722267985343933, "learning_rate": 0.00010835408370869063, "loss": 7.8826, "step": 659 }, { "epoch": 0.4780010863661054, "grad_norm": 0.4804382026195526, "learning_rate": 0.00010812571691671826, "loss": 7.8242, "step": 660 }, { "epoch": 0.47872533043635707, "grad_norm": 0.46896272897720337, "learning_rate": 0.00010789730745831312, "loss": 7.9247, "step": 661 }, { "epoch": 0.47944957450660874, "grad_norm": 0.5058935284614563, "learning_rate": 0.00010766885653280532, "loss": 7.8299, "step": 662 }, { "epoch": 0.4801738185768604, "grad_norm": 0.49389636516571045, "learning_rate": 0.00010744036533974267, "loss": 7.8462, "step": 663 }, { "epoch": 0.4808980626471121, "grad_norm": 0.47579723596572876, "learning_rate": 0.00010721183507888442, "loss": 7.8666, "step": 664 }, { "epoch": 0.4816223067173637, "grad_norm": 0.47958090901374817, "learning_rate": 0.00010698326695019496, "loss": 7.8241, "step": 665 }, { "epoch": 0.4823465507876154, "grad_norm": 0.5065767765045166, "learning_rate": 0.00010675466215383758, "loss": 7.8085, "step": 666 }, { "epoch": 0.4830707948578671, "grad_norm": 0.4563339352607727, "learning_rate": 0.00010652602189016799, "loss": 7.8053, "step": 667 }, { "epoch": 0.48379503892811876, "grad_norm": 0.4857904314994812, "learning_rate": 0.00010629734735972818, "loss": 7.8281, "step": 668 }, { "epoch": 0.48451928299837044, "grad_norm": 0.4706745743751526, "learning_rate": 0.00010606863976324015, "loss": 7.8073, "step": 669 }, { "epoch": 0.4852435270686221, "grad_norm": 0.49210935831069946, "learning_rate": 0.00010583990030159939, "loss": 7.8297, "step": 670 }, { "epoch": 0.4859677711388738, "grad_norm": 0.4665246307849884, "learning_rate": 0.00010561113017586878, "loss": 7.8966, "step": 671 }, { "epoch": 0.4866920152091255, "grad_norm": 0.47893619537353516, "learning_rate": 0.00010538233058727225, "loss": 7.8491, "step": 672 }, { "epoch": 0.48741625927937715, "grad_norm": 0.4599825441837311, "learning_rate": 0.00010515350273718829, "loss": 7.8459, "step": 673 }, { "epoch": 0.48814050334962883, "grad_norm": 0.47248682379722595, "learning_rate": 0.00010492464782714395, "loss": 7.8785, "step": 674 }, { "epoch": 0.4888647474198805, "grad_norm": 0.46690088510513306, "learning_rate": 0.00010469576705880826, "loss": 7.8327, "step": 675 }, { "epoch": 0.4895889914901322, "grad_norm": 0.48305100202560425, "learning_rate": 0.00010446686163398604, "loss": 7.8324, "step": 676 }, { "epoch": 0.49031323556038386, "grad_norm": 0.47751349210739136, "learning_rate": 0.00010423793275461162, "loss": 7.8515, "step": 677 }, { "epoch": 0.49103747963063554, "grad_norm": 0.46247628331184387, "learning_rate": 0.00010400898162274248, "loss": 7.8304, "step": 678 }, { "epoch": 0.4917617237008872, "grad_norm": 0.4933788478374481, "learning_rate": 0.00010378000944055291, "loss": 7.7668, "step": 679 }, { "epoch": 0.4924859677711389, "grad_norm": 0.4568544626235962, "learning_rate": 0.00010355101741032771, "loss": 7.8139, "step": 680 }, { "epoch": 0.4932102118413905, "grad_norm": 0.5135279297828674, "learning_rate": 0.000103322006734456, "loss": 7.8382, "step": 681 }, { "epoch": 0.4939344559116422, "grad_norm": 0.4543817639350891, "learning_rate": 0.00010309297861542468, "loss": 7.8125, "step": 682 }, { "epoch": 0.4946586999818939, "grad_norm": 0.48243218660354614, "learning_rate": 0.00010286393425581231, "loss": 7.8269, "step": 683 }, { "epoch": 0.49538294405214556, "grad_norm": 0.4711065888404846, "learning_rate": 0.00010263487485828271, "loss": 7.881, "step": 684 }, { "epoch": 0.49610718812239724, "grad_norm": 0.48097798228263855, "learning_rate": 0.0001024058016255787, "loss": 7.8061, "step": 685 }, { "epoch": 0.4968314321926489, "grad_norm": 0.4807778000831604, "learning_rate": 0.00010217671576051564, "loss": 7.8052, "step": 686 }, { "epoch": 0.4975556762629006, "grad_norm": 0.48832613229751587, "learning_rate": 0.00010194761846597534, "loss": 7.814, "step": 687 }, { "epoch": 0.49827992033315227, "grad_norm": 0.4787043035030365, "learning_rate": 0.00010171851094489957, "loss": 7.829, "step": 688 }, { "epoch": 0.49900416440340395, "grad_norm": 0.4968035817146301, "learning_rate": 0.0001014893944002838, "loss": 7.7458, "step": 689 }, { "epoch": 0.4997284084736556, "grad_norm": 0.458019495010376, "learning_rate": 0.00010126027003517086, "loss": 7.8039, "step": 690 }, { "epoch": 0.5004526525439073, "grad_norm": 0.4970747232437134, "learning_rate": 0.0001010311390526447, "loss": 7.7751, "step": 691 }, { "epoch": 0.501176896614159, "grad_norm": 0.45407259464263916, "learning_rate": 0.00010080200265582394, "loss": 7.836, "step": 692 }, { "epoch": 0.501176896614159, "eval_loss": 7.830474376678467, "eval_runtime": 4.779, "eval_samples_per_second": 243.355, "eval_steps_per_second": 121.782, "step": 692 }, { "epoch": 0.5019011406844106, "grad_norm": 0.5260797142982483, "learning_rate": 0.0001005728620478557, "loss": 7.7982, "step": 693 }, { "epoch": 0.5026253847546623, "grad_norm": 0.4505165219306946, "learning_rate": 0.00010034371843190915, "loss": 7.8403, "step": 694 }, { "epoch": 0.503349628824914, "grad_norm": 0.5103182196617126, "learning_rate": 0.00010011457301116933, "loss": 7.7234, "step": 695 }, { "epoch": 0.5040738728951657, "grad_norm": 0.5652449727058411, "learning_rate": 9.988542698883068e-05, "loss": 7.8412, "step": 696 }, { "epoch": 0.5047981169654173, "grad_norm": 0.5191313624382019, "learning_rate": 9.965628156809087e-05, "loss": 7.7998, "step": 697 }, { "epoch": 0.505522361035669, "grad_norm": 0.4937920570373535, "learning_rate": 9.94271379521443e-05, "loss": 7.8072, "step": 698 }, { "epoch": 0.5062466051059207, "grad_norm": 0.547224760055542, "learning_rate": 9.919799734417608e-05, "loss": 7.8387, "step": 699 }, { "epoch": 0.5069708491761724, "grad_norm": 0.5999129414558411, "learning_rate": 9.896886094735535e-05, "loss": 7.8445, "step": 700 }, { "epoch": 0.507695093246424, "grad_norm": 0.5010003447532654, "learning_rate": 9.873972996482916e-05, "loss": 7.8172, "step": 701 }, { "epoch": 0.5084193373166758, "grad_norm": 0.48822104930877686, "learning_rate": 9.851060559971624e-05, "loss": 7.8536, "step": 702 }, { "epoch": 0.5091435813869274, "grad_norm": 0.5012331604957581, "learning_rate": 9.828148905510044e-05, "loss": 7.8445, "step": 703 }, { "epoch": 0.5098678254571791, "grad_norm": 0.4834780693054199, "learning_rate": 9.805238153402469e-05, "loss": 7.822, "step": 704 }, { "epoch": 0.5105920695274307, "grad_norm": 0.4714999496936798, "learning_rate": 9.782328423948435e-05, "loss": 7.877, "step": 705 }, { "epoch": 0.5113163135976824, "grad_norm": 0.46891549229621887, "learning_rate": 9.759419837442134e-05, "loss": 7.8555, "step": 706 }, { "epoch": 0.5120405576679341, "grad_norm": 0.4694564938545227, "learning_rate": 9.736512514171732e-05, "loss": 7.8477, "step": 707 }, { "epoch": 0.5127648017381857, "grad_norm": 0.48359888792037964, "learning_rate": 9.71360657441877e-05, "loss": 7.8765, "step": 708 }, { "epoch": 0.5134890458084375, "grad_norm": 0.47419044375419617, "learning_rate": 9.690702138457534e-05, "loss": 7.8814, "step": 709 }, { "epoch": 0.5142132898786891, "grad_norm": 0.47822949290275574, "learning_rate": 9.667799326554401e-05, "loss": 7.8609, "step": 710 }, { "epoch": 0.5149375339489408, "grad_norm": 0.4817030429840088, "learning_rate": 9.644898258967232e-05, "loss": 7.8694, "step": 711 }, { "epoch": 0.5156617780191924, "grad_norm": 0.4727485775947571, "learning_rate": 9.62199905594471e-05, "loss": 7.903, "step": 712 }, { "epoch": 0.5163860220894442, "grad_norm": 0.5098878741264343, "learning_rate": 9.599101837725753e-05, "loss": 7.8711, "step": 713 }, { "epoch": 0.5171102661596958, "grad_norm": 0.449669748544693, "learning_rate": 9.576206724538837e-05, "loss": 7.868, "step": 714 }, { "epoch": 0.5178345102299475, "grad_norm": 0.4713840186595917, "learning_rate": 9.553313836601398e-05, "loss": 7.8244, "step": 715 }, { "epoch": 0.5185587543001992, "grad_norm": 0.4921199381351471, "learning_rate": 9.53042329411918e-05, "loss": 7.8841, "step": 716 }, { "epoch": 0.5192829983704509, "grad_norm": 0.47159916162490845, "learning_rate": 9.507535217285607e-05, "loss": 7.8828, "step": 717 }, { "epoch": 0.5200072424407025, "grad_norm": 0.48348769545555115, "learning_rate": 9.484649726281173e-05, "loss": 7.8346, "step": 718 }, { "epoch": 0.5207314865109542, "grad_norm": 0.48519793152809143, "learning_rate": 9.461766941272778e-05, "loss": 7.8751, "step": 719 }, { "epoch": 0.5214557305812059, "grad_norm": 0.4665411412715912, "learning_rate": 9.438886982413124e-05, "loss": 7.9164, "step": 720 }, { "epoch": 0.5221799746514575, "grad_norm": 0.4838716387748718, "learning_rate": 9.416009969840061e-05, "loss": 7.8461, "step": 721 }, { "epoch": 0.5229042187217092, "grad_norm": 0.4719375669956207, "learning_rate": 9.393136023675988e-05, "loss": 7.8157, "step": 722 }, { "epoch": 0.5236284627919608, "grad_norm": 0.5168206691741943, "learning_rate": 9.370265264027185e-05, "loss": 7.8118, "step": 723 }, { "epoch": 0.5243527068622126, "grad_norm": 0.466214656829834, "learning_rate": 9.347397810983204e-05, "loss": 7.7755, "step": 724 }, { "epoch": 0.5250769509324642, "grad_norm": 0.5822585821151733, "learning_rate": 9.324533784616247e-05, "loss": 7.85, "step": 725 }, { "epoch": 0.5258011950027159, "grad_norm": 0.48217180371284485, "learning_rate": 9.301673304980504e-05, "loss": 7.8665, "step": 726 }, { "epoch": 0.5265254390729676, "grad_norm": 0.5124320983886719, "learning_rate": 9.278816492111562e-05, "loss": 7.8112, "step": 727 }, { "epoch": 0.5272496831432193, "grad_norm": 0.4656013548374176, "learning_rate": 9.255963466025735e-05, "loss": 7.8952, "step": 728 }, { "epoch": 0.5279739272134709, "grad_norm": 0.4809459447860718, "learning_rate": 9.233114346719472e-05, "loss": 7.8659, "step": 729 }, { "epoch": 0.5286981712837227, "grad_norm": 0.46756666898727417, "learning_rate": 9.21026925416869e-05, "loss": 7.8359, "step": 730 }, { "epoch": 0.5294224153539743, "grad_norm": 0.503579318523407, "learning_rate": 9.187428308328176e-05, "loss": 7.8012, "step": 731 }, { "epoch": 0.530146659424226, "grad_norm": 0.46779781579971313, "learning_rate": 9.164591629130941e-05, "loss": 7.8359, "step": 732 }, { "epoch": 0.5308709034944776, "grad_norm": 0.4926343560218811, "learning_rate": 9.141759336487584e-05, "loss": 7.8221, "step": 733 }, { "epoch": 0.5315951475647294, "grad_norm": 0.464432954788208, "learning_rate": 9.118931550285687e-05, "loss": 7.8713, "step": 734 }, { "epoch": 0.532319391634981, "grad_norm": 0.49543848633766174, "learning_rate": 9.096108390389151e-05, "loss": 7.8084, "step": 735 }, { "epoch": 0.5330436357052326, "grad_norm": 0.45875051617622375, "learning_rate": 9.073289976637603e-05, "loss": 7.8152, "step": 736 }, { "epoch": 0.5337678797754843, "grad_norm": 0.4713042974472046, "learning_rate": 9.050476428845739e-05, "loss": 7.7912, "step": 737 }, { "epoch": 0.534492123845736, "grad_norm": 0.4827256500720978, "learning_rate": 9.027667866802701e-05, "loss": 7.8026, "step": 738 }, { "epoch": 0.5352163679159877, "grad_norm": 0.45872411131858826, "learning_rate": 9.004864410271462e-05, "loss": 7.8695, "step": 739 }, { "epoch": 0.5359406119862393, "grad_norm": 0.4700596034526825, "learning_rate": 8.982066178988182e-05, "loss": 7.8606, "step": 740 }, { "epoch": 0.5366648560564911, "grad_norm": 0.45632404088974, "learning_rate": 8.959273292661586e-05, "loss": 7.7807, "step": 741 }, { "epoch": 0.5373891001267427, "grad_norm": 0.4901478886604309, "learning_rate": 8.936485870972328e-05, "loss": 7.8481, "step": 742 }, { "epoch": 0.5381133441969944, "grad_norm": 0.4826202392578125, "learning_rate": 8.913704033572379e-05, "loss": 7.7886, "step": 743 }, { "epoch": 0.538837588267246, "grad_norm": 0.5499749779701233, "learning_rate": 8.89092790008438e-05, "loss": 7.8111, "step": 744 }, { "epoch": 0.5395618323374978, "grad_norm": 0.4294477105140686, "learning_rate": 8.868157590101019e-05, "loss": 7.8344, "step": 745 }, { "epoch": 0.5402860764077494, "grad_norm": 0.4754398763179779, "learning_rate": 8.845393223184418e-05, "loss": 7.9187, "step": 746 }, { "epoch": 0.5410103204780011, "grad_norm": 0.526314377784729, "learning_rate": 8.822634918865482e-05, "loss": 7.7336, "step": 747 }, { "epoch": 0.5417345645482528, "grad_norm": 0.5151805281639099, "learning_rate": 8.799882796643288e-05, "loss": 7.7485, "step": 748 }, { "epoch": 0.5424588086185045, "grad_norm": 0.56130450963974, "learning_rate": 8.777136975984448e-05, "loss": 7.8075, "step": 749 }, { "epoch": 0.5431830526887561, "grad_norm": 0.6159570813179016, "learning_rate": 8.754397576322498e-05, "loss": 7.7995, "step": 750 }, { "epoch": 0.5439072967590078, "grad_norm": 0.49635598063468933, "learning_rate": 8.731664717057236e-05, "loss": 7.8466, "step": 751 }, { "epoch": 0.5446315408292595, "grad_norm": 0.49500927329063416, "learning_rate": 8.708938517554141e-05, "loss": 7.8124, "step": 752 }, { "epoch": 0.5453557848995111, "grad_norm": 0.48374736309051514, "learning_rate": 8.68621909714371e-05, "loss": 7.8808, "step": 753 }, { "epoch": 0.5460800289697628, "grad_norm": 0.4718135893344879, "learning_rate": 8.663506575120845e-05, "loss": 7.8721, "step": 754 }, { "epoch": 0.5468042730400144, "grad_norm": 0.49202045798301697, "learning_rate": 8.640801070744233e-05, "loss": 7.8248, "step": 755 }, { "epoch": 0.5475285171102662, "grad_norm": 0.4670674502849579, "learning_rate": 8.618102703235702e-05, "loss": 7.8563, "step": 756 }, { "epoch": 0.5482527611805178, "grad_norm": 0.5087535381317139, "learning_rate": 8.59541159177962e-05, "loss": 7.8356, "step": 757 }, { "epoch": 0.5489770052507695, "grad_norm": 0.4710357189178467, "learning_rate": 8.572727855522243e-05, "loss": 7.8835, "step": 758 }, { "epoch": 0.5497012493210212, "grad_norm": 0.48206937313079834, "learning_rate": 8.550051613571108e-05, "loss": 7.8386, "step": 759 }, { "epoch": 0.5504254933912729, "grad_norm": 0.4598138630390167, "learning_rate": 8.527382984994394e-05, "loss": 7.9155, "step": 760 }, { "epoch": 0.5511497374615245, "grad_norm": 0.46072736382484436, "learning_rate": 8.504722088820309e-05, "loss": 7.9004, "step": 761 }, { "epoch": 0.5518739815317762, "grad_norm": 0.47069671750068665, "learning_rate": 8.482069044036462e-05, "loss": 7.8558, "step": 762 }, { "epoch": 0.5525982256020279, "grad_norm": 0.48600372672080994, "learning_rate": 8.459423969589228e-05, "loss": 7.8204, "step": 763 }, { "epoch": 0.5533224696722796, "grad_norm": 0.456483393907547, "learning_rate": 8.436786984383146e-05, "loss": 7.8135, "step": 764 }, { "epoch": 0.5540467137425312, "grad_norm": 0.4625873565673828, "learning_rate": 8.414158207280259e-05, "loss": 7.8563, "step": 765 }, { "epoch": 0.554770957812783, "grad_norm": 0.47437793016433716, "learning_rate": 8.391537757099533e-05, "loss": 7.8649, "step": 766 }, { "epoch": 0.5554952018830346, "grad_norm": 0.48973995447158813, "learning_rate": 8.368925752616194e-05, "loss": 7.8504, "step": 767 }, { "epoch": 0.5562194459532862, "grad_norm": 0.49295151233673096, "learning_rate": 8.346322312561127e-05, "loss": 7.8525, "step": 768 }, { "epoch": 0.5569436900235379, "grad_norm": 0.4774750769138336, "learning_rate": 8.323727555620256e-05, "loss": 7.8677, "step": 769 }, { "epoch": 0.5576679340937896, "grad_norm": 0.4751247465610504, "learning_rate": 8.301141600433891e-05, "loss": 7.8658, "step": 770 }, { "epoch": 0.5583921781640413, "grad_norm": 0.48602089285850525, "learning_rate": 8.278564565596152e-05, "loss": 7.8284, "step": 771 }, { "epoch": 0.5591164222342929, "grad_norm": 0.4605376124382019, "learning_rate": 8.255996569654296e-05, "loss": 7.8761, "step": 772 }, { "epoch": 0.5598406663045447, "grad_norm": 0.46493852138519287, "learning_rate": 8.233437731108139e-05, "loss": 7.8206, "step": 773 }, { "epoch": 0.5605649103747963, "grad_norm": 0.4919988512992859, "learning_rate": 8.210888168409399e-05, "loss": 7.7783, "step": 774 }, { "epoch": 0.561289154445048, "grad_norm": 0.4530481994152069, "learning_rate": 8.188347999961087e-05, "loss": 7.8679, "step": 775 }, { "epoch": 0.5620133985152996, "grad_norm": 0.4982988238334656, "learning_rate": 8.165817344116906e-05, "loss": 7.7636, "step": 776 }, { "epoch": 0.5627376425855514, "grad_norm": 0.49008551239967346, "learning_rate": 8.143296319180588e-05, "loss": 7.864, "step": 777 }, { "epoch": 0.563461886655803, "grad_norm": 0.4614430367946625, "learning_rate": 8.120785043405309e-05, "loss": 7.8705, "step": 778 }, { "epoch": 0.5641861307260547, "grad_norm": 0.46965253353118896, "learning_rate": 8.098283634993045e-05, "loss": 7.8487, "step": 779 }, { "epoch": 0.5649103747963063, "grad_norm": 0.4675351083278656, "learning_rate": 8.07579221209397e-05, "loss": 7.8998, "step": 780 }, { "epoch": 0.5656346188665581, "grad_norm": 0.5022285580635071, "learning_rate": 8.053310892805823e-05, "loss": 7.7928, "step": 781 }, { "epoch": 0.5663588629368097, "grad_norm": 0.46838775277137756, "learning_rate": 8.03083979517328e-05, "loss": 7.8427, "step": 782 }, { "epoch": 0.5670831070070613, "grad_norm": 0.501288652420044, "learning_rate": 8.008379037187366e-05, "loss": 7.7874, "step": 783 }, { "epoch": 0.5678073510773131, "grad_norm": 0.49566730856895447, "learning_rate": 7.985928736784794e-05, "loss": 7.7951, "step": 784 }, { "epoch": 0.5685315951475647, "grad_norm": 0.4716828167438507, "learning_rate": 7.963489011847385e-05, "loss": 7.7954, "step": 785 }, { "epoch": 0.5692558392178164, "grad_norm": 0.47677019238471985, "learning_rate": 7.941059980201413e-05, "loss": 7.8447, "step": 786 }, { "epoch": 0.569980083288068, "grad_norm": 0.4770268499851227, "learning_rate": 7.918641759617018e-05, "loss": 7.804, "step": 787 }, { "epoch": 0.5707043273583198, "grad_norm": 0.48790669441223145, "learning_rate": 7.896234467807561e-05, "loss": 7.7361, "step": 788 }, { "epoch": 0.5714285714285714, "grad_norm": 0.49754399061203003, "learning_rate": 7.873838222429035e-05, "loss": 7.7453, "step": 789 }, { "epoch": 0.5721528154988231, "grad_norm": 0.4493372142314911, "learning_rate": 7.851453141079413e-05, "loss": 7.8176, "step": 790 }, { "epoch": 0.5728770595690748, "grad_norm": 0.4799216091632843, "learning_rate": 7.829079341298051e-05, "loss": 7.814, "step": 791 }, { "epoch": 0.5736013036393265, "grad_norm": 0.4880140721797943, "learning_rate": 7.806716940565084e-05, "loss": 7.7657, "step": 792 }, { "epoch": 0.5743255477095781, "grad_norm": 0.46878695487976074, "learning_rate": 7.784366056300769e-05, "loss": 7.8544, "step": 793 }, { "epoch": 0.5750497917798298, "grad_norm": 0.5069106817245483, "learning_rate": 7.762026805864915e-05, "loss": 7.8384, "step": 794 }, { "epoch": 0.5757740358500815, "grad_norm": 0.521747350692749, "learning_rate": 7.739699306556228e-05, "loss": 7.7701, "step": 795 }, { "epoch": 0.5764982799203332, "grad_norm": 0.5047403573989868, "learning_rate": 7.717383675611718e-05, "loss": 7.8321, "step": 796 }, { "epoch": 0.5772225239905848, "grad_norm": 0.48073387145996094, "learning_rate": 7.695080030206076e-05, "loss": 7.7588, "step": 797 }, { "epoch": 0.5779467680608364, "grad_norm": 0.4945087730884552, "learning_rate": 7.672788487451058e-05, "loss": 7.8004, "step": 798 }, { "epoch": 0.5786710121310882, "grad_norm": 0.537676215171814, "learning_rate": 7.650509164394876e-05, "loss": 7.7014, "step": 799 }, { "epoch": 0.5793952562013398, "grad_norm": 0.5863990783691406, "learning_rate": 7.628242178021572e-05, "loss": 7.9051, "step": 800 }, { "epoch": 0.5801195002715915, "grad_norm": 0.5090913772583008, "learning_rate": 7.605987645250419e-05, "loss": 7.8515, "step": 801 }, { "epoch": 0.5808437443418432, "grad_norm": 0.507625162601471, "learning_rate": 7.583745682935286e-05, "loss": 7.8064, "step": 802 }, { "epoch": 0.5815679884120949, "grad_norm": 0.4949605166912079, "learning_rate": 7.561516407864055e-05, "loss": 7.7985, "step": 803 }, { "epoch": 0.5822922324823465, "grad_norm": 0.48158496618270874, "learning_rate": 7.539299936757978e-05, "loss": 7.8514, "step": 804 }, { "epoch": 0.5830164765525983, "grad_norm": 0.48431625962257385, "learning_rate": 7.517096386271072e-05, "loss": 7.8289, "step": 805 }, { "epoch": 0.5837407206228499, "grad_norm": 0.5053116679191589, "learning_rate": 7.494905872989522e-05, "loss": 7.8756, "step": 806 }, { "epoch": 0.5844649646931016, "grad_norm": 0.49544015526771545, "learning_rate": 7.472728513431051e-05, "loss": 7.8154, "step": 807 }, { "epoch": 0.5851892087633532, "grad_norm": 0.4702587425708771, "learning_rate": 7.450564424044318e-05, "loss": 7.8741, "step": 808 }, { "epoch": 0.585913452833605, "grad_norm": 0.47806039452552795, "learning_rate": 7.428413721208296e-05, "loss": 7.8546, "step": 809 }, { "epoch": 0.5866376969038566, "grad_norm": 0.49512267112731934, "learning_rate": 7.40627652123168e-05, "loss": 7.8856, "step": 810 }, { "epoch": 0.5873619409741083, "grad_norm": 0.5060679316520691, "learning_rate": 7.384152940352253e-05, "loss": 7.7994, "step": 811 }, { "epoch": 0.5880861850443599, "grad_norm": 0.5014981627464294, "learning_rate": 7.362043094736287e-05, "loss": 7.8225, "step": 812 }, { "epoch": 0.5888104291146117, "grad_norm": 0.5063635110855103, "learning_rate": 7.339947100477947e-05, "loss": 7.8719, "step": 813 }, { "epoch": 0.5895346731848633, "grad_norm": 0.4955052137374878, "learning_rate": 7.317865073598648e-05, "loss": 7.9004, "step": 814 }, { "epoch": 0.5902589172551149, "grad_norm": 0.4668295085430145, "learning_rate": 7.295797130046482e-05, "loss": 7.8558, "step": 815 }, { "epoch": 0.5909831613253667, "grad_norm": 0.47807928919792175, "learning_rate": 7.273743385695582e-05, "loss": 7.8619, "step": 816 }, { "epoch": 0.5917074053956183, "grad_norm": 0.48628920316696167, "learning_rate": 7.25170395634553e-05, "loss": 7.8668, "step": 817 }, { "epoch": 0.59243164946587, "grad_norm": 0.4596603214740753, "learning_rate": 7.229678957720738e-05, "loss": 7.8634, "step": 818 }, { "epoch": 0.5931558935361216, "grad_norm": 0.4702138900756836, "learning_rate": 7.207668505469844e-05, "loss": 7.8422, "step": 819 }, { "epoch": 0.5938801376063734, "grad_norm": 0.4690391421318054, "learning_rate": 7.185672715165119e-05, "loss": 7.8206, "step": 820 }, { "epoch": 0.594604381676625, "grad_norm": 0.5008434057235718, "learning_rate": 7.163691702301827e-05, "loss": 7.8105, "step": 821 }, { "epoch": 0.5953286257468767, "grad_norm": 0.5092424750328064, "learning_rate": 7.14172558229766e-05, "loss": 7.8332, "step": 822 }, { "epoch": 0.5960528698171284, "grad_norm": 0.4641984701156616, "learning_rate": 7.119774470492092e-05, "loss": 7.8529, "step": 823 }, { "epoch": 0.5967771138873801, "grad_norm": 0.4499223828315735, "learning_rate": 7.097838482145802e-05, "loss": 7.8605, "step": 824 }, { "epoch": 0.5975013579576317, "grad_norm": 0.47683218121528625, "learning_rate": 7.075917732440061e-05, "loss": 7.8361, "step": 825 }, { "epoch": 0.5982256020278834, "grad_norm": 0.48220375180244446, "learning_rate": 7.054012336476121e-05, "loss": 7.8493, "step": 826 }, { "epoch": 0.5989498460981351, "grad_norm": 0.5097104907035828, "learning_rate": 7.032122409274613e-05, "loss": 7.894, "step": 827 }, { "epoch": 0.5996740901683868, "grad_norm": 0.44973528385162354, "learning_rate": 7.010248065774943e-05, "loss": 7.8853, "step": 828 }, { "epoch": 0.6003983342386384, "grad_norm": 0.48968738317489624, "learning_rate": 6.988389420834703e-05, "loss": 7.8434, "step": 829 }, { "epoch": 0.60112257830889, "grad_norm": 0.4859766960144043, "learning_rate": 6.966546589229037e-05, "loss": 7.8775, "step": 830 }, { "epoch": 0.6018468223791418, "grad_norm": 0.5055286884307861, "learning_rate": 6.944719685650075e-05, "loss": 7.793, "step": 831 }, { "epoch": 0.6025710664493934, "grad_norm": 0.4732415974140167, "learning_rate": 6.922908824706295e-05, "loss": 7.8474, "step": 832 }, { "epoch": 0.6032953105196451, "grad_norm": 0.5133833885192871, "learning_rate": 6.90111412092195e-05, "loss": 7.7488, "step": 833 }, { "epoch": 0.6040195545898968, "grad_norm": 0.47122323513031006, "learning_rate": 6.879335688736454e-05, "loss": 7.8389, "step": 834 }, { "epoch": 0.6047437986601485, "grad_norm": 0.5096733570098877, "learning_rate": 6.857573642503776e-05, "loss": 7.791, "step": 835 }, { "epoch": 0.6054680427304001, "grad_norm": 0.4919492304325104, "learning_rate": 6.835828096491854e-05, "loss": 7.7837, "step": 836 }, { "epoch": 0.6061922868006518, "grad_norm": 0.48628994822502136, "learning_rate": 6.814099164881975e-05, "loss": 7.8275, "step": 837 }, { "epoch": 0.6069165308709035, "grad_norm": 0.5138300061225891, "learning_rate": 6.792386961768204e-05, "loss": 7.8181, "step": 838 }, { "epoch": 0.6076407749411552, "grad_norm": 0.4759021997451782, "learning_rate": 6.77069160115675e-05, "loss": 7.7909, "step": 839 }, { "epoch": 0.6083650190114068, "grad_norm": 0.5031505227088928, "learning_rate": 6.749013196965406e-05, "loss": 7.8561, "step": 840 }, { "epoch": 0.6090892630816586, "grad_norm": 0.45728060603141785, "learning_rate": 6.72735186302291e-05, "loss": 7.8218, "step": 841 }, { "epoch": 0.6098135071519102, "grad_norm": 0.4515739381313324, "learning_rate": 6.705707713068376e-05, "loss": 7.8055, "step": 842 }, { "epoch": 0.6105377512221619, "grad_norm": 0.507045567035675, "learning_rate": 6.684080860750697e-05, "loss": 7.8195, "step": 843 }, { "epoch": 0.6112619952924135, "grad_norm": 0.5178161263465881, "learning_rate": 6.662471419627924e-05, "loss": 7.8287, "step": 844 }, { "epoch": 0.6119862393626652, "grad_norm": 0.5024030804634094, "learning_rate": 6.640879503166698e-05, "loss": 7.7915, "step": 845 }, { "epoch": 0.6127104834329169, "grad_norm": 0.5156453847885132, "learning_rate": 6.619305224741632e-05, "loss": 7.8114, "step": 846 }, { "epoch": 0.6134347275031685, "grad_norm": 0.50360107421875, "learning_rate": 6.597748697634739e-05, "loss": 7.8968, "step": 847 }, { "epoch": 0.6141589715734203, "grad_norm": 0.5216660499572754, "learning_rate": 6.57621003503481e-05, "loss": 7.8246, "step": 848 }, { "epoch": 0.6148832156436719, "grad_norm": 0.48182037472724915, "learning_rate": 6.554689350036834e-05, "loss": 7.8687, "step": 849 }, { "epoch": 0.6156074597139236, "grad_norm": 0.5355033874511719, "learning_rate": 6.533186755641416e-05, "loss": 7.8434, "step": 850 }, { "epoch": 0.6163317037841752, "grad_norm": 0.49577125906944275, "learning_rate": 6.511702364754153e-05, "loss": 7.8567, "step": 851 }, { "epoch": 0.617055947854427, "grad_norm": 0.4947705566883087, "learning_rate": 6.49023629018508e-05, "loss": 7.9015, "step": 852 }, { "epoch": 0.6177801919246786, "grad_norm": 0.4933142364025116, "learning_rate": 6.468788644648039e-05, "loss": 7.8388, "step": 853 }, { "epoch": 0.6185044359949303, "grad_norm": 0.48275384306907654, "learning_rate": 6.447359540760114e-05, "loss": 7.8168, "step": 854 }, { "epoch": 0.619228680065182, "grad_norm": 0.4693118929862976, "learning_rate": 6.425949091041027e-05, "loss": 7.8784, "step": 855 }, { "epoch": 0.6199529241354337, "grad_norm": 0.49441099166870117, "learning_rate": 6.404557407912549e-05, "loss": 7.8285, "step": 856 }, { "epoch": 0.6206771682056853, "grad_norm": 0.4684365689754486, "learning_rate": 6.383184603697922e-05, "loss": 7.8689, "step": 857 }, { "epoch": 0.621401412275937, "grad_norm": 0.4639764130115509, "learning_rate": 6.361830790621246e-05, "loss": 7.8302, "step": 858 }, { "epoch": 0.6221256563461887, "grad_norm": 0.5014294385910034, "learning_rate": 6.340496080806914e-05, "loss": 7.8206, "step": 859 }, { "epoch": 0.6228499004164404, "grad_norm": 0.4852553904056549, "learning_rate": 6.319180586278996e-05, "loss": 7.8383, "step": 860 }, { "epoch": 0.623574144486692, "grad_norm": 0.4782876670360565, "learning_rate": 6.297884418960691e-05, "loss": 7.8477, "step": 861 }, { "epoch": 0.6242983885569436, "grad_norm": 0.49755236506462097, "learning_rate": 6.276607690673688e-05, "loss": 7.8563, "step": 862 }, { "epoch": 0.6250226326271954, "grad_norm": 0.485841304063797, "learning_rate": 6.255350513137626e-05, "loss": 7.8672, "step": 863 }, { "epoch": 0.625746876697447, "grad_norm": 0.4920748770236969, "learning_rate": 6.234112997969475e-05, "loss": 7.8371, "step": 864 }, { "epoch": 0.6264711207676987, "grad_norm": 0.4922964572906494, "learning_rate": 6.212895256682964e-05, "loss": 7.7465, "step": 865 }, { "epoch": 0.6271953648379504, "grad_norm": 0.46496015787124634, "learning_rate": 6.191697400688001e-05, "loss": 7.8745, "step": 866 }, { "epoch": 0.6279196089082021, "grad_norm": 0.4966541826725006, "learning_rate": 6.170519541290072e-05, "loss": 7.8253, "step": 867 }, { "epoch": 0.6286438529784537, "grad_norm": 0.47178417444229126, "learning_rate": 6.149361789689674e-05, "loss": 7.8381, "step": 868 }, { "epoch": 0.6293680970487054, "grad_norm": 0.4999862611293793, "learning_rate": 6.128224256981706e-05, "loss": 7.796, "step": 869 }, { "epoch": 0.6300923411189571, "grad_norm": 0.5017781853675842, "learning_rate": 6.107107054154924e-05, "loss": 7.8559, "step": 870 }, { "epoch": 0.6308165851892088, "grad_norm": 0.46709203720092773, "learning_rate": 6.0860102920913196e-05, "loss": 7.8881, "step": 871 }, { "epoch": 0.6315408292594604, "grad_norm": 0.49039193987846375, "learning_rate": 6.064934081565557e-05, "loss": 7.7982, "step": 872 }, { "epoch": 0.6322650733297122, "grad_norm": 0.4402787685394287, "learning_rate": 6.0438785332443946e-05, "loss": 7.8388, "step": 873 }, { "epoch": 0.6329893173999638, "grad_norm": 0.48174357414245605, "learning_rate": 6.0228437576860874e-05, "loss": 7.8222, "step": 874 }, { "epoch": 0.6337135614702155, "grad_norm": 0.4780054986476898, "learning_rate": 6.0018298653398345e-05, "loss": 7.8427, "step": 875 }, { "epoch": 0.6344378055404671, "grad_norm": 0.48912590742111206, "learning_rate": 5.980836966545158e-05, "loss": 7.8059, "step": 876 }, { "epoch": 0.6351620496107188, "grad_norm": 0.5173190832138062, "learning_rate": 5.9598651715313715e-05, "loss": 7.8253, "step": 877 }, { "epoch": 0.6358862936809705, "grad_norm": 0.4782562255859375, "learning_rate": 5.9389145904169595e-05, "loss": 7.8532, "step": 878 }, { "epoch": 0.6366105377512221, "grad_norm": 0.47767338156700134, "learning_rate": 5.917985333209022e-05, "loss": 7.8198, "step": 879 }, { "epoch": 0.6373347818214739, "grad_norm": 0.4958306849002838, "learning_rate": 5.8970775098026973e-05, "loss": 7.8114, "step": 880 }, { "epoch": 0.6380590258917255, "grad_norm": 0.49203237891197205, "learning_rate": 5.87619122998057e-05, "loss": 7.8273, "step": 881 }, { "epoch": 0.6387832699619772, "grad_norm": 0.5006452798843384, "learning_rate": 5.8553266034121124e-05, "loss": 7.7925, "step": 882 }, { "epoch": 0.6395075140322288, "grad_norm": 0.4803057312965393, "learning_rate": 5.834483739653089e-05, "loss": 7.8057, "step": 883 }, { "epoch": 0.6402317581024806, "grad_norm": 0.4590706527233124, "learning_rate": 5.813662748145008e-05, "loss": 7.8248, "step": 884 }, { "epoch": 0.6409560021727322, "grad_norm": 0.4630698263645172, "learning_rate": 5.7928637382145203e-05, "loss": 7.8183, "step": 885 }, { "epoch": 0.6416802462429839, "grad_norm": 0.5254594683647156, "learning_rate": 5.772086819072853e-05, "loss": 7.7863, "step": 886 }, { "epoch": 0.6424044903132355, "grad_norm": 0.49607181549072266, "learning_rate": 5.75133209981525e-05, "loss": 7.8268, "step": 887 }, { "epoch": 0.6431287343834873, "grad_norm": 0.4501619338989258, "learning_rate": 5.73059968942038e-05, "loss": 7.8368, "step": 888 }, { "epoch": 0.6438529784537389, "grad_norm": 0.49222031235694885, "learning_rate": 5.7098896967497775e-05, "loss": 7.7554, "step": 889 }, { "epoch": 0.6445772225239906, "grad_norm": 0.45921745896339417, "learning_rate": 5.689202230547259e-05, "loss": 7.8255, "step": 890 }, { "epoch": 0.6453014665942423, "grad_norm": 0.4936200976371765, "learning_rate": 5.668537399438374e-05, "loss": 7.8921, "step": 891 }, { "epoch": 0.6460257106644939, "grad_norm": 0.4984248876571655, "learning_rate": 5.647895311929803e-05, "loss": 7.8222, "step": 892 }, { "epoch": 0.6467499547347456, "grad_norm": 0.47879624366760254, "learning_rate": 5.627276076408807e-05, "loss": 7.9071, "step": 893 }, { "epoch": 0.6474741988049972, "grad_norm": 0.49307674169540405, "learning_rate": 5.6066798011426737e-05, "loss": 7.7861, "step": 894 }, { "epoch": 0.648198442875249, "grad_norm": 0.5343567728996277, "learning_rate": 5.586106594278109e-05, "loss": 7.8635, "step": 895 }, { "epoch": 0.6489226869455006, "grad_norm": 0.5100454092025757, "learning_rate": 5.565556563840699e-05, "loss": 7.7338, "step": 896 }, { "epoch": 0.6496469310157523, "grad_norm": 0.4881967306137085, "learning_rate": 5.5450298177343466e-05, "loss": 7.8221, "step": 897 }, { "epoch": 0.650371175086004, "grad_norm": 0.5178599953651428, "learning_rate": 5.524526463740678e-05, "loss": 7.7642, "step": 898 }, { "epoch": 0.6510954191562557, "grad_norm": 0.5256778001785278, "learning_rate": 5.5040466095184965e-05, "loss": 7.778, "step": 899 }, { "epoch": 0.6518196632265073, "grad_norm": 0.5190709829330444, "learning_rate": 5.4835903626032195e-05, "loss": 7.8178, "step": 900 }, { "epoch": 0.652543907296759, "grad_norm": 0.49377232789993286, "learning_rate": 5.4631578304063e-05, "loss": 7.8374, "step": 901 }, { "epoch": 0.6532681513670107, "grad_norm": 0.4822869598865509, "learning_rate": 5.442749120214672e-05, "loss": 7.8543, "step": 902 }, { "epoch": 0.6539923954372624, "grad_norm": 0.5066609978675842, "learning_rate": 5.4223643391901916e-05, "loss": 7.8336, "step": 903 }, { "epoch": 0.654716639507514, "grad_norm": 0.49571654200553894, "learning_rate": 5.402003594369053e-05, "loss": 7.7989, "step": 904 }, { "epoch": 0.6554408835777658, "grad_norm": 0.47712838649749756, "learning_rate": 5.3816669926612605e-05, "loss": 7.8491, "step": 905 }, { "epoch": 0.6561651276480174, "grad_norm": 0.5059483647346497, "learning_rate": 5.361354640850029e-05, "loss": 7.8269, "step": 906 }, { "epoch": 0.6568893717182691, "grad_norm": 0.4979402422904968, "learning_rate": 5.3410666455912604e-05, "loss": 7.7683, "step": 907 }, { "epoch": 0.6576136157885207, "grad_norm": 0.4917674660682678, "learning_rate": 5.320803113412951e-05, "loss": 7.8254, "step": 908 }, { "epoch": 0.6583378598587724, "grad_norm": 0.4933214783668518, "learning_rate": 5.3005641507146495e-05, "loss": 7.8163, "step": 909 }, { "epoch": 0.6590621039290241, "grad_norm": 0.4836273193359375, "learning_rate": 5.280349863766906e-05, "loss": 7.8663, "step": 910 }, { "epoch": 0.6597863479992757, "grad_norm": 0.45807477831840515, "learning_rate": 5.2601603587106863e-05, "loss": 7.8553, "step": 911 }, { "epoch": 0.6605105920695274, "grad_norm": 0.4703587293624878, "learning_rate": 5.239995741556848e-05, "loss": 7.8577, "step": 912 }, { "epoch": 0.6612348361397791, "grad_norm": 0.48380714654922485, "learning_rate": 5.2198561181855574e-05, "loss": 7.8219, "step": 913 }, { "epoch": 0.6619590802100308, "grad_norm": 0.4754285216331482, "learning_rate": 5.199741594345744e-05, "loss": 7.8467, "step": 914 }, { "epoch": 0.6626833242802824, "grad_norm": 0.4489299952983856, "learning_rate": 5.179652275654554e-05, "loss": 7.8807, "step": 915 }, { "epoch": 0.6634075683505342, "grad_norm": 0.4758758842945099, "learning_rate": 5.1595882675967755e-05, "loss": 7.7741, "step": 916 }, { "epoch": 0.6641318124207858, "grad_norm": 0.49364110827445984, "learning_rate": 5.139549675524301e-05, "loss": 7.8486, "step": 917 }, { "epoch": 0.6648560564910375, "grad_norm": 0.48365652561187744, "learning_rate": 5.1195366046555656e-05, "loss": 7.8576, "step": 918 }, { "epoch": 0.6655803005612891, "grad_norm": 0.4603043794631958, "learning_rate": 5.099549160075008e-05, "loss": 7.8672, "step": 919 }, { "epoch": 0.6663045446315409, "grad_norm": 0.46025213599205017, "learning_rate": 5.079587446732493e-05, "loss": 7.8447, "step": 920 }, { "epoch": 0.6670287887017925, "grad_norm": 0.47760942578315735, "learning_rate": 5.059651569442794e-05, "loss": 7.8622, "step": 921 }, { "epoch": 0.6677530327720442, "grad_norm": 0.4995022416114807, "learning_rate": 5.039741632885009e-05, "loss": 7.7676, "step": 922 }, { "epoch": 0.6684772768422959, "grad_norm": 0.48363152146339417, "learning_rate": 5.01985774160203e-05, "loss": 7.8692, "step": 923 }, { "epoch": 0.6692015209125475, "grad_norm": 0.4707562029361725, "learning_rate": 5.000000000000002e-05, "loss": 7.8341, "step": 924 }, { "epoch": 0.6699257649827992, "grad_norm": 0.46814507246017456, "learning_rate": 4.980168512347747e-05, "loss": 7.864, "step": 925 }, { "epoch": 0.6706500090530508, "grad_norm": 0.4722250699996948, "learning_rate": 4.9603633827762484e-05, "loss": 7.8269, "step": 926 }, { "epoch": 0.6713742531233026, "grad_norm": 0.45258694887161255, "learning_rate": 4.940584715278075e-05, "loss": 7.8147, "step": 927 }, { "epoch": 0.6720984971935542, "grad_norm": 0.4681549370288849, "learning_rate": 4.9208326137068625e-05, "loss": 7.8421, "step": 928 }, { "epoch": 0.6728227412638059, "grad_norm": 0.49207526445388794, "learning_rate": 4.901107181776743e-05, "loss": 7.7759, "step": 929 }, { "epoch": 0.6735469853340575, "grad_norm": 0.4637949764728546, "learning_rate": 4.881408523061813e-05, "loss": 7.8506, "step": 930 }, { "epoch": 0.6742712294043093, "grad_norm": 0.5171796679496765, "learning_rate": 4.861736740995601e-05, "loss": 7.8593, "step": 931 }, { "epoch": 0.6749954734745609, "grad_norm": 0.47917360067367554, "learning_rate": 4.8420919388704925e-05, "loss": 7.8142, "step": 932 }, { "epoch": 0.6757197175448126, "grad_norm": 0.4940485656261444, "learning_rate": 4.822474219837225e-05, "loss": 7.7892, "step": 933 }, { "epoch": 0.6764439616150643, "grad_norm": 0.5015715956687927, "learning_rate": 4.802883686904318e-05, "loss": 7.9136, "step": 934 }, { "epoch": 0.677168205685316, "grad_norm": 0.4832720160484314, "learning_rate": 4.7833204429375454e-05, "loss": 7.7575, "step": 935 }, { "epoch": 0.6778924497555676, "grad_norm": 0.45720604062080383, "learning_rate": 4.763784590659387e-05, "loss": 7.8713, "step": 936 }, { "epoch": 0.6786166938258194, "grad_norm": 0.47305944561958313, "learning_rate": 4.744276232648508e-05, "loss": 7.7939, "step": 937 }, { "epoch": 0.679340937896071, "grad_norm": 0.4715365171432495, "learning_rate": 4.724795471339195e-05, "loss": 7.806, "step": 938 }, { "epoch": 0.6800651819663226, "grad_norm": 0.468777060508728, "learning_rate": 4.7053424090208295e-05, "loss": 7.7986, "step": 939 }, { "epoch": 0.6807894260365743, "grad_norm": 0.47867533564567566, "learning_rate": 4.685917147837364e-05, "loss": 7.8439, "step": 940 }, { "epoch": 0.681513670106826, "grad_norm": 0.4867823123931885, "learning_rate": 4.666519789786756e-05, "loss": 7.8547, "step": 941 }, { "epoch": 0.6822379141770777, "grad_norm": 0.48606693744659424, "learning_rate": 4.6471504367204674e-05, "loss": 7.836, "step": 942 }, { "epoch": 0.6829621582473293, "grad_norm": 0.4838757812976837, "learning_rate": 4.6278091903428945e-05, "loss": 7.8131, "step": 943 }, { "epoch": 0.683686402317581, "grad_norm": 0.46455007791519165, "learning_rate": 4.608496152210867e-05, "loss": 7.8649, "step": 944 }, { "epoch": 0.6844106463878327, "grad_norm": 0.5098406076431274, "learning_rate": 4.589211423733087e-05, "loss": 7.8212, "step": 945 }, { "epoch": 0.6851348904580844, "grad_norm": 0.48916125297546387, "learning_rate": 4.569955106169611e-05, "loss": 7.7915, "step": 946 }, { "epoch": 0.685859134528336, "grad_norm": 0.525736927986145, "learning_rate": 4.5507273006313245e-05, "loss": 7.7649, "step": 947 }, { "epoch": 0.6865833785985878, "grad_norm": 0.5023128986358643, "learning_rate": 4.531528108079387e-05, "loss": 7.7986, "step": 948 }, { "epoch": 0.6873076226688394, "grad_norm": 0.5856012105941772, "learning_rate": 4.5123576293247364e-05, "loss": 7.7212, "step": 949 }, { "epoch": 0.6880318667390911, "grad_norm": 0.5098285675048828, "learning_rate": 4.493215965027519e-05, "loss": 7.9312, "step": 950 }, { "epoch": 0.6887561108093427, "grad_norm": 0.5016021728515625, "learning_rate": 4.4741032156966025e-05, "loss": 7.8514, "step": 951 }, { "epoch": 0.6894803548795945, "grad_norm": 0.4811919033527374, "learning_rate": 4.455019481689016e-05, "loss": 7.8562, "step": 952 }, { "epoch": 0.6902045989498461, "grad_norm": 0.45924901962280273, "learning_rate": 4.435964863209437e-05, "loss": 7.8562, "step": 953 }, { "epoch": 0.6909288430200978, "grad_norm": 0.5059201717376709, "learning_rate": 4.416939460309667e-05, "loss": 7.8266, "step": 954 }, { "epoch": 0.6916530870903494, "grad_norm": 0.47285687923431396, "learning_rate": 4.3979433728880936e-05, "loss": 7.8448, "step": 955 }, { "epoch": 0.6923773311606011, "grad_norm": 0.47951385378837585, "learning_rate": 4.378976700689192e-05, "loss": 7.8623, "step": 956 }, { "epoch": 0.6931015752308528, "grad_norm": 0.45039263367652893, "learning_rate": 4.360039543302965e-05, "loss": 7.876, "step": 957 }, { "epoch": 0.6938258193011044, "grad_norm": 0.47055962681770325, "learning_rate": 4.3411320001644576e-05, "loss": 7.8539, "step": 958 }, { "epoch": 0.6945500633713562, "grad_norm": 0.4704470634460449, "learning_rate": 4.322254170553201e-05, "loss": 7.8378, "step": 959 }, { "epoch": 0.6952743074416078, "grad_norm": 0.47437623143196106, "learning_rate": 4.303406153592712e-05, "loss": 7.8915, "step": 960 }, { "epoch": 0.6959985515118595, "grad_norm": 0.450575590133667, "learning_rate": 4.284588048249974e-05, "loss": 7.8715, "step": 961 }, { "epoch": 0.6967227955821111, "grad_norm": 0.49726375937461853, "learning_rate": 4.2657999533349e-05, "loss": 7.8248, "step": 962 }, { "epoch": 0.6974470396523629, "grad_norm": 0.5131885409355164, "learning_rate": 4.247041967499837e-05, "loss": 7.8141, "step": 963 }, { "epoch": 0.6981712837226145, "grad_norm": 0.5051475167274475, "learning_rate": 4.228314189239021e-05, "loss": 7.7858, "step": 964 }, { "epoch": 0.6988955277928662, "grad_norm": 0.49921953678131104, "learning_rate": 4.209616716888088e-05, "loss": 7.806, "step": 965 }, { "epoch": 0.6996197718631179, "grad_norm": 0.4952443838119507, "learning_rate": 4.190949648623538e-05, "loss": 7.8111, "step": 966 }, { "epoch": 0.7003440159333696, "grad_norm": 0.48354852199554443, "learning_rate": 4.172313082462218e-05, "loss": 7.8454, "step": 967 }, { "epoch": 0.7010682600036212, "grad_norm": 0.4903525412082672, "learning_rate": 4.153707116260831e-05, "loss": 7.796, "step": 968 }, { "epoch": 0.701792504073873, "grad_norm": 0.4461687505245209, "learning_rate": 4.135131847715391e-05, "loss": 7.8763, "step": 969 }, { "epoch": 0.7025167481441246, "grad_norm": 0.4997590482234955, "learning_rate": 4.116587374360738e-05, "loss": 7.7374, "step": 970 }, { "epoch": 0.7032409922143762, "grad_norm": 0.4803599715232849, "learning_rate": 4.0980737935700045e-05, "loss": 7.8267, "step": 971 }, { "epoch": 0.7039652362846279, "grad_norm": 0.4723648130893707, "learning_rate": 4.0795912025541106e-05, "loss": 7.8528, "step": 972 }, { "epoch": 0.7046894803548795, "grad_norm": 0.4950774312019348, "learning_rate": 4.061139698361259e-05, "loss": 7.846, "step": 973 }, { "epoch": 0.7054137244251313, "grad_norm": 0.4853728115558624, "learning_rate": 4.0427193778764307e-05, "loss": 7.8703, "step": 974 }, { "epoch": 0.7061379684953829, "grad_norm": 0.4416038393974304, "learning_rate": 4.024330337820853e-05, "loss": 7.8747, "step": 975 }, { "epoch": 0.7068622125656346, "grad_norm": 0.4781404435634613, "learning_rate": 4.0059726747515104e-05, "loss": 7.8101, "step": 976 }, { "epoch": 0.7075864566358863, "grad_norm": 0.4613541066646576, "learning_rate": 3.9876464850606435e-05, "loss": 7.8109, "step": 977 }, { "epoch": 0.708310700706138, "grad_norm": 0.4910728633403778, "learning_rate": 3.969351864975216e-05, "loss": 7.8568, "step": 978 }, { "epoch": 0.7090349447763896, "grad_norm": 0.45888233184814453, "learning_rate": 3.9510889105564454e-05, "loss": 7.7947, "step": 979 }, { "epoch": 0.7097591888466414, "grad_norm": 0.4573422968387604, "learning_rate": 3.9328577176992595e-05, "loss": 7.8149, "step": 980 }, { "epoch": 0.710483432916893, "grad_norm": 0.4781007170677185, "learning_rate": 3.914658382131832e-05, "loss": 7.8372, "step": 981 }, { "epoch": 0.7112076769871447, "grad_norm": 0.4993189871311188, "learning_rate": 3.8964909994150456e-05, "loss": 7.8337, "step": 982 }, { "epoch": 0.7119319210573963, "grad_norm": 0.46670612692832947, "learning_rate": 3.8783556649420085e-05, "loss": 7.7961, "step": 983 }, { "epoch": 0.7126561651276481, "grad_norm": 0.48190969228744507, "learning_rate": 3.860252473937559e-05, "loss": 7.8113, "step": 984 }, { "epoch": 0.7133804091978997, "grad_norm": 0.4879286587238312, "learning_rate": 3.8421815214577454e-05, "loss": 7.8245, "step": 985 }, { "epoch": 0.7141046532681513, "grad_norm": 0.4496977627277374, "learning_rate": 3.8241429023893494e-05, "loss": 7.8067, "step": 986 }, { "epoch": 0.714828897338403, "grad_norm": 0.47890475392341614, "learning_rate": 3.806136711449363e-05, "loss": 7.8695, "step": 987 }, { "epoch": 0.7155531414086547, "grad_norm": 0.46888604760169983, "learning_rate": 3.7881630431845215e-05, "loss": 7.775, "step": 988 }, { "epoch": 0.7162773854789064, "grad_norm": 0.458762526512146, "learning_rate": 3.770221991970777e-05, "loss": 7.8523, "step": 989 }, { "epoch": 0.717001629549158, "grad_norm": 0.4727189242839813, "learning_rate": 3.752313652012817e-05, "loss": 7.7896, "step": 990 }, { "epoch": 0.7177258736194098, "grad_norm": 0.4806780219078064, "learning_rate": 3.734438117343582e-05, "loss": 7.7772, "step": 991 }, { "epoch": 0.7184501176896614, "grad_norm": 0.4579210877418518, "learning_rate": 3.7165954818237436e-05, "loss": 7.8326, "step": 992 }, { "epoch": 0.7191743617599131, "grad_norm": 0.4929826259613037, "learning_rate": 3.698785839141236e-05, "loss": 7.8425, "step": 993 }, { "epoch": 0.7198986058301647, "grad_norm": 0.5137585997581482, "learning_rate": 3.681009282810746e-05, "loss": 7.765, "step": 994 }, { "epoch": 0.7206228499004165, "grad_norm": 0.4695113003253937, "learning_rate": 3.663265906173245e-05, "loss": 7.8759, "step": 995 }, { "epoch": 0.7213470939706681, "grad_norm": 0.5344631671905518, "learning_rate": 3.645555802395476e-05, "loss": 7.7362, "step": 996 }, { "epoch": 0.7220713380409198, "grad_norm": 0.5240874886512756, "learning_rate": 3.6278790644694674e-05, "loss": 7.8663, "step": 997 }, { "epoch": 0.7227955821111715, "grad_norm": 0.5019177198410034, "learning_rate": 3.610235785212064e-05, "loss": 7.6963, "step": 998 }, { "epoch": 0.7235198261814232, "grad_norm": 0.5077367424964905, "learning_rate": 3.592626057264413e-05, "loss": 7.8491, "step": 999 }, { "epoch": 0.7242440702516748, "grad_norm": 0.5551170706748962, "learning_rate": 3.575049973091506e-05, "loss": 7.8329, "step": 1000 }, { "epoch": 0.7249683143219265, "grad_norm": 0.49724796414375305, "learning_rate": 3.5575076249816584e-05, "loss": 7.8, "step": 1001 }, { "epoch": 0.7256925583921782, "grad_norm": 0.4674682319164276, "learning_rate": 3.5399991050460655e-05, "loss": 7.8876, "step": 1002 }, { "epoch": 0.7264168024624298, "grad_norm": 0.49887341260910034, "learning_rate": 3.522524505218281e-05, "loss": 7.8503, "step": 1003 }, { "epoch": 0.7271410465326815, "grad_norm": 0.47441011667251587, "learning_rate": 3.505083917253763e-05, "loss": 7.8196, "step": 1004 }, { "epoch": 0.7278652906029331, "grad_norm": 0.4713391065597534, "learning_rate": 3.4876774327293734e-05, "loss": 7.8007, "step": 1005 }, { "epoch": 0.7285895346731849, "grad_norm": 0.4885476231575012, "learning_rate": 3.4703051430429024e-05, "loss": 7.8527, "step": 1006 }, { "epoch": 0.7293137787434365, "grad_norm": 0.4708382785320282, "learning_rate": 3.452967139412602e-05, "loss": 7.8204, "step": 1007 }, { "epoch": 0.7300380228136882, "grad_norm": 0.45463642477989197, "learning_rate": 3.435663512876679e-05, "loss": 7.8397, "step": 1008 }, { "epoch": 0.7307622668839399, "grad_norm": 0.46222880482673645, "learning_rate": 3.4183943542928496e-05, "loss": 7.8621, "step": 1009 }, { "epoch": 0.7314865109541916, "grad_norm": 0.4764266610145569, "learning_rate": 3.401159754337836e-05, "loss": 7.8469, "step": 1010 }, { "epoch": 0.7322107550244432, "grad_norm": 0.4880512058734894, "learning_rate": 3.383959803506901e-05, "loss": 7.8698, "step": 1011 }, { "epoch": 0.732934999094695, "grad_norm": 0.4619225859642029, "learning_rate": 3.3667945921133734e-05, "loss": 7.8439, "step": 1012 }, { "epoch": 0.7336592431649466, "grad_norm": 0.4897652566432953, "learning_rate": 3.34966421028817e-05, "loss": 7.7954, "step": 1013 }, { "epoch": 0.7343834872351983, "grad_norm": 0.47381868958473206, "learning_rate": 3.332568747979335e-05, "loss": 7.8305, "step": 1014 }, { "epoch": 0.7351077313054499, "grad_norm": 0.4567718207836151, "learning_rate": 3.3155082949515424e-05, "loss": 7.8574, "step": 1015 }, { "epoch": 0.7358319753757017, "grad_norm": 0.4839675724506378, "learning_rate": 3.298482940785655e-05, "loss": 7.8414, "step": 1016 }, { "epoch": 0.7365562194459533, "grad_norm": 0.4948679804801941, "learning_rate": 3.2814927748782245e-05, "loss": 7.8058, "step": 1017 }, { "epoch": 0.7372804635162049, "grad_norm": 0.47992077469825745, "learning_rate": 3.26453788644105e-05, "loss": 7.7794, "step": 1018 }, { "epoch": 0.7380047075864566, "grad_norm": 0.4672059714794159, "learning_rate": 3.2476183645006854e-05, "loss": 7.8264, "step": 1019 }, { "epoch": 0.7387289516567083, "grad_norm": 0.4785839319229126, "learning_rate": 3.2307342978979847e-05, "loss": 7.8405, "step": 1020 }, { "epoch": 0.73945319572696, "grad_norm": 0.48339641094207764, "learning_rate": 3.2138857752876406e-05, "loss": 7.8538, "step": 1021 }, { "epoch": 0.7401774397972116, "grad_norm": 0.4682196378707886, "learning_rate": 3.1970728851377005e-05, "loss": 7.8156, "step": 1022 }, { "epoch": 0.7409016838674634, "grad_norm": 0.4627414643764496, "learning_rate": 3.1802957157291256e-05, "loss": 7.8184, "step": 1023 }, { "epoch": 0.741625927937715, "grad_norm": 0.4607759714126587, "learning_rate": 3.1635543551553015e-05, "loss": 7.8599, "step": 1024 }, { "epoch": 0.7423501720079667, "grad_norm": 0.4652021825313568, "learning_rate": 3.146848891321604e-05, "loss": 7.7955, "step": 1025 }, { "epoch": 0.7430744160782183, "grad_norm": 0.4668468236923218, "learning_rate": 3.130179411944909e-05, "loss": 7.8534, "step": 1026 }, { "epoch": 0.7437986601484701, "grad_norm": 0.46713390946388245, "learning_rate": 3.113546004553151e-05, "loss": 7.8634, "step": 1027 }, { "epoch": 0.7445229042187217, "grad_norm": 0.4680757224559784, "learning_rate": 3.096948756484863e-05, "loss": 7.8308, "step": 1028 }, { "epoch": 0.7452471482889734, "grad_norm": 0.48597460985183716, "learning_rate": 3.080387754888706e-05, "loss": 7.7773, "step": 1029 }, { "epoch": 0.745971392359225, "grad_norm": 0.4744811952114105, "learning_rate": 3.06386308672302e-05, "loss": 7.7997, "step": 1030 }, { "epoch": 0.7466956364294768, "grad_norm": 0.4475364685058594, "learning_rate": 3.0473748387553647e-05, "loss": 7.8358, "step": 1031 }, { "epoch": 0.7474198804997284, "grad_norm": 0.4579427242279053, "learning_rate": 3.0309230975620717e-05, "loss": 7.8671, "step": 1032 }, { "epoch": 0.74814412456998, "grad_norm": 0.4434613883495331, "learning_rate": 3.0145079495277772e-05, "loss": 7.8496, "step": 1033 }, { "epoch": 0.7488683686402318, "grad_norm": 0.5059404373168945, "learning_rate": 2.9981294808449713e-05, "loss": 7.7336, "step": 1034 }, { "epoch": 0.7495926127104834, "grad_norm": 0.4605514407157898, "learning_rate": 2.9817877775135593e-05, "loss": 7.8329, "step": 1035 }, { "epoch": 0.7503168567807351, "grad_norm": 0.48954278230667114, "learning_rate": 2.965482925340386e-05, "loss": 7.8067, "step": 1036 }, { "epoch": 0.7510411008509867, "grad_norm": 0.4908781945705414, "learning_rate": 2.9492150099388127e-05, "loss": 7.9183, "step": 1037 }, { "epoch": 0.7517653449212385, "grad_norm": 0.46575653553009033, "learning_rate": 2.932984116728239e-05, "loss": 7.7655, "step": 1038 }, { "epoch": 0.7517653449212385, "eval_loss": 7.828281402587891, "eval_runtime": 4.8059, "eval_samples_per_second": 241.993, "eval_steps_per_second": 121.101, "step": 1038 }, { "epoch": 0.7524895889914901, "grad_norm": 0.46100151538848877, "learning_rate": 2.916790330933683e-05, "loss": 7.8197, "step": 1039 }, { "epoch": 0.7532138330617418, "grad_norm": 0.47532182931900024, "learning_rate": 2.9006337375853064e-05, "loss": 7.7012, "step": 1040 }, { "epoch": 0.7539380771319935, "grad_norm": 0.486884742975235, "learning_rate": 2.884514421517993e-05, "loss": 7.8531, "step": 1041 }, { "epoch": 0.7546623212022452, "grad_norm": 0.4965648055076599, "learning_rate": 2.868432467370882e-05, "loss": 7.8061, "step": 1042 }, { "epoch": 0.7553865652724968, "grad_norm": 0.49399492144584656, "learning_rate": 2.852387959586934e-05, "loss": 7.7895, "step": 1043 }, { "epoch": 0.7561108093427485, "grad_norm": 0.48604780435562134, "learning_rate": 2.8363809824124964e-05, "loss": 7.8522, "step": 1044 }, { "epoch": 0.7568350534130002, "grad_norm": 0.46038612723350525, "learning_rate": 2.820411619896838e-05, "loss": 7.9004, "step": 1045 }, { "epoch": 0.7575592974832519, "grad_norm": 0.5330672860145569, "learning_rate": 2.8044799558917313e-05, "loss": 7.85, "step": 1046 }, { "epoch": 0.7582835415535035, "grad_norm": 0.5077647566795349, "learning_rate": 2.7885860740509963e-05, "loss": 7.845, "step": 1047 }, { "epoch": 0.7590077856237553, "grad_norm": 0.480844646692276, "learning_rate": 2.7727300578300674e-05, "loss": 7.7689, "step": 1048 }, { "epoch": 0.7597320296940069, "grad_norm": 0.49301832914352417, "learning_rate": 2.756911990485552e-05, "loss": 7.8533, "step": 1049 }, { "epoch": 0.7604562737642585, "grad_norm": 0.598520040512085, "learning_rate": 2.741131955074807e-05, "loss": 7.7991, "step": 1050 }, { "epoch": 0.7611805178345102, "grad_norm": 0.4859587252140045, "learning_rate": 2.7253900344554795e-05, "loss": 7.8235, "step": 1051 }, { "epoch": 0.7619047619047619, "grad_norm": 0.49456787109375, "learning_rate": 2.7096863112850847e-05, "loss": 7.84, "step": 1052 }, { "epoch": 0.7626290059750136, "grad_norm": 0.4734579622745514, "learning_rate": 2.6940208680205802e-05, "loss": 7.8954, "step": 1053 }, { "epoch": 0.7633532500452652, "grad_norm": 0.4647412896156311, "learning_rate": 2.6783937869179143e-05, "loss": 7.8155, "step": 1054 }, { "epoch": 0.764077494115517, "grad_norm": 0.4898563623428345, "learning_rate": 2.662805150031612e-05, "loss": 7.8175, "step": 1055 }, { "epoch": 0.7648017381857686, "grad_norm": 0.5130847692489624, "learning_rate": 2.647255039214328e-05, "loss": 7.8289, "step": 1056 }, { "epoch": 0.7655259822560203, "grad_norm": 0.4688289761543274, "learning_rate": 2.6317435361164256e-05, "loss": 7.837, "step": 1057 }, { "epoch": 0.7662502263262719, "grad_norm": 0.4437507688999176, "learning_rate": 2.6162707221855552e-05, "loss": 7.8955, "step": 1058 }, { "epoch": 0.7669744703965237, "grad_norm": 0.44463828206062317, "learning_rate": 2.6008366786662073e-05, "loss": 7.8781, "step": 1059 }, { "epoch": 0.7676987144667753, "grad_norm": 0.44665560126304626, "learning_rate": 2.585441486599308e-05, "loss": 7.8517, "step": 1060 }, { "epoch": 0.768422958537027, "grad_norm": 0.45422571897506714, "learning_rate": 2.570085226821771e-05, "loss": 7.8811, "step": 1061 }, { "epoch": 0.7691472026072786, "grad_norm": 0.46825090050697327, "learning_rate": 2.554767979966097e-05, "loss": 7.8527, "step": 1062 }, { "epoch": 0.7698714466775304, "grad_norm": 0.4797806441783905, "learning_rate": 2.5394898264599297e-05, "loss": 7.8624, "step": 1063 }, { "epoch": 0.770595690747782, "grad_norm": 0.48127833008766174, "learning_rate": 2.5242508465256397e-05, "loss": 7.8368, "step": 1064 }, { "epoch": 0.7713199348180336, "grad_norm": 0.480258584022522, "learning_rate": 2.5090511201799172e-05, "loss": 7.8536, "step": 1065 }, { "epoch": 0.7720441788882854, "grad_norm": 0.495200514793396, "learning_rate": 2.493890727233329e-05, "loss": 7.8352, "step": 1066 }, { "epoch": 0.772768422958537, "grad_norm": 0.4558762013912201, "learning_rate": 2.478769747289912e-05, "loss": 7.8246, "step": 1067 }, { "epoch": 0.7734926670287887, "grad_norm": 0.46966373920440674, "learning_rate": 2.4636882597467593e-05, "loss": 7.8572, "step": 1068 }, { "epoch": 0.7742169110990403, "grad_norm": 0.471635103225708, "learning_rate": 2.4486463437935934e-05, "loss": 7.8459, "step": 1069 }, { "epoch": 0.7749411551692921, "grad_norm": 0.47857850790023804, "learning_rate": 2.433644078412355e-05, "loss": 7.8279, "step": 1070 }, { "epoch": 0.7756653992395437, "grad_norm": 0.48017922043800354, "learning_rate": 2.418681542376785e-05, "loss": 7.8697, "step": 1071 }, { "epoch": 0.7763896433097954, "grad_norm": 0.4738040864467621, "learning_rate": 2.403758814252024e-05, "loss": 7.8073, "step": 1072 }, { "epoch": 0.777113887380047, "grad_norm": 0.46145448088645935, "learning_rate": 2.3888759723941766e-05, "loss": 7.8748, "step": 1073 }, { "epoch": 0.7778381314502988, "grad_norm": 0.4822898805141449, "learning_rate": 2.3740330949499257e-05, "loss": 7.8332, "step": 1074 }, { "epoch": 0.7785623755205504, "grad_norm": 0.47280868887901306, "learning_rate": 2.359230259856097e-05, "loss": 7.8453, "step": 1075 }, { "epoch": 0.7792866195908021, "grad_norm": 0.47516578435897827, "learning_rate": 2.3444675448392728e-05, "loss": 7.8826, "step": 1076 }, { "epoch": 0.7800108636610538, "grad_norm": 0.4623286724090576, "learning_rate": 2.3297450274153643e-05, "loss": 7.872, "step": 1077 }, { "epoch": 0.7807351077313055, "grad_norm": 0.5090032815933228, "learning_rate": 2.3150627848892248e-05, "loss": 7.8377, "step": 1078 }, { "epoch": 0.7814593518015571, "grad_norm": 0.4724178612232208, "learning_rate": 2.3004208943542215e-05, "loss": 7.8654, "step": 1079 }, { "epoch": 0.7821835958718087, "grad_norm": 0.48950740694999695, "learning_rate": 2.2858194326918435e-05, "loss": 7.882, "step": 1080 }, { "epoch": 0.7829078399420605, "grad_norm": 0.4657925069332123, "learning_rate": 2.2712584765713064e-05, "loss": 7.8123, "step": 1081 }, { "epoch": 0.7836320840123121, "grad_norm": 0.5018364191055298, "learning_rate": 2.256738102449124e-05, "loss": 7.7877, "step": 1082 }, { "epoch": 0.7843563280825638, "grad_norm": 0.4764616787433624, "learning_rate": 2.2422583865687375e-05, "loss": 7.8885, "step": 1083 }, { "epoch": 0.7850805721528155, "grad_norm": 0.496661514043808, "learning_rate": 2.227819404960092e-05, "loss": 7.8584, "step": 1084 }, { "epoch": 0.7858048162230672, "grad_norm": 0.48577845096588135, "learning_rate": 2.2134212334392434e-05, "loss": 7.7898, "step": 1085 }, { "epoch": 0.7865290602933188, "grad_norm": 0.4589325189590454, "learning_rate": 2.1990639476079712e-05, "loss": 7.8669, "step": 1086 }, { "epoch": 0.7872533043635705, "grad_norm": 0.5179975032806396, "learning_rate": 2.1847476228533648e-05, "loss": 7.7761, "step": 1087 }, { "epoch": 0.7879775484338222, "grad_norm": 0.45577552914619446, "learning_rate": 2.1704723343474396e-05, "loss": 7.8018, "step": 1088 }, { "epoch": 0.7887017925040739, "grad_norm": 0.4571017622947693, "learning_rate": 2.156238157046734e-05, "loss": 7.8382, "step": 1089 }, { "epoch": 0.7894260365743255, "grad_norm": 0.49493101239204407, "learning_rate": 2.1420451656919284e-05, "loss": 7.7954, "step": 1090 }, { "epoch": 0.7901502806445773, "grad_norm": 0.4955536723136902, "learning_rate": 2.1278934348074332e-05, "loss": 7.9138, "step": 1091 }, { "epoch": 0.7908745247148289, "grad_norm": 0.5027803182601929, "learning_rate": 2.1137830387010206e-05, "loss": 7.7502, "step": 1092 }, { "epoch": 0.7915987687850806, "grad_norm": 0.47927114367485046, "learning_rate": 2.099714051463415e-05, "loss": 7.8775, "step": 1093 }, { "epoch": 0.7923230128553322, "grad_norm": 0.47723886370658875, "learning_rate": 2.085686546967909e-05, "loss": 7.7893, "step": 1094 }, { "epoch": 0.793047256925584, "grad_norm": 0.5063787698745728, "learning_rate": 2.0717005988699887e-05, "loss": 7.8202, "step": 1095 }, { "epoch": 0.7937715009958356, "grad_norm": 0.4967922270298004, "learning_rate": 2.0577562806069238e-05, "loss": 7.8676, "step": 1096 }, { "epoch": 0.7944957450660872, "grad_norm": 0.4945312738418579, "learning_rate": 2.0438536653974073e-05, "loss": 7.7993, "step": 1097 }, { "epoch": 0.795219989136339, "grad_norm": 0.47871074080467224, "learning_rate": 2.029992826241145e-05, "loss": 7.8309, "step": 1098 }, { "epoch": 0.7959442332065906, "grad_norm": 0.5307055115699768, "learning_rate": 2.016173835918498e-05, "loss": 7.7908, "step": 1099 }, { "epoch": 0.7966684772768423, "grad_norm": 0.5632224678993225, "learning_rate": 2.0023967669900777e-05, "loss": 7.7806, "step": 1100 }, { "epoch": 0.7973927213470939, "grad_norm": 0.4810802638530731, "learning_rate": 1.9886616917963785e-05, "loss": 7.8422, "step": 1101 }, { "epoch": 0.7981169654173457, "grad_norm": 0.49098509550094604, "learning_rate": 1.9749686824573987e-05, "loss": 7.8427, "step": 1102 }, { "epoch": 0.7988412094875973, "grad_norm": 0.4845903217792511, "learning_rate": 1.9613178108722507e-05, "loss": 7.8179, "step": 1103 }, { "epoch": 0.799565453557849, "grad_norm": 0.4725109040737152, "learning_rate": 1.9477091487187983e-05, "loss": 7.8455, "step": 1104 }, { "epoch": 0.8002896976281006, "grad_norm": 0.49514099955558777, "learning_rate": 1.9341427674532643e-05, "loss": 7.8411, "step": 1105 }, { "epoch": 0.8010139416983524, "grad_norm": 0.4852292239665985, "learning_rate": 1.9206187383098694e-05, "loss": 7.85, "step": 1106 }, { "epoch": 0.801738185768604, "grad_norm": 0.505436897277832, "learning_rate": 1.9071371323004494e-05, "loss": 7.8205, "step": 1107 }, { "epoch": 0.8024624298388557, "grad_norm": 0.4951551556587219, "learning_rate": 1.893698020214082e-05, "loss": 7.8409, "step": 1108 }, { "epoch": 0.8031866739091074, "grad_norm": 0.49560704827308655, "learning_rate": 1.8803014726167266e-05, "loss": 7.8177, "step": 1109 }, { "epoch": 0.8039109179793591, "grad_norm": 0.4628080725669861, "learning_rate": 1.866947559850839e-05, "loss": 7.8229, "step": 1110 }, { "epoch": 0.8046351620496107, "grad_norm": 0.47996366024017334, "learning_rate": 1.853636352035012e-05, "loss": 7.8377, "step": 1111 }, { "epoch": 0.8053594061198623, "grad_norm": 0.48397380113601685, "learning_rate": 1.840367919063598e-05, "loss": 7.8254, "step": 1112 }, { "epoch": 0.8060836501901141, "grad_norm": 0.47569262981414795, "learning_rate": 1.8271423306063564e-05, "loss": 7.8227, "step": 1113 }, { "epoch": 0.8068078942603657, "grad_norm": 0.47170665860176086, "learning_rate": 1.8139596561080696e-05, "loss": 7.8185, "step": 1114 }, { "epoch": 0.8075321383306174, "grad_norm": 0.48356300592422485, "learning_rate": 1.800819964788196e-05, "loss": 7.847, "step": 1115 }, { "epoch": 0.8082563824008691, "grad_norm": 0.4541751444339752, "learning_rate": 1.7877233256404956e-05, "loss": 7.861, "step": 1116 }, { "epoch": 0.8089806264711208, "grad_norm": 0.43556642532348633, "learning_rate": 1.7746698074326638e-05, "loss": 7.8939, "step": 1117 }, { "epoch": 0.8097048705413724, "grad_norm": 0.47422105073928833, "learning_rate": 1.761659478705989e-05, "loss": 7.8543, "step": 1118 }, { "epoch": 0.8104291146116241, "grad_norm": 0.501738965511322, "learning_rate": 1.7486924077749712e-05, "loss": 7.8054, "step": 1119 }, { "epoch": 0.8111533586818758, "grad_norm": 0.48210737109184265, "learning_rate": 1.7357686627269788e-05, "loss": 7.889, "step": 1120 }, { "epoch": 0.8118776027521275, "grad_norm": 0.5010181069374084, "learning_rate": 1.7228883114218775e-05, "loss": 7.8183, "step": 1121 }, { "epoch": 0.8126018468223791, "grad_norm": 0.5056195855140686, "learning_rate": 1.710051421491694e-05, "loss": 7.8112, "step": 1122 }, { "epoch": 0.8133260908926309, "grad_norm": 0.49855145812034607, "learning_rate": 1.6972580603402364e-05, "loss": 7.788, "step": 1123 }, { "epoch": 0.8140503349628825, "grad_norm": 0.479667603969574, "learning_rate": 1.6845082951427572e-05, "loss": 7.8281, "step": 1124 }, { "epoch": 0.8147745790331342, "grad_norm": 0.4779108464717865, "learning_rate": 1.671802192845594e-05, "loss": 7.8274, "step": 1125 }, { "epoch": 0.8154988231033858, "grad_norm": 0.4873594641685486, "learning_rate": 1.659139820165825e-05, "loss": 7.8795, "step": 1126 }, { "epoch": 0.8162230671736375, "grad_norm": 0.47458213567733765, "learning_rate": 1.6465212435909073e-05, "loss": 7.8462, "step": 1127 }, { "epoch": 0.8169473112438892, "grad_norm": 0.4490852355957031, "learning_rate": 1.6339465293783328e-05, "loss": 7.8258, "step": 1128 }, { "epoch": 0.8176715553141408, "grad_norm": 0.4825502038002014, "learning_rate": 1.6214157435552914e-05, "loss": 7.8331, "step": 1129 }, { "epoch": 0.8183957993843926, "grad_norm": 0.4452913701534271, "learning_rate": 1.6089289519183036e-05, "loss": 7.8999, "step": 1130 }, { "epoch": 0.8191200434546442, "grad_norm": 0.4826939105987549, "learning_rate": 1.596486220032888e-05, "loss": 7.8164, "step": 1131 }, { "epoch": 0.8198442875248959, "grad_norm": 0.46260425448417664, "learning_rate": 1.5840876132332195e-05, "loss": 7.8404, "step": 1132 }, { "epoch": 0.8205685315951475, "grad_norm": 0.476392537355423, "learning_rate": 1.571733196621774e-05, "loss": 7.8025, "step": 1133 }, { "epoch": 0.8212927756653993, "grad_norm": 0.43944284319877625, "learning_rate": 1.5594230350690044e-05, "loss": 7.8283, "step": 1134 }, { "epoch": 0.8220170197356509, "grad_norm": 0.48247215151786804, "learning_rate": 1.547157193212977e-05, "loss": 7.7797, "step": 1135 }, { "epoch": 0.8227412638059026, "grad_norm": 0.4725157618522644, "learning_rate": 1.5349357354590555e-05, "loss": 7.844, "step": 1136 }, { "epoch": 0.8234655078761542, "grad_norm": 0.46587786078453064, "learning_rate": 1.5227587259795462e-05, "loss": 7.8471, "step": 1137 }, { "epoch": 0.824189751946406, "grad_norm": 0.4861607253551483, "learning_rate": 1.5106262287133643e-05, "loss": 7.745, "step": 1138 }, { "epoch": 0.8249139960166576, "grad_norm": 0.4683452248573303, "learning_rate": 1.4985383073657112e-05, "loss": 7.7614, "step": 1139 }, { "epoch": 0.8256382400869093, "grad_norm": 0.47723594307899475, "learning_rate": 1.4864950254077181e-05, "loss": 7.8326, "step": 1140 }, { "epoch": 0.826362484157161, "grad_norm": 0.49085745215415955, "learning_rate": 1.4744964460761312e-05, "loss": 7.7305, "step": 1141 }, { "epoch": 0.8270867282274127, "grad_norm": 0.5058842301368713, "learning_rate": 1.4625426323729708e-05, "loss": 7.7709, "step": 1142 }, { "epoch": 0.8278109722976643, "grad_norm": 0.47824400663375854, "learning_rate": 1.4506336470651982e-05, "loss": 7.7977, "step": 1143 }, { "epoch": 0.8285352163679159, "grad_norm": 0.5179119110107422, "learning_rate": 1.438769552684398e-05, "loss": 7.7603, "step": 1144 }, { "epoch": 0.8292594604381677, "grad_norm": 0.4722190201282501, "learning_rate": 1.4269504115264376e-05, "loss": 7.7913, "step": 1145 }, { "epoch": 0.8299837045084193, "grad_norm": 0.47829577326774597, "learning_rate": 1.4151762856511419e-05, "loss": 7.8386, "step": 1146 }, { "epoch": 0.830707948578671, "grad_norm": 0.47289183735847473, "learning_rate": 1.4034472368819718e-05, "loss": 7.8422, "step": 1147 }, { "epoch": 0.8314321926489227, "grad_norm": 0.502065122127533, "learning_rate": 1.391763326805704e-05, "loss": 7.8705, "step": 1148 }, { "epoch": 0.8321564367191744, "grad_norm": 0.529735267162323, "learning_rate": 1.3801246167720904e-05, "loss": 7.7626, "step": 1149 }, { "epoch": 0.832880680789426, "grad_norm": 0.5681172609329224, "learning_rate": 1.3685311678935575e-05, "loss": 7.8454, "step": 1150 }, { "epoch": 0.8336049248596777, "grad_norm": 0.4957883954048157, "learning_rate": 1.3569830410448658e-05, "loss": 7.8634, "step": 1151 }, { "epoch": 0.8343291689299294, "grad_norm": 0.4802315831184387, "learning_rate": 1.34548029686281e-05, "loss": 7.8578, "step": 1152 }, { "epoch": 0.8350534130001811, "grad_norm": 0.49144551157951355, "learning_rate": 1.3340229957458783e-05, "loss": 7.802, "step": 1153 }, { "epoch": 0.8357776570704327, "grad_norm": 0.457791805267334, "learning_rate": 1.3226111978539524e-05, "loss": 7.8177, "step": 1154 }, { "epoch": 0.8365019011406845, "grad_norm": 0.4937816858291626, "learning_rate": 1.3112449631079892e-05, "loss": 7.8694, "step": 1155 }, { "epoch": 0.8372261452109361, "grad_norm": 0.4611557126045227, "learning_rate": 1.2999243511896974e-05, "loss": 7.8976, "step": 1156 }, { "epoch": 0.8379503892811878, "grad_norm": 0.4825960099697113, "learning_rate": 1.2886494215412348e-05, "loss": 7.8651, "step": 1157 }, { "epoch": 0.8386746333514394, "grad_norm": 0.4975496530532837, "learning_rate": 1.2774202333648844e-05, "loss": 7.8353, "step": 1158 }, { "epoch": 0.8393988774216911, "grad_norm": 0.46706926822662354, "learning_rate": 1.2662368456227602e-05, "loss": 7.8567, "step": 1159 }, { "epoch": 0.8401231214919428, "grad_norm": 0.46040868759155273, "learning_rate": 1.25509931703648e-05, "loss": 7.8983, "step": 1160 }, { "epoch": 0.8408473655621944, "grad_norm": 0.4722301661968231, "learning_rate": 1.2440077060868638e-05, "loss": 7.8863, "step": 1161 }, { "epoch": 0.8415716096324461, "grad_norm": 0.47437435388565063, "learning_rate": 1.2329620710136358e-05, "loss": 7.8298, "step": 1162 }, { "epoch": 0.8422958537026978, "grad_norm": 0.4701695144176483, "learning_rate": 1.2219624698151033e-05, "loss": 7.8605, "step": 1163 }, { "epoch": 0.8430200977729495, "grad_norm": 0.4844394326210022, "learning_rate": 1.2110089602478624e-05, "loss": 7.8583, "step": 1164 }, { "epoch": 0.8437443418432011, "grad_norm": 0.4796792268753052, "learning_rate": 1.2001015998264886e-05, "loss": 7.8471, "step": 1165 }, { "epoch": 0.8444685859134529, "grad_norm": 0.4662293493747711, "learning_rate": 1.1892404458232454e-05, "loss": 7.9045, "step": 1166 }, { "epoch": 0.8451928299837045, "grad_norm": 0.49967148900032043, "learning_rate": 1.178425555267768e-05, "loss": 7.847, "step": 1167 }, { "epoch": 0.8459170740539562, "grad_norm": 0.45847171545028687, "learning_rate": 1.1676569849467733e-05, "loss": 7.8271, "step": 1168 }, { "epoch": 0.8466413181242078, "grad_norm": 0.4743190407752991, "learning_rate": 1.1569347914037664e-05, "loss": 7.8559, "step": 1169 }, { "epoch": 0.8473655621944596, "grad_norm": 0.48577260971069336, "learning_rate": 1.1462590309387289e-05, "loss": 7.8242, "step": 1170 }, { "epoch": 0.8480898062647112, "grad_norm": 0.4353601932525635, "learning_rate": 1.1356297596078425e-05, "loss": 7.8549, "step": 1171 }, { "epoch": 0.8488140503349629, "grad_norm": 0.46052125096321106, "learning_rate": 1.125047033223171e-05, "loss": 7.8489, "step": 1172 }, { "epoch": 0.8495382944052146, "grad_norm": 0.4780481159687042, "learning_rate": 1.1145109073523951e-05, "loss": 7.8461, "step": 1173 }, { "epoch": 0.8502625384754662, "grad_norm": 0.48932063579559326, "learning_rate": 1.1040214373184954e-05, "loss": 7.8926, "step": 1174 }, { "epoch": 0.8509867825457179, "grad_norm": 0.5182289481163025, "learning_rate": 1.0935786781994738e-05, "loss": 7.8103, "step": 1175 }, { "epoch": 0.8517110266159695, "grad_norm": 0.4565328359603882, "learning_rate": 1.0831826848280679e-05, "loss": 7.853, "step": 1176 }, { "epoch": 0.8524352706862213, "grad_norm": 0.4468229115009308, "learning_rate": 1.0728335117914534e-05, "loss": 7.8157, "step": 1177 }, { "epoch": 0.8531595147564729, "grad_norm": 0.4953078329563141, "learning_rate": 1.0625312134309662e-05, "loss": 7.8499, "step": 1178 }, { "epoch": 0.8538837588267246, "grad_norm": 0.4719950258731842, "learning_rate": 1.052275843841809e-05, "loss": 7.8703, "step": 1179 }, { "epoch": 0.8546080028969762, "grad_norm": 0.506100594997406, "learning_rate": 1.042067456872775e-05, "loss": 7.7924, "step": 1180 }, { "epoch": 0.855332246967228, "grad_norm": 0.46659165620803833, "learning_rate": 1.0319061061259606e-05, "loss": 7.8599, "step": 1181 }, { "epoch": 0.8560564910374796, "grad_norm": 0.4609448313713074, "learning_rate": 1.0217918449564812e-05, "loss": 7.8043, "step": 1182 }, { "epoch": 0.8567807351077313, "grad_norm": 0.4527445435523987, "learning_rate": 1.0117247264722008e-05, "loss": 7.8937, "step": 1183 }, { "epoch": 0.857504979177983, "grad_norm": 0.478190153837204, "learning_rate": 1.0017048035334408e-05, "loss": 7.8079, "step": 1184 }, { "epoch": 0.8582292232482347, "grad_norm": 0.46092936396598816, "learning_rate": 9.917321287527148e-06, "loss": 7.8575, "step": 1185 }, { "epoch": 0.8589534673184863, "grad_norm": 0.4953816533088684, "learning_rate": 9.81806754494441e-06, "loss": 7.8281, "step": 1186 }, { "epoch": 0.859677711388738, "grad_norm": 0.4743238687515259, "learning_rate": 9.719287328746773e-06, "loss": 7.8017, "step": 1187 }, { "epoch": 0.8604019554589897, "grad_norm": 0.48036321997642517, "learning_rate": 9.620981157608389e-06, "loss": 7.8096, "step": 1188 }, { "epoch": 0.8611261995292414, "grad_norm": 0.4978514611721039, "learning_rate": 9.523149547714327e-06, "loss": 7.7873, "step": 1189 }, { "epoch": 0.861850443599493, "grad_norm": 0.5149461627006531, "learning_rate": 9.425793012757812e-06, "loss": 7.8773, "step": 1190 }, { "epoch": 0.8625746876697447, "grad_norm": 0.4775876998901367, "learning_rate": 9.328912063937544e-06, "loss": 7.8557, "step": 1191 }, { "epoch": 0.8632989317399964, "grad_norm": 0.4656752943992615, "learning_rate": 9.232507209955077e-06, "loss": 7.8102, "step": 1192 }, { "epoch": 0.864023175810248, "grad_norm": 0.49488770961761475, "learning_rate": 9.136578957011998e-06, "loss": 7.8236, "step": 1193 }, { "epoch": 0.8647474198804997, "grad_norm": 0.4837650656700134, "learning_rate": 9.04112780880747e-06, "loss": 7.7681, "step": 1194 }, { "epoch": 0.8654716639507514, "grad_norm": 0.4603443443775177, "learning_rate": 8.946154266535366e-06, "loss": 7.8176, "step": 1195 }, { "epoch": 0.8661959080210031, "grad_norm": 0.4513426423072815, "learning_rate": 8.851658828881858e-06, "loss": 7.8717, "step": 1196 }, { "epoch": 0.8669201520912547, "grad_norm": 0.48408597707748413, "learning_rate": 8.757641992022614e-06, "loss": 7.758, "step": 1197 }, { "epoch": 0.8676443961615065, "grad_norm": 0.5038301348686218, "learning_rate": 8.664104249620298e-06, "loss": 7.8486, "step": 1198 }, { "epoch": 0.8683686402317581, "grad_norm": 0.5601458549499512, "learning_rate": 8.571046092821955e-06, "loss": 7.713, "step": 1199 }, { "epoch": 0.8690928843020098, "grad_norm": 0.5590780377388, "learning_rate": 8.478468010256425e-06, "loss": 7.7364, "step": 1200 }, { "epoch": 0.8698171283722614, "grad_norm": 0.4802273213863373, "learning_rate": 8.386370488031759e-06, "loss": 7.8386, "step": 1201 }, { "epoch": 0.8705413724425132, "grad_norm": 0.5128775238990784, "learning_rate": 8.294754009732696e-06, "loss": 7.7979, "step": 1202 }, { "epoch": 0.8712656165127648, "grad_norm": 0.49584537744522095, "learning_rate": 8.203619056418155e-06, "loss": 7.7934, "step": 1203 }, { "epoch": 0.8719898605830165, "grad_norm": 0.47992539405822754, "learning_rate": 8.112966106618602e-06, "loss": 7.8282, "step": 1204 }, { "epoch": 0.8727141046532682, "grad_norm": 0.4876195788383484, "learning_rate": 8.022795636333636e-06, "loss": 7.8194, "step": 1205 }, { "epoch": 0.8734383487235198, "grad_norm": 0.49370482563972473, "learning_rate": 7.933108119029475e-06, "loss": 7.8294, "step": 1206 }, { "epoch": 0.8741625927937715, "grad_norm": 0.47466379404067993, "learning_rate": 7.843904025636417e-06, "loss": 7.8737, "step": 1207 }, { "epoch": 0.8748868368640231, "grad_norm": 0.5090891122817993, "learning_rate": 7.75518382454643e-06, "loss": 7.8391, "step": 1208 }, { "epoch": 0.8756110809342749, "grad_norm": 0.5014375448226929, "learning_rate": 7.66694798161064e-06, "loss": 7.8252, "step": 1209 }, { "epoch": 0.8763353250045265, "grad_norm": 0.46806249022483826, "learning_rate": 7.579196960136959e-06, "loss": 7.8172, "step": 1210 }, { "epoch": 0.8770595690747782, "grad_norm": 0.4725843071937561, "learning_rate": 7.4919312208875385e-06, "loss": 7.8868, "step": 1211 }, { "epoch": 0.8777838131450298, "grad_norm": 0.5052827000617981, "learning_rate": 7.405151222076434e-06, "loss": 7.8121, "step": 1212 }, { "epoch": 0.8785080572152816, "grad_norm": 0.4352813959121704, "learning_rate": 7.318857419367242e-06, "loss": 7.8594, "step": 1213 }, { "epoch": 0.8792323012855332, "grad_norm": 0.4714949429035187, "learning_rate": 7.233050265870534e-06, "loss": 7.8642, "step": 1214 }, { "epoch": 0.8799565453557849, "grad_norm": 0.48388224840164185, "learning_rate": 7.147730212141701e-06, "loss": 7.8853, "step": 1215 }, { "epoch": 0.8806807894260366, "grad_norm": 0.48352184891700745, "learning_rate": 7.062897706178384e-06, "loss": 7.8965, "step": 1216 }, { "epoch": 0.8814050334962883, "grad_norm": 0.46383845806121826, "learning_rate": 6.9785531934182915e-06, "loss": 7.8627, "step": 1217 }, { "epoch": 0.8821292775665399, "grad_norm": 0.48393815755844116, "learning_rate": 6.894697116736715e-06, "loss": 7.874, "step": 1218 }, { "epoch": 0.8828535216367916, "grad_norm": 0.4774603843688965, "learning_rate": 6.8113299164443205e-06, "loss": 7.8833, "step": 1219 }, { "epoch": 0.8835777657070433, "grad_norm": 0.47935643792152405, "learning_rate": 6.728452030284738e-06, "loss": 7.8931, "step": 1220 }, { "epoch": 0.8843020097772949, "grad_norm": 0.45838284492492676, "learning_rate": 6.646063893432364e-06, "loss": 7.8718, "step": 1221 }, { "epoch": 0.8850262538475466, "grad_norm": 0.4871087074279785, "learning_rate": 6.564165938489996e-06, "loss": 7.8648, "step": 1222 }, { "epoch": 0.8857504979177983, "grad_norm": 0.4909784495830536, "learning_rate": 6.482758595486571e-06, "loss": 7.793, "step": 1223 }, { "epoch": 0.88647474198805, "grad_norm": 0.4485389292240143, "learning_rate": 6.401842291874982e-06, "loss": 7.8831, "step": 1224 }, { "epoch": 0.8871989860583016, "grad_norm": 0.479890376329422, "learning_rate": 6.32141745252971e-06, "loss": 7.8632, "step": 1225 }, { "epoch": 0.8879232301285533, "grad_norm": 0.456013023853302, "learning_rate": 6.241484499744732e-06, "loss": 7.8373, "step": 1226 }, { "epoch": 0.888647474198805, "grad_norm": 0.4660279452800751, "learning_rate": 6.16204385323117e-06, "loss": 7.8827, "step": 1227 }, { "epoch": 0.8893717182690567, "grad_norm": 0.4719277620315552, "learning_rate": 6.083095930115157e-06, "loss": 7.7943, "step": 1228 }, { "epoch": 0.8900959623393083, "grad_norm": 0.5206422209739685, "learning_rate": 6.004641144935696e-06, "loss": 7.7849, "step": 1229 }, { "epoch": 0.8908202064095601, "grad_norm": 0.4772622585296631, "learning_rate": 5.926679909642341e-06, "loss": 7.8206, "step": 1230 }, { "epoch": 0.8915444504798117, "grad_norm": 0.4761473536491394, "learning_rate": 5.849212633593193e-06, "loss": 7.83, "step": 1231 }, { "epoch": 0.8922686945500634, "grad_norm": 0.46055835485458374, "learning_rate": 5.77223972355263e-06, "loss": 7.8043, "step": 1232 }, { "epoch": 0.892992938620315, "grad_norm": 0.5095073580741882, "learning_rate": 5.695761583689263e-06, "loss": 7.7667, "step": 1233 }, { "epoch": 0.8937171826905668, "grad_norm": 0.466761976480484, "learning_rate": 5.619778615573712e-06, "loss": 7.8028, "step": 1234 }, { "epoch": 0.8944414267608184, "grad_norm": 0.5017898082733154, "learning_rate": 5.544291218176578e-06, "loss": 7.79, "step": 1235 }, { "epoch": 0.8951656708310701, "grad_norm": 0.464933305978775, "learning_rate": 5.469299787866355e-06, "loss": 7.8344, "step": 1236 }, { "epoch": 0.8958899149013217, "grad_norm": 0.47450631856918335, "learning_rate": 5.394804718407276e-06, "loss": 7.8237, "step": 1237 }, { "epoch": 0.8966141589715734, "grad_norm": 0.4888070821762085, "learning_rate": 5.320806400957312e-06, "loss": 7.7909, "step": 1238 }, { "epoch": 0.8973384030418251, "grad_norm": 0.4835636019706726, "learning_rate": 5.247305224066079e-06, "loss": 7.8204, "step": 1239 }, { "epoch": 0.8980626471120767, "grad_norm": 0.5021434426307678, "learning_rate": 5.174301573672813e-06, "loss": 7.824, "step": 1240 }, { "epoch": 0.8987868911823285, "grad_norm": 0.47264546155929565, "learning_rate": 5.101795833104362e-06, "loss": 7.8474, "step": 1241 }, { "epoch": 0.8995111352525801, "grad_norm": 0.514746367931366, "learning_rate": 5.02978838307312e-06, "loss": 7.818, "step": 1242 }, { "epoch": 0.9002353793228318, "grad_norm": 0.45614975690841675, "learning_rate": 4.958279601675109e-06, "loss": 7.7936, "step": 1243 }, { "epoch": 0.9009596233930834, "grad_norm": 0.477163165807724, "learning_rate": 4.887269864387889e-06, "loss": 7.8263, "step": 1244 }, { "epoch": 0.9016838674633352, "grad_norm": 0.4801469147205353, "learning_rate": 4.816759544068705e-06, "loss": 7.7873, "step": 1245 }, { "epoch": 0.9024081115335868, "grad_norm": 0.5064570307731628, "learning_rate": 4.746749010952412e-06, "loss": 7.7897, "step": 1246 }, { "epoch": 0.9031323556038385, "grad_norm": 0.4912499785423279, "learning_rate": 4.677238632649639e-06, "loss": 7.7679, "step": 1247 }, { "epoch": 0.9038565996740902, "grad_norm": 0.5194550156593323, "learning_rate": 4.608228774144785e-06, "loss": 7.7607, "step": 1248 }, { "epoch": 0.9045808437443419, "grad_norm": 0.5179385542869568, "learning_rate": 4.5397197977940845e-06, "loss": 7.7615, "step": 1249 }, { "epoch": 0.9053050878145935, "grad_norm": 0.5159029364585876, "learning_rate": 4.471712063323818e-06, "loss": 7.7898, "step": 1250 }, { "epoch": 0.9060293318848452, "grad_norm": 0.4930214285850525, "learning_rate": 4.4042059278282865e-06, "loss": 7.8184, "step": 1251 }, { "epoch": 0.9067535759550969, "grad_norm": 0.5010021328926086, "learning_rate": 4.33720174576806e-06, "loss": 7.8191, "step": 1252 }, { "epoch": 0.9074778200253485, "grad_norm": 0.47560325264930725, "learning_rate": 4.270699868967998e-06, "loss": 7.8237, "step": 1253 }, { "epoch": 0.9082020640956002, "grad_norm": 0.4990387260913849, "learning_rate": 4.2047006466155115e-06, "loss": 7.841, "step": 1254 }, { "epoch": 0.9089263081658518, "grad_norm": 0.5125808715820312, "learning_rate": 4.139204425258625e-06, "loss": 7.8264, "step": 1255 }, { "epoch": 0.9096505522361036, "grad_norm": 0.47210493683815, "learning_rate": 4.0742115488042636e-06, "loss": 7.8051, "step": 1256 }, { "epoch": 0.9103747963063552, "grad_norm": 0.4914381206035614, "learning_rate": 4.009722358516366e-06, "loss": 7.8372, "step": 1257 }, { "epoch": 0.9110990403766069, "grad_norm": 0.48577770590782166, "learning_rate": 3.945737193014121e-06, "loss": 7.848, "step": 1258 }, { "epoch": 0.9118232844468586, "grad_norm": 0.49301815032958984, "learning_rate": 3.8822563882702e-06, "loss": 7.8581, "step": 1259 }, { "epoch": 0.9125475285171103, "grad_norm": 0.4755497872829437, "learning_rate": 3.819280277608939e-06, "loss": 7.8607, "step": 1260 }, { "epoch": 0.9132717725873619, "grad_norm": 0.483230322599411, "learning_rate": 3.7568091917047244e-06, "loss": 7.8094, "step": 1261 }, { "epoch": 0.9139960166576137, "grad_norm": 0.4857102632522583, "learning_rate": 3.694843458580055e-06, "loss": 7.8308, "step": 1262 }, { "epoch": 0.9147202607278653, "grad_norm": 0.4947606921195984, "learning_rate": 3.633383403604018e-06, "loss": 7.8509, "step": 1263 }, { "epoch": 0.915444504798117, "grad_norm": 0.489041268825531, "learning_rate": 3.572429349490436e-06, "loss": 7.8645, "step": 1264 }, { "epoch": 0.9161687488683686, "grad_norm": 0.5049325227737427, "learning_rate": 3.511981616296245e-06, "loss": 7.8248, "step": 1265 }, { "epoch": 0.9168929929386204, "grad_norm": 0.4949936866760254, "learning_rate": 3.4520405214197972e-06, "loss": 7.8719, "step": 1266 }, { "epoch": 0.917617237008872, "grad_norm": 0.49333250522613525, "learning_rate": 3.3926063795991723e-06, "loss": 7.8477, "step": 1267 }, { "epoch": 0.9183414810791236, "grad_norm": 0.4706898331642151, "learning_rate": 3.3336795029106027e-06, "loss": 7.831, "step": 1268 }, { "epoch": 0.9190657251493753, "grad_norm": 0.5014546513557434, "learning_rate": 3.2752602007667167e-06, "loss": 7.8772, "step": 1269 }, { "epoch": 0.919789969219627, "grad_norm": 0.50562584400177, "learning_rate": 3.2173487799150083e-06, "loss": 7.8077, "step": 1270 }, { "epoch": 0.9205142132898787, "grad_norm": 0.4861434996128082, "learning_rate": 3.159945544436171e-06, "loss": 7.8048, "step": 1271 }, { "epoch": 0.9212384573601303, "grad_norm": 0.4890470802783966, "learning_rate": 3.103050795742546e-06, "loss": 7.8218, "step": 1272 }, { "epoch": 0.9219627014303821, "grad_norm": 0.46198517084121704, "learning_rate": 3.046664832576518e-06, "loss": 7.8283, "step": 1273 }, { "epoch": 0.9226869455006337, "grad_norm": 0.4833485186100006, "learning_rate": 2.990787951008911e-06, "loss": 7.8617, "step": 1274 }, { "epoch": 0.9234111895708854, "grad_norm": 0.47587546706199646, "learning_rate": 2.93542044443752e-06, "loss": 7.8225, "step": 1275 }, { "epoch": 0.924135433641137, "grad_norm": 0.4739225506782532, "learning_rate": 2.8805626035854793e-06, "loss": 7.8603, "step": 1276 }, { "epoch": 0.9248596777113888, "grad_norm": 0.48468372225761414, "learning_rate": 2.8262147164997975e-06, "loss": 7.8818, "step": 1277 }, { "epoch": 0.9255839217816404, "grad_norm": 0.47420841455459595, "learning_rate": 2.772377068549792e-06, "loss": 7.8859, "step": 1278 }, { "epoch": 0.9263081658518921, "grad_norm": 0.4387821555137634, "learning_rate": 2.7190499424256665e-06, "loss": 7.8423, "step": 1279 }, { "epoch": 0.9270324099221438, "grad_norm": 0.4843416213989258, "learning_rate": 2.6662336181369485e-06, "loss": 7.8529, "step": 1280 }, { "epoch": 0.9277566539923955, "grad_norm": 0.45708930492401123, "learning_rate": 2.613928373011065e-06, "loss": 7.7918, "step": 1281 }, { "epoch": 0.9284808980626471, "grad_norm": 0.44832369685173035, "learning_rate": 2.5621344816918803e-06, "loss": 7.9067, "step": 1282 }, { "epoch": 0.9292051421328987, "grad_norm": 0.485809862613678, "learning_rate": 2.5108522161382153e-06, "loss": 7.8254, "step": 1283 }, { "epoch": 0.9299293862031505, "grad_norm": 0.4524907171726227, "learning_rate": 2.4600818456225083e-06, "loss": 7.8376, "step": 1284 }, { "epoch": 0.9306536302734021, "grad_norm": 0.4835200309753418, "learning_rate": 2.4098236367292805e-06, "loss": 7.7911, "step": 1285 }, { "epoch": 0.9313778743436538, "grad_norm": 0.45698094367980957, "learning_rate": 2.360077853353848e-06, "loss": 7.9004, "step": 1286 }, { "epoch": 0.9321021184139054, "grad_norm": 0.47620901465415955, "learning_rate": 2.3108447567008695e-06, "loss": 7.7822, "step": 1287 }, { "epoch": 0.9328263624841572, "grad_norm": 0.502024233341217, "learning_rate": 2.262124605282978e-06, "loss": 7.8447, "step": 1288 }, { "epoch": 0.9335506065544088, "grad_norm": 0.5169337391853333, "learning_rate": 2.213917654919473e-06, "loss": 7.8207, "step": 1289 }, { "epoch": 0.9342748506246605, "grad_norm": 0.5109481811523438, "learning_rate": 2.1662241587349195e-06, "loss": 7.8592, "step": 1290 }, { "epoch": 0.9349990946949122, "grad_norm": 0.47905823588371277, "learning_rate": 2.119044367157852e-06, "loss": 7.9079, "step": 1291 }, { "epoch": 0.9357233387651639, "grad_norm": 0.48844465613365173, "learning_rate": 2.0723785279194386e-06, "loss": 7.7795, "step": 1292 }, { "epoch": 0.9364475828354155, "grad_norm": 0.49026617407798767, "learning_rate": 2.026226886052207e-06, "loss": 7.8043, "step": 1293 }, { "epoch": 0.9371718269056672, "grad_norm": 0.505928635597229, "learning_rate": 1.9805896838887337e-06, "loss": 7.7842, "step": 1294 }, { "epoch": 0.9378960709759189, "grad_norm": 0.4998481869697571, "learning_rate": 1.935467161060378e-06, "loss": 7.903, "step": 1295 }, { "epoch": 0.9386203150461706, "grad_norm": 0.5379026532173157, "learning_rate": 1.8908595544960272e-06, "loss": 7.8672, "step": 1296 }, { "epoch": 0.9393445591164222, "grad_norm": 0.4858308732509613, "learning_rate": 1.8467670984208652e-06, "loss": 7.7751, "step": 1297 }, { "epoch": 0.940068803186674, "grad_norm": 0.51478111743927, "learning_rate": 1.8031900243550948e-06, "loss": 7.8418, "step": 1298 }, { "epoch": 0.9407930472569256, "grad_norm": 0.4762341380119324, "learning_rate": 1.760128561112795e-06, "loss": 7.8597, "step": 1299 }, { "epoch": 0.9415172913271772, "grad_norm": 0.5281409025192261, "learning_rate": 1.7175829348006767e-06, "loss": 7.7901, "step": 1300 }, { "epoch": 0.9422415353974289, "grad_norm": 0.4912181496620178, "learning_rate": 1.6755533688168624e-06, "loss": 7.8392, "step": 1301 }, { "epoch": 0.9429657794676806, "grad_norm": 0.47923725843429565, "learning_rate": 1.634040083849786e-06, "loss": 7.8797, "step": 1302 }, { "epoch": 0.9436900235379323, "grad_norm": 0.48932698369026184, "learning_rate": 1.593043297876984e-06, "loss": 7.8072, "step": 1303 }, { "epoch": 0.9444142676081839, "grad_norm": 0.4930644631385803, "learning_rate": 1.5525632261639722e-06, "loss": 7.8609, "step": 1304 }, { "epoch": 0.9451385116784357, "grad_norm": 0.46192994713783264, "learning_rate": 1.5126000812631159e-06, "loss": 7.8686, "step": 1305 }, { "epoch": 0.9458627557486873, "grad_norm": 0.4716580808162689, "learning_rate": 1.4731540730124616e-06, "loss": 7.8314, "step": 1306 }, { "epoch": 0.946586999818939, "grad_norm": 0.45859482884407043, "learning_rate": 1.4342254085347506e-06, "loss": 7.8843, "step": 1307 }, { "epoch": 0.9473112438891906, "grad_norm": 0.4815140664577484, "learning_rate": 1.3958142922362083e-06, "loss": 7.8706, "step": 1308 }, { "epoch": 0.9480354879594424, "grad_norm": 0.46593400835990906, "learning_rate": 1.3579209258055226e-06, "loss": 7.8941, "step": 1309 }, { "epoch": 0.948759732029694, "grad_norm": 0.47369858622550964, "learning_rate": 1.3205455082128225e-06, "loss": 7.8729, "step": 1310 }, { "epoch": 0.9494839760999457, "grad_norm": 0.46756669878959656, "learning_rate": 1.283688235708569e-06, "loss": 7.8424, "step": 1311 }, { "epoch": 0.9502082201701973, "grad_norm": 0.5048179626464844, "learning_rate": 1.2473493018225646e-06, "loss": 7.854, "step": 1312 }, { "epoch": 0.9509324642404491, "grad_norm": 0.45838841795921326, "learning_rate": 1.2115288973629014e-06, "loss": 7.8695, "step": 1313 }, { "epoch": 0.9516567083107007, "grad_norm": 0.5052929520606995, "learning_rate": 1.176227210415015e-06, "loss": 7.788, "step": 1314 }, { "epoch": 0.9523809523809523, "grad_norm": 0.4793596863746643, "learning_rate": 1.1414444263406432e-06, "loss": 7.8184, "step": 1315 }, { "epoch": 0.9531051964512041, "grad_norm": 0.47426638007164, "learning_rate": 1.1071807277768798e-06, "loss": 7.8186, "step": 1316 }, { "epoch": 0.9538294405214557, "grad_norm": 0.48723068833351135, "learning_rate": 1.0734362946352107e-06, "loss": 7.8259, "step": 1317 }, { "epoch": 0.9545536845917074, "grad_norm": 0.4758625626564026, "learning_rate": 1.0402113041005468e-06, "loss": 7.8321, "step": 1318 }, { "epoch": 0.955277928661959, "grad_norm": 0.4779294431209564, "learning_rate": 1.0075059306303702e-06, "loss": 7.8489, "step": 1319 }, { "epoch": 0.9560021727322108, "grad_norm": 0.48141980171203613, "learning_rate": 9.753203459537009e-07, "loss": 7.8922, "step": 1320 }, { "epoch": 0.9567264168024624, "grad_norm": 0.4511309862136841, "learning_rate": 9.436547190702971e-07, "loss": 7.8431, "step": 1321 }, { "epoch": 0.9574506608727141, "grad_norm": 0.4854510724544525, "learning_rate": 9.125092162497129e-07, "loss": 7.8434, "step": 1322 }, { "epoch": 0.9581749049429658, "grad_norm": 0.48352810740470886, "learning_rate": 8.818840010304308e-07, "loss": 7.864, "step": 1323 }, { "epoch": 0.9588991490132175, "grad_norm": 0.4762645661830902, "learning_rate": 8.5177923421903e-07, "loss": 7.8196, "step": 1324 }, { "epoch": 0.9596233930834691, "grad_norm": 0.4877321720123291, "learning_rate": 8.221950738893203e-07, "loss": 7.7954, "step": 1325 }, { "epoch": 0.9603476371537208, "grad_norm": 0.4776611924171448, "learning_rate": 7.931316753815088e-07, "loss": 7.8505, "step": 1326 }, { "epoch": 0.9610718812239725, "grad_norm": 0.479509562253952, "learning_rate": 7.645891913014013e-07, "loss": 7.8779, "step": 1327 }, { "epoch": 0.9617961252942242, "grad_norm": 0.46871259808540344, "learning_rate": 7.365677715195918e-07, "loss": 7.8008, "step": 1328 }, { "epoch": 0.9625203693644758, "grad_norm": 0.4688069820404053, "learning_rate": 7.090675631706512e-07, "loss": 7.8098, "step": 1329 }, { "epoch": 0.9632446134347274, "grad_norm": 0.4722239077091217, "learning_rate": 6.820887106524065e-07, "loss": 7.8363, "step": 1330 }, { "epoch": 0.9639688575049792, "grad_norm": 0.4427089989185333, "learning_rate": 6.556313556251636e-07, "loss": 7.8216, "step": 1331 }, { "epoch": 0.9646931015752308, "grad_norm": 0.475266695022583, "learning_rate": 6.296956370109075e-07, "loss": 7.8203, "step": 1332 }, { "epoch": 0.9654173456454825, "grad_norm": 0.47641658782958984, "learning_rate": 6.042816909926585e-07, "loss": 7.85, "step": 1333 }, { "epoch": 0.9661415897157342, "grad_norm": 0.47987788915634155, "learning_rate": 5.793896510137287e-07, "loss": 7.7702, "step": 1334 }, { "epoch": 0.9668658337859859, "grad_norm": 0.5124553442001343, "learning_rate": 5.550196477769665e-07, "loss": 7.8409, "step": 1335 }, { "epoch": 0.9675900778562375, "grad_norm": 0.4751659333705902, "learning_rate": 5.311718092441465e-07, "loss": 7.8114, "step": 1336 }, { "epoch": 0.9683143219264893, "grad_norm": 0.45295917987823486, "learning_rate": 5.078462606352585e-07, "loss": 7.9019, "step": 1337 }, { "epoch": 0.9690385659967409, "grad_norm": 0.49618884921073914, "learning_rate": 4.850431244278753e-07, "loss": 7.7931, "step": 1338 }, { "epoch": 0.9697628100669926, "grad_norm": 0.48623406887054443, "learning_rate": 4.627625203564523e-07, "loss": 7.8004, "step": 1339 }, { "epoch": 0.9704870541372442, "grad_norm": 0.43766334652900696, "learning_rate": 4.4100456541177335e-07, "loss": 7.8297, "step": 1340 }, { "epoch": 0.971211298207496, "grad_norm": 0.46884864568710327, "learning_rate": 4.1976937384028417e-07, "loss": 7.8893, "step": 1341 }, { "epoch": 0.9719355422777476, "grad_norm": 0.48704731464385986, "learning_rate": 3.990570571435259e-07, "loss": 7.7991, "step": 1342 }, { "epoch": 0.9726597863479993, "grad_norm": 0.4837970733642578, "learning_rate": 3.7886772407751406e-07, "loss": 7.8021, "step": 1343 }, { "epoch": 0.973384030418251, "grad_norm": 0.5404148697853088, "learning_rate": 3.5920148065220484e-07, "loss": 7.7664, "step": 1344 }, { "epoch": 0.9741082744885027, "grad_norm": 0.544996976852417, "learning_rate": 3.4005843013089625e-07, "loss": 7.8745, "step": 1345 }, { "epoch": 0.9748325185587543, "grad_norm": 0.5073729753494263, "learning_rate": 3.2143867302973917e-07, "loss": 7.774, "step": 1346 }, { "epoch": 0.9755567626290059, "grad_norm": 0.5275909304618835, "learning_rate": 3.033423071171604e-07, "loss": 7.7614, "step": 1347 }, { "epoch": 0.9762810066992577, "grad_norm": 0.5046758651733398, "learning_rate": 2.857694274133849e-07, "loss": 7.7907, "step": 1348 }, { "epoch": 0.9770052507695093, "grad_norm": 0.5203927159309387, "learning_rate": 2.6872012618990306e-07, "loss": 7.852, "step": 1349 }, { "epoch": 0.977729494839761, "grad_norm": 0.6230973601341248, "learning_rate": 2.5219449296900455e-07, "loss": 7.8475, "step": 1350 }, { "epoch": 0.9784537389100126, "grad_norm": 0.4698044955730438, "learning_rate": 2.3619261452335617e-07, "loss": 7.83, "step": 1351 }, { "epoch": 0.9791779829802644, "grad_norm": 0.48747655749320984, "learning_rate": 2.207145748754247e-07, "loss": 7.8553, "step": 1352 }, { "epoch": 0.979902227050516, "grad_norm": 0.5164722800254822, "learning_rate": 2.0576045529715482e-07, "loss": 7.8262, "step": 1353 }, { "epoch": 0.9806264711207677, "grad_norm": 0.4546986222267151, "learning_rate": 1.9133033430949186e-07, "loss": 7.9124, "step": 1354 }, { "epoch": 0.9813507151910194, "grad_norm": 0.4600694179534912, "learning_rate": 1.7742428768195985e-07, "loss": 7.8223, "step": 1355 }, { "epoch": 0.9820749592612711, "grad_norm": 0.4773062765598297, "learning_rate": 1.6404238843230612e-07, "loss": 7.8256, "step": 1356 }, { "epoch": 0.9827992033315227, "grad_norm": 0.46029308438301086, "learning_rate": 1.5118470682605745e-07, "loss": 7.8241, "step": 1357 }, { "epoch": 0.9835234474017744, "grad_norm": 0.45993316173553467, "learning_rate": 1.38851310376209e-07, "loss": 7.8153, "step": 1358 }, { "epoch": 0.9842476914720261, "grad_norm": 0.45841482281684875, "learning_rate": 1.2704226384282482e-07, "loss": 7.8534, "step": 1359 }, { "epoch": 0.9849719355422778, "grad_norm": 0.4997026026248932, "learning_rate": 1.157576292327378e-07, "loss": 7.8132, "step": 1360 }, { "epoch": 0.9856961796125294, "grad_norm": 0.4676493704319, "learning_rate": 1.0499746579919478e-07, "loss": 7.8852, "step": 1361 }, { "epoch": 0.986420423682781, "grad_norm": 0.497711718082428, "learning_rate": 9.476183004154537e-08, "loss": 7.8372, "step": 1362 }, { "epoch": 0.9871446677530328, "grad_norm": 0.46188995242118835, "learning_rate": 8.505077570496456e-08, "loss": 7.8758, "step": 1363 }, { "epoch": 0.9878689118232844, "grad_norm": 0.46226826310157776, "learning_rate": 7.586435378016399e-08, "loss": 7.8781, "step": 1364 }, { "epoch": 0.9885931558935361, "grad_norm": 0.47206389904022217, "learning_rate": 6.720261250311444e-08, "loss": 7.8517, "step": 1365 }, { "epoch": 0.9893173999637878, "grad_norm": 0.4836471378803253, "learning_rate": 5.9065597354790445e-08, "loss": 7.8837, "step": 1366 }, { "epoch": 0.9900416440340395, "grad_norm": 0.4508313834667206, "learning_rate": 5.1453351060959387e-08, "loss": 7.8247, "step": 1367 }, { "epoch": 0.9907658881042911, "grad_norm": 0.46068304777145386, "learning_rate": 4.43659135919372e-08, "loss": 7.8923, "step": 1368 }, { "epoch": 0.9914901321745428, "grad_norm": 0.4875946342945099, "learning_rate": 3.780332216234417e-08, "loss": 7.866, "step": 1369 }, { "epoch": 0.9922143762447945, "grad_norm": 0.48047712445259094, "learning_rate": 3.1765611230993865e-08, "loss": 7.7948, "step": 1370 }, { "epoch": 0.9929386203150462, "grad_norm": 0.4504997134208679, "learning_rate": 2.625281250061562e-08, "loss": 7.808, "step": 1371 }, { "epoch": 0.9936628643852978, "grad_norm": 0.44184571504592896, "learning_rate": 2.1264954917776802e-08, "loss": 7.8365, "step": 1372 }, { "epoch": 0.9943871084555496, "grad_norm": 0.46836909651756287, "learning_rate": 1.6802064672660767e-08, "loss": 7.8799, "step": 1373 }, { "epoch": 0.9951113525258012, "grad_norm": 0.5343976616859436, "learning_rate": 1.286416519897804e-08, "loss": 7.7852, "step": 1374 }, { "epoch": 0.9958355965960529, "grad_norm": 0.5096919536590576, "learning_rate": 9.451277173788687e-09, "loss": 7.7644, "step": 1375 }, { "epoch": 0.9965598406663045, "grad_norm": 0.538051426410675, "learning_rate": 6.563418517469e-09, "loss": 7.8173, "step": 1376 }, { "epoch": 0.9972840847365562, "grad_norm": 0.4815247356891632, "learning_rate": 4.200604393556073e-09, "loss": 7.8366, "step": 1377 }, { "epoch": 0.9980083288068079, "grad_norm": 0.49304264783859253, "learning_rate": 2.3628472086811847e-09, "loss": 7.8003, "step": 1378 }, { "epoch": 0.9987325728770595, "grad_norm": 0.5107442736625671, "learning_rate": 1.0501566125364904e-09, "loss": 7.7771, "step": 1379 }, { "epoch": 0.9994568169473113, "grad_norm": 0.5584591031074524, "learning_rate": 2.6253949776400275e-10, "loss": 7.8327, "step": 1380 }, { "epoch": 1.0003621220351258, "grad_norm": 0.8555117845535278, "learning_rate": 0.0, "loss": 11.8993, "step": 1381 } ], "logging_steps": 1, "max_steps": 1381, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 346, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 564726482337792.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }