diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6565 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 933, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0010718113612004287, + "grad_norm": 18.787860870361328, + "learning_rate": 0.0002, + "loss": 11.4429, + "step": 1 + }, + { + "epoch": 0.0021436227224008574, + "grad_norm": 18.825637817382812, + "learning_rate": 0.00019978563772775994, + "loss": 10.4711, + "step": 2 + }, + { + "epoch": 0.003215434083601286, + "grad_norm": 10.722309112548828, + "learning_rate": 0.00019957127545551983, + "loss": 8.4237, + "step": 3 + }, + { + "epoch": 0.004287245444801715, + "grad_norm": 8.985899925231934, + "learning_rate": 0.00019935691318327976, + "loss": 7.7351, + "step": 4 + }, + { + "epoch": 0.0053590568060021436, + "grad_norm": 7.36727237701416, + "learning_rate": 0.00019914255091103966, + "loss": 7.1636, + "step": 5 + }, + { + "epoch": 0.006430868167202572, + "grad_norm": 16.767192840576172, + "learning_rate": 0.00019892818863879958, + "loss": 7.2813, + "step": 6 + }, + { + "epoch": 0.007502679528403001, + "grad_norm": 5.598256587982178, + "learning_rate": 0.00019871382636655948, + "loss": 6.6949, + "step": 7 + }, + { + "epoch": 0.00857449088960343, + "grad_norm": 9.017542839050293, + "learning_rate": 0.0001984994640943194, + "loss": 6.8995, + "step": 8 + }, + { + "epoch": 0.00964630225080386, + "grad_norm": 4.33115816116333, + "learning_rate": 0.00019828510182207934, + "loss": 6.3657, + "step": 9 + }, + { + "epoch": 0.010718113612004287, + "grad_norm": 3.3178164958953857, + "learning_rate": 0.00019807073954983923, + "loss": 6.3438, + "step": 10 + }, + { + "epoch": 0.011789924973204717, + "grad_norm": 2.961005687713623, + "learning_rate": 0.00019785637727759916, + "loss": 5.9906, + "step": 11 + }, + { + "epoch": 0.012861736334405145, + "grad_norm": 4.369665145874023, + "learning_rate": 0.00019764201500535906, + "loss": 6.3405, + "step": 12 + }, + { + "epoch": 0.013933547695605574, + "grad_norm": 7.567108154296875, + "learning_rate": 0.00019742765273311899, + "loss": 6.1571, + "step": 13 + }, + { + "epoch": 0.015005359056806002, + "grad_norm": 6.276333808898926, + "learning_rate": 0.00019721329046087888, + "loss": 5.9091, + "step": 14 + }, + { + "epoch": 0.01607717041800643, + "grad_norm": 3.475311040878296, + "learning_rate": 0.0001969989281886388, + "loss": 5.7566, + "step": 15 + }, + { + "epoch": 0.01714898177920686, + "grad_norm": 3.2999136447906494, + "learning_rate": 0.00019678456591639874, + "loss": 5.5771, + "step": 16 + }, + { + "epoch": 0.01822079314040729, + "grad_norm": 2.9271633625030518, + "learning_rate": 0.00019657020364415863, + "loss": 5.7828, + "step": 17 + }, + { + "epoch": 0.01929260450160772, + "grad_norm": 4.073993682861328, + "learning_rate": 0.00019635584137191856, + "loss": 5.8397, + "step": 18 + }, + { + "epoch": 0.020364415862808145, + "grad_norm": 4.44898796081543, + "learning_rate": 0.00019614147909967846, + "loss": 5.5915, + "step": 19 + }, + { + "epoch": 0.021436227224008574, + "grad_norm": 3.62903094291687, + "learning_rate": 0.00019592711682743839, + "loss": 5.6345, + "step": 20 + }, + { + "epoch": 0.022508038585209004, + "grad_norm": 3.9096498489379883, + "learning_rate": 0.00019571275455519828, + "loss": 5.2657, + "step": 21 + }, + { + "epoch": 0.023579849946409433, + "grad_norm": 5.483320236206055, + "learning_rate": 0.0001954983922829582, + "loss": 5.7525, + "step": 22 + }, + { + "epoch": 0.02465166130760986, + "grad_norm": 4.441135883331299, + "learning_rate": 0.00019528403001071814, + "loss": 5.1491, + "step": 23 + }, + { + "epoch": 0.02572347266881029, + "grad_norm": 4.325745582580566, + "learning_rate": 0.00019506966773847803, + "loss": 5.3696, + "step": 24 + }, + { + "epoch": 0.02679528403001072, + "grad_norm": 5.317447185516357, + "learning_rate": 0.00019485530546623796, + "loss": 5.1451, + "step": 25 + }, + { + "epoch": 0.027867095391211148, + "grad_norm": 2.9565281867980957, + "learning_rate": 0.00019464094319399786, + "loss": 5.1643, + "step": 26 + }, + { + "epoch": 0.028938906752411574, + "grad_norm": 3.596660614013672, + "learning_rate": 0.00019442658092175779, + "loss": 5.3528, + "step": 27 + }, + { + "epoch": 0.030010718113612004, + "grad_norm": 3.135538339614868, + "learning_rate": 0.00019421221864951768, + "loss": 5.2368, + "step": 28 + }, + { + "epoch": 0.031082529474812434, + "grad_norm": 4.410246849060059, + "learning_rate": 0.0001939978563772776, + "loss": 5.3679, + "step": 29 + }, + { + "epoch": 0.03215434083601286, + "grad_norm": 5.01721715927124, + "learning_rate": 0.00019378349410503754, + "loss": 5.3211, + "step": 30 + }, + { + "epoch": 0.03322615219721329, + "grad_norm": 3.6833577156066895, + "learning_rate": 0.00019356913183279743, + "loss": 5.5033, + "step": 31 + }, + { + "epoch": 0.03429796355841372, + "grad_norm": 3.7026724815368652, + "learning_rate": 0.00019335476956055736, + "loss": 5.2373, + "step": 32 + }, + { + "epoch": 0.03536977491961415, + "grad_norm": 2.8100647926330566, + "learning_rate": 0.00019314040728831726, + "loss": 5.3375, + "step": 33 + }, + { + "epoch": 0.03644158628081458, + "grad_norm": 3.55210280418396, + "learning_rate": 0.00019292604501607719, + "loss": 5.104, + "step": 34 + }, + { + "epoch": 0.03751339764201501, + "grad_norm": 3.5566956996917725, + "learning_rate": 0.00019271168274383708, + "loss": 5.3136, + "step": 35 + }, + { + "epoch": 0.03858520900321544, + "grad_norm": 3.579719066619873, + "learning_rate": 0.000192497320471597, + "loss": 5.4142, + "step": 36 + }, + { + "epoch": 0.03965702036441586, + "grad_norm": 3.1719672679901123, + "learning_rate": 0.00019228295819935694, + "loss": 5.2212, + "step": 37 + }, + { + "epoch": 0.04072883172561629, + "grad_norm": 3.3072733879089355, + "learning_rate": 0.00019206859592711684, + "loss": 5.1418, + "step": 38 + }, + { + "epoch": 0.04180064308681672, + "grad_norm": 3.1818485260009766, + "learning_rate": 0.00019185423365487676, + "loss": 5.0819, + "step": 39 + }, + { + "epoch": 0.04287245444801715, + "grad_norm": 2.7018849849700928, + "learning_rate": 0.00019163987138263666, + "loss": 4.9725, + "step": 40 + }, + { + "epoch": 0.04394426580921758, + "grad_norm": 3.215134382247925, + "learning_rate": 0.00019142550911039659, + "loss": 5.1623, + "step": 41 + }, + { + "epoch": 0.04501607717041801, + "grad_norm": 3.6956472396850586, + "learning_rate": 0.00019121114683815648, + "loss": 5.2277, + "step": 42 + }, + { + "epoch": 0.04608788853161844, + "grad_norm": 3.72455096244812, + "learning_rate": 0.0001909967845659164, + "loss": 5.1207, + "step": 43 + }, + { + "epoch": 0.04715969989281887, + "grad_norm": 3.6416709423065186, + "learning_rate": 0.00019078242229367634, + "loss": 5.4012, + "step": 44 + }, + { + "epoch": 0.04823151125401929, + "grad_norm": 3.856046676635742, + "learning_rate": 0.00019056806002143624, + "loss": 4.9041, + "step": 45 + }, + { + "epoch": 0.04930332261521972, + "grad_norm": 3.6323482990264893, + "learning_rate": 0.00019035369774919616, + "loss": 5.0846, + "step": 46 + }, + { + "epoch": 0.05037513397642015, + "grad_norm": 4.227871417999268, + "learning_rate": 0.00019013933547695606, + "loss": 5.2468, + "step": 47 + }, + { + "epoch": 0.05144694533762058, + "grad_norm": 4.334918022155762, + "learning_rate": 0.00018992497320471599, + "loss": 5.3007, + "step": 48 + }, + { + "epoch": 0.05251875669882101, + "grad_norm": 4.2465105056762695, + "learning_rate": 0.00018971061093247588, + "loss": 4.8531, + "step": 49 + }, + { + "epoch": 0.05359056806002144, + "grad_norm": 4.0939106941223145, + "learning_rate": 0.0001894962486602358, + "loss": 5.2677, + "step": 50 + }, + { + "epoch": 0.05466237942122187, + "grad_norm": 4.415087699890137, + "learning_rate": 0.00018928188638799574, + "loss": 5.1889, + "step": 51 + }, + { + "epoch": 0.055734190782422297, + "grad_norm": 2.4377565383911133, + "learning_rate": 0.00018906752411575564, + "loss": 5.0309, + "step": 52 + }, + { + "epoch": 0.05680600214362272, + "grad_norm": 3.9000558853149414, + "learning_rate": 0.00018885316184351556, + "loss": 5.0311, + "step": 53 + }, + { + "epoch": 0.05787781350482315, + "grad_norm": 3.740562915802002, + "learning_rate": 0.00018863879957127546, + "loss": 5.2185, + "step": 54 + }, + { + "epoch": 0.05894962486602358, + "grad_norm": 2.913926124572754, + "learning_rate": 0.00018842443729903539, + "loss": 4.7942, + "step": 55 + }, + { + "epoch": 0.06002143622722401, + "grad_norm": 3.0584166049957275, + "learning_rate": 0.00018821007502679528, + "loss": 5.1003, + "step": 56 + }, + { + "epoch": 0.06109324758842444, + "grad_norm": 3.646268367767334, + "learning_rate": 0.0001879957127545552, + "loss": 5.3191, + "step": 57 + }, + { + "epoch": 0.06216505894962487, + "grad_norm": 4.751884460449219, + "learning_rate": 0.00018778135048231514, + "loss": 4.9535, + "step": 58 + }, + { + "epoch": 0.0632368703108253, + "grad_norm": 3.8159265518188477, + "learning_rate": 0.00018756698821007504, + "loss": 5.0055, + "step": 59 + }, + { + "epoch": 0.06430868167202572, + "grad_norm": 2.9061062335968018, + "learning_rate": 0.00018735262593783496, + "loss": 5.0264, + "step": 60 + }, + { + "epoch": 0.06538049303322616, + "grad_norm": 3.322441577911377, + "learning_rate": 0.00018713826366559486, + "loss": 5.1263, + "step": 61 + }, + { + "epoch": 0.06645230439442658, + "grad_norm": 3.4240949153900146, + "learning_rate": 0.00018692390139335479, + "loss": 5.0013, + "step": 62 + }, + { + "epoch": 0.06752411575562701, + "grad_norm": 3.418482780456543, + "learning_rate": 0.00018670953912111469, + "loss": 5.4505, + "step": 63 + }, + { + "epoch": 0.06859592711682744, + "grad_norm": 3.3597252368927, + "learning_rate": 0.0001864951768488746, + "loss": 4.9659, + "step": 64 + }, + { + "epoch": 0.06966773847802786, + "grad_norm": 3.013861894607544, + "learning_rate": 0.0001862808145766345, + "loss": 5.1352, + "step": 65 + }, + { + "epoch": 0.0707395498392283, + "grad_norm": 5.465812683105469, + "learning_rate": 0.00018606645230439444, + "loss": 4.8951, + "step": 66 + }, + { + "epoch": 0.07181136120042872, + "grad_norm": 4.695466995239258, + "learning_rate": 0.00018585209003215436, + "loss": 4.7569, + "step": 67 + }, + { + "epoch": 0.07288317256162916, + "grad_norm": 4.39758825302124, + "learning_rate": 0.00018563772775991426, + "loss": 5.0259, + "step": 68 + }, + { + "epoch": 0.07395498392282958, + "grad_norm": 3.696704626083374, + "learning_rate": 0.00018542336548767419, + "loss": 4.7529, + "step": 69 + }, + { + "epoch": 0.07502679528403002, + "grad_norm": 5.415925979614258, + "learning_rate": 0.00018520900321543409, + "loss": 5.3223, + "step": 70 + }, + { + "epoch": 0.07609860664523044, + "grad_norm": 3.055694341659546, + "learning_rate": 0.000184994640943194, + "loss": 5.2108, + "step": 71 + }, + { + "epoch": 0.07717041800643087, + "grad_norm": 4.533783912658691, + "learning_rate": 0.0001847802786709539, + "loss": 4.9434, + "step": 72 + }, + { + "epoch": 0.0782422293676313, + "grad_norm": 2.5685031414031982, + "learning_rate": 0.00018456591639871384, + "loss": 5.1461, + "step": 73 + }, + { + "epoch": 0.07931404072883172, + "grad_norm": 3.007826805114746, + "learning_rate": 0.00018435155412647376, + "loss": 5.058, + "step": 74 + }, + { + "epoch": 0.08038585209003216, + "grad_norm": 3.6478159427642822, + "learning_rate": 0.00018413719185423366, + "loss": 4.7813, + "step": 75 + }, + { + "epoch": 0.08145766345123258, + "grad_norm": 2.475228786468506, + "learning_rate": 0.0001839228295819936, + "loss": 5.0926, + "step": 76 + }, + { + "epoch": 0.08252947481243302, + "grad_norm": 2.8211352825164795, + "learning_rate": 0.00018370846730975349, + "loss": 5.0912, + "step": 77 + }, + { + "epoch": 0.08360128617363344, + "grad_norm": 2.761347532272339, + "learning_rate": 0.0001834941050375134, + "loss": 4.9557, + "step": 78 + }, + { + "epoch": 0.08467309753483387, + "grad_norm": 3.7475688457489014, + "learning_rate": 0.0001832797427652733, + "loss": 5.2675, + "step": 79 + }, + { + "epoch": 0.0857449088960343, + "grad_norm": 3.1617653369903564, + "learning_rate": 0.00018306538049303324, + "loss": 5.1046, + "step": 80 + }, + { + "epoch": 0.08681672025723473, + "grad_norm": 2.875410556793213, + "learning_rate": 0.00018285101822079316, + "loss": 5.0993, + "step": 81 + }, + { + "epoch": 0.08788853161843516, + "grad_norm": 3.297821521759033, + "learning_rate": 0.00018263665594855306, + "loss": 4.8784, + "step": 82 + }, + { + "epoch": 0.08896034297963558, + "grad_norm": 2.7489824295043945, + "learning_rate": 0.000182422293676313, + "loss": 5.0613, + "step": 83 + }, + { + "epoch": 0.09003215434083602, + "grad_norm": 4.606520652770996, + "learning_rate": 0.00018220793140407289, + "loss": 4.795, + "step": 84 + }, + { + "epoch": 0.09110396570203644, + "grad_norm": 3.6237289905548096, + "learning_rate": 0.0001819935691318328, + "loss": 4.9828, + "step": 85 + }, + { + "epoch": 0.09217577706323687, + "grad_norm": 3.9898364543914795, + "learning_rate": 0.0001817792068595927, + "loss": 4.9822, + "step": 86 + }, + { + "epoch": 0.0932475884244373, + "grad_norm": 3.16852068901062, + "learning_rate": 0.00018156484458735264, + "loss": 4.8484, + "step": 87 + }, + { + "epoch": 0.09431939978563773, + "grad_norm": 4.019200325012207, + "learning_rate": 0.00018135048231511256, + "loss": 5.2485, + "step": 88 + }, + { + "epoch": 0.09539121114683816, + "grad_norm": 4.304863929748535, + "learning_rate": 0.00018113612004287246, + "loss": 4.8591, + "step": 89 + }, + { + "epoch": 0.09646302250803858, + "grad_norm": 3.7539124488830566, + "learning_rate": 0.0001809217577706324, + "loss": 5.1811, + "step": 90 + }, + { + "epoch": 0.09753483386923902, + "grad_norm": 4.071114540100098, + "learning_rate": 0.00018070739549839229, + "loss": 4.9069, + "step": 91 + }, + { + "epoch": 0.09860664523043944, + "grad_norm": 2.6979987621307373, + "learning_rate": 0.0001804930332261522, + "loss": 4.95, + "step": 92 + }, + { + "epoch": 0.09967845659163987, + "grad_norm": 3.5572869777679443, + "learning_rate": 0.0001802786709539121, + "loss": 5.0632, + "step": 93 + }, + { + "epoch": 0.1007502679528403, + "grad_norm": 3.7195041179656982, + "learning_rate": 0.00018006430868167204, + "loss": 4.5738, + "step": 94 + }, + { + "epoch": 0.10182207931404073, + "grad_norm": 3.0068423748016357, + "learning_rate": 0.00017984994640943196, + "loss": 5.0092, + "step": 95 + }, + { + "epoch": 0.10289389067524116, + "grad_norm": 3.6475560665130615, + "learning_rate": 0.00017963558413719186, + "loss": 4.932, + "step": 96 + }, + { + "epoch": 0.10396570203644159, + "grad_norm": 4.444890022277832, + "learning_rate": 0.0001794212218649518, + "loss": 5.3889, + "step": 97 + }, + { + "epoch": 0.10503751339764202, + "grad_norm": 2.3350093364715576, + "learning_rate": 0.00017920685959271169, + "loss": 4.8875, + "step": 98 + }, + { + "epoch": 0.10610932475884244, + "grad_norm": 5.134820938110352, + "learning_rate": 0.0001789924973204716, + "loss": 5.1827, + "step": 99 + }, + { + "epoch": 0.10718113612004287, + "grad_norm": 2.695220947265625, + "learning_rate": 0.0001787781350482315, + "loss": 5.082, + "step": 100 + }, + { + "epoch": 0.1082529474812433, + "grad_norm": 2.2416651248931885, + "learning_rate": 0.00017856377277599144, + "loss": 4.8779, + "step": 101 + }, + { + "epoch": 0.10932475884244373, + "grad_norm": 4.537072658538818, + "learning_rate": 0.00017834941050375136, + "loss": 5.1109, + "step": 102 + }, + { + "epoch": 0.11039657020364416, + "grad_norm": 3.407715320587158, + "learning_rate": 0.00017813504823151126, + "loss": 4.7432, + "step": 103 + }, + { + "epoch": 0.11146838156484459, + "grad_norm": 3.8071422576904297, + "learning_rate": 0.0001779206859592712, + "loss": 4.9924, + "step": 104 + }, + { + "epoch": 0.11254019292604502, + "grad_norm": 2.7043581008911133, + "learning_rate": 0.00017770632368703109, + "loss": 5.08, + "step": 105 + }, + { + "epoch": 0.11361200428724544, + "grad_norm": 2.9360063076019287, + "learning_rate": 0.000177491961414791, + "loss": 4.8822, + "step": 106 + }, + { + "epoch": 0.11468381564844587, + "grad_norm": 2.972593069076538, + "learning_rate": 0.0001772775991425509, + "loss": 4.9554, + "step": 107 + }, + { + "epoch": 0.1157556270096463, + "grad_norm": 3.9115068912506104, + "learning_rate": 0.00017706323687031084, + "loss": 4.9983, + "step": 108 + }, + { + "epoch": 0.11682743837084673, + "grad_norm": 2.4414217472076416, + "learning_rate": 0.00017684887459807076, + "loss": 4.8562, + "step": 109 + }, + { + "epoch": 0.11789924973204716, + "grad_norm": 4.106845378875732, + "learning_rate": 0.00017663451232583066, + "loss": 4.9848, + "step": 110 + }, + { + "epoch": 0.1189710610932476, + "grad_norm": 2.924886465072632, + "learning_rate": 0.0001764201500535906, + "loss": 4.7966, + "step": 111 + }, + { + "epoch": 0.12004287245444802, + "grad_norm": 3.0221619606018066, + "learning_rate": 0.00017620578778135049, + "loss": 4.6449, + "step": 112 + }, + { + "epoch": 0.12111468381564845, + "grad_norm": 4.032159805297852, + "learning_rate": 0.0001759914255091104, + "loss": 4.9659, + "step": 113 + }, + { + "epoch": 0.12218649517684887, + "grad_norm": 2.689523696899414, + "learning_rate": 0.0001757770632368703, + "loss": 5.0913, + "step": 114 + }, + { + "epoch": 0.1232583065380493, + "grad_norm": 4.315779685974121, + "learning_rate": 0.00017556270096463024, + "loss": 5.2312, + "step": 115 + }, + { + "epoch": 0.12433011789924973, + "grad_norm": 3.1354005336761475, + "learning_rate": 0.00017534833869239016, + "loss": 5.0489, + "step": 116 + }, + { + "epoch": 0.12540192926045016, + "grad_norm": 4.844458103179932, + "learning_rate": 0.00017513397642015006, + "loss": 5.0938, + "step": 117 + }, + { + "epoch": 0.1264737406216506, + "grad_norm": 3.60406756401062, + "learning_rate": 0.00017491961414791, + "loss": 4.9502, + "step": 118 + }, + { + "epoch": 0.12754555198285103, + "grad_norm": 2.6359314918518066, + "learning_rate": 0.00017470525187566989, + "loss": 4.7598, + "step": 119 + }, + { + "epoch": 0.12861736334405144, + "grad_norm": 2.6354403495788574, + "learning_rate": 0.0001744908896034298, + "loss": 4.7832, + "step": 120 + }, + { + "epoch": 0.12968917470525188, + "grad_norm": 3.07201886177063, + "learning_rate": 0.0001742765273311897, + "loss": 5.0391, + "step": 121 + }, + { + "epoch": 0.1307609860664523, + "grad_norm": 4.98795747756958, + "learning_rate": 0.00017406216505894964, + "loss": 5.3099, + "step": 122 + }, + { + "epoch": 0.13183279742765272, + "grad_norm": 2.553927421569824, + "learning_rate": 0.00017384780278670956, + "loss": 4.8655, + "step": 123 + }, + { + "epoch": 0.13290460878885316, + "grad_norm": 5.08751106262207, + "learning_rate": 0.00017363344051446946, + "loss": 5.2898, + "step": 124 + }, + { + "epoch": 0.1339764201500536, + "grad_norm": 3.600430965423584, + "learning_rate": 0.0001734190782422294, + "loss": 4.8845, + "step": 125 + }, + { + "epoch": 0.13504823151125403, + "grad_norm": 3.795992851257324, + "learning_rate": 0.00017320471596998929, + "loss": 5.0673, + "step": 126 + }, + { + "epoch": 0.13612004287245444, + "grad_norm": 4.044764995574951, + "learning_rate": 0.0001729903536977492, + "loss": 4.9675, + "step": 127 + }, + { + "epoch": 0.13719185423365488, + "grad_norm": 3.4079222679138184, + "learning_rate": 0.0001727759914255091, + "loss": 5.0485, + "step": 128 + }, + { + "epoch": 0.1382636655948553, + "grad_norm": 2.5793895721435547, + "learning_rate": 0.00017256162915326904, + "loss": 5.0764, + "step": 129 + }, + { + "epoch": 0.13933547695605572, + "grad_norm": 3.7445921897888184, + "learning_rate": 0.00017234726688102896, + "loss": 5.44, + "step": 130 + }, + { + "epoch": 0.14040728831725616, + "grad_norm": 2.7735092639923096, + "learning_rate": 0.00017213290460878886, + "loss": 4.5954, + "step": 131 + }, + { + "epoch": 0.1414790996784566, + "grad_norm": 4.459575653076172, + "learning_rate": 0.0001719185423365488, + "loss": 4.6362, + "step": 132 + }, + { + "epoch": 0.14255091103965703, + "grad_norm": 4.086177349090576, + "learning_rate": 0.0001717041800643087, + "loss": 4.8736, + "step": 133 + }, + { + "epoch": 0.14362272240085744, + "grad_norm": 2.562479257583618, + "learning_rate": 0.0001714898177920686, + "loss": 4.9779, + "step": 134 + }, + { + "epoch": 0.14469453376205788, + "grad_norm": 2.590363025665283, + "learning_rate": 0.0001712754555198285, + "loss": 4.8501, + "step": 135 + }, + { + "epoch": 0.1457663451232583, + "grad_norm": 4.304504871368408, + "learning_rate": 0.00017106109324758844, + "loss": 4.7176, + "step": 136 + }, + { + "epoch": 0.14683815648445875, + "grad_norm": 2.6717023849487305, + "learning_rate": 0.00017084673097534836, + "loss": 4.9784, + "step": 137 + }, + { + "epoch": 0.14790996784565916, + "grad_norm": 3.3490304946899414, + "learning_rate": 0.00017063236870310826, + "loss": 5.0833, + "step": 138 + }, + { + "epoch": 0.1489817792068596, + "grad_norm": 3.373300790786743, + "learning_rate": 0.0001704180064308682, + "loss": 5.0491, + "step": 139 + }, + { + "epoch": 0.15005359056806003, + "grad_norm": 3.4800350666046143, + "learning_rate": 0.0001702036441586281, + "loss": 4.8633, + "step": 140 + }, + { + "epoch": 0.15112540192926044, + "grad_norm": 4.172618865966797, + "learning_rate": 0.000169989281886388, + "loss": 4.7306, + "step": 141 + }, + { + "epoch": 0.15219721329046088, + "grad_norm": 2.529625654220581, + "learning_rate": 0.0001697749196141479, + "loss": 5.0736, + "step": 142 + }, + { + "epoch": 0.1532690246516613, + "grad_norm": 4.884538650512695, + "learning_rate": 0.00016956055734190784, + "loss": 4.7077, + "step": 143 + }, + { + "epoch": 0.15434083601286175, + "grad_norm": 2.5472567081451416, + "learning_rate": 0.00016934619506966776, + "loss": 5.2659, + "step": 144 + }, + { + "epoch": 0.15541264737406216, + "grad_norm": 2.724351167678833, + "learning_rate": 0.00016913183279742766, + "loss": 5.1795, + "step": 145 + }, + { + "epoch": 0.1564844587352626, + "grad_norm": 2.8223018646240234, + "learning_rate": 0.0001689174705251876, + "loss": 4.9197, + "step": 146 + }, + { + "epoch": 0.15755627009646303, + "grad_norm": 2.990795612335205, + "learning_rate": 0.0001687031082529475, + "loss": 4.7396, + "step": 147 + }, + { + "epoch": 0.15862808145766344, + "grad_norm": 4.808514595031738, + "learning_rate": 0.0001684887459807074, + "loss": 4.781, + "step": 148 + }, + { + "epoch": 0.15969989281886388, + "grad_norm": 2.9120051860809326, + "learning_rate": 0.0001682743837084673, + "loss": 4.8474, + "step": 149 + }, + { + "epoch": 0.1607717041800643, + "grad_norm": 3.2255208492279053, + "learning_rate": 0.00016806002143622724, + "loss": 4.883, + "step": 150 + }, + { + "epoch": 0.16184351554126475, + "grad_norm": 2.5771243572235107, + "learning_rate": 0.00016784565916398716, + "loss": 4.8337, + "step": 151 + }, + { + "epoch": 0.16291532690246516, + "grad_norm": 4.24485969543457, + "learning_rate": 0.00016763129689174706, + "loss": 5.3985, + "step": 152 + }, + { + "epoch": 0.1639871382636656, + "grad_norm": 2.5621776580810547, + "learning_rate": 0.000167416934619507, + "loss": 5.0145, + "step": 153 + }, + { + "epoch": 0.16505894962486603, + "grad_norm": 3.4797775745391846, + "learning_rate": 0.0001672025723472669, + "loss": 5.2251, + "step": 154 + }, + { + "epoch": 0.16613076098606644, + "grad_norm": 3.194912910461426, + "learning_rate": 0.0001669882100750268, + "loss": 4.6371, + "step": 155 + }, + { + "epoch": 0.16720257234726688, + "grad_norm": 3.799504280090332, + "learning_rate": 0.0001667738478027867, + "loss": 5.0769, + "step": 156 + }, + { + "epoch": 0.1682743837084673, + "grad_norm": 3.546942949295044, + "learning_rate": 0.00016655948553054664, + "loss": 4.6824, + "step": 157 + }, + { + "epoch": 0.16934619506966775, + "grad_norm": 3.28645658493042, + "learning_rate": 0.00016634512325830656, + "loss": 5.1034, + "step": 158 + }, + { + "epoch": 0.17041800643086816, + "grad_norm": 3.6776368618011475, + "learning_rate": 0.00016613076098606646, + "loss": 4.7307, + "step": 159 + }, + { + "epoch": 0.1714898177920686, + "grad_norm": 3.408773422241211, + "learning_rate": 0.0001659163987138264, + "loss": 5.1056, + "step": 160 + }, + { + "epoch": 0.17256162915326903, + "grad_norm": 2.734879732131958, + "learning_rate": 0.0001657020364415863, + "loss": 4.7715, + "step": 161 + }, + { + "epoch": 0.17363344051446947, + "grad_norm": 2.385500431060791, + "learning_rate": 0.0001654876741693462, + "loss": 4.869, + "step": 162 + }, + { + "epoch": 0.17470525187566988, + "grad_norm": 2.97015118598938, + "learning_rate": 0.0001652733118971061, + "loss": 4.7828, + "step": 163 + }, + { + "epoch": 0.1757770632368703, + "grad_norm": 5.304216384887695, + "learning_rate": 0.00016505894962486604, + "loss": 4.7406, + "step": 164 + }, + { + "epoch": 0.17684887459807075, + "grad_norm": 4.278355598449707, + "learning_rate": 0.00016484458735262596, + "loss": 5.1003, + "step": 165 + }, + { + "epoch": 0.17792068595927116, + "grad_norm": 3.021716356277466, + "learning_rate": 0.00016463022508038586, + "loss": 4.7373, + "step": 166 + }, + { + "epoch": 0.1789924973204716, + "grad_norm": 3.567626953125, + "learning_rate": 0.0001644158628081458, + "loss": 5.0243, + "step": 167 + }, + { + "epoch": 0.18006430868167203, + "grad_norm": 3.5427238941192627, + "learning_rate": 0.0001642015005359057, + "loss": 4.6865, + "step": 168 + }, + { + "epoch": 0.18113612004287247, + "grad_norm": 3.7450437545776367, + "learning_rate": 0.0001639871382636656, + "loss": 5.2324, + "step": 169 + }, + { + "epoch": 0.18220793140407288, + "grad_norm": 5.276814937591553, + "learning_rate": 0.0001637727759914255, + "loss": 5.0563, + "step": 170 + }, + { + "epoch": 0.1832797427652733, + "grad_norm": 5.184852123260498, + "learning_rate": 0.00016355841371918544, + "loss": 4.5753, + "step": 171 + }, + { + "epoch": 0.18435155412647375, + "grad_norm": 3.9674630165100098, + "learning_rate": 0.00016334405144694536, + "loss": 5.0476, + "step": 172 + }, + { + "epoch": 0.18542336548767416, + "grad_norm": 4.83624792098999, + "learning_rate": 0.00016312968917470526, + "loss": 4.8303, + "step": 173 + }, + { + "epoch": 0.1864951768488746, + "grad_norm": 2.8017940521240234, + "learning_rate": 0.0001629153269024652, + "loss": 4.7553, + "step": 174 + }, + { + "epoch": 0.18756698821007503, + "grad_norm": 3.8941309452056885, + "learning_rate": 0.0001627009646302251, + "loss": 4.5793, + "step": 175 + }, + { + "epoch": 0.18863879957127547, + "grad_norm": 2.9973654747009277, + "learning_rate": 0.000162486602357985, + "loss": 4.7673, + "step": 176 + }, + { + "epoch": 0.18971061093247588, + "grad_norm": 2.4675092697143555, + "learning_rate": 0.0001622722400857449, + "loss": 4.876, + "step": 177 + }, + { + "epoch": 0.1907824222936763, + "grad_norm": 4.017350196838379, + "learning_rate": 0.00016205787781350484, + "loss": 4.8945, + "step": 178 + }, + { + "epoch": 0.19185423365487675, + "grad_norm": 3.6734588146209717, + "learning_rate": 0.00016184351554126474, + "loss": 5.2249, + "step": 179 + }, + { + "epoch": 0.19292604501607716, + "grad_norm": 5.289995193481445, + "learning_rate": 0.00016162915326902466, + "loss": 4.7494, + "step": 180 + }, + { + "epoch": 0.1939978563772776, + "grad_norm": 3.5352132320404053, + "learning_rate": 0.0001614147909967846, + "loss": 4.9401, + "step": 181 + }, + { + "epoch": 0.19506966773847803, + "grad_norm": 3.0050971508026123, + "learning_rate": 0.0001612004287245445, + "loss": 4.9689, + "step": 182 + }, + { + "epoch": 0.19614147909967847, + "grad_norm": 2.5181117057800293, + "learning_rate": 0.0001609860664523044, + "loss": 4.8451, + "step": 183 + }, + { + "epoch": 0.19721329046087888, + "grad_norm": 4.061633110046387, + "learning_rate": 0.0001607717041800643, + "loss": 4.5857, + "step": 184 + }, + { + "epoch": 0.1982851018220793, + "grad_norm": 5.4610981941223145, + "learning_rate": 0.00016055734190782424, + "loss": 5.0163, + "step": 185 + }, + { + "epoch": 0.19935691318327975, + "grad_norm": 4.758736610412598, + "learning_rate": 0.00016034297963558414, + "loss": 4.5184, + "step": 186 + }, + { + "epoch": 0.20042872454448016, + "grad_norm": 3.6250007152557373, + "learning_rate": 0.00016012861736334406, + "loss": 4.9198, + "step": 187 + }, + { + "epoch": 0.2015005359056806, + "grad_norm": 2.938920736312866, + "learning_rate": 0.000159914255091104, + "loss": 4.6896, + "step": 188 + }, + { + "epoch": 0.20257234726688103, + "grad_norm": 3.388197660446167, + "learning_rate": 0.0001596998928188639, + "loss": 4.8161, + "step": 189 + }, + { + "epoch": 0.20364415862808147, + "grad_norm": 2.7313029766082764, + "learning_rate": 0.00015948553054662381, + "loss": 5.0972, + "step": 190 + }, + { + "epoch": 0.20471596998928188, + "grad_norm": 5.155452251434326, + "learning_rate": 0.0001592711682743837, + "loss": 4.9834, + "step": 191 + }, + { + "epoch": 0.2057877813504823, + "grad_norm": 3.554161310195923, + "learning_rate": 0.00015905680600214364, + "loss": 4.6315, + "step": 192 + }, + { + "epoch": 0.20685959271168275, + "grad_norm": 2.219275712966919, + "learning_rate": 0.00015884244372990354, + "loss": 4.8297, + "step": 193 + }, + { + "epoch": 0.20793140407288319, + "grad_norm": 3.4136171340942383, + "learning_rate": 0.00015862808145766346, + "loss": 4.7078, + "step": 194 + }, + { + "epoch": 0.2090032154340836, + "grad_norm": 3.1170814037323, + "learning_rate": 0.0001584137191854234, + "loss": 4.569, + "step": 195 + }, + { + "epoch": 0.21007502679528403, + "grad_norm": 4.157629489898682, + "learning_rate": 0.0001581993569131833, + "loss": 5.0884, + "step": 196 + }, + { + "epoch": 0.21114683815648447, + "grad_norm": 3.6698620319366455, + "learning_rate": 0.00015798499464094321, + "loss": 4.9795, + "step": 197 + }, + { + "epoch": 0.21221864951768488, + "grad_norm": 3.2543110847473145, + "learning_rate": 0.0001577706323687031, + "loss": 5.4088, + "step": 198 + }, + { + "epoch": 0.2132904608788853, + "grad_norm": 5.117916584014893, + "learning_rate": 0.00015755627009646304, + "loss": 5.0872, + "step": 199 + }, + { + "epoch": 0.21436227224008575, + "grad_norm": 2.6757586002349854, + "learning_rate": 0.00015734190782422294, + "loss": 5.0076, + "step": 200 + }, + { + "epoch": 0.21543408360128619, + "grad_norm": 4.662868499755859, + "learning_rate": 0.00015712754555198286, + "loss": 5.151, + "step": 201 + }, + { + "epoch": 0.2165058949624866, + "grad_norm": 2.8466174602508545, + "learning_rate": 0.0001569131832797428, + "loss": 4.7452, + "step": 202 + }, + { + "epoch": 0.21757770632368703, + "grad_norm": 3.703111410140991, + "learning_rate": 0.0001566988210075027, + "loss": 4.6917, + "step": 203 + }, + { + "epoch": 0.21864951768488747, + "grad_norm": 4.147573947906494, + "learning_rate": 0.00015648445873526261, + "loss": 4.5361, + "step": 204 + }, + { + "epoch": 0.21972132904608788, + "grad_norm": 3.213587760925293, + "learning_rate": 0.0001562700964630225, + "loss": 4.8234, + "step": 205 + }, + { + "epoch": 0.2207931404072883, + "grad_norm": 3.092233419418335, + "learning_rate": 0.00015605573419078244, + "loss": 4.5189, + "step": 206 + }, + { + "epoch": 0.22186495176848875, + "grad_norm": 3.0028648376464844, + "learning_rate": 0.00015584137191854234, + "loss": 5.1736, + "step": 207 + }, + { + "epoch": 0.22293676312968919, + "grad_norm": 4.2017998695373535, + "learning_rate": 0.00015562700964630226, + "loss": 4.9839, + "step": 208 + }, + { + "epoch": 0.2240085744908896, + "grad_norm": 2.643603801727295, + "learning_rate": 0.0001554126473740622, + "loss": 4.6254, + "step": 209 + }, + { + "epoch": 0.22508038585209003, + "grad_norm": 4.710816860198975, + "learning_rate": 0.0001551982851018221, + "loss": 5.0277, + "step": 210 + }, + { + "epoch": 0.22615219721329047, + "grad_norm": 2.4961905479431152, + "learning_rate": 0.00015498392282958201, + "loss": 4.691, + "step": 211 + }, + { + "epoch": 0.22722400857449088, + "grad_norm": 5.022608757019043, + "learning_rate": 0.0001547695605573419, + "loss": 4.7685, + "step": 212 + }, + { + "epoch": 0.2282958199356913, + "grad_norm": 3.2233705520629883, + "learning_rate": 0.00015455519828510184, + "loss": 4.7998, + "step": 213 + }, + { + "epoch": 0.22936763129689175, + "grad_norm": 2.526165246963501, + "learning_rate": 0.00015434083601286174, + "loss": 4.8508, + "step": 214 + }, + { + "epoch": 0.2304394426580922, + "grad_norm": 3.2895984649658203, + "learning_rate": 0.00015412647374062166, + "loss": 4.8211, + "step": 215 + }, + { + "epoch": 0.2315112540192926, + "grad_norm": 2.739210367202759, + "learning_rate": 0.0001539121114683816, + "loss": 5.1027, + "step": 216 + }, + { + "epoch": 0.23258306538049303, + "grad_norm": 3.5469467639923096, + "learning_rate": 0.0001536977491961415, + "loss": 5.2544, + "step": 217 + }, + { + "epoch": 0.23365487674169347, + "grad_norm": 2.3437864780426025, + "learning_rate": 0.00015348338692390141, + "loss": 4.8464, + "step": 218 + }, + { + "epoch": 0.2347266881028939, + "grad_norm": 2.8780081272125244, + "learning_rate": 0.0001532690246516613, + "loss": 4.9309, + "step": 219 + }, + { + "epoch": 0.2357984994640943, + "grad_norm": 2.508305788040161, + "learning_rate": 0.00015305466237942124, + "loss": 4.9276, + "step": 220 + }, + { + "epoch": 0.23687031082529475, + "grad_norm": 4.404898166656494, + "learning_rate": 0.00015284030010718114, + "loss": 4.6623, + "step": 221 + }, + { + "epoch": 0.2379421221864952, + "grad_norm": 4.61767578125, + "learning_rate": 0.00015262593783494106, + "loss": 5.06, + "step": 222 + }, + { + "epoch": 0.2390139335476956, + "grad_norm": 4.487622261047363, + "learning_rate": 0.000152411575562701, + "loss": 5.5357, + "step": 223 + }, + { + "epoch": 0.24008574490889603, + "grad_norm": 2.2761528491973877, + "learning_rate": 0.0001521972132904609, + "loss": 4.7513, + "step": 224 + }, + { + "epoch": 0.24115755627009647, + "grad_norm": 3.565575122833252, + "learning_rate": 0.00015198285101822081, + "loss": 4.4589, + "step": 225 + }, + { + "epoch": 0.2422293676312969, + "grad_norm": 3.7335009574890137, + "learning_rate": 0.0001517684887459807, + "loss": 4.5894, + "step": 226 + }, + { + "epoch": 0.2433011789924973, + "grad_norm": 3.367142677307129, + "learning_rate": 0.00015155412647374064, + "loss": 4.58, + "step": 227 + }, + { + "epoch": 0.24437299035369775, + "grad_norm": 2.965182304382324, + "learning_rate": 0.00015133976420150054, + "loss": 4.9648, + "step": 228 + }, + { + "epoch": 0.2454448017148982, + "grad_norm": 2.646235704421997, + "learning_rate": 0.00015112540192926046, + "loss": 4.8253, + "step": 229 + }, + { + "epoch": 0.2465166130760986, + "grad_norm": 3.499410629272461, + "learning_rate": 0.0001509110396570204, + "loss": 5.3223, + "step": 230 + }, + { + "epoch": 0.24758842443729903, + "grad_norm": 4.3262104988098145, + "learning_rate": 0.0001506966773847803, + "loss": 4.8132, + "step": 231 + }, + { + "epoch": 0.24866023579849947, + "grad_norm": 3.103876829147339, + "learning_rate": 0.00015048231511254021, + "loss": 4.9858, + "step": 232 + }, + { + "epoch": 0.2497320471596999, + "grad_norm": 3.5739259719848633, + "learning_rate": 0.0001502679528403001, + "loss": 4.8506, + "step": 233 + }, + { + "epoch": 0.2508038585209003, + "grad_norm": 2.523289918899536, + "learning_rate": 0.00015005359056806004, + "loss": 4.8562, + "step": 234 + }, + { + "epoch": 0.25187566988210075, + "grad_norm": 3.0936124324798584, + "learning_rate": 0.00014983922829581994, + "loss": 4.8166, + "step": 235 + }, + { + "epoch": 0.2529474812433012, + "grad_norm": 2.942096710205078, + "learning_rate": 0.00014962486602357986, + "loss": 4.7598, + "step": 236 + }, + { + "epoch": 0.2540192926045016, + "grad_norm": 3.589401960372925, + "learning_rate": 0.0001494105037513398, + "loss": 4.6942, + "step": 237 + }, + { + "epoch": 0.25509110396570206, + "grad_norm": 3.2308168411254883, + "learning_rate": 0.0001491961414790997, + "loss": 4.8895, + "step": 238 + }, + { + "epoch": 0.25616291532690244, + "grad_norm": 3.224642515182495, + "learning_rate": 0.00014898177920685961, + "loss": 4.7678, + "step": 239 + }, + { + "epoch": 0.2572347266881029, + "grad_norm": 3.0363709926605225, + "learning_rate": 0.0001487674169346195, + "loss": 4.7585, + "step": 240 + }, + { + "epoch": 0.2583065380493033, + "grad_norm": 3.49025821685791, + "learning_rate": 0.00014855305466237944, + "loss": 4.7028, + "step": 241 + }, + { + "epoch": 0.25937834941050375, + "grad_norm": 4.927128314971924, + "learning_rate": 0.00014833869239013934, + "loss": 4.8932, + "step": 242 + }, + { + "epoch": 0.2604501607717042, + "grad_norm": 3.036620855331421, + "learning_rate": 0.00014812433011789926, + "loss": 4.7612, + "step": 243 + }, + { + "epoch": 0.2615219721329046, + "grad_norm": 3.0013017654418945, + "learning_rate": 0.0001479099678456592, + "loss": 4.701, + "step": 244 + }, + { + "epoch": 0.26259378349410506, + "grad_norm": 2.943427324295044, + "learning_rate": 0.0001476956055734191, + "loss": 4.7791, + "step": 245 + }, + { + "epoch": 0.26366559485530544, + "grad_norm": 3.3308539390563965, + "learning_rate": 0.00014748124330117901, + "loss": 4.9904, + "step": 246 + }, + { + "epoch": 0.2647374062165059, + "grad_norm": 3.7442448139190674, + "learning_rate": 0.00014726688102893891, + "loss": 5.9101, + "step": 247 + }, + { + "epoch": 0.2658092175777063, + "grad_norm": 3.206761598587036, + "learning_rate": 0.00014705251875669884, + "loss": 5.0534, + "step": 248 + }, + { + "epoch": 0.26688102893890675, + "grad_norm": 2.8394346237182617, + "learning_rate": 0.00014683815648445874, + "loss": 4.8381, + "step": 249 + }, + { + "epoch": 0.2679528403001072, + "grad_norm": 3.0866284370422363, + "learning_rate": 0.00014662379421221866, + "loss": 4.9118, + "step": 250 + }, + { + "epoch": 0.2690246516613076, + "grad_norm": 3.6339967250823975, + "learning_rate": 0.0001464094319399786, + "loss": 5.2333, + "step": 251 + }, + { + "epoch": 0.27009646302250806, + "grad_norm": 3.3214287757873535, + "learning_rate": 0.0001461950696677385, + "loss": 4.8577, + "step": 252 + }, + { + "epoch": 0.27116827438370844, + "grad_norm": 3.002842903137207, + "learning_rate": 0.00014598070739549841, + "loss": 4.811, + "step": 253 + }, + { + "epoch": 0.2722400857449089, + "grad_norm": 3.2833826541900635, + "learning_rate": 0.00014576634512325831, + "loss": 4.7479, + "step": 254 + }, + { + "epoch": 0.2733118971061093, + "grad_norm": 4.537484645843506, + "learning_rate": 0.00014555198285101824, + "loss": 4.7998, + "step": 255 + }, + { + "epoch": 0.27438370846730975, + "grad_norm": 2.9750728607177734, + "learning_rate": 0.00014533762057877814, + "loss": 4.8613, + "step": 256 + }, + { + "epoch": 0.2754555198285102, + "grad_norm": 2.4807565212249756, + "learning_rate": 0.00014512325830653806, + "loss": 4.6727, + "step": 257 + }, + { + "epoch": 0.2765273311897106, + "grad_norm": 2.890812397003174, + "learning_rate": 0.000144908896034298, + "loss": 4.8248, + "step": 258 + }, + { + "epoch": 0.27759914255091106, + "grad_norm": 2.885880947113037, + "learning_rate": 0.0001446945337620579, + "loss": 4.7678, + "step": 259 + }, + { + "epoch": 0.27867095391211144, + "grad_norm": 3.302708387374878, + "learning_rate": 0.00014448017148981782, + "loss": 4.709, + "step": 260 + }, + { + "epoch": 0.2797427652733119, + "grad_norm": 3.09729266166687, + "learning_rate": 0.00014426580921757771, + "loss": 4.6741, + "step": 261 + }, + { + "epoch": 0.2808145766345123, + "grad_norm": 3.538935899734497, + "learning_rate": 0.00014405144694533764, + "loss": 4.698, + "step": 262 + }, + { + "epoch": 0.28188638799571275, + "grad_norm": 2.7330965995788574, + "learning_rate": 0.00014383708467309754, + "loss": 4.934, + "step": 263 + }, + { + "epoch": 0.2829581993569132, + "grad_norm": 3.0136098861694336, + "learning_rate": 0.00014362272240085746, + "loss": 4.6817, + "step": 264 + }, + { + "epoch": 0.2840300107181136, + "grad_norm": 2.376910448074341, + "learning_rate": 0.0001434083601286174, + "loss": 4.4482, + "step": 265 + }, + { + "epoch": 0.28510182207931406, + "grad_norm": 3.1064581871032715, + "learning_rate": 0.0001431939978563773, + "loss": 4.6759, + "step": 266 + }, + { + "epoch": 0.2861736334405145, + "grad_norm": 3.8472907543182373, + "learning_rate": 0.00014297963558413722, + "loss": 5.2014, + "step": 267 + }, + { + "epoch": 0.2872454448017149, + "grad_norm": 3.489697217941284, + "learning_rate": 0.00014276527331189711, + "loss": 4.9149, + "step": 268 + }, + { + "epoch": 0.2883172561629153, + "grad_norm": 4.324831485748291, + "learning_rate": 0.00014255091103965704, + "loss": 4.7237, + "step": 269 + }, + { + "epoch": 0.28938906752411575, + "grad_norm": 2.6454925537109375, + "learning_rate": 0.00014233654876741694, + "loss": 4.5799, + "step": 270 + }, + { + "epoch": 0.2904608788853162, + "grad_norm": 2.733677387237549, + "learning_rate": 0.00014212218649517686, + "loss": 5.3109, + "step": 271 + }, + { + "epoch": 0.2915326902465166, + "grad_norm": 3.2953786849975586, + "learning_rate": 0.0001419078242229368, + "loss": 4.7647, + "step": 272 + }, + { + "epoch": 0.29260450160771706, + "grad_norm": 3.716773748397827, + "learning_rate": 0.0001416934619506967, + "loss": 4.7028, + "step": 273 + }, + { + "epoch": 0.2936763129689175, + "grad_norm": 4.2017502784729, + "learning_rate": 0.00014147909967845662, + "loss": 5.0879, + "step": 274 + }, + { + "epoch": 0.2947481243301179, + "grad_norm": 3.396883487701416, + "learning_rate": 0.00014126473740621651, + "loss": 4.5388, + "step": 275 + }, + { + "epoch": 0.2958199356913183, + "grad_norm": 4.1809306144714355, + "learning_rate": 0.00014105037513397644, + "loss": 4.5289, + "step": 276 + }, + { + "epoch": 0.29689174705251875, + "grad_norm": 2.7928273677825928, + "learning_rate": 0.00014083601286173634, + "loss": 4.4894, + "step": 277 + }, + { + "epoch": 0.2979635584137192, + "grad_norm": 3.4506866931915283, + "learning_rate": 0.00014062165058949626, + "loss": 4.7572, + "step": 278 + }, + { + "epoch": 0.2990353697749196, + "grad_norm": 2.7902727127075195, + "learning_rate": 0.0001404072883172562, + "loss": 5.0598, + "step": 279 + }, + { + "epoch": 0.30010718113612006, + "grad_norm": 4.7700676918029785, + "learning_rate": 0.0001401929260450161, + "loss": 4.8324, + "step": 280 + }, + { + "epoch": 0.3011789924973205, + "grad_norm": 2.7876083850860596, + "learning_rate": 0.00013997856377277602, + "loss": 4.7838, + "step": 281 + }, + { + "epoch": 0.3022508038585209, + "grad_norm": 3.0817983150482178, + "learning_rate": 0.00013976420150053591, + "loss": 4.7157, + "step": 282 + }, + { + "epoch": 0.3033226152197213, + "grad_norm": 3.8207461833953857, + "learning_rate": 0.00013954983922829584, + "loss": 4.7388, + "step": 283 + }, + { + "epoch": 0.30439442658092175, + "grad_norm": 2.6404857635498047, + "learning_rate": 0.00013933547695605574, + "loss": 4.65, + "step": 284 + }, + { + "epoch": 0.3054662379421222, + "grad_norm": 4.702475070953369, + "learning_rate": 0.00013912111468381567, + "loss": 5.0798, + "step": 285 + }, + { + "epoch": 0.3065380493033226, + "grad_norm": 3.7465784549713135, + "learning_rate": 0.0001389067524115756, + "loss": 4.5744, + "step": 286 + }, + { + "epoch": 0.30760986066452306, + "grad_norm": 3.9282851219177246, + "learning_rate": 0.0001386923901393355, + "loss": 4.8443, + "step": 287 + }, + { + "epoch": 0.3086816720257235, + "grad_norm": 4.132015705108643, + "learning_rate": 0.00013847802786709542, + "loss": 4.682, + "step": 288 + }, + { + "epoch": 0.3097534833869239, + "grad_norm": 2.495121479034424, + "learning_rate": 0.00013826366559485531, + "loss": 4.6311, + "step": 289 + }, + { + "epoch": 0.3108252947481243, + "grad_norm": 3.6948375701904297, + "learning_rate": 0.00013804930332261524, + "loss": 4.5608, + "step": 290 + }, + { + "epoch": 0.31189710610932475, + "grad_norm": 3.2895307540893555, + "learning_rate": 0.00013783494105037514, + "loss": 4.6748, + "step": 291 + }, + { + "epoch": 0.3129689174705252, + "grad_norm": 5.540443420410156, + "learning_rate": 0.00013762057877813507, + "loss": 4.8687, + "step": 292 + }, + { + "epoch": 0.3140407288317256, + "grad_norm": 3.6029367446899414, + "learning_rate": 0.00013740621650589496, + "loss": 5.135, + "step": 293 + }, + { + "epoch": 0.31511254019292606, + "grad_norm": 3.0030689239501953, + "learning_rate": 0.0001371918542336549, + "loss": 4.5854, + "step": 294 + }, + { + "epoch": 0.3161843515541265, + "grad_norm": 2.6009085178375244, + "learning_rate": 0.00013697749196141482, + "loss": 4.7002, + "step": 295 + }, + { + "epoch": 0.3172561629153269, + "grad_norm": 4.169888973236084, + "learning_rate": 0.00013676312968917471, + "loss": 4.8343, + "step": 296 + }, + { + "epoch": 0.3183279742765273, + "grad_norm": 2.8053345680236816, + "learning_rate": 0.00013654876741693464, + "loss": 4.7164, + "step": 297 + }, + { + "epoch": 0.31939978563772775, + "grad_norm": 3.506838321685791, + "learning_rate": 0.00013633440514469454, + "loss": 4.8534, + "step": 298 + }, + { + "epoch": 0.3204715969989282, + "grad_norm": 4.393333911895752, + "learning_rate": 0.00013612004287245447, + "loss": 4.4928, + "step": 299 + }, + { + "epoch": 0.3215434083601286, + "grad_norm": 2.863898277282715, + "learning_rate": 0.00013590568060021436, + "loss": 4.5477, + "step": 300 + }, + { + "epoch": 0.32261521972132906, + "grad_norm": 3.9683938026428223, + "learning_rate": 0.0001356913183279743, + "loss": 4.7035, + "step": 301 + }, + { + "epoch": 0.3236870310825295, + "grad_norm": 3.109039783477783, + "learning_rate": 0.00013547695605573422, + "loss": 4.6411, + "step": 302 + }, + { + "epoch": 0.3247588424437299, + "grad_norm": 3.720078706741333, + "learning_rate": 0.00013526259378349411, + "loss": 4.9106, + "step": 303 + }, + { + "epoch": 0.3258306538049303, + "grad_norm": 3.9917848110198975, + "learning_rate": 0.00013504823151125404, + "loss": 5.4525, + "step": 304 + }, + { + "epoch": 0.32690246516613075, + "grad_norm": 2.6513423919677734, + "learning_rate": 0.00013483386923901394, + "loss": 4.686, + "step": 305 + }, + { + "epoch": 0.3279742765273312, + "grad_norm": 2.915001392364502, + "learning_rate": 0.00013461950696677387, + "loss": 4.7752, + "step": 306 + }, + { + "epoch": 0.3290460878885316, + "grad_norm": 2.989955186843872, + "learning_rate": 0.00013440514469453376, + "loss": 4.76, + "step": 307 + }, + { + "epoch": 0.33011789924973206, + "grad_norm": 5.258363246917725, + "learning_rate": 0.0001341907824222937, + "loss": 4.7727, + "step": 308 + }, + { + "epoch": 0.3311897106109325, + "grad_norm": 2.8469583988189697, + "learning_rate": 0.00013397642015005362, + "loss": 4.6681, + "step": 309 + }, + { + "epoch": 0.3322615219721329, + "grad_norm": 2.9092135429382324, + "learning_rate": 0.00013376205787781351, + "loss": 4.9489, + "step": 310 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 2.8508243560791016, + "learning_rate": 0.00013354769560557344, + "loss": 4.8356, + "step": 311 + }, + { + "epoch": 0.33440514469453375, + "grad_norm": 2.808962345123291, + "learning_rate": 0.00013333333333333334, + "loss": 4.6664, + "step": 312 + }, + { + "epoch": 0.3354769560557342, + "grad_norm": 3.5812997817993164, + "learning_rate": 0.00013311897106109327, + "loss": 4.4658, + "step": 313 + }, + { + "epoch": 0.3365487674169346, + "grad_norm": 2.3912558555603027, + "learning_rate": 0.00013290460878885316, + "loss": 4.5262, + "step": 314 + }, + { + "epoch": 0.33762057877813506, + "grad_norm": 3.29030704498291, + "learning_rate": 0.0001326902465166131, + "loss": 4.8285, + "step": 315 + }, + { + "epoch": 0.3386923901393355, + "grad_norm": 2.764319658279419, + "learning_rate": 0.00013247588424437302, + "loss": 4.6389, + "step": 316 + }, + { + "epoch": 0.3397642015005359, + "grad_norm": 3.782653331756592, + "learning_rate": 0.00013226152197213292, + "loss": 4.9516, + "step": 317 + }, + { + "epoch": 0.3408360128617363, + "grad_norm": 2.377980947494507, + "learning_rate": 0.00013204715969989284, + "loss": 4.5456, + "step": 318 + }, + { + "epoch": 0.34190782422293675, + "grad_norm": 2.9316773414611816, + "learning_rate": 0.00013183279742765274, + "loss": 4.4321, + "step": 319 + }, + { + "epoch": 0.3429796355841372, + "grad_norm": 3.867056369781494, + "learning_rate": 0.00013161843515541267, + "loss": 4.7064, + "step": 320 + }, + { + "epoch": 0.3440514469453376, + "grad_norm": 2.9708735942840576, + "learning_rate": 0.00013140407288317256, + "loss": 4.8706, + "step": 321 + }, + { + "epoch": 0.34512325830653806, + "grad_norm": 2.9509382247924805, + "learning_rate": 0.0001311897106109325, + "loss": 5.0689, + "step": 322 + }, + { + "epoch": 0.3461950696677385, + "grad_norm": 3.441537857055664, + "learning_rate": 0.00013097534833869242, + "loss": 5.1403, + "step": 323 + }, + { + "epoch": 0.34726688102893893, + "grad_norm": 2.2192604541778564, + "learning_rate": 0.00013076098606645232, + "loss": 4.8289, + "step": 324 + }, + { + "epoch": 0.3483386923901393, + "grad_norm": 6.160877704620361, + "learning_rate": 0.00013054662379421224, + "loss": 4.6294, + "step": 325 + }, + { + "epoch": 0.34941050375133975, + "grad_norm": 2.7882578372955322, + "learning_rate": 0.00013033226152197214, + "loss": 5.1393, + "step": 326 + }, + { + "epoch": 0.3504823151125402, + "grad_norm": 2.55655574798584, + "learning_rate": 0.00013011789924973207, + "loss": 5.307, + "step": 327 + }, + { + "epoch": 0.3515541264737406, + "grad_norm": 4.0760817527771, + "learning_rate": 0.00012990353697749196, + "loss": 4.4461, + "step": 328 + }, + { + "epoch": 0.35262593783494106, + "grad_norm": 2.735928773880005, + "learning_rate": 0.0001296891747052519, + "loss": 4.4893, + "step": 329 + }, + { + "epoch": 0.3536977491961415, + "grad_norm": 3.1315958499908447, + "learning_rate": 0.00012947481243301182, + "loss": 4.6153, + "step": 330 + }, + { + "epoch": 0.35476956055734193, + "grad_norm": 3.7019879817962646, + "learning_rate": 0.00012926045016077172, + "loss": 4.545, + "step": 331 + }, + { + "epoch": 0.3558413719185423, + "grad_norm": 3.004509449005127, + "learning_rate": 0.00012904608788853164, + "loss": 4.9725, + "step": 332 + }, + { + "epoch": 0.35691318327974275, + "grad_norm": 2.3004274368286133, + "learning_rate": 0.00012883172561629154, + "loss": 4.7199, + "step": 333 + }, + { + "epoch": 0.3579849946409432, + "grad_norm": 2.899890184402466, + "learning_rate": 0.00012861736334405147, + "loss": 4.7919, + "step": 334 + }, + { + "epoch": 0.3590568060021436, + "grad_norm": 2.68511962890625, + "learning_rate": 0.00012840300107181136, + "loss": 4.5754, + "step": 335 + }, + { + "epoch": 0.36012861736334406, + "grad_norm": 2.743640899658203, + "learning_rate": 0.0001281886387995713, + "loss": 4.4467, + "step": 336 + }, + { + "epoch": 0.3612004287245445, + "grad_norm": 3.8968470096588135, + "learning_rate": 0.00012797427652733122, + "loss": 4.8004, + "step": 337 + }, + { + "epoch": 0.36227224008574493, + "grad_norm": 2.5243613719940186, + "learning_rate": 0.00012775991425509112, + "loss": 4.7446, + "step": 338 + }, + { + "epoch": 0.3633440514469453, + "grad_norm": 3.5175392627716064, + "learning_rate": 0.00012754555198285104, + "loss": 4.7064, + "step": 339 + }, + { + "epoch": 0.36441586280814575, + "grad_norm": 4.9440155029296875, + "learning_rate": 0.00012733118971061094, + "loss": 4.729, + "step": 340 + }, + { + "epoch": 0.3654876741693462, + "grad_norm": 2.3877315521240234, + "learning_rate": 0.00012711682743837087, + "loss": 4.9724, + "step": 341 + }, + { + "epoch": 0.3665594855305466, + "grad_norm": 2.969658851623535, + "learning_rate": 0.00012690246516613077, + "loss": 4.7385, + "step": 342 + }, + { + "epoch": 0.36763129689174706, + "grad_norm": 3.0826351642608643, + "learning_rate": 0.0001266881028938907, + "loss": 4.5219, + "step": 343 + }, + { + "epoch": 0.3687031082529475, + "grad_norm": 3.592517614364624, + "learning_rate": 0.00012647374062165062, + "loss": 4.917, + "step": 344 + }, + { + "epoch": 0.36977491961414793, + "grad_norm": 2.00095272064209, + "learning_rate": 0.00012625937834941052, + "loss": 4.7813, + "step": 345 + }, + { + "epoch": 0.3708467309753483, + "grad_norm": 3.643803119659424, + "learning_rate": 0.00012604501607717044, + "loss": 4.7526, + "step": 346 + }, + { + "epoch": 0.37191854233654875, + "grad_norm": 2.889460563659668, + "learning_rate": 0.00012583065380493034, + "loss": 4.804, + "step": 347 + }, + { + "epoch": 0.3729903536977492, + "grad_norm": 3.5942623615264893, + "learning_rate": 0.00012561629153269027, + "loss": 4.418, + "step": 348 + }, + { + "epoch": 0.3740621650589496, + "grad_norm": 4.21300745010376, + "learning_rate": 0.00012540192926045017, + "loss": 4.4909, + "step": 349 + }, + { + "epoch": 0.37513397642015006, + "grad_norm": 3.0777478218078613, + "learning_rate": 0.0001251875669882101, + "loss": 4.4939, + "step": 350 + }, + { + "epoch": 0.3762057877813505, + "grad_norm": 2.5130343437194824, + "learning_rate": 0.00012497320471597002, + "loss": 4.6874, + "step": 351 + }, + { + "epoch": 0.37727759914255093, + "grad_norm": 4.369728088378906, + "learning_rate": 0.00012475884244372992, + "loss": 4.4141, + "step": 352 + }, + { + "epoch": 0.3783494105037513, + "grad_norm": 2.8714654445648193, + "learning_rate": 0.00012454448017148984, + "loss": 4.7199, + "step": 353 + }, + { + "epoch": 0.37942122186495175, + "grad_norm": 5.214938163757324, + "learning_rate": 0.00012433011789924974, + "loss": 4.6146, + "step": 354 + }, + { + "epoch": 0.3804930332261522, + "grad_norm": 4.095846176147461, + "learning_rate": 0.00012411575562700967, + "loss": 4.8992, + "step": 355 + }, + { + "epoch": 0.3815648445873526, + "grad_norm": 3.715656042098999, + "learning_rate": 0.00012390139335476957, + "loss": 4.8978, + "step": 356 + }, + { + "epoch": 0.38263665594855306, + "grad_norm": 2.896740198135376, + "learning_rate": 0.0001236870310825295, + "loss": 4.5217, + "step": 357 + }, + { + "epoch": 0.3837084673097535, + "grad_norm": 3.072086811065674, + "learning_rate": 0.00012347266881028942, + "loss": 4.8439, + "step": 358 + }, + { + "epoch": 0.38478027867095393, + "grad_norm": 2.8563904762268066, + "learning_rate": 0.00012325830653804932, + "loss": 4.8879, + "step": 359 + }, + { + "epoch": 0.3858520900321543, + "grad_norm": 2.688250780105591, + "learning_rate": 0.00012304394426580924, + "loss": 4.472, + "step": 360 + }, + { + "epoch": 0.38692390139335475, + "grad_norm": 4.014735698699951, + "learning_rate": 0.00012282958199356914, + "loss": 5.0148, + "step": 361 + }, + { + "epoch": 0.3879957127545552, + "grad_norm": 3.6055967807769775, + "learning_rate": 0.00012261521972132907, + "loss": 4.4383, + "step": 362 + }, + { + "epoch": 0.3890675241157556, + "grad_norm": 2.7861740589141846, + "learning_rate": 0.00012240085744908897, + "loss": 4.5756, + "step": 363 + }, + { + "epoch": 0.39013933547695606, + "grad_norm": 2.6457951068878174, + "learning_rate": 0.0001221864951768489, + "loss": 4.9015, + "step": 364 + }, + { + "epoch": 0.3912111468381565, + "grad_norm": 2.503458023071289, + "learning_rate": 0.0001219721329046088, + "loss": 4.6681, + "step": 365 + }, + { + "epoch": 0.39228295819935693, + "grad_norm": 3.545325517654419, + "learning_rate": 0.0001217577706323687, + "loss": 4.7556, + "step": 366 + }, + { + "epoch": 0.3933547695605573, + "grad_norm": 2.9182682037353516, + "learning_rate": 0.00012154340836012863, + "loss": 4.7351, + "step": 367 + }, + { + "epoch": 0.39442658092175775, + "grad_norm": 3.4011483192443848, + "learning_rate": 0.00012132904608788853, + "loss": 4.3977, + "step": 368 + }, + { + "epoch": 0.3954983922829582, + "grad_norm": 3.131936550140381, + "learning_rate": 0.00012111468381564845, + "loss": 4.53, + "step": 369 + }, + { + "epoch": 0.3965702036441586, + "grad_norm": 2.649606704711914, + "learning_rate": 0.00012090032154340835, + "loss": 4.5794, + "step": 370 + }, + { + "epoch": 0.39764201500535906, + "grad_norm": 3.4589664936065674, + "learning_rate": 0.00012068595927116828, + "loss": 5.2707, + "step": 371 + }, + { + "epoch": 0.3987138263665595, + "grad_norm": 2.252598524093628, + "learning_rate": 0.0001204715969989282, + "loss": 4.6805, + "step": 372 + }, + { + "epoch": 0.39978563772775993, + "grad_norm": 3.345087766647339, + "learning_rate": 0.0001202572347266881, + "loss": 4.725, + "step": 373 + }, + { + "epoch": 0.4008574490889603, + "grad_norm": 2.961495876312256, + "learning_rate": 0.00012004287245444803, + "loss": 4.5135, + "step": 374 + }, + { + "epoch": 0.40192926045016075, + "grad_norm": 2.2345147132873535, + "learning_rate": 0.00011982851018220793, + "loss": 4.4681, + "step": 375 + }, + { + "epoch": 0.4030010718113612, + "grad_norm": 2.904963731765747, + "learning_rate": 0.00011961414790996785, + "loss": 5.0236, + "step": 376 + }, + { + "epoch": 0.4040728831725616, + "grad_norm": 4.671694755554199, + "learning_rate": 0.00011939978563772775, + "loss": 5.2864, + "step": 377 + }, + { + "epoch": 0.40514469453376206, + "grad_norm": 4.501243591308594, + "learning_rate": 0.00011918542336548768, + "loss": 4.7508, + "step": 378 + }, + { + "epoch": 0.4062165058949625, + "grad_norm": 2.389822244644165, + "learning_rate": 0.0001189710610932476, + "loss": 4.8598, + "step": 379 + }, + { + "epoch": 0.40728831725616294, + "grad_norm": 2.789280891418457, + "learning_rate": 0.0001187566988210075, + "loss": 4.5269, + "step": 380 + }, + { + "epoch": 0.40836012861736337, + "grad_norm": 2.0258848667144775, + "learning_rate": 0.00011854233654876743, + "loss": 4.6753, + "step": 381 + }, + { + "epoch": 0.40943193997856375, + "grad_norm": 2.4621434211730957, + "learning_rate": 0.00011832797427652733, + "loss": 4.5195, + "step": 382 + }, + { + "epoch": 0.4105037513397642, + "grad_norm": 2.21687650680542, + "learning_rate": 0.00011811361200428725, + "loss": 4.5579, + "step": 383 + }, + { + "epoch": 0.4115755627009646, + "grad_norm": 2.6150362491607666, + "learning_rate": 0.00011789924973204715, + "loss": 5.0116, + "step": 384 + }, + { + "epoch": 0.41264737406216506, + "grad_norm": 2.5843632221221924, + "learning_rate": 0.00011768488745980708, + "loss": 4.6433, + "step": 385 + }, + { + "epoch": 0.4137191854233655, + "grad_norm": 3.124844551086426, + "learning_rate": 0.000117470525187567, + "loss": 4.6199, + "step": 386 + }, + { + "epoch": 0.41479099678456594, + "grad_norm": 2.8832359313964844, + "learning_rate": 0.0001172561629153269, + "loss": 4.646, + "step": 387 + }, + { + "epoch": 0.41586280814576637, + "grad_norm": 2.497830629348755, + "learning_rate": 0.00011704180064308683, + "loss": 4.431, + "step": 388 + }, + { + "epoch": 0.41693461950696675, + "grad_norm": 2.51007342338562, + "learning_rate": 0.00011682743837084673, + "loss": 4.8004, + "step": 389 + }, + { + "epoch": 0.4180064308681672, + "grad_norm": 3.0995190143585205, + "learning_rate": 0.00011661307609860665, + "loss": 5.0293, + "step": 390 + }, + { + "epoch": 0.4190782422293676, + "grad_norm": 3.0592472553253174, + "learning_rate": 0.00011639871382636655, + "loss": 4.6023, + "step": 391 + }, + { + "epoch": 0.42015005359056806, + "grad_norm": 2.6731066703796387, + "learning_rate": 0.00011618435155412648, + "loss": 4.5261, + "step": 392 + }, + { + "epoch": 0.4212218649517685, + "grad_norm": 2.5948925018310547, + "learning_rate": 0.0001159699892818864, + "loss": 4.737, + "step": 393 + }, + { + "epoch": 0.42229367631296894, + "grad_norm": 4.319209575653076, + "learning_rate": 0.0001157556270096463, + "loss": 4.7966, + "step": 394 + }, + { + "epoch": 0.42336548767416937, + "grad_norm": 2.590317964553833, + "learning_rate": 0.00011554126473740623, + "loss": 4.6858, + "step": 395 + }, + { + "epoch": 0.42443729903536975, + "grad_norm": 2.41375732421875, + "learning_rate": 0.00011532690246516613, + "loss": 4.3515, + "step": 396 + }, + { + "epoch": 0.4255091103965702, + "grad_norm": 2.681856632232666, + "learning_rate": 0.00011511254019292605, + "loss": 5.0034, + "step": 397 + }, + { + "epoch": 0.4265809217577706, + "grad_norm": 2.9265639781951904, + "learning_rate": 0.00011489817792068595, + "loss": 4.6002, + "step": 398 + }, + { + "epoch": 0.42765273311897106, + "grad_norm": 3.013899326324463, + "learning_rate": 0.00011468381564844588, + "loss": 4.3798, + "step": 399 + }, + { + "epoch": 0.4287245444801715, + "grad_norm": 3.1742773056030273, + "learning_rate": 0.0001144694533762058, + "loss": 4.7765, + "step": 400 + }, + { + "epoch": 0.42979635584137194, + "grad_norm": 2.3816874027252197, + "learning_rate": 0.0001142550911039657, + "loss": 4.5805, + "step": 401 + }, + { + "epoch": 0.43086816720257237, + "grad_norm": 5.512743949890137, + "learning_rate": 0.00011404072883172563, + "loss": 4.8546, + "step": 402 + }, + { + "epoch": 0.43193997856377275, + "grad_norm": 3.751079559326172, + "learning_rate": 0.00011382636655948553, + "loss": 4.5984, + "step": 403 + }, + { + "epoch": 0.4330117899249732, + "grad_norm": 3.0676698684692383, + "learning_rate": 0.00011361200428724545, + "loss": 5.0614, + "step": 404 + }, + { + "epoch": 0.4340836012861736, + "grad_norm": 3.396777391433716, + "learning_rate": 0.00011339764201500535, + "loss": 4.7951, + "step": 405 + }, + { + "epoch": 0.43515541264737406, + "grad_norm": 2.4765775203704834, + "learning_rate": 0.00011318327974276528, + "loss": 5.1756, + "step": 406 + }, + { + "epoch": 0.4362272240085745, + "grad_norm": 2.3405649662017822, + "learning_rate": 0.0001129689174705252, + "loss": 4.5914, + "step": 407 + }, + { + "epoch": 0.43729903536977494, + "grad_norm": 3.434269666671753, + "learning_rate": 0.0001127545551982851, + "loss": 4.4127, + "step": 408 + }, + { + "epoch": 0.43837084673097537, + "grad_norm": 2.127887487411499, + "learning_rate": 0.00011254019292604503, + "loss": 4.6347, + "step": 409 + }, + { + "epoch": 0.43944265809217575, + "grad_norm": 4.66085147857666, + "learning_rate": 0.00011232583065380493, + "loss": 4.9721, + "step": 410 + }, + { + "epoch": 0.4405144694533762, + "grad_norm": 4.892344951629639, + "learning_rate": 0.00011211146838156485, + "loss": 4.5199, + "step": 411 + }, + { + "epoch": 0.4415862808145766, + "grad_norm": 3.442005157470703, + "learning_rate": 0.00011189710610932475, + "loss": 4.502, + "step": 412 + }, + { + "epoch": 0.44265809217577706, + "grad_norm": 3.6790285110473633, + "learning_rate": 0.00011168274383708468, + "loss": 4.2922, + "step": 413 + }, + { + "epoch": 0.4437299035369775, + "grad_norm": 4.072107315063477, + "learning_rate": 0.00011146838156484458, + "loss": 4.5492, + "step": 414 + }, + { + "epoch": 0.44480171489817794, + "grad_norm": 2.8195745944976807, + "learning_rate": 0.0001112540192926045, + "loss": 4.4757, + "step": 415 + }, + { + "epoch": 0.44587352625937837, + "grad_norm": 3.7252724170684814, + "learning_rate": 0.00011103965702036443, + "loss": 4.7089, + "step": 416 + }, + { + "epoch": 0.44694533762057875, + "grad_norm": 2.841998338699341, + "learning_rate": 0.00011082529474812433, + "loss": 4.4443, + "step": 417 + }, + { + "epoch": 0.4480171489817792, + "grad_norm": 2.653102159500122, + "learning_rate": 0.00011061093247588425, + "loss": 4.2913, + "step": 418 + }, + { + "epoch": 0.4490889603429796, + "grad_norm": 2.2500383853912354, + "learning_rate": 0.00011039657020364415, + "loss": 4.8685, + "step": 419 + }, + { + "epoch": 0.45016077170418006, + "grad_norm": 3.34576416015625, + "learning_rate": 0.00011018220793140408, + "loss": 4.9446, + "step": 420 + }, + { + "epoch": 0.4512325830653805, + "grad_norm": 5.558518409729004, + "learning_rate": 0.00010996784565916398, + "loss": 4.6652, + "step": 421 + }, + { + "epoch": 0.45230439442658094, + "grad_norm": 2.774711847305298, + "learning_rate": 0.0001097534833869239, + "loss": 4.5453, + "step": 422 + }, + { + "epoch": 0.4533762057877814, + "grad_norm": 2.1220977306365967, + "learning_rate": 0.00010953912111468383, + "loss": 4.4978, + "step": 423 + }, + { + "epoch": 0.45444801714898175, + "grad_norm": 2.5744822025299072, + "learning_rate": 0.00010932475884244373, + "loss": 4.6972, + "step": 424 + }, + { + "epoch": 0.4555198285101822, + "grad_norm": 3.827157974243164, + "learning_rate": 0.00010911039657020365, + "loss": 4.6225, + "step": 425 + }, + { + "epoch": 0.4565916398713826, + "grad_norm": 4.045970439910889, + "learning_rate": 0.00010889603429796355, + "loss": 4.7059, + "step": 426 + }, + { + "epoch": 0.45766345123258306, + "grad_norm": 3.225592851638794, + "learning_rate": 0.00010868167202572348, + "loss": 4.8028, + "step": 427 + }, + { + "epoch": 0.4587352625937835, + "grad_norm": 2.285249948501587, + "learning_rate": 0.00010846730975348338, + "loss": 4.621, + "step": 428 + }, + { + "epoch": 0.45980707395498394, + "grad_norm": 2.890104293823242, + "learning_rate": 0.0001082529474812433, + "loss": 4.7726, + "step": 429 + }, + { + "epoch": 0.4608788853161844, + "grad_norm": 3.505073070526123, + "learning_rate": 0.00010803858520900323, + "loss": 4.6411, + "step": 430 + }, + { + "epoch": 0.46195069667738475, + "grad_norm": 3.7843589782714844, + "learning_rate": 0.00010782422293676313, + "loss": 4.6693, + "step": 431 + }, + { + "epoch": 0.4630225080385852, + "grad_norm": 4.503906726837158, + "learning_rate": 0.00010760986066452305, + "loss": 4.891, + "step": 432 + }, + { + "epoch": 0.4640943193997856, + "grad_norm": 2.121623992919922, + "learning_rate": 0.00010739549839228295, + "loss": 4.401, + "step": 433 + }, + { + "epoch": 0.46516613076098606, + "grad_norm": 3.2020905017852783, + "learning_rate": 0.00010718113612004288, + "loss": 4.976, + "step": 434 + }, + { + "epoch": 0.4662379421221865, + "grad_norm": 2.401608467102051, + "learning_rate": 0.00010696677384780278, + "loss": 4.5996, + "step": 435 + }, + { + "epoch": 0.46730975348338694, + "grad_norm": 4.06643009185791, + "learning_rate": 0.0001067524115755627, + "loss": 4.9013, + "step": 436 + }, + { + "epoch": 0.4683815648445874, + "grad_norm": 2.9890921115875244, + "learning_rate": 0.00010653804930332263, + "loss": 4.5683, + "step": 437 + }, + { + "epoch": 0.4694533762057878, + "grad_norm": 3.724813222885132, + "learning_rate": 0.00010632368703108253, + "loss": 4.635, + "step": 438 + }, + { + "epoch": 0.4705251875669882, + "grad_norm": 4.079103946685791, + "learning_rate": 0.00010610932475884245, + "loss": 3.6894, + "step": 439 + }, + { + "epoch": 0.4715969989281886, + "grad_norm": 3.2234132289886475, + "learning_rate": 0.00010589496248660235, + "loss": 4.6768, + "step": 440 + }, + { + "epoch": 0.47266881028938906, + "grad_norm": 4.110087871551514, + "learning_rate": 0.00010568060021436228, + "loss": 4.7946, + "step": 441 + }, + { + "epoch": 0.4737406216505895, + "grad_norm": 3.6127841472625732, + "learning_rate": 0.00010546623794212218, + "loss": 4.3852, + "step": 442 + }, + { + "epoch": 0.47481243301178994, + "grad_norm": 2.7365081310272217, + "learning_rate": 0.0001052518756698821, + "loss": 4.7021, + "step": 443 + }, + { + "epoch": 0.4758842443729904, + "grad_norm": 2.423485517501831, + "learning_rate": 0.00010503751339764203, + "loss": 4.5577, + "step": 444 + }, + { + "epoch": 0.4769560557341908, + "grad_norm": 4.876895427703857, + "learning_rate": 0.00010482315112540193, + "loss": 4.4222, + "step": 445 + }, + { + "epoch": 0.4780278670953912, + "grad_norm": 4.026849269866943, + "learning_rate": 0.00010460878885316185, + "loss": 4.7229, + "step": 446 + }, + { + "epoch": 0.4790996784565916, + "grad_norm": 3.8580574989318848, + "learning_rate": 0.00010439442658092175, + "loss": 4.3579, + "step": 447 + }, + { + "epoch": 0.48017148981779206, + "grad_norm": 2.972254991531372, + "learning_rate": 0.00010418006430868168, + "loss": 4.6865, + "step": 448 + }, + { + "epoch": 0.4812433011789925, + "grad_norm": 3.2446720600128174, + "learning_rate": 0.00010396570203644158, + "loss": 5.3246, + "step": 449 + }, + { + "epoch": 0.48231511254019294, + "grad_norm": 2.7309093475341797, + "learning_rate": 0.0001037513397642015, + "loss": 4.5838, + "step": 450 + }, + { + "epoch": 0.4833869239013934, + "grad_norm": 2.745063543319702, + "learning_rate": 0.00010353697749196143, + "loss": 4.802, + "step": 451 + }, + { + "epoch": 0.4844587352625938, + "grad_norm": 2.725165367126465, + "learning_rate": 0.00010332261521972133, + "loss": 4.439, + "step": 452 + }, + { + "epoch": 0.4855305466237942, + "grad_norm": 2.230207681655884, + "learning_rate": 0.00010310825294748126, + "loss": 4.5734, + "step": 453 + }, + { + "epoch": 0.4866023579849946, + "grad_norm": 5.293651580810547, + "learning_rate": 0.00010289389067524115, + "loss": 4.4032, + "step": 454 + }, + { + "epoch": 0.48767416934619506, + "grad_norm": 3.427778720855713, + "learning_rate": 0.00010267952840300108, + "loss": 5.0263, + "step": 455 + }, + { + "epoch": 0.4887459807073955, + "grad_norm": 3.034630060195923, + "learning_rate": 0.00010246516613076098, + "loss": 4.8377, + "step": 456 + }, + { + "epoch": 0.48981779206859594, + "grad_norm": 4.529390335083008, + "learning_rate": 0.0001022508038585209, + "loss": 4.6534, + "step": 457 + }, + { + "epoch": 0.4908896034297964, + "grad_norm": 3.547393798828125, + "learning_rate": 0.00010203644158628083, + "loss": 4.5927, + "step": 458 + }, + { + "epoch": 0.4919614147909968, + "grad_norm": 2.7677557468414307, + "learning_rate": 0.00010182207931404073, + "loss": 4.6297, + "step": 459 + }, + { + "epoch": 0.4930332261521972, + "grad_norm": 3.251615285873413, + "learning_rate": 0.00010160771704180066, + "loss": 4.636, + "step": 460 + }, + { + "epoch": 0.4941050375133976, + "grad_norm": 3.7128498554229736, + "learning_rate": 0.00010139335476956055, + "loss": 4.793, + "step": 461 + }, + { + "epoch": 0.49517684887459806, + "grad_norm": 3.7845911979675293, + "learning_rate": 0.00010117899249732048, + "loss": 4.9675, + "step": 462 + }, + { + "epoch": 0.4962486602357985, + "grad_norm": 2.348583698272705, + "learning_rate": 0.00010096463022508038, + "loss": 4.6437, + "step": 463 + }, + { + "epoch": 0.49732047159699894, + "grad_norm": 3.05892014503479, + "learning_rate": 0.0001007502679528403, + "loss": 4.5614, + "step": 464 + }, + { + "epoch": 0.4983922829581994, + "grad_norm": 2.5227441787719727, + "learning_rate": 0.00010053590568060023, + "loss": 4.7875, + "step": 465 + }, + { + "epoch": 0.4994640943193998, + "grad_norm": 2.151101589202881, + "learning_rate": 0.00010032154340836013, + "loss": 4.431, + "step": 466 + }, + { + "epoch": 0.5005359056806002, + "grad_norm": 2.6528310775756836, + "learning_rate": 0.00010010718113612006, + "loss": 4.6138, + "step": 467 + }, + { + "epoch": 0.5016077170418006, + "grad_norm": 3.231062889099121, + "learning_rate": 9.989281886387997e-05, + "loss": 4.9219, + "step": 468 + }, + { + "epoch": 0.5026795284030011, + "grad_norm": 2.9534308910369873, + "learning_rate": 9.967845659163988e-05, + "loss": 4.6625, + "step": 469 + }, + { + "epoch": 0.5037513397642015, + "grad_norm": 5.39685583114624, + "learning_rate": 9.946409431939979e-05, + "loss": 4.6287, + "step": 470 + }, + { + "epoch": 0.5048231511254019, + "grad_norm": 2.1971487998962402, + "learning_rate": 9.92497320471597e-05, + "loss": 4.7021, + "step": 471 + }, + { + "epoch": 0.5058949624866024, + "grad_norm": 4.07705020904541, + "learning_rate": 9.903536977491962e-05, + "loss": 4.3861, + "step": 472 + }, + { + "epoch": 0.5069667738478028, + "grad_norm": 2.779970407485962, + "learning_rate": 9.882100750267953e-05, + "loss": 4.8171, + "step": 473 + }, + { + "epoch": 0.5080385852090032, + "grad_norm": 3.3830039501190186, + "learning_rate": 9.860664523043944e-05, + "loss": 4.6153, + "step": 474 + }, + { + "epoch": 0.5091103965702036, + "grad_norm": 2.822718858718872, + "learning_rate": 9.839228295819937e-05, + "loss": 4.7932, + "step": 475 + }, + { + "epoch": 0.5101822079314041, + "grad_norm": 2.344116687774658, + "learning_rate": 9.817792068595928e-05, + "loss": 4.6028, + "step": 476 + }, + { + "epoch": 0.5112540192926045, + "grad_norm": 4.1745381355285645, + "learning_rate": 9.796355841371919e-05, + "loss": 4.2435, + "step": 477 + }, + { + "epoch": 0.5123258306538049, + "grad_norm": 3.8554046154022217, + "learning_rate": 9.77491961414791e-05, + "loss": 4.6214, + "step": 478 + }, + { + "epoch": 0.5133976420150054, + "grad_norm": 3.1669418811798096, + "learning_rate": 9.753483386923902e-05, + "loss": 4.4191, + "step": 479 + }, + { + "epoch": 0.5144694533762058, + "grad_norm": 2.7627713680267334, + "learning_rate": 9.732047159699893e-05, + "loss": 4.577, + "step": 480 + }, + { + "epoch": 0.5155412647374062, + "grad_norm": 3.044712543487549, + "learning_rate": 9.710610932475884e-05, + "loss": 4.5155, + "step": 481 + }, + { + "epoch": 0.5166130760986066, + "grad_norm": 4.0238356590271, + "learning_rate": 9.689174705251877e-05, + "loss": 5.1078, + "step": 482 + }, + { + "epoch": 0.5176848874598071, + "grad_norm": 2.3172390460968018, + "learning_rate": 9.667738478027868e-05, + "loss": 4.4444, + "step": 483 + }, + { + "epoch": 0.5187566988210075, + "grad_norm": 2.782829999923706, + "learning_rate": 9.646302250803859e-05, + "loss": 4.3296, + "step": 484 + }, + { + "epoch": 0.5198285101822079, + "grad_norm": 2.173595666885376, + "learning_rate": 9.62486602357985e-05, + "loss": 4.2437, + "step": 485 + }, + { + "epoch": 0.5209003215434084, + "grad_norm": 2.6314404010772705, + "learning_rate": 9.603429796355842e-05, + "loss": 4.4386, + "step": 486 + }, + { + "epoch": 0.5219721329046088, + "grad_norm": 2.2896389961242676, + "learning_rate": 9.581993569131833e-05, + "loss": 4.4891, + "step": 487 + }, + { + "epoch": 0.5230439442658092, + "grad_norm": 2.4675259590148926, + "learning_rate": 9.560557341907824e-05, + "loss": 4.765, + "step": 488 + }, + { + "epoch": 0.5241157556270096, + "grad_norm": 3.0715389251708984, + "learning_rate": 9.539121114683817e-05, + "loss": 4.7717, + "step": 489 + }, + { + "epoch": 0.5251875669882101, + "grad_norm": 4.1732635498046875, + "learning_rate": 9.517684887459808e-05, + "loss": 4.4018, + "step": 490 + }, + { + "epoch": 0.5262593783494105, + "grad_norm": 3.266986608505249, + "learning_rate": 9.496248660235799e-05, + "loss": 4.2295, + "step": 491 + }, + { + "epoch": 0.5273311897106109, + "grad_norm": 2.5479917526245117, + "learning_rate": 9.47481243301179e-05, + "loss": 4.4826, + "step": 492 + }, + { + "epoch": 0.5284030010718114, + "grad_norm": 2.021381139755249, + "learning_rate": 9.453376205787782e-05, + "loss": 4.4129, + "step": 493 + }, + { + "epoch": 0.5294748124330118, + "grad_norm": 2.8881123065948486, + "learning_rate": 9.431939978563773e-05, + "loss": 4.6945, + "step": 494 + }, + { + "epoch": 0.5305466237942122, + "grad_norm": 2.44140362739563, + "learning_rate": 9.410503751339764e-05, + "loss": 4.4454, + "step": 495 + }, + { + "epoch": 0.5316184351554126, + "grad_norm": 2.6777150630950928, + "learning_rate": 9.389067524115757e-05, + "loss": 5.0997, + "step": 496 + }, + { + "epoch": 0.5326902465166131, + "grad_norm": 3.453155040740967, + "learning_rate": 9.367631296891748e-05, + "loss": 4.4479, + "step": 497 + }, + { + "epoch": 0.5337620578778135, + "grad_norm": 3.0275356769561768, + "learning_rate": 9.346195069667739e-05, + "loss": 4.9407, + "step": 498 + }, + { + "epoch": 0.5348338692390139, + "grad_norm": 3.3887171745300293, + "learning_rate": 9.32475884244373e-05, + "loss": 4.8086, + "step": 499 + }, + { + "epoch": 0.5359056806002144, + "grad_norm": 3.7157983779907227, + "learning_rate": 9.303322615219722e-05, + "loss": 4.7977, + "step": 500 + }, + { + "epoch": 0.5369774919614148, + "grad_norm": 4.366313457489014, + "learning_rate": 9.281886387995713e-05, + "loss": 4.9259, + "step": 501 + }, + { + "epoch": 0.5380493033226152, + "grad_norm": 2.8501381874084473, + "learning_rate": 9.260450160771704e-05, + "loss": 4.6289, + "step": 502 + }, + { + "epoch": 0.5391211146838156, + "grad_norm": 3.3382182121276855, + "learning_rate": 9.239013933547695e-05, + "loss": 4.8087, + "step": 503 + }, + { + "epoch": 0.5401929260450161, + "grad_norm": 3.3544347286224365, + "learning_rate": 9.217577706323688e-05, + "loss": 4.515, + "step": 504 + }, + { + "epoch": 0.5412647374062165, + "grad_norm": 37.70632553100586, + "learning_rate": 9.19614147909968e-05, + "loss": 5.0745, + "step": 505 + }, + { + "epoch": 0.5423365487674169, + "grad_norm": 2.2493515014648438, + "learning_rate": 9.17470525187567e-05, + "loss": 4.4932, + "step": 506 + }, + { + "epoch": 0.5434083601286174, + "grad_norm": 2.044198751449585, + "learning_rate": 9.153269024651662e-05, + "loss": 4.4252, + "step": 507 + }, + { + "epoch": 0.5444801714898178, + "grad_norm": 3.8044934272766113, + "learning_rate": 9.131832797427653e-05, + "loss": 4.5577, + "step": 508 + }, + { + "epoch": 0.5455519828510182, + "grad_norm": 2.5661163330078125, + "learning_rate": 9.110396570203644e-05, + "loss": 4.5289, + "step": 509 + }, + { + "epoch": 0.5466237942122186, + "grad_norm": 3.2366676330566406, + "learning_rate": 9.088960342979636e-05, + "loss": 4.5276, + "step": 510 + }, + { + "epoch": 0.5476956055734191, + "grad_norm": 2.113185405731201, + "learning_rate": 9.067524115755628e-05, + "loss": 4.4132, + "step": 511 + }, + { + "epoch": 0.5487674169346195, + "grad_norm": 2.1703245639801025, + "learning_rate": 9.04608788853162e-05, + "loss": 4.4325, + "step": 512 + }, + { + "epoch": 0.5498392282958199, + "grad_norm": 2.1970648765563965, + "learning_rate": 9.02465166130761e-05, + "loss": 4.5207, + "step": 513 + }, + { + "epoch": 0.5509110396570204, + "grad_norm": 1.997298240661621, + "learning_rate": 9.003215434083602e-05, + "loss": 4.4903, + "step": 514 + }, + { + "epoch": 0.5519828510182208, + "grad_norm": 2.3199117183685303, + "learning_rate": 8.981779206859593e-05, + "loss": 4.5107, + "step": 515 + }, + { + "epoch": 0.5530546623794212, + "grad_norm": 3.4195363521575928, + "learning_rate": 8.960342979635584e-05, + "loss": 4.62, + "step": 516 + }, + { + "epoch": 0.5541264737406216, + "grad_norm": 4.756778717041016, + "learning_rate": 8.938906752411576e-05, + "loss": 4.443, + "step": 517 + }, + { + "epoch": 0.5551982851018221, + "grad_norm": 2.4822423458099365, + "learning_rate": 8.917470525187568e-05, + "loss": 4.5553, + "step": 518 + }, + { + "epoch": 0.5562700964630225, + "grad_norm": 3.304189682006836, + "learning_rate": 8.89603429796356e-05, + "loss": 4.6001, + "step": 519 + }, + { + "epoch": 0.5573419078242229, + "grad_norm": 2.1849889755249023, + "learning_rate": 8.87459807073955e-05, + "loss": 4.3251, + "step": 520 + }, + { + "epoch": 0.5584137191854234, + "grad_norm": 2.0505974292755127, + "learning_rate": 8.853161843515542e-05, + "loss": 4.8688, + "step": 521 + }, + { + "epoch": 0.5594855305466238, + "grad_norm": 3.737577438354492, + "learning_rate": 8.831725616291533e-05, + "loss": 4.417, + "step": 522 + }, + { + "epoch": 0.5605573419078242, + "grad_norm": 3.136176586151123, + "learning_rate": 8.810289389067524e-05, + "loss": 4.6331, + "step": 523 + }, + { + "epoch": 0.5616291532690246, + "grad_norm": 4.006575107574463, + "learning_rate": 8.788853161843516e-05, + "loss": 4.084, + "step": 524 + }, + { + "epoch": 0.5627009646302251, + "grad_norm": 4.377976417541504, + "learning_rate": 8.767416934619508e-05, + "loss": 4.6561, + "step": 525 + }, + { + "epoch": 0.5637727759914255, + "grad_norm": 5.064045429229736, + "learning_rate": 8.7459807073955e-05, + "loss": 4.7988, + "step": 526 + }, + { + "epoch": 0.564844587352626, + "grad_norm": 2.700467348098755, + "learning_rate": 8.72454448017149e-05, + "loss": 4.9471, + "step": 527 + }, + { + "epoch": 0.5659163987138264, + "grad_norm": 2.9747073650360107, + "learning_rate": 8.703108252947482e-05, + "loss": 4.7476, + "step": 528 + }, + { + "epoch": 0.5669882100750268, + "grad_norm": 2.418670415878296, + "learning_rate": 8.681672025723473e-05, + "loss": 4.5664, + "step": 529 + }, + { + "epoch": 0.5680600214362272, + "grad_norm": 3.5818655490875244, + "learning_rate": 8.660235798499464e-05, + "loss": 4.6175, + "step": 530 + }, + { + "epoch": 0.5691318327974276, + "grad_norm": 2.4983315467834473, + "learning_rate": 8.638799571275456e-05, + "loss": 4.5348, + "step": 531 + }, + { + "epoch": 0.5702036441586281, + "grad_norm": 2.1915688514709473, + "learning_rate": 8.617363344051448e-05, + "loss": 4.297, + "step": 532 + }, + { + "epoch": 0.5712754555198285, + "grad_norm": 2.926217794418335, + "learning_rate": 8.59592711682744e-05, + "loss": 4.77, + "step": 533 + }, + { + "epoch": 0.572347266881029, + "grad_norm": 2.369635820388794, + "learning_rate": 8.57449088960343e-05, + "loss": 4.712, + "step": 534 + }, + { + "epoch": 0.5734190782422294, + "grad_norm": 2.3241968154907227, + "learning_rate": 8.553054662379422e-05, + "loss": 4.3899, + "step": 535 + }, + { + "epoch": 0.5744908896034298, + "grad_norm": 4.746558666229248, + "learning_rate": 8.531618435155413e-05, + "loss": 4.476, + "step": 536 + }, + { + "epoch": 0.5755627009646302, + "grad_norm": 2.7210347652435303, + "learning_rate": 8.510182207931404e-05, + "loss": 4.3049, + "step": 537 + }, + { + "epoch": 0.5766345123258306, + "grad_norm": 3.503418207168579, + "learning_rate": 8.488745980707396e-05, + "loss": 4.5433, + "step": 538 + }, + { + "epoch": 0.5777063236870311, + "grad_norm": 5.652376651763916, + "learning_rate": 8.467309753483388e-05, + "loss": 4.9893, + "step": 539 + }, + { + "epoch": 0.5787781350482315, + "grad_norm": 2.5520389080047607, + "learning_rate": 8.44587352625938e-05, + "loss": 4.4338, + "step": 540 + }, + { + "epoch": 0.579849946409432, + "grad_norm": 3.322006940841675, + "learning_rate": 8.42443729903537e-05, + "loss": 4.7979, + "step": 541 + }, + { + "epoch": 0.5809217577706324, + "grad_norm": 3.558786153793335, + "learning_rate": 8.403001071811362e-05, + "loss": 4.5882, + "step": 542 + }, + { + "epoch": 0.5819935691318328, + "grad_norm": 2.970303773880005, + "learning_rate": 8.381564844587353e-05, + "loss": 4.4237, + "step": 543 + }, + { + "epoch": 0.5830653804930332, + "grad_norm": 2.5710015296936035, + "learning_rate": 8.360128617363344e-05, + "loss": 4.5529, + "step": 544 + }, + { + "epoch": 0.5841371918542336, + "grad_norm": 2.937779664993286, + "learning_rate": 8.338692390139336e-05, + "loss": 4.7481, + "step": 545 + }, + { + "epoch": 0.5852090032154341, + "grad_norm": 2.8150634765625, + "learning_rate": 8.317256162915328e-05, + "loss": 4.4981, + "step": 546 + }, + { + "epoch": 0.5862808145766345, + "grad_norm": 3.412611961364746, + "learning_rate": 8.29581993569132e-05, + "loss": 4.7558, + "step": 547 + }, + { + "epoch": 0.587352625937835, + "grad_norm": 4.309683322906494, + "learning_rate": 8.27438370846731e-05, + "loss": 4.4244, + "step": 548 + }, + { + "epoch": 0.5884244372990354, + "grad_norm": 3.520347833633423, + "learning_rate": 8.252947481243302e-05, + "loss": 4.4844, + "step": 549 + }, + { + "epoch": 0.5894962486602358, + "grad_norm": 3.5350539684295654, + "learning_rate": 8.231511254019293e-05, + "loss": 5.268, + "step": 550 + }, + { + "epoch": 0.5905680600214362, + "grad_norm": 3.6474995613098145, + "learning_rate": 8.210075026795284e-05, + "loss": 4.3715, + "step": 551 + }, + { + "epoch": 0.5916398713826366, + "grad_norm": 2.8854405879974365, + "learning_rate": 8.188638799571276e-05, + "loss": 4.398, + "step": 552 + }, + { + "epoch": 0.5927116827438371, + "grad_norm": 3.763253688812256, + "learning_rate": 8.167202572347268e-05, + "loss": 4.4518, + "step": 553 + }, + { + "epoch": 0.5937834941050375, + "grad_norm": 4.865472316741943, + "learning_rate": 8.14576634512326e-05, + "loss": 4.1518, + "step": 554 + }, + { + "epoch": 0.594855305466238, + "grad_norm": 4.091231822967529, + "learning_rate": 8.12433011789925e-05, + "loss": 4.786, + "step": 555 + }, + { + "epoch": 0.5959271168274384, + "grad_norm": 2.232417106628418, + "learning_rate": 8.102893890675242e-05, + "loss": 4.2969, + "step": 556 + }, + { + "epoch": 0.5969989281886388, + "grad_norm": 2.0849270820617676, + "learning_rate": 8.081457663451233e-05, + "loss": 4.7915, + "step": 557 + }, + { + "epoch": 0.5980707395498392, + "grad_norm": 3.684378147125244, + "learning_rate": 8.060021436227224e-05, + "loss": 5.0442, + "step": 558 + }, + { + "epoch": 0.5991425509110396, + "grad_norm": 2.289313793182373, + "learning_rate": 8.038585209003216e-05, + "loss": 4.3944, + "step": 559 + }, + { + "epoch": 0.6002143622722401, + "grad_norm": 3.844221830368042, + "learning_rate": 8.017148981779207e-05, + "loss": 4.2145, + "step": 560 + }, + { + "epoch": 0.6012861736334405, + "grad_norm": 2.877458333969116, + "learning_rate": 7.9957127545552e-05, + "loss": 4.8711, + "step": 561 + }, + { + "epoch": 0.602357984994641, + "grad_norm": 2.864017963409424, + "learning_rate": 7.974276527331191e-05, + "loss": 4.457, + "step": 562 + }, + { + "epoch": 0.6034297963558414, + "grad_norm": 2.4389705657958984, + "learning_rate": 7.952840300107182e-05, + "loss": 4.3321, + "step": 563 + }, + { + "epoch": 0.6045016077170418, + "grad_norm": 2.7437260150909424, + "learning_rate": 7.931404072883173e-05, + "loss": 4.2526, + "step": 564 + }, + { + "epoch": 0.6055734190782422, + "grad_norm": 2.4014856815338135, + "learning_rate": 7.909967845659164e-05, + "loss": 4.6251, + "step": 565 + }, + { + "epoch": 0.6066452304394426, + "grad_norm": 2.1202969551086426, + "learning_rate": 7.888531618435156e-05, + "loss": 4.4078, + "step": 566 + }, + { + "epoch": 0.6077170418006431, + "grad_norm": 3.2737956047058105, + "learning_rate": 7.867095391211147e-05, + "loss": 5.3022, + "step": 567 + }, + { + "epoch": 0.6087888531618435, + "grad_norm": 4.715353012084961, + "learning_rate": 7.84565916398714e-05, + "loss": 4.4089, + "step": 568 + }, + { + "epoch": 0.609860664523044, + "grad_norm": 3.042727470397949, + "learning_rate": 7.824222936763131e-05, + "loss": 4.7046, + "step": 569 + }, + { + "epoch": 0.6109324758842444, + "grad_norm": 4.115657806396484, + "learning_rate": 7.802786709539122e-05, + "loss": 4.6974, + "step": 570 + }, + { + "epoch": 0.6120042872454448, + "grad_norm": 2.7663843631744385, + "learning_rate": 7.781350482315113e-05, + "loss": 4.4788, + "step": 571 + }, + { + "epoch": 0.6130760986066452, + "grad_norm": 2.7116591930389404, + "learning_rate": 7.759914255091104e-05, + "loss": 4.4394, + "step": 572 + }, + { + "epoch": 0.6141479099678456, + "grad_norm": 2.617058515548706, + "learning_rate": 7.738478027867096e-05, + "loss": 4.5058, + "step": 573 + }, + { + "epoch": 0.6152197213290461, + "grad_norm": 2.9258296489715576, + "learning_rate": 7.717041800643087e-05, + "loss": 4.638, + "step": 574 + }, + { + "epoch": 0.6162915326902465, + "grad_norm": 2.290130376815796, + "learning_rate": 7.69560557341908e-05, + "loss": 4.494, + "step": 575 + }, + { + "epoch": 0.617363344051447, + "grad_norm": 2.9403839111328125, + "learning_rate": 7.674169346195071e-05, + "loss": 4.4339, + "step": 576 + }, + { + "epoch": 0.6184351554126474, + "grad_norm": 5.357972621917725, + "learning_rate": 7.652733118971062e-05, + "loss": 4.3492, + "step": 577 + }, + { + "epoch": 0.6195069667738478, + "grad_norm": 3.641831636428833, + "learning_rate": 7.631296891747053e-05, + "loss": 4.4423, + "step": 578 + }, + { + "epoch": 0.6205787781350482, + "grad_norm": 2.3929615020751953, + "learning_rate": 7.609860664523044e-05, + "loss": 4.4423, + "step": 579 + }, + { + "epoch": 0.6216505894962486, + "grad_norm": 3.540724515914917, + "learning_rate": 7.588424437299036e-05, + "loss": 4.215, + "step": 580 + }, + { + "epoch": 0.6227224008574491, + "grad_norm": 2.61544132232666, + "learning_rate": 7.566988210075027e-05, + "loss": 4.4321, + "step": 581 + }, + { + "epoch": 0.6237942122186495, + "grad_norm": 3.734426498413086, + "learning_rate": 7.54555198285102e-05, + "loss": 5.0043, + "step": 582 + }, + { + "epoch": 0.62486602357985, + "grad_norm": 2.775791645050049, + "learning_rate": 7.524115755627011e-05, + "loss": 4.2766, + "step": 583 + }, + { + "epoch": 0.6259378349410504, + "grad_norm": 2.061469793319702, + "learning_rate": 7.502679528403002e-05, + "loss": 4.6404, + "step": 584 + }, + { + "epoch": 0.6270096463022508, + "grad_norm": 2.8334105014801025, + "learning_rate": 7.481243301178993e-05, + "loss": 4.8849, + "step": 585 + }, + { + "epoch": 0.6280814576634512, + "grad_norm": 2.0020415782928467, + "learning_rate": 7.459807073954984e-05, + "loss": 4.4276, + "step": 586 + }, + { + "epoch": 0.6291532690246516, + "grad_norm": 3.436272621154785, + "learning_rate": 7.438370846730976e-05, + "loss": 4.2399, + "step": 587 + }, + { + "epoch": 0.6302250803858521, + "grad_norm": 3.1034908294677734, + "learning_rate": 7.416934619506967e-05, + "loss": 4.8148, + "step": 588 + }, + { + "epoch": 0.6312968917470525, + "grad_norm": 2.962636947631836, + "learning_rate": 7.39549839228296e-05, + "loss": 4.2156, + "step": 589 + }, + { + "epoch": 0.632368703108253, + "grad_norm": 2.5060412883758545, + "learning_rate": 7.374062165058951e-05, + "loss": 4.269, + "step": 590 + }, + { + "epoch": 0.6334405144694534, + "grad_norm": 3.32008695602417, + "learning_rate": 7.352625937834942e-05, + "loss": 4.6386, + "step": 591 + }, + { + "epoch": 0.6345123258306538, + "grad_norm": 2.796219825744629, + "learning_rate": 7.331189710610933e-05, + "loss": 4.235, + "step": 592 + }, + { + "epoch": 0.6355841371918542, + "grad_norm": 3.8189280033111572, + "learning_rate": 7.309753483386924e-05, + "loss": 4.4182, + "step": 593 + }, + { + "epoch": 0.6366559485530546, + "grad_norm": 2.9340178966522217, + "learning_rate": 7.288317256162916e-05, + "loss": 4.2946, + "step": 594 + }, + { + "epoch": 0.6377277599142551, + "grad_norm": 4.388503551483154, + "learning_rate": 7.266881028938907e-05, + "loss": 4.5359, + "step": 595 + }, + { + "epoch": 0.6387995712754555, + "grad_norm": 3.3181519508361816, + "learning_rate": 7.2454448017149e-05, + "loss": 4.4088, + "step": 596 + }, + { + "epoch": 0.639871382636656, + "grad_norm": 2.9396121501922607, + "learning_rate": 7.224008574490891e-05, + "loss": 4.8903, + "step": 597 + }, + { + "epoch": 0.6409431939978564, + "grad_norm": 4.775658130645752, + "learning_rate": 7.202572347266882e-05, + "loss": 4.9145, + "step": 598 + }, + { + "epoch": 0.6420150053590568, + "grad_norm": 3.0390172004699707, + "learning_rate": 7.181136120042873e-05, + "loss": 4.3406, + "step": 599 + }, + { + "epoch": 0.6430868167202572, + "grad_norm": 3.109550952911377, + "learning_rate": 7.159699892818864e-05, + "loss": 4.6635, + "step": 600 + }, + { + "epoch": 0.6441586280814576, + "grad_norm": 2.880469799041748, + "learning_rate": 7.138263665594856e-05, + "loss": 4.875, + "step": 601 + }, + { + "epoch": 0.6452304394426581, + "grad_norm": 3.215329170227051, + "learning_rate": 7.116827438370847e-05, + "loss": 4.347, + "step": 602 + }, + { + "epoch": 0.6463022508038585, + "grad_norm": 3.218173027038574, + "learning_rate": 7.09539121114684e-05, + "loss": 4.72, + "step": 603 + }, + { + "epoch": 0.647374062165059, + "grad_norm": 2.2390801906585693, + "learning_rate": 7.073954983922831e-05, + "loss": 4.732, + "step": 604 + }, + { + "epoch": 0.6484458735262594, + "grad_norm": 4.779411315917969, + "learning_rate": 7.052518756698822e-05, + "loss": 4.4878, + "step": 605 + }, + { + "epoch": 0.6495176848874598, + "grad_norm": 2.320235252380371, + "learning_rate": 7.031082529474813e-05, + "loss": 4.4545, + "step": 606 + }, + { + "epoch": 0.6505894962486602, + "grad_norm": 2.8826911449432373, + "learning_rate": 7.009646302250804e-05, + "loss": 4.8964, + "step": 607 + }, + { + "epoch": 0.6516613076098606, + "grad_norm": 2.1690621376037598, + "learning_rate": 6.988210075026796e-05, + "loss": 4.6063, + "step": 608 + }, + { + "epoch": 0.6527331189710611, + "grad_norm": 3.2647693157196045, + "learning_rate": 6.966773847802787e-05, + "loss": 4.8094, + "step": 609 + }, + { + "epoch": 0.6538049303322615, + "grad_norm": 3.142394542694092, + "learning_rate": 6.94533762057878e-05, + "loss": 4.3307, + "step": 610 + }, + { + "epoch": 0.654876741693462, + "grad_norm": 3.0853631496429443, + "learning_rate": 6.923901393354771e-05, + "loss": 4.3164, + "step": 611 + }, + { + "epoch": 0.6559485530546624, + "grad_norm": 3.033388614654541, + "learning_rate": 6.902465166130762e-05, + "loss": 4.4713, + "step": 612 + }, + { + "epoch": 0.6570203644158628, + "grad_norm": 2.7106258869171143, + "learning_rate": 6.881028938906753e-05, + "loss": 4.5005, + "step": 613 + }, + { + "epoch": 0.6580921757770632, + "grad_norm": 3.773181200027466, + "learning_rate": 6.859592711682744e-05, + "loss": 4.5155, + "step": 614 + }, + { + "epoch": 0.6591639871382636, + "grad_norm": 5.108589172363281, + "learning_rate": 6.838156484458736e-05, + "loss": 4.5609, + "step": 615 + }, + { + "epoch": 0.6602357984994641, + "grad_norm": 3.9254658222198486, + "learning_rate": 6.816720257234727e-05, + "loss": 4.5529, + "step": 616 + }, + { + "epoch": 0.6613076098606645, + "grad_norm": 3.022003650665283, + "learning_rate": 6.795284030010718e-05, + "loss": 4.2055, + "step": 617 + }, + { + "epoch": 0.662379421221865, + "grad_norm": 2.3257830142974854, + "learning_rate": 6.773847802786711e-05, + "loss": 4.4332, + "step": 618 + }, + { + "epoch": 0.6634512325830654, + "grad_norm": 1.9579797983169556, + "learning_rate": 6.752411575562702e-05, + "loss": 4.3841, + "step": 619 + }, + { + "epoch": 0.6645230439442658, + "grad_norm": 4.0417351722717285, + "learning_rate": 6.730975348338693e-05, + "loss": 4.7266, + "step": 620 + }, + { + "epoch": 0.6655948553054662, + "grad_norm": 2.9920849800109863, + "learning_rate": 6.709539121114685e-05, + "loss": 4.4271, + "step": 621 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 3.686924934387207, + "learning_rate": 6.688102893890676e-05, + "loss": 4.6426, + "step": 622 + }, + { + "epoch": 0.6677384780278671, + "grad_norm": 2.8612165451049805, + "learning_rate": 6.666666666666667e-05, + "loss": 4.4797, + "step": 623 + }, + { + "epoch": 0.6688102893890675, + "grad_norm": 2.1640119552612305, + "learning_rate": 6.645230439442658e-05, + "loss": 4.3597, + "step": 624 + }, + { + "epoch": 0.669882100750268, + "grad_norm": 2.755424737930298, + "learning_rate": 6.623794212218651e-05, + "loss": 4.5758, + "step": 625 + }, + { + "epoch": 0.6709539121114684, + "grad_norm": 3.111708402633667, + "learning_rate": 6.602357984994642e-05, + "loss": 4.3939, + "step": 626 + }, + { + "epoch": 0.6720257234726688, + "grad_norm": 2.549870014190674, + "learning_rate": 6.580921757770633e-05, + "loss": 4.3836, + "step": 627 + }, + { + "epoch": 0.6730975348338692, + "grad_norm": 2.126152515411377, + "learning_rate": 6.559485530546625e-05, + "loss": 4.5625, + "step": 628 + }, + { + "epoch": 0.6741693461950696, + "grad_norm": 4.276716709136963, + "learning_rate": 6.538049303322616e-05, + "loss": 4.5386, + "step": 629 + }, + { + "epoch": 0.6752411575562701, + "grad_norm": 3.121826648712158, + "learning_rate": 6.516613076098607e-05, + "loss": 4.564, + "step": 630 + }, + { + "epoch": 0.6763129689174705, + "grad_norm": 2.442767381668091, + "learning_rate": 6.495176848874598e-05, + "loss": 4.655, + "step": 631 + }, + { + "epoch": 0.677384780278671, + "grad_norm": 2.6107630729675293, + "learning_rate": 6.473740621650591e-05, + "loss": 4.5491, + "step": 632 + }, + { + "epoch": 0.6784565916398714, + "grad_norm": 3.7255454063415527, + "learning_rate": 6.452304394426582e-05, + "loss": 4.3361, + "step": 633 + }, + { + "epoch": 0.6795284030010718, + "grad_norm": 2.9900388717651367, + "learning_rate": 6.430868167202573e-05, + "loss": 4.5003, + "step": 634 + }, + { + "epoch": 0.6806002143622722, + "grad_norm": 2.156949758529663, + "learning_rate": 6.409431939978565e-05, + "loss": 4.348, + "step": 635 + }, + { + "epoch": 0.6816720257234726, + "grad_norm": 2.6016056537628174, + "learning_rate": 6.387995712754556e-05, + "loss": 4.4779, + "step": 636 + }, + { + "epoch": 0.6827438370846731, + "grad_norm": 3.6784298419952393, + "learning_rate": 6.366559485530547e-05, + "loss": 4.4926, + "step": 637 + }, + { + "epoch": 0.6838156484458735, + "grad_norm": 3.2673237323760986, + "learning_rate": 6.345123258306538e-05, + "loss": 4.4138, + "step": 638 + }, + { + "epoch": 0.684887459807074, + "grad_norm": 2.430467367172241, + "learning_rate": 6.323687031082531e-05, + "loss": 4.4007, + "step": 639 + }, + { + "epoch": 0.6859592711682744, + "grad_norm": 3.361685276031494, + "learning_rate": 6.302250803858522e-05, + "loss": 4.1909, + "step": 640 + }, + { + "epoch": 0.6870310825294748, + "grad_norm": 2.180561065673828, + "learning_rate": 6.280814576634513e-05, + "loss": 4.4958, + "step": 641 + }, + { + "epoch": 0.6881028938906752, + "grad_norm": 3.7177274227142334, + "learning_rate": 6.259378349410505e-05, + "loss": 4.5105, + "step": 642 + }, + { + "epoch": 0.6891747052518756, + "grad_norm": 3.392599105834961, + "learning_rate": 6.237942122186496e-05, + "loss": 4.3326, + "step": 643 + }, + { + "epoch": 0.6902465166130761, + "grad_norm": 2.894768238067627, + "learning_rate": 6.216505894962487e-05, + "loss": 4.5509, + "step": 644 + }, + { + "epoch": 0.6913183279742765, + "grad_norm": 6.643900394439697, + "learning_rate": 6.195069667738478e-05, + "loss": 4.8811, + "step": 645 + }, + { + "epoch": 0.692390139335477, + "grad_norm": 3.7013251781463623, + "learning_rate": 6.173633440514471e-05, + "loss": 4.2468, + "step": 646 + }, + { + "epoch": 0.6934619506966774, + "grad_norm": 2.7494664192199707, + "learning_rate": 6.152197213290462e-05, + "loss": 5.0329, + "step": 647 + }, + { + "epoch": 0.6945337620578779, + "grad_norm": 2.2148094177246094, + "learning_rate": 6.130760986066453e-05, + "loss": 4.3135, + "step": 648 + }, + { + "epoch": 0.6956055734190782, + "grad_norm": 2.2598962783813477, + "learning_rate": 6.109324758842445e-05, + "loss": 4.3436, + "step": 649 + }, + { + "epoch": 0.6966773847802786, + "grad_norm": 4.410878658294678, + "learning_rate": 6.087888531618435e-05, + "loss": 4.8447, + "step": 650 + }, + { + "epoch": 0.6977491961414791, + "grad_norm": 2.493114709854126, + "learning_rate": 6.0664523043944264e-05, + "loss": 4.2732, + "step": 651 + }, + { + "epoch": 0.6988210075026795, + "grad_norm": 2.972546100616455, + "learning_rate": 6.0450160771704176e-05, + "loss": 4.5284, + "step": 652 + }, + { + "epoch": 0.69989281886388, + "grad_norm": 3.26788330078125, + "learning_rate": 6.02357984994641e-05, + "loss": 4.7868, + "step": 653 + }, + { + "epoch": 0.7009646302250804, + "grad_norm": 4.179505348205566, + "learning_rate": 6.0021436227224014e-05, + "loss": 4.4146, + "step": 654 + }, + { + "epoch": 0.7020364415862809, + "grad_norm": 2.4835996627807617, + "learning_rate": 5.980707395498393e-05, + "loss": 4.9622, + "step": 655 + }, + { + "epoch": 0.7031082529474812, + "grad_norm": 3.4152846336364746, + "learning_rate": 5.959271168274384e-05, + "loss": 4.8898, + "step": 656 + }, + { + "epoch": 0.7041800643086816, + "grad_norm": 3.430803060531616, + "learning_rate": 5.937834941050375e-05, + "loss": 4.6275, + "step": 657 + }, + { + "epoch": 0.7052518756698821, + "grad_norm": 2.6322433948516846, + "learning_rate": 5.9163987138263664e-05, + "loss": 4.5736, + "step": 658 + }, + { + "epoch": 0.7063236870310825, + "grad_norm": 2.5020229816436768, + "learning_rate": 5.8949624866023576e-05, + "loss": 5.0218, + "step": 659 + }, + { + "epoch": 0.707395498392283, + "grad_norm": 2.7531423568725586, + "learning_rate": 5.87352625937835e-05, + "loss": 4.7368, + "step": 660 + }, + { + "epoch": 0.7084673097534834, + "grad_norm": 2.332533597946167, + "learning_rate": 5.8520900321543414e-05, + "loss": 4.4331, + "step": 661 + }, + { + "epoch": 0.7095391211146839, + "grad_norm": 2.4030661582946777, + "learning_rate": 5.830653804930333e-05, + "loss": 4.3736, + "step": 662 + }, + { + "epoch": 0.7106109324758842, + "grad_norm": 2.747347354888916, + "learning_rate": 5.809217577706324e-05, + "loss": 4.4708, + "step": 663 + }, + { + "epoch": 0.7116827438370846, + "grad_norm": 3.930635690689087, + "learning_rate": 5.787781350482315e-05, + "loss": 4.7282, + "step": 664 + }, + { + "epoch": 0.7127545551982851, + "grad_norm": 2.4571895599365234, + "learning_rate": 5.7663451232583064e-05, + "loss": 4.3438, + "step": 665 + }, + { + "epoch": 0.7138263665594855, + "grad_norm": 2.307389497756958, + "learning_rate": 5.7449088960342976e-05, + "loss": 4.4917, + "step": 666 + }, + { + "epoch": 0.714898177920686, + "grad_norm": 3.0858333110809326, + "learning_rate": 5.72347266881029e-05, + "loss": 4.7639, + "step": 667 + }, + { + "epoch": 0.7159699892818864, + "grad_norm": 2.8468878269195557, + "learning_rate": 5.7020364415862815e-05, + "loss": 4.4349, + "step": 668 + }, + { + "epoch": 0.7170418006430869, + "grad_norm": 2.674168825149536, + "learning_rate": 5.680600214362273e-05, + "loss": 4.6781, + "step": 669 + }, + { + "epoch": 0.7181136120042872, + "grad_norm": 3.077220916748047, + "learning_rate": 5.659163987138264e-05, + "loss": 4.5982, + "step": 670 + }, + { + "epoch": 0.7191854233654876, + "grad_norm": 2.4594485759735107, + "learning_rate": 5.637727759914255e-05, + "loss": 4.4059, + "step": 671 + }, + { + "epoch": 0.7202572347266881, + "grad_norm": 3.3636786937713623, + "learning_rate": 5.6162915326902464e-05, + "loss": 4.5043, + "step": 672 + }, + { + "epoch": 0.7213290460878885, + "grad_norm": 5.092872619628906, + "learning_rate": 5.5948553054662377e-05, + "loss": 4.6357, + "step": 673 + }, + { + "epoch": 0.722400857449089, + "grad_norm": 3.4879798889160156, + "learning_rate": 5.573419078242229e-05, + "loss": 4.4038, + "step": 674 + }, + { + "epoch": 0.7234726688102894, + "grad_norm": 3.5704169273376465, + "learning_rate": 5.5519828510182215e-05, + "loss": 4.2148, + "step": 675 + }, + { + "epoch": 0.7245444801714899, + "grad_norm": 3.622905731201172, + "learning_rate": 5.530546623794213e-05, + "loss": 4.2071, + "step": 676 + }, + { + "epoch": 0.7256162915326902, + "grad_norm": 2.2104790210723877, + "learning_rate": 5.509110396570204e-05, + "loss": 4.3016, + "step": 677 + }, + { + "epoch": 0.7266881028938906, + "grad_norm": 2.5049002170562744, + "learning_rate": 5.487674169346195e-05, + "loss": 4.2184, + "step": 678 + }, + { + "epoch": 0.7277599142550911, + "grad_norm": 2.545485019683838, + "learning_rate": 5.4662379421221864e-05, + "loss": 4.3773, + "step": 679 + }, + { + "epoch": 0.7288317256162915, + "grad_norm": 3.1245920658111572, + "learning_rate": 5.444801714898178e-05, + "loss": 4.6495, + "step": 680 + }, + { + "epoch": 0.729903536977492, + "grad_norm": 3.3570711612701416, + "learning_rate": 5.423365487674169e-05, + "loss": 4.3041, + "step": 681 + }, + { + "epoch": 0.7309753483386924, + "grad_norm": 2.5760622024536133, + "learning_rate": 5.4019292604501615e-05, + "loss": 4.6344, + "step": 682 + }, + { + "epoch": 0.7320471596998929, + "grad_norm": 3.143083095550537, + "learning_rate": 5.380493033226153e-05, + "loss": 4.6799, + "step": 683 + }, + { + "epoch": 0.7331189710610932, + "grad_norm": 4.669005870819092, + "learning_rate": 5.359056806002144e-05, + "loss": 4.3954, + "step": 684 + }, + { + "epoch": 0.7341907824222936, + "grad_norm": 3.921560049057007, + "learning_rate": 5.337620578778135e-05, + "loss": 4.6724, + "step": 685 + }, + { + "epoch": 0.7352625937834941, + "grad_norm": 4.419112682342529, + "learning_rate": 5.3161843515541264e-05, + "loss": 4.6749, + "step": 686 + }, + { + "epoch": 0.7363344051446945, + "grad_norm": 3.118290424346924, + "learning_rate": 5.294748124330118e-05, + "loss": 4.3938, + "step": 687 + }, + { + "epoch": 0.737406216505895, + "grad_norm": 2.2908451557159424, + "learning_rate": 5.273311897106109e-05, + "loss": 4.4141, + "step": 688 + }, + { + "epoch": 0.7384780278670954, + "grad_norm": 2.6434786319732666, + "learning_rate": 5.2518756698821015e-05, + "loss": 4.4886, + "step": 689 + }, + { + "epoch": 0.7395498392282959, + "grad_norm": 2.6037230491638184, + "learning_rate": 5.230439442658093e-05, + "loss": 4.5665, + "step": 690 + }, + { + "epoch": 0.7406216505894962, + "grad_norm": 3.525683641433716, + "learning_rate": 5.209003215434084e-05, + "loss": 4.1823, + "step": 691 + }, + { + "epoch": 0.7416934619506966, + "grad_norm": 3.5986504554748535, + "learning_rate": 5.187566988210075e-05, + "loss": 4.4201, + "step": 692 + }, + { + "epoch": 0.7427652733118971, + "grad_norm": 4.4375176429748535, + "learning_rate": 5.1661307609860665e-05, + "loss": 4.3157, + "step": 693 + }, + { + "epoch": 0.7438370846730975, + "grad_norm": 2.6561779975891113, + "learning_rate": 5.144694533762058e-05, + "loss": 4.3444, + "step": 694 + }, + { + "epoch": 0.744908896034298, + "grad_norm": 2.6672708988189697, + "learning_rate": 5.123258306538049e-05, + "loss": 4.5737, + "step": 695 + }, + { + "epoch": 0.7459807073954984, + "grad_norm": 3.335510015487671, + "learning_rate": 5.1018220793140415e-05, + "loss": 4.4153, + "step": 696 + }, + { + "epoch": 0.7470525187566989, + "grad_norm": 2.9571712017059326, + "learning_rate": 5.080385852090033e-05, + "loss": 4.4999, + "step": 697 + }, + { + "epoch": 0.7481243301178992, + "grad_norm": 2.0713906288146973, + "learning_rate": 5.058949624866024e-05, + "loss": 4.1479, + "step": 698 + }, + { + "epoch": 0.7491961414790996, + "grad_norm": 2.731901168823242, + "learning_rate": 5.037513397642015e-05, + "loss": 4.3306, + "step": 699 + }, + { + "epoch": 0.7502679528403001, + "grad_norm": 3.4839823246002197, + "learning_rate": 5.0160771704180065e-05, + "loss": 4.3706, + "step": 700 + }, + { + "epoch": 0.7513397642015005, + "grad_norm": 3.9278030395507812, + "learning_rate": 4.9946409431939984e-05, + "loss": 4.4768, + "step": 701 + }, + { + "epoch": 0.752411575562701, + "grad_norm": 2.87748122215271, + "learning_rate": 4.9732047159699896e-05, + "loss": 4.463, + "step": 702 + }, + { + "epoch": 0.7534833869239014, + "grad_norm": 3.587252616882324, + "learning_rate": 4.951768488745981e-05, + "loss": 4.7204, + "step": 703 + }, + { + "epoch": 0.7545551982851019, + "grad_norm": 2.9082605838775635, + "learning_rate": 4.930332261521972e-05, + "loss": 4.7662, + "step": 704 + }, + { + "epoch": 0.7556270096463023, + "grad_norm": 3.1029810905456543, + "learning_rate": 4.908896034297964e-05, + "loss": 4.5769, + "step": 705 + }, + { + "epoch": 0.7566988210075026, + "grad_norm": 2.8559889793395996, + "learning_rate": 4.887459807073955e-05, + "loss": 4.2848, + "step": 706 + }, + { + "epoch": 0.7577706323687031, + "grad_norm": 2.697112560272217, + "learning_rate": 4.8660235798499465e-05, + "loss": 4.4089, + "step": 707 + }, + { + "epoch": 0.7588424437299035, + "grad_norm": 3.552123546600342, + "learning_rate": 4.8445873526259384e-05, + "loss": 4.3264, + "step": 708 + }, + { + "epoch": 0.759914255091104, + "grad_norm": 4.04207181930542, + "learning_rate": 4.8231511254019296e-05, + "loss": 4.9773, + "step": 709 + }, + { + "epoch": 0.7609860664523044, + "grad_norm": 4.384789943695068, + "learning_rate": 4.801714898177921e-05, + "loss": 4.2986, + "step": 710 + }, + { + "epoch": 0.7620578778135049, + "grad_norm": 3.348785400390625, + "learning_rate": 4.780278670953912e-05, + "loss": 4.6206, + "step": 711 + }, + { + "epoch": 0.7631296891747053, + "grad_norm": 2.0677852630615234, + "learning_rate": 4.758842443729904e-05, + "loss": 4.4998, + "step": 712 + }, + { + "epoch": 0.7642015005359056, + "grad_norm": 3.080993890762329, + "learning_rate": 4.737406216505895e-05, + "loss": 4.2714, + "step": 713 + }, + { + "epoch": 0.7652733118971061, + "grad_norm": 2.621718406677246, + "learning_rate": 4.7159699892818865e-05, + "loss": 4.508, + "step": 714 + }, + { + "epoch": 0.7663451232583065, + "grad_norm": 2.7665212154388428, + "learning_rate": 4.6945337620578784e-05, + "loss": 4.4284, + "step": 715 + }, + { + "epoch": 0.767416934619507, + "grad_norm": 4.301600456237793, + "learning_rate": 4.6730975348338697e-05, + "loss": 4.8282, + "step": 716 + }, + { + "epoch": 0.7684887459807074, + "grad_norm": 4.236885070800781, + "learning_rate": 4.651661307609861e-05, + "loss": 4.8417, + "step": 717 + }, + { + "epoch": 0.7695605573419079, + "grad_norm": 5.495549201965332, + "learning_rate": 4.630225080385852e-05, + "loss": 4.124, + "step": 718 + }, + { + "epoch": 0.7706323687031083, + "grad_norm": 3.4314773082733154, + "learning_rate": 4.608788853161844e-05, + "loss": 4.5811, + "step": 719 + }, + { + "epoch": 0.7717041800643086, + "grad_norm": 2.890056848526001, + "learning_rate": 4.587352625937835e-05, + "loss": 4.2577, + "step": 720 + }, + { + "epoch": 0.7727759914255091, + "grad_norm": 2.566664218902588, + "learning_rate": 4.5659163987138265e-05, + "loss": 4.3006, + "step": 721 + }, + { + "epoch": 0.7738478027867095, + "grad_norm": 3.2805826663970947, + "learning_rate": 4.544480171489818e-05, + "loss": 4.1904, + "step": 722 + }, + { + "epoch": 0.77491961414791, + "grad_norm": 3.1700801849365234, + "learning_rate": 4.52304394426581e-05, + "loss": 4.2985, + "step": 723 + }, + { + "epoch": 0.7759914255091104, + "grad_norm": 2.913239002227783, + "learning_rate": 4.501607717041801e-05, + "loss": 4.5163, + "step": 724 + }, + { + "epoch": 0.7770632368703109, + "grad_norm": 3.6739931106567383, + "learning_rate": 4.480171489817792e-05, + "loss": 4.0745, + "step": 725 + }, + { + "epoch": 0.7781350482315113, + "grad_norm": 4.171450138092041, + "learning_rate": 4.458735262593784e-05, + "loss": 4.1734, + "step": 726 + }, + { + "epoch": 0.7792068595927116, + "grad_norm": 4.786383628845215, + "learning_rate": 4.437299035369775e-05, + "loss": 4.3863, + "step": 727 + }, + { + "epoch": 0.7802786709539121, + "grad_norm": 3.3467090129852295, + "learning_rate": 4.4158628081457665e-05, + "loss": 4.2379, + "step": 728 + }, + { + "epoch": 0.7813504823151125, + "grad_norm": 3.2874300479888916, + "learning_rate": 4.394426580921758e-05, + "loss": 4.5021, + "step": 729 + }, + { + "epoch": 0.782422293676313, + "grad_norm": 2.1295790672302246, + "learning_rate": 4.37299035369775e-05, + "loss": 4.2291, + "step": 730 + }, + { + "epoch": 0.7834941050375134, + "grad_norm": 3.8685142993927, + "learning_rate": 4.351554126473741e-05, + "loss": 4.2344, + "step": 731 + }, + { + "epoch": 0.7845659163987139, + "grad_norm": 2.811497688293457, + "learning_rate": 4.330117899249732e-05, + "loss": 4.7444, + "step": 732 + }, + { + "epoch": 0.7856377277599143, + "grad_norm": 2.16815185546875, + "learning_rate": 4.308681672025724e-05, + "loss": 4.5432, + "step": 733 + }, + { + "epoch": 0.7867095391211146, + "grad_norm": 3.302750587463379, + "learning_rate": 4.287245444801715e-05, + "loss": 4.1623, + "step": 734 + }, + { + "epoch": 0.7877813504823151, + "grad_norm": 3.3594536781311035, + "learning_rate": 4.2658092175777065e-05, + "loss": 4.9906, + "step": 735 + }, + { + "epoch": 0.7888531618435155, + "grad_norm": 2.7700035572052, + "learning_rate": 4.244372990353698e-05, + "loss": 4.457, + "step": 736 + }, + { + "epoch": 0.789924973204716, + "grad_norm": 2.545769691467285, + "learning_rate": 4.22293676312969e-05, + "loss": 4.3638, + "step": 737 + }, + { + "epoch": 0.7909967845659164, + "grad_norm": 3.0002028942108154, + "learning_rate": 4.201500535905681e-05, + "loss": 4.3884, + "step": 738 + }, + { + "epoch": 0.7920685959271169, + "grad_norm": 2.997331142425537, + "learning_rate": 4.180064308681672e-05, + "loss": 4.3515, + "step": 739 + }, + { + "epoch": 0.7931404072883173, + "grad_norm": 2.539663553237915, + "learning_rate": 4.158628081457664e-05, + "loss": 4.4387, + "step": 740 + }, + { + "epoch": 0.7942122186495176, + "grad_norm": 2.4811909198760986, + "learning_rate": 4.137191854233655e-05, + "loss": 4.2736, + "step": 741 + }, + { + "epoch": 0.7952840300107181, + "grad_norm": 3.7457571029663086, + "learning_rate": 4.1157556270096466e-05, + "loss": 4.4863, + "step": 742 + }, + { + "epoch": 0.7963558413719185, + "grad_norm": 2.783266067504883, + "learning_rate": 4.094319399785638e-05, + "loss": 4.5847, + "step": 743 + }, + { + "epoch": 0.797427652733119, + "grad_norm": 2.202946186065674, + "learning_rate": 4.07288317256163e-05, + "loss": 4.4435, + "step": 744 + }, + { + "epoch": 0.7984994640943194, + "grad_norm": 3.0813934803009033, + "learning_rate": 4.051446945337621e-05, + "loss": 4.1583, + "step": 745 + }, + { + "epoch": 0.7995712754555199, + "grad_norm": 3.1837377548217773, + "learning_rate": 4.030010718113612e-05, + "loss": 4.5823, + "step": 746 + }, + { + "epoch": 0.8006430868167203, + "grad_norm": 2.310713529586792, + "learning_rate": 4.0085744908896034e-05, + "loss": 4.4296, + "step": 747 + }, + { + "epoch": 0.8017148981779206, + "grad_norm": 2.4914042949676514, + "learning_rate": 3.9871382636655953e-05, + "loss": 4.2493, + "step": 748 + }, + { + "epoch": 0.8027867095391211, + "grad_norm": 4.564463138580322, + "learning_rate": 3.9657020364415866e-05, + "loss": 4.5261, + "step": 749 + }, + { + "epoch": 0.8038585209003215, + "grad_norm": 3.1176915168762207, + "learning_rate": 3.944265809217578e-05, + "loss": 4.6123, + "step": 750 + }, + { + "epoch": 0.804930332261522, + "grad_norm": 2.2957403659820557, + "learning_rate": 3.92282958199357e-05, + "loss": 4.6512, + "step": 751 + }, + { + "epoch": 0.8060021436227224, + "grad_norm": 3.3911380767822266, + "learning_rate": 3.901393354769561e-05, + "loss": 4.3699, + "step": 752 + }, + { + "epoch": 0.8070739549839229, + "grad_norm": 2.3711419105529785, + "learning_rate": 3.879957127545552e-05, + "loss": 4.3832, + "step": 753 + }, + { + "epoch": 0.8081457663451233, + "grad_norm": 4.154041767120361, + "learning_rate": 3.8585209003215434e-05, + "loss": 4.4298, + "step": 754 + }, + { + "epoch": 0.8092175777063236, + "grad_norm": 4.426170825958252, + "learning_rate": 3.8370846730975354e-05, + "loss": 4.2205, + "step": 755 + }, + { + "epoch": 0.8102893890675241, + "grad_norm": 2.643601179122925, + "learning_rate": 3.8156484458735266e-05, + "loss": 4.1436, + "step": 756 + }, + { + "epoch": 0.8113612004287245, + "grad_norm": 2.698427200317383, + "learning_rate": 3.794212218649518e-05, + "loss": 4.3915, + "step": 757 + }, + { + "epoch": 0.812433011789925, + "grad_norm": 2.6776609420776367, + "learning_rate": 3.77277599142551e-05, + "loss": 4.7078, + "step": 758 + }, + { + "epoch": 0.8135048231511254, + "grad_norm": 3.502988338470459, + "learning_rate": 3.751339764201501e-05, + "loss": 4.5208, + "step": 759 + }, + { + "epoch": 0.8145766345123259, + "grad_norm": 2.7470109462738037, + "learning_rate": 3.729903536977492e-05, + "loss": 4.3289, + "step": 760 + }, + { + "epoch": 0.8156484458735263, + "grad_norm": 3.7509968280792236, + "learning_rate": 3.7084673097534835e-05, + "loss": 4.3425, + "step": 761 + }, + { + "epoch": 0.8167202572347267, + "grad_norm": 2.3047983646392822, + "learning_rate": 3.6870310825294754e-05, + "loss": 4.2544, + "step": 762 + }, + { + "epoch": 0.8177920685959271, + "grad_norm": 3.482459545135498, + "learning_rate": 3.6655948553054666e-05, + "loss": 4.6132, + "step": 763 + }, + { + "epoch": 0.8188638799571275, + "grad_norm": 3.8098599910736084, + "learning_rate": 3.644158628081458e-05, + "loss": 4.2388, + "step": 764 + }, + { + "epoch": 0.819935691318328, + "grad_norm": 3.2833423614501953, + "learning_rate": 3.62272240085745e-05, + "loss": 4.1095, + "step": 765 + }, + { + "epoch": 0.8210075026795284, + "grad_norm": 3.2167413234710693, + "learning_rate": 3.601286173633441e-05, + "loss": 4.2657, + "step": 766 + }, + { + "epoch": 0.8220793140407289, + "grad_norm": 2.6981112957000732, + "learning_rate": 3.579849946409432e-05, + "loss": 4.2648, + "step": 767 + }, + { + "epoch": 0.8231511254019293, + "grad_norm": 2.9271039962768555, + "learning_rate": 3.5584137191854235e-05, + "loss": 4.2033, + "step": 768 + }, + { + "epoch": 0.8242229367631297, + "grad_norm": 3.722834587097168, + "learning_rate": 3.5369774919614154e-05, + "loss": 4.6958, + "step": 769 + }, + { + "epoch": 0.8252947481243301, + "grad_norm": 2.129990339279175, + "learning_rate": 3.5155412647374066e-05, + "loss": 4.1381, + "step": 770 + }, + { + "epoch": 0.8263665594855305, + "grad_norm": 5.136198043823242, + "learning_rate": 3.494105037513398e-05, + "loss": 4.6291, + "step": 771 + }, + { + "epoch": 0.827438370846731, + "grad_norm": 2.7707130908966064, + "learning_rate": 3.47266881028939e-05, + "loss": 4.0731, + "step": 772 + }, + { + "epoch": 0.8285101822079314, + "grad_norm": 3.942397117614746, + "learning_rate": 3.451232583065381e-05, + "loss": 5.0691, + "step": 773 + }, + { + "epoch": 0.8295819935691319, + "grad_norm": 2.749673366546631, + "learning_rate": 3.429796355841372e-05, + "loss": 4.6547, + "step": 774 + }, + { + "epoch": 0.8306538049303323, + "grad_norm": 3.3368706703186035, + "learning_rate": 3.4083601286173635e-05, + "loss": 4.2318, + "step": 775 + }, + { + "epoch": 0.8317256162915327, + "grad_norm": 3.024880886077881, + "learning_rate": 3.3869239013933554e-05, + "loss": 4.4178, + "step": 776 + }, + { + "epoch": 0.8327974276527331, + "grad_norm": 2.661226272583008, + "learning_rate": 3.3654876741693466e-05, + "loss": 4.3274, + "step": 777 + }, + { + "epoch": 0.8338692390139335, + "grad_norm": 2.7595772743225098, + "learning_rate": 3.344051446945338e-05, + "loss": 4.5459, + "step": 778 + }, + { + "epoch": 0.834941050375134, + "grad_norm": 2.113373041152954, + "learning_rate": 3.322615219721329e-05, + "loss": 4.3255, + "step": 779 + }, + { + "epoch": 0.8360128617363344, + "grad_norm": 2.7816619873046875, + "learning_rate": 3.301178992497321e-05, + "loss": 4.3316, + "step": 780 + }, + { + "epoch": 0.8370846730975349, + "grad_norm": 3.1241683959960938, + "learning_rate": 3.279742765273312e-05, + "loss": 4.4017, + "step": 781 + }, + { + "epoch": 0.8381564844587353, + "grad_norm": 3.018803119659424, + "learning_rate": 3.2583065380493035e-05, + "loss": 4.1559, + "step": 782 + }, + { + "epoch": 0.8392282958199357, + "grad_norm": 2.408975601196289, + "learning_rate": 3.2368703108252954e-05, + "loss": 4.3322, + "step": 783 + }, + { + "epoch": 0.8403001071811361, + "grad_norm": 2.97987961769104, + "learning_rate": 3.2154340836012867e-05, + "loss": 4.2923, + "step": 784 + }, + { + "epoch": 0.8413719185423365, + "grad_norm": 2.9940474033355713, + "learning_rate": 3.193997856377278e-05, + "loss": 4.4587, + "step": 785 + }, + { + "epoch": 0.842443729903537, + "grad_norm": 2.686958074569702, + "learning_rate": 3.172561629153269e-05, + "loss": 4.4801, + "step": 786 + }, + { + "epoch": 0.8435155412647374, + "grad_norm": 1.6722944974899292, + "learning_rate": 3.151125401929261e-05, + "loss": 4.3032, + "step": 787 + }, + { + "epoch": 0.8445873526259379, + "grad_norm": 4.333474636077881, + "learning_rate": 3.129689174705252e-05, + "loss": 4.409, + "step": 788 + }, + { + "epoch": 0.8456591639871383, + "grad_norm": 3.1574814319610596, + "learning_rate": 3.1082529474812435e-05, + "loss": 4.6209, + "step": 789 + }, + { + "epoch": 0.8467309753483387, + "grad_norm": 2.261394739151001, + "learning_rate": 3.0868167202572354e-05, + "loss": 4.3995, + "step": 790 + }, + { + "epoch": 0.8478027867095391, + "grad_norm": 2.2780826091766357, + "learning_rate": 3.065380493033227e-05, + "loss": 4.2624, + "step": 791 + }, + { + "epoch": 0.8488745980707395, + "grad_norm": 2.5379581451416016, + "learning_rate": 3.0439442658092176e-05, + "loss": 4.5917, + "step": 792 + }, + { + "epoch": 0.84994640943194, + "grad_norm": 3.24279522895813, + "learning_rate": 3.0225080385852088e-05, + "loss": 4.4947, + "step": 793 + }, + { + "epoch": 0.8510182207931404, + "grad_norm": 3.2784323692321777, + "learning_rate": 3.0010718113612007e-05, + "loss": 4.3983, + "step": 794 + }, + { + "epoch": 0.8520900321543409, + "grad_norm": 2.6092724800109863, + "learning_rate": 2.979635584137192e-05, + "loss": 4.1586, + "step": 795 + }, + { + "epoch": 0.8531618435155413, + "grad_norm": 3.3995344638824463, + "learning_rate": 2.9581993569131832e-05, + "loss": 4.5867, + "step": 796 + }, + { + "epoch": 0.8542336548767417, + "grad_norm": 2.998579978942871, + "learning_rate": 2.936763129689175e-05, + "loss": 4.6124, + "step": 797 + }, + { + "epoch": 0.8553054662379421, + "grad_norm": 3.120178699493408, + "learning_rate": 2.9153269024651663e-05, + "loss": 4.5091, + "step": 798 + }, + { + "epoch": 0.8563772775991425, + "grad_norm": 3.543617010116577, + "learning_rate": 2.8938906752411576e-05, + "loss": 4.6284, + "step": 799 + }, + { + "epoch": 0.857449088960343, + "grad_norm": 3.2729685306549072, + "learning_rate": 2.8724544480171488e-05, + "loss": 4.2905, + "step": 800 + }, + { + "epoch": 0.8585209003215434, + "grad_norm": 3.9517574310302734, + "learning_rate": 2.8510182207931407e-05, + "loss": 4.7788, + "step": 801 + }, + { + "epoch": 0.8595927116827439, + "grad_norm": 3.896101713180542, + "learning_rate": 2.829581993569132e-05, + "loss": 4.4362, + "step": 802 + }, + { + "epoch": 0.8606645230439443, + "grad_norm": 3.16723370552063, + "learning_rate": 2.8081457663451232e-05, + "loss": 4.5719, + "step": 803 + }, + { + "epoch": 0.8617363344051447, + "grad_norm": 3.9187586307525635, + "learning_rate": 2.7867095391211144e-05, + "loss": 4.4248, + "step": 804 + }, + { + "epoch": 0.8628081457663451, + "grad_norm": 2.844324827194214, + "learning_rate": 2.7652733118971064e-05, + "loss": 4.3487, + "step": 805 + }, + { + "epoch": 0.8638799571275455, + "grad_norm": 2.617323398590088, + "learning_rate": 2.7438370846730976e-05, + "loss": 4.5764, + "step": 806 + }, + { + "epoch": 0.864951768488746, + "grad_norm": 3.5526764392852783, + "learning_rate": 2.722400857449089e-05, + "loss": 4.5931, + "step": 807 + }, + { + "epoch": 0.8660235798499464, + "grad_norm": 2.6362595558166504, + "learning_rate": 2.7009646302250807e-05, + "loss": 4.3441, + "step": 808 + }, + { + "epoch": 0.8670953912111469, + "grad_norm": 3.744828701019287, + "learning_rate": 2.679528403001072e-05, + "loss": 4.4955, + "step": 809 + }, + { + "epoch": 0.8681672025723473, + "grad_norm": 2.8340702056884766, + "learning_rate": 2.6580921757770632e-05, + "loss": 4.2257, + "step": 810 + }, + { + "epoch": 0.8692390139335477, + "grad_norm": 2.4332375526428223, + "learning_rate": 2.6366559485530545e-05, + "loss": 4.1914, + "step": 811 + }, + { + "epoch": 0.8703108252947481, + "grad_norm": 2.8110697269439697, + "learning_rate": 2.6152197213290464e-05, + "loss": 4.4457, + "step": 812 + }, + { + "epoch": 0.8713826366559485, + "grad_norm": 3.533874273300171, + "learning_rate": 2.5937834941050376e-05, + "loss": 4.7375, + "step": 813 + }, + { + "epoch": 0.872454448017149, + "grad_norm": 3.4596922397613525, + "learning_rate": 2.572347266881029e-05, + "loss": 4.3842, + "step": 814 + }, + { + "epoch": 0.8735262593783494, + "grad_norm": 4.416886329650879, + "learning_rate": 2.5509110396570208e-05, + "loss": 5.1532, + "step": 815 + }, + { + "epoch": 0.8745980707395499, + "grad_norm": 2.818956136703491, + "learning_rate": 2.529474812433012e-05, + "loss": 4.2577, + "step": 816 + }, + { + "epoch": 0.8756698821007503, + "grad_norm": 4.539247989654541, + "learning_rate": 2.5080385852090032e-05, + "loss": 4.3223, + "step": 817 + }, + { + "epoch": 0.8767416934619507, + "grad_norm": 4.091986656188965, + "learning_rate": 2.4866023579849948e-05, + "loss": 4.4761, + "step": 818 + }, + { + "epoch": 0.8778135048231511, + "grad_norm": 2.9086873531341553, + "learning_rate": 2.465166130760986e-05, + "loss": 4.3534, + "step": 819 + }, + { + "epoch": 0.8788853161843515, + "grad_norm": 3.3018639087677, + "learning_rate": 2.4437299035369776e-05, + "loss": 4.487, + "step": 820 + }, + { + "epoch": 0.879957127545552, + "grad_norm": 2.7818877696990967, + "learning_rate": 2.4222936763129692e-05, + "loss": 4.5688, + "step": 821 + }, + { + "epoch": 0.8810289389067524, + "grad_norm": 2.4334094524383545, + "learning_rate": 2.4008574490889604e-05, + "loss": 4.8955, + "step": 822 + }, + { + "epoch": 0.8821007502679529, + "grad_norm": 2.775305986404419, + "learning_rate": 2.379421221864952e-05, + "loss": 4.2866, + "step": 823 + }, + { + "epoch": 0.8831725616291533, + "grad_norm": 3.648191213607788, + "learning_rate": 2.3579849946409433e-05, + "loss": 4.2068, + "step": 824 + }, + { + "epoch": 0.8842443729903537, + "grad_norm": 4.619109630584717, + "learning_rate": 2.3365487674169348e-05, + "loss": 4.4062, + "step": 825 + }, + { + "epoch": 0.8853161843515541, + "grad_norm": 3.174395799636841, + "learning_rate": 2.315112540192926e-05, + "loss": 4.199, + "step": 826 + }, + { + "epoch": 0.8863879957127545, + "grad_norm": 3.6223504543304443, + "learning_rate": 2.2936763129689176e-05, + "loss": 4.3869, + "step": 827 + }, + { + "epoch": 0.887459807073955, + "grad_norm": 3.9439122676849365, + "learning_rate": 2.272240085744909e-05, + "loss": 5.0497, + "step": 828 + }, + { + "epoch": 0.8885316184351554, + "grad_norm": 3.0752944946289062, + "learning_rate": 2.2508038585209005e-05, + "loss": 4.4295, + "step": 829 + }, + { + "epoch": 0.8896034297963559, + "grad_norm": 2.4576616287231445, + "learning_rate": 2.229367631296892e-05, + "loss": 4.7185, + "step": 830 + }, + { + "epoch": 0.8906752411575563, + "grad_norm": 2.447382688522339, + "learning_rate": 2.2079314040728833e-05, + "loss": 4.1357, + "step": 831 + }, + { + "epoch": 0.8917470525187567, + "grad_norm": 2.2739741802215576, + "learning_rate": 2.186495176848875e-05, + "loss": 3.9939, + "step": 832 + }, + { + "epoch": 0.8928188638799571, + "grad_norm": 3.1691207885742188, + "learning_rate": 2.165058949624866e-05, + "loss": 4.354, + "step": 833 + }, + { + "epoch": 0.8938906752411575, + "grad_norm": 2.7398335933685303, + "learning_rate": 2.1436227224008577e-05, + "loss": 4.2388, + "step": 834 + }, + { + "epoch": 0.894962486602358, + "grad_norm": 2.907942056655884, + "learning_rate": 2.122186495176849e-05, + "loss": 4.4821, + "step": 835 + }, + { + "epoch": 0.8960342979635584, + "grad_norm": 3.2182884216308594, + "learning_rate": 2.1007502679528405e-05, + "loss": 4.8471, + "step": 836 + }, + { + "epoch": 0.8971061093247589, + "grad_norm": 3.2632901668548584, + "learning_rate": 2.079314040728832e-05, + "loss": 4.7403, + "step": 837 + }, + { + "epoch": 0.8981779206859593, + "grad_norm": 2.8337507247924805, + "learning_rate": 2.0578778135048233e-05, + "loss": 4.8651, + "step": 838 + }, + { + "epoch": 0.8992497320471597, + "grad_norm": 2.620939254760742, + "learning_rate": 2.036441586280815e-05, + "loss": 4.1015, + "step": 839 + }, + { + "epoch": 0.9003215434083601, + "grad_norm": 3.049180746078491, + "learning_rate": 2.015005359056806e-05, + "loss": 4.6333, + "step": 840 + }, + { + "epoch": 0.9013933547695605, + "grad_norm": 2.508197546005249, + "learning_rate": 1.9935691318327977e-05, + "loss": 4.1966, + "step": 841 + }, + { + "epoch": 0.902465166130761, + "grad_norm": 4.801202297210693, + "learning_rate": 1.972132904608789e-05, + "loss": 4.8339, + "step": 842 + }, + { + "epoch": 0.9035369774919614, + "grad_norm": 2.1153371334075928, + "learning_rate": 1.9506966773847805e-05, + "loss": 4.0906, + "step": 843 + }, + { + "epoch": 0.9046087888531619, + "grad_norm": 4.357201099395752, + "learning_rate": 1.9292604501607717e-05, + "loss": 4.6109, + "step": 844 + }, + { + "epoch": 0.9056806002143623, + "grad_norm": 4.721092224121094, + "learning_rate": 1.9078242229367633e-05, + "loss": 4.1943, + "step": 845 + }, + { + "epoch": 0.9067524115755627, + "grad_norm": 2.453629970550537, + "learning_rate": 1.886387995712755e-05, + "loss": 4.5132, + "step": 846 + }, + { + "epoch": 0.9078242229367631, + "grad_norm": 2.598621129989624, + "learning_rate": 1.864951768488746e-05, + "loss": 4.2245, + "step": 847 + }, + { + "epoch": 0.9088960342979635, + "grad_norm": 2.0761516094207764, + "learning_rate": 1.8435155412647377e-05, + "loss": 4.2331, + "step": 848 + }, + { + "epoch": 0.909967845659164, + "grad_norm": 2.663645029067993, + "learning_rate": 1.822079314040729e-05, + "loss": 4.2554, + "step": 849 + }, + { + "epoch": 0.9110396570203644, + "grad_norm": 2.794361114501953, + "learning_rate": 1.8006430868167205e-05, + "loss": 4.0979, + "step": 850 + }, + { + "epoch": 0.9121114683815649, + "grad_norm": 4.002750396728516, + "learning_rate": 1.7792068595927117e-05, + "loss": 4.156, + "step": 851 + }, + { + "epoch": 0.9131832797427653, + "grad_norm": 2.1327261924743652, + "learning_rate": 1.7577706323687033e-05, + "loss": 4.2994, + "step": 852 + }, + { + "epoch": 0.9142550911039657, + "grad_norm": 2.218019723892212, + "learning_rate": 1.736334405144695e-05, + "loss": 4.2681, + "step": 853 + }, + { + "epoch": 0.9153269024651661, + "grad_norm": 2.634157419204712, + "learning_rate": 1.714898177920686e-05, + "loss": 4.457, + "step": 854 + }, + { + "epoch": 0.9163987138263665, + "grad_norm": 3.584799289703369, + "learning_rate": 1.6934619506966777e-05, + "loss": 4.4086, + "step": 855 + }, + { + "epoch": 0.917470525187567, + "grad_norm": 3.640207052230835, + "learning_rate": 1.672025723472669e-05, + "loss": 4.2253, + "step": 856 + }, + { + "epoch": 0.9185423365487674, + "grad_norm": 2.656949758529663, + "learning_rate": 1.6505894962486605e-05, + "loss": 4.3226, + "step": 857 + }, + { + "epoch": 0.9196141479099679, + "grad_norm": 2.573221445083618, + "learning_rate": 1.6291532690246518e-05, + "loss": 4.6633, + "step": 858 + }, + { + "epoch": 0.9206859592711683, + "grad_norm": 2.5666069984436035, + "learning_rate": 1.6077170418006433e-05, + "loss": 4.4142, + "step": 859 + }, + { + "epoch": 0.9217577706323687, + "grad_norm": 2.6048295497894287, + "learning_rate": 1.5862808145766346e-05, + "loss": 4.5767, + "step": 860 + }, + { + "epoch": 0.9228295819935691, + "grad_norm": 3.020139694213867, + "learning_rate": 1.564844587352626e-05, + "loss": 4.2079, + "step": 861 + }, + { + "epoch": 0.9239013933547695, + "grad_norm": 3.268181085586548, + "learning_rate": 1.5434083601286177e-05, + "loss": 4.2529, + "step": 862 + }, + { + "epoch": 0.92497320471597, + "grad_norm": 1.9898625612258911, + "learning_rate": 1.5219721329046088e-05, + "loss": 4.2561, + "step": 863 + }, + { + "epoch": 0.9260450160771704, + "grad_norm": 2.4188132286071777, + "learning_rate": 1.5005359056806004e-05, + "loss": 4.4644, + "step": 864 + }, + { + "epoch": 0.9271168274383709, + "grad_norm": 3.1744115352630615, + "learning_rate": 1.4790996784565916e-05, + "loss": 4.2628, + "step": 865 + }, + { + "epoch": 0.9281886387995713, + "grad_norm": 3.263568639755249, + "learning_rate": 1.4576634512325832e-05, + "loss": 4.2172, + "step": 866 + }, + { + "epoch": 0.9292604501607717, + "grad_norm": 2.486051082611084, + "learning_rate": 1.4362272240085744e-05, + "loss": 4.1906, + "step": 867 + }, + { + "epoch": 0.9303322615219721, + "grad_norm": 3.1796326637268066, + "learning_rate": 1.414790996784566e-05, + "loss": 4.4266, + "step": 868 + }, + { + "epoch": 0.9314040728831725, + "grad_norm": 4.484216690063477, + "learning_rate": 1.3933547695605572e-05, + "loss": 4.5339, + "step": 869 + }, + { + "epoch": 0.932475884244373, + "grad_norm": 2.6323423385620117, + "learning_rate": 1.3719185423365488e-05, + "loss": 4.3935, + "step": 870 + }, + { + "epoch": 0.9335476956055734, + "grad_norm": 3.191985845565796, + "learning_rate": 1.3504823151125404e-05, + "loss": 4.3227, + "step": 871 + }, + { + "epoch": 0.9346195069667739, + "grad_norm": 4.297986030578613, + "learning_rate": 1.3290460878885316e-05, + "loss": 4.2514, + "step": 872 + }, + { + "epoch": 0.9356913183279743, + "grad_norm": 3.8759825229644775, + "learning_rate": 1.3076098606645232e-05, + "loss": 4.1061, + "step": 873 + }, + { + "epoch": 0.9367631296891747, + "grad_norm": 2.874577283859253, + "learning_rate": 1.2861736334405144e-05, + "loss": 4.3546, + "step": 874 + }, + { + "epoch": 0.9378349410503751, + "grad_norm": 2.8280208110809326, + "learning_rate": 1.264737406216506e-05, + "loss": 4.2303, + "step": 875 + }, + { + "epoch": 0.9389067524115756, + "grad_norm": 5.017232418060303, + "learning_rate": 1.2433011789924974e-05, + "loss": 4.4325, + "step": 876 + }, + { + "epoch": 0.939978563772776, + "grad_norm": 3.0879180431365967, + "learning_rate": 1.2218649517684888e-05, + "loss": 4.3345, + "step": 877 + }, + { + "epoch": 0.9410503751339764, + "grad_norm": 5.276310920715332, + "learning_rate": 1.2004287245444802e-05, + "loss": 4.195, + "step": 878 + }, + { + "epoch": 0.9421221864951769, + "grad_norm": 3.4308736324310303, + "learning_rate": 1.1789924973204716e-05, + "loss": 4.1783, + "step": 879 + }, + { + "epoch": 0.9431939978563773, + "grad_norm": 4.4123430252075195, + "learning_rate": 1.157556270096463e-05, + "loss": 4.5029, + "step": 880 + }, + { + "epoch": 0.9442658092175777, + "grad_norm": 2.261904239654541, + "learning_rate": 1.1361200428724544e-05, + "loss": 4.2614, + "step": 881 + }, + { + "epoch": 0.9453376205787781, + "grad_norm": 2.2538812160491943, + "learning_rate": 1.114683815648446e-05, + "loss": 4.2999, + "step": 882 + }, + { + "epoch": 0.9464094319399786, + "grad_norm": 3.461613893508911, + "learning_rate": 1.0932475884244374e-05, + "loss": 4.6115, + "step": 883 + }, + { + "epoch": 0.947481243301179, + "grad_norm": 3.164299488067627, + "learning_rate": 1.0718113612004288e-05, + "loss": 4.0643, + "step": 884 + }, + { + "epoch": 0.9485530546623794, + "grad_norm": 2.1744544506073, + "learning_rate": 1.0503751339764202e-05, + "loss": 4.3665, + "step": 885 + }, + { + "epoch": 0.9496248660235799, + "grad_norm": 2.9400393962860107, + "learning_rate": 1.0289389067524116e-05, + "loss": 3.9939, + "step": 886 + }, + { + "epoch": 0.9506966773847803, + "grad_norm": 2.9955875873565674, + "learning_rate": 1.007502679528403e-05, + "loss": 4.2878, + "step": 887 + }, + { + "epoch": 0.9517684887459807, + "grad_norm": 2.6207101345062256, + "learning_rate": 9.860664523043945e-06, + "loss": 4.0709, + "step": 888 + }, + { + "epoch": 0.9528403001071811, + "grad_norm": 2.4389564990997314, + "learning_rate": 9.646302250803859e-06, + "loss": 4.3193, + "step": 889 + }, + { + "epoch": 0.9539121114683816, + "grad_norm": 3.457326889038086, + "learning_rate": 9.431939978563774e-06, + "loss": 4.7996, + "step": 890 + }, + { + "epoch": 0.954983922829582, + "grad_norm": 2.875967502593994, + "learning_rate": 9.217577706323688e-06, + "loss": 4.5278, + "step": 891 + }, + { + "epoch": 0.9560557341907824, + "grad_norm": 4.708974838256836, + "learning_rate": 9.003215434083602e-06, + "loss": 4.2928, + "step": 892 + }, + { + "epoch": 0.9571275455519829, + "grad_norm": 2.4193968772888184, + "learning_rate": 8.788853161843517e-06, + "loss": 4.0673, + "step": 893 + }, + { + "epoch": 0.9581993569131833, + "grad_norm": 2.4473676681518555, + "learning_rate": 8.57449088960343e-06, + "loss": 4.3235, + "step": 894 + }, + { + "epoch": 0.9592711682743837, + "grad_norm": 2.7152135372161865, + "learning_rate": 8.360128617363345e-06, + "loss": 4.3448, + "step": 895 + }, + { + "epoch": 0.9603429796355841, + "grad_norm": 2.9722485542297363, + "learning_rate": 8.145766345123259e-06, + "loss": 4.6859, + "step": 896 + }, + { + "epoch": 0.9614147909967846, + "grad_norm": 4.064876079559326, + "learning_rate": 7.931404072883173e-06, + "loss": 4.3012, + "step": 897 + }, + { + "epoch": 0.962486602357985, + "grad_norm": 2.750589370727539, + "learning_rate": 7.717041800643089e-06, + "loss": 4.506, + "step": 898 + }, + { + "epoch": 0.9635584137191854, + "grad_norm": 3.3813419342041016, + "learning_rate": 7.502679528403002e-06, + "loss": 4.2005, + "step": 899 + }, + { + "epoch": 0.9646302250803859, + "grad_norm": 3.6827099323272705, + "learning_rate": 7.288317256162916e-06, + "loss": 4.8387, + "step": 900 + }, + { + "epoch": 0.9657020364415863, + "grad_norm": 2.949695110321045, + "learning_rate": 7.07395498392283e-06, + "loss": 4.3218, + "step": 901 + }, + { + "epoch": 0.9667738478027867, + "grad_norm": 3.40405535697937, + "learning_rate": 6.859592711682744e-06, + "loss": 4.2812, + "step": 902 + }, + { + "epoch": 0.9678456591639871, + "grad_norm": 3.3691530227661133, + "learning_rate": 6.645230439442658e-06, + "loss": 4.2832, + "step": 903 + }, + { + "epoch": 0.9689174705251876, + "grad_norm": 3.336270332336426, + "learning_rate": 6.430868167202572e-06, + "loss": 4.2197, + "step": 904 + }, + { + "epoch": 0.969989281886388, + "grad_norm": 2.827868700027466, + "learning_rate": 6.216505894962487e-06, + "loss": 4.3922, + "step": 905 + }, + { + "epoch": 0.9710610932475884, + "grad_norm": 4.273472309112549, + "learning_rate": 6.002143622722401e-06, + "loss": 4.2416, + "step": 906 + }, + { + "epoch": 0.9721329046087889, + "grad_norm": 2.3178086280822754, + "learning_rate": 5.787781350482315e-06, + "loss": 4.6557, + "step": 907 + }, + { + "epoch": 0.9732047159699893, + "grad_norm": 2.3149664402008057, + "learning_rate": 5.57341907824223e-06, + "loss": 4.2855, + "step": 908 + }, + { + "epoch": 0.9742765273311897, + "grad_norm": 3.162186622619629, + "learning_rate": 5.359056806002144e-06, + "loss": 4.4175, + "step": 909 + }, + { + "epoch": 0.9753483386923901, + "grad_norm": 3.124436378479004, + "learning_rate": 5.144694533762058e-06, + "loss": 4.3642, + "step": 910 + }, + { + "epoch": 0.9764201500535906, + "grad_norm": 3.431694984436035, + "learning_rate": 4.930332261521972e-06, + "loss": 4.5494, + "step": 911 + }, + { + "epoch": 0.977491961414791, + "grad_norm": 4.276769638061523, + "learning_rate": 4.715969989281887e-06, + "loss": 4.1405, + "step": 912 + }, + { + "epoch": 0.9785637727759914, + "grad_norm": 2.1469695568084717, + "learning_rate": 4.501607717041801e-06, + "loss": 4.5683, + "step": 913 + }, + { + "epoch": 0.9796355841371919, + "grad_norm": 4.689246654510498, + "learning_rate": 4.287245444801715e-06, + "loss": 4.4474, + "step": 914 + }, + { + "epoch": 0.9807073954983923, + "grad_norm": 2.666457176208496, + "learning_rate": 4.072883172561629e-06, + "loss": 4.4224, + "step": 915 + }, + { + "epoch": 0.9817792068595927, + "grad_norm": 4.759450435638428, + "learning_rate": 3.858520900321544e-06, + "loss": 4.5016, + "step": 916 + }, + { + "epoch": 0.9828510182207931, + "grad_norm": 4.514771938323975, + "learning_rate": 3.644158628081458e-06, + "loss": 4.4132, + "step": 917 + }, + { + "epoch": 0.9839228295819936, + "grad_norm": 2.440972089767456, + "learning_rate": 3.429796355841372e-06, + "loss": 4.2929, + "step": 918 + }, + { + "epoch": 0.984994640943194, + "grad_norm": 2.3493099212646484, + "learning_rate": 3.215434083601286e-06, + "loss": 4.191, + "step": 919 + }, + { + "epoch": 0.9860664523043944, + "grad_norm": 3.0151264667510986, + "learning_rate": 3.0010718113612005e-06, + "loss": 4.3449, + "step": 920 + }, + { + "epoch": 0.9871382636655949, + "grad_norm": 2.319638252258301, + "learning_rate": 2.786709539121115e-06, + "loss": 4.0492, + "step": 921 + }, + { + "epoch": 0.9882100750267953, + "grad_norm": 1.8680282831192017, + "learning_rate": 2.572347266881029e-06, + "loss": 4.1374, + "step": 922 + }, + { + "epoch": 0.9892818863879957, + "grad_norm": 5.088656425476074, + "learning_rate": 2.3579849946409436e-06, + "loss": 4.3019, + "step": 923 + }, + { + "epoch": 0.9903536977491961, + "grad_norm": 3.4761383533477783, + "learning_rate": 2.1436227224008577e-06, + "loss": 4.3048, + "step": 924 + }, + { + "epoch": 0.9914255091103966, + "grad_norm": 2.1858043670654297, + "learning_rate": 1.929260450160772e-06, + "loss": 4.239, + "step": 925 + }, + { + "epoch": 0.992497320471597, + "grad_norm": 3.2120463848114014, + "learning_rate": 1.714898177920686e-06, + "loss": 4.2429, + "step": 926 + }, + { + "epoch": 0.9935691318327974, + "grad_norm": 2.1242594718933105, + "learning_rate": 1.5005359056806003e-06, + "loss": 4.0712, + "step": 927 + }, + { + "epoch": 0.9946409431939979, + "grad_norm": 2.9433510303497314, + "learning_rate": 1.2861736334405146e-06, + "loss": 4.0293, + "step": 928 + }, + { + "epoch": 0.9957127545551983, + "grad_norm": 4.833005905151367, + "learning_rate": 1.0718113612004288e-06, + "loss": 4.081, + "step": 929 + }, + { + "epoch": 0.9967845659163987, + "grad_norm": 3.332620620727539, + "learning_rate": 8.57449088960343e-07, + "loss": 4.4352, + "step": 930 + }, + { + "epoch": 0.9978563772775991, + "grad_norm": 2.7460505962371826, + "learning_rate": 6.430868167202573e-07, + "loss": 4.176, + "step": 931 + }, + { + "epoch": 0.9989281886387996, + "grad_norm": 3.637892961502075, + "learning_rate": 4.287245444801715e-07, + "loss": 4.3091, + "step": 932 + }, + { + "epoch": 1.0, + "grad_norm": 2.8850045204162598, + "learning_rate": 2.1436227224008575e-07, + "loss": 4.2321, + "step": 933 + } + ], + "logging_steps": 1, + "max_steps": 933, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1048339206640128.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}