diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,29539 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.13208570653647952, + "eval_steps": 500, + "global_step": 3687, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00027122321670735016, + "grad_norm": 39.49641418457031, + "learning_rate": 5.405405405405406e-09, + "loss": 1.4339, + "num_input_tokens_seen": 2097152, + "step": 1 + }, + { + "epoch": 0.0005424464334147003, + "grad_norm": 34.68007278442383, + "learning_rate": 1.0810810810810811e-08, + "loss": 1.4592, + "num_input_tokens_seen": 4194304, + "step": 2 + }, + { + "epoch": 0.0008136696501220504, + "grad_norm": 39.95429992675781, + "learning_rate": 1.6216216216216218e-08, + "loss": 1.5967, + "num_input_tokens_seen": 6291456, + "step": 3 + }, + { + "epoch": 0.0010848928668294006, + "grad_norm": 31.149150848388672, + "learning_rate": 2.1621621621621623e-08, + "loss": 1.2915, + "num_input_tokens_seen": 8388608, + "step": 4 + }, + { + "epoch": 0.0013561160835367507, + "grad_norm": 42.642982482910156, + "learning_rate": 2.7027027027027028e-08, + "loss": 1.8907, + "num_input_tokens_seen": 10485760, + "step": 5 + }, + { + "epoch": 0.0016273393002441008, + "grad_norm": 41.49152755737305, + "learning_rate": 3.2432432432432436e-08, + "loss": 1.5168, + "num_input_tokens_seen": 12582912, + "step": 6 + }, + { + "epoch": 0.001898562516951451, + "grad_norm": 35.22410202026367, + "learning_rate": 3.783783783783784e-08, + "loss": 1.4075, + "num_input_tokens_seen": 14680064, + "step": 7 + }, + { + "epoch": 0.0021697857336588013, + "grad_norm": 34.59489059448242, + "learning_rate": 4.3243243243243246e-08, + "loss": 1.701, + "num_input_tokens_seen": 16777216, + "step": 8 + }, + { + "epoch": 0.0024410089503661514, + "grad_norm": 40.61011505126953, + "learning_rate": 4.864864864864865e-08, + "loss": 2.0642, + "num_input_tokens_seen": 18874368, + "step": 9 + }, + { + "epoch": 0.0027122321670735015, + "grad_norm": 39.64620590209961, + "learning_rate": 5.4054054054054056e-08, + "loss": 1.7744, + "num_input_tokens_seen": 20971520, + "step": 10 + }, + { + "epoch": 0.0029834553837808516, + "grad_norm": 43.06705856323242, + "learning_rate": 5.945945945945946e-08, + "loss": 1.8829, + "num_input_tokens_seen": 23068672, + "step": 11 + }, + { + "epoch": 0.0032546786004882017, + "grad_norm": 33.02948760986328, + "learning_rate": 6.486486486486487e-08, + "loss": 1.4287, + "num_input_tokens_seen": 25165824, + "step": 12 + }, + { + "epoch": 0.003525901817195552, + "grad_norm": 26.8834285736084, + "learning_rate": 7.027027027027027e-08, + "loss": 1.3548, + "num_input_tokens_seen": 27262976, + "step": 13 + }, + { + "epoch": 0.003797125033902902, + "grad_norm": 40.67939376831055, + "learning_rate": 7.567567567567568e-08, + "loss": 1.8238, + "num_input_tokens_seen": 29360128, + "step": 14 + }, + { + "epoch": 0.0040683482506102524, + "grad_norm": 25.111343383789062, + "learning_rate": 8.108108108108108e-08, + "loss": 0.9103, + "num_input_tokens_seen": 31457280, + "step": 15 + }, + { + "epoch": 0.0043395714673176026, + "grad_norm": 35.32511901855469, + "learning_rate": 8.648648648648649e-08, + "loss": 1.2824, + "num_input_tokens_seen": 33554432, + "step": 16 + }, + { + "epoch": 0.004610794684024953, + "grad_norm": 38.915740966796875, + "learning_rate": 9.189189189189189e-08, + "loss": 1.9842, + "num_input_tokens_seen": 35651584, + "step": 17 + }, + { + "epoch": 0.004882017900732303, + "grad_norm": 37.164268493652344, + "learning_rate": 9.72972972972973e-08, + "loss": 1.7929, + "num_input_tokens_seen": 37748736, + "step": 18 + }, + { + "epoch": 0.005153241117439653, + "grad_norm": 24.598073959350586, + "learning_rate": 1.0270270270270271e-07, + "loss": 0.9814, + "num_input_tokens_seen": 39845888, + "step": 19 + }, + { + "epoch": 0.005424464334147003, + "grad_norm": 24.054792404174805, + "learning_rate": 1.0810810810810811e-07, + "loss": 0.8789, + "num_input_tokens_seen": 41943040, + "step": 20 + }, + { + "epoch": 0.005695687550854353, + "grad_norm": 27.910194396972656, + "learning_rate": 1.1351351351351351e-07, + "loss": 1.0153, + "num_input_tokens_seen": 44040192, + "step": 21 + }, + { + "epoch": 0.005966910767561703, + "grad_norm": 35.53153991699219, + "learning_rate": 1.1891891891891891e-07, + "loss": 1.5747, + "num_input_tokens_seen": 46137344, + "step": 22 + }, + { + "epoch": 0.006238133984269053, + "grad_norm": 38.2767219543457, + "learning_rate": 1.2432432432432432e-07, + "loss": 1.6904, + "num_input_tokens_seen": 48234496, + "step": 23 + }, + { + "epoch": 0.006509357200976403, + "grad_norm": 32.410587310791016, + "learning_rate": 1.2972972972972974e-07, + "loss": 1.9401, + "num_input_tokens_seen": 50331648, + "step": 24 + }, + { + "epoch": 0.0067805804176837535, + "grad_norm": 33.3242073059082, + "learning_rate": 1.3513513513513515e-07, + "loss": 1.2135, + "num_input_tokens_seen": 52428800, + "step": 25 + }, + { + "epoch": 0.007051803634391104, + "grad_norm": 25.93454360961914, + "learning_rate": 1.4054054054054055e-07, + "loss": 0.9542, + "num_input_tokens_seen": 54525952, + "step": 26 + }, + { + "epoch": 0.007323026851098454, + "grad_norm": 21.06357192993164, + "learning_rate": 1.4594594594594595e-07, + "loss": 0.6859, + "num_input_tokens_seen": 56623104, + "step": 27 + }, + { + "epoch": 0.007594250067805804, + "grad_norm": 26.06874656677246, + "learning_rate": 1.5135135135135135e-07, + "loss": 0.8169, + "num_input_tokens_seen": 58720256, + "step": 28 + }, + { + "epoch": 0.007865473284513154, + "grad_norm": 29.234174728393555, + "learning_rate": 1.5675675675675675e-07, + "loss": 0.9595, + "num_input_tokens_seen": 60817408, + "step": 29 + }, + { + "epoch": 0.008136696501220505, + "grad_norm": 31.234846115112305, + "learning_rate": 1.6216216216216215e-07, + "loss": 1.3304, + "num_input_tokens_seen": 62914560, + "step": 30 + }, + { + "epoch": 0.008407919717927854, + "grad_norm": 40.34751892089844, + "learning_rate": 1.6756756756756758e-07, + "loss": 0.8478, + "num_input_tokens_seen": 65011712, + "step": 31 + }, + { + "epoch": 0.008679142934635205, + "grad_norm": 27.279531478881836, + "learning_rate": 1.7297297297297298e-07, + "loss": 0.8963, + "num_input_tokens_seen": 67108864, + "step": 32 + }, + { + "epoch": 0.008950366151342554, + "grad_norm": 21.022947311401367, + "learning_rate": 1.7837837837837838e-07, + "loss": 0.556, + "num_input_tokens_seen": 69206016, + "step": 33 + }, + { + "epoch": 0.009221589368049905, + "grad_norm": 37.905052185058594, + "learning_rate": 1.8378378378378379e-07, + "loss": 0.9591, + "num_input_tokens_seen": 71303168, + "step": 34 + }, + { + "epoch": 0.009492812584757255, + "grad_norm": 37.577152252197266, + "learning_rate": 1.891891891891892e-07, + "loss": 1.6584, + "num_input_tokens_seen": 73400320, + "step": 35 + }, + { + "epoch": 0.009764035801464606, + "grad_norm": 28.42804527282715, + "learning_rate": 1.945945945945946e-07, + "loss": 1.0023, + "num_input_tokens_seen": 75497472, + "step": 36 + }, + { + "epoch": 0.010035259018171955, + "grad_norm": 26.009700775146484, + "learning_rate": 2e-07, + "loss": 0.7367, + "num_input_tokens_seen": 77594624, + "step": 37 + }, + { + "epoch": 0.010306482234879306, + "grad_norm": 23.724992752075195, + "learning_rate": 2.0540540540540542e-07, + "loss": 0.7751, + "num_input_tokens_seen": 79691776, + "step": 38 + }, + { + "epoch": 0.010577705451586655, + "grad_norm": 23.584884643554688, + "learning_rate": 2.1081081081081082e-07, + "loss": 0.6391, + "num_input_tokens_seen": 81788928, + "step": 39 + }, + { + "epoch": 0.010848928668294006, + "grad_norm": 36.632781982421875, + "learning_rate": 2.1621621621621622e-07, + "loss": 1.1372, + "num_input_tokens_seen": 83886080, + "step": 40 + }, + { + "epoch": 0.011120151885001357, + "grad_norm": 36.59737777709961, + "learning_rate": 2.2162162162162162e-07, + "loss": 1.3684, + "num_input_tokens_seen": 85983232, + "step": 41 + }, + { + "epoch": 0.011391375101708706, + "grad_norm": 19.298629760742188, + "learning_rate": 2.2702702702702703e-07, + "loss": 0.5202, + "num_input_tokens_seen": 88080384, + "step": 42 + }, + { + "epoch": 0.011662598318416057, + "grad_norm": 41.80552291870117, + "learning_rate": 2.3243243243243243e-07, + "loss": 0.84, + "num_input_tokens_seen": 90177536, + "step": 43 + }, + { + "epoch": 0.011933821535123406, + "grad_norm": 23.654796600341797, + "learning_rate": 2.3783783783783783e-07, + "loss": 0.6959, + "num_input_tokens_seen": 92274688, + "step": 44 + }, + { + "epoch": 0.012205044751830757, + "grad_norm": 20.645301818847656, + "learning_rate": 2.4324324324324326e-07, + "loss": 0.5908, + "num_input_tokens_seen": 94371840, + "step": 45 + }, + { + "epoch": 0.012476267968538107, + "grad_norm": 30.316768646240234, + "learning_rate": 2.4864864864864863e-07, + "loss": 0.9048, + "num_input_tokens_seen": 96468992, + "step": 46 + }, + { + "epoch": 0.012747491185245458, + "grad_norm": 37.43491744995117, + "learning_rate": 2.5405405405405406e-07, + "loss": 0.9024, + "num_input_tokens_seen": 98566144, + "step": 47 + }, + { + "epoch": 0.013018714401952807, + "grad_norm": 38.99563980102539, + "learning_rate": 2.594594594594595e-07, + "loss": 0.9539, + "num_input_tokens_seen": 100663296, + "step": 48 + }, + { + "epoch": 0.013289937618660158, + "grad_norm": 22.807531356811523, + "learning_rate": 2.6486486486486486e-07, + "loss": 0.5623, + "num_input_tokens_seen": 102760448, + "step": 49 + }, + { + "epoch": 0.013561160835367507, + "grad_norm": 24.203222274780273, + "learning_rate": 2.702702702702703e-07, + "loss": 0.5913, + "num_input_tokens_seen": 104857600, + "step": 50 + }, + { + "epoch": 0.013832384052074858, + "grad_norm": 24.680194854736328, + "learning_rate": 2.7567567567567567e-07, + "loss": 0.4792, + "num_input_tokens_seen": 106954752, + "step": 51 + }, + { + "epoch": 0.014103607268782207, + "grad_norm": 22.284217834472656, + "learning_rate": 2.810810810810811e-07, + "loss": 0.4839, + "num_input_tokens_seen": 109051904, + "step": 52 + }, + { + "epoch": 0.014374830485489558, + "grad_norm": 40.31373596191406, + "learning_rate": 2.8648648648648647e-07, + "loss": 1.8157, + "num_input_tokens_seen": 111149056, + "step": 53 + }, + { + "epoch": 0.014646053702196907, + "grad_norm": 23.11035919189453, + "learning_rate": 2.918918918918919e-07, + "loss": 0.5529, + "num_input_tokens_seen": 113246208, + "step": 54 + }, + { + "epoch": 0.014917276918904258, + "grad_norm": 38.69032287597656, + "learning_rate": 2.972972972972973e-07, + "loss": 1.1754, + "num_input_tokens_seen": 115343360, + "step": 55 + }, + { + "epoch": 0.015188500135611608, + "grad_norm": 18.51485824584961, + "learning_rate": 3.027027027027027e-07, + "loss": 0.4798, + "num_input_tokens_seen": 117440512, + "step": 56 + }, + { + "epoch": 0.015459723352318959, + "grad_norm": 27.4355411529541, + "learning_rate": 3.0810810810810813e-07, + "loss": 0.9573, + "num_input_tokens_seen": 119537664, + "step": 57 + }, + { + "epoch": 0.015730946569026308, + "grad_norm": 24.34264373779297, + "learning_rate": 3.135135135135135e-07, + "loss": 0.4343, + "num_input_tokens_seen": 121634816, + "step": 58 + }, + { + "epoch": 0.01600216978573366, + "grad_norm": 23.938716888427734, + "learning_rate": 3.1891891891891893e-07, + "loss": 0.6501, + "num_input_tokens_seen": 123731968, + "step": 59 + }, + { + "epoch": 0.01627339300244101, + "grad_norm": 20.936176300048828, + "learning_rate": 3.243243243243243e-07, + "loss": 0.4849, + "num_input_tokens_seen": 125829120, + "step": 60 + }, + { + "epoch": 0.01654461621914836, + "grad_norm": 32.816497802734375, + "learning_rate": 3.2972972972972973e-07, + "loss": 0.9031, + "num_input_tokens_seen": 127926272, + "step": 61 + }, + { + "epoch": 0.016815839435855708, + "grad_norm": 26.125568389892578, + "learning_rate": 3.3513513513513516e-07, + "loss": 0.6413, + "num_input_tokens_seen": 130023424, + "step": 62 + }, + { + "epoch": 0.01708706265256306, + "grad_norm": 24.256683349609375, + "learning_rate": 3.4054054054054054e-07, + "loss": 0.8662, + "num_input_tokens_seen": 132120576, + "step": 63 + }, + { + "epoch": 0.01735828586927041, + "grad_norm": 23.595643997192383, + "learning_rate": 3.4594594594594597e-07, + "loss": 0.5304, + "num_input_tokens_seen": 134217728, + "step": 64 + }, + { + "epoch": 0.01762950908597776, + "grad_norm": 21.607576370239258, + "learning_rate": 3.5135135135135134e-07, + "loss": 0.6673, + "num_input_tokens_seen": 136314880, + "step": 65 + }, + { + "epoch": 0.01790073230268511, + "grad_norm": 21.87681007385254, + "learning_rate": 3.5675675675675677e-07, + "loss": 0.544, + "num_input_tokens_seen": 138412032, + "step": 66 + }, + { + "epoch": 0.01817195551939246, + "grad_norm": 25.676681518554688, + "learning_rate": 3.6216216216216214e-07, + "loss": 0.972, + "num_input_tokens_seen": 140509184, + "step": 67 + }, + { + "epoch": 0.01844317873609981, + "grad_norm": 23.464296340942383, + "learning_rate": 3.6756756756756757e-07, + "loss": 0.8916, + "num_input_tokens_seen": 142606336, + "step": 68 + }, + { + "epoch": 0.01871440195280716, + "grad_norm": 30.373069763183594, + "learning_rate": 3.72972972972973e-07, + "loss": 1.0737, + "num_input_tokens_seen": 144703488, + "step": 69 + }, + { + "epoch": 0.01898562516951451, + "grad_norm": 21.44524574279785, + "learning_rate": 3.783783783783784e-07, + "loss": 0.4633, + "num_input_tokens_seen": 146800640, + "step": 70 + }, + { + "epoch": 0.01925684838622186, + "grad_norm": 21.777481079101562, + "learning_rate": 3.837837837837838e-07, + "loss": 0.6957, + "num_input_tokens_seen": 148897792, + "step": 71 + }, + { + "epoch": 0.01952807160292921, + "grad_norm": 36.132041931152344, + "learning_rate": 3.891891891891892e-07, + "loss": 1.2758, + "num_input_tokens_seen": 150994944, + "step": 72 + }, + { + "epoch": 0.019799294819636562, + "grad_norm": 28.66299819946289, + "learning_rate": 3.945945945945946e-07, + "loss": 0.6293, + "num_input_tokens_seen": 153092096, + "step": 73 + }, + { + "epoch": 0.02007051803634391, + "grad_norm": 19.70673942565918, + "learning_rate": 4e-07, + "loss": 0.4103, + "num_input_tokens_seen": 155189248, + "step": 74 + }, + { + "epoch": 0.02034174125305126, + "grad_norm": 20.52765655517578, + "learning_rate": 4.054054054054054e-07, + "loss": 0.538, + "num_input_tokens_seen": 157286400, + "step": 75 + }, + { + "epoch": 0.02061296446975861, + "grad_norm": 22.38905906677246, + "learning_rate": 4.1081081081081084e-07, + "loss": 0.4337, + "num_input_tokens_seen": 159383552, + "step": 76 + }, + { + "epoch": 0.020884187686465962, + "grad_norm": 17.63435173034668, + "learning_rate": 4.162162162162162e-07, + "loss": 0.4086, + "num_input_tokens_seen": 161480704, + "step": 77 + }, + { + "epoch": 0.02115541090317331, + "grad_norm": 27.260120391845703, + "learning_rate": 4.2162162162162164e-07, + "loss": 0.7754, + "num_input_tokens_seen": 163577856, + "step": 78 + }, + { + "epoch": 0.02142663411988066, + "grad_norm": 21.357715606689453, + "learning_rate": 4.27027027027027e-07, + "loss": 0.4759, + "num_input_tokens_seen": 165675008, + "step": 79 + }, + { + "epoch": 0.021697857336588012, + "grad_norm": 19.342985153198242, + "learning_rate": 4.3243243243243244e-07, + "loss": 0.536, + "num_input_tokens_seen": 167772160, + "step": 80 + }, + { + "epoch": 0.021969080553295363, + "grad_norm": 25.34566879272461, + "learning_rate": 4.378378378378378e-07, + "loss": 0.4032, + "num_input_tokens_seen": 169869312, + "step": 81 + }, + { + "epoch": 0.022240303770002714, + "grad_norm": 16.721052169799805, + "learning_rate": 4.4324324324324325e-07, + "loss": 0.4892, + "num_input_tokens_seen": 171966464, + "step": 82 + }, + { + "epoch": 0.02251152698671006, + "grad_norm": 28.31974983215332, + "learning_rate": 4.486486486486487e-07, + "loss": 0.6054, + "num_input_tokens_seen": 174063616, + "step": 83 + }, + { + "epoch": 0.022782750203417412, + "grad_norm": 29.826642990112305, + "learning_rate": 4.5405405405405405e-07, + "loss": 0.7516, + "num_input_tokens_seen": 176160768, + "step": 84 + }, + { + "epoch": 0.023053973420124763, + "grad_norm": 24.864288330078125, + "learning_rate": 4.594594594594595e-07, + "loss": 0.8178, + "num_input_tokens_seen": 178257920, + "step": 85 + }, + { + "epoch": 0.023325196636832114, + "grad_norm": 20.49217414855957, + "learning_rate": 4.6486486486486485e-07, + "loss": 0.6725, + "num_input_tokens_seen": 180355072, + "step": 86 + }, + { + "epoch": 0.023596419853539462, + "grad_norm": 32.16914367675781, + "learning_rate": 4.702702702702703e-07, + "loss": 0.8543, + "num_input_tokens_seen": 182452224, + "step": 87 + }, + { + "epoch": 0.023867643070246813, + "grad_norm": 28.581802368164062, + "learning_rate": 4.7567567567567566e-07, + "loss": 0.5865, + "num_input_tokens_seen": 184549376, + "step": 88 + }, + { + "epoch": 0.024138866286954164, + "grad_norm": 27.280996322631836, + "learning_rate": 4.810810810810811e-07, + "loss": 0.9451, + "num_input_tokens_seen": 186646528, + "step": 89 + }, + { + "epoch": 0.024410089503661515, + "grad_norm": 24.71245765686035, + "learning_rate": 4.864864864864865e-07, + "loss": 0.6835, + "num_input_tokens_seen": 188743680, + "step": 90 + }, + { + "epoch": 0.024681312720368862, + "grad_norm": 18.934518814086914, + "learning_rate": 4.918918918918919e-07, + "loss": 0.3742, + "num_input_tokens_seen": 190840832, + "step": 91 + }, + { + "epoch": 0.024952535937076213, + "grad_norm": 22.003257751464844, + "learning_rate": 4.972972972972973e-07, + "loss": 0.5762, + "num_input_tokens_seen": 192937984, + "step": 92 + }, + { + "epoch": 0.025223759153783564, + "grad_norm": 14.231109619140625, + "learning_rate": 5.027027027027027e-07, + "loss": 0.2882, + "num_input_tokens_seen": 195035136, + "step": 93 + }, + { + "epoch": 0.025494982370490915, + "grad_norm": 26.178083419799805, + "learning_rate": 5.081081081081081e-07, + "loss": 0.829, + "num_input_tokens_seen": 197132288, + "step": 94 + }, + { + "epoch": 0.025766205587198263, + "grad_norm": 26.79072380065918, + "learning_rate": 5.135135135135134e-07, + "loss": 0.6192, + "num_input_tokens_seen": 199229440, + "step": 95 + }, + { + "epoch": 0.026037428803905614, + "grad_norm": 27.17604637145996, + "learning_rate": 5.18918918918919e-07, + "loss": 0.7937, + "num_input_tokens_seen": 201326592, + "step": 96 + }, + { + "epoch": 0.026308652020612965, + "grad_norm": 23.408369064331055, + "learning_rate": 5.243243243243243e-07, + "loss": 0.5996, + "num_input_tokens_seen": 203423744, + "step": 97 + }, + { + "epoch": 0.026579875237320316, + "grad_norm": 30.211393356323242, + "learning_rate": 5.297297297297297e-07, + "loss": 0.8991, + "num_input_tokens_seen": 205520896, + "step": 98 + }, + { + "epoch": 0.026851098454027666, + "grad_norm": 38.316158294677734, + "learning_rate": 5.35135135135135e-07, + "loss": 1.6948, + "num_input_tokens_seen": 207618048, + "step": 99 + }, + { + "epoch": 0.027122321670735014, + "grad_norm": 17.80075454711914, + "learning_rate": 5.405405405405406e-07, + "loss": 0.4548, + "num_input_tokens_seen": 209715200, + "step": 100 + }, + { + "epoch": 0.027393544887442365, + "grad_norm": 20.674842834472656, + "learning_rate": 5.459459459459459e-07, + "loss": 0.4327, + "num_input_tokens_seen": 211812352, + "step": 101 + }, + { + "epoch": 0.027664768104149716, + "grad_norm": 18.368560791015625, + "learning_rate": 5.513513513513513e-07, + "loss": 0.4917, + "num_input_tokens_seen": 213909504, + "step": 102 + }, + { + "epoch": 0.027935991320857067, + "grad_norm": 27.678592681884766, + "learning_rate": 5.567567567567567e-07, + "loss": 0.8262, + "num_input_tokens_seen": 216006656, + "step": 103 + }, + { + "epoch": 0.028207214537564414, + "grad_norm": 23.276077270507812, + "learning_rate": 5.621621621621622e-07, + "loss": 0.5294, + "num_input_tokens_seen": 218103808, + "step": 104 + }, + { + "epoch": 0.028478437754271765, + "grad_norm": 15.913488388061523, + "learning_rate": 5.675675675675675e-07, + "loss": 0.4865, + "num_input_tokens_seen": 220200960, + "step": 105 + }, + { + "epoch": 0.028749660970979116, + "grad_norm": 21.9434871673584, + "learning_rate": 5.729729729729729e-07, + "loss": 0.5292, + "num_input_tokens_seen": 222298112, + "step": 106 + }, + { + "epoch": 0.029020884187686467, + "grad_norm": 23.412330627441406, + "learning_rate": 5.783783783783784e-07, + "loss": 0.7398, + "num_input_tokens_seen": 224395264, + "step": 107 + }, + { + "epoch": 0.029292107404393815, + "grad_norm": 18.569114685058594, + "learning_rate": 5.837837837837838e-07, + "loss": 0.5104, + "num_input_tokens_seen": 226492416, + "step": 108 + }, + { + "epoch": 0.029563330621101166, + "grad_norm": 24.90005874633789, + "learning_rate": 5.891891891891891e-07, + "loss": 0.7079, + "num_input_tokens_seen": 228589568, + "step": 109 + }, + { + "epoch": 0.029834553837808517, + "grad_norm": 28.545652389526367, + "learning_rate": 5.945945945945947e-07, + "loss": 1.0792, + "num_input_tokens_seen": 230686720, + "step": 110 + }, + { + "epoch": 0.030105777054515868, + "grad_norm": 24.83963394165039, + "learning_rate": 6e-07, + "loss": 0.8437, + "num_input_tokens_seen": 232783872, + "step": 111 + }, + { + "epoch": 0.030377000271223215, + "grad_norm": 24.97904396057129, + "learning_rate": 6.054054054054054e-07, + "loss": 1.014, + "num_input_tokens_seen": 234881024, + "step": 112 + }, + { + "epoch": 0.030648223487930566, + "grad_norm": 20.15316390991211, + "learning_rate": 6.108108108108107e-07, + "loss": 0.4607, + "num_input_tokens_seen": 236978176, + "step": 113 + }, + { + "epoch": 0.030919446704637917, + "grad_norm": 17.138620376586914, + "learning_rate": 6.162162162162163e-07, + "loss": 0.4309, + "num_input_tokens_seen": 239075328, + "step": 114 + }, + { + "epoch": 0.031190669921345268, + "grad_norm": 21.900487899780273, + "learning_rate": 6.216216216216216e-07, + "loss": 0.57, + "num_input_tokens_seen": 241172480, + "step": 115 + }, + { + "epoch": 0.031461893138052616, + "grad_norm": 17.815444946289062, + "learning_rate": 6.27027027027027e-07, + "loss": 0.4261, + "num_input_tokens_seen": 243269632, + "step": 116 + }, + { + "epoch": 0.03173311635475997, + "grad_norm": 27.65782928466797, + "learning_rate": 6.324324324324324e-07, + "loss": 0.6149, + "num_input_tokens_seen": 245366784, + "step": 117 + }, + { + "epoch": 0.03200433957146732, + "grad_norm": 22.965322494506836, + "learning_rate": 6.378378378378379e-07, + "loss": 0.7231, + "num_input_tokens_seen": 247463936, + "step": 118 + }, + { + "epoch": 0.032275562788174665, + "grad_norm": 24.27642250061035, + "learning_rate": 6.432432432432432e-07, + "loss": 0.9918, + "num_input_tokens_seen": 249561088, + "step": 119 + }, + { + "epoch": 0.03254678600488202, + "grad_norm": 19.986236572265625, + "learning_rate": 6.486486486486486e-07, + "loss": 0.4824, + "num_input_tokens_seen": 251658240, + "step": 120 + }, + { + "epoch": 0.03281800922158937, + "grad_norm": 28.109073638916016, + "learning_rate": 6.54054054054054e-07, + "loss": 0.9029, + "num_input_tokens_seen": 253755392, + "step": 121 + }, + { + "epoch": 0.03308923243829672, + "grad_norm": 23.587142944335938, + "learning_rate": 6.594594594594595e-07, + "loss": 0.8004, + "num_input_tokens_seen": 255852544, + "step": 122 + }, + { + "epoch": 0.03336045565500407, + "grad_norm": 30.206212997436523, + "learning_rate": 6.648648648648648e-07, + "loss": 0.6818, + "num_input_tokens_seen": 257949696, + "step": 123 + }, + { + "epoch": 0.033631678871711417, + "grad_norm": 25.182830810546875, + "learning_rate": 6.702702702702703e-07, + "loss": 0.8508, + "num_input_tokens_seen": 260046848, + "step": 124 + }, + { + "epoch": 0.03390290208841877, + "grad_norm": 26.919660568237305, + "learning_rate": 6.756756756756756e-07, + "loss": 0.4411, + "num_input_tokens_seen": 262144000, + "step": 125 + }, + { + "epoch": 0.03417412530512612, + "grad_norm": 23.468584060668945, + "learning_rate": 6.810810810810811e-07, + "loss": 0.8003, + "num_input_tokens_seen": 264241152, + "step": 126 + }, + { + "epoch": 0.034445348521833466, + "grad_norm": 18.447784423828125, + "learning_rate": 6.864864864864864e-07, + "loss": 0.5526, + "num_input_tokens_seen": 266338304, + "step": 127 + }, + { + "epoch": 0.03471657173854082, + "grad_norm": 25.12859535217285, + "learning_rate": 6.918918918918919e-07, + "loss": 0.8309, + "num_input_tokens_seen": 268435456, + "step": 128 + }, + { + "epoch": 0.03498779495524817, + "grad_norm": 29.933900833129883, + "learning_rate": 6.972972972972973e-07, + "loss": 0.9983, + "num_input_tokens_seen": 270532608, + "step": 129 + }, + { + "epoch": 0.03525901817195552, + "grad_norm": 20.363910675048828, + "learning_rate": 7.027027027027027e-07, + "loss": 0.655, + "num_input_tokens_seen": 272629760, + "step": 130 + }, + { + "epoch": 0.03553024138866287, + "grad_norm": 25.439632415771484, + "learning_rate": 7.081081081081081e-07, + "loss": 0.5726, + "num_input_tokens_seen": 274726912, + "step": 131 + }, + { + "epoch": 0.03580146460537022, + "grad_norm": 40.57099914550781, + "learning_rate": 7.135135135135135e-07, + "loss": 1.3098, + "num_input_tokens_seen": 276824064, + "step": 132 + }, + { + "epoch": 0.03607268782207757, + "grad_norm": 23.27831268310547, + "learning_rate": 7.189189189189189e-07, + "loss": 0.555, + "num_input_tokens_seen": 278921216, + "step": 133 + }, + { + "epoch": 0.03634391103878492, + "grad_norm": 31.733198165893555, + "learning_rate": 7.243243243243243e-07, + "loss": 0.7732, + "num_input_tokens_seen": 281018368, + "step": 134 + }, + { + "epoch": 0.03661513425549227, + "grad_norm": 34.08761215209961, + "learning_rate": 7.297297297297297e-07, + "loss": 1.1484, + "num_input_tokens_seen": 283115520, + "step": 135 + }, + { + "epoch": 0.03688635747219962, + "grad_norm": 16.70537567138672, + "learning_rate": 7.351351351351351e-07, + "loss": 0.3605, + "num_input_tokens_seen": 285212672, + "step": 136 + }, + { + "epoch": 0.03715758068890697, + "grad_norm": 22.29771614074707, + "learning_rate": 7.405405405405405e-07, + "loss": 0.6014, + "num_input_tokens_seen": 287309824, + "step": 137 + }, + { + "epoch": 0.03742880390561432, + "grad_norm": 18.111330032348633, + "learning_rate": 7.45945945945946e-07, + "loss": 0.5774, + "num_input_tokens_seen": 289406976, + "step": 138 + }, + { + "epoch": 0.03770002712232167, + "grad_norm": 16.711551666259766, + "learning_rate": 7.513513513513513e-07, + "loss": 0.3626, + "num_input_tokens_seen": 291504128, + "step": 139 + }, + { + "epoch": 0.03797125033902902, + "grad_norm": 18.173215866088867, + "learning_rate": 7.567567567567568e-07, + "loss": 0.5264, + "num_input_tokens_seen": 293601280, + "step": 140 + }, + { + "epoch": 0.03824247355573637, + "grad_norm": 15.02091121673584, + "learning_rate": 7.621621621621621e-07, + "loss": 0.323, + "num_input_tokens_seen": 295698432, + "step": 141 + }, + { + "epoch": 0.03851369677244372, + "grad_norm": 18.926862716674805, + "learning_rate": 7.675675675675676e-07, + "loss": 0.3422, + "num_input_tokens_seen": 297795584, + "step": 142 + }, + { + "epoch": 0.038784919989151075, + "grad_norm": 20.333032608032227, + "learning_rate": 7.729729729729729e-07, + "loss": 0.6574, + "num_input_tokens_seen": 299892736, + "step": 143 + }, + { + "epoch": 0.03905614320585842, + "grad_norm": 24.684614181518555, + "learning_rate": 7.783783783783784e-07, + "loss": 0.7197, + "num_input_tokens_seen": 301989888, + "step": 144 + }, + { + "epoch": 0.03932736642256577, + "grad_norm": 28.73190689086914, + "learning_rate": 7.837837837837838e-07, + "loss": 1.0523, + "num_input_tokens_seen": 304087040, + "step": 145 + }, + { + "epoch": 0.039598589639273124, + "grad_norm": 24.374296188354492, + "learning_rate": 7.891891891891892e-07, + "loss": 0.5069, + "num_input_tokens_seen": 306184192, + "step": 146 + }, + { + "epoch": 0.03986981285598047, + "grad_norm": 21.409297943115234, + "learning_rate": 7.945945945945945e-07, + "loss": 0.5361, + "num_input_tokens_seen": 308281344, + "step": 147 + }, + { + "epoch": 0.04014103607268782, + "grad_norm": 24.508695602416992, + "learning_rate": 8e-07, + "loss": 0.7915, + "num_input_tokens_seen": 310378496, + "step": 148 + }, + { + "epoch": 0.040412259289395173, + "grad_norm": 20.341073989868164, + "learning_rate": 8.054054054054054e-07, + "loss": 0.4466, + "num_input_tokens_seen": 312475648, + "step": 149 + }, + { + "epoch": 0.04068348250610252, + "grad_norm": 23.124542236328125, + "learning_rate": 8.108108108108108e-07, + "loss": 0.6242, + "num_input_tokens_seen": 314572800, + "step": 150 + }, + { + "epoch": 0.040954705722809875, + "grad_norm": 21.62948226928711, + "learning_rate": 8.162162162162161e-07, + "loss": 0.811, + "num_input_tokens_seen": 316669952, + "step": 151 + }, + { + "epoch": 0.04122592893951722, + "grad_norm": 22.204092025756836, + "learning_rate": 8.216216216216217e-07, + "loss": 0.6949, + "num_input_tokens_seen": 318767104, + "step": 152 + }, + { + "epoch": 0.04149715215622457, + "grad_norm": 24.287416458129883, + "learning_rate": 8.27027027027027e-07, + "loss": 0.5292, + "num_input_tokens_seen": 320864256, + "step": 153 + }, + { + "epoch": 0.041768375372931925, + "grad_norm": 20.532560348510742, + "learning_rate": 8.324324324324324e-07, + "loss": 0.5385, + "num_input_tokens_seen": 322961408, + "step": 154 + }, + { + "epoch": 0.04203959858963927, + "grad_norm": 24.48872184753418, + "learning_rate": 8.378378378378377e-07, + "loss": 0.9439, + "num_input_tokens_seen": 325058560, + "step": 155 + }, + { + "epoch": 0.04231082180634662, + "grad_norm": 23.042749404907227, + "learning_rate": 8.432432432432433e-07, + "loss": 0.7453, + "num_input_tokens_seen": 327155712, + "step": 156 + }, + { + "epoch": 0.042582045023053974, + "grad_norm": 22.487960815429688, + "learning_rate": 8.486486486486486e-07, + "loss": 0.7324, + "num_input_tokens_seen": 329252864, + "step": 157 + }, + { + "epoch": 0.04285326823976132, + "grad_norm": 24.17815589904785, + "learning_rate": 8.54054054054054e-07, + "loss": 0.7878, + "num_input_tokens_seen": 331350016, + "step": 158 + }, + { + "epoch": 0.043124491456468676, + "grad_norm": 18.532455444335938, + "learning_rate": 8.594594594594595e-07, + "loss": 0.5971, + "num_input_tokens_seen": 333447168, + "step": 159 + }, + { + "epoch": 0.043395714673176024, + "grad_norm": 25.64451026916504, + "learning_rate": 8.648648648648649e-07, + "loss": 0.9119, + "num_input_tokens_seen": 335544320, + "step": 160 + }, + { + "epoch": 0.04366693788988337, + "grad_norm": 17.135353088378906, + "learning_rate": 8.702702702702702e-07, + "loss": 0.5172, + "num_input_tokens_seen": 337641472, + "step": 161 + }, + { + "epoch": 0.043938161106590726, + "grad_norm": 24.14339828491211, + "learning_rate": 8.756756756756756e-07, + "loss": 0.8312, + "num_input_tokens_seen": 339738624, + "step": 162 + }, + { + "epoch": 0.04420938432329807, + "grad_norm": 28.03148078918457, + "learning_rate": 8.810810810810811e-07, + "loss": 1.1445, + "num_input_tokens_seen": 341835776, + "step": 163 + }, + { + "epoch": 0.04448060754000543, + "grad_norm": 16.04241180419922, + "learning_rate": 8.864864864864865e-07, + "loss": 0.3395, + "num_input_tokens_seen": 343932928, + "step": 164 + }, + { + "epoch": 0.044751830756712775, + "grad_norm": 25.558391571044922, + "learning_rate": 8.918918918918918e-07, + "loss": 0.6255, + "num_input_tokens_seen": 346030080, + "step": 165 + }, + { + "epoch": 0.04502305397342012, + "grad_norm": 19.032011032104492, + "learning_rate": 8.972972972972974e-07, + "loss": 0.578, + "num_input_tokens_seen": 348127232, + "step": 166 + }, + { + "epoch": 0.04529427719012748, + "grad_norm": 16.146650314331055, + "learning_rate": 9.027027027027027e-07, + "loss": 0.3783, + "num_input_tokens_seen": 350224384, + "step": 167 + }, + { + "epoch": 0.045565500406834825, + "grad_norm": 20.948945999145508, + "learning_rate": 9.081081081081081e-07, + "loss": 0.5665, + "num_input_tokens_seen": 352321536, + "step": 168 + }, + { + "epoch": 0.04583672362354217, + "grad_norm": 18.601686477661133, + "learning_rate": 9.135135135135134e-07, + "loss": 0.5568, + "num_input_tokens_seen": 354418688, + "step": 169 + }, + { + "epoch": 0.04610794684024953, + "grad_norm": 21.014583587646484, + "learning_rate": 9.18918918918919e-07, + "loss": 0.6105, + "num_input_tokens_seen": 356515840, + "step": 170 + }, + { + "epoch": 0.046379170056956874, + "grad_norm": 22.21685218811035, + "learning_rate": 9.243243243243243e-07, + "loss": 0.5955, + "num_input_tokens_seen": 358612992, + "step": 171 + }, + { + "epoch": 0.04665039327366423, + "grad_norm": 18.437280654907227, + "learning_rate": 9.297297297297297e-07, + "loss": 0.5311, + "num_input_tokens_seen": 360710144, + "step": 172 + }, + { + "epoch": 0.046921616490371576, + "grad_norm": 26.544940948486328, + "learning_rate": 9.351351351351351e-07, + "loss": 0.4681, + "num_input_tokens_seen": 362807296, + "step": 173 + }, + { + "epoch": 0.047192839707078924, + "grad_norm": 19.14557647705078, + "learning_rate": 9.405405405405406e-07, + "loss": 0.5458, + "num_input_tokens_seen": 364904448, + "step": 174 + }, + { + "epoch": 0.04746406292378628, + "grad_norm": 30.64264488220215, + "learning_rate": 9.459459459459459e-07, + "loss": 0.4979, + "num_input_tokens_seen": 367001600, + "step": 175 + }, + { + "epoch": 0.047735286140493625, + "grad_norm": 23.881771087646484, + "learning_rate": 9.513513513513513e-07, + "loss": 0.7094, + "num_input_tokens_seen": 369098752, + "step": 176 + }, + { + "epoch": 0.04800650935720097, + "grad_norm": 25.534412384033203, + "learning_rate": 9.567567567567567e-07, + "loss": 0.7444, + "num_input_tokens_seen": 371195904, + "step": 177 + }, + { + "epoch": 0.04827773257390833, + "grad_norm": 29.173614501953125, + "learning_rate": 9.621621621621622e-07, + "loss": 0.7494, + "num_input_tokens_seen": 373293056, + "step": 178 + }, + { + "epoch": 0.048548955790615675, + "grad_norm": 29.733861923217773, + "learning_rate": 9.675675675675676e-07, + "loss": 0.908, + "num_input_tokens_seen": 375390208, + "step": 179 + }, + { + "epoch": 0.04882017900732303, + "grad_norm": 18.181663513183594, + "learning_rate": 9.72972972972973e-07, + "loss": 0.3587, + "num_input_tokens_seen": 377487360, + "step": 180 + }, + { + "epoch": 0.04909140222403038, + "grad_norm": 18.831315994262695, + "learning_rate": 9.783783783783782e-07, + "loss": 0.5482, + "num_input_tokens_seen": 379584512, + "step": 181 + }, + { + "epoch": 0.049362625440737724, + "grad_norm": 20.908985137939453, + "learning_rate": 9.837837837837839e-07, + "loss": 0.3593, + "num_input_tokens_seen": 381681664, + "step": 182 + }, + { + "epoch": 0.04963384865744508, + "grad_norm": 24.25267791748047, + "learning_rate": 9.89189189189189e-07, + "loss": 0.7042, + "num_input_tokens_seen": 383778816, + "step": 183 + }, + { + "epoch": 0.049905071874152426, + "grad_norm": 19.45100212097168, + "learning_rate": 9.945945945945945e-07, + "loss": 0.4047, + "num_input_tokens_seen": 385875968, + "step": 184 + }, + { + "epoch": 0.05017629509085978, + "grad_norm": 19.713361740112305, + "learning_rate": 1e-06, + "loss": 0.4491, + "num_input_tokens_seen": 387973120, + "step": 185 + }, + { + "epoch": 0.05044751830756713, + "grad_norm": 19.20697784423828, + "learning_rate": 9.99999818928562e-07, + "loss": 0.5432, + "num_input_tokens_seen": 390070272, + "step": 186 + }, + { + "epoch": 0.050718741524274476, + "grad_norm": 25.145559310913086, + "learning_rate": 9.999992757143933e-07, + "loss": 0.6321, + "num_input_tokens_seen": 392167424, + "step": 187 + }, + { + "epoch": 0.05098996474098183, + "grad_norm": 21.439443588256836, + "learning_rate": 9.999983703579313e-07, + "loss": 0.529, + "num_input_tokens_seen": 394264576, + "step": 188 + }, + { + "epoch": 0.05126118795768918, + "grad_norm": 21.965892791748047, + "learning_rate": 9.999971028599045e-07, + "loss": 0.6538, + "num_input_tokens_seen": 396361728, + "step": 189 + }, + { + "epoch": 0.051532411174396525, + "grad_norm": 17.03687286376953, + "learning_rate": 9.99995473221333e-07, + "loss": 0.4129, + "num_input_tokens_seen": 398458880, + "step": 190 + }, + { + "epoch": 0.05180363439110388, + "grad_norm": 21.389453887939453, + "learning_rate": 9.999934814435284e-07, + "loss": 0.5968, + "num_input_tokens_seen": 400556032, + "step": 191 + }, + { + "epoch": 0.05207485760781123, + "grad_norm": 22.757781982421875, + "learning_rate": 9.999911275280933e-07, + "loss": 0.604, + "num_input_tokens_seen": 402653184, + "step": 192 + }, + { + "epoch": 0.05234608082451858, + "grad_norm": 26.506017684936523, + "learning_rate": 9.999884114769223e-07, + "loss": 1.0295, + "num_input_tokens_seen": 404750336, + "step": 193 + }, + { + "epoch": 0.05261730404122593, + "grad_norm": 27.058204650878906, + "learning_rate": 9.99985333292201e-07, + "loss": 0.6544, + "num_input_tokens_seen": 406847488, + "step": 194 + }, + { + "epoch": 0.05288852725793328, + "grad_norm": 28.308189392089844, + "learning_rate": 9.999818929764068e-07, + "loss": 1.1371, + "num_input_tokens_seen": 408944640, + "step": 195 + }, + { + "epoch": 0.05315975047464063, + "grad_norm": 18.778104782104492, + "learning_rate": 9.99978090532308e-07, + "loss": 0.5368, + "num_input_tokens_seen": 411041792, + "step": 196 + }, + { + "epoch": 0.05343097369134798, + "grad_norm": 19.98699188232422, + "learning_rate": 9.99973925962965e-07, + "loss": 0.7261, + "num_input_tokens_seen": 413138944, + "step": 197 + }, + { + "epoch": 0.05370219690805533, + "grad_norm": 15.628427505493164, + "learning_rate": 9.999693992717292e-07, + "loss": 0.4129, + "num_input_tokens_seen": 415236096, + "step": 198 + }, + { + "epoch": 0.05397342012476268, + "grad_norm": 15.005875587463379, + "learning_rate": 9.999645104622434e-07, + "loss": 0.3913, + "num_input_tokens_seen": 417333248, + "step": 199 + }, + { + "epoch": 0.05424464334147003, + "grad_norm": 25.148635864257812, + "learning_rate": 9.99959259538442e-07, + "loss": 0.5475, + "num_input_tokens_seen": 419430400, + "step": 200 + }, + { + "epoch": 0.05451586655817738, + "grad_norm": 16.650300979614258, + "learning_rate": 9.99953646504551e-07, + "loss": 0.4695, + "num_input_tokens_seen": 421527552, + "step": 201 + }, + { + "epoch": 0.05478708977488473, + "grad_norm": 25.41582489013672, + "learning_rate": 9.99947671365087e-07, + "loss": 0.7833, + "num_input_tokens_seen": 423624704, + "step": 202 + }, + { + "epoch": 0.05505831299159208, + "grad_norm": 24.495014190673828, + "learning_rate": 9.99941334124859e-07, + "loss": 0.6772, + "num_input_tokens_seen": 425721856, + "step": 203 + }, + { + "epoch": 0.05532953620829943, + "grad_norm": 20.63805389404297, + "learning_rate": 9.999346347889667e-07, + "loss": 0.6223, + "num_input_tokens_seen": 427819008, + "step": 204 + }, + { + "epoch": 0.05560075942500678, + "grad_norm": 27.66077423095703, + "learning_rate": 9.999275733628017e-07, + "loss": 0.7415, + "num_input_tokens_seen": 429916160, + "step": 205 + }, + { + "epoch": 0.055871982641714134, + "grad_norm": 21.065479278564453, + "learning_rate": 9.999201498520466e-07, + "loss": 0.5617, + "num_input_tokens_seen": 432013312, + "step": 206 + }, + { + "epoch": 0.05614320585842148, + "grad_norm": 19.933509826660156, + "learning_rate": 9.999123642626758e-07, + "loss": 0.4846, + "num_input_tokens_seen": 434110464, + "step": 207 + }, + { + "epoch": 0.05641442907512883, + "grad_norm": 19.882827758789062, + "learning_rate": 9.999042166009544e-07, + "loss": 0.5699, + "num_input_tokens_seen": 436207616, + "step": 208 + }, + { + "epoch": 0.05668565229183618, + "grad_norm": 13.684101104736328, + "learning_rate": 9.998957068734399e-07, + "loss": 0.3169, + "num_input_tokens_seen": 438304768, + "step": 209 + }, + { + "epoch": 0.05695687550854353, + "grad_norm": 26.058393478393555, + "learning_rate": 9.9988683508698e-07, + "loss": 0.9399, + "num_input_tokens_seen": 440401920, + "step": 210 + }, + { + "epoch": 0.05722809872525088, + "grad_norm": 28.639570236206055, + "learning_rate": 9.99877601248715e-07, + "loss": 1.0482, + "num_input_tokens_seen": 442499072, + "step": 211 + }, + { + "epoch": 0.05749932194195823, + "grad_norm": 22.742692947387695, + "learning_rate": 9.998680053660756e-07, + "loss": 0.5136, + "num_input_tokens_seen": 444596224, + "step": 212 + }, + { + "epoch": 0.05777054515866558, + "grad_norm": 26.113285064697266, + "learning_rate": 9.998580474467842e-07, + "loss": 0.6692, + "num_input_tokens_seen": 446693376, + "step": 213 + }, + { + "epoch": 0.058041768375372935, + "grad_norm": 15.980868339538574, + "learning_rate": 9.998477274988545e-07, + "loss": 0.4405, + "num_input_tokens_seen": 448790528, + "step": 214 + }, + { + "epoch": 0.05831299159208028, + "grad_norm": 14.49191665649414, + "learning_rate": 9.998370455305918e-07, + "loss": 0.4361, + "num_input_tokens_seen": 450887680, + "step": 215 + }, + { + "epoch": 0.05858421480878763, + "grad_norm": 26.061147689819336, + "learning_rate": 9.998260015505923e-07, + "loss": 0.6016, + "num_input_tokens_seen": 452984832, + "step": 216 + }, + { + "epoch": 0.058855438025494984, + "grad_norm": 18.14351463317871, + "learning_rate": 9.998145955677438e-07, + "loss": 0.4917, + "num_input_tokens_seen": 455081984, + "step": 217 + }, + { + "epoch": 0.05912666124220233, + "grad_norm": 13.798508644104004, + "learning_rate": 9.998028275912257e-07, + "loss": 0.3657, + "num_input_tokens_seen": 457179136, + "step": 218 + }, + { + "epoch": 0.059397884458909686, + "grad_norm": 22.545015335083008, + "learning_rate": 9.997906976305082e-07, + "loss": 0.6051, + "num_input_tokens_seen": 459276288, + "step": 219 + }, + { + "epoch": 0.059669107675617034, + "grad_norm": 27.07939910888672, + "learning_rate": 9.99778205695353e-07, + "loss": 0.7532, + "num_input_tokens_seen": 461373440, + "step": 220 + }, + { + "epoch": 0.05994033089232438, + "grad_norm": 15.4199800491333, + "learning_rate": 9.997653517958132e-07, + "loss": 0.4418, + "num_input_tokens_seen": 463470592, + "step": 221 + }, + { + "epoch": 0.060211554109031735, + "grad_norm": 23.138856887817383, + "learning_rate": 9.997521359422332e-07, + "loss": 0.6708, + "num_input_tokens_seen": 465567744, + "step": 222 + }, + { + "epoch": 0.06048277732573908, + "grad_norm": 19.70016860961914, + "learning_rate": 9.997385581452484e-07, + "loss": 0.4303, + "num_input_tokens_seen": 467664896, + "step": 223 + }, + { + "epoch": 0.06075400054244643, + "grad_norm": 15.138772010803223, + "learning_rate": 9.99724618415786e-07, + "loss": 0.3837, + "num_input_tokens_seen": 469762048, + "step": 224 + }, + { + "epoch": 0.061025223759153785, + "grad_norm": 20.184555053710938, + "learning_rate": 9.997103167650637e-07, + "loss": 0.732, + "num_input_tokens_seen": 471859200, + "step": 225 + }, + { + "epoch": 0.06129644697586113, + "grad_norm": 16.30415153503418, + "learning_rate": 9.996956532045914e-07, + "loss": 0.4628, + "num_input_tokens_seen": 473956352, + "step": 226 + }, + { + "epoch": 0.06156767019256849, + "grad_norm": 28.651100158691406, + "learning_rate": 9.996806277461696e-07, + "loss": 0.9341, + "num_input_tokens_seen": 476053504, + "step": 227 + }, + { + "epoch": 0.061838893409275834, + "grad_norm": 29.19915771484375, + "learning_rate": 9.9966524040189e-07, + "loss": 0.788, + "num_input_tokens_seen": 478150656, + "step": 228 + }, + { + "epoch": 0.06211011662598318, + "grad_norm": 22.999082565307617, + "learning_rate": 9.996494911841363e-07, + "loss": 0.6176, + "num_input_tokens_seen": 480247808, + "step": 229 + }, + { + "epoch": 0.062381339842690536, + "grad_norm": 14.717134475708008, + "learning_rate": 9.996333801055823e-07, + "loss": 0.3239, + "num_input_tokens_seen": 482344960, + "step": 230 + }, + { + "epoch": 0.06265256305939788, + "grad_norm": 17.593320846557617, + "learning_rate": 9.99616907179194e-07, + "loss": 0.4652, + "num_input_tokens_seen": 484442112, + "step": 231 + }, + { + "epoch": 0.06292378627610523, + "grad_norm": 25.229026794433594, + "learning_rate": 9.996000724182278e-07, + "loss": 0.8025, + "num_input_tokens_seen": 486539264, + "step": 232 + }, + { + "epoch": 0.06319500949281258, + "grad_norm": 23.369489669799805, + "learning_rate": 9.99582875836232e-07, + "loss": 0.6686, + "num_input_tokens_seen": 488636416, + "step": 233 + }, + { + "epoch": 0.06346623270951994, + "grad_norm": 25.185129165649414, + "learning_rate": 9.995653174470456e-07, + "loss": 0.7091, + "num_input_tokens_seen": 490733568, + "step": 234 + }, + { + "epoch": 0.06373745592622729, + "grad_norm": 18.694257736206055, + "learning_rate": 9.99547397264799e-07, + "loss": 0.4416, + "num_input_tokens_seen": 492830720, + "step": 235 + }, + { + "epoch": 0.06400867914293464, + "grad_norm": 14.388599395751953, + "learning_rate": 9.995291153039135e-07, + "loss": 0.3705, + "num_input_tokens_seen": 494927872, + "step": 236 + }, + { + "epoch": 0.06427990235964198, + "grad_norm": 19.602264404296875, + "learning_rate": 9.99510471579102e-07, + "loss": 0.7077, + "num_input_tokens_seen": 497025024, + "step": 237 + }, + { + "epoch": 0.06455112557634933, + "grad_norm": 17.90461540222168, + "learning_rate": 9.99491466105368e-07, + "loss": 0.5336, + "num_input_tokens_seen": 499122176, + "step": 238 + }, + { + "epoch": 0.06482234879305669, + "grad_norm": 25.28644561767578, + "learning_rate": 9.994720988980065e-07, + "loss": 0.6312, + "num_input_tokens_seen": 501219328, + "step": 239 + }, + { + "epoch": 0.06509357200976404, + "grad_norm": 19.621496200561523, + "learning_rate": 9.994523699726035e-07, + "loss": 0.3913, + "num_input_tokens_seen": 503316480, + "step": 240 + }, + { + "epoch": 0.06536479522647139, + "grad_norm": 23.50242042541504, + "learning_rate": 9.994322793450361e-07, + "loss": 0.8346, + "num_input_tokens_seen": 505413632, + "step": 241 + }, + { + "epoch": 0.06563601844317873, + "grad_norm": 24.584815979003906, + "learning_rate": 9.994118270314725e-07, + "loss": 0.8, + "num_input_tokens_seen": 507510784, + "step": 242 + }, + { + "epoch": 0.06590724165988608, + "grad_norm": 23.060474395751953, + "learning_rate": 9.993910130483717e-07, + "loss": 0.7698, + "num_input_tokens_seen": 509607936, + "step": 243 + }, + { + "epoch": 0.06617846487659344, + "grad_norm": 14.419717788696289, + "learning_rate": 9.993698374124844e-07, + "loss": 0.4105, + "num_input_tokens_seen": 511705088, + "step": 244 + }, + { + "epoch": 0.06644968809330079, + "grad_norm": 17.786216735839844, + "learning_rate": 9.993483001408516e-07, + "loss": 0.6176, + "num_input_tokens_seen": 513802240, + "step": 245 + }, + { + "epoch": 0.06672091131000814, + "grad_norm": 24.117874145507812, + "learning_rate": 9.99326401250806e-07, + "loss": 0.7823, + "num_input_tokens_seen": 515899392, + "step": 246 + }, + { + "epoch": 0.06699213452671549, + "grad_norm": 17.07128143310547, + "learning_rate": 9.993041407599708e-07, + "loss": 0.5452, + "num_input_tokens_seen": 517996544, + "step": 247 + }, + { + "epoch": 0.06726335774342283, + "grad_norm": 16.80393409729004, + "learning_rate": 9.992815186862602e-07, + "loss": 0.4867, + "num_input_tokens_seen": 520093696, + "step": 248 + }, + { + "epoch": 0.06753458096013018, + "grad_norm": 22.029376983642578, + "learning_rate": 9.9925853504788e-07, + "loss": 0.684, + "num_input_tokens_seen": 522190848, + "step": 249 + }, + { + "epoch": 0.06780580417683754, + "grad_norm": 20.824499130249023, + "learning_rate": 9.992351898633262e-07, + "loss": 0.665, + "num_input_tokens_seen": 524288000, + "step": 250 + }, + { + "epoch": 0.06807702739354489, + "grad_norm": 13.08923625946045, + "learning_rate": 9.992114831513863e-07, + "loss": 0.2885, + "num_input_tokens_seen": 526385152, + "step": 251 + }, + { + "epoch": 0.06834825061025224, + "grad_norm": 17.171178817749023, + "learning_rate": 9.991874149311386e-07, + "loss": 0.5503, + "num_input_tokens_seen": 528482304, + "step": 252 + }, + { + "epoch": 0.06861947382695958, + "grad_norm": 28.414289474487305, + "learning_rate": 9.991629852219523e-07, + "loss": 0.974, + "num_input_tokens_seen": 530579456, + "step": 253 + }, + { + "epoch": 0.06889069704366693, + "grad_norm": 22.36134910583496, + "learning_rate": 9.991381940434873e-07, + "loss": 0.741, + "num_input_tokens_seen": 532676608, + "step": 254 + }, + { + "epoch": 0.0691619202603743, + "grad_norm": 22.147043228149414, + "learning_rate": 9.991130414156946e-07, + "loss": 0.9216, + "num_input_tokens_seen": 534773760, + "step": 255 + }, + { + "epoch": 0.06943314347708164, + "grad_norm": 17.52049446105957, + "learning_rate": 9.990875273588161e-07, + "loss": 0.6055, + "num_input_tokens_seen": 536870912, + "step": 256 + }, + { + "epoch": 0.06970436669378899, + "grad_norm": 29.130477905273438, + "learning_rate": 9.99061651893385e-07, + "loss": 0.5761, + "num_input_tokens_seen": 538968064, + "step": 257 + }, + { + "epoch": 0.06997558991049634, + "grad_norm": 18.812026977539062, + "learning_rate": 9.990354150402242e-07, + "loss": 0.5078, + "num_input_tokens_seen": 541065216, + "step": 258 + }, + { + "epoch": 0.07024681312720368, + "grad_norm": 15.69344711303711, + "learning_rate": 9.990088168204487e-07, + "loss": 0.5184, + "num_input_tokens_seen": 543162368, + "step": 259 + }, + { + "epoch": 0.07051803634391104, + "grad_norm": 24.258792877197266, + "learning_rate": 9.989818572554633e-07, + "loss": 0.8459, + "num_input_tokens_seen": 545259520, + "step": 260 + }, + { + "epoch": 0.07078925956061839, + "grad_norm": 20.177865982055664, + "learning_rate": 9.989545363669644e-07, + "loss": 0.7397, + "num_input_tokens_seen": 547356672, + "step": 261 + }, + { + "epoch": 0.07106048277732574, + "grad_norm": 14.865650177001953, + "learning_rate": 9.989268541769383e-07, + "loss": 0.3984, + "num_input_tokens_seen": 549453824, + "step": 262 + }, + { + "epoch": 0.07133170599403309, + "grad_norm": 16.958553314208984, + "learning_rate": 9.988988107076632e-07, + "loss": 0.5954, + "num_input_tokens_seen": 551550976, + "step": 263 + }, + { + "epoch": 0.07160292921074043, + "grad_norm": 17.816438674926758, + "learning_rate": 9.98870405981707e-07, + "loss": 0.504, + "num_input_tokens_seen": 553648128, + "step": 264 + }, + { + "epoch": 0.0718741524274478, + "grad_norm": 17.33941650390625, + "learning_rate": 9.988416400219288e-07, + "loss": 0.6362, + "num_input_tokens_seen": 555745280, + "step": 265 + }, + { + "epoch": 0.07214537564415514, + "grad_norm": 23.123384475708008, + "learning_rate": 9.988125128514785e-07, + "loss": 0.8346, + "num_input_tokens_seen": 557842432, + "step": 266 + }, + { + "epoch": 0.07241659886086249, + "grad_norm": 19.134353637695312, + "learning_rate": 9.987830244937964e-07, + "loss": 0.8193, + "num_input_tokens_seen": 559939584, + "step": 267 + }, + { + "epoch": 0.07268782207756984, + "grad_norm": 20.602140426635742, + "learning_rate": 9.987531749726137e-07, + "loss": 0.5401, + "num_input_tokens_seen": 562036736, + "step": 268 + }, + { + "epoch": 0.07295904529427719, + "grad_norm": 21.157861709594727, + "learning_rate": 9.98722964311952e-07, + "loss": 0.6521, + "num_input_tokens_seen": 564133888, + "step": 269 + }, + { + "epoch": 0.07323026851098453, + "grad_norm": 20.075769424438477, + "learning_rate": 9.986923925361238e-07, + "loss": 0.6853, + "num_input_tokens_seen": 566231040, + "step": 270 + }, + { + "epoch": 0.0735014917276919, + "grad_norm": 19.817943572998047, + "learning_rate": 9.98661459669732e-07, + "loss": 0.7238, + "num_input_tokens_seen": 568328192, + "step": 271 + }, + { + "epoch": 0.07377271494439924, + "grad_norm": 20.882659912109375, + "learning_rate": 9.986301657376705e-07, + "loss": 0.7125, + "num_input_tokens_seen": 570425344, + "step": 272 + }, + { + "epoch": 0.07404393816110659, + "grad_norm": 17.62346649169922, + "learning_rate": 9.985985107651231e-07, + "loss": 0.4401, + "num_input_tokens_seen": 572522496, + "step": 273 + }, + { + "epoch": 0.07431516137781394, + "grad_norm": 23.123687744140625, + "learning_rate": 9.985664947775649e-07, + "loss": 0.8847, + "num_input_tokens_seen": 574619648, + "step": 274 + }, + { + "epoch": 0.07458638459452128, + "grad_norm": 16.682695388793945, + "learning_rate": 9.985341178007608e-07, + "loss": 0.5292, + "num_input_tokens_seen": 576716800, + "step": 275 + }, + { + "epoch": 0.07485760781122865, + "grad_norm": 22.821746826171875, + "learning_rate": 9.985013798607666e-07, + "loss": 0.8337, + "num_input_tokens_seen": 578813952, + "step": 276 + }, + { + "epoch": 0.075128831027936, + "grad_norm": 23.22003746032715, + "learning_rate": 9.98468280983929e-07, + "loss": 0.5835, + "num_input_tokens_seen": 580911104, + "step": 277 + }, + { + "epoch": 0.07540005424464334, + "grad_norm": 24.205345153808594, + "learning_rate": 9.984348211968837e-07, + "loss": 0.8618, + "num_input_tokens_seen": 583008256, + "step": 278 + }, + { + "epoch": 0.07567127746135069, + "grad_norm": 19.946861267089844, + "learning_rate": 9.984010005265592e-07, + "loss": 0.6434, + "num_input_tokens_seen": 585105408, + "step": 279 + }, + { + "epoch": 0.07594250067805804, + "grad_norm": 17.581974029541016, + "learning_rate": 9.98366819000172e-07, + "loss": 0.5806, + "num_input_tokens_seen": 587202560, + "step": 280 + }, + { + "epoch": 0.0762137238947654, + "grad_norm": 18.351654052734375, + "learning_rate": 9.983322766452305e-07, + "loss": 0.4418, + "num_input_tokens_seen": 589299712, + "step": 281 + }, + { + "epoch": 0.07648494711147275, + "grad_norm": 13.627374649047852, + "learning_rate": 9.98297373489533e-07, + "loss": 0.2781, + "num_input_tokens_seen": 591396864, + "step": 282 + }, + { + "epoch": 0.07675617032818009, + "grad_norm": 20.957977294921875, + "learning_rate": 9.982621095611686e-07, + "loss": 0.6597, + "num_input_tokens_seen": 593494016, + "step": 283 + }, + { + "epoch": 0.07702739354488744, + "grad_norm": 18.29064178466797, + "learning_rate": 9.98226484888516e-07, + "loss": 0.4855, + "num_input_tokens_seen": 595591168, + "step": 284 + }, + { + "epoch": 0.07729861676159479, + "grad_norm": 18.134428024291992, + "learning_rate": 9.981904995002443e-07, + "loss": 0.4324, + "num_input_tokens_seen": 597688320, + "step": 285 + }, + { + "epoch": 0.07756983997830215, + "grad_norm": 24.423080444335938, + "learning_rate": 9.98154153425314e-07, + "loss": 0.5194, + "num_input_tokens_seen": 599785472, + "step": 286 + }, + { + "epoch": 0.0778410631950095, + "grad_norm": 23.057655334472656, + "learning_rate": 9.981174466929742e-07, + "loss": 0.5979, + "num_input_tokens_seen": 601882624, + "step": 287 + }, + { + "epoch": 0.07811228641171684, + "grad_norm": 18.815351486206055, + "learning_rate": 9.980803793327655e-07, + "loss": 0.516, + "num_input_tokens_seen": 603979776, + "step": 288 + }, + { + "epoch": 0.07838350962842419, + "grad_norm": 23.535762786865234, + "learning_rate": 9.980429513745182e-07, + "loss": 0.7408, + "num_input_tokens_seen": 606076928, + "step": 289 + }, + { + "epoch": 0.07865473284513154, + "grad_norm": 30.299724578857422, + "learning_rate": 9.980051628483532e-07, + "loss": 1.0291, + "num_input_tokens_seen": 608174080, + "step": 290 + }, + { + "epoch": 0.07892595606183889, + "grad_norm": 18.080909729003906, + "learning_rate": 9.979670137846806e-07, + "loss": 0.6044, + "num_input_tokens_seen": 610271232, + "step": 291 + }, + { + "epoch": 0.07919717927854625, + "grad_norm": 29.091594696044922, + "learning_rate": 9.97928504214202e-07, + "loss": 1.0647, + "num_input_tokens_seen": 612368384, + "step": 292 + }, + { + "epoch": 0.0794684024952536, + "grad_norm": 23.396774291992188, + "learning_rate": 9.97889634167908e-07, + "loss": 0.9621, + "num_input_tokens_seen": 614465536, + "step": 293 + }, + { + "epoch": 0.07973962571196094, + "grad_norm": 14.316418647766113, + "learning_rate": 9.978504036770802e-07, + "loss": 0.3203, + "num_input_tokens_seen": 616562688, + "step": 294 + }, + { + "epoch": 0.08001084892866829, + "grad_norm": 14.00649356842041, + "learning_rate": 9.978108127732892e-07, + "loss": 0.3896, + "num_input_tokens_seen": 618659840, + "step": 295 + }, + { + "epoch": 0.08028207214537564, + "grad_norm": 17.12964630126953, + "learning_rate": 9.977708614883965e-07, + "loss": 0.53, + "num_input_tokens_seen": 620756992, + "step": 296 + }, + { + "epoch": 0.080553295362083, + "grad_norm": 28.751781463623047, + "learning_rate": 9.977305498545537e-07, + "loss": 0.7027, + "num_input_tokens_seen": 622854144, + "step": 297 + }, + { + "epoch": 0.08082451857879035, + "grad_norm": 15.914665222167969, + "learning_rate": 9.976898779042018e-07, + "loss": 0.4594, + "num_input_tokens_seen": 624951296, + "step": 298 + }, + { + "epoch": 0.0810957417954977, + "grad_norm": 20.696914672851562, + "learning_rate": 9.976488456700717e-07, + "loss": 0.7646, + "num_input_tokens_seen": 627048448, + "step": 299 + }, + { + "epoch": 0.08136696501220504, + "grad_norm": 20.68764877319336, + "learning_rate": 9.97607453185185e-07, + "loss": 0.6133, + "num_input_tokens_seen": 629145600, + "step": 300 + }, + { + "epoch": 0.08163818822891239, + "grad_norm": 23.754941940307617, + "learning_rate": 9.97565700482853e-07, + "loss": 0.7141, + "num_input_tokens_seen": 631242752, + "step": 301 + }, + { + "epoch": 0.08190941144561975, + "grad_norm": 17.099897384643555, + "learning_rate": 9.97523587596676e-07, + "loss": 0.5012, + "num_input_tokens_seen": 633339904, + "step": 302 + }, + { + "epoch": 0.0821806346623271, + "grad_norm": 20.998018264770508, + "learning_rate": 9.974811145605453e-07, + "loss": 0.3314, + "num_input_tokens_seen": 635437056, + "step": 303 + }, + { + "epoch": 0.08245185787903445, + "grad_norm": 16.847061157226562, + "learning_rate": 9.974382814086418e-07, + "loss": 0.3722, + "num_input_tokens_seen": 637534208, + "step": 304 + }, + { + "epoch": 0.0827230810957418, + "grad_norm": 17.548187255859375, + "learning_rate": 9.973950881754353e-07, + "loss": 0.4989, + "num_input_tokens_seen": 639631360, + "step": 305 + }, + { + "epoch": 0.08299430431244914, + "grad_norm": 23.3707218170166, + "learning_rate": 9.973515348956869e-07, + "loss": 0.6737, + "num_input_tokens_seen": 641728512, + "step": 306 + }, + { + "epoch": 0.0832655275291565, + "grad_norm": 16.17925262451172, + "learning_rate": 9.97307621604446e-07, + "loss": 0.5562, + "num_input_tokens_seen": 643825664, + "step": 307 + }, + { + "epoch": 0.08353675074586385, + "grad_norm": 15.967670440673828, + "learning_rate": 9.972633483370526e-07, + "loss": 0.4135, + "num_input_tokens_seen": 645922816, + "step": 308 + }, + { + "epoch": 0.0838079739625712, + "grad_norm": 25.773571014404297, + "learning_rate": 9.97218715129136e-07, + "loss": 0.8312, + "num_input_tokens_seen": 648019968, + "step": 309 + }, + { + "epoch": 0.08407919717927854, + "grad_norm": 16.281841278076172, + "learning_rate": 9.971737220166155e-07, + "loss": 0.4918, + "num_input_tokens_seen": 650117120, + "step": 310 + }, + { + "epoch": 0.08435042039598589, + "grad_norm": 19.82733917236328, + "learning_rate": 9.971283690356997e-07, + "loss": 0.6776, + "num_input_tokens_seen": 652214272, + "step": 311 + }, + { + "epoch": 0.08462164361269324, + "grad_norm": 17.492752075195312, + "learning_rate": 9.97082656222887e-07, + "loss": 0.3084, + "num_input_tokens_seen": 654311424, + "step": 312 + }, + { + "epoch": 0.0848928668294006, + "grad_norm": 21.85450553894043, + "learning_rate": 9.970365836149654e-07, + "loss": 0.6974, + "num_input_tokens_seen": 656408576, + "step": 313 + }, + { + "epoch": 0.08516409004610795, + "grad_norm": 25.810989379882812, + "learning_rate": 9.969901512490121e-07, + "loss": 1.0774, + "num_input_tokens_seen": 658505728, + "step": 314 + }, + { + "epoch": 0.0854353132628153, + "grad_norm": 21.836181640625, + "learning_rate": 9.969433591623946e-07, + "loss": 0.6704, + "num_input_tokens_seen": 660602880, + "step": 315 + }, + { + "epoch": 0.08570653647952264, + "grad_norm": 26.0434627532959, + "learning_rate": 9.96896207392769e-07, + "loss": 1.0117, + "num_input_tokens_seen": 662700032, + "step": 316 + }, + { + "epoch": 0.08597775969622999, + "grad_norm": 20.903520584106445, + "learning_rate": 9.968486959780813e-07, + "loss": 0.5013, + "num_input_tokens_seen": 664797184, + "step": 317 + }, + { + "epoch": 0.08624898291293735, + "grad_norm": 20.330419540405273, + "learning_rate": 9.96800824956567e-07, + "loss": 0.6299, + "num_input_tokens_seen": 666894336, + "step": 318 + }, + { + "epoch": 0.0865202061296447, + "grad_norm": 19.454877853393555, + "learning_rate": 9.967525943667506e-07, + "loss": 0.5737, + "num_input_tokens_seen": 668991488, + "step": 319 + }, + { + "epoch": 0.08679142934635205, + "grad_norm": 17.064470291137695, + "learning_rate": 9.967040042474467e-07, + "loss": 0.4905, + "num_input_tokens_seen": 671088640, + "step": 320 + }, + { + "epoch": 0.0870626525630594, + "grad_norm": 23.634061813354492, + "learning_rate": 9.966550546377586e-07, + "loss": 0.8326, + "num_input_tokens_seen": 673185792, + "step": 321 + }, + { + "epoch": 0.08733387577976674, + "grad_norm": 13.882503509521484, + "learning_rate": 9.966057455770788e-07, + "loss": 0.3878, + "num_input_tokens_seen": 675282944, + "step": 322 + }, + { + "epoch": 0.0876050989964741, + "grad_norm": 23.014923095703125, + "learning_rate": 9.965560771050896e-07, + "loss": 0.7579, + "num_input_tokens_seen": 677380096, + "step": 323 + }, + { + "epoch": 0.08787632221318145, + "grad_norm": 16.645936965942383, + "learning_rate": 9.965060492617623e-07, + "loss": 0.5976, + "num_input_tokens_seen": 679477248, + "step": 324 + }, + { + "epoch": 0.0881475454298888, + "grad_norm": 21.51616096496582, + "learning_rate": 9.964556620873573e-07, + "loss": 0.5753, + "num_input_tokens_seen": 681574400, + "step": 325 + }, + { + "epoch": 0.08841876864659615, + "grad_norm": 16.587160110473633, + "learning_rate": 9.964049156224244e-07, + "loss": 0.5288, + "num_input_tokens_seen": 683671552, + "step": 326 + }, + { + "epoch": 0.0886899918633035, + "grad_norm": 15.723901748657227, + "learning_rate": 9.963538099078024e-07, + "loss": 0.412, + "num_input_tokens_seen": 685768704, + "step": 327 + }, + { + "epoch": 0.08896121508001086, + "grad_norm": 21.146015167236328, + "learning_rate": 9.963023449846194e-07, + "loss": 0.7099, + "num_input_tokens_seen": 687865856, + "step": 328 + }, + { + "epoch": 0.0892324382967182, + "grad_norm": 26.975767135620117, + "learning_rate": 9.962505208942919e-07, + "loss": 0.8877, + "num_input_tokens_seen": 689963008, + "step": 329 + }, + { + "epoch": 0.08950366151342555, + "grad_norm": 17.144739151000977, + "learning_rate": 9.961983376785264e-07, + "loss": 0.4786, + "num_input_tokens_seen": 692060160, + "step": 330 + }, + { + "epoch": 0.0897748847301329, + "grad_norm": 13.809526443481445, + "learning_rate": 9.96145795379318e-07, + "loss": 0.3761, + "num_input_tokens_seen": 694157312, + "step": 331 + }, + { + "epoch": 0.09004610794684025, + "grad_norm": 24.72681999206543, + "learning_rate": 9.960928940389503e-07, + "loss": 0.8149, + "num_input_tokens_seen": 696254464, + "step": 332 + }, + { + "epoch": 0.09031733116354759, + "grad_norm": 16.22955322265625, + "learning_rate": 9.960396336999967e-07, + "loss": 0.4442, + "num_input_tokens_seen": 698351616, + "step": 333 + }, + { + "epoch": 0.09058855438025495, + "grad_norm": 15.6668119430542, + "learning_rate": 9.95986014405319e-07, + "loss": 0.3999, + "num_input_tokens_seen": 700448768, + "step": 334 + }, + { + "epoch": 0.0908597775969623, + "grad_norm": 12.644166946411133, + "learning_rate": 9.959320361980679e-07, + "loss": 0.4071, + "num_input_tokens_seen": 702545920, + "step": 335 + }, + { + "epoch": 0.09113100081366965, + "grad_norm": 9.893462181091309, + "learning_rate": 9.95877699121683e-07, + "loss": 0.2543, + "num_input_tokens_seen": 704643072, + "step": 336 + }, + { + "epoch": 0.091402224030377, + "grad_norm": 31.306793212890625, + "learning_rate": 9.95823003219893e-07, + "loss": 1.1172, + "num_input_tokens_seen": 706740224, + "step": 337 + }, + { + "epoch": 0.09167344724708434, + "grad_norm": 14.39288330078125, + "learning_rate": 9.957679485367144e-07, + "loss": 0.3461, + "num_input_tokens_seen": 708837376, + "step": 338 + }, + { + "epoch": 0.0919446704637917, + "grad_norm": 15.527755737304688, + "learning_rate": 9.95712535116454e-07, + "loss": 0.3854, + "num_input_tokens_seen": 710934528, + "step": 339 + }, + { + "epoch": 0.09221589368049905, + "grad_norm": 22.683826446533203, + "learning_rate": 9.956567630037058e-07, + "loss": 0.6062, + "num_input_tokens_seen": 713031680, + "step": 340 + }, + { + "epoch": 0.0924871168972064, + "grad_norm": 23.317596435546875, + "learning_rate": 9.95600632243353e-07, + "loss": 0.7615, + "num_input_tokens_seen": 715128832, + "step": 341 + }, + { + "epoch": 0.09275834011391375, + "grad_norm": 23.024446487426758, + "learning_rate": 9.95544142880568e-07, + "loss": 0.6869, + "num_input_tokens_seen": 717225984, + "step": 342 + }, + { + "epoch": 0.0930295633306211, + "grad_norm": 23.672393798828125, + "learning_rate": 9.954872949608108e-07, + "loss": 0.5199, + "num_input_tokens_seen": 719323136, + "step": 343 + }, + { + "epoch": 0.09330078654732846, + "grad_norm": 23.845340728759766, + "learning_rate": 9.954300885298309e-07, + "loss": 0.6942, + "num_input_tokens_seen": 721420288, + "step": 344 + }, + { + "epoch": 0.0935720097640358, + "grad_norm": 13.440674781799316, + "learning_rate": 9.953725236336653e-07, + "loss": 0.2899, + "num_input_tokens_seen": 723517440, + "step": 345 + }, + { + "epoch": 0.09384323298074315, + "grad_norm": 24.573225021362305, + "learning_rate": 9.953146003186407e-07, + "loss": 0.602, + "num_input_tokens_seen": 725614592, + "step": 346 + }, + { + "epoch": 0.0941144561974505, + "grad_norm": 28.024015426635742, + "learning_rate": 9.952563186313711e-07, + "loss": 0.9305, + "num_input_tokens_seen": 727711744, + "step": 347 + }, + { + "epoch": 0.09438567941415785, + "grad_norm": 10.102442741394043, + "learning_rate": 9.951976786187598e-07, + "loss": 0.1536, + "num_input_tokens_seen": 729808896, + "step": 348 + }, + { + "epoch": 0.09465690263086521, + "grad_norm": 21.506269454956055, + "learning_rate": 9.951386803279973e-07, + "loss": 0.4666, + "num_input_tokens_seen": 731906048, + "step": 349 + }, + { + "epoch": 0.09492812584757256, + "grad_norm": 18.18300437927246, + "learning_rate": 9.95079323806564e-07, + "loss": 0.5502, + "num_input_tokens_seen": 734003200, + "step": 350 + }, + { + "epoch": 0.0951993490642799, + "grad_norm": 15.93714714050293, + "learning_rate": 9.950196091022274e-07, + "loss": 0.3727, + "num_input_tokens_seen": 736100352, + "step": 351 + }, + { + "epoch": 0.09547057228098725, + "grad_norm": 16.818010330200195, + "learning_rate": 9.949595362630435e-07, + "loss": 0.4137, + "num_input_tokens_seen": 738197504, + "step": 352 + }, + { + "epoch": 0.0957417954976946, + "grad_norm": 19.742679595947266, + "learning_rate": 9.948991053373567e-07, + "loss": 0.6657, + "num_input_tokens_seen": 740294656, + "step": 353 + }, + { + "epoch": 0.09601301871440195, + "grad_norm": 21.378673553466797, + "learning_rate": 9.948383163738e-07, + "loss": 0.6044, + "num_input_tokens_seen": 742391808, + "step": 354 + }, + { + "epoch": 0.09628424193110931, + "grad_norm": 23.491744995117188, + "learning_rate": 9.947771694212933e-07, + "loss": 0.5978, + "num_input_tokens_seen": 744488960, + "step": 355 + }, + { + "epoch": 0.09655546514781665, + "grad_norm": 20.706626892089844, + "learning_rate": 9.947156645290456e-07, + "loss": 0.6907, + "num_input_tokens_seen": 746586112, + "step": 356 + }, + { + "epoch": 0.096826688364524, + "grad_norm": 23.280305862426758, + "learning_rate": 9.94653801746554e-07, + "loss": 0.7526, + "num_input_tokens_seen": 748683264, + "step": 357 + }, + { + "epoch": 0.09709791158123135, + "grad_norm": 18.05268669128418, + "learning_rate": 9.945915811236029e-07, + "loss": 0.5342, + "num_input_tokens_seen": 750780416, + "step": 358 + }, + { + "epoch": 0.0973691347979387, + "grad_norm": 25.603046417236328, + "learning_rate": 9.945290027102654e-07, + "loss": 0.4357, + "num_input_tokens_seen": 752877568, + "step": 359 + }, + { + "epoch": 0.09764035801464606, + "grad_norm": 20.616479873657227, + "learning_rate": 9.944660665569023e-07, + "loss": 0.6687, + "num_input_tokens_seen": 754974720, + "step": 360 + }, + { + "epoch": 0.0979115812313534, + "grad_norm": 15.590941429138184, + "learning_rate": 9.944027727141617e-07, + "loss": 0.5111, + "num_input_tokens_seen": 757071872, + "step": 361 + }, + { + "epoch": 0.09818280444806075, + "grad_norm": 15.695627212524414, + "learning_rate": 9.943391212329805e-07, + "loss": 0.4803, + "num_input_tokens_seen": 759169024, + "step": 362 + }, + { + "epoch": 0.0984540276647681, + "grad_norm": 26.64590072631836, + "learning_rate": 9.942751121645828e-07, + "loss": 1.1437, + "num_input_tokens_seen": 761266176, + "step": 363 + }, + { + "epoch": 0.09872525088147545, + "grad_norm": 16.354394912719727, + "learning_rate": 9.94210745560481e-07, + "loss": 0.5516, + "num_input_tokens_seen": 763363328, + "step": 364 + }, + { + "epoch": 0.09899647409818281, + "grad_norm": 18.546751022338867, + "learning_rate": 9.941460214724747e-07, + "loss": 0.4108, + "num_input_tokens_seen": 765460480, + "step": 365 + }, + { + "epoch": 0.09926769731489016, + "grad_norm": 25.187475204467773, + "learning_rate": 9.94080939952651e-07, + "loss": 0.8697, + "num_input_tokens_seen": 767557632, + "step": 366 + }, + { + "epoch": 0.0995389205315975, + "grad_norm": 21.675792694091797, + "learning_rate": 9.940155010533855e-07, + "loss": 0.7298, + "num_input_tokens_seen": 769654784, + "step": 367 + }, + { + "epoch": 0.09981014374830485, + "grad_norm": 17.939720153808594, + "learning_rate": 9.939497048273407e-07, + "loss": 0.7668, + "num_input_tokens_seen": 771751936, + "step": 368 + }, + { + "epoch": 0.1000813669650122, + "grad_norm": 22.402225494384766, + "learning_rate": 9.938835513274672e-07, + "loss": 0.7696, + "num_input_tokens_seen": 773849088, + "step": 369 + }, + { + "epoch": 0.10035259018171956, + "grad_norm": 14.176339149475098, + "learning_rate": 9.938170406070025e-07, + "loss": 0.4614, + "num_input_tokens_seen": 775946240, + "step": 370 + }, + { + "epoch": 0.10062381339842691, + "grad_norm": 16.77955436706543, + "learning_rate": 9.937501727194721e-07, + "loss": 0.4625, + "num_input_tokens_seen": 778043392, + "step": 371 + }, + { + "epoch": 0.10089503661513426, + "grad_norm": 17.967620849609375, + "learning_rate": 9.936829477186884e-07, + "loss": 0.6052, + "num_input_tokens_seen": 780140544, + "step": 372 + }, + { + "epoch": 0.1011662598318416, + "grad_norm": 15.005168914794922, + "learning_rate": 9.93615365658752e-07, + "loss": 0.3321, + "num_input_tokens_seen": 782237696, + "step": 373 + }, + { + "epoch": 0.10143748304854895, + "grad_norm": 19.981107711791992, + "learning_rate": 9.9354742659405e-07, + "loss": 0.7432, + "num_input_tokens_seen": 784334848, + "step": 374 + }, + { + "epoch": 0.10170870626525631, + "grad_norm": 28.166027069091797, + "learning_rate": 9.934791305792575e-07, + "loss": 1.0001, + "num_input_tokens_seen": 786432000, + "step": 375 + }, + { + "epoch": 0.10197992948196366, + "grad_norm": 26.656635284423828, + "learning_rate": 9.934104776693363e-07, + "loss": 1.1056, + "num_input_tokens_seen": 788529152, + "step": 376 + }, + { + "epoch": 0.10225115269867101, + "grad_norm": 14.768815040588379, + "learning_rate": 9.933414679195354e-07, + "loss": 0.4943, + "num_input_tokens_seen": 790626304, + "step": 377 + }, + { + "epoch": 0.10252237591537836, + "grad_norm": 18.024293899536133, + "learning_rate": 9.932721013853917e-07, + "loss": 0.503, + "num_input_tokens_seen": 792723456, + "step": 378 + }, + { + "epoch": 0.1027935991320857, + "grad_norm": 15.003963470458984, + "learning_rate": 9.932023781227287e-07, + "loss": 0.339, + "num_input_tokens_seen": 794820608, + "step": 379 + }, + { + "epoch": 0.10306482234879305, + "grad_norm": 11.605240821838379, + "learning_rate": 9.931322981876567e-07, + "loss": 0.3065, + "num_input_tokens_seen": 796917760, + "step": 380 + }, + { + "epoch": 0.10333604556550041, + "grad_norm": 12.732186317443848, + "learning_rate": 9.930618616365737e-07, + "loss": 0.3062, + "num_input_tokens_seen": 799014912, + "step": 381 + }, + { + "epoch": 0.10360726878220776, + "grad_norm": 19.49980354309082, + "learning_rate": 9.92991068526164e-07, + "loss": 0.6661, + "num_input_tokens_seen": 801112064, + "step": 382 + }, + { + "epoch": 0.1038784919989151, + "grad_norm": 17.00780487060547, + "learning_rate": 9.929199189133996e-07, + "loss": 0.4904, + "num_input_tokens_seen": 803209216, + "step": 383 + }, + { + "epoch": 0.10414971521562245, + "grad_norm": 23.928136825561523, + "learning_rate": 9.928484128555388e-07, + "loss": 0.9294, + "num_input_tokens_seen": 805306368, + "step": 384 + }, + { + "epoch": 0.1044209384323298, + "grad_norm": 19.024593353271484, + "learning_rate": 9.92776550410127e-07, + "loss": 0.6439, + "num_input_tokens_seen": 807403520, + "step": 385 + }, + { + "epoch": 0.10469216164903716, + "grad_norm": 15.482383728027344, + "learning_rate": 9.927043316349962e-07, + "loss": 0.3997, + "num_input_tokens_seen": 809500672, + "step": 386 + }, + { + "epoch": 0.10496338486574451, + "grad_norm": 14.57456111907959, + "learning_rate": 9.926317565882657e-07, + "loss": 0.472, + "num_input_tokens_seen": 811597824, + "step": 387 + }, + { + "epoch": 0.10523460808245186, + "grad_norm": 13.767419815063477, + "learning_rate": 9.925588253283407e-07, + "loss": 0.4513, + "num_input_tokens_seen": 813694976, + "step": 388 + }, + { + "epoch": 0.1055058312991592, + "grad_norm": 15.937477111816406, + "learning_rate": 9.924855379139136e-07, + "loss": 0.4371, + "num_input_tokens_seen": 815792128, + "step": 389 + }, + { + "epoch": 0.10577705451586655, + "grad_norm": 22.166065216064453, + "learning_rate": 9.924118944039635e-07, + "loss": 0.569, + "num_input_tokens_seen": 817889280, + "step": 390 + }, + { + "epoch": 0.10604827773257391, + "grad_norm": 19.830108642578125, + "learning_rate": 9.923378948577558e-07, + "loss": 0.7168, + "num_input_tokens_seen": 819986432, + "step": 391 + }, + { + "epoch": 0.10631950094928126, + "grad_norm": 15.125102043151855, + "learning_rate": 9.922635393348425e-07, + "loss": 0.4538, + "num_input_tokens_seen": 822083584, + "step": 392 + }, + { + "epoch": 0.10659072416598861, + "grad_norm": 15.82544994354248, + "learning_rate": 9.92188827895062e-07, + "loss": 0.4769, + "num_input_tokens_seen": 824180736, + "step": 393 + }, + { + "epoch": 0.10686194738269596, + "grad_norm": 23.284385681152344, + "learning_rate": 9.921137605985397e-07, + "loss": 0.8903, + "num_input_tokens_seen": 826277888, + "step": 394 + }, + { + "epoch": 0.1071331705994033, + "grad_norm": 19.174997329711914, + "learning_rate": 9.920383375056863e-07, + "loss": 0.6117, + "num_input_tokens_seen": 828375040, + "step": 395 + }, + { + "epoch": 0.10740439381611067, + "grad_norm": 18.127817153930664, + "learning_rate": 9.919625586771998e-07, + "loss": 0.7048, + "num_input_tokens_seen": 830472192, + "step": 396 + }, + { + "epoch": 0.10767561703281801, + "grad_norm": 18.79850196838379, + "learning_rate": 9.918864241740639e-07, + "loss": 0.6382, + "num_input_tokens_seen": 832569344, + "step": 397 + }, + { + "epoch": 0.10794684024952536, + "grad_norm": 19.67146110534668, + "learning_rate": 9.918099340575487e-07, + "loss": 0.5327, + "num_input_tokens_seen": 834666496, + "step": 398 + }, + { + "epoch": 0.10821806346623271, + "grad_norm": 19.64447593688965, + "learning_rate": 9.91733088389211e-07, + "loss": 0.5638, + "num_input_tokens_seen": 836763648, + "step": 399 + }, + { + "epoch": 0.10848928668294006, + "grad_norm": 15.106411933898926, + "learning_rate": 9.916558872308929e-07, + "loss": 0.3852, + "num_input_tokens_seen": 838860800, + "step": 400 + }, + { + "epoch": 0.1087605098996474, + "grad_norm": 15.882865905761719, + "learning_rate": 9.915783306447229e-07, + "loss": 0.4999, + "num_input_tokens_seen": 840957952, + "step": 401 + }, + { + "epoch": 0.10903173311635476, + "grad_norm": 19.47708511352539, + "learning_rate": 9.915004186931156e-07, + "loss": 0.5697, + "num_input_tokens_seen": 843055104, + "step": 402 + }, + { + "epoch": 0.10930295633306211, + "grad_norm": 13.521337509155273, + "learning_rate": 9.91422151438772e-07, + "loss": 0.3383, + "num_input_tokens_seen": 845152256, + "step": 403 + }, + { + "epoch": 0.10957417954976946, + "grad_norm": 23.238975524902344, + "learning_rate": 9.91343528944678e-07, + "loss": 0.8781, + "num_input_tokens_seen": 847249408, + "step": 404 + }, + { + "epoch": 0.10984540276647681, + "grad_norm": 13.926328659057617, + "learning_rate": 9.912645512741064e-07, + "loss": 0.4149, + "num_input_tokens_seen": 849346560, + "step": 405 + }, + { + "epoch": 0.11011662598318415, + "grad_norm": 25.375770568847656, + "learning_rate": 9.911852184906151e-07, + "loss": 0.5727, + "num_input_tokens_seen": 851443712, + "step": 406 + }, + { + "epoch": 0.11038784919989152, + "grad_norm": 37.60099411010742, + "learning_rate": 9.911055306580485e-07, + "loss": 0.5142, + "num_input_tokens_seen": 853540864, + "step": 407 + }, + { + "epoch": 0.11065907241659886, + "grad_norm": 16.037012100219727, + "learning_rate": 9.910254878405361e-07, + "loss": 0.4008, + "num_input_tokens_seen": 855638016, + "step": 408 + }, + { + "epoch": 0.11093029563330621, + "grad_norm": 16.561771392822266, + "learning_rate": 9.909450901024935e-07, + "loss": 0.393, + "num_input_tokens_seen": 857735168, + "step": 409 + }, + { + "epoch": 0.11120151885001356, + "grad_norm": 19.961044311523438, + "learning_rate": 9.908643375086213e-07, + "loss": 0.7223, + "num_input_tokens_seen": 859832320, + "step": 410 + }, + { + "epoch": 0.1114727420667209, + "grad_norm": 14.175226211547852, + "learning_rate": 9.907832301239066e-07, + "loss": 0.408, + "num_input_tokens_seen": 861929472, + "step": 411 + }, + { + "epoch": 0.11174396528342827, + "grad_norm": 31.357851028442383, + "learning_rate": 9.907017680136213e-07, + "loss": 0.8096, + "num_input_tokens_seen": 864026624, + "step": 412 + }, + { + "epoch": 0.11201518850013562, + "grad_norm": 17.310956954956055, + "learning_rate": 9.90619951243323e-07, + "loss": 0.5495, + "num_input_tokens_seen": 866123776, + "step": 413 + }, + { + "epoch": 0.11228641171684296, + "grad_norm": 17.816402435302734, + "learning_rate": 9.905377798788547e-07, + "loss": 0.6646, + "num_input_tokens_seen": 868220928, + "step": 414 + }, + { + "epoch": 0.11255763493355031, + "grad_norm": 14.325615882873535, + "learning_rate": 9.904552539863452e-07, + "loss": 0.4028, + "num_input_tokens_seen": 870318080, + "step": 415 + }, + { + "epoch": 0.11282885815025766, + "grad_norm": 23.365257263183594, + "learning_rate": 9.903723736322075e-07, + "loss": 0.7955, + "num_input_tokens_seen": 872415232, + "step": 416 + }, + { + "epoch": 0.11310008136696502, + "grad_norm": 18.676393508911133, + "learning_rate": 9.90289138883141e-07, + "loss": 0.7679, + "num_input_tokens_seen": 874512384, + "step": 417 + }, + { + "epoch": 0.11337130458367237, + "grad_norm": 20.179346084594727, + "learning_rate": 9.9020554980613e-07, + "loss": 0.607, + "num_input_tokens_seen": 876609536, + "step": 418 + }, + { + "epoch": 0.11364252780037971, + "grad_norm": 30.090450286865234, + "learning_rate": 9.901216064684434e-07, + "loss": 0.9661, + "num_input_tokens_seen": 878706688, + "step": 419 + }, + { + "epoch": 0.11391375101708706, + "grad_norm": 15.964401245117188, + "learning_rate": 9.900373089376357e-07, + "loss": 0.4133, + "num_input_tokens_seen": 880803840, + "step": 420 + }, + { + "epoch": 0.11418497423379441, + "grad_norm": 15.243666648864746, + "learning_rate": 9.899526572815465e-07, + "loss": 0.3792, + "num_input_tokens_seen": 882900992, + "step": 421 + }, + { + "epoch": 0.11445619745050176, + "grad_norm": 15.618511199951172, + "learning_rate": 9.898676515683001e-07, + "loss": 0.4035, + "num_input_tokens_seen": 884998144, + "step": 422 + }, + { + "epoch": 0.11472742066720912, + "grad_norm": 20.854814529418945, + "learning_rate": 9.897822918663062e-07, + "loss": 0.6001, + "num_input_tokens_seen": 887095296, + "step": 423 + }, + { + "epoch": 0.11499864388391647, + "grad_norm": 23.2158145904541, + "learning_rate": 9.896965782442584e-07, + "loss": 0.6086, + "num_input_tokens_seen": 889192448, + "step": 424 + }, + { + "epoch": 0.11526986710062381, + "grad_norm": 21.221647262573242, + "learning_rate": 9.896105107711365e-07, + "loss": 0.5437, + "num_input_tokens_seen": 891289600, + "step": 425 + }, + { + "epoch": 0.11554109031733116, + "grad_norm": 19.0914249420166, + "learning_rate": 9.895240895162037e-07, + "loss": 0.5197, + "num_input_tokens_seen": 893386752, + "step": 426 + }, + { + "epoch": 0.11581231353403851, + "grad_norm": 19.16167449951172, + "learning_rate": 9.89437314549009e-07, + "loss": 0.4462, + "num_input_tokens_seen": 895483904, + "step": 427 + }, + { + "epoch": 0.11608353675074587, + "grad_norm": 13.635269165039062, + "learning_rate": 9.89350185939385e-07, + "loss": 0.2727, + "num_input_tokens_seen": 897581056, + "step": 428 + }, + { + "epoch": 0.11635475996745322, + "grad_norm": 29.480058670043945, + "learning_rate": 9.8926270375745e-07, + "loss": 0.8433, + "num_input_tokens_seen": 899678208, + "step": 429 + }, + { + "epoch": 0.11662598318416056, + "grad_norm": 14.914876937866211, + "learning_rate": 9.891748680736064e-07, + "loss": 0.4803, + "num_input_tokens_seen": 901775360, + "step": 430 + }, + { + "epoch": 0.11689720640086791, + "grad_norm": 26.978042602539062, + "learning_rate": 9.890866789585407e-07, + "loss": 0.6345, + "num_input_tokens_seen": 903872512, + "step": 431 + }, + { + "epoch": 0.11716842961757526, + "grad_norm": 16.035795211791992, + "learning_rate": 9.889981364832245e-07, + "loss": 0.379, + "num_input_tokens_seen": 905969664, + "step": 432 + }, + { + "epoch": 0.11743965283428262, + "grad_norm": 13.532285690307617, + "learning_rate": 9.889092407189129e-07, + "loss": 0.3684, + "num_input_tokens_seen": 908066816, + "step": 433 + }, + { + "epoch": 0.11771087605098997, + "grad_norm": 16.438556671142578, + "learning_rate": 9.88819991737146e-07, + "loss": 0.4133, + "num_input_tokens_seen": 910163968, + "step": 434 + }, + { + "epoch": 0.11798209926769732, + "grad_norm": 19.439929962158203, + "learning_rate": 9.887303896097483e-07, + "loss": 0.6069, + "num_input_tokens_seen": 912261120, + "step": 435 + }, + { + "epoch": 0.11825332248440466, + "grad_norm": 19.114585876464844, + "learning_rate": 9.88640434408828e-07, + "loss": 0.7073, + "num_input_tokens_seen": 914358272, + "step": 436 + }, + { + "epoch": 0.11852454570111201, + "grad_norm": 16.253965377807617, + "learning_rate": 9.885501262067776e-07, + "loss": 0.4096, + "num_input_tokens_seen": 916455424, + "step": 437 + }, + { + "epoch": 0.11879576891781937, + "grad_norm": 32.117279052734375, + "learning_rate": 9.884594650762734e-07, + "loss": 0.9919, + "num_input_tokens_seen": 918552576, + "step": 438 + }, + { + "epoch": 0.11906699213452672, + "grad_norm": 18.713329315185547, + "learning_rate": 9.883684510902767e-07, + "loss": 0.5087, + "num_input_tokens_seen": 920649728, + "step": 439 + }, + { + "epoch": 0.11933821535123407, + "grad_norm": 19.065391540527344, + "learning_rate": 9.882770843220316e-07, + "loss": 0.6909, + "num_input_tokens_seen": 922746880, + "step": 440 + }, + { + "epoch": 0.11960943856794141, + "grad_norm": 17.04804039001465, + "learning_rate": 9.881853648450667e-07, + "loss": 0.4044, + "num_input_tokens_seen": 924844032, + "step": 441 + }, + { + "epoch": 0.11988066178464876, + "grad_norm": 16.544666290283203, + "learning_rate": 9.880932927331942e-07, + "loss": 0.393, + "num_input_tokens_seen": 926941184, + "step": 442 + }, + { + "epoch": 0.12015188500135611, + "grad_norm": 16.260129928588867, + "learning_rate": 9.880008680605104e-07, + "loss": 0.488, + "num_input_tokens_seen": 929038336, + "step": 443 + }, + { + "epoch": 0.12042310821806347, + "grad_norm": 17.787076950073242, + "learning_rate": 9.879080909013955e-07, + "loss": 0.3265, + "num_input_tokens_seen": 931135488, + "step": 444 + }, + { + "epoch": 0.12069433143477082, + "grad_norm": 19.43463134765625, + "learning_rate": 9.878149613305125e-07, + "loss": 0.5961, + "num_input_tokens_seen": 933232640, + "step": 445 + }, + { + "epoch": 0.12096555465147817, + "grad_norm": 13.461503028869629, + "learning_rate": 9.877214794228087e-07, + "loss": 0.3355, + "num_input_tokens_seen": 935329792, + "step": 446 + }, + { + "epoch": 0.12123677786818551, + "grad_norm": 21.85091781616211, + "learning_rate": 9.876276452535147e-07, + "loss": 0.6174, + "num_input_tokens_seen": 937426944, + "step": 447 + }, + { + "epoch": 0.12150800108489286, + "grad_norm": 15.593317031860352, + "learning_rate": 9.87533458898145e-07, + "loss": 0.3744, + "num_input_tokens_seen": 939524096, + "step": 448 + }, + { + "epoch": 0.12177922430160022, + "grad_norm": 14.874285697937012, + "learning_rate": 9.874389204324967e-07, + "loss": 0.4385, + "num_input_tokens_seen": 941621248, + "step": 449 + }, + { + "epoch": 0.12205044751830757, + "grad_norm": 16.8289737701416, + "learning_rate": 9.873440299326513e-07, + "loss": 0.5447, + "num_input_tokens_seen": 943718400, + "step": 450 + }, + { + "epoch": 0.12232167073501492, + "grad_norm": 17.553146362304688, + "learning_rate": 9.872487874749726e-07, + "loss": 0.5429, + "num_input_tokens_seen": 945815552, + "step": 451 + }, + { + "epoch": 0.12259289395172226, + "grad_norm": 17.820201873779297, + "learning_rate": 9.871531931361084e-07, + "loss": 0.4431, + "num_input_tokens_seen": 947912704, + "step": 452 + }, + { + "epoch": 0.12286411716842961, + "grad_norm": 19.300172805786133, + "learning_rate": 9.870572469929892e-07, + "loss": 0.5022, + "num_input_tokens_seen": 950009856, + "step": 453 + }, + { + "epoch": 0.12313534038513697, + "grad_norm": 15.839350700378418, + "learning_rate": 9.869609491228288e-07, + "loss": 0.3954, + "num_input_tokens_seen": 952107008, + "step": 454 + }, + { + "epoch": 0.12340656360184432, + "grad_norm": 12.486570358276367, + "learning_rate": 9.868642996031243e-07, + "loss": 0.3107, + "num_input_tokens_seen": 954204160, + "step": 455 + }, + { + "epoch": 0.12367778681855167, + "grad_norm": 17.16594886779785, + "learning_rate": 9.867672985116553e-07, + "loss": 0.4053, + "num_input_tokens_seen": 956301312, + "step": 456 + }, + { + "epoch": 0.12394901003525902, + "grad_norm": 18.139127731323242, + "learning_rate": 9.866699459264846e-07, + "loss": 0.5018, + "num_input_tokens_seen": 958398464, + "step": 457 + }, + { + "epoch": 0.12422023325196636, + "grad_norm": 16.21996307373047, + "learning_rate": 9.865722419259582e-07, + "loss": 0.4317, + "num_input_tokens_seen": 960495616, + "step": 458 + }, + { + "epoch": 0.12449145646867373, + "grad_norm": 27.431745529174805, + "learning_rate": 9.864741865887042e-07, + "loss": 0.8576, + "num_input_tokens_seen": 962592768, + "step": 459 + }, + { + "epoch": 0.12476267968538107, + "grad_norm": 21.745189666748047, + "learning_rate": 9.86375779993634e-07, + "loss": 0.6545, + "num_input_tokens_seen": 964689920, + "step": 460 + }, + { + "epoch": 0.1250339029020884, + "grad_norm": 18.489675521850586, + "learning_rate": 9.86277022219941e-07, + "loss": 0.6012, + "num_input_tokens_seen": 966787072, + "step": 461 + }, + { + "epoch": 0.12530512611879577, + "grad_norm": 25.21011734008789, + "learning_rate": 9.861779133471025e-07, + "loss": 0.9681, + "num_input_tokens_seen": 968884224, + "step": 462 + }, + { + "epoch": 0.12557634933550313, + "grad_norm": 20.88198471069336, + "learning_rate": 9.86078453454877e-07, + "loss": 0.3297, + "num_input_tokens_seen": 970981376, + "step": 463 + }, + { + "epoch": 0.12584757255221046, + "grad_norm": 17.169158935546875, + "learning_rate": 9.859786426233061e-07, + "loss": 0.4617, + "num_input_tokens_seen": 973078528, + "step": 464 + }, + { + "epoch": 0.12611879576891782, + "grad_norm": 13.607377052307129, + "learning_rate": 9.85878480932714e-07, + "loss": 0.3207, + "num_input_tokens_seen": 975175680, + "step": 465 + }, + { + "epoch": 0.12639001898562516, + "grad_norm": 24.081562042236328, + "learning_rate": 9.857779684637068e-07, + "loss": 0.8452, + "num_input_tokens_seen": 977272832, + "step": 466 + }, + { + "epoch": 0.12666124220233252, + "grad_norm": 15.01082992553711, + "learning_rate": 9.856771052971733e-07, + "loss": 0.3289, + "num_input_tokens_seen": 979369984, + "step": 467 + }, + { + "epoch": 0.12693246541903988, + "grad_norm": 21.138933181762695, + "learning_rate": 9.85575891514284e-07, + "loss": 0.6089, + "num_input_tokens_seen": 981467136, + "step": 468 + }, + { + "epoch": 0.12720368863574721, + "grad_norm": 15.94664478302002, + "learning_rate": 9.85474327196492e-07, + "loss": 0.3488, + "num_input_tokens_seen": 983564288, + "step": 469 + }, + { + "epoch": 0.12747491185245458, + "grad_norm": 16.990678787231445, + "learning_rate": 9.853724124255328e-07, + "loss": 0.4157, + "num_input_tokens_seen": 985661440, + "step": 470 + }, + { + "epoch": 0.1277461350691619, + "grad_norm": 26.423606872558594, + "learning_rate": 9.85270147283423e-07, + "loss": 0.6508, + "num_input_tokens_seen": 987758592, + "step": 471 + }, + { + "epoch": 0.12801735828586927, + "grad_norm": 25.734800338745117, + "learning_rate": 9.85167531852462e-07, + "loss": 0.8907, + "num_input_tokens_seen": 989855744, + "step": 472 + }, + { + "epoch": 0.12828858150257663, + "grad_norm": 15.54914379119873, + "learning_rate": 9.850645662152308e-07, + "loss": 0.3228, + "num_input_tokens_seen": 991952896, + "step": 473 + }, + { + "epoch": 0.12855980471928397, + "grad_norm": 17.9279727935791, + "learning_rate": 9.84961250454592e-07, + "loss": 0.4734, + "num_input_tokens_seen": 994050048, + "step": 474 + }, + { + "epoch": 0.12883102793599133, + "grad_norm": 16.743040084838867, + "learning_rate": 9.848575846536902e-07, + "loss": 0.4972, + "num_input_tokens_seen": 996147200, + "step": 475 + }, + { + "epoch": 0.12910225115269866, + "grad_norm": 15.414085388183594, + "learning_rate": 9.847535688959523e-07, + "loss": 0.5441, + "num_input_tokens_seen": 998244352, + "step": 476 + }, + { + "epoch": 0.12937347436940602, + "grad_norm": 21.557668685913086, + "learning_rate": 9.846492032650855e-07, + "loss": 0.5477, + "num_input_tokens_seen": 1000341504, + "step": 477 + }, + { + "epoch": 0.12964469758611338, + "grad_norm": 17.637996673583984, + "learning_rate": 9.845444878450794e-07, + "loss": 0.4591, + "num_input_tokens_seen": 1002438656, + "step": 478 + }, + { + "epoch": 0.12991592080282072, + "grad_norm": 19.702138900756836, + "learning_rate": 9.844394227202053e-07, + "loss": 0.6216, + "num_input_tokens_seen": 1004535808, + "step": 479 + }, + { + "epoch": 0.13018714401952808, + "grad_norm": 13.955646514892578, + "learning_rate": 9.843340079750154e-07, + "loss": 0.4318, + "num_input_tokens_seen": 1006632960, + "step": 480 + }, + { + "epoch": 0.1304583672362354, + "grad_norm": 12.4781494140625, + "learning_rate": 9.842282436943435e-07, + "loss": 0.3931, + "num_input_tokens_seen": 1008730112, + "step": 481 + }, + { + "epoch": 0.13072959045294277, + "grad_norm": 20.692325592041016, + "learning_rate": 9.841221299633049e-07, + "loss": 0.5389, + "num_input_tokens_seen": 1010827264, + "step": 482 + }, + { + "epoch": 0.13100081366965013, + "grad_norm": 21.611936569213867, + "learning_rate": 9.840156668672953e-07, + "loss": 0.644, + "num_input_tokens_seen": 1012924416, + "step": 483 + }, + { + "epoch": 0.13127203688635747, + "grad_norm": 10.290907859802246, + "learning_rate": 9.839088544919927e-07, + "loss": 0.3037, + "num_input_tokens_seen": 1015021568, + "step": 484 + }, + { + "epoch": 0.13154326010306483, + "grad_norm": 16.245471954345703, + "learning_rate": 9.838016929233555e-07, + "loss": 0.4827, + "num_input_tokens_seen": 1017118720, + "step": 485 + }, + { + "epoch": 0.13181448331977216, + "grad_norm": 22.62381935119629, + "learning_rate": 9.836941822476232e-07, + "loss": 0.3803, + "num_input_tokens_seen": 1019215872, + "step": 486 + }, + { + "epoch": 0.13208570653647952, + "grad_norm": 19.10595703125, + "learning_rate": 9.835863225513163e-07, + "loss": 0.6431, + "num_input_tokens_seen": 1021313024, + "step": 487 + }, + { + "epoch": 0.13235692975318689, + "grad_norm": 21.225801467895508, + "learning_rate": 9.83478113921236e-07, + "loss": 0.8905, + "num_input_tokens_seen": 1023410176, + "step": 488 + }, + { + "epoch": 0.13262815296989422, + "grad_norm": 17.778553009033203, + "learning_rate": 9.833695564444652e-07, + "loss": 0.5267, + "num_input_tokens_seen": 1025507328, + "step": 489 + }, + { + "epoch": 0.13289937618660158, + "grad_norm": 14.658015251159668, + "learning_rate": 9.832606502083658e-07, + "loss": 0.4774, + "num_input_tokens_seen": 1027604480, + "step": 490 + }, + { + "epoch": 0.13317059940330891, + "grad_norm": 20.545814514160156, + "learning_rate": 9.83151395300582e-07, + "loss": 0.5811, + "num_input_tokens_seen": 1029701632, + "step": 491 + }, + { + "epoch": 0.13344182262001628, + "grad_norm": 15.055420875549316, + "learning_rate": 9.83041791809038e-07, + "loss": 0.464, + "num_input_tokens_seen": 1031798784, + "step": 492 + }, + { + "epoch": 0.13371304583672364, + "grad_norm": 24.668725967407227, + "learning_rate": 9.829318398219385e-07, + "loss": 0.568, + "num_input_tokens_seen": 1033895936, + "step": 493 + }, + { + "epoch": 0.13398426905343097, + "grad_norm": 14.34689998626709, + "learning_rate": 9.828215394277686e-07, + "loss": 0.3721, + "num_input_tokens_seen": 1035993088, + "step": 494 + }, + { + "epoch": 0.13425549227013833, + "grad_norm": 18.64807891845703, + "learning_rate": 9.827108907152937e-07, + "loss": 0.5269, + "num_input_tokens_seen": 1038090240, + "step": 495 + }, + { + "epoch": 0.13452671548684567, + "grad_norm": 19.55156898498535, + "learning_rate": 9.825998937735599e-07, + "loss": 0.7396, + "num_input_tokens_seen": 1040187392, + "step": 496 + }, + { + "epoch": 0.13479793870355303, + "grad_norm": 22.422931671142578, + "learning_rate": 9.824885486918932e-07, + "loss": 0.6923, + "num_input_tokens_seen": 1042284544, + "step": 497 + }, + { + "epoch": 0.13506916192026036, + "grad_norm": 25.125507354736328, + "learning_rate": 9.823768555599e-07, + "loss": 0.8323, + "num_input_tokens_seen": 1044381696, + "step": 498 + }, + { + "epoch": 0.13534038513696772, + "grad_norm": 13.364242553710938, + "learning_rate": 9.822648144674664e-07, + "loss": 0.3095, + "num_input_tokens_seen": 1046478848, + "step": 499 + }, + { + "epoch": 0.13561160835367508, + "grad_norm": 15.256329536437988, + "learning_rate": 9.821524255047592e-07, + "loss": 0.4319, + "num_input_tokens_seen": 1048576000, + "step": 500 + }, + { + "epoch": 0.13588283157038242, + "grad_norm": 14.943821907043457, + "learning_rate": 9.820396887622245e-07, + "loss": 0.4693, + "num_input_tokens_seen": 1050673152, + "step": 501 + }, + { + "epoch": 0.13615405478708978, + "grad_norm": 14.425810813903809, + "learning_rate": 9.819266043305887e-07, + "loss": 0.4197, + "num_input_tokens_seen": 1052770304, + "step": 502 + }, + { + "epoch": 0.1364252780037971, + "grad_norm": 19.83941650390625, + "learning_rate": 9.818131723008576e-07, + "loss": 0.57, + "num_input_tokens_seen": 1054867456, + "step": 503 + }, + { + "epoch": 0.13669650122050447, + "grad_norm": 21.977590560913086, + "learning_rate": 9.816993927643174e-07, + "loss": 0.834, + "num_input_tokens_seen": 1056964608, + "step": 504 + }, + { + "epoch": 0.13696772443721184, + "grad_norm": 15.31468677520752, + "learning_rate": 9.815852658125332e-07, + "loss": 0.4329, + "num_input_tokens_seen": 1059061760, + "step": 505 + }, + { + "epoch": 0.13723894765391917, + "grad_norm": 21.36570167541504, + "learning_rate": 9.8147079153735e-07, + "loss": 0.6736, + "num_input_tokens_seen": 1061158912, + "step": 506 + }, + { + "epoch": 0.13751017087062653, + "grad_norm": 23.33683204650879, + "learning_rate": 9.813559700308925e-07, + "loss": 0.3126, + "num_input_tokens_seen": 1063256064, + "step": 507 + }, + { + "epoch": 0.13778139408733386, + "grad_norm": 20.741209030151367, + "learning_rate": 9.812408013855646e-07, + "loss": 0.7422, + "num_input_tokens_seen": 1065353216, + "step": 508 + }, + { + "epoch": 0.13805261730404123, + "grad_norm": 19.539844512939453, + "learning_rate": 9.811252856940496e-07, + "loss": 0.7108, + "num_input_tokens_seen": 1067450368, + "step": 509 + }, + { + "epoch": 0.1383238405207486, + "grad_norm": 9.709354400634766, + "learning_rate": 9.810094230493104e-07, + "loss": 0.2551, + "num_input_tokens_seen": 1069547520, + "step": 510 + }, + { + "epoch": 0.13859506373745592, + "grad_norm": 22.28021240234375, + "learning_rate": 9.808932135445885e-07, + "loss": 0.5543, + "num_input_tokens_seen": 1071644672, + "step": 511 + }, + { + "epoch": 0.13886628695416328, + "grad_norm": 18.827821731567383, + "learning_rate": 9.807766572734052e-07, + "loss": 0.6843, + "num_input_tokens_seen": 1073741824, + "step": 512 + }, + { + "epoch": 0.13913751017087062, + "grad_norm": 14.550572395324707, + "learning_rate": 9.806597543295603e-07, + "loss": 0.4268, + "num_input_tokens_seen": 1075838976, + "step": 513 + }, + { + "epoch": 0.13940873338757798, + "grad_norm": 19.760929107666016, + "learning_rate": 9.80542504807133e-07, + "loss": 0.6295, + "num_input_tokens_seen": 1077936128, + "step": 514 + }, + { + "epoch": 0.13967995660428534, + "grad_norm": 97.42051696777344, + "learning_rate": 9.804249088004812e-07, + "loss": 0.3536, + "num_input_tokens_seen": 1080033280, + "step": 515 + }, + { + "epoch": 0.13995117982099267, + "grad_norm": 13.345549583435059, + "learning_rate": 9.803069664042416e-07, + "loss": 0.3695, + "num_input_tokens_seen": 1082130432, + "step": 516 + }, + { + "epoch": 0.14022240303770003, + "grad_norm": 17.45563316345215, + "learning_rate": 9.801886777133297e-07, + "loss": 0.5706, + "num_input_tokens_seen": 1084227584, + "step": 517 + }, + { + "epoch": 0.14049362625440737, + "grad_norm": 20.731975555419922, + "learning_rate": 9.8007004282294e-07, + "loss": 0.4581, + "num_input_tokens_seen": 1086324736, + "step": 518 + }, + { + "epoch": 0.14076484947111473, + "grad_norm": 16.413488388061523, + "learning_rate": 9.799510618285454e-07, + "loss": 0.4683, + "num_input_tokens_seen": 1088421888, + "step": 519 + }, + { + "epoch": 0.1410360726878221, + "grad_norm": 23.03536033630371, + "learning_rate": 9.79831734825897e-07, + "loss": 0.7835, + "num_input_tokens_seen": 1090519040, + "step": 520 + }, + { + "epoch": 0.14130729590452942, + "grad_norm": 23.096599578857422, + "learning_rate": 9.797120619110245e-07, + "loss": 0.6651, + "num_input_tokens_seen": 1092616192, + "step": 521 + }, + { + "epoch": 0.14157851912123678, + "grad_norm": 20.34779167175293, + "learning_rate": 9.795920431802365e-07, + "loss": 0.7001, + "num_input_tokens_seen": 1094713344, + "step": 522 + }, + { + "epoch": 0.14184974233794412, + "grad_norm": 15.03393840789795, + "learning_rate": 9.794716787301194e-07, + "loss": 0.5215, + "num_input_tokens_seen": 1096810496, + "step": 523 + }, + { + "epoch": 0.14212096555465148, + "grad_norm": 22.243600845336914, + "learning_rate": 9.793509686575378e-07, + "loss": 0.7136, + "num_input_tokens_seen": 1098907648, + "step": 524 + }, + { + "epoch": 0.14239218877135884, + "grad_norm": 18.488739013671875, + "learning_rate": 9.792299130596346e-07, + "loss": 0.7065, + "num_input_tokens_seen": 1101004800, + "step": 525 + }, + { + "epoch": 0.14266341198806617, + "grad_norm": 20.99821662902832, + "learning_rate": 9.79108512033831e-07, + "loss": 0.8054, + "num_input_tokens_seen": 1103101952, + "step": 526 + }, + { + "epoch": 0.14293463520477354, + "grad_norm": 21.02731704711914, + "learning_rate": 9.789867656778254e-07, + "loss": 0.5969, + "num_input_tokens_seen": 1105199104, + "step": 527 + }, + { + "epoch": 0.14320585842148087, + "grad_norm": 18.518896102905273, + "learning_rate": 9.788646740895952e-07, + "loss": 0.6722, + "num_input_tokens_seen": 1107296256, + "step": 528 + }, + { + "epoch": 0.14347708163818823, + "grad_norm": 23.96672248840332, + "learning_rate": 9.787422373673945e-07, + "loss": 0.4893, + "num_input_tokens_seen": 1109393408, + "step": 529 + }, + { + "epoch": 0.1437483048548956, + "grad_norm": 18.719541549682617, + "learning_rate": 9.786194556097564e-07, + "loss": 0.6283, + "num_input_tokens_seen": 1111490560, + "step": 530 + }, + { + "epoch": 0.14401952807160293, + "grad_norm": 25.956584930419922, + "learning_rate": 9.784963289154902e-07, + "loss": 0.9686, + "num_input_tokens_seen": 1113587712, + "step": 531 + }, + { + "epoch": 0.1442907512883103, + "grad_norm": 14.612069129943848, + "learning_rate": 9.783728573836843e-07, + "loss": 0.4835, + "num_input_tokens_seen": 1115684864, + "step": 532 + }, + { + "epoch": 0.14456197450501762, + "grad_norm": 11.605941772460938, + "learning_rate": 9.782490411137035e-07, + "loss": 0.3696, + "num_input_tokens_seen": 1117782016, + "step": 533 + }, + { + "epoch": 0.14483319772172498, + "grad_norm": 17.17340850830078, + "learning_rate": 9.781248802051904e-07, + "loss": 0.602, + "num_input_tokens_seen": 1119879168, + "step": 534 + }, + { + "epoch": 0.14510442093843234, + "grad_norm": 17.984195709228516, + "learning_rate": 9.780003747580651e-07, + "loss": 0.6911, + "num_input_tokens_seen": 1121976320, + "step": 535 + }, + { + "epoch": 0.14537564415513968, + "grad_norm": 12.63048267364502, + "learning_rate": 9.778755248725248e-07, + "loss": 0.4061, + "num_input_tokens_seen": 1124073472, + "step": 536 + }, + { + "epoch": 0.14564686737184704, + "grad_norm": 15.961474418640137, + "learning_rate": 9.77750330649044e-07, + "loss": 0.5802, + "num_input_tokens_seen": 1126170624, + "step": 537 + }, + { + "epoch": 0.14591809058855437, + "grad_norm": 26.961637496948242, + "learning_rate": 9.776247921883743e-07, + "loss": 0.9477, + "num_input_tokens_seen": 1128267776, + "step": 538 + }, + { + "epoch": 0.14618931380526173, + "grad_norm": 16.94624137878418, + "learning_rate": 9.774989095915442e-07, + "loss": 0.4624, + "num_input_tokens_seen": 1130364928, + "step": 539 + }, + { + "epoch": 0.14646053702196907, + "grad_norm": 21.434429168701172, + "learning_rate": 9.77372682959859e-07, + "loss": 0.715, + "num_input_tokens_seen": 1132462080, + "step": 540 + }, + { + "epoch": 0.14673176023867643, + "grad_norm": 17.671539306640625, + "learning_rate": 9.772461123949015e-07, + "loss": 0.6978, + "num_input_tokens_seen": 1134559232, + "step": 541 + }, + { + "epoch": 0.1470029834553838, + "grad_norm": 19.538497924804688, + "learning_rate": 9.771191979985303e-07, + "loss": 0.7391, + "num_input_tokens_seen": 1136656384, + "step": 542 + }, + { + "epoch": 0.14727420667209112, + "grad_norm": 13.516701698303223, + "learning_rate": 9.76991939872882e-07, + "loss": 0.3681, + "num_input_tokens_seen": 1138753536, + "step": 543 + }, + { + "epoch": 0.14754542988879848, + "grad_norm": 15.903840065002441, + "learning_rate": 9.768643381203686e-07, + "loss": 0.5984, + "num_input_tokens_seen": 1140850688, + "step": 544 + }, + { + "epoch": 0.14781665310550582, + "grad_norm": 18.386430740356445, + "learning_rate": 9.767363928436793e-07, + "loss": 0.5663, + "num_input_tokens_seen": 1142947840, + "step": 545 + }, + { + "epoch": 0.14808787632221318, + "grad_norm": 21.846033096313477, + "learning_rate": 9.766081041457795e-07, + "loss": 0.6786, + "num_input_tokens_seen": 1145044992, + "step": 546 + }, + { + "epoch": 0.14835909953892054, + "grad_norm": 18.91754722595215, + "learning_rate": 9.764794721299113e-07, + "loss": 0.7288, + "num_input_tokens_seen": 1147142144, + "step": 547 + }, + { + "epoch": 0.14863032275562787, + "grad_norm": 16.840843200683594, + "learning_rate": 9.763504968995927e-07, + "loss": 0.3676, + "num_input_tokens_seen": 1149239296, + "step": 548 + }, + { + "epoch": 0.14890154597233524, + "grad_norm": 24.498634338378906, + "learning_rate": 9.762211785586178e-07, + "loss": 0.7899, + "num_input_tokens_seen": 1151336448, + "step": 549 + }, + { + "epoch": 0.14917276918904257, + "grad_norm": 16.3289737701416, + "learning_rate": 9.760915172110574e-07, + "loss": 0.4054, + "num_input_tokens_seen": 1153433600, + "step": 550 + }, + { + "epoch": 0.14944399240574993, + "grad_norm": 23.780595779418945, + "learning_rate": 9.759615129612579e-07, + "loss": 0.3006, + "num_input_tokens_seen": 1155530752, + "step": 551 + }, + { + "epoch": 0.1497152156224573, + "grad_norm": 19.122037887573242, + "learning_rate": 9.75831165913842e-07, + "loss": 0.6412, + "num_input_tokens_seen": 1157627904, + "step": 552 + }, + { + "epoch": 0.14998643883916463, + "grad_norm": 19.873661041259766, + "learning_rate": 9.757004761737077e-07, + "loss": 0.4966, + "num_input_tokens_seen": 1159725056, + "step": 553 + }, + { + "epoch": 0.150257662055872, + "grad_norm": 14.898545265197754, + "learning_rate": 9.755694438460293e-07, + "loss": 0.4161, + "num_input_tokens_seen": 1161822208, + "step": 554 + }, + { + "epoch": 0.15052888527257932, + "grad_norm": 18.01747703552246, + "learning_rate": 9.754380690362565e-07, + "loss": 0.5063, + "num_input_tokens_seen": 1163919360, + "step": 555 + }, + { + "epoch": 0.15080010848928668, + "grad_norm": 15.000629425048828, + "learning_rate": 9.75306351850115e-07, + "loss": 0.4822, + "num_input_tokens_seen": 1166016512, + "step": 556 + }, + { + "epoch": 0.15107133170599404, + "grad_norm": 17.407854080200195, + "learning_rate": 9.751742923936055e-07, + "loss": 0.4074, + "num_input_tokens_seen": 1168113664, + "step": 557 + }, + { + "epoch": 0.15134255492270138, + "grad_norm": 21.74121856689453, + "learning_rate": 9.75041890773005e-07, + "loss": 0.5514, + "num_input_tokens_seen": 1170210816, + "step": 558 + }, + { + "epoch": 0.15161377813940874, + "grad_norm": 16.295948028564453, + "learning_rate": 9.749091470948643e-07, + "loss": 0.4223, + "num_input_tokens_seen": 1172307968, + "step": 559 + }, + { + "epoch": 0.15188500135611607, + "grad_norm": 14.042396545410156, + "learning_rate": 9.747760614660111e-07, + "loss": 0.4214, + "num_input_tokens_seen": 1174405120, + "step": 560 + }, + { + "epoch": 0.15215622457282343, + "grad_norm": 22.4013614654541, + "learning_rate": 9.746426339935477e-07, + "loss": 0.4355, + "num_input_tokens_seen": 1176502272, + "step": 561 + }, + { + "epoch": 0.1524274477895308, + "grad_norm": 11.559626579284668, + "learning_rate": 9.745088647848515e-07, + "loss": 0.3142, + "num_input_tokens_seen": 1178599424, + "step": 562 + }, + { + "epoch": 0.15269867100623813, + "grad_norm": 18.684513092041016, + "learning_rate": 9.743747539475744e-07, + "loss": 0.7304, + "num_input_tokens_seen": 1180696576, + "step": 563 + }, + { + "epoch": 0.1529698942229455, + "grad_norm": 17.113676071166992, + "learning_rate": 9.74240301589644e-07, + "loss": 0.5681, + "num_input_tokens_seen": 1182793728, + "step": 564 + }, + { + "epoch": 0.15324111743965282, + "grad_norm": 13.977578163146973, + "learning_rate": 9.741055078192626e-07, + "loss": 0.4496, + "num_input_tokens_seen": 1184890880, + "step": 565 + }, + { + "epoch": 0.15351234065636019, + "grad_norm": 15.038694381713867, + "learning_rate": 9.739703727449068e-07, + "loss": 0.5166, + "num_input_tokens_seen": 1186988032, + "step": 566 + }, + { + "epoch": 0.15378356387306755, + "grad_norm": 15.867288589477539, + "learning_rate": 9.738348964753283e-07, + "loss": 0.3617, + "num_input_tokens_seen": 1189085184, + "step": 567 + }, + { + "epoch": 0.15405478708977488, + "grad_norm": 13.204479217529297, + "learning_rate": 9.736990791195532e-07, + "loss": 0.4219, + "num_input_tokens_seen": 1191182336, + "step": 568 + }, + { + "epoch": 0.15432601030648224, + "grad_norm": 13.286027908325195, + "learning_rate": 9.735629207868824e-07, + "loss": 0.3559, + "num_input_tokens_seen": 1193279488, + "step": 569 + }, + { + "epoch": 0.15459723352318958, + "grad_norm": 15.835883140563965, + "learning_rate": 9.734264215868904e-07, + "loss": 0.4897, + "num_input_tokens_seen": 1195376640, + "step": 570 + }, + { + "epoch": 0.15486845673989694, + "grad_norm": 17.764678955078125, + "learning_rate": 9.73289581629427e-07, + "loss": 0.4451, + "num_input_tokens_seen": 1197473792, + "step": 571 + }, + { + "epoch": 0.1551396799566043, + "grad_norm": 21.274494171142578, + "learning_rate": 9.73152401024616e-07, + "loss": 0.6454, + "num_input_tokens_seen": 1199570944, + "step": 572 + }, + { + "epoch": 0.15541090317331163, + "grad_norm": 17.158714294433594, + "learning_rate": 9.730148798828543e-07, + "loss": 0.5486, + "num_input_tokens_seen": 1201668096, + "step": 573 + }, + { + "epoch": 0.155682126390019, + "grad_norm": 23.825565338134766, + "learning_rate": 9.728770183148143e-07, + "loss": 0.7084, + "num_input_tokens_seen": 1203765248, + "step": 574 + }, + { + "epoch": 0.15595334960672633, + "grad_norm": 19.30858612060547, + "learning_rate": 9.727388164314415e-07, + "loss": 0.6812, + "num_input_tokens_seen": 1205862400, + "step": 575 + }, + { + "epoch": 0.1562245728234337, + "grad_norm": 17.701261520385742, + "learning_rate": 9.72600274343956e-07, + "loss": 0.5748, + "num_input_tokens_seen": 1207959552, + "step": 576 + }, + { + "epoch": 0.15649579604014105, + "grad_norm": 11.965795516967773, + "learning_rate": 9.724613921638506e-07, + "loss": 0.2864, + "num_input_tokens_seen": 1210056704, + "step": 577 + }, + { + "epoch": 0.15676701925684838, + "grad_norm": 55.033424377441406, + "learning_rate": 9.723221700028928e-07, + "loss": 0.5696, + "num_input_tokens_seen": 1212153856, + "step": 578 + }, + { + "epoch": 0.15703824247355574, + "grad_norm": 29.383079528808594, + "learning_rate": 9.72182607973123e-07, + "loss": 0.6414, + "num_input_tokens_seen": 1214251008, + "step": 579 + }, + { + "epoch": 0.15730946569026308, + "grad_norm": 19.812969207763672, + "learning_rate": 9.720427061868558e-07, + "loss": 0.6445, + "num_input_tokens_seen": 1216348160, + "step": 580 + }, + { + "epoch": 0.15758068890697044, + "grad_norm": 15.276076316833496, + "learning_rate": 9.71902464756678e-07, + "loss": 0.3634, + "num_input_tokens_seen": 1218445312, + "step": 581 + }, + { + "epoch": 0.15785191212367777, + "grad_norm": 17.89413070678711, + "learning_rate": 9.717618837954517e-07, + "loss": 0.5405, + "num_input_tokens_seen": 1220542464, + "step": 582 + }, + { + "epoch": 0.15812313534038513, + "grad_norm": 17.395740509033203, + "learning_rate": 9.716209634163102e-07, + "loss": 0.5115, + "num_input_tokens_seen": 1222639616, + "step": 583 + }, + { + "epoch": 0.1583943585570925, + "grad_norm": 13.388041496276855, + "learning_rate": 9.714797037326616e-07, + "loss": 0.3766, + "num_input_tokens_seen": 1224736768, + "step": 584 + }, + { + "epoch": 0.15866558177379983, + "grad_norm": 22.101512908935547, + "learning_rate": 9.713381048581855e-07, + "loss": 0.7648, + "num_input_tokens_seen": 1226833920, + "step": 585 + }, + { + "epoch": 0.1589368049905072, + "grad_norm": 18.211149215698242, + "learning_rate": 9.71196166906836e-07, + "loss": 0.4216, + "num_input_tokens_seen": 1228931072, + "step": 586 + }, + { + "epoch": 0.15920802820721452, + "grad_norm": 18.592321395874023, + "learning_rate": 9.71053889992839e-07, + "loss": 0.6394, + "num_input_tokens_seen": 1231028224, + "step": 587 + }, + { + "epoch": 0.1594792514239219, + "grad_norm": 13.54053020477295, + "learning_rate": 9.709112742306936e-07, + "loss": 0.4373, + "num_input_tokens_seen": 1233125376, + "step": 588 + }, + { + "epoch": 0.15975047464062925, + "grad_norm": 12.805086135864258, + "learning_rate": 9.707683197351715e-07, + "loss": 0.3885, + "num_input_tokens_seen": 1235222528, + "step": 589 + }, + { + "epoch": 0.16002169785733658, + "grad_norm": 12.619560241699219, + "learning_rate": 9.706250266213173e-07, + "loss": 0.2475, + "num_input_tokens_seen": 1237319680, + "step": 590 + }, + { + "epoch": 0.16029292107404394, + "grad_norm": 13.020508766174316, + "learning_rate": 9.704813950044476e-07, + "loss": 0.3102, + "num_input_tokens_seen": 1239416832, + "step": 591 + }, + { + "epoch": 0.16056414429075128, + "grad_norm": 22.488468170166016, + "learning_rate": 9.703374250001516e-07, + "loss": 0.7365, + "num_input_tokens_seen": 1241513984, + "step": 592 + }, + { + "epoch": 0.16083536750745864, + "grad_norm": 17.70863151550293, + "learning_rate": 9.70193116724291e-07, + "loss": 0.6276, + "num_input_tokens_seen": 1243611136, + "step": 593 + }, + { + "epoch": 0.161106590724166, + "grad_norm": 19.667980194091797, + "learning_rate": 9.700484702929996e-07, + "loss": 0.5559, + "num_input_tokens_seen": 1245708288, + "step": 594 + }, + { + "epoch": 0.16137781394087333, + "grad_norm": 21.86808967590332, + "learning_rate": 9.699034858226834e-07, + "loss": 0.5977, + "num_input_tokens_seen": 1247805440, + "step": 595 + }, + { + "epoch": 0.1616490371575807, + "grad_norm": 14.311076164245605, + "learning_rate": 9.697581634300202e-07, + "loss": 0.4061, + "num_input_tokens_seen": 1249902592, + "step": 596 + }, + { + "epoch": 0.16192026037428803, + "grad_norm": 22.86678123474121, + "learning_rate": 9.6961250323196e-07, + "loss": 0.8865, + "num_input_tokens_seen": 1251999744, + "step": 597 + }, + { + "epoch": 0.1621914835909954, + "grad_norm": 27.982072830200195, + "learning_rate": 9.69466505345725e-07, + "loss": 0.9566, + "num_input_tokens_seen": 1254096896, + "step": 598 + }, + { + "epoch": 0.16246270680770275, + "grad_norm": 20.294218063354492, + "learning_rate": 9.69320169888808e-07, + "loss": 0.6676, + "num_input_tokens_seen": 1256194048, + "step": 599 + }, + { + "epoch": 0.16273393002441008, + "grad_norm": 19.871686935424805, + "learning_rate": 9.691734969789746e-07, + "loss": 0.5383, + "num_input_tokens_seen": 1258291200, + "step": 600 + }, + { + "epoch": 0.16300515324111745, + "grad_norm": 18.279027938842773, + "learning_rate": 9.690264867342618e-07, + "loss": 0.4495, + "num_input_tokens_seen": 1260388352, + "step": 601 + }, + { + "epoch": 0.16327637645782478, + "grad_norm": 16.278715133666992, + "learning_rate": 9.688791392729775e-07, + "loss": 0.5764, + "num_input_tokens_seen": 1262485504, + "step": 602 + }, + { + "epoch": 0.16354759967453214, + "grad_norm": 22.23771095275879, + "learning_rate": 9.687314547137016e-07, + "loss": 0.4324, + "num_input_tokens_seen": 1264582656, + "step": 603 + }, + { + "epoch": 0.1638188228912395, + "grad_norm": 12.856112480163574, + "learning_rate": 9.685834331752846e-07, + "loss": 0.2747, + "num_input_tokens_seen": 1266679808, + "step": 604 + }, + { + "epoch": 0.16409004610794684, + "grad_norm": 22.844928741455078, + "learning_rate": 9.684350747768492e-07, + "loss": 0.9193, + "num_input_tokens_seen": 1268776960, + "step": 605 + }, + { + "epoch": 0.1643612693246542, + "grad_norm": 29.020858764648438, + "learning_rate": 9.68286379637788e-07, + "loss": 0.4867, + "num_input_tokens_seen": 1270874112, + "step": 606 + }, + { + "epoch": 0.16463249254136153, + "grad_norm": 14.029394149780273, + "learning_rate": 9.681373478777654e-07, + "loss": 0.3723, + "num_input_tokens_seen": 1272971264, + "step": 607 + }, + { + "epoch": 0.1649037157580689, + "grad_norm": 15.189117431640625, + "learning_rate": 9.679879796167166e-07, + "loss": 0.4124, + "num_input_tokens_seen": 1275068416, + "step": 608 + }, + { + "epoch": 0.16517493897477625, + "grad_norm": 19.106834411621094, + "learning_rate": 9.678382749748477e-07, + "loss": 0.5435, + "num_input_tokens_seen": 1277165568, + "step": 609 + }, + { + "epoch": 0.1654461621914836, + "grad_norm": 17.628881454467773, + "learning_rate": 9.676882340726345e-07, + "loss": 0.5957, + "num_input_tokens_seen": 1279262720, + "step": 610 + }, + { + "epoch": 0.16571738540819095, + "grad_norm": 23.732179641723633, + "learning_rate": 9.675378570308253e-07, + "loss": 0.6903, + "num_input_tokens_seen": 1281359872, + "step": 611 + }, + { + "epoch": 0.16598860862489828, + "grad_norm": 13.643631935119629, + "learning_rate": 9.673871439704369e-07, + "loss": 0.395, + "num_input_tokens_seen": 1283457024, + "step": 612 + }, + { + "epoch": 0.16625983184160564, + "grad_norm": 17.708269119262695, + "learning_rate": 9.672360950127578e-07, + "loss": 0.5302, + "num_input_tokens_seen": 1285554176, + "step": 613 + }, + { + "epoch": 0.166531055058313, + "grad_norm": 13.756535530090332, + "learning_rate": 9.670847102793464e-07, + "loss": 0.3048, + "num_input_tokens_seen": 1287651328, + "step": 614 + }, + { + "epoch": 0.16680227827502034, + "grad_norm": 16.69892692565918, + "learning_rate": 9.669329898920317e-07, + "loss": 0.5407, + "num_input_tokens_seen": 1289748480, + "step": 615 + }, + { + "epoch": 0.1670735014917277, + "grad_norm": 22.133914947509766, + "learning_rate": 9.66780933972912e-07, + "loss": 0.626, + "num_input_tokens_seen": 1291845632, + "step": 616 + }, + { + "epoch": 0.16734472470843503, + "grad_norm": 13.63410472869873, + "learning_rate": 9.666285426443564e-07, + "loss": 0.3245, + "num_input_tokens_seen": 1293942784, + "step": 617 + }, + { + "epoch": 0.1676159479251424, + "grad_norm": 15.757242202758789, + "learning_rate": 9.664758160290036e-07, + "loss": 0.4598, + "num_input_tokens_seen": 1296039936, + "step": 618 + }, + { + "epoch": 0.16788717114184976, + "grad_norm": 25.88416290283203, + "learning_rate": 9.66322754249762e-07, + "loss": 0.6762, + "num_input_tokens_seen": 1298137088, + "step": 619 + }, + { + "epoch": 0.1681583943585571, + "grad_norm": 25.853208541870117, + "learning_rate": 9.661693574298102e-07, + "loss": 0.8816, + "num_input_tokens_seen": 1300234240, + "step": 620 + }, + { + "epoch": 0.16842961757526445, + "grad_norm": 15.599662780761719, + "learning_rate": 9.66015625692596e-07, + "loss": 0.4388, + "num_input_tokens_seen": 1302331392, + "step": 621 + }, + { + "epoch": 0.16870084079197178, + "grad_norm": 12.559205055236816, + "learning_rate": 9.658615591618366e-07, + "loss": 0.3097, + "num_input_tokens_seen": 1304428544, + "step": 622 + }, + { + "epoch": 0.16897206400867915, + "grad_norm": 15.277921676635742, + "learning_rate": 9.657071579615191e-07, + "loss": 0.4326, + "num_input_tokens_seen": 1306525696, + "step": 623 + }, + { + "epoch": 0.16924328722538648, + "grad_norm": 23.061508178710938, + "learning_rate": 9.655524222159e-07, + "loss": 0.7207, + "num_input_tokens_seen": 1308622848, + "step": 624 + }, + { + "epoch": 0.16951451044209384, + "grad_norm": 21.415958404541016, + "learning_rate": 9.653973520495042e-07, + "loss": 0.7055, + "num_input_tokens_seen": 1310720000, + "step": 625 + }, + { + "epoch": 0.1697857336588012, + "grad_norm": 16.70574188232422, + "learning_rate": 9.652419475871267e-07, + "loss": 0.5012, + "num_input_tokens_seen": 1312817152, + "step": 626 + }, + { + "epoch": 0.17005695687550854, + "grad_norm": 26.579748153686523, + "learning_rate": 9.650862089538307e-07, + "loss": 0.514, + "num_input_tokens_seen": 1314914304, + "step": 627 + }, + { + "epoch": 0.1703281800922159, + "grad_norm": 23.841176986694336, + "learning_rate": 9.64930136274949e-07, + "loss": 0.6051, + "num_input_tokens_seen": 1317011456, + "step": 628 + }, + { + "epoch": 0.17059940330892323, + "grad_norm": 18.95991325378418, + "learning_rate": 9.647737296760828e-07, + "loss": 0.6007, + "num_input_tokens_seen": 1319108608, + "step": 629 + }, + { + "epoch": 0.1708706265256306, + "grad_norm": 15.76582145690918, + "learning_rate": 9.646169892831025e-07, + "loss": 0.4593, + "num_input_tokens_seen": 1321205760, + "step": 630 + }, + { + "epoch": 0.17114184974233795, + "grad_norm": 18.453969955444336, + "learning_rate": 9.644599152221465e-07, + "loss": 0.5481, + "num_input_tokens_seen": 1323302912, + "step": 631 + }, + { + "epoch": 0.1714130729590453, + "grad_norm": 16.834604263305664, + "learning_rate": 9.643025076196219e-07, + "loss": 0.592, + "num_input_tokens_seen": 1325400064, + "step": 632 + }, + { + "epoch": 0.17168429617575265, + "grad_norm": 18.951011657714844, + "learning_rate": 9.641447666022048e-07, + "loss": 0.5194, + "num_input_tokens_seen": 1327497216, + "step": 633 + }, + { + "epoch": 0.17195551939245998, + "grad_norm": 22.448795318603516, + "learning_rate": 9.639866922968387e-07, + "loss": 0.6618, + "num_input_tokens_seen": 1329594368, + "step": 634 + }, + { + "epoch": 0.17222674260916734, + "grad_norm": 23.163719177246094, + "learning_rate": 9.638282848307361e-07, + "loss": 0.7076, + "num_input_tokens_seen": 1331691520, + "step": 635 + }, + { + "epoch": 0.1724979658258747, + "grad_norm": 21.734291076660156, + "learning_rate": 9.636695443313773e-07, + "loss": 0.6584, + "num_input_tokens_seen": 1333788672, + "step": 636 + }, + { + "epoch": 0.17276918904258204, + "grad_norm": 26.138351440429688, + "learning_rate": 9.635104709265103e-07, + "loss": 0.7811, + "num_input_tokens_seen": 1335885824, + "step": 637 + }, + { + "epoch": 0.1730404122592894, + "grad_norm": 14.809904098510742, + "learning_rate": 9.633510647441518e-07, + "loss": 0.3775, + "num_input_tokens_seen": 1337982976, + "step": 638 + }, + { + "epoch": 0.17331163547599673, + "grad_norm": 23.86665916442871, + "learning_rate": 9.631913259125854e-07, + "loss": 0.9476, + "num_input_tokens_seen": 1340080128, + "step": 639 + }, + { + "epoch": 0.1735828586927041, + "grad_norm": 12.233245849609375, + "learning_rate": 9.630312545603631e-07, + "loss": 0.3191, + "num_input_tokens_seen": 1342177280, + "step": 640 + }, + { + "epoch": 0.17385408190941146, + "grad_norm": 25.316692352294922, + "learning_rate": 9.628708508163041e-07, + "loss": 0.7073, + "num_input_tokens_seen": 1344274432, + "step": 641 + }, + { + "epoch": 0.1741253051261188, + "grad_norm": 11.938849449157715, + "learning_rate": 9.627101148094952e-07, + "loss": 0.3728, + "num_input_tokens_seen": 1346371584, + "step": 642 + }, + { + "epoch": 0.17439652834282615, + "grad_norm": 24.418840408325195, + "learning_rate": 9.625490466692906e-07, + "loss": 0.7563, + "num_input_tokens_seen": 1348468736, + "step": 643 + }, + { + "epoch": 0.17466775155953349, + "grad_norm": 15.90093994140625, + "learning_rate": 9.623876465253122e-07, + "loss": 0.4849, + "num_input_tokens_seen": 1350565888, + "step": 644 + }, + { + "epoch": 0.17493897477624085, + "grad_norm": 18.447975158691406, + "learning_rate": 9.622259145074482e-07, + "loss": 0.4677, + "num_input_tokens_seen": 1352663040, + "step": 645 + }, + { + "epoch": 0.1752101979929482, + "grad_norm": 17.22858238220215, + "learning_rate": 9.620638507458547e-07, + "loss": 0.6084, + "num_input_tokens_seen": 1354760192, + "step": 646 + }, + { + "epoch": 0.17548142120965554, + "grad_norm": 21.59812355041504, + "learning_rate": 9.619014553709542e-07, + "loss": 0.8913, + "num_input_tokens_seen": 1356857344, + "step": 647 + }, + { + "epoch": 0.1757526444263629, + "grad_norm": 23.786243438720703, + "learning_rate": 9.617387285134364e-07, + "loss": 0.8757, + "num_input_tokens_seen": 1358954496, + "step": 648 + }, + { + "epoch": 0.17602386764307024, + "grad_norm": 12.938894271850586, + "learning_rate": 9.615756703042575e-07, + "loss": 0.4185, + "num_input_tokens_seen": 1361051648, + "step": 649 + }, + { + "epoch": 0.1762950908597776, + "grad_norm": 21.32219696044922, + "learning_rate": 9.61412280874641e-07, + "loss": 0.472, + "num_input_tokens_seen": 1363148800, + "step": 650 + }, + { + "epoch": 0.17656631407648496, + "grad_norm": 20.01197624206543, + "learning_rate": 9.612485603560763e-07, + "loss": 0.8356, + "num_input_tokens_seen": 1365245952, + "step": 651 + }, + { + "epoch": 0.1768375372931923, + "grad_norm": 17.365571975708008, + "learning_rate": 9.610845088803194e-07, + "loss": 0.532, + "num_input_tokens_seen": 1367343104, + "step": 652 + }, + { + "epoch": 0.17710876050989965, + "grad_norm": 18.095643997192383, + "learning_rate": 9.609201265793927e-07, + "loss": 0.6, + "num_input_tokens_seen": 1369440256, + "step": 653 + }, + { + "epoch": 0.177379983726607, + "grad_norm": 18.0372257232666, + "learning_rate": 9.607554135855847e-07, + "loss": 0.5181, + "num_input_tokens_seen": 1371537408, + "step": 654 + }, + { + "epoch": 0.17765120694331435, + "grad_norm": 19.283889770507812, + "learning_rate": 9.605903700314503e-07, + "loss": 0.4867, + "num_input_tokens_seen": 1373634560, + "step": 655 + }, + { + "epoch": 0.1779224301600217, + "grad_norm": 15.220179557800293, + "learning_rate": 9.604249960498102e-07, + "loss": 0.3913, + "num_input_tokens_seen": 1375731712, + "step": 656 + }, + { + "epoch": 0.17819365337672904, + "grad_norm": 16.81816864013672, + "learning_rate": 9.602592917737512e-07, + "loss": 0.4373, + "num_input_tokens_seen": 1377828864, + "step": 657 + }, + { + "epoch": 0.1784648765934364, + "grad_norm": 27.435163497924805, + "learning_rate": 9.600932573366254e-07, + "loss": 0.9561, + "num_input_tokens_seen": 1379926016, + "step": 658 + }, + { + "epoch": 0.17873609981014374, + "grad_norm": 13.818350791931152, + "learning_rate": 9.599268928720518e-07, + "loss": 0.414, + "num_input_tokens_seen": 1382023168, + "step": 659 + }, + { + "epoch": 0.1790073230268511, + "grad_norm": 13.987568855285645, + "learning_rate": 9.597601985139132e-07, + "loss": 0.3532, + "num_input_tokens_seen": 1384120320, + "step": 660 + }, + { + "epoch": 0.17927854624355846, + "grad_norm": 38.5766487121582, + "learning_rate": 9.595931743963596e-07, + "loss": 0.501, + "num_input_tokens_seen": 1386217472, + "step": 661 + }, + { + "epoch": 0.1795497694602658, + "grad_norm": 15.663421630859375, + "learning_rate": 9.594258206538054e-07, + "loss": 0.4319, + "num_input_tokens_seen": 1388314624, + "step": 662 + }, + { + "epoch": 0.17982099267697316, + "grad_norm": 14.648031234741211, + "learning_rate": 9.592581374209306e-07, + "loss": 0.3603, + "num_input_tokens_seen": 1390411776, + "step": 663 + }, + { + "epoch": 0.1800922158936805, + "grad_norm": 19.08344078063965, + "learning_rate": 9.590901248326802e-07, + "loss": 0.5985, + "num_input_tokens_seen": 1392508928, + "step": 664 + }, + { + "epoch": 0.18036343911038785, + "grad_norm": 14.050150871276855, + "learning_rate": 9.589217830242645e-07, + "loss": 0.3836, + "num_input_tokens_seen": 1394606080, + "step": 665 + }, + { + "epoch": 0.18063466232709519, + "grad_norm": 13.421784400939941, + "learning_rate": 9.587531121311582e-07, + "loss": 0.3193, + "num_input_tokens_seen": 1396703232, + "step": 666 + }, + { + "epoch": 0.18090588554380255, + "grad_norm": 22.5211181640625, + "learning_rate": 9.585841122891016e-07, + "loss": 0.8333, + "num_input_tokens_seen": 1398800384, + "step": 667 + }, + { + "epoch": 0.1811771087605099, + "grad_norm": 18.792020797729492, + "learning_rate": 9.584147836340992e-07, + "loss": 0.578, + "num_input_tokens_seen": 1400897536, + "step": 668 + }, + { + "epoch": 0.18144833197721724, + "grad_norm": 16.05472755432129, + "learning_rate": 9.582451263024202e-07, + "loss": 0.5295, + "num_input_tokens_seen": 1402994688, + "step": 669 + }, + { + "epoch": 0.1817195551939246, + "grad_norm": 24.35643768310547, + "learning_rate": 9.580751404305985e-07, + "loss": 0.7578, + "num_input_tokens_seen": 1405091840, + "step": 670 + }, + { + "epoch": 0.18199077841063194, + "grad_norm": 21.20308494567871, + "learning_rate": 9.579048261554321e-07, + "loss": 0.9025, + "num_input_tokens_seen": 1407188992, + "step": 671 + }, + { + "epoch": 0.1822620016273393, + "grad_norm": 19.037288665771484, + "learning_rate": 9.577341836139837e-07, + "loss": 0.4646, + "num_input_tokens_seen": 1409286144, + "step": 672 + }, + { + "epoch": 0.18253322484404666, + "grad_norm": 18.97392463684082, + "learning_rate": 9.575632129435796e-07, + "loss": 0.5964, + "num_input_tokens_seen": 1411383296, + "step": 673 + }, + { + "epoch": 0.182804448060754, + "grad_norm": 12.664912223815918, + "learning_rate": 9.573919142818109e-07, + "loss": 0.4433, + "num_input_tokens_seen": 1413480448, + "step": 674 + }, + { + "epoch": 0.18307567127746135, + "grad_norm": 19.070024490356445, + "learning_rate": 9.572202877665317e-07, + "loss": 0.6704, + "num_input_tokens_seen": 1415577600, + "step": 675 + }, + { + "epoch": 0.1833468944941687, + "grad_norm": 11.524867057800293, + "learning_rate": 9.57048333535861e-07, + "loss": 0.3089, + "num_input_tokens_seen": 1417674752, + "step": 676 + }, + { + "epoch": 0.18361811771087605, + "grad_norm": 23.75655746459961, + "learning_rate": 9.568760517281808e-07, + "loss": 0.9763, + "num_input_tokens_seen": 1419771904, + "step": 677 + }, + { + "epoch": 0.1838893409275834, + "grad_norm": 13.743852615356445, + "learning_rate": 9.56703442482137e-07, + "loss": 0.3573, + "num_input_tokens_seen": 1421869056, + "step": 678 + }, + { + "epoch": 0.18416056414429074, + "grad_norm": 17.890188217163086, + "learning_rate": 9.565305059366385e-07, + "loss": 0.7959, + "num_input_tokens_seen": 1423966208, + "step": 679 + }, + { + "epoch": 0.1844317873609981, + "grad_norm": 12.271766662597656, + "learning_rate": 9.563572422308588e-07, + "loss": 0.3559, + "num_input_tokens_seen": 1426063360, + "step": 680 + }, + { + "epoch": 0.18470301057770544, + "grad_norm": 17.936405181884766, + "learning_rate": 9.561836515042336e-07, + "loss": 0.5253, + "num_input_tokens_seen": 1428160512, + "step": 681 + }, + { + "epoch": 0.1849742337944128, + "grad_norm": 18.60642433166504, + "learning_rate": 9.56009733896462e-07, + "loss": 0.4834, + "num_input_tokens_seen": 1430257664, + "step": 682 + }, + { + "epoch": 0.18524545701112016, + "grad_norm": 19.449522018432617, + "learning_rate": 9.558354895475065e-07, + "loss": 0.6774, + "num_input_tokens_seen": 1432354816, + "step": 683 + }, + { + "epoch": 0.1855166802278275, + "grad_norm": 27.314105987548828, + "learning_rate": 9.55660918597592e-07, + "loss": 0.9876, + "num_input_tokens_seen": 1434451968, + "step": 684 + }, + { + "epoch": 0.18578790344453486, + "grad_norm": 24.136520385742188, + "learning_rate": 9.55486021187207e-07, + "loss": 0.6132, + "num_input_tokens_seen": 1436549120, + "step": 685 + }, + { + "epoch": 0.1860591266612422, + "grad_norm": 12.596097946166992, + "learning_rate": 9.553107974571018e-07, + "loss": 0.3146, + "num_input_tokens_seen": 1438646272, + "step": 686 + }, + { + "epoch": 0.18633034987794955, + "grad_norm": 20.519437789916992, + "learning_rate": 9.551352475482902e-07, + "loss": 0.7173, + "num_input_tokens_seen": 1440743424, + "step": 687 + }, + { + "epoch": 0.18660157309465691, + "grad_norm": 16.180757522583008, + "learning_rate": 9.549593716020478e-07, + "loss": 0.4588, + "num_input_tokens_seen": 1442840576, + "step": 688 + }, + { + "epoch": 0.18687279631136425, + "grad_norm": 12.175859451293945, + "learning_rate": 9.54783169759913e-07, + "loss": 0.3998, + "num_input_tokens_seen": 1444937728, + "step": 689 + }, + { + "epoch": 0.1871440195280716, + "grad_norm": 15.341544151306152, + "learning_rate": 9.546066421636867e-07, + "loss": 0.4147, + "num_input_tokens_seen": 1447034880, + "step": 690 + }, + { + "epoch": 0.18741524274477894, + "grad_norm": 11.2804536819458, + "learning_rate": 9.54429788955431e-07, + "loss": 0.2544, + "num_input_tokens_seen": 1449132032, + "step": 691 + }, + { + "epoch": 0.1876864659614863, + "grad_norm": 18.1593017578125, + "learning_rate": 9.542526102774704e-07, + "loss": 0.596, + "num_input_tokens_seen": 1451229184, + "step": 692 + }, + { + "epoch": 0.18795768917819367, + "grad_norm": 16.799240112304688, + "learning_rate": 9.540751062723923e-07, + "loss": 0.6559, + "num_input_tokens_seen": 1453326336, + "step": 693 + }, + { + "epoch": 0.188228912394901, + "grad_norm": 19.996797561645508, + "learning_rate": 9.53897277083045e-07, + "loss": 0.6829, + "num_input_tokens_seen": 1455423488, + "step": 694 + }, + { + "epoch": 0.18850013561160836, + "grad_norm": 15.80327033996582, + "learning_rate": 9.537191228525382e-07, + "loss": 0.4173, + "num_input_tokens_seen": 1457520640, + "step": 695 + }, + { + "epoch": 0.1887713588283157, + "grad_norm": 18.673648834228516, + "learning_rate": 9.535406437242444e-07, + "loss": 0.5306, + "num_input_tokens_seen": 1459617792, + "step": 696 + }, + { + "epoch": 0.18904258204502306, + "grad_norm": 16.424015045166016, + "learning_rate": 9.533618398417962e-07, + "loss": 0.5724, + "num_input_tokens_seen": 1461714944, + "step": 697 + }, + { + "epoch": 0.18931380526173042, + "grad_norm": 24.49404525756836, + "learning_rate": 9.531827113490883e-07, + "loss": 0.6432, + "num_input_tokens_seen": 1463812096, + "step": 698 + }, + { + "epoch": 0.18958502847843775, + "grad_norm": 16.11602783203125, + "learning_rate": 9.530032583902767e-07, + "loss": 0.6071, + "num_input_tokens_seen": 1465909248, + "step": 699 + }, + { + "epoch": 0.1898562516951451, + "grad_norm": 18.514514923095703, + "learning_rate": 9.528234811097781e-07, + "loss": 0.6862, + "num_input_tokens_seen": 1468006400, + "step": 700 + }, + { + "epoch": 0.19012747491185245, + "grad_norm": 24.56780242919922, + "learning_rate": 9.526433796522702e-07, + "loss": 0.6434, + "num_input_tokens_seen": 1470103552, + "step": 701 + }, + { + "epoch": 0.1903986981285598, + "grad_norm": 17.129905700683594, + "learning_rate": 9.524629541626925e-07, + "loss": 0.4748, + "num_input_tokens_seen": 1472200704, + "step": 702 + }, + { + "epoch": 0.19066992134526717, + "grad_norm": 22.451244354248047, + "learning_rate": 9.522822047862438e-07, + "loss": 0.6388, + "num_input_tokens_seen": 1474297856, + "step": 703 + }, + { + "epoch": 0.1909411445619745, + "grad_norm": 21.370609283447266, + "learning_rate": 9.521011316683849e-07, + "loss": 0.6206, + "num_input_tokens_seen": 1476395008, + "step": 704 + }, + { + "epoch": 0.19121236777868186, + "grad_norm": 18.505590438842773, + "learning_rate": 9.519197349548364e-07, + "loss": 0.6211, + "num_input_tokens_seen": 1478492160, + "step": 705 + }, + { + "epoch": 0.1914835909953892, + "grad_norm": 16.266847610473633, + "learning_rate": 9.517380147915791e-07, + "loss": 0.4589, + "num_input_tokens_seen": 1480589312, + "step": 706 + }, + { + "epoch": 0.19175481421209656, + "grad_norm": 23.907617568969727, + "learning_rate": 9.515559713248549e-07, + "loss": 0.6771, + "num_input_tokens_seen": 1482686464, + "step": 707 + }, + { + "epoch": 0.1920260374288039, + "grad_norm": 11.614529609680176, + "learning_rate": 9.513736047011653e-07, + "loss": 0.3148, + "num_input_tokens_seen": 1484783616, + "step": 708 + }, + { + "epoch": 0.19229726064551125, + "grad_norm": 17.292818069458008, + "learning_rate": 9.511909150672721e-07, + "loss": 0.599, + "num_input_tokens_seen": 1486880768, + "step": 709 + }, + { + "epoch": 0.19256848386221861, + "grad_norm": 15.411133766174316, + "learning_rate": 9.510079025701967e-07, + "loss": 0.5165, + "num_input_tokens_seen": 1488977920, + "step": 710 + }, + { + "epoch": 0.19283970707892595, + "grad_norm": 14.638044357299805, + "learning_rate": 9.508245673572209e-07, + "loss": 0.3889, + "num_input_tokens_seen": 1491075072, + "step": 711 + }, + { + "epoch": 0.1931109302956333, + "grad_norm": 13.973809242248535, + "learning_rate": 9.50640909575886e-07, + "loss": 0.3034, + "num_input_tokens_seen": 1493172224, + "step": 712 + }, + { + "epoch": 0.19338215351234064, + "grad_norm": 22.724477767944336, + "learning_rate": 9.504569293739923e-07, + "loss": 0.773, + "num_input_tokens_seen": 1495269376, + "step": 713 + }, + { + "epoch": 0.193653376729048, + "grad_norm": 19.064922332763672, + "learning_rate": 9.502726268996005e-07, + "loss": 0.5808, + "num_input_tokens_seen": 1497366528, + "step": 714 + }, + { + "epoch": 0.19392459994575537, + "grad_norm": 12.686622619628906, + "learning_rate": 9.5008800230103e-07, + "loss": 0.2699, + "num_input_tokens_seen": 1499463680, + "step": 715 + }, + { + "epoch": 0.1941958231624627, + "grad_norm": 17.352428436279297, + "learning_rate": 9.499030557268599e-07, + "loss": 0.6613, + "num_input_tokens_seen": 1501560832, + "step": 716 + }, + { + "epoch": 0.19446704637917006, + "grad_norm": 13.888693809509277, + "learning_rate": 9.497177873259279e-07, + "loss": 0.4085, + "num_input_tokens_seen": 1503657984, + "step": 717 + }, + { + "epoch": 0.1947382695958774, + "grad_norm": 19.89103889465332, + "learning_rate": 9.495321972473311e-07, + "loss": 0.3917, + "num_input_tokens_seen": 1505755136, + "step": 718 + }, + { + "epoch": 0.19500949281258476, + "grad_norm": 13.726130485534668, + "learning_rate": 9.493462856404251e-07, + "loss": 0.4566, + "num_input_tokens_seen": 1507852288, + "step": 719 + }, + { + "epoch": 0.19528071602929212, + "grad_norm": 14.963589668273926, + "learning_rate": 9.491600526548247e-07, + "loss": 0.4609, + "num_input_tokens_seen": 1509949440, + "step": 720 + }, + { + "epoch": 0.19555193924599945, + "grad_norm": 13.92644214630127, + "learning_rate": 9.489734984404033e-07, + "loss": 0.3863, + "num_input_tokens_seen": 1512046592, + "step": 721 + }, + { + "epoch": 0.1958231624627068, + "grad_norm": 21.550809860229492, + "learning_rate": 9.487866231472922e-07, + "loss": 0.4927, + "num_input_tokens_seen": 1514143744, + "step": 722 + }, + { + "epoch": 0.19609438567941415, + "grad_norm": 10.747212409973145, + "learning_rate": 9.48599426925882e-07, + "loss": 0.2856, + "num_input_tokens_seen": 1516240896, + "step": 723 + }, + { + "epoch": 0.1963656088961215, + "grad_norm": 21.09810447692871, + "learning_rate": 9.484119099268206e-07, + "loss": 0.7103, + "num_input_tokens_seen": 1518338048, + "step": 724 + }, + { + "epoch": 0.19663683211282887, + "grad_norm": 13.561016082763672, + "learning_rate": 9.482240723010148e-07, + "loss": 0.3584, + "num_input_tokens_seen": 1520435200, + "step": 725 + }, + { + "epoch": 0.1969080553295362, + "grad_norm": 23.81125259399414, + "learning_rate": 9.480359141996295e-07, + "loss": 0.7733, + "num_input_tokens_seen": 1522532352, + "step": 726 + }, + { + "epoch": 0.19717927854624356, + "grad_norm": 15.629355430603027, + "learning_rate": 9.478474357740864e-07, + "loss": 0.4239, + "num_input_tokens_seen": 1524629504, + "step": 727 + }, + { + "epoch": 0.1974505017629509, + "grad_norm": 17.766565322875977, + "learning_rate": 9.476586371760665e-07, + "loss": 0.4563, + "num_input_tokens_seen": 1526726656, + "step": 728 + }, + { + "epoch": 0.19772172497965826, + "grad_norm": 40.20060348510742, + "learning_rate": 9.474695185575072e-07, + "loss": 0.8098, + "num_input_tokens_seen": 1528823808, + "step": 729 + }, + { + "epoch": 0.19799294819636562, + "grad_norm": 13.600529670715332, + "learning_rate": 9.472800800706044e-07, + "loss": 0.3496, + "num_input_tokens_seen": 1530920960, + "step": 730 + }, + { + "epoch": 0.19826417141307295, + "grad_norm": 27.296430587768555, + "learning_rate": 9.470903218678108e-07, + "loss": 0.8791, + "num_input_tokens_seen": 1533018112, + "step": 731 + }, + { + "epoch": 0.19853539462978032, + "grad_norm": 23.39369773864746, + "learning_rate": 9.469002441018366e-07, + "loss": 0.7726, + "num_input_tokens_seen": 1535115264, + "step": 732 + }, + { + "epoch": 0.19880661784648765, + "grad_norm": 25.535316467285156, + "learning_rate": 9.467098469256488e-07, + "loss": 0.525, + "num_input_tokens_seen": 1537212416, + "step": 733 + }, + { + "epoch": 0.199077841063195, + "grad_norm": 12.456023216247559, + "learning_rate": 9.465191304924725e-07, + "loss": 0.3788, + "num_input_tokens_seen": 1539309568, + "step": 734 + }, + { + "epoch": 0.19934906427990237, + "grad_norm": 17.167804718017578, + "learning_rate": 9.463280949557885e-07, + "loss": 0.5477, + "num_input_tokens_seen": 1541406720, + "step": 735 + }, + { + "epoch": 0.1996202874966097, + "grad_norm": 16.55001449584961, + "learning_rate": 9.46136740469335e-07, + "loss": 0.5201, + "num_input_tokens_seen": 1543503872, + "step": 736 + }, + { + "epoch": 0.19989151071331707, + "grad_norm": 17.66445541381836, + "learning_rate": 9.45945067187107e-07, + "loss": 0.4485, + "num_input_tokens_seen": 1545601024, + "step": 737 + }, + { + "epoch": 0.2001627339300244, + "grad_norm": 15.181243896484375, + "learning_rate": 9.457530752633557e-07, + "loss": 0.5618, + "num_input_tokens_seen": 1547698176, + "step": 738 + }, + { + "epoch": 0.20043395714673176, + "grad_norm": 10.440348625183105, + "learning_rate": 9.455607648525889e-07, + "loss": 0.2362, + "num_input_tokens_seen": 1549795328, + "step": 739 + }, + { + "epoch": 0.20070518036343912, + "grad_norm": 12.595712661743164, + "learning_rate": 9.45368136109571e-07, + "loss": 0.2883, + "num_input_tokens_seen": 1551892480, + "step": 740 + }, + { + "epoch": 0.20097640358014646, + "grad_norm": 24.40172004699707, + "learning_rate": 9.451751891893217e-07, + "loss": 0.7001, + "num_input_tokens_seen": 1553989632, + "step": 741 + }, + { + "epoch": 0.20124762679685382, + "grad_norm": 16.860057830810547, + "learning_rate": 9.449819242471179e-07, + "loss": 0.6415, + "num_input_tokens_seen": 1556086784, + "step": 742 + }, + { + "epoch": 0.20151885001356115, + "grad_norm": 16.089021682739258, + "learning_rate": 9.447883414384916e-07, + "loss": 0.4937, + "num_input_tokens_seen": 1558183936, + "step": 743 + }, + { + "epoch": 0.2017900732302685, + "grad_norm": 15.555678367614746, + "learning_rate": 9.445944409192308e-07, + "loss": 0.3251, + "num_input_tokens_seen": 1560281088, + "step": 744 + }, + { + "epoch": 0.20206129644697587, + "grad_norm": 16.365802764892578, + "learning_rate": 9.444002228453796e-07, + "loss": 0.3863, + "num_input_tokens_seen": 1562378240, + "step": 745 + }, + { + "epoch": 0.2023325196636832, + "grad_norm": 19.373912811279297, + "learning_rate": 9.442056873732369e-07, + "loss": 0.7479, + "num_input_tokens_seen": 1564475392, + "step": 746 + }, + { + "epoch": 0.20260374288039057, + "grad_norm": 22.873510360717773, + "learning_rate": 9.440108346593579e-07, + "loss": 0.5717, + "num_input_tokens_seen": 1566572544, + "step": 747 + }, + { + "epoch": 0.2028749660970979, + "grad_norm": 21.443544387817383, + "learning_rate": 9.438156648605521e-07, + "loss": 0.799, + "num_input_tokens_seen": 1568669696, + "step": 748 + }, + { + "epoch": 0.20314618931380526, + "grad_norm": 19.906049728393555, + "learning_rate": 9.436201781338852e-07, + "loss": 0.4752, + "num_input_tokens_seen": 1570766848, + "step": 749 + }, + { + "epoch": 0.20341741253051263, + "grad_norm": 14.483591079711914, + "learning_rate": 9.434243746366771e-07, + "loss": 0.4946, + "num_input_tokens_seen": 1572864000, + "step": 750 + }, + { + "epoch": 0.20368863574721996, + "grad_norm": 15.546069145202637, + "learning_rate": 9.432282545265034e-07, + "loss": 0.4443, + "num_input_tokens_seen": 1574961152, + "step": 751 + }, + { + "epoch": 0.20395985896392732, + "grad_norm": 13.788752555847168, + "learning_rate": 9.430318179611938e-07, + "loss": 0.4133, + "num_input_tokens_seen": 1577058304, + "step": 752 + }, + { + "epoch": 0.20423108218063465, + "grad_norm": 22.282867431640625, + "learning_rate": 9.42835065098833e-07, + "loss": 0.631, + "num_input_tokens_seen": 1579155456, + "step": 753 + }, + { + "epoch": 0.20450230539734202, + "grad_norm": 24.22948455810547, + "learning_rate": 9.426379960977605e-07, + "loss": 0.5669, + "num_input_tokens_seen": 1581252608, + "step": 754 + }, + { + "epoch": 0.20477352861404935, + "grad_norm": 11.243971824645996, + "learning_rate": 9.424406111165697e-07, + "loss": 0.2371, + "num_input_tokens_seen": 1583349760, + "step": 755 + }, + { + "epoch": 0.2050447518307567, + "grad_norm": 25.394519805908203, + "learning_rate": 9.422429103141084e-07, + "loss": 1.0849, + "num_input_tokens_seen": 1585446912, + "step": 756 + }, + { + "epoch": 0.20531597504746407, + "grad_norm": 18.955663681030273, + "learning_rate": 9.42044893849479e-07, + "loss": 0.4138, + "num_input_tokens_seen": 1587544064, + "step": 757 + }, + { + "epoch": 0.2055871982641714, + "grad_norm": 22.107629776000977, + "learning_rate": 9.418465618820374e-07, + "loss": 0.7176, + "num_input_tokens_seen": 1589641216, + "step": 758 + }, + { + "epoch": 0.20585842148087877, + "grad_norm": 12.619873046875, + "learning_rate": 9.416479145713936e-07, + "loss": 0.3041, + "num_input_tokens_seen": 1591738368, + "step": 759 + }, + { + "epoch": 0.2061296446975861, + "grad_norm": 22.457910537719727, + "learning_rate": 9.414489520774114e-07, + "loss": 0.596, + "num_input_tokens_seen": 1593835520, + "step": 760 + }, + { + "epoch": 0.20640086791429346, + "grad_norm": 20.29651641845703, + "learning_rate": 9.412496745602084e-07, + "loss": 0.6392, + "num_input_tokens_seen": 1595932672, + "step": 761 + }, + { + "epoch": 0.20667209113100082, + "grad_norm": 17.73131561279297, + "learning_rate": 9.410500821801556e-07, + "loss": 0.5468, + "num_input_tokens_seen": 1598029824, + "step": 762 + }, + { + "epoch": 0.20694331434770816, + "grad_norm": 22.578598022460938, + "learning_rate": 9.408501750978769e-07, + "loss": 0.7247, + "num_input_tokens_seen": 1600126976, + "step": 763 + }, + { + "epoch": 0.20721453756441552, + "grad_norm": 13.882994651794434, + "learning_rate": 9.406499534742503e-07, + "loss": 0.3753, + "num_input_tokens_seen": 1602224128, + "step": 764 + }, + { + "epoch": 0.20748576078112285, + "grad_norm": 14.860766410827637, + "learning_rate": 9.404494174704068e-07, + "loss": 0.4422, + "num_input_tokens_seen": 1604321280, + "step": 765 + }, + { + "epoch": 0.2077569839978302, + "grad_norm": 12.610158920288086, + "learning_rate": 9.402485672477296e-07, + "loss": 0.412, + "num_input_tokens_seen": 1606418432, + "step": 766 + }, + { + "epoch": 0.20802820721453757, + "grad_norm": 13.11758804321289, + "learning_rate": 9.400474029678555e-07, + "loss": 0.3497, + "num_input_tokens_seen": 1608515584, + "step": 767 + }, + { + "epoch": 0.2082994304312449, + "grad_norm": 18.06613540649414, + "learning_rate": 9.39845924792674e-07, + "loss": 0.5784, + "num_input_tokens_seen": 1610612736, + "step": 768 + }, + { + "epoch": 0.20857065364795227, + "grad_norm": 26.419235229492188, + "learning_rate": 9.396441328843268e-07, + "loss": 0.6932, + "num_input_tokens_seen": 1612709888, + "step": 769 + }, + { + "epoch": 0.2088418768646596, + "grad_norm": 17.12129020690918, + "learning_rate": 9.394420274052088e-07, + "loss": 0.6488, + "num_input_tokens_seen": 1614807040, + "step": 770 + }, + { + "epoch": 0.20911310008136696, + "grad_norm": 16.961605072021484, + "learning_rate": 9.392396085179662e-07, + "loss": 0.45, + "num_input_tokens_seen": 1616904192, + "step": 771 + }, + { + "epoch": 0.20938432329807433, + "grad_norm": 17.352319717407227, + "learning_rate": 9.390368763854985e-07, + "loss": 0.5387, + "num_input_tokens_seen": 1619001344, + "step": 772 + }, + { + "epoch": 0.20965554651478166, + "grad_norm": 16.856760025024414, + "learning_rate": 9.388338311709566e-07, + "loss": 0.4673, + "num_input_tokens_seen": 1621098496, + "step": 773 + }, + { + "epoch": 0.20992676973148902, + "grad_norm": 20.43520164489746, + "learning_rate": 9.386304730377437e-07, + "loss": 0.6915, + "num_input_tokens_seen": 1623195648, + "step": 774 + }, + { + "epoch": 0.21019799294819635, + "grad_norm": 15.013359069824219, + "learning_rate": 9.384268021495145e-07, + "loss": 0.4143, + "num_input_tokens_seen": 1625292800, + "step": 775 + }, + { + "epoch": 0.21046921616490372, + "grad_norm": 15.033038139343262, + "learning_rate": 9.382228186701756e-07, + "loss": 0.4449, + "num_input_tokens_seen": 1627389952, + "step": 776 + }, + { + "epoch": 0.21074043938161108, + "grad_norm": 14.816195487976074, + "learning_rate": 9.380185227638854e-07, + "loss": 0.4166, + "num_input_tokens_seen": 1629487104, + "step": 777 + }, + { + "epoch": 0.2110116625983184, + "grad_norm": 15.444920539855957, + "learning_rate": 9.378139145950532e-07, + "loss": 0.4416, + "num_input_tokens_seen": 1631584256, + "step": 778 + }, + { + "epoch": 0.21128288581502577, + "grad_norm": 13.680757522583008, + "learning_rate": 9.376089943283398e-07, + "loss": 0.3259, + "num_input_tokens_seen": 1633681408, + "step": 779 + }, + { + "epoch": 0.2115541090317331, + "grad_norm": 20.651817321777344, + "learning_rate": 9.374037621286574e-07, + "loss": 0.5884, + "num_input_tokens_seen": 1635778560, + "step": 780 + }, + { + "epoch": 0.21182533224844047, + "grad_norm": 14.277517318725586, + "learning_rate": 9.371982181611692e-07, + "loss": 0.4881, + "num_input_tokens_seen": 1637875712, + "step": 781 + }, + { + "epoch": 0.21209655546514783, + "grad_norm": 15.769245147705078, + "learning_rate": 9.369923625912888e-07, + "loss": 0.4704, + "num_input_tokens_seen": 1639972864, + "step": 782 + }, + { + "epoch": 0.21236777868185516, + "grad_norm": 13.333930969238281, + "learning_rate": 9.367861955846813e-07, + "loss": 0.3845, + "num_input_tokens_seen": 1642070016, + "step": 783 + }, + { + "epoch": 0.21263900189856252, + "grad_norm": 14.731000900268555, + "learning_rate": 9.365797173072619e-07, + "loss": 0.4233, + "num_input_tokens_seen": 1644167168, + "step": 784 + }, + { + "epoch": 0.21291022511526986, + "grad_norm": 19.93707275390625, + "learning_rate": 9.363729279251965e-07, + "loss": 0.651, + "num_input_tokens_seen": 1646264320, + "step": 785 + }, + { + "epoch": 0.21318144833197722, + "grad_norm": 17.735177993774414, + "learning_rate": 9.361658276049012e-07, + "loss": 0.5842, + "num_input_tokens_seen": 1648361472, + "step": 786 + }, + { + "epoch": 0.21345267154868458, + "grad_norm": 13.280116081237793, + "learning_rate": 9.359584165130426e-07, + "loss": 0.3729, + "num_input_tokens_seen": 1650458624, + "step": 787 + }, + { + "epoch": 0.21372389476539191, + "grad_norm": 18.659069061279297, + "learning_rate": 9.357506948165372e-07, + "loss": 0.6783, + "num_input_tokens_seen": 1652555776, + "step": 788 + }, + { + "epoch": 0.21399511798209928, + "grad_norm": 14.321599960327148, + "learning_rate": 9.355426626825516e-07, + "loss": 0.3726, + "num_input_tokens_seen": 1654652928, + "step": 789 + }, + { + "epoch": 0.2142663411988066, + "grad_norm": 14.604723930358887, + "learning_rate": 9.353343202785019e-07, + "loss": 0.4038, + "num_input_tokens_seen": 1656750080, + "step": 790 + }, + { + "epoch": 0.21453756441551397, + "grad_norm": 16.1937313079834, + "learning_rate": 9.351256677720542e-07, + "loss": 0.5191, + "num_input_tokens_seen": 1658847232, + "step": 791 + }, + { + "epoch": 0.21480878763222133, + "grad_norm": 22.89767074584961, + "learning_rate": 9.349167053311245e-07, + "loss": 0.6142, + "num_input_tokens_seen": 1660944384, + "step": 792 + }, + { + "epoch": 0.21508001084892867, + "grad_norm": 16.107093811035156, + "learning_rate": 9.347074331238774e-07, + "loss": 0.4582, + "num_input_tokens_seen": 1663041536, + "step": 793 + }, + { + "epoch": 0.21535123406563603, + "grad_norm": 11.947312355041504, + "learning_rate": 9.344978513187271e-07, + "loss": 0.2851, + "num_input_tokens_seen": 1665138688, + "step": 794 + }, + { + "epoch": 0.21562245728234336, + "grad_norm": 26.070587158203125, + "learning_rate": 9.342879600843376e-07, + "loss": 0.9693, + "num_input_tokens_seen": 1667235840, + "step": 795 + }, + { + "epoch": 0.21589368049905072, + "grad_norm": 16.457632064819336, + "learning_rate": 9.34077759589621e-07, + "loss": 0.4114, + "num_input_tokens_seen": 1669332992, + "step": 796 + }, + { + "epoch": 0.21616490371575806, + "grad_norm": 27.994348526000977, + "learning_rate": 9.338672500037387e-07, + "loss": 0.8297, + "num_input_tokens_seen": 1671430144, + "step": 797 + }, + { + "epoch": 0.21643612693246542, + "grad_norm": 16.979135513305664, + "learning_rate": 9.336564314961008e-07, + "loss": 0.4932, + "num_input_tokens_seen": 1673527296, + "step": 798 + }, + { + "epoch": 0.21670735014917278, + "grad_norm": 16.894512176513672, + "learning_rate": 9.334453042363661e-07, + "loss": 0.3409, + "num_input_tokens_seen": 1675624448, + "step": 799 + }, + { + "epoch": 0.2169785733658801, + "grad_norm": 20.867141723632812, + "learning_rate": 9.332338683944415e-07, + "loss": 0.8633, + "num_input_tokens_seen": 1677721600, + "step": 800 + }, + { + "epoch": 0.00027122321670735016, + "grad_norm": 14.524723052978516, + "learning_rate": 9.33022124140483e-07, + "loss": 0.3659, + "num_input_tokens_seen": 1679818752, + "step": 801 + }, + { + "epoch": 0.0005424464334147003, + "grad_norm": 12.789175987243652, + "learning_rate": 9.32810071644894e-07, + "loss": 0.3889, + "num_input_tokens_seen": 1681915904, + "step": 802 + }, + { + "epoch": 0.0008136696501220504, + "grad_norm": 15.237256050109863, + "learning_rate": 9.325977110783263e-07, + "loss": 0.3584, + "num_input_tokens_seen": 1684013056, + "step": 803 + }, + { + "epoch": 0.0010848928668294006, + "grad_norm": 17.71392250061035, + "learning_rate": 9.323850426116797e-07, + "loss": 0.5716, + "num_input_tokens_seen": 1686110208, + "step": 804 + }, + { + "epoch": 0.0013561160835367507, + "grad_norm": 20.835195541381836, + "learning_rate": 9.321720664161017e-07, + "loss": 0.5411, + "num_input_tokens_seen": 1688207360, + "step": 805 + }, + { + "epoch": 0.0016273393002441008, + "grad_norm": 20.98845100402832, + "learning_rate": 9.319587826629872e-07, + "loss": 0.5051, + "num_input_tokens_seen": 1690304512, + "step": 806 + }, + { + "epoch": 0.001898562516951451, + "grad_norm": 17.86595916748047, + "learning_rate": 9.317451915239792e-07, + "loss": 0.6283, + "num_input_tokens_seen": 1692401664, + "step": 807 + }, + { + "epoch": 0.0021697857336588013, + "grad_norm": 19.295974731445312, + "learning_rate": 9.315312931709674e-07, + "loss": 0.5475, + "num_input_tokens_seen": 1694498816, + "step": 808 + }, + { + "epoch": 0.0024410089503661514, + "grad_norm": 20.173925399780273, + "learning_rate": 9.313170877760892e-07, + "loss": 0.6961, + "num_input_tokens_seen": 1696595968, + "step": 809 + }, + { + "epoch": 0.0027122321670735015, + "grad_norm": 13.733884811401367, + "learning_rate": 9.311025755117291e-07, + "loss": 0.3755, + "num_input_tokens_seen": 1698693120, + "step": 810 + }, + { + "epoch": 0.0029834553837808516, + "grad_norm": 19.169233322143555, + "learning_rate": 9.308877565505181e-07, + "loss": 0.6359, + "num_input_tokens_seen": 1700790272, + "step": 811 + }, + { + "epoch": 0.0032546786004882017, + "grad_norm": 16.250288009643555, + "learning_rate": 9.306726310653346e-07, + "loss": 0.3631, + "num_input_tokens_seen": 1702887424, + "step": 812 + }, + { + "epoch": 0.003525901817195552, + "grad_norm": 34.65564727783203, + "learning_rate": 9.304571992293032e-07, + "loss": 0.672, + "num_input_tokens_seen": 1704984576, + "step": 813 + }, + { + "epoch": 0.003797125033902902, + "grad_norm": 30.45555877685547, + "learning_rate": 9.302414612157954e-07, + "loss": 0.9443, + "num_input_tokens_seen": 1707081728, + "step": 814 + }, + { + "epoch": 0.0040683482506102524, + "grad_norm": 19.15201759338379, + "learning_rate": 9.300254171984289e-07, + "loss": 0.5574, + "num_input_tokens_seen": 1709178880, + "step": 815 + }, + { + "epoch": 0.0043395714673176026, + "grad_norm": 29.86651039123535, + "learning_rate": 9.298090673510677e-07, + "loss": 0.724, + "num_input_tokens_seen": 1711276032, + "step": 816 + }, + { + "epoch": 0.004610794684024953, + "grad_norm": 18.136457443237305, + "learning_rate": 9.295924118478218e-07, + "loss": 0.6076, + "num_input_tokens_seen": 1713373184, + "step": 817 + }, + { + "epoch": 0.004882017900732303, + "grad_norm": 19.57548713684082, + "learning_rate": 9.293754508630473e-07, + "loss": 0.7823, + "num_input_tokens_seen": 1715470336, + "step": 818 + }, + { + "epoch": 0.005153241117439653, + "grad_norm": 14.906988143920898, + "learning_rate": 9.291581845713466e-07, + "loss": 0.4778, + "num_input_tokens_seen": 1717567488, + "step": 819 + }, + { + "epoch": 0.005424464334147003, + "grad_norm": 14.388298034667969, + "learning_rate": 9.289406131475665e-07, + "loss": 0.3655, + "num_input_tokens_seen": 1719664640, + "step": 820 + }, + { + "epoch": 0.005695687550854353, + "grad_norm": 19.326364517211914, + "learning_rate": 9.287227367668012e-07, + "loss": 0.6397, + "num_input_tokens_seen": 1721761792, + "step": 821 + }, + { + "epoch": 0.005966910767561703, + "grad_norm": 12.356776237487793, + "learning_rate": 9.285045556043885e-07, + "loss": 0.3671, + "num_input_tokens_seen": 1723858944, + "step": 822 + }, + { + "epoch": 0.006238133984269053, + "grad_norm": 18.93061065673828, + "learning_rate": 9.282860698359128e-07, + "loss": 0.763, + "num_input_tokens_seen": 1725956096, + "step": 823 + }, + { + "epoch": 0.006509357200976403, + "grad_norm": 23.214996337890625, + "learning_rate": 9.280672796372029e-07, + "loss": 0.9767, + "num_input_tokens_seen": 1728053248, + "step": 824 + }, + { + "epoch": 0.0067805804176837535, + "grad_norm": 21.05919075012207, + "learning_rate": 9.278481851843327e-07, + "loss": 0.6971, + "num_input_tokens_seen": 1730150400, + "step": 825 + }, + { + "epoch": 0.007051803634391104, + "grad_norm": 20.329505920410156, + "learning_rate": 9.276287866536215e-07, + "loss": 0.6836, + "num_input_tokens_seen": 1732247552, + "step": 826 + }, + { + "epoch": 0.007323026851098454, + "grad_norm": 13.536745071411133, + "learning_rate": 9.274090842216326e-07, + "loss": 0.3354, + "num_input_tokens_seen": 1734344704, + "step": 827 + }, + { + "epoch": 0.007594250067805804, + "grad_norm": 19.788000106811523, + "learning_rate": 9.271890780651741e-07, + "loss": 0.648, + "num_input_tokens_seen": 1736441856, + "step": 828 + }, + { + "epoch": 0.007865473284513154, + "grad_norm": 11.306535720825195, + "learning_rate": 9.269687683612987e-07, + "loss": 0.2731, + "num_input_tokens_seen": 1738539008, + "step": 829 + }, + { + "epoch": 0.008136696501220505, + "grad_norm": 20.76605796813965, + "learning_rate": 9.267481552873033e-07, + "loss": 0.4084, + "num_input_tokens_seen": 1740636160, + "step": 830 + }, + { + "epoch": 0.008407919717927854, + "grad_norm": 17.467897415161133, + "learning_rate": 9.265272390207289e-07, + "loss": 0.4862, + "num_input_tokens_seen": 1742733312, + "step": 831 + }, + { + "epoch": 0.008679142934635205, + "grad_norm": 21.455224990844727, + "learning_rate": 9.263060197393603e-07, + "loss": 0.5836, + "num_input_tokens_seen": 1744830464, + "step": 832 + }, + { + "epoch": 0.008950366151342554, + "grad_norm": 22.814332962036133, + "learning_rate": 9.260844976212268e-07, + "loss": 0.9138, + "num_input_tokens_seen": 1746927616, + "step": 833 + }, + { + "epoch": 0.009221589368049905, + "grad_norm": 13.567204475402832, + "learning_rate": 9.258626728446004e-07, + "loss": 0.3955, + "num_input_tokens_seen": 1749024768, + "step": 834 + }, + { + "epoch": 0.009492812584757255, + "grad_norm": 19.025236129760742, + "learning_rate": 9.256405455879977e-07, + "loss": 0.7177, + "num_input_tokens_seen": 1751121920, + "step": 835 + }, + { + "epoch": 0.009764035801464606, + "grad_norm": 22.544050216674805, + "learning_rate": 9.25418116030178e-07, + "loss": 0.6642, + "num_input_tokens_seen": 1753219072, + "step": 836 + }, + { + "epoch": 0.010035259018171955, + "grad_norm": 13.098361015319824, + "learning_rate": 9.251953843501443e-07, + "loss": 0.3442, + "num_input_tokens_seen": 1755316224, + "step": 837 + }, + { + "epoch": 0.010306482234879306, + "grad_norm": 20.6568660736084, + "learning_rate": 9.249723507271425e-07, + "loss": 0.5423, + "num_input_tokens_seen": 1757413376, + "step": 838 + }, + { + "epoch": 0.010577705451586655, + "grad_norm": 17.13654899597168, + "learning_rate": 9.247490153406617e-07, + "loss": 0.5084, + "num_input_tokens_seen": 1759510528, + "step": 839 + }, + { + "epoch": 0.010848928668294006, + "grad_norm": 16.935338973999023, + "learning_rate": 9.245253783704334e-07, + "loss": 0.3407, + "num_input_tokens_seen": 1761607680, + "step": 840 + }, + { + "epoch": 0.011120151885001357, + "grad_norm": 12.196868896484375, + "learning_rate": 9.243014399964324e-07, + "loss": 0.3577, + "num_input_tokens_seen": 1763704832, + "step": 841 + }, + { + "epoch": 0.011391375101708706, + "grad_norm": 10.492193222045898, + "learning_rate": 9.240772003988758e-07, + "loss": 0.3152, + "num_input_tokens_seen": 1765801984, + "step": 842 + }, + { + "epoch": 0.011662598318416057, + "grad_norm": 17.036666870117188, + "learning_rate": 9.238526597582229e-07, + "loss": 0.4958, + "num_input_tokens_seen": 1767899136, + "step": 843 + }, + { + "epoch": 0.011933821535123406, + "grad_norm": 26.499019622802734, + "learning_rate": 9.236278182551758e-07, + "loss": 0.6916, + "num_input_tokens_seen": 1769996288, + "step": 844 + }, + { + "epoch": 0.012205044751830757, + "grad_norm": 18.293750762939453, + "learning_rate": 9.23402676070678e-07, + "loss": 0.5991, + "num_input_tokens_seen": 1772093440, + "step": 845 + }, + { + "epoch": 0.012476267968538107, + "grad_norm": 19.971363067626953, + "learning_rate": 9.231772333859154e-07, + "loss": 0.6295, + "num_input_tokens_seen": 1774190592, + "step": 846 + }, + { + "epoch": 0.012747491185245458, + "grad_norm": 15.316537857055664, + "learning_rate": 9.22951490382316e-07, + "loss": 0.3458, + "num_input_tokens_seen": 1776287744, + "step": 847 + }, + { + "epoch": 0.013018714401952807, + "grad_norm": 13.23933219909668, + "learning_rate": 9.22725447241549e-07, + "loss": 0.3044, + "num_input_tokens_seen": 1778384896, + "step": 848 + }, + { + "epoch": 0.013289937618660158, + "grad_norm": 17.797571182250977, + "learning_rate": 9.224991041455252e-07, + "loss": 0.4336, + "num_input_tokens_seen": 1780482048, + "step": 849 + }, + { + "epoch": 0.013561160835367507, + "grad_norm": 17.58323860168457, + "learning_rate": 9.222724612763971e-07, + "loss": 0.6113, + "num_input_tokens_seen": 1782579200, + "step": 850 + }, + { + "epoch": 0.013832384052074858, + "grad_norm": 17.832801818847656, + "learning_rate": 9.220455188165582e-07, + "loss": 0.6039, + "num_input_tokens_seen": 1784676352, + "step": 851 + }, + { + "epoch": 0.014103607268782207, + "grad_norm": 23.844926834106445, + "learning_rate": 9.218182769486433e-07, + "loss": 0.7489, + "num_input_tokens_seen": 1786773504, + "step": 852 + }, + { + "epoch": 0.014374830485489558, + "grad_norm": 16.524341583251953, + "learning_rate": 9.215907358555276e-07, + "loss": 0.615, + "num_input_tokens_seen": 1788870656, + "step": 853 + }, + { + "epoch": 0.014646053702196907, + "grad_norm": 24.72736358642578, + "learning_rate": 9.213628957203277e-07, + "loss": 0.867, + "num_input_tokens_seen": 1790967808, + "step": 854 + }, + { + "epoch": 0.014917276918904258, + "grad_norm": 25.9926700592041, + "learning_rate": 9.21134756726401e-07, + "loss": 0.8105, + "num_input_tokens_seen": 1793064960, + "step": 855 + }, + { + "epoch": 0.015188500135611608, + "grad_norm": 20.51833724975586, + "learning_rate": 9.209063190573445e-07, + "loss": 0.5625, + "num_input_tokens_seen": 1795162112, + "step": 856 + }, + { + "epoch": 0.015459723352318959, + "grad_norm": 15.484442710876465, + "learning_rate": 9.206775828969967e-07, + "loss": 0.4131, + "num_input_tokens_seen": 1797259264, + "step": 857 + }, + { + "epoch": 0.015730946569026308, + "grad_norm": 12.139299392700195, + "learning_rate": 9.204485484294355e-07, + "loss": 0.3329, + "num_input_tokens_seen": 1799356416, + "step": 858 + }, + { + "epoch": 0.01600216978573366, + "grad_norm": 15.683771133422852, + "learning_rate": 9.202192158389791e-07, + "loss": 0.5659, + "num_input_tokens_seen": 1801453568, + "step": 859 + }, + { + "epoch": 0.01627339300244101, + "grad_norm": 21.66155242919922, + "learning_rate": 9.199895853101856e-07, + "loss": 0.6709, + "num_input_tokens_seen": 1803550720, + "step": 860 + }, + { + "epoch": 0.01654461621914836, + "grad_norm": 19.560569763183594, + "learning_rate": 9.197596570278529e-07, + "loss": 0.7169, + "num_input_tokens_seen": 1805647872, + "step": 861 + }, + { + "epoch": 0.016815839435855708, + "grad_norm": 14.26892375946045, + "learning_rate": 9.19529431177019e-07, + "loss": 0.3676, + "num_input_tokens_seen": 1807745024, + "step": 862 + }, + { + "epoch": 0.01708706265256306, + "grad_norm": 9.662439346313477, + "learning_rate": 9.192989079429603e-07, + "loss": 0.1885, + "num_input_tokens_seen": 1809842176, + "step": 863 + }, + { + "epoch": 0.01735828586927041, + "grad_norm": 16.50687026977539, + "learning_rate": 9.190680875111934e-07, + "loss": 0.4003, + "num_input_tokens_seen": 1811939328, + "step": 864 + }, + { + "epoch": 0.01762950908597776, + "grad_norm": 16.06983757019043, + "learning_rate": 9.188369700674735e-07, + "loss": 0.4239, + "num_input_tokens_seen": 1814036480, + "step": 865 + }, + { + "epoch": 0.01790073230268511, + "grad_norm": 17.57181739807129, + "learning_rate": 9.186055557977957e-07, + "loss": 0.5002, + "num_input_tokens_seen": 1816133632, + "step": 866 + }, + { + "epoch": 0.01817195551939246, + "grad_norm": 22.786888122558594, + "learning_rate": 9.18373844888393e-07, + "loss": 0.6752, + "num_input_tokens_seen": 1818230784, + "step": 867 + }, + { + "epoch": 0.01844317873609981, + "grad_norm": 15.312600135803223, + "learning_rate": 9.181418375257374e-07, + "loss": 0.477, + "num_input_tokens_seen": 1820327936, + "step": 868 + }, + { + "epoch": 0.01871440195280716, + "grad_norm": 21.413515090942383, + "learning_rate": 9.179095338965401e-07, + "loss": 0.6745, + "num_input_tokens_seen": 1822425088, + "step": 869 + }, + { + "epoch": 0.01898562516951451, + "grad_norm": 14.375008583068848, + "learning_rate": 9.176769341877497e-07, + "loss": 0.4547, + "num_input_tokens_seen": 1824522240, + "step": 870 + }, + { + "epoch": 0.01925684838622186, + "grad_norm": 18.940818786621094, + "learning_rate": 9.17444038586554e-07, + "loss": 0.5549, + "num_input_tokens_seen": 1826619392, + "step": 871 + }, + { + "epoch": 0.01952807160292921, + "grad_norm": 22.97831916809082, + "learning_rate": 9.172108472803782e-07, + "loss": 0.6916, + "num_input_tokens_seen": 1828716544, + "step": 872 + }, + { + "epoch": 0.019799294819636562, + "grad_norm": 33.489131927490234, + "learning_rate": 9.16977360456886e-07, + "loss": 0.464, + "num_input_tokens_seen": 1830813696, + "step": 873 + }, + { + "epoch": 0.02007051803634391, + "grad_norm": 12.543787002563477, + "learning_rate": 9.167435783039786e-07, + "loss": 0.3587, + "num_input_tokens_seen": 1832910848, + "step": 874 + }, + { + "epoch": 0.02034174125305126, + "grad_norm": 16.099515914916992, + "learning_rate": 9.165095010097949e-07, + "loss": 0.5598, + "num_input_tokens_seen": 1835008000, + "step": 875 + }, + { + "epoch": 0.02061296446975861, + "grad_norm": 18.598281860351562, + "learning_rate": 9.162751287627116e-07, + "loss": 0.645, + "num_input_tokens_seen": 1837105152, + "step": 876 + }, + { + "epoch": 0.020884187686465962, + "grad_norm": 14.963810920715332, + "learning_rate": 9.160404617513424e-07, + "loss": 0.4911, + "num_input_tokens_seen": 1839202304, + "step": 877 + }, + { + "epoch": 0.02115541090317331, + "grad_norm": 21.489377975463867, + "learning_rate": 9.158055001645385e-07, + "loss": 0.8007, + "num_input_tokens_seen": 1841299456, + "step": 878 + }, + { + "epoch": 0.02142663411988066, + "grad_norm": 14.658339500427246, + "learning_rate": 9.155702441913881e-07, + "loss": 0.4235, + "num_input_tokens_seen": 1843396608, + "step": 879 + }, + { + "epoch": 0.021697857336588012, + "grad_norm": 20.184280395507812, + "learning_rate": 9.15334694021216e-07, + "loss": 0.5487, + "num_input_tokens_seen": 1845493760, + "step": 880 + }, + { + "epoch": 0.021969080553295363, + "grad_norm": 17.544336318969727, + "learning_rate": 9.150988498435843e-07, + "loss": 0.558, + "num_input_tokens_seen": 1847590912, + "step": 881 + }, + { + "epoch": 0.022240303770002714, + "grad_norm": 12.21959400177002, + "learning_rate": 9.148627118482912e-07, + "loss": 0.3181, + "num_input_tokens_seen": 1849688064, + "step": 882 + }, + { + "epoch": 0.02251152698671006, + "grad_norm": 17.65806007385254, + "learning_rate": 9.146262802253717e-07, + "loss": 0.7004, + "num_input_tokens_seen": 1851785216, + "step": 883 + }, + { + "epoch": 0.022782750203417412, + "grad_norm": 11.67097282409668, + "learning_rate": 9.14389555165097e-07, + "loss": 0.3288, + "num_input_tokens_seen": 1853882368, + "step": 884 + }, + { + "epoch": 0.023053973420124763, + "grad_norm": 17.335294723510742, + "learning_rate": 9.141525368579742e-07, + "loss": 0.4907, + "num_input_tokens_seen": 1855979520, + "step": 885 + }, + { + "epoch": 0.023325196636832114, + "grad_norm": 21.265504837036133, + "learning_rate": 9.139152254947469e-07, + "loss": 0.424, + "num_input_tokens_seen": 1858076672, + "step": 886 + }, + { + "epoch": 0.023596419853539462, + "grad_norm": 12.256847381591797, + "learning_rate": 9.136776212663942e-07, + "loss": 0.4297, + "num_input_tokens_seen": 1860173824, + "step": 887 + }, + { + "epoch": 0.023867643070246813, + "grad_norm": 20.520448684692383, + "learning_rate": 9.134397243641307e-07, + "loss": 0.6221, + "num_input_tokens_seen": 1862270976, + "step": 888 + }, + { + "epoch": 0.024138866286954164, + "grad_norm": 15.912782669067383, + "learning_rate": 9.132015349794069e-07, + "loss": 0.3873, + "num_input_tokens_seen": 1864368128, + "step": 889 + }, + { + "epoch": 0.024410089503661515, + "grad_norm": 12.093875885009766, + "learning_rate": 9.129630533039086e-07, + "loss": 0.2584, + "num_input_tokens_seen": 1866465280, + "step": 890 + }, + { + "epoch": 0.024681312720368862, + "grad_norm": 25.329559326171875, + "learning_rate": 9.127242795295569e-07, + "loss": 0.7337, + "num_input_tokens_seen": 1868562432, + "step": 891 + }, + { + "epoch": 0.024952535937076213, + "grad_norm": 23.77729034423828, + "learning_rate": 9.124852138485076e-07, + "loss": 0.5747, + "num_input_tokens_seen": 1870659584, + "step": 892 + }, + { + "epoch": 0.025223759153783564, + "grad_norm": 24.904544830322266, + "learning_rate": 9.12245856453152e-07, + "loss": 0.8033, + "num_input_tokens_seen": 1872756736, + "step": 893 + }, + { + "epoch": 0.025494982370490915, + "grad_norm": 19.269147872924805, + "learning_rate": 9.120062075361155e-07, + "loss": 0.632, + "num_input_tokens_seen": 1874853888, + "step": 894 + }, + { + "epoch": 0.025766205587198263, + "grad_norm": 19.273340225219727, + "learning_rate": 9.117662672902584e-07, + "loss": 0.6049, + "num_input_tokens_seen": 1876951040, + "step": 895 + }, + { + "epoch": 0.026037428803905614, + "grad_norm": 16.550512313842773, + "learning_rate": 9.115260359086757e-07, + "loss": 0.4171, + "num_input_tokens_seen": 1879048192, + "step": 896 + }, + { + "epoch": 0.026308652020612965, + "grad_norm": 21.415002822875977, + "learning_rate": 9.112855135846964e-07, + "loss": 0.6028, + "num_input_tokens_seen": 1881145344, + "step": 897 + }, + { + "epoch": 0.026579875237320316, + "grad_norm": 13.791115760803223, + "learning_rate": 9.110447005118836e-07, + "loss": 0.4228, + "num_input_tokens_seen": 1883242496, + "step": 898 + }, + { + "epoch": 0.026851098454027666, + "grad_norm": 12.052141189575195, + "learning_rate": 9.108035968840348e-07, + "loss": 0.3593, + "num_input_tokens_seen": 1885339648, + "step": 899 + }, + { + "epoch": 0.027122321670735014, + "grad_norm": 19.59334373474121, + "learning_rate": 9.105622028951806e-07, + "loss": 0.6185, + "num_input_tokens_seen": 1887436800, + "step": 900 + }, + { + "epoch": 0.027393544887442365, + "grad_norm": 13.629782676696777, + "learning_rate": 9.103205187395861e-07, + "loss": 0.3392, + "num_input_tokens_seen": 1889533952, + "step": 901 + }, + { + "epoch": 0.027664768104149716, + "grad_norm": 14.33342170715332, + "learning_rate": 9.100785446117493e-07, + "loss": 0.388, + "num_input_tokens_seen": 1891631104, + "step": 902 + }, + { + "epoch": 0.027935991320857067, + "grad_norm": 17.475322723388672, + "learning_rate": 9.098362807064017e-07, + "loss": 0.5221, + "num_input_tokens_seen": 1893728256, + "step": 903 + }, + { + "epoch": 0.028207214537564414, + "grad_norm": 15.671577453613281, + "learning_rate": 9.095937272185083e-07, + "loss": 0.3685, + "num_input_tokens_seen": 1895825408, + "step": 904 + }, + { + "epoch": 0.028478437754271765, + "grad_norm": 16.664928436279297, + "learning_rate": 9.093508843432667e-07, + "loss": 0.4047, + "num_input_tokens_seen": 1897922560, + "step": 905 + }, + { + "epoch": 0.028749660970979116, + "grad_norm": 23.7731876373291, + "learning_rate": 9.091077522761078e-07, + "loss": 0.6866, + "num_input_tokens_seen": 1900019712, + "step": 906 + }, + { + "epoch": 0.029020884187686467, + "grad_norm": 18.917259216308594, + "learning_rate": 9.088643312126948e-07, + "loss": 0.6913, + "num_input_tokens_seen": 1902116864, + "step": 907 + }, + { + "epoch": 0.029292107404393815, + "grad_norm": 16.237077713012695, + "learning_rate": 9.086206213489239e-07, + "loss": 0.4826, + "num_input_tokens_seen": 1904214016, + "step": 908 + }, + { + "epoch": 0.029563330621101166, + "grad_norm": 19.56369400024414, + "learning_rate": 9.083766228809234e-07, + "loss": 0.6956, + "num_input_tokens_seen": 1906311168, + "step": 909 + }, + { + "epoch": 0.029834553837808517, + "grad_norm": 14.219721794128418, + "learning_rate": 9.081323360050543e-07, + "loss": 0.4732, + "num_input_tokens_seen": 1908408320, + "step": 910 + }, + { + "epoch": 0.030105777054515868, + "grad_norm": 17.175357818603516, + "learning_rate": 9.078877609179088e-07, + "loss": 0.4612, + "num_input_tokens_seen": 1910505472, + "step": 911 + }, + { + "epoch": 0.030377000271223215, + "grad_norm": 12.589146614074707, + "learning_rate": 9.076428978163121e-07, + "loss": 0.3071, + "num_input_tokens_seen": 1912602624, + "step": 912 + }, + { + "epoch": 0.030648223487930566, + "grad_norm": 24.949342727661133, + "learning_rate": 9.073977468973206e-07, + "loss": 1.031, + "num_input_tokens_seen": 1914699776, + "step": 913 + }, + { + "epoch": 0.030919446704637917, + "grad_norm": 17.8113956451416, + "learning_rate": 9.071523083582223e-07, + "loss": 0.3996, + "num_input_tokens_seen": 1916796928, + "step": 914 + }, + { + "epoch": 0.031190669921345268, + "grad_norm": 25.642656326293945, + "learning_rate": 9.06906582396537e-07, + "loss": 0.6838, + "num_input_tokens_seen": 1918894080, + "step": 915 + }, + { + "epoch": 0.031461893138052616, + "grad_norm": 13.68373966217041, + "learning_rate": 9.066605692100155e-07, + "loss": 0.3932, + "num_input_tokens_seen": 1920991232, + "step": 916 + }, + { + "epoch": 0.03173311635475997, + "grad_norm": 12.269515037536621, + "learning_rate": 9.064142689966397e-07, + "loss": 0.3473, + "num_input_tokens_seen": 1923088384, + "step": 917 + }, + { + "epoch": 0.03200433957146732, + "grad_norm": 19.712133407592773, + "learning_rate": 9.061676819546229e-07, + "loss": 0.8176, + "num_input_tokens_seen": 1925185536, + "step": 918 + }, + { + "epoch": 0.032275562788174665, + "grad_norm": 15.215006828308105, + "learning_rate": 9.059208082824087e-07, + "loss": 0.4822, + "num_input_tokens_seen": 1927282688, + "step": 919 + }, + { + "epoch": 0.03254678600488202, + "grad_norm": 20.545522689819336, + "learning_rate": 9.05673648178672e-07, + "loss": 0.6293, + "num_input_tokens_seen": 1929379840, + "step": 920 + }, + { + "epoch": 0.03281800922158937, + "grad_norm": 13.10300350189209, + "learning_rate": 9.054262018423175e-07, + "loss": 0.3147, + "num_input_tokens_seen": 1931476992, + "step": 921 + }, + { + "epoch": 0.03308923243829672, + "grad_norm": 23.32817268371582, + "learning_rate": 9.051784694724808e-07, + "loss": 0.7481, + "num_input_tokens_seen": 1933574144, + "step": 922 + }, + { + "epoch": 0.03336045565500407, + "grad_norm": 10.944870948791504, + "learning_rate": 9.049304512685274e-07, + "loss": 0.2809, + "num_input_tokens_seen": 1935671296, + "step": 923 + }, + { + "epoch": 0.033631678871711417, + "grad_norm": 11.271623611450195, + "learning_rate": 9.046821474300527e-07, + "loss": 0.297, + "num_input_tokens_seen": 1937768448, + "step": 924 + }, + { + "epoch": 0.03390290208841877, + "grad_norm": 17.19096565246582, + "learning_rate": 9.044335581568827e-07, + "loss": 0.5428, + "num_input_tokens_seen": 1939865600, + "step": 925 + }, + { + "epoch": 0.03417412530512612, + "grad_norm": 13.172834396362305, + "learning_rate": 9.041846836490723e-07, + "loss": 0.3738, + "num_input_tokens_seen": 1941962752, + "step": 926 + }, + { + "epoch": 0.034445348521833466, + "grad_norm": 13.624527931213379, + "learning_rate": 9.03935524106906e-07, + "loss": 0.39, + "num_input_tokens_seen": 1944059904, + "step": 927 + }, + { + "epoch": 0.03471657173854082, + "grad_norm": 11.111466407775879, + "learning_rate": 9.036860797308984e-07, + "loss": 0.2882, + "num_input_tokens_seen": 1946157056, + "step": 928 + }, + { + "epoch": 0.03498779495524817, + "grad_norm": 18.21550178527832, + "learning_rate": 9.034363507217925e-07, + "loss": 0.4821, + "num_input_tokens_seen": 1948254208, + "step": 929 + }, + { + "epoch": 0.03525901817195552, + "grad_norm": 25.724552154541016, + "learning_rate": 9.031863372805606e-07, + "loss": 0.6339, + "num_input_tokens_seen": 1950351360, + "step": 930 + }, + { + "epoch": 0.03553024138866287, + "grad_norm": 22.317489624023438, + "learning_rate": 9.029360396084043e-07, + "loss": 0.2785, + "num_input_tokens_seen": 1952448512, + "step": 931 + }, + { + "epoch": 0.03580146460537022, + "grad_norm": 15.543082237243652, + "learning_rate": 9.026854579067537e-07, + "loss": 0.502, + "num_input_tokens_seen": 1954545664, + "step": 932 + }, + { + "epoch": 0.03607268782207757, + "grad_norm": 18.427461624145508, + "learning_rate": 9.024345923772671e-07, + "loss": 0.4526, + "num_input_tokens_seen": 1956642816, + "step": 933 + }, + { + "epoch": 0.03634391103878492, + "grad_norm": 13.186530113220215, + "learning_rate": 9.021834432218317e-07, + "loss": 0.3389, + "num_input_tokens_seen": 1958739968, + "step": 934 + }, + { + "epoch": 0.03661513425549227, + "grad_norm": 13.932114601135254, + "learning_rate": 9.019320106425629e-07, + "loss": 0.3434, + "num_input_tokens_seen": 1960837120, + "step": 935 + }, + { + "epoch": 0.03688635747219962, + "grad_norm": 12.41275691986084, + "learning_rate": 9.016802948418038e-07, + "loss": 0.2254, + "num_input_tokens_seen": 1962934272, + "step": 936 + }, + { + "epoch": 0.03715758068890697, + "grad_norm": 18.870506286621094, + "learning_rate": 9.014282960221257e-07, + "loss": 0.5891, + "num_input_tokens_seen": 1965031424, + "step": 937 + }, + { + "epoch": 0.03742880390561432, + "grad_norm": 15.504498481750488, + "learning_rate": 9.01176014386328e-07, + "loss": 0.4418, + "num_input_tokens_seen": 1967128576, + "step": 938 + }, + { + "epoch": 0.03770002712232167, + "grad_norm": 19.771808624267578, + "learning_rate": 9.009234501374371e-07, + "loss": 0.6721, + "num_input_tokens_seen": 1969225728, + "step": 939 + }, + { + "epoch": 0.03797125033902902, + "grad_norm": 23.42245101928711, + "learning_rate": 9.006706034787071e-07, + "loss": 0.5628, + "num_input_tokens_seen": 1971322880, + "step": 940 + }, + { + "epoch": 0.03824247355573637, + "grad_norm": 18.261873245239258, + "learning_rate": 9.004174746136196e-07, + "loss": 0.6278, + "num_input_tokens_seen": 1973420032, + "step": 941 + }, + { + "epoch": 0.03851369677244372, + "grad_norm": 15.880273818969727, + "learning_rate": 9.001640637458829e-07, + "loss": 0.4231, + "num_input_tokens_seen": 1975517184, + "step": 942 + }, + { + "epoch": 0.038784919989151075, + "grad_norm": 26.52276611328125, + "learning_rate": 8.999103710794323e-07, + "loss": 0.8923, + "num_input_tokens_seen": 1977614336, + "step": 943 + }, + { + "epoch": 0.03905614320585842, + "grad_norm": 12.265298843383789, + "learning_rate": 8.996563968184302e-07, + "loss": 0.3194, + "num_input_tokens_seen": 1979711488, + "step": 944 + }, + { + "epoch": 0.03932736642256577, + "grad_norm": 20.407365798950195, + "learning_rate": 8.994021411672653e-07, + "loss": 0.7034, + "num_input_tokens_seen": 1981808640, + "step": 945 + }, + { + "epoch": 0.039598589639273124, + "grad_norm": 22.567445755004883, + "learning_rate": 8.99147604330553e-07, + "loss": 0.5223, + "num_input_tokens_seen": 1983905792, + "step": 946 + }, + { + "epoch": 0.03986981285598047, + "grad_norm": 30.489513397216797, + "learning_rate": 8.988927865131347e-07, + "loss": 0.5478, + "num_input_tokens_seen": 1986002944, + "step": 947 + }, + { + "epoch": 0.04014103607268782, + "grad_norm": 18.10291862487793, + "learning_rate": 8.986376879200783e-07, + "loss": 0.5236, + "num_input_tokens_seen": 1988100096, + "step": 948 + }, + { + "epoch": 0.040412259289395173, + "grad_norm": 13.505256652832031, + "learning_rate": 8.983823087566772e-07, + "loss": 0.2878, + "num_input_tokens_seen": 1990197248, + "step": 949 + }, + { + "epoch": 0.04068348250610252, + "grad_norm": 16.355928421020508, + "learning_rate": 8.981266492284511e-07, + "loss": 0.5118, + "num_input_tokens_seen": 1992294400, + "step": 950 + }, + { + "epoch": 0.040954705722809875, + "grad_norm": 20.954561233520508, + "learning_rate": 8.978707095411446e-07, + "loss": 0.6593, + "num_input_tokens_seen": 1994391552, + "step": 951 + }, + { + "epoch": 0.04122592893951722, + "grad_norm": 28.557451248168945, + "learning_rate": 8.976144899007288e-07, + "loss": 1.0425, + "num_input_tokens_seen": 1996488704, + "step": 952 + }, + { + "epoch": 0.04149715215622457, + "grad_norm": 13.614970207214355, + "learning_rate": 8.973579905133991e-07, + "loss": 0.4375, + "num_input_tokens_seen": 1998585856, + "step": 953 + }, + { + "epoch": 0.041768375372931925, + "grad_norm": 10.78772258758545, + "learning_rate": 8.971012115855766e-07, + "loss": 0.2845, + "num_input_tokens_seen": 2000683008, + "step": 954 + }, + { + "epoch": 0.04203959858963927, + "grad_norm": 17.9395694732666, + "learning_rate": 8.968441533239073e-07, + "loss": 0.4906, + "num_input_tokens_seen": 2002780160, + "step": 955 + }, + { + "epoch": 0.04231082180634662, + "grad_norm": 13.34605884552002, + "learning_rate": 8.965868159352616e-07, + "loss": 0.4028, + "num_input_tokens_seen": 2004877312, + "step": 956 + }, + { + "epoch": 0.042582045023053974, + "grad_norm": 16.155813217163086, + "learning_rate": 8.963291996267354e-07, + "loss": 0.3782, + "num_input_tokens_seen": 2006974464, + "step": 957 + }, + { + "epoch": 0.04285326823976132, + "grad_norm": 11.215263366699219, + "learning_rate": 8.960713046056478e-07, + "loss": 0.3243, + "num_input_tokens_seen": 2009071616, + "step": 958 + }, + { + "epoch": 0.043124491456468676, + "grad_norm": 13.548431396484375, + "learning_rate": 8.958131310795434e-07, + "loss": 0.4452, + "num_input_tokens_seen": 2011168768, + "step": 959 + }, + { + "epoch": 0.043395714673176024, + "grad_norm": 13.86419677734375, + "learning_rate": 8.955546792561902e-07, + "loss": 0.4006, + "num_input_tokens_seen": 2013265920, + "step": 960 + }, + { + "epoch": 0.04366693788988337, + "grad_norm": 18.16084098815918, + "learning_rate": 8.952959493435806e-07, + "loss": 0.3906, + "num_input_tokens_seen": 2015363072, + "step": 961 + }, + { + "epoch": 0.043938161106590726, + "grad_norm": 18.875059127807617, + "learning_rate": 8.950369415499304e-07, + "loss": 0.5608, + "num_input_tokens_seen": 2017460224, + "step": 962 + }, + { + "epoch": 0.04420938432329807, + "grad_norm": 22.01349639892578, + "learning_rate": 8.947776560836793e-07, + "loss": 0.7657, + "num_input_tokens_seen": 2019557376, + "step": 963 + }, + { + "epoch": 0.04448060754000543, + "grad_norm": 17.236188888549805, + "learning_rate": 8.945180931534902e-07, + "loss": 0.4577, + "num_input_tokens_seen": 2021654528, + "step": 964 + }, + { + "epoch": 0.044751830756712775, + "grad_norm": 15.46218204498291, + "learning_rate": 8.942582529682496e-07, + "loss": 0.4122, + "num_input_tokens_seen": 2023751680, + "step": 965 + }, + { + "epoch": 0.04502305397342012, + "grad_norm": 14.264276504516602, + "learning_rate": 8.939981357370672e-07, + "loss": 0.3415, + "num_input_tokens_seen": 2025848832, + "step": 966 + }, + { + "epoch": 0.04529427719012748, + "grad_norm": 17.676618576049805, + "learning_rate": 8.937377416692752e-07, + "loss": 0.4217, + "num_input_tokens_seen": 2027945984, + "step": 967 + }, + { + "epoch": 0.045565500406834825, + "grad_norm": 16.618602752685547, + "learning_rate": 8.934770709744289e-07, + "loss": 0.4929, + "num_input_tokens_seen": 2030043136, + "step": 968 + }, + { + "epoch": 0.04583672362354217, + "grad_norm": 13.402697563171387, + "learning_rate": 8.93216123862306e-07, + "loss": 0.3282, + "num_input_tokens_seen": 2032140288, + "step": 969 + }, + { + "epoch": 0.04610794684024953, + "grad_norm": 24.799013137817383, + "learning_rate": 8.929549005429071e-07, + "loss": 1.1138, + "num_input_tokens_seen": 2034237440, + "step": 970 + }, + { + "epoch": 0.046379170056956874, + "grad_norm": 17.392616271972656, + "learning_rate": 8.926934012264546e-07, + "loss": 0.4161, + "num_input_tokens_seen": 2036334592, + "step": 971 + }, + { + "epoch": 0.04665039327366423, + "grad_norm": 18.09572410583496, + "learning_rate": 8.924316261233933e-07, + "loss": 0.567, + "num_input_tokens_seen": 2038431744, + "step": 972 + }, + { + "epoch": 0.046921616490371576, + "grad_norm": 14.815757751464844, + "learning_rate": 8.921695754443898e-07, + "loss": 0.3801, + "num_input_tokens_seen": 2040528896, + "step": 973 + }, + { + "epoch": 0.047192839707078924, + "grad_norm": 16.38593864440918, + "learning_rate": 8.919072494003325e-07, + "loss": 0.4794, + "num_input_tokens_seen": 2042626048, + "step": 974 + }, + { + "epoch": 0.04746406292378628, + "grad_norm": 13.091635704040527, + "learning_rate": 8.916446482023313e-07, + "loss": 0.394, + "num_input_tokens_seen": 2044723200, + "step": 975 + }, + { + "epoch": 0.047735286140493625, + "grad_norm": 17.225982666015625, + "learning_rate": 8.913817720617178e-07, + "loss": 0.5268, + "num_input_tokens_seen": 2046820352, + "step": 976 + }, + { + "epoch": 0.04800650935720097, + "grad_norm": 12.275423049926758, + "learning_rate": 8.911186211900448e-07, + "loss": 0.3478, + "num_input_tokens_seen": 2048917504, + "step": 977 + }, + { + "epoch": 0.04827773257390833, + "grad_norm": 19.183958053588867, + "learning_rate": 8.908551957990858e-07, + "loss": 0.619, + "num_input_tokens_seen": 2051014656, + "step": 978 + }, + { + "epoch": 0.048548955790615675, + "grad_norm": 22.74364471435547, + "learning_rate": 8.90591496100836e-07, + "loss": 0.6647, + "num_input_tokens_seen": 2053111808, + "step": 979 + }, + { + "epoch": 0.04882017900732303, + "grad_norm": 20.398710250854492, + "learning_rate": 8.903275223075104e-07, + "loss": 0.5442, + "num_input_tokens_seen": 2055208960, + "step": 980 + }, + { + "epoch": 0.04909140222403038, + "grad_norm": 19.808082580566406, + "learning_rate": 8.900632746315454e-07, + "loss": 0.683, + "num_input_tokens_seen": 2057306112, + "step": 981 + }, + { + "epoch": 0.049362625440737724, + "grad_norm": 26.348705291748047, + "learning_rate": 8.897987532855972e-07, + "loss": 1.0303, + "num_input_tokens_seen": 2059403264, + "step": 982 + }, + { + "epoch": 0.04963384865744508, + "grad_norm": 21.460338592529297, + "learning_rate": 8.895339584825429e-07, + "loss": 0.5595, + "num_input_tokens_seen": 2061500416, + "step": 983 + }, + { + "epoch": 0.049905071874152426, + "grad_norm": 14.994651794433594, + "learning_rate": 8.892688904354787e-07, + "loss": 0.4599, + "num_input_tokens_seen": 2063597568, + "step": 984 + }, + { + "epoch": 0.05017629509085978, + "grad_norm": 12.65827751159668, + "learning_rate": 8.890035493577219e-07, + "loss": 0.373, + "num_input_tokens_seen": 2065694720, + "step": 985 + }, + { + "epoch": 0.05044751830756713, + "grad_norm": 22.335948944091797, + "learning_rate": 8.887379354628085e-07, + "loss": 0.9004, + "num_input_tokens_seen": 2067791872, + "step": 986 + }, + { + "epoch": 0.050718741524274476, + "grad_norm": 13.694454193115234, + "learning_rate": 8.884720489644945e-07, + "loss": 0.3551, + "num_input_tokens_seen": 2069889024, + "step": 987 + }, + { + "epoch": 0.05098996474098183, + "grad_norm": 20.320920944213867, + "learning_rate": 8.882058900767555e-07, + "loss": 0.735, + "num_input_tokens_seen": 2071986176, + "step": 988 + }, + { + "epoch": 0.05126118795768918, + "grad_norm": 19.12615966796875, + "learning_rate": 8.879394590137857e-07, + "loss": 0.5857, + "num_input_tokens_seen": 2074083328, + "step": 989 + }, + { + "epoch": 0.051532411174396525, + "grad_norm": 16.400854110717773, + "learning_rate": 8.876727559899989e-07, + "loss": 0.3749, + "num_input_tokens_seen": 2076180480, + "step": 990 + }, + { + "epoch": 0.05180363439110388, + "grad_norm": 24.33226203918457, + "learning_rate": 8.874057812200274e-07, + "loss": 0.9273, + "num_input_tokens_seen": 2078277632, + "step": 991 + }, + { + "epoch": 0.05207485760781123, + "grad_norm": 17.637216567993164, + "learning_rate": 8.871385349187225e-07, + "loss": 0.6876, + "num_input_tokens_seen": 2080374784, + "step": 992 + }, + { + "epoch": 0.05234608082451858, + "grad_norm": 22.15195083618164, + "learning_rate": 8.868710173011538e-07, + "loss": 0.7319, + "num_input_tokens_seen": 2082471936, + "step": 993 + }, + { + "epoch": 0.05261730404122593, + "grad_norm": 16.152517318725586, + "learning_rate": 8.866032285826091e-07, + "loss": 0.4949, + "num_input_tokens_seen": 2084569088, + "step": 994 + }, + { + "epoch": 0.05288852725793328, + "grad_norm": 13.20339584350586, + "learning_rate": 8.86335168978595e-07, + "loss": 0.3432, + "num_input_tokens_seen": 2086666240, + "step": 995 + }, + { + "epoch": 0.05315975047464063, + "grad_norm": 12.26603889465332, + "learning_rate": 8.860668387048353e-07, + "loss": 0.2759, + "num_input_tokens_seen": 2088763392, + "step": 996 + }, + { + "epoch": 0.05343097369134798, + "grad_norm": 12.985957145690918, + "learning_rate": 8.85798237977272e-07, + "loss": 0.3028, + "num_input_tokens_seen": 2090860544, + "step": 997 + }, + { + "epoch": 0.05370219690805533, + "grad_norm": 17.120885848999023, + "learning_rate": 8.85529367012065e-07, + "loss": 0.63, + "num_input_tokens_seen": 2092957696, + "step": 998 + }, + { + "epoch": 0.05397342012476268, + "grad_norm": 17.12887191772461, + "learning_rate": 8.852602260255911e-07, + "loss": 0.4986, + "num_input_tokens_seen": 2095054848, + "step": 999 + }, + { + "epoch": 0.05424464334147003, + "grad_norm": 19.522432327270508, + "learning_rate": 8.849908152344451e-07, + "loss": 0.7428, + "num_input_tokens_seen": 2097152000, + "step": 1000 + }, + { + "epoch": 0.05451586655817738, + "grad_norm": 19.57962417602539, + "learning_rate": 8.847211348554382e-07, + "loss": 0.3491, + "num_input_tokens_seen": 2099249152, + "step": 1001 + }, + { + "epoch": 0.05478708977488473, + "grad_norm": 15.201761245727539, + "learning_rate": 8.844511851055991e-07, + "loss": 0.4238, + "num_input_tokens_seen": 2101346304, + "step": 1002 + }, + { + "epoch": 0.05505831299159208, + "grad_norm": 15.727082252502441, + "learning_rate": 8.841809662021731e-07, + "loss": 0.4377, + "num_input_tokens_seen": 2103443456, + "step": 1003 + }, + { + "epoch": 0.05532953620829943, + "grad_norm": 22.993534088134766, + "learning_rate": 8.839104783626219e-07, + "loss": 0.7603, + "num_input_tokens_seen": 2105540608, + "step": 1004 + }, + { + "epoch": 0.05560075942500678, + "grad_norm": 13.314918518066406, + "learning_rate": 8.836397218046239e-07, + "loss": 0.3576, + "num_input_tokens_seen": 2107637760, + "step": 1005 + }, + { + "epoch": 0.055871982641714134, + "grad_norm": 14.432062149047852, + "learning_rate": 8.83368696746074e-07, + "loss": 0.4315, + "num_input_tokens_seen": 2109734912, + "step": 1006 + }, + { + "epoch": 0.05614320585842148, + "grad_norm": 20.760156631469727, + "learning_rate": 8.830974034050824e-07, + "loss": 0.7692, + "num_input_tokens_seen": 2111832064, + "step": 1007 + }, + { + "epoch": 0.05641442907512883, + "grad_norm": 10.294124603271484, + "learning_rate": 8.828258419999759e-07, + "loss": 0.283, + "num_input_tokens_seen": 2113929216, + "step": 1008 + }, + { + "epoch": 0.05668565229183618, + "grad_norm": 10.813933372497559, + "learning_rate": 8.825540127492965e-07, + "loss": 0.2858, + "num_input_tokens_seen": 2116026368, + "step": 1009 + }, + { + "epoch": 0.05695687550854353, + "grad_norm": 23.46965217590332, + "learning_rate": 8.822819158718026e-07, + "loss": 0.717, + "num_input_tokens_seen": 2118123520, + "step": 1010 + }, + { + "epoch": 0.05722809872525088, + "grad_norm": 19.347179412841797, + "learning_rate": 8.820095515864669e-07, + "loss": 0.5001, + "num_input_tokens_seen": 2120220672, + "step": 1011 + }, + { + "epoch": 0.05749932194195823, + "grad_norm": 18.49291229248047, + "learning_rate": 8.81736920112478e-07, + "loss": 0.5409, + "num_input_tokens_seen": 2122317824, + "step": 1012 + }, + { + "epoch": 0.05777054515866558, + "grad_norm": 14.770519256591797, + "learning_rate": 8.814640216692391e-07, + "loss": 0.4008, + "num_input_tokens_seen": 2124414976, + "step": 1013 + }, + { + "epoch": 0.058041768375372935, + "grad_norm": 22.392454147338867, + "learning_rate": 8.81190856476369e-07, + "loss": 0.8029, + "num_input_tokens_seen": 2126512128, + "step": 1014 + }, + { + "epoch": 0.05831299159208028, + "grad_norm": 19.39711570739746, + "learning_rate": 8.809174247537003e-07, + "loss": 0.5948, + "num_input_tokens_seen": 2128609280, + "step": 1015 + }, + { + "epoch": 0.05858421480878763, + "grad_norm": 11.22283935546875, + "learning_rate": 8.806437267212805e-07, + "loss": 0.2487, + "num_input_tokens_seen": 2130706432, + "step": 1016 + }, + { + "epoch": 0.058855438025494984, + "grad_norm": 23.879262924194336, + "learning_rate": 8.803697625993713e-07, + "loss": 0.8866, + "num_input_tokens_seen": 2132803584, + "step": 1017 + }, + { + "epoch": 0.05912666124220233, + "grad_norm": 17.010164260864258, + "learning_rate": 8.800955326084487e-07, + "loss": 0.6154, + "num_input_tokens_seen": 2134900736, + "step": 1018 + }, + { + "epoch": 0.059397884458909686, + "grad_norm": 12.31965160369873, + "learning_rate": 8.798210369692025e-07, + "loss": 0.2381, + "num_input_tokens_seen": 2136997888, + "step": 1019 + }, + { + "epoch": 0.059669107675617034, + "grad_norm": 14.956830978393555, + "learning_rate": 8.795462759025364e-07, + "loss": 0.4989, + "num_input_tokens_seen": 2139095040, + "step": 1020 + }, + { + "epoch": 0.05994033089232438, + "grad_norm": 11.59531021118164, + "learning_rate": 8.792712496295677e-07, + "loss": 0.3168, + "num_input_tokens_seen": 2141192192, + "step": 1021 + }, + { + "epoch": 0.060211554109031735, + "grad_norm": 15.695453643798828, + "learning_rate": 8.789959583716268e-07, + "loss": 0.4643, + "num_input_tokens_seen": 2143289344, + "step": 1022 + }, + { + "epoch": 0.06048277732573908, + "grad_norm": 12.228116989135742, + "learning_rate": 8.787204023502579e-07, + "loss": 0.2886, + "num_input_tokens_seen": 2145386496, + "step": 1023 + }, + { + "epoch": 0.06075400054244643, + "grad_norm": 15.869124412536621, + "learning_rate": 8.78444581787218e-07, + "loss": 0.4706, + "num_input_tokens_seen": 2147483648, + "step": 1024 + }, + { + "epoch": 0.061025223759153785, + "grad_norm": 17.77741241455078, + "learning_rate": 8.781684969044769e-07, + "loss": 0.4961, + "num_input_tokens_seen": 2149580800, + "step": 1025 + }, + { + "epoch": 0.06129644697586113, + "grad_norm": 18.270593643188477, + "learning_rate": 8.778921479242173e-07, + "loss": 0.5621, + "num_input_tokens_seen": 2151677952, + "step": 1026 + }, + { + "epoch": 0.06156767019256849, + "grad_norm": 10.54736614227295, + "learning_rate": 8.776155350688342e-07, + "loss": 0.2092, + "num_input_tokens_seen": 2153775104, + "step": 1027 + }, + { + "epoch": 0.061838893409275834, + "grad_norm": 14.527905464172363, + "learning_rate": 8.773386585609352e-07, + "loss": 0.2644, + "num_input_tokens_seen": 2155872256, + "step": 1028 + }, + { + "epoch": 0.06211011662598318, + "grad_norm": 14.493075370788574, + "learning_rate": 8.770615186233398e-07, + "loss": 0.4327, + "num_input_tokens_seen": 2157969408, + "step": 1029 + }, + { + "epoch": 0.062381339842690536, + "grad_norm": 15.283905982971191, + "learning_rate": 8.7678411547908e-07, + "loss": 0.3887, + "num_input_tokens_seen": 2160066560, + "step": 1030 + }, + { + "epoch": 0.06265256305939788, + "grad_norm": 22.41847801208496, + "learning_rate": 8.76506449351399e-07, + "loss": 0.8746, + "num_input_tokens_seen": 2162163712, + "step": 1031 + }, + { + "epoch": 0.06292378627610523, + "grad_norm": 12.715897560119629, + "learning_rate": 8.762285204637522e-07, + "loss": 0.3222, + "num_input_tokens_seen": 2164260864, + "step": 1032 + }, + { + "epoch": 0.06319500949281258, + "grad_norm": 25.04868507385254, + "learning_rate": 8.75950329039806e-07, + "loss": 0.9287, + "num_input_tokens_seen": 2166358016, + "step": 1033 + }, + { + "epoch": 0.06346623270951994, + "grad_norm": 14.874964714050293, + "learning_rate": 8.756718753034381e-07, + "loss": 0.4544, + "num_input_tokens_seen": 2168455168, + "step": 1034 + }, + { + "epoch": 0.06373745592622729, + "grad_norm": 15.657155990600586, + "learning_rate": 8.75393159478738e-07, + "loss": 0.4122, + "num_input_tokens_seen": 2170552320, + "step": 1035 + }, + { + "epoch": 0.06400867914293464, + "grad_norm": 20.458627700805664, + "learning_rate": 8.751141817900052e-07, + "loss": 0.5967, + "num_input_tokens_seen": 2172649472, + "step": 1036 + }, + { + "epoch": 0.06427990235964198, + "grad_norm": 11.38912296295166, + "learning_rate": 8.748349424617504e-07, + "loss": 0.2614, + "num_input_tokens_seen": 2174746624, + "step": 1037 + }, + { + "epoch": 0.06455112557634933, + "grad_norm": 20.700756072998047, + "learning_rate": 8.745554417186946e-07, + "loss": 0.6497, + "num_input_tokens_seen": 2176843776, + "step": 1038 + }, + { + "epoch": 0.06482234879305669, + "grad_norm": 20.341596603393555, + "learning_rate": 8.742756797857698e-07, + "loss": 0.7146, + "num_input_tokens_seen": 2178940928, + "step": 1039 + }, + { + "epoch": 0.06509357200976404, + "grad_norm": 17.363826751708984, + "learning_rate": 8.739956568881174e-07, + "loss": 0.4446, + "num_input_tokens_seen": 2181038080, + "step": 1040 + }, + { + "epoch": 0.06536479522647139, + "grad_norm": 21.04477882385254, + "learning_rate": 8.737153732510894e-07, + "loss": 0.4774, + "num_input_tokens_seen": 2183135232, + "step": 1041 + }, + { + "epoch": 0.06563601844317873, + "grad_norm": 13.529879570007324, + "learning_rate": 8.734348291002472e-07, + "loss": 0.4042, + "num_input_tokens_seen": 2185232384, + "step": 1042 + }, + { + "epoch": 0.06590724165988608, + "grad_norm": 17.152034759521484, + "learning_rate": 8.731540246613621e-07, + "loss": 0.6437, + "num_input_tokens_seen": 2187329536, + "step": 1043 + }, + { + "epoch": 0.06617846487659344, + "grad_norm": 11.518980979919434, + "learning_rate": 8.728729601604149e-07, + "loss": 0.2645, + "num_input_tokens_seen": 2189426688, + "step": 1044 + }, + { + "epoch": 0.06644968809330079, + "grad_norm": 19.25179100036621, + "learning_rate": 8.725916358235956e-07, + "loss": 0.5292, + "num_input_tokens_seen": 2191523840, + "step": 1045 + }, + { + "epoch": 0.06672091131000814, + "grad_norm": 19.196977615356445, + "learning_rate": 8.723100518773034e-07, + "loss": 0.6961, + "num_input_tokens_seen": 2193620992, + "step": 1046 + }, + { + "epoch": 0.06699213452671549, + "grad_norm": 12.074904441833496, + "learning_rate": 8.720282085481463e-07, + "loss": 0.3306, + "num_input_tokens_seen": 2195718144, + "step": 1047 + }, + { + "epoch": 0.06726335774342283, + "grad_norm": 15.995697975158691, + "learning_rate": 8.717461060629408e-07, + "loss": 0.4436, + "num_input_tokens_seen": 2197815296, + "step": 1048 + }, + { + "epoch": 0.06753458096013018, + "grad_norm": 16.870420455932617, + "learning_rate": 8.714637446487127e-07, + "loss": 0.5042, + "num_input_tokens_seen": 2199912448, + "step": 1049 + }, + { + "epoch": 0.06780580417683754, + "grad_norm": 16.621347427368164, + "learning_rate": 8.711811245326955e-07, + "loss": 0.4686, + "num_input_tokens_seen": 2202009600, + "step": 1050 + }, + { + "epoch": 0.06807702739354489, + "grad_norm": 20.809621810913086, + "learning_rate": 8.70898245942331e-07, + "loss": 0.5436, + "num_input_tokens_seen": 2204106752, + "step": 1051 + }, + { + "epoch": 0.06834825061025224, + "grad_norm": 18.218713760375977, + "learning_rate": 8.706151091052693e-07, + "loss": 0.5587, + "num_input_tokens_seen": 2206203904, + "step": 1052 + }, + { + "epoch": 0.06861947382695958, + "grad_norm": 15.873334884643555, + "learning_rate": 8.703317142493681e-07, + "loss": 0.5315, + "num_input_tokens_seen": 2208301056, + "step": 1053 + }, + { + "epoch": 0.06889069704366693, + "grad_norm": 22.45003890991211, + "learning_rate": 8.700480616026928e-07, + "loss": 0.6862, + "num_input_tokens_seen": 2210398208, + "step": 1054 + }, + { + "epoch": 0.0691619202603743, + "grad_norm": 23.83209991455078, + "learning_rate": 8.697641513935164e-07, + "loss": 0.5649, + "num_input_tokens_seen": 2212495360, + "step": 1055 + }, + { + "epoch": 0.06943314347708164, + "grad_norm": 16.99089241027832, + "learning_rate": 8.694799838503186e-07, + "loss": 0.5258, + "num_input_tokens_seen": 2214592512, + "step": 1056 + }, + { + "epoch": 0.06970436669378899, + "grad_norm": 12.274675369262695, + "learning_rate": 8.691955592017872e-07, + "loss": 0.3312, + "num_input_tokens_seen": 2216689664, + "step": 1057 + }, + { + "epoch": 0.06997558991049634, + "grad_norm": 20.667945861816406, + "learning_rate": 8.689108776768159e-07, + "loss": 0.6978, + "num_input_tokens_seen": 2218786816, + "step": 1058 + }, + { + "epoch": 0.07024681312720368, + "grad_norm": 18.657791137695312, + "learning_rate": 8.686259395045056e-07, + "loss": 0.6072, + "num_input_tokens_seen": 2220883968, + "step": 1059 + }, + { + "epoch": 0.07051803634391104, + "grad_norm": 15.566442489624023, + "learning_rate": 8.68340744914164e-07, + "loss": 0.4635, + "num_input_tokens_seen": 2222981120, + "step": 1060 + }, + { + "epoch": 0.07078925956061839, + "grad_norm": 14.146040916442871, + "learning_rate": 8.680552941353045e-07, + "loss": 0.2827, + "num_input_tokens_seen": 2225078272, + "step": 1061 + }, + { + "epoch": 0.07106048277732574, + "grad_norm": 22.9758358001709, + "learning_rate": 8.677695873976473e-07, + "loss": 0.6515, + "num_input_tokens_seen": 2227175424, + "step": 1062 + }, + { + "epoch": 0.07133170599403309, + "grad_norm": 14.749287605285645, + "learning_rate": 8.674836249311182e-07, + "loss": 0.4001, + "num_input_tokens_seen": 2229272576, + "step": 1063 + }, + { + "epoch": 0.07160292921074043, + "grad_norm": 18.237518310546875, + "learning_rate": 8.671974069658488e-07, + "loss": 0.6569, + "num_input_tokens_seen": 2231369728, + "step": 1064 + }, + { + "epoch": 0.0718741524274478, + "grad_norm": 21.23828125, + "learning_rate": 8.669109337321767e-07, + "loss": 0.5926, + "num_input_tokens_seen": 2233466880, + "step": 1065 + }, + { + "epoch": 0.07214537564415514, + "grad_norm": 13.614550590515137, + "learning_rate": 8.666242054606444e-07, + "loss": 0.4136, + "num_input_tokens_seen": 2235564032, + "step": 1066 + }, + { + "epoch": 0.07241659886086249, + "grad_norm": 19.72222900390625, + "learning_rate": 8.66337222382e-07, + "loss": 0.5478, + "num_input_tokens_seen": 2237661184, + "step": 1067 + }, + { + "epoch": 0.07268782207756984, + "grad_norm": 24.28289222717285, + "learning_rate": 8.660499847271965e-07, + "loss": 0.7417, + "num_input_tokens_seen": 2239758336, + "step": 1068 + }, + { + "epoch": 0.07295904529427719, + "grad_norm": 15.440895080566406, + "learning_rate": 8.657624927273919e-07, + "loss": 0.3927, + "num_input_tokens_seen": 2241855488, + "step": 1069 + }, + { + "epoch": 0.07323026851098453, + "grad_norm": 14.023824691772461, + "learning_rate": 8.654747466139488e-07, + "loss": 0.4269, + "num_input_tokens_seen": 2243952640, + "step": 1070 + }, + { + "epoch": 0.0735014917276919, + "grad_norm": 25.52799415588379, + "learning_rate": 8.651867466184344e-07, + "loss": 0.5167, + "num_input_tokens_seen": 2246049792, + "step": 1071 + }, + { + "epoch": 0.07377271494439924, + "grad_norm": 15.417862892150879, + "learning_rate": 8.6489849297262e-07, + "loss": 0.4347, + "num_input_tokens_seen": 2248146944, + "step": 1072 + }, + { + "epoch": 0.07404393816110659, + "grad_norm": 17.311777114868164, + "learning_rate": 8.646099859084812e-07, + "loss": 0.4029, + "num_input_tokens_seen": 2250244096, + "step": 1073 + }, + { + "epoch": 0.07431516137781394, + "grad_norm": 15.483137130737305, + "learning_rate": 8.643212256581978e-07, + "loss": 0.4559, + "num_input_tokens_seen": 2252341248, + "step": 1074 + }, + { + "epoch": 0.07458638459452128, + "grad_norm": 18.89014434814453, + "learning_rate": 8.640322124541525e-07, + "loss": 0.4146, + "num_input_tokens_seen": 2254438400, + "step": 1075 + }, + { + "epoch": 0.07485760781122865, + "grad_norm": 23.15615463256836, + "learning_rate": 8.637429465289324e-07, + "loss": 0.7084, + "num_input_tokens_seen": 2256535552, + "step": 1076 + }, + { + "epoch": 0.075128831027936, + "grad_norm": 26.009380340576172, + "learning_rate": 8.63453428115328e-07, + "loss": 0.9022, + "num_input_tokens_seen": 2258632704, + "step": 1077 + }, + { + "epoch": 0.07540005424464334, + "grad_norm": 28.278987884521484, + "learning_rate": 8.631636574463321e-07, + "loss": 0.9285, + "num_input_tokens_seen": 2260729856, + "step": 1078 + }, + { + "epoch": 0.07567127746135069, + "grad_norm": 15.612220764160156, + "learning_rate": 8.628736347551417e-07, + "loss": 0.5127, + "num_input_tokens_seen": 2262827008, + "step": 1079 + }, + { + "epoch": 0.07594250067805804, + "grad_norm": 15.422523498535156, + "learning_rate": 8.625833602751559e-07, + "loss": 0.4181, + "num_input_tokens_seen": 2264924160, + "step": 1080 + }, + { + "epoch": 0.0762137238947654, + "grad_norm": 26.36939239501953, + "learning_rate": 8.622928342399762e-07, + "loss": 0.9629, + "num_input_tokens_seen": 2267021312, + "step": 1081 + }, + { + "epoch": 0.07648494711147275, + "grad_norm": 17.090736389160156, + "learning_rate": 8.620020568834072e-07, + "loss": 0.6093, + "num_input_tokens_seen": 2269118464, + "step": 1082 + }, + { + "epoch": 0.07675617032818009, + "grad_norm": 22.48678207397461, + "learning_rate": 8.617110284394553e-07, + "loss": 0.5369, + "num_input_tokens_seen": 2271215616, + "step": 1083 + }, + { + "epoch": 0.07702739354488744, + "grad_norm": 16.42003059387207, + "learning_rate": 8.614197491423293e-07, + "loss": 0.3614, + "num_input_tokens_seen": 2273312768, + "step": 1084 + }, + { + "epoch": 0.07729861676159479, + "grad_norm": 45.502445220947266, + "learning_rate": 8.611282192264396e-07, + "loss": 0.7799, + "num_input_tokens_seen": 2275409920, + "step": 1085 + }, + { + "epoch": 0.07756983997830215, + "grad_norm": 11.837736129760742, + "learning_rate": 8.608364389263984e-07, + "loss": 0.3233, + "num_input_tokens_seen": 2277507072, + "step": 1086 + }, + { + "epoch": 0.0778410631950095, + "grad_norm": 18.987957000732422, + "learning_rate": 8.605444084770192e-07, + "loss": 0.5218, + "num_input_tokens_seen": 2279604224, + "step": 1087 + }, + { + "epoch": 0.07811228641171684, + "grad_norm": 17.756649017333984, + "learning_rate": 8.602521281133173e-07, + "loss": 0.4539, + "num_input_tokens_seen": 2281701376, + "step": 1088 + }, + { + "epoch": 0.07838350962842419, + "grad_norm": 21.006437301635742, + "learning_rate": 8.599595980705085e-07, + "loss": 0.2876, + "num_input_tokens_seen": 2283798528, + "step": 1089 + }, + { + "epoch": 0.07865473284513154, + "grad_norm": 17.649852752685547, + "learning_rate": 8.596668185840102e-07, + "loss": 0.5252, + "num_input_tokens_seen": 2285895680, + "step": 1090 + }, + { + "epoch": 0.07892595606183889, + "grad_norm": 15.316368103027344, + "learning_rate": 8.593737898894398e-07, + "loss": 0.5004, + "num_input_tokens_seen": 2287992832, + "step": 1091 + }, + { + "epoch": 0.07919717927854625, + "grad_norm": 16.443462371826172, + "learning_rate": 8.59080512222616e-07, + "loss": 0.5126, + "num_input_tokens_seen": 2290089984, + "step": 1092 + }, + { + "epoch": 0.0794684024952536, + "grad_norm": 25.65877342224121, + "learning_rate": 8.587869858195574e-07, + "loss": 0.6624, + "num_input_tokens_seen": 2292187136, + "step": 1093 + }, + { + "epoch": 0.07973962571196094, + "grad_norm": 15.421908378601074, + "learning_rate": 8.584932109164826e-07, + "loss": 0.5005, + "num_input_tokens_seen": 2294284288, + "step": 1094 + }, + { + "epoch": 0.08001084892866829, + "grad_norm": 12.648368835449219, + "learning_rate": 8.581991877498109e-07, + "loss": 0.3672, + "num_input_tokens_seen": 2296381440, + "step": 1095 + }, + { + "epoch": 0.08028207214537564, + "grad_norm": 20.8604679107666, + "learning_rate": 8.579049165561607e-07, + "loss": 0.6201, + "num_input_tokens_seen": 2298478592, + "step": 1096 + }, + { + "epoch": 0.080553295362083, + "grad_norm": 18.767131805419922, + "learning_rate": 8.576103975723502e-07, + "loss": 0.6181, + "num_input_tokens_seen": 2300575744, + "step": 1097 + }, + { + "epoch": 0.08082451857879035, + "grad_norm": 18.214879989624023, + "learning_rate": 8.573156310353974e-07, + "loss": 0.5909, + "num_input_tokens_seen": 2302672896, + "step": 1098 + }, + { + "epoch": 0.0810957417954977, + "grad_norm": 15.940201759338379, + "learning_rate": 8.570206171825188e-07, + "loss": 0.5828, + "num_input_tokens_seen": 2304770048, + "step": 1099 + }, + { + "epoch": 0.08136696501220504, + "grad_norm": 13.9312744140625, + "learning_rate": 8.567253562511306e-07, + "loss": 0.4176, + "num_input_tokens_seen": 2306867200, + "step": 1100 + }, + { + "epoch": 0.08163818822891239, + "grad_norm": 16.388742446899414, + "learning_rate": 8.564298484788472e-07, + "loss": 0.4367, + "num_input_tokens_seen": 2308964352, + "step": 1101 + }, + { + "epoch": 0.08190941144561975, + "grad_norm": 11.300383567810059, + "learning_rate": 8.561340941034825e-07, + "loss": 0.2912, + "num_input_tokens_seen": 2311061504, + "step": 1102 + }, + { + "epoch": 0.0821806346623271, + "grad_norm": 19.192779541015625, + "learning_rate": 8.55838093363048e-07, + "loss": 0.6453, + "num_input_tokens_seen": 2313158656, + "step": 1103 + }, + { + "epoch": 0.08245185787903445, + "grad_norm": 20.092891693115234, + "learning_rate": 8.555418464957542e-07, + "loss": 0.6856, + "num_input_tokens_seen": 2315255808, + "step": 1104 + }, + { + "epoch": 0.0827230810957418, + "grad_norm": 16.484115600585938, + "learning_rate": 8.552453537400089e-07, + "loss": 0.4858, + "num_input_tokens_seen": 2317352960, + "step": 1105 + }, + { + "epoch": 0.08299430431244914, + "grad_norm": 27.20563316345215, + "learning_rate": 8.549486153344183e-07, + "loss": 0.7113, + "num_input_tokens_seen": 2319450112, + "step": 1106 + }, + { + "epoch": 0.0832655275291565, + "grad_norm": 17.81060028076172, + "learning_rate": 8.546516315177863e-07, + "loss": 0.6143, + "num_input_tokens_seen": 2321547264, + "step": 1107 + }, + { + "epoch": 0.08353675074586385, + "grad_norm": 17.569068908691406, + "learning_rate": 8.543544025291143e-07, + "loss": 0.5014, + "num_input_tokens_seen": 2323644416, + "step": 1108 + }, + { + "epoch": 0.0838079739625712, + "grad_norm": 15.904462814331055, + "learning_rate": 8.540569286076004e-07, + "loss": 0.4685, + "num_input_tokens_seen": 2325741568, + "step": 1109 + }, + { + "epoch": 0.08407919717927854, + "grad_norm": 19.58460807800293, + "learning_rate": 8.537592099926407e-07, + "loss": 0.6062, + "num_input_tokens_seen": 2327838720, + "step": 1110 + }, + { + "epoch": 0.08435042039598589, + "grad_norm": 13.513802528381348, + "learning_rate": 8.534612469238278e-07, + "loss": 0.3705, + "num_input_tokens_seen": 2329935872, + "step": 1111 + }, + { + "epoch": 0.08462164361269324, + "grad_norm": 20.021507263183594, + "learning_rate": 8.531630396409507e-07, + "loss": 0.677, + "num_input_tokens_seen": 2332033024, + "step": 1112 + }, + { + "epoch": 0.0848928668294006, + "grad_norm": 14.840436935424805, + "learning_rate": 8.528645883839956e-07, + "loss": 0.5015, + "num_input_tokens_seen": 2334130176, + "step": 1113 + }, + { + "epoch": 0.08516409004610795, + "grad_norm": 15.523248672485352, + "learning_rate": 8.525658933931448e-07, + "loss": 0.5022, + "num_input_tokens_seen": 2336227328, + "step": 1114 + }, + { + "epoch": 0.0854353132628153, + "grad_norm": 17.14645004272461, + "learning_rate": 8.522669549087762e-07, + "loss": 0.5963, + "num_input_tokens_seen": 2338324480, + "step": 1115 + }, + { + "epoch": 0.08570653647952264, + "grad_norm": 20.818403244018555, + "learning_rate": 8.519677731714645e-07, + "loss": 0.7184, + "num_input_tokens_seen": 2340421632, + "step": 1116 + }, + { + "epoch": 0.08597775969622999, + "grad_norm": 13.36194896697998, + "learning_rate": 8.516683484219797e-07, + "loss": 0.4394, + "num_input_tokens_seen": 2342518784, + "step": 1117 + }, + { + "epoch": 0.08624898291293735, + "grad_norm": 18.085691452026367, + "learning_rate": 8.513686809012875e-07, + "loss": 0.5567, + "num_input_tokens_seen": 2344615936, + "step": 1118 + }, + { + "epoch": 0.0865202061296447, + "grad_norm": 18.329633712768555, + "learning_rate": 8.510687708505489e-07, + "loss": 0.5698, + "num_input_tokens_seen": 2346713088, + "step": 1119 + }, + { + "epoch": 0.08679142934635205, + "grad_norm": 23.978832244873047, + "learning_rate": 8.507686185111199e-07, + "loss": 0.5401, + "num_input_tokens_seen": 2348810240, + "step": 1120 + }, + { + "epoch": 0.0870626525630594, + "grad_norm": 19.02138328552246, + "learning_rate": 8.504682241245516e-07, + "loss": 0.3782, + "num_input_tokens_seen": 2350907392, + "step": 1121 + }, + { + "epoch": 0.08733387577976674, + "grad_norm": 10.588142395019531, + "learning_rate": 8.501675879325906e-07, + "loss": 0.2031, + "num_input_tokens_seen": 2353004544, + "step": 1122 + }, + { + "epoch": 0.0876050989964741, + "grad_norm": 18.56726837158203, + "learning_rate": 8.498667101771769e-07, + "loss": 0.4395, + "num_input_tokens_seen": 2355101696, + "step": 1123 + }, + { + "epoch": 0.08787632221318145, + "grad_norm": 21.876890182495117, + "learning_rate": 8.495655911004456e-07, + "loss": 0.6843, + "num_input_tokens_seen": 2357198848, + "step": 1124 + }, + { + "epoch": 0.0881475454298888, + "grad_norm": 17.283998489379883, + "learning_rate": 8.492642309447257e-07, + "loss": 0.508, + "num_input_tokens_seen": 2359296000, + "step": 1125 + }, + { + "epoch": 0.08841876864659615, + "grad_norm": 12.905929565429688, + "learning_rate": 8.489626299525409e-07, + "loss": 0.3963, + "num_input_tokens_seen": 2361393152, + "step": 1126 + }, + { + "epoch": 0.0886899918633035, + "grad_norm": 14.936822891235352, + "learning_rate": 8.486607883666077e-07, + "loss": 0.4069, + "num_input_tokens_seen": 2363490304, + "step": 1127 + }, + { + "epoch": 0.08896121508001086, + "grad_norm": 16.91001319885254, + "learning_rate": 8.483587064298372e-07, + "loss": 0.5104, + "num_input_tokens_seen": 2365587456, + "step": 1128 + }, + { + "epoch": 0.0892324382967182, + "grad_norm": 15.12585735321045, + "learning_rate": 8.480563843853328e-07, + "loss": 0.3908, + "num_input_tokens_seen": 2367684608, + "step": 1129 + }, + { + "epoch": 0.08950366151342555, + "grad_norm": 25.309404373168945, + "learning_rate": 8.477538224763923e-07, + "loss": 0.6482, + "num_input_tokens_seen": 2369781760, + "step": 1130 + }, + { + "epoch": 0.0897748847301329, + "grad_norm": 26.27945899963379, + "learning_rate": 8.474510209465058e-07, + "loss": 0.4492, + "num_input_tokens_seen": 2371878912, + "step": 1131 + }, + { + "epoch": 0.09004610794684025, + "grad_norm": 19.251035690307617, + "learning_rate": 8.471479800393565e-07, + "loss": 0.545, + "num_input_tokens_seen": 2373976064, + "step": 1132 + }, + { + "epoch": 0.09031733116354759, + "grad_norm": 19.276018142700195, + "learning_rate": 8.468446999988202e-07, + "loss": 0.5393, + "num_input_tokens_seen": 2376073216, + "step": 1133 + }, + { + "epoch": 0.09058855438025495, + "grad_norm": 25.423450469970703, + "learning_rate": 8.465411810689653e-07, + "loss": 0.9826, + "num_input_tokens_seen": 2378170368, + "step": 1134 + }, + { + "epoch": 0.0908597775969623, + "grad_norm": 17.12239646911621, + "learning_rate": 8.462374234940517e-07, + "loss": 0.487, + "num_input_tokens_seen": 2380267520, + "step": 1135 + }, + { + "epoch": 0.09113100081366965, + "grad_norm": 14.614494323730469, + "learning_rate": 8.459334275185325e-07, + "loss": 0.3742, + "num_input_tokens_seen": 2382364672, + "step": 1136 + }, + { + "epoch": 0.091402224030377, + "grad_norm": 27.48741340637207, + "learning_rate": 8.456291933870521e-07, + "loss": 1.0046, + "num_input_tokens_seen": 2384461824, + "step": 1137 + }, + { + "epoch": 0.09167344724708434, + "grad_norm": 12.554402351379395, + "learning_rate": 8.453247213444463e-07, + "loss": 0.2549, + "num_input_tokens_seen": 2386558976, + "step": 1138 + }, + { + "epoch": 0.0919446704637917, + "grad_norm": 14.78433895111084, + "learning_rate": 8.450200116357428e-07, + "loss": 0.5053, + "num_input_tokens_seen": 2388656128, + "step": 1139 + }, + { + "epoch": 0.09221589368049905, + "grad_norm": 12.076690673828125, + "learning_rate": 8.4471506450616e-07, + "loss": 0.3622, + "num_input_tokens_seen": 2390753280, + "step": 1140 + }, + { + "epoch": 0.0924871168972064, + "grad_norm": 18.700605392456055, + "learning_rate": 8.444098802011083e-07, + "loss": 0.472, + "num_input_tokens_seen": 2392850432, + "step": 1141 + }, + { + "epoch": 0.09275834011391375, + "grad_norm": 19.415464401245117, + "learning_rate": 8.441044589661881e-07, + "loss": 0.6367, + "num_input_tokens_seen": 2394947584, + "step": 1142 + }, + { + "epoch": 0.0930295633306211, + "grad_norm": 19.17083740234375, + "learning_rate": 8.437988010471907e-07, + "loss": 0.51, + "num_input_tokens_seen": 2397044736, + "step": 1143 + }, + { + "epoch": 0.09330078654732846, + "grad_norm": 16.67664337158203, + "learning_rate": 8.434929066900982e-07, + "loss": 0.533, + "num_input_tokens_seen": 2399141888, + "step": 1144 + }, + { + "epoch": 0.0935720097640358, + "grad_norm": 20.507274627685547, + "learning_rate": 8.431867761410826e-07, + "loss": 0.8218, + "num_input_tokens_seen": 2401239040, + "step": 1145 + }, + { + "epoch": 0.09384323298074315, + "grad_norm": 14.855219841003418, + "learning_rate": 8.42880409646506e-07, + "loss": 0.4415, + "num_input_tokens_seen": 2403336192, + "step": 1146 + }, + { + "epoch": 0.0941144561974505, + "grad_norm": 19.247512817382812, + "learning_rate": 8.42573807452921e-07, + "loss": 0.485, + "num_input_tokens_seen": 2405433344, + "step": 1147 + }, + { + "epoch": 0.09438567941415785, + "grad_norm": 23.100740432739258, + "learning_rate": 8.422669698070687e-07, + "loss": 0.5588, + "num_input_tokens_seen": 2407530496, + "step": 1148 + }, + { + "epoch": 0.09465690263086521, + "grad_norm": 12.322291374206543, + "learning_rate": 8.419598969558808e-07, + "loss": 0.2426, + "num_input_tokens_seen": 2409627648, + "step": 1149 + }, + { + "epoch": 0.09492812584757256, + "grad_norm": 11.358101844787598, + "learning_rate": 8.416525891464776e-07, + "loss": 0.3201, + "num_input_tokens_seen": 2411724800, + "step": 1150 + }, + { + "epoch": 0.0951993490642799, + "grad_norm": 16.695714950561523, + "learning_rate": 8.413450466261691e-07, + "loss": 0.4272, + "num_input_tokens_seen": 2413821952, + "step": 1151 + }, + { + "epoch": 0.09547057228098725, + "grad_norm": 14.923744201660156, + "learning_rate": 8.410372696424535e-07, + "loss": 0.3545, + "num_input_tokens_seen": 2415919104, + "step": 1152 + }, + { + "epoch": 0.0957417954976946, + "grad_norm": 13.441437721252441, + "learning_rate": 8.40729258443018e-07, + "loss": 0.4019, + "num_input_tokens_seen": 2418016256, + "step": 1153 + }, + { + "epoch": 0.09601301871440195, + "grad_norm": 13.833053588867188, + "learning_rate": 8.404210132757385e-07, + "loss": 0.3352, + "num_input_tokens_seen": 2420113408, + "step": 1154 + }, + { + "epoch": 0.09628424193110931, + "grad_norm": 13.472502708435059, + "learning_rate": 8.401125343886787e-07, + "loss": 0.5448, + "num_input_tokens_seen": 2422210560, + "step": 1155 + }, + { + "epoch": 0.09655546514781665, + "grad_norm": 10.587477684020996, + "learning_rate": 8.398038220300908e-07, + "loss": 0.3018, + "num_input_tokens_seen": 2424307712, + "step": 1156 + }, + { + "epoch": 0.096826688364524, + "grad_norm": 19.353105545043945, + "learning_rate": 8.39494876448415e-07, + "loss": 0.7406, + "num_input_tokens_seen": 2426404864, + "step": 1157 + }, + { + "epoch": 0.09709791158123135, + "grad_norm": 24.70726203918457, + "learning_rate": 8.391856978922785e-07, + "loss": 0.8482, + "num_input_tokens_seen": 2428502016, + "step": 1158 + }, + { + "epoch": 0.0973691347979387, + "grad_norm": 15.665159225463867, + "learning_rate": 8.38876286610497e-07, + "loss": 0.232, + "num_input_tokens_seen": 2430599168, + "step": 1159 + }, + { + "epoch": 0.09764035801464606, + "grad_norm": 19.37482452392578, + "learning_rate": 8.385666428520723e-07, + "loss": 0.6336, + "num_input_tokens_seen": 2432696320, + "step": 1160 + }, + { + "epoch": 0.0979115812313534, + "grad_norm": 21.107303619384766, + "learning_rate": 8.382567668661943e-07, + "loss": 0.6754, + "num_input_tokens_seen": 2434793472, + "step": 1161 + }, + { + "epoch": 0.09818280444806075, + "grad_norm": 21.173263549804688, + "learning_rate": 8.379466589022393e-07, + "loss": 0.613, + "num_input_tokens_seen": 2436890624, + "step": 1162 + }, + { + "epoch": 0.0984540276647681, + "grad_norm": 23.236482620239258, + "learning_rate": 8.376363192097703e-07, + "loss": 0.5053, + "num_input_tokens_seen": 2438987776, + "step": 1163 + }, + { + "epoch": 0.09872525088147545, + "grad_norm": 24.89634132385254, + "learning_rate": 8.37325748038537e-07, + "loss": 0.6246, + "num_input_tokens_seen": 2441084928, + "step": 1164 + }, + { + "epoch": 0.09899647409818281, + "grad_norm": 17.755111694335938, + "learning_rate": 8.370149456384754e-07, + "loss": 0.4242, + "num_input_tokens_seen": 2443182080, + "step": 1165 + }, + { + "epoch": 0.09926769731489016, + "grad_norm": 12.603349685668945, + "learning_rate": 8.36703912259707e-07, + "loss": 0.2753, + "num_input_tokens_seen": 2445279232, + "step": 1166 + }, + { + "epoch": 0.0995389205315975, + "grad_norm": 24.28446388244629, + "learning_rate": 8.363926481525402e-07, + "loss": 0.7312, + "num_input_tokens_seen": 2447376384, + "step": 1167 + }, + { + "epoch": 0.09981014374830485, + "grad_norm": 16.006052017211914, + "learning_rate": 8.360811535674682e-07, + "loss": 0.478, + "num_input_tokens_seen": 2449473536, + "step": 1168 + }, + { + "epoch": 0.1000813669650122, + "grad_norm": 25.193614959716797, + "learning_rate": 8.357694287551698e-07, + "loss": 0.779, + "num_input_tokens_seen": 2451570688, + "step": 1169 + }, + { + "epoch": 0.10035259018171956, + "grad_norm": 14.870660781860352, + "learning_rate": 8.354574739665096e-07, + "loss": 0.496, + "num_input_tokens_seen": 2453667840, + "step": 1170 + }, + { + "epoch": 0.10062381339842691, + "grad_norm": 17.222614288330078, + "learning_rate": 8.351452894525368e-07, + "loss": 0.305, + "num_input_tokens_seen": 2455764992, + "step": 1171 + }, + { + "epoch": 0.10089503661513426, + "grad_norm": 14.791529655456543, + "learning_rate": 8.348328754644855e-07, + "loss": 0.4074, + "num_input_tokens_seen": 2457862144, + "step": 1172 + }, + { + "epoch": 0.1011662598318416, + "grad_norm": 20.521638870239258, + "learning_rate": 8.34520232253775e-07, + "loss": 0.6192, + "num_input_tokens_seen": 2459959296, + "step": 1173 + }, + { + "epoch": 0.10143748304854895, + "grad_norm": 25.750980377197266, + "learning_rate": 8.342073600720082e-07, + "loss": 0.6189, + "num_input_tokens_seen": 2462056448, + "step": 1174 + }, + { + "epoch": 0.10170870626525631, + "grad_norm": 18.543386459350586, + "learning_rate": 8.33894259170973e-07, + "loss": 0.477, + "num_input_tokens_seen": 2464153600, + "step": 1175 + }, + { + "epoch": 0.10197992948196366, + "grad_norm": 13.476463317871094, + "learning_rate": 8.335809298026409e-07, + "loss": 0.3135, + "num_input_tokens_seen": 2466250752, + "step": 1176 + }, + { + "epoch": 0.10225115269867101, + "grad_norm": 19.402223587036133, + "learning_rate": 8.332673722191677e-07, + "loss": 0.3671, + "num_input_tokens_seen": 2468347904, + "step": 1177 + }, + { + "epoch": 0.10252237591537836, + "grad_norm": 21.820781707763672, + "learning_rate": 8.329535866728922e-07, + "loss": 0.4054, + "num_input_tokens_seen": 2470445056, + "step": 1178 + }, + { + "epoch": 0.1027935991320857, + "grad_norm": 19.219013214111328, + "learning_rate": 8.326395734163375e-07, + "loss": 0.6286, + "num_input_tokens_seen": 2472542208, + "step": 1179 + }, + { + "epoch": 0.10306482234879305, + "grad_norm": 22.508085250854492, + "learning_rate": 8.323253327022094e-07, + "loss": 0.4077, + "num_input_tokens_seen": 2474639360, + "step": 1180 + }, + { + "epoch": 0.10333604556550041, + "grad_norm": 12.82617473602295, + "learning_rate": 8.320108647833967e-07, + "loss": 0.3092, + "num_input_tokens_seen": 2476736512, + "step": 1181 + }, + { + "epoch": 0.10360726878220776, + "grad_norm": 12.152515411376953, + "learning_rate": 8.316961699129714e-07, + "loss": 0.2255, + "num_input_tokens_seen": 2478833664, + "step": 1182 + }, + { + "epoch": 0.1038784919989151, + "grad_norm": 20.471603393554688, + "learning_rate": 8.313812483441879e-07, + "loss": 0.6904, + "num_input_tokens_seen": 2480930816, + "step": 1183 + }, + { + "epoch": 0.10414971521562245, + "grad_norm": 14.457162857055664, + "learning_rate": 8.310661003304829e-07, + "loss": 0.3722, + "num_input_tokens_seen": 2483027968, + "step": 1184 + }, + { + "epoch": 0.1044209384323298, + "grad_norm": 25.704435348510742, + "learning_rate": 8.30750726125476e-07, + "loss": 0.5091, + "num_input_tokens_seen": 2485125120, + "step": 1185 + }, + { + "epoch": 0.10469216164903716, + "grad_norm": 16.720243453979492, + "learning_rate": 8.304351259829678e-07, + "loss": 0.4869, + "num_input_tokens_seen": 2487222272, + "step": 1186 + }, + { + "epoch": 0.10496338486574451, + "grad_norm": 14.137613296508789, + "learning_rate": 8.301193001569418e-07, + "loss": 0.3771, + "num_input_tokens_seen": 2489319424, + "step": 1187 + }, + { + "epoch": 0.10523460808245186, + "grad_norm": 15.297533988952637, + "learning_rate": 8.298032489015623e-07, + "loss": 0.317, + "num_input_tokens_seen": 2491416576, + "step": 1188 + }, + { + "epoch": 0.1055058312991592, + "grad_norm": 21.42536163330078, + "learning_rate": 8.294869724711752e-07, + "loss": 0.5508, + "num_input_tokens_seen": 2493513728, + "step": 1189 + }, + { + "epoch": 0.10577705451586655, + "grad_norm": 14.231220245361328, + "learning_rate": 8.291704711203082e-07, + "loss": 0.3826, + "num_input_tokens_seen": 2495610880, + "step": 1190 + }, + { + "epoch": 0.10604827773257391, + "grad_norm": 16.597139358520508, + "learning_rate": 8.288537451036691e-07, + "loss": 0.3962, + "num_input_tokens_seen": 2497708032, + "step": 1191 + }, + { + "epoch": 0.10631950094928126, + "grad_norm": 25.333345413208008, + "learning_rate": 8.28536794676147e-07, + "loss": 0.3347, + "num_input_tokens_seen": 2499805184, + "step": 1192 + }, + { + "epoch": 0.10659072416598861, + "grad_norm": 29.860403060913086, + "learning_rate": 8.282196200928119e-07, + "loss": 0.4289, + "num_input_tokens_seen": 2501902336, + "step": 1193 + }, + { + "epoch": 0.10686194738269596, + "grad_norm": 12.253214836120605, + "learning_rate": 8.279022216089135e-07, + "loss": 0.3256, + "num_input_tokens_seen": 2503999488, + "step": 1194 + }, + { + "epoch": 0.1071331705994033, + "grad_norm": 17.277803421020508, + "learning_rate": 8.275845994798821e-07, + "loss": 0.4446, + "num_input_tokens_seen": 2506096640, + "step": 1195 + }, + { + "epoch": 0.10740439381611067, + "grad_norm": 13.697084426879883, + "learning_rate": 8.272667539613281e-07, + "loss": 0.4053, + "num_input_tokens_seen": 2508193792, + "step": 1196 + }, + { + "epoch": 0.10767561703281801, + "grad_norm": 22.16970443725586, + "learning_rate": 8.26948685309041e-07, + "loss": 0.7259, + "num_input_tokens_seen": 2510290944, + "step": 1197 + }, + { + "epoch": 0.10794684024952536, + "grad_norm": 16.95806884765625, + "learning_rate": 8.266303937789908e-07, + "loss": 0.4267, + "num_input_tokens_seen": 2512388096, + "step": 1198 + }, + { + "epoch": 0.10821806346623271, + "grad_norm": 15.100922584533691, + "learning_rate": 8.263118796273263e-07, + "loss": 0.5967, + "num_input_tokens_seen": 2514485248, + "step": 1199 + }, + { + "epoch": 0.10848928668294006, + "grad_norm": 20.644742965698242, + "learning_rate": 8.259931431103754e-07, + "loss": 0.4512, + "num_input_tokens_seen": 2516582400, + "step": 1200 + }, + { + "epoch": 0.1087605098996474, + "grad_norm": 16.619178771972656, + "learning_rate": 8.256741844846452e-07, + "loss": 0.5051, + "num_input_tokens_seen": 2518679552, + "step": 1201 + }, + { + "epoch": 0.10903173311635476, + "grad_norm": 13.504317283630371, + "learning_rate": 8.253550040068216e-07, + "loss": 0.3861, + "num_input_tokens_seen": 2520776704, + "step": 1202 + }, + { + "epoch": 0.10930295633306211, + "grad_norm": 15.428912162780762, + "learning_rate": 8.250356019337688e-07, + "loss": 0.417, + "num_input_tokens_seen": 2522873856, + "step": 1203 + }, + { + "epoch": 0.10957417954976946, + "grad_norm": 25.92603874206543, + "learning_rate": 8.247159785225295e-07, + "loss": 0.9491, + "num_input_tokens_seen": 2524971008, + "step": 1204 + }, + { + "epoch": 0.10984540276647681, + "grad_norm": 21.164356231689453, + "learning_rate": 8.243961340303245e-07, + "loss": 0.756, + "num_input_tokens_seen": 2527068160, + "step": 1205 + }, + { + "epoch": 0.11011662598318415, + "grad_norm": 9.049589157104492, + "learning_rate": 8.240760687145521e-07, + "loss": 0.2077, + "num_input_tokens_seen": 2529165312, + "step": 1206 + }, + { + "epoch": 0.11038784919989152, + "grad_norm": 17.34234046936035, + "learning_rate": 8.237557828327891e-07, + "loss": 0.6235, + "num_input_tokens_seen": 2531262464, + "step": 1207 + }, + { + "epoch": 0.11065907241659886, + "grad_norm": 21.27731704711914, + "learning_rate": 8.234352766427894e-07, + "loss": 0.6889, + "num_input_tokens_seen": 2533359616, + "step": 1208 + }, + { + "epoch": 0.11093029563330621, + "grad_norm": 21.866683959960938, + "learning_rate": 8.231145504024838e-07, + "loss": 0.7666, + "num_input_tokens_seen": 2535456768, + "step": 1209 + }, + { + "epoch": 0.11120151885001356, + "grad_norm": 12.252056121826172, + "learning_rate": 8.22793604369981e-07, + "loss": 0.3829, + "num_input_tokens_seen": 2537553920, + "step": 1210 + }, + { + "epoch": 0.1114727420667209, + "grad_norm": 18.94308853149414, + "learning_rate": 8.224724388035659e-07, + "loss": 0.4294, + "num_input_tokens_seen": 2539651072, + "step": 1211 + }, + { + "epoch": 0.11174396528342827, + "grad_norm": 18.54946517944336, + "learning_rate": 8.221510539617003e-07, + "loss": 0.5497, + "num_input_tokens_seen": 2541748224, + "step": 1212 + }, + { + "epoch": 0.11201518850013562, + "grad_norm": 13.319576263427734, + "learning_rate": 8.218294501030226e-07, + "loss": 0.3906, + "num_input_tokens_seen": 2543845376, + "step": 1213 + }, + { + "epoch": 0.11228641171684296, + "grad_norm": 14.632641792297363, + "learning_rate": 8.215076274863476e-07, + "loss": 0.4973, + "num_input_tokens_seen": 2545942528, + "step": 1214 + }, + { + "epoch": 0.11255763493355031, + "grad_norm": 14.780923843383789, + "learning_rate": 8.211855863706654e-07, + "loss": 0.3886, + "num_input_tokens_seen": 2548039680, + "step": 1215 + }, + { + "epoch": 0.11282885815025766, + "grad_norm": 17.888994216918945, + "learning_rate": 8.208633270151426e-07, + "loss": 0.5352, + "num_input_tokens_seen": 2550136832, + "step": 1216 + }, + { + "epoch": 0.11310008136696502, + "grad_norm": 21.13353157043457, + "learning_rate": 8.205408496791216e-07, + "loss": 0.6396, + "num_input_tokens_seen": 2552233984, + "step": 1217 + }, + { + "epoch": 0.11337130458367237, + "grad_norm": 15.377924919128418, + "learning_rate": 8.202181546221193e-07, + "loss": 0.3852, + "num_input_tokens_seen": 2554331136, + "step": 1218 + }, + { + "epoch": 0.11364252780037971, + "grad_norm": 19.80136489868164, + "learning_rate": 8.19895242103829e-07, + "loss": 0.7451, + "num_input_tokens_seen": 2556428288, + "step": 1219 + }, + { + "epoch": 0.11391375101708706, + "grad_norm": 20.684595108032227, + "learning_rate": 8.19572112384118e-07, + "loss": 0.7119, + "num_input_tokens_seen": 2558525440, + "step": 1220 + }, + { + "epoch": 0.11418497423379441, + "grad_norm": 14.285638809204102, + "learning_rate": 8.192487657230288e-07, + "loss": 0.3151, + "num_input_tokens_seen": 2560622592, + "step": 1221 + }, + { + "epoch": 0.11445619745050176, + "grad_norm": 15.149106979370117, + "learning_rate": 8.18925202380779e-07, + "loss": 0.3626, + "num_input_tokens_seen": 2562719744, + "step": 1222 + }, + { + "epoch": 0.11472742066720912, + "grad_norm": 16.11283302307129, + "learning_rate": 8.186014226177594e-07, + "loss": 0.4939, + "num_input_tokens_seen": 2564816896, + "step": 1223 + }, + { + "epoch": 0.11499864388391647, + "grad_norm": 16.55274200439453, + "learning_rate": 8.18277426694536e-07, + "loss": 0.4951, + "num_input_tokens_seen": 2566914048, + "step": 1224 + }, + { + "epoch": 0.11526986710062381, + "grad_norm": 16.552610397338867, + "learning_rate": 8.179532148718483e-07, + "loss": 0.4981, + "num_input_tokens_seen": 2569011200, + "step": 1225 + }, + { + "epoch": 0.11554109031733116, + "grad_norm": 20.186845779418945, + "learning_rate": 8.176287874106097e-07, + "loss": 0.6232, + "num_input_tokens_seen": 2571108352, + "step": 1226 + }, + { + "epoch": 0.11581231353403851, + "grad_norm": 18.383283615112305, + "learning_rate": 8.173041445719069e-07, + "loss": 0.7157, + "num_input_tokens_seen": 2573205504, + "step": 1227 + }, + { + "epoch": 0.11608353675074587, + "grad_norm": 28.215744018554688, + "learning_rate": 8.169792866170003e-07, + "loss": 0.7858, + "num_input_tokens_seen": 2575302656, + "step": 1228 + }, + { + "epoch": 0.11635475996745322, + "grad_norm": 15.474285125732422, + "learning_rate": 8.166542138073232e-07, + "loss": 0.5306, + "num_input_tokens_seen": 2577399808, + "step": 1229 + }, + { + "epoch": 0.11662598318416056, + "grad_norm": 30.002483367919922, + "learning_rate": 8.163289264044817e-07, + "loss": 0.5879, + "num_input_tokens_seen": 2579496960, + "step": 1230 + }, + { + "epoch": 0.11689720640086791, + "grad_norm": 17.683021545410156, + "learning_rate": 8.160034246702548e-07, + "loss": 0.5828, + "num_input_tokens_seen": 2581594112, + "step": 1231 + }, + { + "epoch": 0.11716842961757526, + "grad_norm": 18.124019622802734, + "learning_rate": 8.156777088665939e-07, + "loss": 0.4591, + "num_input_tokens_seen": 2583691264, + "step": 1232 + }, + { + "epoch": 0.11743965283428262, + "grad_norm": 11.012289047241211, + "learning_rate": 8.153517792556226e-07, + "loss": 0.2872, + "num_input_tokens_seen": 2585788416, + "step": 1233 + }, + { + "epoch": 0.11771087605098997, + "grad_norm": 19.892656326293945, + "learning_rate": 8.15025636099637e-07, + "loss": 0.6456, + "num_input_tokens_seen": 2587885568, + "step": 1234 + }, + { + "epoch": 0.11798209926769732, + "grad_norm": 14.017526626586914, + "learning_rate": 8.146992796611042e-07, + "loss": 0.3137, + "num_input_tokens_seen": 2589982720, + "step": 1235 + }, + { + "epoch": 0.11825332248440466, + "grad_norm": 21.26898765563965, + "learning_rate": 8.143727102026638e-07, + "loss": 0.8757, + "num_input_tokens_seen": 2592079872, + "step": 1236 + }, + { + "epoch": 0.11852454570111201, + "grad_norm": 10.634222030639648, + "learning_rate": 8.140459279871264e-07, + "loss": 0.3173, + "num_input_tokens_seen": 2594177024, + "step": 1237 + }, + { + "epoch": 0.11879576891781937, + "grad_norm": 20.876935958862305, + "learning_rate": 8.137189332774738e-07, + "loss": 0.7255, + "num_input_tokens_seen": 2596274176, + "step": 1238 + }, + { + "epoch": 0.11906699213452672, + "grad_norm": 13.55611801147461, + "learning_rate": 8.133917263368589e-07, + "loss": 0.3449, + "num_input_tokens_seen": 2598371328, + "step": 1239 + }, + { + "epoch": 0.11933821535123407, + "grad_norm": 23.81563949584961, + "learning_rate": 8.130643074286056e-07, + "loss": 0.4658, + "num_input_tokens_seen": 2600468480, + "step": 1240 + }, + { + "epoch": 0.11960943856794141, + "grad_norm": 15.136983871459961, + "learning_rate": 8.127366768162077e-07, + "loss": 0.4361, + "num_input_tokens_seen": 2602565632, + "step": 1241 + }, + { + "epoch": 0.11988066178464876, + "grad_norm": 14.615781784057617, + "learning_rate": 8.124088347633304e-07, + "loss": 0.3938, + "num_input_tokens_seen": 2604662784, + "step": 1242 + }, + { + "epoch": 0.12015188500135611, + "grad_norm": 13.33740234375, + "learning_rate": 8.120807815338083e-07, + "loss": 0.377, + "num_input_tokens_seen": 2606759936, + "step": 1243 + }, + { + "epoch": 0.12042310821806347, + "grad_norm": 16.450172424316406, + "learning_rate": 8.11752517391646e-07, + "loss": 0.3627, + "num_input_tokens_seen": 2608857088, + "step": 1244 + }, + { + "epoch": 0.12069433143477082, + "grad_norm": 12.913908958435059, + "learning_rate": 8.114240426010183e-07, + "loss": 0.3407, + "num_input_tokens_seen": 2610954240, + "step": 1245 + }, + { + "epoch": 0.12096555465147817, + "grad_norm": 19.959516525268555, + "learning_rate": 8.11095357426269e-07, + "loss": 0.6169, + "num_input_tokens_seen": 2613051392, + "step": 1246 + }, + { + "epoch": 0.12123677786818551, + "grad_norm": 15.97438907623291, + "learning_rate": 8.107664621319113e-07, + "loss": 0.5242, + "num_input_tokens_seen": 2615148544, + "step": 1247 + }, + { + "epoch": 0.12150800108489286, + "grad_norm": 14.837607383728027, + "learning_rate": 8.10437356982628e-07, + "loss": 0.3952, + "num_input_tokens_seen": 2617245696, + "step": 1248 + }, + { + "epoch": 0.12177922430160022, + "grad_norm": 15.677940368652344, + "learning_rate": 8.1010804224327e-07, + "loss": 0.4423, + "num_input_tokens_seen": 2619342848, + "step": 1249 + }, + { + "epoch": 0.12205044751830757, + "grad_norm": 13.296334266662598, + "learning_rate": 8.097785181788574e-07, + "loss": 0.3551, + "num_input_tokens_seen": 2621440000, + "step": 1250 + }, + { + "epoch": 0.12232167073501492, + "grad_norm": 17.426509857177734, + "learning_rate": 8.09448785054579e-07, + "loss": 0.371, + "num_input_tokens_seen": 2623537152, + "step": 1251 + }, + { + "epoch": 0.12259289395172226, + "grad_norm": 24.391117095947266, + "learning_rate": 8.091188431357908e-07, + "loss": 0.9746, + "num_input_tokens_seen": 2625634304, + "step": 1252 + }, + { + "epoch": 0.12286411716842961, + "grad_norm": 22.590383529663086, + "learning_rate": 8.087886926880181e-07, + "loss": 0.9044, + "num_input_tokens_seen": 2627731456, + "step": 1253 + }, + { + "epoch": 0.12313534038513697, + "grad_norm": 15.536231994628906, + "learning_rate": 8.084583339769531e-07, + "loss": 0.4665, + "num_input_tokens_seen": 2629828608, + "step": 1254 + }, + { + "epoch": 0.12340656360184432, + "grad_norm": 16.86744499206543, + "learning_rate": 8.081277672684557e-07, + "loss": 0.3953, + "num_input_tokens_seen": 2631925760, + "step": 1255 + }, + { + "epoch": 0.12367778681855167, + "grad_norm": 18.4736385345459, + "learning_rate": 8.077969928285541e-07, + "loss": 0.5952, + "num_input_tokens_seen": 2634022912, + "step": 1256 + }, + { + "epoch": 0.12394901003525902, + "grad_norm": 15.2562837600708, + "learning_rate": 8.074660109234424e-07, + "loss": 0.4361, + "num_input_tokens_seen": 2636120064, + "step": 1257 + }, + { + "epoch": 0.12422023325196636, + "grad_norm": 20.01945686340332, + "learning_rate": 8.071348218194823e-07, + "loss": 0.5753, + "num_input_tokens_seen": 2638217216, + "step": 1258 + }, + { + "epoch": 0.12449145646867373, + "grad_norm": 20.903608322143555, + "learning_rate": 8.068034257832026e-07, + "loss": 0.7434, + "num_input_tokens_seen": 2640314368, + "step": 1259 + }, + { + "epoch": 0.12476267968538107, + "grad_norm": 18.132099151611328, + "learning_rate": 8.064718230812976e-07, + "loss": 0.5717, + "num_input_tokens_seen": 2642411520, + "step": 1260 + }, + { + "epoch": 0.1250339029020884, + "grad_norm": 19.513261795043945, + "learning_rate": 8.06140013980629e-07, + "loss": 0.6942, + "num_input_tokens_seen": 2644508672, + "step": 1261 + }, + { + "epoch": 0.12530512611879577, + "grad_norm": 15.275543212890625, + "learning_rate": 8.05807998748224e-07, + "loss": 0.3526, + "num_input_tokens_seen": 2646605824, + "step": 1262 + }, + { + "epoch": 0.12557634933550313, + "grad_norm": 15.771416664123535, + "learning_rate": 8.05475777651276e-07, + "loss": 0.3085, + "num_input_tokens_seen": 2648702976, + "step": 1263 + }, + { + "epoch": 0.12584757255221046, + "grad_norm": 20.19382095336914, + "learning_rate": 8.051433509571435e-07, + "loss": 0.6849, + "num_input_tokens_seen": 2650800128, + "step": 1264 + }, + { + "epoch": 0.12611879576891782, + "grad_norm": 19.399974822998047, + "learning_rate": 8.04810718933351e-07, + "loss": 0.5637, + "num_input_tokens_seen": 2652897280, + "step": 1265 + }, + { + "epoch": 0.12639001898562516, + "grad_norm": 14.588866233825684, + "learning_rate": 8.044778818475884e-07, + "loss": 0.3837, + "num_input_tokens_seen": 2654994432, + "step": 1266 + }, + { + "epoch": 0.12666124220233252, + "grad_norm": 18.231657028198242, + "learning_rate": 8.0414483996771e-07, + "loss": 0.6066, + "num_input_tokens_seen": 2657091584, + "step": 1267 + }, + { + "epoch": 0.12693246541903988, + "grad_norm": 21.163400650024414, + "learning_rate": 8.038115935617355e-07, + "loss": 0.8268, + "num_input_tokens_seen": 2659188736, + "step": 1268 + }, + { + "epoch": 0.12720368863574721, + "grad_norm": 13.187846183776855, + "learning_rate": 8.034781428978484e-07, + "loss": 0.3457, + "num_input_tokens_seen": 2661285888, + "step": 1269 + }, + { + "epoch": 0.12747491185245458, + "grad_norm": 11.001519203186035, + "learning_rate": 8.031444882443976e-07, + "loss": 0.223, + "num_input_tokens_seen": 2663383040, + "step": 1270 + }, + { + "epoch": 0.1277461350691619, + "grad_norm": 16.197965621948242, + "learning_rate": 8.028106298698957e-07, + "loss": 0.5616, + "num_input_tokens_seen": 2665480192, + "step": 1271 + }, + { + "epoch": 0.12801735828586927, + "grad_norm": 13.498997688293457, + "learning_rate": 8.024765680430188e-07, + "loss": 0.314, + "num_input_tokens_seen": 2667577344, + "step": 1272 + }, + { + "epoch": 0.12828858150257663, + "grad_norm": 18.05949592590332, + "learning_rate": 8.021423030326075e-07, + "loss": 0.585, + "num_input_tokens_seen": 2669674496, + "step": 1273 + }, + { + "epoch": 0.12855980471928397, + "grad_norm": 10.726323127746582, + "learning_rate": 8.018078351076653e-07, + "loss": 0.2417, + "num_input_tokens_seen": 2671771648, + "step": 1274 + }, + { + "epoch": 0.12883102793599133, + "grad_norm": 13.169657707214355, + "learning_rate": 8.014731645373595e-07, + "loss": 0.3998, + "num_input_tokens_seen": 2673868800, + "step": 1275 + }, + { + "epoch": 0.12910225115269866, + "grad_norm": 18.775423049926758, + "learning_rate": 8.011382915910203e-07, + "loss": 0.5083, + "num_input_tokens_seen": 2675965952, + "step": 1276 + }, + { + "epoch": 0.12937347436940602, + "grad_norm": 14.191473007202148, + "learning_rate": 8.008032165381403e-07, + "loss": 0.324, + "num_input_tokens_seen": 2678063104, + "step": 1277 + }, + { + "epoch": 0.12964469758611338, + "grad_norm": 19.03130531311035, + "learning_rate": 8.004679396483756e-07, + "loss": 0.2584, + "num_input_tokens_seen": 2680160256, + "step": 1278 + }, + { + "epoch": 0.12991592080282072, + "grad_norm": 22.582111358642578, + "learning_rate": 8.001324611915441e-07, + "loss": 0.8237, + "num_input_tokens_seen": 2682257408, + "step": 1279 + }, + { + "epoch": 0.13018714401952808, + "grad_norm": 16.168638229370117, + "learning_rate": 7.99796781437626e-07, + "loss": 0.4733, + "num_input_tokens_seen": 2684354560, + "step": 1280 + }, + { + "epoch": 0.1304583672362354, + "grad_norm": 11.249455451965332, + "learning_rate": 7.994609006567635e-07, + "loss": 0.2726, + "num_input_tokens_seen": 2686451712, + "step": 1281 + }, + { + "epoch": 0.13072959045294277, + "grad_norm": 24.108003616333008, + "learning_rate": 7.99124819119261e-07, + "loss": 0.4514, + "num_input_tokens_seen": 2688548864, + "step": 1282 + }, + { + "epoch": 0.13100081366965013, + "grad_norm": 22.541616439819336, + "learning_rate": 7.987885370955841e-07, + "loss": 0.6907, + "num_input_tokens_seen": 2690646016, + "step": 1283 + }, + { + "epoch": 0.13127203688635747, + "grad_norm": 17.23971176147461, + "learning_rate": 7.984520548563594e-07, + "loss": 0.6375, + "num_input_tokens_seen": 2692743168, + "step": 1284 + }, + { + "epoch": 0.13154326010306483, + "grad_norm": 14.531922340393066, + "learning_rate": 7.981153726723755e-07, + "loss": 0.4198, + "num_input_tokens_seen": 2694840320, + "step": 1285 + }, + { + "epoch": 0.13181448331977216, + "grad_norm": 12.036006927490234, + "learning_rate": 7.977784908145809e-07, + "loss": 0.3334, + "num_input_tokens_seen": 2696937472, + "step": 1286 + }, + { + "epoch": 0.13208570653647952, + "grad_norm": 10.315123558044434, + "learning_rate": 7.974414095540858e-07, + "loss": 0.2007, + "num_input_tokens_seen": 2699034624, + "step": 1287 + }, + { + "epoch": 0.13235692975318689, + "grad_norm": 10.276031494140625, + "learning_rate": 7.971041291621601e-07, + "loss": 0.3, + "num_input_tokens_seen": 2701131776, + "step": 1288 + }, + { + "epoch": 0.13262815296989422, + "grad_norm": 16.799928665161133, + "learning_rate": 7.967666499102341e-07, + "loss": 0.4751, + "num_input_tokens_seen": 2703228928, + "step": 1289 + }, + { + "epoch": 0.13289937618660158, + "grad_norm": 13.778836250305176, + "learning_rate": 7.964289720698986e-07, + "loss": 0.3534, + "num_input_tokens_seen": 2705326080, + "step": 1290 + }, + { + "epoch": 0.13317059940330891, + "grad_norm": 17.865440368652344, + "learning_rate": 7.960910959129037e-07, + "loss": 0.6054, + "num_input_tokens_seen": 2707423232, + "step": 1291 + }, + { + "epoch": 0.13344182262001628, + "grad_norm": 17.34958267211914, + "learning_rate": 7.957530217111591e-07, + "loss": 0.5136, + "num_input_tokens_seen": 2709520384, + "step": 1292 + }, + { + "epoch": 0.13371304583672364, + "grad_norm": 24.52153205871582, + "learning_rate": 7.954147497367343e-07, + "loss": 0.853, + "num_input_tokens_seen": 2711617536, + "step": 1293 + }, + { + "epoch": 0.13398426905343097, + "grad_norm": 18.36350440979004, + "learning_rate": 7.950762802618576e-07, + "loss": 0.4762, + "num_input_tokens_seen": 2713714688, + "step": 1294 + }, + { + "epoch": 0.13425549227013833, + "grad_norm": 17.231550216674805, + "learning_rate": 7.947376135589164e-07, + "loss": 0.5823, + "num_input_tokens_seen": 2715811840, + "step": 1295 + }, + { + "epoch": 0.13452671548684567, + "grad_norm": 22.24859619140625, + "learning_rate": 7.943987499004569e-07, + "loss": 0.7359, + "num_input_tokens_seen": 2717908992, + "step": 1296 + }, + { + "epoch": 0.13479793870355303, + "grad_norm": 22.393712997436523, + "learning_rate": 7.940596895591835e-07, + "loss": 0.6906, + "num_input_tokens_seen": 2720006144, + "step": 1297 + }, + { + "epoch": 0.13506916192026036, + "grad_norm": 13.456273078918457, + "learning_rate": 7.93720432807959e-07, + "loss": 0.3609, + "num_input_tokens_seen": 2722103296, + "step": 1298 + }, + { + "epoch": 0.13534038513696772, + "grad_norm": 18.195858001708984, + "learning_rate": 7.933809799198045e-07, + "loss": 0.5547, + "num_input_tokens_seen": 2724200448, + "step": 1299 + }, + { + "epoch": 0.13561160835367508, + "grad_norm": 23.925018310546875, + "learning_rate": 7.930413311678987e-07, + "loss": 0.4255, + "num_input_tokens_seen": 2726297600, + "step": 1300 + }, + { + "epoch": 0.13588283157038242, + "grad_norm": 17.64158821105957, + "learning_rate": 7.92701486825578e-07, + "loss": 0.3462, + "num_input_tokens_seen": 2728394752, + "step": 1301 + }, + { + "epoch": 0.13615405478708978, + "grad_norm": 25.17290496826172, + "learning_rate": 7.923614471663361e-07, + "loss": 0.5714, + "num_input_tokens_seen": 2730491904, + "step": 1302 + }, + { + "epoch": 0.1364252780037971, + "grad_norm": 10.592412948608398, + "learning_rate": 7.920212124638241e-07, + "loss": 0.2677, + "num_input_tokens_seen": 2732589056, + "step": 1303 + }, + { + "epoch": 0.13669650122050447, + "grad_norm": 12.378978729248047, + "learning_rate": 7.916807829918499e-07, + "loss": 0.3103, + "num_input_tokens_seen": 2734686208, + "step": 1304 + }, + { + "epoch": 0.13696772443721184, + "grad_norm": 17.801116943359375, + "learning_rate": 7.913401590243781e-07, + "loss": 0.3225, + "num_input_tokens_seen": 2736783360, + "step": 1305 + }, + { + "epoch": 0.13723894765391917, + "grad_norm": 19.611040115356445, + "learning_rate": 7.909993408355302e-07, + "loss": 0.5376, + "num_input_tokens_seen": 2738880512, + "step": 1306 + }, + { + "epoch": 0.13751017087062653, + "grad_norm": 20.099191665649414, + "learning_rate": 7.906583286995834e-07, + "loss": 0.644, + "num_input_tokens_seen": 2740977664, + "step": 1307 + }, + { + "epoch": 0.13778139408733386, + "grad_norm": 13.194962501525879, + "learning_rate": 7.903171228909714e-07, + "loss": 0.2898, + "num_input_tokens_seen": 2743074816, + "step": 1308 + }, + { + "epoch": 0.13805261730404123, + "grad_norm": 13.878562927246094, + "learning_rate": 7.899757236842836e-07, + "loss": 0.3795, + "num_input_tokens_seen": 2745171968, + "step": 1309 + }, + { + "epoch": 0.1383238405207486, + "grad_norm": 25.963001251220703, + "learning_rate": 7.89634131354265e-07, + "loss": 0.7278, + "num_input_tokens_seen": 2747269120, + "step": 1310 + }, + { + "epoch": 0.13859506373745592, + "grad_norm": 19.090431213378906, + "learning_rate": 7.892923461758165e-07, + "loss": 0.5589, + "num_input_tokens_seen": 2749366272, + "step": 1311 + }, + { + "epoch": 0.13886628695416328, + "grad_norm": 15.916430473327637, + "learning_rate": 7.889503684239933e-07, + "loss": 0.4635, + "num_input_tokens_seen": 2751463424, + "step": 1312 + }, + { + "epoch": 0.13913751017087062, + "grad_norm": 12.155187606811523, + "learning_rate": 7.886081983740066e-07, + "loss": 0.3096, + "num_input_tokens_seen": 2753560576, + "step": 1313 + }, + { + "epoch": 0.13940873338757798, + "grad_norm": 19.226377487182617, + "learning_rate": 7.882658363012214e-07, + "loss": 0.5558, + "num_input_tokens_seen": 2755657728, + "step": 1314 + }, + { + "epoch": 0.13967995660428534, + "grad_norm": 16.780195236206055, + "learning_rate": 7.879232824811579e-07, + "loss": 0.4074, + "num_input_tokens_seen": 2757754880, + "step": 1315 + }, + { + "epoch": 0.13995117982099267, + "grad_norm": 24.132198333740234, + "learning_rate": 7.875805371894904e-07, + "loss": 0.8599, + "num_input_tokens_seen": 2759852032, + "step": 1316 + }, + { + "epoch": 0.14022240303770003, + "grad_norm": 16.48426055908203, + "learning_rate": 7.872376007020469e-07, + "loss": 0.4509, + "num_input_tokens_seen": 2761949184, + "step": 1317 + }, + { + "epoch": 0.14049362625440737, + "grad_norm": 13.625266075134277, + "learning_rate": 7.8689447329481e-07, + "loss": 0.3677, + "num_input_tokens_seen": 2764046336, + "step": 1318 + }, + { + "epoch": 0.14076484947111473, + "grad_norm": 17.589357376098633, + "learning_rate": 7.865511552439156e-07, + "loss": 0.5272, + "num_input_tokens_seen": 2766143488, + "step": 1319 + }, + { + "epoch": 0.1410360726878221, + "grad_norm": 15.197576522827148, + "learning_rate": 7.862076468256529e-07, + "loss": 0.5088, + "num_input_tokens_seen": 2768240640, + "step": 1320 + }, + { + "epoch": 0.14130729590452942, + "grad_norm": 18.043136596679688, + "learning_rate": 7.858639483164643e-07, + "loss": 0.4647, + "num_input_tokens_seen": 2770337792, + "step": 1321 + }, + { + "epoch": 0.14157851912123678, + "grad_norm": 15.092884063720703, + "learning_rate": 7.855200599929456e-07, + "loss": 0.5039, + "num_input_tokens_seen": 2772434944, + "step": 1322 + }, + { + "epoch": 0.14184974233794412, + "grad_norm": 15.708367347717285, + "learning_rate": 7.851759821318447e-07, + "loss": 0.424, + "num_input_tokens_seen": 2774532096, + "step": 1323 + }, + { + "epoch": 0.14212096555465148, + "grad_norm": 22.44023323059082, + "learning_rate": 7.848317150100624e-07, + "loss": 0.5721, + "num_input_tokens_seen": 2776629248, + "step": 1324 + }, + { + "epoch": 0.14239218877135884, + "grad_norm": 13.735273361206055, + "learning_rate": 7.84487258904652e-07, + "loss": 0.4114, + "num_input_tokens_seen": 2778726400, + "step": 1325 + }, + { + "epoch": 0.14266341198806617, + "grad_norm": 24.59619140625, + "learning_rate": 7.841426140928184e-07, + "loss": 0.7805, + "num_input_tokens_seen": 2780823552, + "step": 1326 + }, + { + "epoch": 0.14293463520477354, + "grad_norm": 24.768539428710938, + "learning_rate": 7.837977808519189e-07, + "loss": 0.7438, + "num_input_tokens_seen": 2782920704, + "step": 1327 + }, + { + "epoch": 0.14320585842148087, + "grad_norm": 20.256254196166992, + "learning_rate": 7.834527594594618e-07, + "loss": 0.6571, + "num_input_tokens_seen": 2785017856, + "step": 1328 + }, + { + "epoch": 0.14347708163818823, + "grad_norm": 18.815889358520508, + "learning_rate": 7.831075501931078e-07, + "loss": 0.5158, + "num_input_tokens_seen": 2787115008, + "step": 1329 + }, + { + "epoch": 0.1437483048548956, + "grad_norm": 18.49612045288086, + "learning_rate": 7.827621533306677e-07, + "loss": 0.6597, + "num_input_tokens_seen": 2789212160, + "step": 1330 + }, + { + "epoch": 0.14401952807160293, + "grad_norm": 15.946340560913086, + "learning_rate": 7.824165691501036e-07, + "loss": 0.479, + "num_input_tokens_seen": 2791309312, + "step": 1331 + }, + { + "epoch": 0.1442907512883103, + "grad_norm": 17.465255737304688, + "learning_rate": 7.820707979295289e-07, + "loss": 0.3821, + "num_input_tokens_seen": 2793406464, + "step": 1332 + }, + { + "epoch": 0.14456197450501762, + "grad_norm": 28.799354553222656, + "learning_rate": 7.81724839947207e-07, + "loss": 0.6755, + "num_input_tokens_seen": 2795503616, + "step": 1333 + }, + { + "epoch": 0.14483319772172498, + "grad_norm": 19.517778396606445, + "learning_rate": 7.813786954815517e-07, + "loss": 0.4634, + "num_input_tokens_seen": 2797600768, + "step": 1334 + }, + { + "epoch": 0.14510442093843234, + "grad_norm": 18.770734786987305, + "learning_rate": 7.810323648111269e-07, + "loss": 0.3967, + "num_input_tokens_seen": 2799697920, + "step": 1335 + }, + { + "epoch": 0.14537564415513968, + "grad_norm": 14.419391632080078, + "learning_rate": 7.806858482146463e-07, + "loss": 0.4468, + "num_input_tokens_seen": 2801795072, + "step": 1336 + }, + { + "epoch": 0.14564686737184704, + "grad_norm": 17.39090919494629, + "learning_rate": 7.803391459709733e-07, + "loss": 0.4719, + "num_input_tokens_seen": 2803892224, + "step": 1337 + }, + { + "epoch": 0.14591809058855437, + "grad_norm": 16.507200241088867, + "learning_rate": 7.799922583591206e-07, + "loss": 0.4681, + "num_input_tokens_seen": 2805989376, + "step": 1338 + }, + { + "epoch": 0.14618931380526173, + "grad_norm": 14.502838134765625, + "learning_rate": 7.796451856582504e-07, + "loss": 0.4491, + "num_input_tokens_seen": 2808086528, + "step": 1339 + }, + { + "epoch": 0.14646053702196907, + "grad_norm": 12.314398765563965, + "learning_rate": 7.792979281476734e-07, + "loss": 0.332, + "num_input_tokens_seen": 2810183680, + "step": 1340 + }, + { + "epoch": 0.14673176023867643, + "grad_norm": 13.12981128692627, + "learning_rate": 7.789504861068492e-07, + "loss": 0.4365, + "num_input_tokens_seen": 2812280832, + "step": 1341 + }, + { + "epoch": 0.1470029834553838, + "grad_norm": 19.02878761291504, + "learning_rate": 7.78602859815386e-07, + "loss": 0.516, + "num_input_tokens_seen": 2814377984, + "step": 1342 + }, + { + "epoch": 0.14727420667209112, + "grad_norm": 11.80683422088623, + "learning_rate": 7.782550495530402e-07, + "loss": 0.2956, + "num_input_tokens_seen": 2816475136, + "step": 1343 + }, + { + "epoch": 0.14754542988879848, + "grad_norm": 23.65553855895996, + "learning_rate": 7.779070555997162e-07, + "loss": 0.7255, + "num_input_tokens_seen": 2818572288, + "step": 1344 + }, + { + "epoch": 0.14781665310550582, + "grad_norm": 20.80998420715332, + "learning_rate": 7.775588782354666e-07, + "loss": 0.6579, + "num_input_tokens_seen": 2820669440, + "step": 1345 + }, + { + "epoch": 0.14808787632221318, + "grad_norm": 20.969602584838867, + "learning_rate": 7.77210517740491e-07, + "loss": 0.7579, + "num_input_tokens_seen": 2822766592, + "step": 1346 + }, + { + "epoch": 0.14835909953892054, + "grad_norm": 9.614218711853027, + "learning_rate": 7.768619743951366e-07, + "loss": 0.2488, + "num_input_tokens_seen": 2824863744, + "step": 1347 + }, + { + "epoch": 0.14863032275562787, + "grad_norm": 12.458992004394531, + "learning_rate": 7.765132484798978e-07, + "loss": 0.3422, + "num_input_tokens_seen": 2826960896, + "step": 1348 + }, + { + "epoch": 0.14890154597233524, + "grad_norm": 31.408218383789062, + "learning_rate": 7.761643402754163e-07, + "loss": 0.5978, + "num_input_tokens_seen": 2829058048, + "step": 1349 + }, + { + "epoch": 0.14917276918904257, + "grad_norm": 19.415218353271484, + "learning_rate": 7.7581525006248e-07, + "loss": 0.5435, + "num_input_tokens_seen": 2831155200, + "step": 1350 + }, + { + "epoch": 0.14944399240574993, + "grad_norm": 17.554052352905273, + "learning_rate": 7.754659781220232e-07, + "loss": 0.4826, + "num_input_tokens_seen": 2833252352, + "step": 1351 + }, + { + "epoch": 0.1497152156224573, + "grad_norm": 14.18742561340332, + "learning_rate": 7.751165247351269e-07, + "loss": 0.3339, + "num_input_tokens_seen": 2835349504, + "step": 1352 + }, + { + "epoch": 0.14998643883916463, + "grad_norm": 15.89776611328125, + "learning_rate": 7.747668901830178e-07, + "loss": 0.5135, + "num_input_tokens_seen": 2837446656, + "step": 1353 + }, + { + "epoch": 0.150257662055872, + "grad_norm": 19.270442962646484, + "learning_rate": 7.744170747470685e-07, + "loss": 0.5667, + "num_input_tokens_seen": 2839543808, + "step": 1354 + }, + { + "epoch": 0.15052888527257932, + "grad_norm": 23.312227249145508, + "learning_rate": 7.740670787087972e-07, + "loss": 0.954, + "num_input_tokens_seen": 2841640960, + "step": 1355 + }, + { + "epoch": 0.15080010848928668, + "grad_norm": 13.209382057189941, + "learning_rate": 7.737169023498672e-07, + "loss": 0.3264, + "num_input_tokens_seen": 2843738112, + "step": 1356 + }, + { + "epoch": 0.15107133170599404, + "grad_norm": 15.09355354309082, + "learning_rate": 7.733665459520873e-07, + "loss": 0.3447, + "num_input_tokens_seen": 2845835264, + "step": 1357 + }, + { + "epoch": 0.15134255492270138, + "grad_norm": 12.502537727355957, + "learning_rate": 7.730160097974109e-07, + "loss": 0.3267, + "num_input_tokens_seen": 2847932416, + "step": 1358 + }, + { + "epoch": 0.15161377813940874, + "grad_norm": 17.678531646728516, + "learning_rate": 7.726652941679365e-07, + "loss": 0.4452, + "num_input_tokens_seen": 2850029568, + "step": 1359 + }, + { + "epoch": 0.15188500135611607, + "grad_norm": 14.068532943725586, + "learning_rate": 7.723143993459061e-07, + "loss": 0.443, + "num_input_tokens_seen": 2852126720, + "step": 1360 + }, + { + "epoch": 0.15215622457282343, + "grad_norm": 17.834699630737305, + "learning_rate": 7.719633256137067e-07, + "loss": 0.4935, + "num_input_tokens_seen": 2854223872, + "step": 1361 + }, + { + "epoch": 0.1524274477895308, + "grad_norm": 23.74162483215332, + "learning_rate": 7.716120732538696e-07, + "loss": 0.2926, + "num_input_tokens_seen": 2856321024, + "step": 1362 + }, + { + "epoch": 0.15269867100623813, + "grad_norm": 21.309246063232422, + "learning_rate": 7.712606425490687e-07, + "loss": 0.6429, + "num_input_tokens_seen": 2858418176, + "step": 1363 + }, + { + "epoch": 0.1529698942229455, + "grad_norm": 28.85659408569336, + "learning_rate": 7.70909033782122e-07, + "loss": 0.522, + "num_input_tokens_seen": 2860515328, + "step": 1364 + }, + { + "epoch": 0.15324111743965282, + "grad_norm": 16.417978286743164, + "learning_rate": 7.705572472359913e-07, + "loss": 0.5151, + "num_input_tokens_seen": 2862612480, + "step": 1365 + }, + { + "epoch": 0.15351234065636019, + "grad_norm": 15.138096809387207, + "learning_rate": 7.702052831937811e-07, + "loss": 0.4831, + "num_input_tokens_seen": 2864709632, + "step": 1366 + }, + { + "epoch": 0.15378356387306755, + "grad_norm": 17.662302017211914, + "learning_rate": 7.698531419387382e-07, + "loss": 0.6095, + "num_input_tokens_seen": 2866806784, + "step": 1367 + }, + { + "epoch": 0.15405478708977488, + "grad_norm": 13.90149211883545, + "learning_rate": 7.695008237542526e-07, + "loss": 0.3174, + "num_input_tokens_seen": 2868903936, + "step": 1368 + }, + { + "epoch": 0.15432601030648224, + "grad_norm": 25.922029495239258, + "learning_rate": 7.691483289238569e-07, + "loss": 0.9373, + "num_input_tokens_seen": 2871001088, + "step": 1369 + }, + { + "epoch": 0.15459723352318958, + "grad_norm": 18.99474334716797, + "learning_rate": 7.687956577312251e-07, + "loss": 0.5871, + "num_input_tokens_seen": 2873098240, + "step": 1370 + }, + { + "epoch": 0.15486845673989694, + "grad_norm": 16.25048828125, + "learning_rate": 7.684428104601739e-07, + "loss": 0.3569, + "num_input_tokens_seen": 2875195392, + "step": 1371 + }, + { + "epoch": 0.1551396799566043, + "grad_norm": 12.14373779296875, + "learning_rate": 7.680897873946611e-07, + "loss": 0.3744, + "num_input_tokens_seen": 2877292544, + "step": 1372 + }, + { + "epoch": 0.15541090317331163, + "grad_norm": 15.390382766723633, + "learning_rate": 7.677365888187864e-07, + "loss": 0.4764, + "num_input_tokens_seen": 2879389696, + "step": 1373 + }, + { + "epoch": 0.155682126390019, + "grad_norm": 16.843217849731445, + "learning_rate": 7.673832150167906e-07, + "loss": 0.4289, + "num_input_tokens_seen": 2881486848, + "step": 1374 + }, + { + "epoch": 0.15595334960672633, + "grad_norm": 13.040255546569824, + "learning_rate": 7.670296662730552e-07, + "loss": 0.3706, + "num_input_tokens_seen": 2883584000, + "step": 1375 + }, + { + "epoch": 0.1562245728234337, + "grad_norm": 16.71036148071289, + "learning_rate": 7.66675942872103e-07, + "loss": 0.5491, + "num_input_tokens_seen": 2885681152, + "step": 1376 + }, + { + "epoch": 0.15649579604014105, + "grad_norm": 14.318278312683105, + "learning_rate": 7.663220450985973e-07, + "loss": 0.401, + "num_input_tokens_seen": 2887778304, + "step": 1377 + }, + { + "epoch": 0.15676701925684838, + "grad_norm": 20.72394371032715, + "learning_rate": 7.659679732373413e-07, + "loss": 0.6904, + "num_input_tokens_seen": 2889875456, + "step": 1378 + }, + { + "epoch": 0.15703824247355574, + "grad_norm": 12.829943656921387, + "learning_rate": 7.656137275732786e-07, + "loss": 0.3329, + "num_input_tokens_seen": 2891972608, + "step": 1379 + }, + { + "epoch": 0.15730946569026308, + "grad_norm": 19.38332748413086, + "learning_rate": 7.652593083914927e-07, + "loss": 0.6388, + "num_input_tokens_seen": 2894069760, + "step": 1380 + }, + { + "epoch": 0.15758068890697044, + "grad_norm": 19.630373001098633, + "learning_rate": 7.649047159772064e-07, + "loss": 0.5891, + "num_input_tokens_seen": 2896166912, + "step": 1381 + }, + { + "epoch": 0.15785191212367777, + "grad_norm": 15.700737953186035, + "learning_rate": 7.645499506157827e-07, + "loss": 0.42, + "num_input_tokens_seen": 2898264064, + "step": 1382 + }, + { + "epoch": 0.15812313534038513, + "grad_norm": 17.65544319152832, + "learning_rate": 7.641950125927228e-07, + "loss": 0.5836, + "num_input_tokens_seen": 2900361216, + "step": 1383 + }, + { + "epoch": 0.1583943585570925, + "grad_norm": 12.071609497070312, + "learning_rate": 7.638399021936675e-07, + "loss": 0.3025, + "num_input_tokens_seen": 2902458368, + "step": 1384 + }, + { + "epoch": 0.15866558177379983, + "grad_norm": 16.76353645324707, + "learning_rate": 7.634846197043963e-07, + "loss": 0.4026, + "num_input_tokens_seen": 2904555520, + "step": 1385 + }, + { + "epoch": 0.1589368049905072, + "grad_norm": 18.88435173034668, + "learning_rate": 7.631291654108266e-07, + "loss": 0.642, + "num_input_tokens_seen": 2906652672, + "step": 1386 + }, + { + "epoch": 0.15920802820721452, + "grad_norm": 25.31294822692871, + "learning_rate": 7.627735395990149e-07, + "loss": 0.8431, + "num_input_tokens_seen": 2908749824, + "step": 1387 + }, + { + "epoch": 0.1594792514239219, + "grad_norm": 30.660036087036133, + "learning_rate": 7.624177425551552e-07, + "loss": 0.9865, + "num_input_tokens_seen": 2910846976, + "step": 1388 + }, + { + "epoch": 0.15975047464062925, + "grad_norm": 23.799144744873047, + "learning_rate": 7.620617745655793e-07, + "loss": 0.7311, + "num_input_tokens_seen": 2912944128, + "step": 1389 + }, + { + "epoch": 0.16002169785733658, + "grad_norm": 21.239885330200195, + "learning_rate": 7.617056359167568e-07, + "loss": 0.4711, + "num_input_tokens_seen": 2915041280, + "step": 1390 + }, + { + "epoch": 0.16029292107404394, + "grad_norm": 19.135162353515625, + "learning_rate": 7.613493268952947e-07, + "loss": 0.5114, + "num_input_tokens_seen": 2917138432, + "step": 1391 + }, + { + "epoch": 0.16056414429075128, + "grad_norm": 16.555891036987305, + "learning_rate": 7.609928477879365e-07, + "loss": 0.3154, + "num_input_tokens_seen": 2919235584, + "step": 1392 + }, + { + "epoch": 0.16083536750745864, + "grad_norm": 28.75040626525879, + "learning_rate": 7.606361988815633e-07, + "loss": 0.5111, + "num_input_tokens_seen": 2921332736, + "step": 1393 + }, + { + "epoch": 0.161106590724166, + "grad_norm": 22.3262939453125, + "learning_rate": 7.602793804631927e-07, + "loss": 0.2485, + "num_input_tokens_seen": 2923429888, + "step": 1394 + }, + { + "epoch": 0.16137781394087333, + "grad_norm": 15.991744041442871, + "learning_rate": 7.599223928199781e-07, + "loss": 0.4578, + "num_input_tokens_seen": 2925527040, + "step": 1395 + }, + { + "epoch": 0.1616490371575807, + "grad_norm": 19.883453369140625, + "learning_rate": 7.595652362392103e-07, + "loss": 0.5837, + "num_input_tokens_seen": 2927624192, + "step": 1396 + }, + { + "epoch": 0.16192026037428803, + "grad_norm": 18.910655975341797, + "learning_rate": 7.592079110083146e-07, + "loss": 0.6984, + "num_input_tokens_seen": 2929721344, + "step": 1397 + }, + { + "epoch": 0.1621914835909954, + "grad_norm": 14.154722213745117, + "learning_rate": 7.588504174148532e-07, + "loss": 0.4046, + "num_input_tokens_seen": 2931818496, + "step": 1398 + }, + { + "epoch": 0.16246270680770275, + "grad_norm": 24.498065948486328, + "learning_rate": 7.584927557465233e-07, + "loss": 0.729, + "num_input_tokens_seen": 2933915648, + "step": 1399 + }, + { + "epoch": 0.16273393002441008, + "grad_norm": 18.18712043762207, + "learning_rate": 7.581349262911573e-07, + "loss": 0.3138, + "num_input_tokens_seen": 2936012800, + "step": 1400 + }, + { + "epoch": 0.16300515324111745, + "grad_norm": 16.402795791625977, + "learning_rate": 7.577769293367226e-07, + "loss": 0.4787, + "num_input_tokens_seen": 2938109952, + "step": 1401 + }, + { + "epoch": 0.16327637645782478, + "grad_norm": 16.0742130279541, + "learning_rate": 7.574187651713218e-07, + "loss": 0.4823, + "num_input_tokens_seen": 2940207104, + "step": 1402 + }, + { + "epoch": 0.16354759967453214, + "grad_norm": 16.962581634521484, + "learning_rate": 7.570604340831916e-07, + "loss": 0.392, + "num_input_tokens_seen": 2942304256, + "step": 1403 + }, + { + "epoch": 0.1638188228912395, + "grad_norm": 12.302724838256836, + "learning_rate": 7.567019363607032e-07, + "loss": 0.262, + "num_input_tokens_seen": 2944401408, + "step": 1404 + }, + { + "epoch": 0.16409004610794684, + "grad_norm": 13.02363109588623, + "learning_rate": 7.563432722923621e-07, + "loss": 0.3794, + "num_input_tokens_seen": 2946498560, + "step": 1405 + }, + { + "epoch": 0.1643612693246542, + "grad_norm": 13.745916366577148, + "learning_rate": 7.559844421668074e-07, + "loss": 0.2997, + "num_input_tokens_seen": 2948595712, + "step": 1406 + }, + { + "epoch": 0.16463249254136153, + "grad_norm": 16.339326858520508, + "learning_rate": 7.556254462728122e-07, + "loss": 0.5652, + "num_input_tokens_seen": 2950692864, + "step": 1407 + }, + { + "epoch": 0.1649037157580689, + "grad_norm": 16.774593353271484, + "learning_rate": 7.552662848992822e-07, + "loss": 0.4496, + "num_input_tokens_seen": 2952790016, + "step": 1408 + }, + { + "epoch": 0.16517493897477625, + "grad_norm": 14.789047241210938, + "learning_rate": 7.54906958335257e-07, + "loss": 0.4142, + "num_input_tokens_seen": 2954887168, + "step": 1409 + }, + { + "epoch": 0.1654461621914836, + "grad_norm": 15.732011795043945, + "learning_rate": 7.545474668699091e-07, + "loss": 0.3419, + "num_input_tokens_seen": 2956984320, + "step": 1410 + }, + { + "epoch": 0.16571738540819095, + "grad_norm": 14.704493522644043, + "learning_rate": 7.541878107925435e-07, + "loss": 0.3786, + "num_input_tokens_seen": 2959081472, + "step": 1411 + }, + { + "epoch": 0.16598860862489828, + "grad_norm": 13.177338600158691, + "learning_rate": 7.538279903925977e-07, + "loss": 0.2512, + "num_input_tokens_seen": 2961178624, + "step": 1412 + }, + { + "epoch": 0.16625983184160564, + "grad_norm": 17.341163635253906, + "learning_rate": 7.534680059596414e-07, + "loss": 0.4833, + "num_input_tokens_seen": 2963275776, + "step": 1413 + }, + { + "epoch": 0.166531055058313, + "grad_norm": 23.333568572998047, + "learning_rate": 7.531078577833765e-07, + "loss": 0.6702, + "num_input_tokens_seen": 2965372928, + "step": 1414 + }, + { + "epoch": 0.16680227827502034, + "grad_norm": 12.770888328552246, + "learning_rate": 7.527475461536363e-07, + "loss": 0.3171, + "num_input_tokens_seen": 2967470080, + "step": 1415 + }, + { + "epoch": 0.1670735014917277, + "grad_norm": 16.709857940673828, + "learning_rate": 7.523870713603864e-07, + "loss": 0.3315, + "num_input_tokens_seen": 2969567232, + "step": 1416 + }, + { + "epoch": 0.16734472470843503, + "grad_norm": 14.015904426574707, + "learning_rate": 7.520264336937227e-07, + "loss": 0.3777, + "num_input_tokens_seen": 2971664384, + "step": 1417 + }, + { + "epoch": 0.1676159479251424, + "grad_norm": 12.915022850036621, + "learning_rate": 7.516656334438727e-07, + "loss": 0.3262, + "num_input_tokens_seen": 2973761536, + "step": 1418 + }, + { + "epoch": 0.16788717114184976, + "grad_norm": 19.277992248535156, + "learning_rate": 7.513046709011951e-07, + "loss": 0.5589, + "num_input_tokens_seen": 2975858688, + "step": 1419 + }, + { + "epoch": 0.1681583943585571, + "grad_norm": 15.75462532043457, + "learning_rate": 7.509435463561785e-07, + "loss": 0.3394, + "num_input_tokens_seen": 2977955840, + "step": 1420 + }, + { + "epoch": 0.16842961757526445, + "grad_norm": 21.828617095947266, + "learning_rate": 7.505822600994423e-07, + "loss": 0.7979, + "num_input_tokens_seen": 2980052992, + "step": 1421 + }, + { + "epoch": 0.16870084079197178, + "grad_norm": 24.69605255126953, + "learning_rate": 7.502208124217357e-07, + "loss": 0.4936, + "num_input_tokens_seen": 2982150144, + "step": 1422 + }, + { + "epoch": 0.16897206400867915, + "grad_norm": 21.24989891052246, + "learning_rate": 7.498592036139383e-07, + "loss": 0.6869, + "num_input_tokens_seen": 2984247296, + "step": 1423 + }, + { + "epoch": 0.16924328722538648, + "grad_norm": 18.29302215576172, + "learning_rate": 7.494974339670591e-07, + "loss": 0.3728, + "num_input_tokens_seen": 2986344448, + "step": 1424 + }, + { + "epoch": 0.16951451044209384, + "grad_norm": 15.673701286315918, + "learning_rate": 7.491355037722365e-07, + "loss": 0.3315, + "num_input_tokens_seen": 2988441600, + "step": 1425 + }, + { + "epoch": 0.1697857336588012, + "grad_norm": 20.936105728149414, + "learning_rate": 7.487734133207382e-07, + "loss": 0.5771, + "num_input_tokens_seen": 2990538752, + "step": 1426 + }, + { + "epoch": 0.17005695687550854, + "grad_norm": 10.056077003479004, + "learning_rate": 7.484111629039607e-07, + "loss": 0.2192, + "num_input_tokens_seen": 2992635904, + "step": 1427 + }, + { + "epoch": 0.1703281800922159, + "grad_norm": 21.529075622558594, + "learning_rate": 7.480487528134292e-07, + "loss": 0.7378, + "num_input_tokens_seen": 2994733056, + "step": 1428 + }, + { + "epoch": 0.17059940330892323, + "grad_norm": 16.297054290771484, + "learning_rate": 7.47686183340798e-07, + "loss": 0.3931, + "num_input_tokens_seen": 2996830208, + "step": 1429 + }, + { + "epoch": 0.1708706265256306, + "grad_norm": 20.00302505493164, + "learning_rate": 7.473234547778489e-07, + "loss": 0.6421, + "num_input_tokens_seen": 2998927360, + "step": 1430 + }, + { + "epoch": 0.17114184974233795, + "grad_norm": 16.071239471435547, + "learning_rate": 7.46960567416492e-07, + "loss": 0.4505, + "num_input_tokens_seen": 3001024512, + "step": 1431 + }, + { + "epoch": 0.1714130729590453, + "grad_norm": 14.117545127868652, + "learning_rate": 7.465975215487655e-07, + "loss": 0.2977, + "num_input_tokens_seen": 3003121664, + "step": 1432 + }, + { + "epoch": 0.17168429617575265, + "grad_norm": 21.76426124572754, + "learning_rate": 7.462343174668346e-07, + "loss": 0.6603, + "num_input_tokens_seen": 3005218816, + "step": 1433 + }, + { + "epoch": 0.17195551939245998, + "grad_norm": 20.707962036132812, + "learning_rate": 7.458709554629924e-07, + "loss": 0.5771, + "num_input_tokens_seen": 3007315968, + "step": 1434 + }, + { + "epoch": 0.17222674260916734, + "grad_norm": 19.084461212158203, + "learning_rate": 7.455074358296586e-07, + "loss": 0.6713, + "num_input_tokens_seen": 3009413120, + "step": 1435 + }, + { + "epoch": 0.1724979658258747, + "grad_norm": 13.040257453918457, + "learning_rate": 7.451437588593802e-07, + "loss": 0.3933, + "num_input_tokens_seen": 3011510272, + "step": 1436 + }, + { + "epoch": 0.17276918904258204, + "grad_norm": 16.921184539794922, + "learning_rate": 7.447799248448303e-07, + "loss": 0.5142, + "num_input_tokens_seen": 3013607424, + "step": 1437 + }, + { + "epoch": 0.1730404122592894, + "grad_norm": 16.22494125366211, + "learning_rate": 7.444159340788088e-07, + "loss": 0.5446, + "num_input_tokens_seen": 3015704576, + "step": 1438 + }, + { + "epoch": 0.17331163547599673, + "grad_norm": 13.45175838470459, + "learning_rate": 7.440517868542417e-07, + "loss": 0.3896, + "num_input_tokens_seen": 3017801728, + "step": 1439 + }, + { + "epoch": 0.1735828586927041, + "grad_norm": 14.45242691040039, + "learning_rate": 7.436874834641807e-07, + "loss": 0.3761, + "num_input_tokens_seen": 3019898880, + "step": 1440 + }, + { + "epoch": 0.17385408190941146, + "grad_norm": 21.77591323852539, + "learning_rate": 7.433230242018035e-07, + "loss": 0.5964, + "num_input_tokens_seen": 3021996032, + "step": 1441 + }, + { + "epoch": 0.1741253051261188, + "grad_norm": 16.96351432800293, + "learning_rate": 7.429584093604128e-07, + "loss": 0.3924, + "num_input_tokens_seen": 3024093184, + "step": 1442 + }, + { + "epoch": 0.17439652834282615, + "grad_norm": 19.076356887817383, + "learning_rate": 7.425936392334368e-07, + "loss": 0.6515, + "num_input_tokens_seen": 3026190336, + "step": 1443 + }, + { + "epoch": 0.17466775155953349, + "grad_norm": 18.412277221679688, + "learning_rate": 7.422287141144287e-07, + "loss": 0.6622, + "num_input_tokens_seen": 3028287488, + "step": 1444 + }, + { + "epoch": 0.17493897477624085, + "grad_norm": 19.85443687438965, + "learning_rate": 7.418636342970665e-07, + "loss": 0.7812, + "num_input_tokens_seen": 3030384640, + "step": 1445 + }, + { + "epoch": 0.1752101979929482, + "grad_norm": 18.57454490661621, + "learning_rate": 7.414984000751521e-07, + "loss": 0.3791, + "num_input_tokens_seen": 3032481792, + "step": 1446 + }, + { + "epoch": 0.17548142120965554, + "grad_norm": 13.877761840820312, + "learning_rate": 7.411330117426125e-07, + "loss": 0.3755, + "num_input_tokens_seen": 3034578944, + "step": 1447 + }, + { + "epoch": 0.1757526444263629, + "grad_norm": 11.10472583770752, + "learning_rate": 7.407674695934983e-07, + "loss": 0.2692, + "num_input_tokens_seen": 3036676096, + "step": 1448 + }, + { + "epoch": 0.17602386764307024, + "grad_norm": 16.257774353027344, + "learning_rate": 7.404017739219836e-07, + "loss": 0.4897, + "num_input_tokens_seen": 3038773248, + "step": 1449 + }, + { + "epoch": 0.1762950908597776, + "grad_norm": 13.585163116455078, + "learning_rate": 7.400359250223667e-07, + "loss": 0.2769, + "num_input_tokens_seen": 3040870400, + "step": 1450 + }, + { + "epoch": 0.17656631407648496, + "grad_norm": 16.29515266418457, + "learning_rate": 7.396699231890689e-07, + "loss": 0.5257, + "num_input_tokens_seen": 3042967552, + "step": 1451 + }, + { + "epoch": 0.1768375372931923, + "grad_norm": 19.343111038208008, + "learning_rate": 7.393037687166342e-07, + "loss": 0.5871, + "num_input_tokens_seen": 3045064704, + "step": 1452 + }, + { + "epoch": 0.17710876050989965, + "grad_norm": 17.655189514160156, + "learning_rate": 7.3893746189973e-07, + "loss": 0.5044, + "num_input_tokens_seen": 3047161856, + "step": 1453 + }, + { + "epoch": 0.177379983726607, + "grad_norm": 15.424549102783203, + "learning_rate": 7.385710030331461e-07, + "loss": 0.4438, + "num_input_tokens_seen": 3049259008, + "step": 1454 + }, + { + "epoch": 0.17765120694331435, + "grad_norm": 15.889403343200684, + "learning_rate": 7.382043924117945e-07, + "loss": 0.3884, + "num_input_tokens_seen": 3051356160, + "step": 1455 + }, + { + "epoch": 0.1779224301600217, + "grad_norm": 11.11455249786377, + "learning_rate": 7.378376303307099e-07, + "loss": 0.2612, + "num_input_tokens_seen": 3053453312, + "step": 1456 + }, + { + "epoch": 0.17819365337672904, + "grad_norm": 18.22006607055664, + "learning_rate": 7.374707170850479e-07, + "loss": 0.45, + "num_input_tokens_seen": 3055550464, + "step": 1457 + }, + { + "epoch": 0.1784648765934364, + "grad_norm": 14.16891098022461, + "learning_rate": 7.371036529700866e-07, + "loss": 0.4225, + "num_input_tokens_seen": 3057647616, + "step": 1458 + }, + { + "epoch": 0.17873609981014374, + "grad_norm": 13.474692344665527, + "learning_rate": 7.367364382812253e-07, + "loss": 0.3465, + "num_input_tokens_seen": 3059744768, + "step": 1459 + }, + { + "epoch": 0.1790073230268511, + "grad_norm": 19.395767211914062, + "learning_rate": 7.363690733139842e-07, + "loss": 0.6558, + "num_input_tokens_seen": 3061841920, + "step": 1460 + }, + { + "epoch": 0.17927854624355846, + "grad_norm": 22.699716567993164, + "learning_rate": 7.360015583640049e-07, + "loss": 0.7213, + "num_input_tokens_seen": 3063939072, + "step": 1461 + }, + { + "epoch": 0.1795497694602658, + "grad_norm": 10.556961059570312, + "learning_rate": 7.356338937270492e-07, + "loss": 0.2935, + "num_input_tokens_seen": 3066036224, + "step": 1462 + }, + { + "epoch": 0.17982099267697316, + "grad_norm": 21.715940475463867, + "learning_rate": 7.352660796989999e-07, + "loss": 0.3626, + "num_input_tokens_seen": 3068133376, + "step": 1463 + }, + { + "epoch": 0.1800922158936805, + "grad_norm": 14.98128604888916, + "learning_rate": 7.348981165758595e-07, + "loss": 0.4329, + "num_input_tokens_seen": 3070230528, + "step": 1464 + }, + { + "epoch": 0.18036343911038785, + "grad_norm": 13.6019868850708, + "learning_rate": 7.345300046537507e-07, + "loss": 0.4274, + "num_input_tokens_seen": 3072327680, + "step": 1465 + }, + { + "epoch": 0.18063466232709519, + "grad_norm": 19.526018142700195, + "learning_rate": 7.341617442289159e-07, + "loss": 0.6069, + "num_input_tokens_seen": 3074424832, + "step": 1466 + }, + { + "epoch": 0.18090588554380255, + "grad_norm": 18.811630249023438, + "learning_rate": 7.337933355977175e-07, + "loss": 0.484, + "num_input_tokens_seen": 3076521984, + "step": 1467 + }, + { + "epoch": 0.1811771087605099, + "grad_norm": 11.840438842773438, + "learning_rate": 7.334247790566364e-07, + "loss": 0.3532, + "num_input_tokens_seen": 3078619136, + "step": 1468 + }, + { + "epoch": 0.18144833197721724, + "grad_norm": 13.839353561401367, + "learning_rate": 7.330560749022728e-07, + "loss": 0.348, + "num_input_tokens_seen": 3080716288, + "step": 1469 + }, + { + "epoch": 0.1817195551939246, + "grad_norm": 19.16689682006836, + "learning_rate": 7.326872234313459e-07, + "loss": 0.5346, + "num_input_tokens_seen": 3082813440, + "step": 1470 + }, + { + "epoch": 0.18199077841063194, + "grad_norm": 14.058456420898438, + "learning_rate": 7.323182249406936e-07, + "loss": 0.3933, + "num_input_tokens_seen": 3084910592, + "step": 1471 + }, + { + "epoch": 0.1822620016273393, + "grad_norm": 20.379125595092773, + "learning_rate": 7.319490797272714e-07, + "loss": 0.6038, + "num_input_tokens_seen": 3087007744, + "step": 1472 + }, + { + "epoch": 0.18253322484404666, + "grad_norm": 15.348321914672852, + "learning_rate": 7.315797880881535e-07, + "loss": 0.459, + "num_input_tokens_seen": 3089104896, + "step": 1473 + }, + { + "epoch": 0.182804448060754, + "grad_norm": 12.275803565979004, + "learning_rate": 7.312103503205318e-07, + "loss": 0.4144, + "num_input_tokens_seen": 3091202048, + "step": 1474 + }, + { + "epoch": 0.18307567127746135, + "grad_norm": 9.52855110168457, + "learning_rate": 7.308407667217158e-07, + "loss": 0.1598, + "num_input_tokens_seen": 3093299200, + "step": 1475 + }, + { + "epoch": 0.1833468944941687, + "grad_norm": 17.207202911376953, + "learning_rate": 7.304710375891323e-07, + "loss": 0.5072, + "num_input_tokens_seen": 3095396352, + "step": 1476 + }, + { + "epoch": 0.18361811771087605, + "grad_norm": 21.52804946899414, + "learning_rate": 7.30101163220325e-07, + "loss": 0.7901, + "num_input_tokens_seen": 3097493504, + "step": 1477 + }, + { + "epoch": 0.1838893409275834, + "grad_norm": 19.89445686340332, + "learning_rate": 7.29731143912955e-07, + "loss": 0.6151, + "num_input_tokens_seen": 3099590656, + "step": 1478 + }, + { + "epoch": 0.18416056414429074, + "grad_norm": 15.77376651763916, + "learning_rate": 7.293609799647996e-07, + "loss": 0.4936, + "num_input_tokens_seen": 3101687808, + "step": 1479 + }, + { + "epoch": 0.1844317873609981, + "grad_norm": 18.152956008911133, + "learning_rate": 7.289906716737528e-07, + "loss": 0.5218, + "num_input_tokens_seen": 3103784960, + "step": 1480 + }, + { + "epoch": 0.18470301057770544, + "grad_norm": 13.07524585723877, + "learning_rate": 7.286202193378244e-07, + "loss": 0.3309, + "num_input_tokens_seen": 3105882112, + "step": 1481 + }, + { + "epoch": 0.1849742337944128, + "grad_norm": 15.021199226379395, + "learning_rate": 7.282496232551406e-07, + "loss": 0.4835, + "num_input_tokens_seen": 3107979264, + "step": 1482 + }, + { + "epoch": 0.18524545701112016, + "grad_norm": 18.78805923461914, + "learning_rate": 7.278788837239429e-07, + "loss": 0.5135, + "num_input_tokens_seen": 3110076416, + "step": 1483 + }, + { + "epoch": 0.1855166802278275, + "grad_norm": 12.501900672912598, + "learning_rate": 7.275080010425883e-07, + "loss": 0.3278, + "num_input_tokens_seen": 3112173568, + "step": 1484 + }, + { + "epoch": 0.18578790344453486, + "grad_norm": 15.580584526062012, + "learning_rate": 7.271369755095494e-07, + "loss": 0.5373, + "num_input_tokens_seen": 3114270720, + "step": 1485 + }, + { + "epoch": 0.1860591266612422, + "grad_norm": 20.438100814819336, + "learning_rate": 7.267658074234128e-07, + "loss": 0.7682, + "num_input_tokens_seen": 3116367872, + "step": 1486 + }, + { + "epoch": 0.18633034987794955, + "grad_norm": 19.1994686126709, + "learning_rate": 7.26394497082881e-07, + "loss": 0.5544, + "num_input_tokens_seen": 3118465024, + "step": 1487 + }, + { + "epoch": 0.18660157309465691, + "grad_norm": 14.906885147094727, + "learning_rate": 7.260230447867703e-07, + "loss": 0.461, + "num_input_tokens_seen": 3120562176, + "step": 1488 + }, + { + "epoch": 0.18687279631136425, + "grad_norm": 16.73325538635254, + "learning_rate": 7.256514508340114e-07, + "loss": 0.5265, + "num_input_tokens_seen": 3122659328, + "step": 1489 + }, + { + "epoch": 0.1871440195280716, + "grad_norm": 22.4089412689209, + "learning_rate": 7.252797155236488e-07, + "loss": 0.6415, + "num_input_tokens_seen": 3124756480, + "step": 1490 + }, + { + "epoch": 0.18741524274477894, + "grad_norm": 17.32809066772461, + "learning_rate": 7.249078391548409e-07, + "loss": 0.5622, + "num_input_tokens_seen": 3126853632, + "step": 1491 + }, + { + "epoch": 0.1876864659614863, + "grad_norm": 10.146015167236328, + "learning_rate": 7.245358220268599e-07, + "loss": 0.2816, + "num_input_tokens_seen": 3128950784, + "step": 1492 + }, + { + "epoch": 0.18795768917819367, + "grad_norm": 24.50543975830078, + "learning_rate": 7.24163664439091e-07, + "loss": 0.617, + "num_input_tokens_seen": 3131047936, + "step": 1493 + }, + { + "epoch": 0.188228912394901, + "grad_norm": 13.438515663146973, + "learning_rate": 7.237913666910322e-07, + "loss": 0.3164, + "num_input_tokens_seen": 3133145088, + "step": 1494 + }, + { + "epoch": 0.18850013561160836, + "grad_norm": 17.03264045715332, + "learning_rate": 7.234189290822947e-07, + "loss": 0.5909, + "num_input_tokens_seen": 3135242240, + "step": 1495 + }, + { + "epoch": 0.1887713588283157, + "grad_norm": 14.248628616333008, + "learning_rate": 7.230463519126024e-07, + "loss": 0.4531, + "num_input_tokens_seen": 3137339392, + "step": 1496 + }, + { + "epoch": 0.18904258204502306, + "grad_norm": 16.5543212890625, + "learning_rate": 7.226736354817908e-07, + "loss": 0.433, + "num_input_tokens_seen": 3139436544, + "step": 1497 + }, + { + "epoch": 0.18931380526173042, + "grad_norm": 35.90945053100586, + "learning_rate": 7.223007800898082e-07, + "loss": 1.4103, + "num_input_tokens_seen": 3141533696, + "step": 1498 + }, + { + "epoch": 0.18958502847843775, + "grad_norm": 19.54884910583496, + "learning_rate": 7.219277860367143e-07, + "loss": 0.6804, + "num_input_tokens_seen": 3143630848, + "step": 1499 + }, + { + "epoch": 0.1898562516951451, + "grad_norm": 24.173274993896484, + "learning_rate": 7.215546536226805e-07, + "loss": 0.7872, + "num_input_tokens_seen": 3145728000, + "step": 1500 + }, + { + "epoch": 0.19012747491185245, + "grad_norm": 15.859006881713867, + "learning_rate": 7.211813831479896e-07, + "loss": 0.4702, + "num_input_tokens_seen": 3147825152, + "step": 1501 + }, + { + "epoch": 0.1903986981285598, + "grad_norm": 14.973697662353516, + "learning_rate": 7.208079749130356e-07, + "loss": 0.4567, + "num_input_tokens_seen": 3149922304, + "step": 1502 + }, + { + "epoch": 0.19066992134526717, + "grad_norm": 17.608720779418945, + "learning_rate": 7.204344292183228e-07, + "loss": 0.5384, + "num_input_tokens_seen": 3152019456, + "step": 1503 + }, + { + "epoch": 0.1909411445619745, + "grad_norm": 17.97117042541504, + "learning_rate": 7.200607463644673e-07, + "loss": 0.3972, + "num_input_tokens_seen": 3154116608, + "step": 1504 + }, + { + "epoch": 0.19121236777868186, + "grad_norm": 17.989160537719727, + "learning_rate": 7.196869266521941e-07, + "loss": 0.235, + "num_input_tokens_seen": 3156213760, + "step": 1505 + }, + { + "epoch": 0.1914835909953892, + "grad_norm": 14.863112449645996, + "learning_rate": 7.193129703823395e-07, + "loss": 0.5892, + "num_input_tokens_seen": 3158310912, + "step": 1506 + }, + { + "epoch": 0.19175481421209656, + "grad_norm": 16.317642211914062, + "learning_rate": 7.189388778558491e-07, + "loss": 0.4773, + "num_input_tokens_seen": 3160408064, + "step": 1507 + }, + { + "epoch": 0.1920260374288039, + "grad_norm": 12.433012962341309, + "learning_rate": 7.185646493737785e-07, + "loss": 0.3363, + "num_input_tokens_seen": 3162505216, + "step": 1508 + }, + { + "epoch": 0.19229726064551125, + "grad_norm": 12.628132820129395, + "learning_rate": 7.181902852372924e-07, + "loss": 0.3374, + "num_input_tokens_seen": 3164602368, + "step": 1509 + }, + { + "epoch": 0.19256848386221861, + "grad_norm": 18.057632446289062, + "learning_rate": 7.17815785747665e-07, + "loss": 0.5477, + "num_input_tokens_seen": 3166699520, + "step": 1510 + }, + { + "epoch": 0.19283970707892595, + "grad_norm": 17.424755096435547, + "learning_rate": 7.174411512062789e-07, + "loss": 0.4914, + "num_input_tokens_seen": 3168796672, + "step": 1511 + }, + { + "epoch": 0.1931109302956333, + "grad_norm": 15.669259071350098, + "learning_rate": 7.170663819146259e-07, + "loss": 0.5325, + "num_input_tokens_seen": 3170893824, + "step": 1512 + }, + { + "epoch": 0.19338215351234064, + "grad_norm": 19.925724029541016, + "learning_rate": 7.166914781743062e-07, + "loss": 0.6616, + "num_input_tokens_seen": 3172990976, + "step": 1513 + }, + { + "epoch": 0.193653376729048, + "grad_norm": 12.23149585723877, + "learning_rate": 7.16316440287028e-07, + "loss": 0.3527, + "num_input_tokens_seen": 3175088128, + "step": 1514 + }, + { + "epoch": 0.19392459994575537, + "grad_norm": 20.181726455688477, + "learning_rate": 7.159412685546073e-07, + "loss": 0.5477, + "num_input_tokens_seen": 3177185280, + "step": 1515 + }, + { + "epoch": 0.1941958231624627, + "grad_norm": 19.928884506225586, + "learning_rate": 7.155659632789683e-07, + "loss": 0.7233, + "num_input_tokens_seen": 3179282432, + "step": 1516 + }, + { + "epoch": 0.19446704637917006, + "grad_norm": 20.663057327270508, + "learning_rate": 7.151905247621422e-07, + "loss": 0.6027, + "num_input_tokens_seen": 3181379584, + "step": 1517 + }, + { + "epoch": 0.1947382695958774, + "grad_norm": 15.95964527130127, + "learning_rate": 7.148149533062678e-07, + "loss": 0.4609, + "num_input_tokens_seen": 3183476736, + "step": 1518 + }, + { + "epoch": 0.19500949281258476, + "grad_norm": 18.054109573364258, + "learning_rate": 7.144392492135908e-07, + "loss": 0.4759, + "num_input_tokens_seen": 3185573888, + "step": 1519 + }, + { + "epoch": 0.19528071602929212, + "grad_norm": 36.16549301147461, + "learning_rate": 7.140634127864632e-07, + "loss": 0.7295, + "num_input_tokens_seen": 3187671040, + "step": 1520 + }, + { + "epoch": 0.19555193924599945, + "grad_norm": 16.543094635009766, + "learning_rate": 7.136874443273442e-07, + "loss": 0.4694, + "num_input_tokens_seen": 3189768192, + "step": 1521 + }, + { + "epoch": 0.1958231624627068, + "grad_norm": 18.826963424682617, + "learning_rate": 7.133113441387988e-07, + "loss": 0.6975, + "num_input_tokens_seen": 3191865344, + "step": 1522 + }, + { + "epoch": 0.19609438567941415, + "grad_norm": 11.468395233154297, + "learning_rate": 7.129351125234979e-07, + "loss": 0.2668, + "num_input_tokens_seen": 3193962496, + "step": 1523 + }, + { + "epoch": 0.1963656088961215, + "grad_norm": 18.200729370117188, + "learning_rate": 7.125587497842189e-07, + "loss": 0.6557, + "num_input_tokens_seen": 3196059648, + "step": 1524 + }, + { + "epoch": 0.19663683211282887, + "grad_norm": 11.098851203918457, + "learning_rate": 7.121822562238436e-07, + "loss": 0.2728, + "num_input_tokens_seen": 3198156800, + "step": 1525 + }, + { + "epoch": 0.1969080553295362, + "grad_norm": 15.489395141601562, + "learning_rate": 7.118056321453601e-07, + "loss": 0.5025, + "num_input_tokens_seen": 3200253952, + "step": 1526 + }, + { + "epoch": 0.19717927854624356, + "grad_norm": 14.967645645141602, + "learning_rate": 7.11428877851861e-07, + "loss": 0.4449, + "num_input_tokens_seen": 3202351104, + "step": 1527 + }, + { + "epoch": 0.1974505017629509, + "grad_norm": 15.567303657531738, + "learning_rate": 7.110519936465438e-07, + "loss": 0.4, + "num_input_tokens_seen": 3204448256, + "step": 1528 + }, + { + "epoch": 0.19772172497965826, + "grad_norm": 16.95789337158203, + "learning_rate": 7.106749798327106e-07, + "loss": 0.3363, + "num_input_tokens_seen": 3206545408, + "step": 1529 + }, + { + "epoch": 0.19799294819636562, + "grad_norm": 16.923965454101562, + "learning_rate": 7.102978367137679e-07, + "loss": 0.4492, + "num_input_tokens_seen": 3208642560, + "step": 1530 + }, + { + "epoch": 0.19826417141307295, + "grad_norm": 16.046384811401367, + "learning_rate": 7.099205645932258e-07, + "loss": 0.4185, + "num_input_tokens_seen": 3210739712, + "step": 1531 + }, + { + "epoch": 0.19853539462978032, + "grad_norm": 20.16338348388672, + "learning_rate": 7.095431637746988e-07, + "loss": 0.6639, + "num_input_tokens_seen": 3212836864, + "step": 1532 + }, + { + "epoch": 0.19880661784648765, + "grad_norm": 12.801714897155762, + "learning_rate": 7.091656345619047e-07, + "loss": 0.3112, + "num_input_tokens_seen": 3214934016, + "step": 1533 + }, + { + "epoch": 0.199077841063195, + "grad_norm": 14.88927936553955, + "learning_rate": 7.087879772586647e-07, + "loss": 0.4997, + "num_input_tokens_seen": 3217031168, + "step": 1534 + }, + { + "epoch": 0.19934906427990237, + "grad_norm": 20.41654396057129, + "learning_rate": 7.084101921689029e-07, + "loss": 0.5001, + "num_input_tokens_seen": 3219128320, + "step": 1535 + }, + { + "epoch": 0.1996202874966097, + "grad_norm": 18.13739013671875, + "learning_rate": 7.080322795966462e-07, + "loss": 0.5536, + "num_input_tokens_seen": 3221225472, + "step": 1536 + }, + { + "epoch": 0.19989151071331707, + "grad_norm": 15.031099319458008, + "learning_rate": 7.076542398460247e-07, + "loss": 0.4459, + "num_input_tokens_seen": 3223322624, + "step": 1537 + }, + { + "epoch": 0.2001627339300244, + "grad_norm": 16.9238224029541, + "learning_rate": 7.0727607322127e-07, + "loss": 0.4325, + "num_input_tokens_seen": 3225419776, + "step": 1538 + }, + { + "epoch": 0.20043395714673176, + "grad_norm": 17.309598922729492, + "learning_rate": 7.068977800267164e-07, + "loss": 0.5089, + "num_input_tokens_seen": 3227516928, + "step": 1539 + }, + { + "epoch": 0.20070518036343912, + "grad_norm": 14.402442932128906, + "learning_rate": 7.065193605667999e-07, + "loss": 0.3359, + "num_input_tokens_seen": 3229614080, + "step": 1540 + }, + { + "epoch": 0.20097640358014646, + "grad_norm": 12.946207046508789, + "learning_rate": 7.06140815146058e-07, + "loss": 0.3825, + "num_input_tokens_seen": 3231711232, + "step": 1541 + }, + { + "epoch": 0.20124762679685382, + "grad_norm": 21.32681655883789, + "learning_rate": 7.057621440691296e-07, + "loss": 0.6832, + "num_input_tokens_seen": 3233808384, + "step": 1542 + }, + { + "epoch": 0.20151885001356115, + "grad_norm": 18.230838775634766, + "learning_rate": 7.053833476407549e-07, + "loss": 0.5939, + "num_input_tokens_seen": 3235905536, + "step": 1543 + }, + { + "epoch": 0.2017900732302685, + "grad_norm": 14.465167045593262, + "learning_rate": 7.050044261657748e-07, + "loss": 0.3152, + "num_input_tokens_seen": 3238002688, + "step": 1544 + }, + { + "epoch": 0.20206129644697587, + "grad_norm": 11.428820610046387, + "learning_rate": 7.04625379949131e-07, + "loss": 0.2369, + "num_input_tokens_seen": 3240099840, + "step": 1545 + }, + { + "epoch": 0.2023325196636832, + "grad_norm": 19.767881393432617, + "learning_rate": 7.042462092958651e-07, + "loss": 0.4465, + "num_input_tokens_seen": 3242196992, + "step": 1546 + }, + { + "epoch": 0.20260374288039057, + "grad_norm": 13.485447883605957, + "learning_rate": 7.038669145111195e-07, + "loss": 0.4017, + "num_input_tokens_seen": 3244294144, + "step": 1547 + }, + { + "epoch": 0.2028749660970979, + "grad_norm": 15.75723648071289, + "learning_rate": 7.034874959001363e-07, + "loss": 0.4824, + "num_input_tokens_seen": 3246391296, + "step": 1548 + }, + { + "epoch": 0.20314618931380526, + "grad_norm": 13.907236099243164, + "learning_rate": 7.031079537682569e-07, + "loss": 0.3523, + "num_input_tokens_seen": 3248488448, + "step": 1549 + }, + { + "epoch": 0.20341741253051263, + "grad_norm": 24.97413444519043, + "learning_rate": 7.027282884209227e-07, + "loss": 0.6136, + "num_input_tokens_seen": 3250585600, + "step": 1550 + }, + { + "epoch": 0.20368863574721996, + "grad_norm": 15.245758056640625, + "learning_rate": 7.023485001636737e-07, + "loss": 0.3508, + "num_input_tokens_seen": 3252682752, + "step": 1551 + }, + { + "epoch": 0.20395985896392732, + "grad_norm": 16.83147430419922, + "learning_rate": 7.019685893021488e-07, + "loss": 0.4695, + "num_input_tokens_seen": 3254779904, + "step": 1552 + }, + { + "epoch": 0.20423108218063465, + "grad_norm": 17.15081787109375, + "learning_rate": 7.015885561420863e-07, + "loss": 0.2958, + "num_input_tokens_seen": 3256877056, + "step": 1553 + }, + { + "epoch": 0.20450230539734202, + "grad_norm": 11.432106018066406, + "learning_rate": 7.012084009893221e-07, + "loss": 0.2439, + "num_input_tokens_seen": 3258974208, + "step": 1554 + }, + { + "epoch": 0.20477352861404935, + "grad_norm": 18.06707000732422, + "learning_rate": 7.008281241497908e-07, + "loss": 0.604, + "num_input_tokens_seen": 3261071360, + "step": 1555 + }, + { + "epoch": 0.2050447518307567, + "grad_norm": 17.187938690185547, + "learning_rate": 7.004477259295244e-07, + "loss": 0.4381, + "num_input_tokens_seen": 3263168512, + "step": 1556 + }, + { + "epoch": 0.20531597504746407, + "grad_norm": 8.261337280273438, + "learning_rate": 7.000672066346532e-07, + "loss": 0.1795, + "num_input_tokens_seen": 3265265664, + "step": 1557 + }, + { + "epoch": 0.2055871982641714, + "grad_norm": 14.19255256652832, + "learning_rate": 6.996865665714046e-07, + "loss": 0.319, + "num_input_tokens_seen": 3267362816, + "step": 1558 + }, + { + "epoch": 0.20585842148087877, + "grad_norm": 21.178258895874023, + "learning_rate": 6.993058060461035e-07, + "loss": 0.5702, + "num_input_tokens_seen": 3269459968, + "step": 1559 + }, + { + "epoch": 0.2061296446975861, + "grad_norm": 17.968137741088867, + "learning_rate": 6.989249253651708e-07, + "loss": 0.5158, + "num_input_tokens_seen": 3271557120, + "step": 1560 + }, + { + "epoch": 0.20640086791429346, + "grad_norm": 12.363228797912598, + "learning_rate": 6.985439248351254e-07, + "loss": 0.2995, + "num_input_tokens_seen": 3273654272, + "step": 1561 + }, + { + "epoch": 0.20667209113100082, + "grad_norm": 28.861257553100586, + "learning_rate": 6.981628047625818e-07, + "loss": 0.9089, + "num_input_tokens_seen": 3275751424, + "step": 1562 + }, + { + "epoch": 0.20694331434770816, + "grad_norm": 19.901887893676758, + "learning_rate": 6.977815654542508e-07, + "loss": 0.5814, + "num_input_tokens_seen": 3277848576, + "step": 1563 + }, + { + "epoch": 0.20721453756441552, + "grad_norm": 15.665919303894043, + "learning_rate": 6.974002072169395e-07, + "loss": 0.2016, + "num_input_tokens_seen": 3279945728, + "step": 1564 + }, + { + "epoch": 0.20748576078112285, + "grad_norm": 19.99432945251465, + "learning_rate": 6.970187303575505e-07, + "loss": 0.3062, + "num_input_tokens_seen": 3282042880, + "step": 1565 + }, + { + "epoch": 0.2077569839978302, + "grad_norm": 22.537017822265625, + "learning_rate": 6.966371351830818e-07, + "loss": 0.8428, + "num_input_tokens_seen": 3284140032, + "step": 1566 + }, + { + "epoch": 0.20802820721453757, + "grad_norm": 13.440853118896484, + "learning_rate": 6.962554220006265e-07, + "loss": 0.3116, + "num_input_tokens_seen": 3286237184, + "step": 1567 + }, + { + "epoch": 0.2082994304312449, + "grad_norm": 17.965656280517578, + "learning_rate": 6.958735911173729e-07, + "loss": 0.4781, + "num_input_tokens_seen": 3288334336, + "step": 1568 + }, + { + "epoch": 0.20857065364795227, + "grad_norm": 19.18067741394043, + "learning_rate": 6.954916428406045e-07, + "loss": 0.6427, + "num_input_tokens_seen": 3290431488, + "step": 1569 + }, + { + "epoch": 0.2088418768646596, + "grad_norm": 19.246889114379883, + "learning_rate": 6.95109577477698e-07, + "loss": 0.5277, + "num_input_tokens_seen": 3292528640, + "step": 1570 + }, + { + "epoch": 0.20911310008136696, + "grad_norm": 16.653470993041992, + "learning_rate": 6.947273953361255e-07, + "loss": 0.5097, + "num_input_tokens_seen": 3294625792, + "step": 1571 + }, + { + "epoch": 0.20938432329807433, + "grad_norm": 16.175172805786133, + "learning_rate": 6.943450967234524e-07, + "loss": 0.3398, + "num_input_tokens_seen": 3296722944, + "step": 1572 + }, + { + "epoch": 0.20965554651478166, + "grad_norm": 15.780593872070312, + "learning_rate": 6.939626819473384e-07, + "loss": 0.3364, + "num_input_tokens_seen": 3298820096, + "step": 1573 + }, + { + "epoch": 0.20992676973148902, + "grad_norm": 22.667606353759766, + "learning_rate": 6.93580151315536e-07, + "loss": 0.7428, + "num_input_tokens_seen": 3300917248, + "step": 1574 + }, + { + "epoch": 0.21019799294819635, + "grad_norm": 15.267520904541016, + "learning_rate": 6.931975051358914e-07, + "loss": 0.3328, + "num_input_tokens_seen": 3303014400, + "step": 1575 + }, + { + "epoch": 0.21046921616490372, + "grad_norm": 19.079408645629883, + "learning_rate": 6.928147437163439e-07, + "loss": 0.5853, + "num_input_tokens_seen": 3305111552, + "step": 1576 + }, + { + "epoch": 0.21074043938161108, + "grad_norm": 13.351947784423828, + "learning_rate": 6.92431867364925e-07, + "loss": 0.3437, + "num_input_tokens_seen": 3307208704, + "step": 1577 + }, + { + "epoch": 0.2110116625983184, + "grad_norm": 24.631118774414062, + "learning_rate": 6.920488763897593e-07, + "loss": 0.8318, + "num_input_tokens_seen": 3309305856, + "step": 1578 + }, + { + "epoch": 0.21128288581502577, + "grad_norm": 17.947328567504883, + "learning_rate": 6.916657710990632e-07, + "loss": 0.5461, + "num_input_tokens_seen": 3311403008, + "step": 1579 + }, + { + "epoch": 0.2115541090317331, + "grad_norm": 21.530193328857422, + "learning_rate": 6.912825518011452e-07, + "loss": 0.6746, + "num_input_tokens_seen": 3313500160, + "step": 1580 + }, + { + "epoch": 0.21182533224844047, + "grad_norm": 16.4180850982666, + "learning_rate": 6.90899218804406e-07, + "loss": 0.3837, + "num_input_tokens_seen": 3315597312, + "step": 1581 + }, + { + "epoch": 0.21209655546514783, + "grad_norm": 18.914493560791016, + "learning_rate": 6.905157724173369e-07, + "loss": 0.3731, + "num_input_tokens_seen": 3317694464, + "step": 1582 + }, + { + "epoch": 0.21236777868185516, + "grad_norm": 15.548588752746582, + "learning_rate": 6.901322129485212e-07, + "loss": 0.4683, + "num_input_tokens_seen": 3319791616, + "step": 1583 + }, + { + "epoch": 0.21263900189856252, + "grad_norm": 14.097211837768555, + "learning_rate": 6.897485407066329e-07, + "loss": 0.3252, + "num_input_tokens_seen": 3321888768, + "step": 1584 + }, + { + "epoch": 0.21291022511526986, + "grad_norm": 15.844295501708984, + "learning_rate": 6.893647560004369e-07, + "loss": 0.5362, + "num_input_tokens_seen": 3323985920, + "step": 1585 + }, + { + "epoch": 0.21318144833197722, + "grad_norm": 18.061901092529297, + "learning_rate": 6.889808591387885e-07, + "loss": 0.4111, + "num_input_tokens_seen": 3326083072, + "step": 1586 + }, + { + "epoch": 0.21345267154868458, + "grad_norm": 15.680930137634277, + "learning_rate": 6.885968504306334e-07, + "loss": 0.408, + "num_input_tokens_seen": 3328180224, + "step": 1587 + }, + { + "epoch": 0.21372389476539191, + "grad_norm": 22.98466682434082, + "learning_rate": 6.882127301850069e-07, + "loss": 0.684, + "num_input_tokens_seen": 3330277376, + "step": 1588 + }, + { + "epoch": 0.21399511798209928, + "grad_norm": 20.19873046875, + "learning_rate": 6.878284987110345e-07, + "loss": 0.576, + "num_input_tokens_seen": 3332374528, + "step": 1589 + }, + { + "epoch": 0.2142663411988066, + "grad_norm": 12.961610794067383, + "learning_rate": 6.874441563179313e-07, + "loss": 0.3008, + "num_input_tokens_seen": 3334471680, + "step": 1590 + }, + { + "epoch": 0.21453756441551397, + "grad_norm": 22.00704574584961, + "learning_rate": 6.870597033150012e-07, + "loss": 0.6572, + "num_input_tokens_seen": 3336568832, + "step": 1591 + }, + { + "epoch": 0.21480878763222133, + "grad_norm": 18.104747772216797, + "learning_rate": 6.866751400116374e-07, + "loss": 0.4674, + "num_input_tokens_seen": 3338665984, + "step": 1592 + }, + { + "epoch": 0.21508001084892867, + "grad_norm": 24.767946243286133, + "learning_rate": 6.862904667173216e-07, + "loss": 0.7152, + "num_input_tokens_seen": 3340763136, + "step": 1593 + }, + { + "epoch": 0.21535123406563603, + "grad_norm": 23.480615615844727, + "learning_rate": 6.859056837416245e-07, + "loss": 0.8405, + "num_input_tokens_seen": 3342860288, + "step": 1594 + }, + { + "epoch": 0.21562245728234336, + "grad_norm": 13.278249740600586, + "learning_rate": 6.855207913942048e-07, + "loss": 0.3923, + "num_input_tokens_seen": 3344957440, + "step": 1595 + }, + { + "epoch": 0.21589368049905072, + "grad_norm": 14.385771751403809, + "learning_rate": 6.85135789984809e-07, + "loss": 0.3845, + "num_input_tokens_seen": 3347054592, + "step": 1596 + }, + { + "epoch": 0.21616490371575806, + "grad_norm": 15.939520835876465, + "learning_rate": 6.847506798232719e-07, + "loss": 0.5702, + "num_input_tokens_seen": 3349151744, + "step": 1597 + }, + { + "epoch": 0.21643612693246542, + "grad_norm": 15.751465797424316, + "learning_rate": 6.843654612195152e-07, + "loss": 0.4505, + "num_input_tokens_seen": 3351248896, + "step": 1598 + }, + { + "epoch": 0.21670735014917278, + "grad_norm": 16.361385345458984, + "learning_rate": 6.839801344835484e-07, + "loss": 0.5949, + "num_input_tokens_seen": 3353346048, + "step": 1599 + }, + { + "epoch": 0.2169785733658801, + "grad_norm": 15.845523834228516, + "learning_rate": 6.835946999254677e-07, + "loss": 0.6065, + "num_input_tokens_seen": 3355443200, + "step": 1600 + }, + { + "epoch": 0.21724979658258747, + "grad_norm": 13.730551719665527, + "learning_rate": 6.83209157855456e-07, + "loss": 0.4201, + "num_input_tokens_seen": 3357540352, + "step": 1601 + }, + { + "epoch": 0.2175210197992948, + "grad_norm": 17.727937698364258, + "learning_rate": 6.828235085837831e-07, + "loss": 0.4174, + "num_input_tokens_seen": 3359637504, + "step": 1602 + }, + { + "epoch": 0.21779224301600217, + "grad_norm": 16.495952606201172, + "learning_rate": 6.824377524208047e-07, + "loss": 0.5056, + "num_input_tokens_seen": 3361734656, + "step": 1603 + }, + { + "epoch": 0.21806346623270953, + "grad_norm": 16.65219497680664, + "learning_rate": 6.820518896769629e-07, + "loss": 0.4776, + "num_input_tokens_seen": 3363831808, + "step": 1604 + }, + { + "epoch": 0.21833468944941686, + "grad_norm": 9.43620777130127, + "learning_rate": 6.816659206627853e-07, + "loss": 0.2099, + "num_input_tokens_seen": 3365928960, + "step": 1605 + }, + { + "epoch": 0.21860591266612422, + "grad_norm": 16.026655197143555, + "learning_rate": 6.812798456888849e-07, + "loss": 0.4603, + "num_input_tokens_seen": 3368026112, + "step": 1606 + }, + { + "epoch": 0.21887713588283156, + "grad_norm": 18.902629852294922, + "learning_rate": 6.808936650659605e-07, + "loss": 0.658, + "num_input_tokens_seen": 3370123264, + "step": 1607 + }, + { + "epoch": 0.21914835909953892, + "grad_norm": 18.937278747558594, + "learning_rate": 6.805073791047951e-07, + "loss": 0.7047, + "num_input_tokens_seen": 3372220416, + "step": 1608 + }, + { + "epoch": 0.21941958231624628, + "grad_norm": 19.952560424804688, + "learning_rate": 6.80120988116257e-07, + "loss": 0.303, + "num_input_tokens_seen": 3374317568, + "step": 1609 + }, + { + "epoch": 0.21969080553295361, + "grad_norm": 16.27387046813965, + "learning_rate": 6.797344924112995e-07, + "loss": 0.3612, + "num_input_tokens_seen": 3376414720, + "step": 1610 + }, + { + "epoch": 0.21996202874966098, + "grad_norm": 18.694629669189453, + "learning_rate": 6.793478923009592e-07, + "loss": 0.5804, + "num_input_tokens_seen": 3378511872, + "step": 1611 + }, + { + "epoch": 0.2202332519663683, + "grad_norm": 11.021830558776855, + "learning_rate": 6.789611880963569e-07, + "loss": 0.2376, + "num_input_tokens_seen": 3380609024, + "step": 1612 + }, + { + "epoch": 0.22050447518307567, + "grad_norm": 15.804499626159668, + "learning_rate": 6.78574380108698e-07, + "loss": 0.4313, + "num_input_tokens_seen": 3382706176, + "step": 1613 + }, + { + "epoch": 0.22077569839978303, + "grad_norm": 15.294432640075684, + "learning_rate": 6.781874686492706e-07, + "loss": 0.4538, + "num_input_tokens_seen": 3384803328, + "step": 1614 + }, + { + "epoch": 0.22104692161649037, + "grad_norm": 14.991973876953125, + "learning_rate": 6.778004540294464e-07, + "loss": 0.3465, + "num_input_tokens_seen": 3386900480, + "step": 1615 + }, + { + "epoch": 0.22131814483319773, + "grad_norm": 22.537893295288086, + "learning_rate": 6.774133365606801e-07, + "loss": 0.7659, + "num_input_tokens_seen": 3388997632, + "step": 1616 + }, + { + "epoch": 0.22158936804990506, + "grad_norm": 21.337339401245117, + "learning_rate": 6.770261165545085e-07, + "loss": 0.797, + "num_input_tokens_seen": 3391094784, + "step": 1617 + }, + { + "epoch": 0.22186059126661242, + "grad_norm": 20.40045166015625, + "learning_rate": 6.766387943225524e-07, + "loss": 0.7643, + "num_input_tokens_seen": 3393191936, + "step": 1618 + }, + { + "epoch": 0.22213181448331978, + "grad_norm": 14.96789836883545, + "learning_rate": 6.762513701765135e-07, + "loss": 0.409, + "num_input_tokens_seen": 3395289088, + "step": 1619 + }, + { + "epoch": 0.22240303770002712, + "grad_norm": 14.952929496765137, + "learning_rate": 6.75863844428176e-07, + "loss": 0.4159, + "num_input_tokens_seen": 3397386240, + "step": 1620 + }, + { + "epoch": 0.22267426091673448, + "grad_norm": 16.563785552978516, + "learning_rate": 6.754762173894061e-07, + "loss": 0.495, + "num_input_tokens_seen": 3399483392, + "step": 1621 + }, + { + "epoch": 0.2229454841334418, + "grad_norm": 13.857934951782227, + "learning_rate": 6.750884893721511e-07, + "loss": 0.2923, + "num_input_tokens_seen": 3401580544, + "step": 1622 + }, + { + "epoch": 0.22321670735014917, + "grad_norm": 16.87193489074707, + "learning_rate": 6.747006606884398e-07, + "loss": 0.4906, + "num_input_tokens_seen": 3403677696, + "step": 1623 + }, + { + "epoch": 0.22348793056685654, + "grad_norm": 28.209985733032227, + "learning_rate": 6.74312731650382e-07, + "loss": 0.5691, + "num_input_tokens_seen": 3405774848, + "step": 1624 + }, + { + "epoch": 0.22375915378356387, + "grad_norm": 18.54442596435547, + "learning_rate": 6.739247025701683e-07, + "loss": 0.4746, + "num_input_tokens_seen": 3407872000, + "step": 1625 + }, + { + "epoch": 0.22403037700027123, + "grad_norm": 21.610734939575195, + "learning_rate": 6.735365737600695e-07, + "loss": 0.8189, + "num_input_tokens_seen": 3409969152, + "step": 1626 + }, + { + "epoch": 0.22430160021697856, + "grad_norm": 13.132299423217773, + "learning_rate": 6.731483455324374e-07, + "loss": 0.3669, + "num_input_tokens_seen": 3412066304, + "step": 1627 + }, + { + "epoch": 0.22457282343368593, + "grad_norm": 10.474742889404297, + "learning_rate": 6.727600181997026e-07, + "loss": 0.2336, + "num_input_tokens_seen": 3414163456, + "step": 1628 + }, + { + "epoch": 0.2248440466503933, + "grad_norm": 20.9769287109375, + "learning_rate": 6.723715920743767e-07, + "loss": 0.5633, + "num_input_tokens_seen": 3416260608, + "step": 1629 + }, + { + "epoch": 0.22511526986710062, + "grad_norm": 14.54977798461914, + "learning_rate": 6.7198306746905e-07, + "loss": 0.3815, + "num_input_tokens_seen": 3418357760, + "step": 1630 + }, + { + "epoch": 0.22538649308380798, + "grad_norm": 19.554121017456055, + "learning_rate": 6.715944446963924e-07, + "loss": 0.5412, + "num_input_tokens_seen": 3420454912, + "step": 1631 + }, + { + "epoch": 0.22565771630051532, + "grad_norm": 23.622020721435547, + "learning_rate": 6.712057240691527e-07, + "loss": 0.9327, + "num_input_tokens_seen": 3422552064, + "step": 1632 + }, + { + "epoch": 0.22592893951722268, + "grad_norm": 13.846428871154785, + "learning_rate": 6.708169059001586e-07, + "loss": 0.3727, + "num_input_tokens_seen": 3424649216, + "step": 1633 + }, + { + "epoch": 0.22620016273393004, + "grad_norm": 20.11954689025879, + "learning_rate": 6.704279905023159e-07, + "loss": 0.5854, + "num_input_tokens_seen": 3426746368, + "step": 1634 + }, + { + "epoch": 0.22647138595063737, + "grad_norm": 28.113088607788086, + "learning_rate": 6.700389781886091e-07, + "loss": 0.6741, + "num_input_tokens_seen": 3428843520, + "step": 1635 + }, + { + "epoch": 0.22674260916734473, + "grad_norm": 12.346856117248535, + "learning_rate": 6.696498692721006e-07, + "loss": 0.3548, + "num_input_tokens_seen": 3430940672, + "step": 1636 + }, + { + "epoch": 0.22701383238405207, + "grad_norm": 16.389089584350586, + "learning_rate": 6.692606640659302e-07, + "loss": 0.5486, + "num_input_tokens_seen": 3433037824, + "step": 1637 + }, + { + "epoch": 0.22728505560075943, + "grad_norm": 24.49221420288086, + "learning_rate": 6.688713628833157e-07, + "loss": 0.7321, + "num_input_tokens_seen": 3435134976, + "step": 1638 + }, + { + "epoch": 0.22755627881746676, + "grad_norm": 16.724529266357422, + "learning_rate": 6.684819660375516e-07, + "loss": 0.4644, + "num_input_tokens_seen": 3437232128, + "step": 1639 + }, + { + "epoch": 0.22782750203417412, + "grad_norm": 20.410425186157227, + "learning_rate": 6.6809247384201e-07, + "loss": 0.6855, + "num_input_tokens_seen": 3439329280, + "step": 1640 + }, + { + "epoch": 0.22809872525088148, + "grad_norm": 14.939435005187988, + "learning_rate": 6.67702886610139e-07, + "loss": 0.4161, + "num_input_tokens_seen": 3441426432, + "step": 1641 + }, + { + "epoch": 0.22836994846758882, + "grad_norm": 16.925172805786133, + "learning_rate": 6.673132046554639e-07, + "loss": 0.6421, + "num_input_tokens_seen": 3443523584, + "step": 1642 + }, + { + "epoch": 0.22864117168429618, + "grad_norm": 11.812236785888672, + "learning_rate": 6.669234282915857e-07, + "loss": 0.2435, + "num_input_tokens_seen": 3445620736, + "step": 1643 + }, + { + "epoch": 0.2289123949010035, + "grad_norm": 16.797849655151367, + "learning_rate": 6.665335578321819e-07, + "loss": 0.4875, + "num_input_tokens_seen": 3447717888, + "step": 1644 + }, + { + "epoch": 0.22918361811771087, + "grad_norm": 20.539812088012695, + "learning_rate": 6.661435935910048e-07, + "loss": 0.6485, + "num_input_tokens_seen": 3449815040, + "step": 1645 + }, + { + "epoch": 0.22945484133441824, + "grad_norm": 12.565629959106445, + "learning_rate": 6.657535358818833e-07, + "loss": 0.3879, + "num_input_tokens_seen": 3451912192, + "step": 1646 + }, + { + "epoch": 0.22972606455112557, + "grad_norm": 21.272024154663086, + "learning_rate": 6.653633850187211e-07, + "loss": 0.6757, + "num_input_tokens_seen": 3454009344, + "step": 1647 + }, + { + "epoch": 0.22999728776783293, + "grad_norm": 17.750259399414062, + "learning_rate": 6.649731413154964e-07, + "loss": 0.4601, + "num_input_tokens_seen": 3456106496, + "step": 1648 + }, + { + "epoch": 0.23026851098454026, + "grad_norm": 17.899972915649414, + "learning_rate": 6.645828050862626e-07, + "loss": 0.5663, + "num_input_tokens_seen": 3458203648, + "step": 1649 + }, + { + "epoch": 0.23053973420124763, + "grad_norm": 10.090818405151367, + "learning_rate": 6.641923766451475e-07, + "loss": 0.1971, + "num_input_tokens_seen": 3460300800, + "step": 1650 + }, + { + "epoch": 0.230810957417955, + "grad_norm": 14.428431510925293, + "learning_rate": 6.63801856306353e-07, + "loss": 0.4693, + "num_input_tokens_seen": 3462397952, + "step": 1651 + }, + { + "epoch": 0.23108218063466232, + "grad_norm": 18.0393123626709, + "learning_rate": 6.634112443841551e-07, + "loss": 0.5592, + "num_input_tokens_seen": 3464495104, + "step": 1652 + }, + { + "epoch": 0.23135340385136968, + "grad_norm": 15.629161834716797, + "learning_rate": 6.630205411929032e-07, + "loss": 0.5662, + "num_input_tokens_seen": 3466592256, + "step": 1653 + }, + { + "epoch": 0.23162462706807702, + "grad_norm": 15.491093635559082, + "learning_rate": 6.626297470470205e-07, + "loss": 0.3379, + "num_input_tokens_seen": 3468689408, + "step": 1654 + }, + { + "epoch": 0.23189585028478438, + "grad_norm": 13.871119499206543, + "learning_rate": 6.622388622610034e-07, + "loss": 0.2769, + "num_input_tokens_seen": 3470786560, + "step": 1655 + }, + { + "epoch": 0.23216707350149174, + "grad_norm": 12.842281341552734, + "learning_rate": 6.618478871494209e-07, + "loss": 0.3845, + "num_input_tokens_seen": 3472883712, + "step": 1656 + }, + { + "epoch": 0.23243829671819907, + "grad_norm": 14.781960487365723, + "learning_rate": 6.61456822026915e-07, + "loss": 0.3783, + "num_input_tokens_seen": 3474980864, + "step": 1657 + }, + { + "epoch": 0.23270951993490643, + "grad_norm": 18.07168960571289, + "learning_rate": 6.610656672081999e-07, + "loss": 0.5104, + "num_input_tokens_seen": 3477078016, + "step": 1658 + }, + { + "epoch": 0.23298074315161377, + "grad_norm": 19.128738403320312, + "learning_rate": 6.606744230080622e-07, + "loss": 0.5309, + "num_input_tokens_seen": 3479175168, + "step": 1659 + }, + { + "epoch": 0.23325196636832113, + "grad_norm": 21.630807876586914, + "learning_rate": 6.602830897413603e-07, + "loss": 0.7695, + "num_input_tokens_seen": 3481272320, + "step": 1660 + }, + { + "epoch": 0.2335231895850285, + "grad_norm": 22.350727081298828, + "learning_rate": 6.598916677230243e-07, + "loss": 0.8973, + "num_input_tokens_seen": 3483369472, + "step": 1661 + }, + { + "epoch": 0.23379441280173582, + "grad_norm": 16.569183349609375, + "learning_rate": 6.59500157268056e-07, + "loss": 0.4983, + "num_input_tokens_seen": 3485466624, + "step": 1662 + }, + { + "epoch": 0.23406563601844319, + "grad_norm": 17.87363052368164, + "learning_rate": 6.591085586915279e-07, + "loss": 0.4791, + "num_input_tokens_seen": 3487563776, + "step": 1663 + }, + { + "epoch": 0.23433685923515052, + "grad_norm": 22.503154754638672, + "learning_rate": 6.587168723085836e-07, + "loss": 0.7941, + "num_input_tokens_seen": 3489660928, + "step": 1664 + }, + { + "epoch": 0.23460808245185788, + "grad_norm": 16.588794708251953, + "learning_rate": 6.583250984344374e-07, + "loss": 0.4266, + "num_input_tokens_seen": 3491758080, + "step": 1665 + }, + { + "epoch": 0.23487930566856524, + "grad_norm": 20.612022399902344, + "learning_rate": 6.57933237384374e-07, + "loss": 0.3887, + "num_input_tokens_seen": 3493855232, + "step": 1666 + }, + { + "epoch": 0.23515052888527258, + "grad_norm": 19.953716278076172, + "learning_rate": 6.575412894737484e-07, + "loss": 0.655, + "num_input_tokens_seen": 3495952384, + "step": 1667 + }, + { + "epoch": 0.23542175210197994, + "grad_norm": 21.730518341064453, + "learning_rate": 6.571492550179853e-07, + "loss": 0.8792, + "num_input_tokens_seen": 3498049536, + "step": 1668 + }, + { + "epoch": 0.23569297531868727, + "grad_norm": 20.50918197631836, + "learning_rate": 6.567571343325791e-07, + "loss": 0.6458, + "num_input_tokens_seen": 3500146688, + "step": 1669 + }, + { + "epoch": 0.23596419853539463, + "grad_norm": 18.293838500976562, + "learning_rate": 6.563649277330935e-07, + "loss": 0.5007, + "num_input_tokens_seen": 3502243840, + "step": 1670 + }, + { + "epoch": 0.236235421752102, + "grad_norm": 13.126777648925781, + "learning_rate": 6.559726355351617e-07, + "loss": 0.4134, + "num_input_tokens_seen": 3504340992, + "step": 1671 + }, + { + "epoch": 0.23650664496880933, + "grad_norm": 12.992348670959473, + "learning_rate": 6.555802580544853e-07, + "loss": 0.345, + "num_input_tokens_seen": 3506438144, + "step": 1672 + }, + { + "epoch": 0.2367778681855167, + "grad_norm": 15.970709800720215, + "learning_rate": 6.551877956068349e-07, + "loss": 0.413, + "num_input_tokens_seen": 3508535296, + "step": 1673 + }, + { + "epoch": 0.23704909140222402, + "grad_norm": 19.215749740600586, + "learning_rate": 6.547952485080491e-07, + "loss": 0.5876, + "num_input_tokens_seen": 3510632448, + "step": 1674 + }, + { + "epoch": 0.23732031461893138, + "grad_norm": 10.450196266174316, + "learning_rate": 6.544026170740352e-07, + "loss": 0.2881, + "num_input_tokens_seen": 3512729600, + "step": 1675 + }, + { + "epoch": 0.23759153783563874, + "grad_norm": 13.23245906829834, + "learning_rate": 6.540099016207679e-07, + "loss": 0.3993, + "num_input_tokens_seen": 3514826752, + "step": 1676 + }, + { + "epoch": 0.23786276105234608, + "grad_norm": 13.348737716674805, + "learning_rate": 6.536171024642896e-07, + "loss": 0.41, + "num_input_tokens_seen": 3516923904, + "step": 1677 + }, + { + "epoch": 0.23813398426905344, + "grad_norm": 15.210914611816406, + "learning_rate": 6.532242199207103e-07, + "loss": 0.3554, + "num_input_tokens_seen": 3519021056, + "step": 1678 + }, + { + "epoch": 0.23840520748576077, + "grad_norm": 14.020462989807129, + "learning_rate": 6.528312543062066e-07, + "loss": 0.2304, + "num_input_tokens_seen": 3521118208, + "step": 1679 + }, + { + "epoch": 0.23867643070246813, + "grad_norm": 18.36133575439453, + "learning_rate": 6.524382059370226e-07, + "loss": 0.5624, + "num_input_tokens_seen": 3523215360, + "step": 1680 + }, + { + "epoch": 0.23894765391917547, + "grad_norm": 16.818580627441406, + "learning_rate": 6.520450751294685e-07, + "loss": 0.5282, + "num_input_tokens_seen": 3525312512, + "step": 1681 + }, + { + "epoch": 0.23921887713588283, + "grad_norm": 25.353300094604492, + "learning_rate": 6.516518621999209e-07, + "loss": 0.6669, + "num_input_tokens_seen": 3527409664, + "step": 1682 + }, + { + "epoch": 0.2394901003525902, + "grad_norm": 18.05834197998047, + "learning_rate": 6.512585674648227e-07, + "loss": 0.6045, + "num_input_tokens_seen": 3529506816, + "step": 1683 + }, + { + "epoch": 0.23976132356929752, + "grad_norm": 11.763015747070312, + "learning_rate": 6.50865191240683e-07, + "loss": 0.3037, + "num_input_tokens_seen": 3531603968, + "step": 1684 + }, + { + "epoch": 0.24003254678600489, + "grad_norm": 19.146381378173828, + "learning_rate": 6.504717338440751e-07, + "loss": 0.5238, + "num_input_tokens_seen": 3533701120, + "step": 1685 + }, + { + "epoch": 0.24030377000271222, + "grad_norm": 13.004151344299316, + "learning_rate": 6.500781955916393e-07, + "loss": 0.3522, + "num_input_tokens_seen": 3535798272, + "step": 1686 + }, + { + "epoch": 0.24057499321941958, + "grad_norm": 17.20205307006836, + "learning_rate": 6.4968457680008e-07, + "loss": 0.5728, + "num_input_tokens_seen": 3537895424, + "step": 1687 + }, + { + "epoch": 0.24084621643612694, + "grad_norm": 12.467523574829102, + "learning_rate": 6.492908777861664e-07, + "loss": 0.3135, + "num_input_tokens_seen": 3539992576, + "step": 1688 + }, + { + "epoch": 0.24111743965283428, + "grad_norm": 21.274076461791992, + "learning_rate": 6.488970988667327e-07, + "loss": 0.6591, + "num_input_tokens_seen": 3542089728, + "step": 1689 + }, + { + "epoch": 0.24138866286954164, + "grad_norm": 24.0526180267334, + "learning_rate": 6.485032403586772e-07, + "loss": 0.927, + "num_input_tokens_seen": 3544186880, + "step": 1690 + }, + { + "epoch": 0.24165988608624897, + "grad_norm": 16.187849044799805, + "learning_rate": 6.481093025789621e-07, + "loss": 0.3657, + "num_input_tokens_seen": 3546284032, + "step": 1691 + }, + { + "epoch": 0.24193110930295633, + "grad_norm": 25.65354347229004, + "learning_rate": 6.477152858446136e-07, + "loss": 0.5053, + "num_input_tokens_seen": 3548381184, + "step": 1692 + }, + { + "epoch": 0.2422023325196637, + "grad_norm": 16.25813865661621, + "learning_rate": 6.473211904727216e-07, + "loss": 0.5336, + "num_input_tokens_seen": 3550478336, + "step": 1693 + }, + { + "epoch": 0.24247355573637103, + "grad_norm": 19.955923080444336, + "learning_rate": 6.469270167804386e-07, + "loss": 0.5435, + "num_input_tokens_seen": 3552575488, + "step": 1694 + }, + { + "epoch": 0.2427447789530784, + "grad_norm": 12.152280807495117, + "learning_rate": 6.46532765084981e-07, + "loss": 0.3573, + "num_input_tokens_seen": 3554672640, + "step": 1695 + }, + { + "epoch": 0.24301600216978572, + "grad_norm": 13.028290748596191, + "learning_rate": 6.461384357036274e-07, + "loss": 0.2561, + "num_input_tokens_seen": 3556769792, + "step": 1696 + }, + { + "epoch": 0.24328722538649308, + "grad_norm": 20.247438430786133, + "learning_rate": 6.457440289537191e-07, + "loss": 0.4922, + "num_input_tokens_seen": 3558866944, + "step": 1697 + }, + { + "epoch": 0.24355844860320044, + "grad_norm": 12.755880355834961, + "learning_rate": 6.453495451526595e-07, + "loss": 0.301, + "num_input_tokens_seen": 3560964096, + "step": 1698 + }, + { + "epoch": 0.24382967181990778, + "grad_norm": 13.987512588500977, + "learning_rate": 6.449549846179144e-07, + "loss": 0.3143, + "num_input_tokens_seen": 3563061248, + "step": 1699 + }, + { + "epoch": 0.24410089503661514, + "grad_norm": 26.919103622436523, + "learning_rate": 6.445603476670109e-07, + "loss": 0.5411, + "num_input_tokens_seen": 3565158400, + "step": 1700 + }, + { + "epoch": 0.24437211825332247, + "grad_norm": 21.93929672241211, + "learning_rate": 6.44165634617538e-07, + "loss": 0.7823, + "num_input_tokens_seen": 3567255552, + "step": 1701 + }, + { + "epoch": 0.24464334147002983, + "grad_norm": 16.736602783203125, + "learning_rate": 6.437708457871455e-07, + "loss": 0.4474, + "num_input_tokens_seen": 3569352704, + "step": 1702 + }, + { + "epoch": 0.2449145646867372, + "grad_norm": 17.7639217376709, + "learning_rate": 6.433759814935447e-07, + "loss": 0.5609, + "num_input_tokens_seen": 3571449856, + "step": 1703 + }, + { + "epoch": 0.24518578790344453, + "grad_norm": 12.938990592956543, + "learning_rate": 6.429810420545072e-07, + "loss": 0.3755, + "num_input_tokens_seen": 3573547008, + "step": 1704 + }, + { + "epoch": 0.2454570111201519, + "grad_norm": 19.107515335083008, + "learning_rate": 6.425860277878651e-07, + "loss": 0.6132, + "num_input_tokens_seen": 3575644160, + "step": 1705 + }, + { + "epoch": 0.24572823433685922, + "grad_norm": 16.069114685058594, + "learning_rate": 6.42190939011511e-07, + "loss": 0.4449, + "num_input_tokens_seen": 3577741312, + "step": 1706 + }, + { + "epoch": 0.2459994575535666, + "grad_norm": 18.453479766845703, + "learning_rate": 6.417957760433974e-07, + "loss": 0.4561, + "num_input_tokens_seen": 3579838464, + "step": 1707 + }, + { + "epoch": 0.24627068077027395, + "grad_norm": 22.410266876220703, + "learning_rate": 6.414005392015364e-07, + "loss": 0.4495, + "num_input_tokens_seen": 3581935616, + "step": 1708 + }, + { + "epoch": 0.24654190398698128, + "grad_norm": 18.5291690826416, + "learning_rate": 6.410052288039994e-07, + "loss": 0.647, + "num_input_tokens_seen": 3584032768, + "step": 1709 + }, + { + "epoch": 0.24681312720368864, + "grad_norm": 15.33918571472168, + "learning_rate": 6.406098451689171e-07, + "loss": 0.3701, + "num_input_tokens_seen": 3586129920, + "step": 1710 + }, + { + "epoch": 0.24708435042039598, + "grad_norm": 16.662532806396484, + "learning_rate": 6.402143886144797e-07, + "loss": 0.4231, + "num_input_tokens_seen": 3588227072, + "step": 1711 + }, + { + "epoch": 0.24735557363710334, + "grad_norm": 16.196252822875977, + "learning_rate": 6.39818859458935e-07, + "loss": 0.431, + "num_input_tokens_seen": 3590324224, + "step": 1712 + }, + { + "epoch": 0.2476267968538107, + "grad_norm": 24.346651077270508, + "learning_rate": 6.394232580205903e-07, + "loss": 0.9444, + "num_input_tokens_seen": 3592421376, + "step": 1713 + }, + { + "epoch": 0.24789802007051803, + "grad_norm": 18.19024658203125, + "learning_rate": 6.390275846178102e-07, + "loss": 0.6498, + "num_input_tokens_seen": 3594518528, + "step": 1714 + }, + { + "epoch": 0.2481692432872254, + "grad_norm": 11.127647399902344, + "learning_rate": 6.386318395690178e-07, + "loss": 0.279, + "num_input_tokens_seen": 3596615680, + "step": 1715 + }, + { + "epoch": 0.24844046650393273, + "grad_norm": 21.56477165222168, + "learning_rate": 6.382360231926935e-07, + "loss": 0.5013, + "num_input_tokens_seen": 3598712832, + "step": 1716 + }, + { + "epoch": 0.2487116897206401, + "grad_norm": 15.533906936645508, + "learning_rate": 6.378401358073754e-07, + "loss": 0.469, + "num_input_tokens_seen": 3600809984, + "step": 1717 + }, + { + "epoch": 0.24898291293734745, + "grad_norm": 16.186729431152344, + "learning_rate": 6.374441777316587e-07, + "loss": 0.5628, + "num_input_tokens_seen": 3602907136, + "step": 1718 + }, + { + "epoch": 0.24925413615405478, + "grad_norm": 14.087782859802246, + "learning_rate": 6.370481492841952e-07, + "loss": 0.4068, + "num_input_tokens_seen": 3605004288, + "step": 1719 + }, + { + "epoch": 0.24952535937076215, + "grad_norm": 14.505183219909668, + "learning_rate": 6.366520507836934e-07, + "loss": 0.4257, + "num_input_tokens_seen": 3607101440, + "step": 1720 + }, + { + "epoch": 0.24979658258746948, + "grad_norm": 14.800230979919434, + "learning_rate": 6.362558825489187e-07, + "loss": 0.3571, + "num_input_tokens_seen": 3609198592, + "step": 1721 + }, + { + "epoch": 0.2500678058041768, + "grad_norm": 18.357351303100586, + "learning_rate": 6.35859644898692e-07, + "loss": 0.5663, + "num_input_tokens_seen": 3611295744, + "step": 1722 + }, + { + "epoch": 0.2503390290208842, + "grad_norm": 18.787809371948242, + "learning_rate": 6.354633381518901e-07, + "loss": 0.6229, + "num_input_tokens_seen": 3613392896, + "step": 1723 + }, + { + "epoch": 0.25061025223759154, + "grad_norm": 22.12680435180664, + "learning_rate": 6.350669626274461e-07, + "loss": 0.4966, + "num_input_tokens_seen": 3615490048, + "step": 1724 + }, + { + "epoch": 0.2508814754542989, + "grad_norm": 14.827641487121582, + "learning_rate": 6.346705186443474e-07, + "loss": 0.4202, + "num_input_tokens_seen": 3617587200, + "step": 1725 + }, + { + "epoch": 0.25115269867100626, + "grad_norm": 16.624746322631836, + "learning_rate": 6.342740065216371e-07, + "loss": 0.5388, + "num_input_tokens_seen": 3619684352, + "step": 1726 + }, + { + "epoch": 0.25142392188771356, + "grad_norm": 13.560758590698242, + "learning_rate": 6.338774265784134e-07, + "loss": 0.3481, + "num_input_tokens_seen": 3621781504, + "step": 1727 + }, + { + "epoch": 0.2516951451044209, + "grad_norm": 17.08588409423828, + "learning_rate": 6.334807791338286e-07, + "loss": 0.4449, + "num_input_tokens_seen": 3623878656, + "step": 1728 + }, + { + "epoch": 0.2519663683211283, + "grad_norm": 25.524866104125977, + "learning_rate": 6.330840645070894e-07, + "loss": 0.6385, + "num_input_tokens_seen": 3625975808, + "step": 1729 + }, + { + "epoch": 0.25223759153783565, + "grad_norm": 16.190298080444336, + "learning_rate": 6.326872830174566e-07, + "loss": 0.5051, + "num_input_tokens_seen": 3628072960, + "step": 1730 + }, + { + "epoch": 0.252508814754543, + "grad_norm": 17.590593338012695, + "learning_rate": 6.32290434984245e-07, + "loss": 0.4167, + "num_input_tokens_seen": 3630170112, + "step": 1731 + }, + { + "epoch": 0.2527800379712503, + "grad_norm": 23.287769317626953, + "learning_rate": 6.318935207268227e-07, + "loss": 0.5106, + "num_input_tokens_seen": 3632267264, + "step": 1732 + }, + { + "epoch": 0.2530512611879577, + "grad_norm": 17.762920379638672, + "learning_rate": 6.314965405646113e-07, + "loss": 0.5227, + "num_input_tokens_seen": 3634364416, + "step": 1733 + }, + { + "epoch": 0.25332248440466504, + "grad_norm": 23.90679931640625, + "learning_rate": 6.31099494817085e-07, + "loss": 0.7039, + "num_input_tokens_seen": 3636461568, + "step": 1734 + }, + { + "epoch": 0.2535937076213724, + "grad_norm": 14.712848663330078, + "learning_rate": 6.307023838037714e-07, + "loss": 0.4308, + "num_input_tokens_seen": 3638558720, + "step": 1735 + }, + { + "epoch": 0.25386493083807976, + "grad_norm": 20.47460174560547, + "learning_rate": 6.303052078442503e-07, + "loss": 0.6353, + "num_input_tokens_seen": 3640655872, + "step": 1736 + }, + { + "epoch": 0.25413615405478707, + "grad_norm": 15.011195182800293, + "learning_rate": 6.299079672581537e-07, + "loss": 0.4013, + "num_input_tokens_seen": 3642753024, + "step": 1737 + }, + { + "epoch": 0.25440737727149443, + "grad_norm": 17.400686264038086, + "learning_rate": 6.295106623651661e-07, + "loss": 0.5446, + "num_input_tokens_seen": 3644850176, + "step": 1738 + }, + { + "epoch": 0.2546786004882018, + "grad_norm": 11.729777336120605, + "learning_rate": 6.291132934850225e-07, + "loss": 0.2983, + "num_input_tokens_seen": 3646947328, + "step": 1739 + }, + { + "epoch": 0.25494982370490915, + "grad_norm": 21.60021209716797, + "learning_rate": 6.28715860937511e-07, + "loss": 0.8498, + "num_input_tokens_seen": 3649044480, + "step": 1740 + }, + { + "epoch": 0.2552210469216165, + "grad_norm": 18.79129981994629, + "learning_rate": 6.283183650424701e-07, + "loss": 0.5259, + "num_input_tokens_seen": 3651141632, + "step": 1741 + }, + { + "epoch": 0.2554922701383238, + "grad_norm": 19.276283264160156, + "learning_rate": 6.279208061197892e-07, + "loss": 0.6427, + "num_input_tokens_seen": 3653238784, + "step": 1742 + }, + { + "epoch": 0.2557634933550312, + "grad_norm": 13.992378234863281, + "learning_rate": 6.275231844894086e-07, + "loss": 0.3971, + "num_input_tokens_seen": 3655335936, + "step": 1743 + }, + { + "epoch": 0.25603471657173854, + "grad_norm": 16.478370666503906, + "learning_rate": 6.271255004713192e-07, + "loss": 0.4849, + "num_input_tokens_seen": 3657433088, + "step": 1744 + }, + { + "epoch": 0.2563059397884459, + "grad_norm": 11.54108715057373, + "learning_rate": 6.267277543855618e-07, + "loss": 0.3075, + "num_input_tokens_seen": 3659530240, + "step": 1745 + }, + { + "epoch": 0.25657716300515326, + "grad_norm": 19.243192672729492, + "learning_rate": 6.263299465522274e-07, + "loss": 0.5958, + "num_input_tokens_seen": 3661627392, + "step": 1746 + }, + { + "epoch": 0.25684838622186057, + "grad_norm": 14.843926429748535, + "learning_rate": 6.259320772914566e-07, + "loss": 0.4542, + "num_input_tokens_seen": 3663724544, + "step": 1747 + }, + { + "epoch": 0.25711960943856793, + "grad_norm": 22.970197677612305, + "learning_rate": 6.255341469234393e-07, + "loss": 0.8589, + "num_input_tokens_seen": 3665821696, + "step": 1748 + }, + { + "epoch": 0.2573908326552753, + "grad_norm": 16.67009162902832, + "learning_rate": 6.25136155768415e-07, + "loss": 0.5667, + "num_input_tokens_seen": 3667918848, + "step": 1749 + }, + { + "epoch": 0.25766205587198265, + "grad_norm": 13.778572082519531, + "learning_rate": 6.247381041466716e-07, + "loss": 0.4136, + "num_input_tokens_seen": 3670016000, + "step": 1750 + }, + { + "epoch": 0.25793327908869, + "grad_norm": 11.786734580993652, + "learning_rate": 6.243399923785459e-07, + "loss": 0.3029, + "num_input_tokens_seen": 3672113152, + "step": 1751 + }, + { + "epoch": 0.2582045023053973, + "grad_norm": 18.904006958007812, + "learning_rate": 6.239418207844232e-07, + "loss": 0.614, + "num_input_tokens_seen": 3674210304, + "step": 1752 + }, + { + "epoch": 0.2584757255221047, + "grad_norm": 16.964792251586914, + "learning_rate": 6.23543589684737e-07, + "loss": 0.4589, + "num_input_tokens_seen": 3676307456, + "step": 1753 + }, + { + "epoch": 0.25874694873881204, + "grad_norm": 15.05766487121582, + "learning_rate": 6.231452993999683e-07, + "loss": 0.4943, + "num_input_tokens_seen": 3678404608, + "step": 1754 + }, + { + "epoch": 0.2590181719555194, + "grad_norm": 18.096086502075195, + "learning_rate": 6.22746950250646e-07, + "loss": 0.558, + "num_input_tokens_seen": 3680501760, + "step": 1755 + }, + { + "epoch": 0.25928939517222677, + "grad_norm": 15.403691291809082, + "learning_rate": 6.223485425573463e-07, + "loss": 0.4175, + "num_input_tokens_seen": 3682598912, + "step": 1756 + }, + { + "epoch": 0.2595606183889341, + "grad_norm": 14.380963325500488, + "learning_rate": 6.219500766406926e-07, + "loss": 0.3693, + "num_input_tokens_seen": 3684696064, + "step": 1757 + }, + { + "epoch": 0.25983184160564143, + "grad_norm": 20.143505096435547, + "learning_rate": 6.215515528213553e-07, + "loss": 0.5975, + "num_input_tokens_seen": 3686793216, + "step": 1758 + }, + { + "epoch": 0.2601030648223488, + "grad_norm": 18.09722900390625, + "learning_rate": 6.211529714200509e-07, + "loss": 0.3873, + "num_input_tokens_seen": 3688890368, + "step": 1759 + }, + { + "epoch": 0.26037428803905616, + "grad_norm": 18.81859588623047, + "learning_rate": 6.207543327575426e-07, + "loss": 0.511, + "num_input_tokens_seen": 3690987520, + "step": 1760 + }, + { + "epoch": 0.2606455112557635, + "grad_norm": 18.815465927124023, + "learning_rate": 6.203556371546399e-07, + "loss": 0.5123, + "num_input_tokens_seen": 3693084672, + "step": 1761 + }, + { + "epoch": 0.2609167344724708, + "grad_norm": 15.527567863464355, + "learning_rate": 6.199568849321975e-07, + "loss": 0.4696, + "num_input_tokens_seen": 3695181824, + "step": 1762 + }, + { + "epoch": 0.2611879576891782, + "grad_norm": 15.812019348144531, + "learning_rate": 6.195580764111163e-07, + "loss": 0.4165, + "num_input_tokens_seen": 3697278976, + "step": 1763 + }, + { + "epoch": 0.26145918090588555, + "grad_norm": 11.180591583251953, + "learning_rate": 6.191592119123419e-07, + "loss": 0.2559, + "num_input_tokens_seen": 3699376128, + "step": 1764 + }, + { + "epoch": 0.2617304041225929, + "grad_norm": 8.583144187927246, + "learning_rate": 6.187602917568655e-07, + "loss": 0.1839, + "num_input_tokens_seen": 3701473280, + "step": 1765 + }, + { + "epoch": 0.26200162733930027, + "grad_norm": 12.009425163269043, + "learning_rate": 6.183613162657229e-07, + "loss": 0.2641, + "num_input_tokens_seen": 3703570432, + "step": 1766 + }, + { + "epoch": 0.2622728505560076, + "grad_norm": 14.513581275939941, + "learning_rate": 6.179622857599942e-07, + "loss": 0.4814, + "num_input_tokens_seen": 3705667584, + "step": 1767 + }, + { + "epoch": 0.26254407377271494, + "grad_norm": 23.496122360229492, + "learning_rate": 6.175632005608043e-07, + "loss": 0.6047, + "num_input_tokens_seen": 3707764736, + "step": 1768 + }, + { + "epoch": 0.2628152969894223, + "grad_norm": 19.470165252685547, + "learning_rate": 6.171640609893213e-07, + "loss": 0.4534, + "num_input_tokens_seen": 3709861888, + "step": 1769 + }, + { + "epoch": 0.26308652020612966, + "grad_norm": 15.71687126159668, + "learning_rate": 6.167648673667581e-07, + "loss": 0.403, + "num_input_tokens_seen": 3711959040, + "step": 1770 + }, + { + "epoch": 0.263357743422837, + "grad_norm": 23.09616470336914, + "learning_rate": 6.1636562001437e-07, + "loss": 0.6622, + "num_input_tokens_seen": 3714056192, + "step": 1771 + }, + { + "epoch": 0.2636289666395443, + "grad_norm": 15.076805114746094, + "learning_rate": 6.159663192534561e-07, + "loss": 0.4457, + "num_input_tokens_seen": 3716153344, + "step": 1772 + }, + { + "epoch": 0.2639001898562517, + "grad_norm": 13.646811485290527, + "learning_rate": 6.155669654053592e-07, + "loss": 0.3627, + "num_input_tokens_seen": 3718250496, + "step": 1773 + }, + { + "epoch": 0.26417141307295905, + "grad_norm": 20.863006591796875, + "learning_rate": 6.151675587914631e-07, + "loss": 0.6803, + "num_input_tokens_seen": 3720347648, + "step": 1774 + }, + { + "epoch": 0.2644426362896664, + "grad_norm": 22.40181541442871, + "learning_rate": 6.147680997331958e-07, + "loss": 0.7848, + "num_input_tokens_seen": 3722444800, + "step": 1775 + }, + { + "epoch": 0.26471385950637377, + "grad_norm": 20.66728973388672, + "learning_rate": 6.143685885520263e-07, + "loss": 0.5724, + "num_input_tokens_seen": 3724541952, + "step": 1776 + }, + { + "epoch": 0.2649850827230811, + "grad_norm": 16.2211971282959, + "learning_rate": 6.13969025569466e-07, + "loss": 0.3131, + "num_input_tokens_seen": 3726639104, + "step": 1777 + }, + { + "epoch": 0.26525630593978844, + "grad_norm": 14.444147109985352, + "learning_rate": 6.135694111070684e-07, + "loss": 0.3258, + "num_input_tokens_seen": 3728736256, + "step": 1778 + }, + { + "epoch": 0.2655275291564958, + "grad_norm": 15.806771278381348, + "learning_rate": 6.131697454864277e-07, + "loss": 0.4398, + "num_input_tokens_seen": 3730833408, + "step": 1779 + }, + { + "epoch": 0.26579875237320316, + "grad_norm": 19.829689025878906, + "learning_rate": 6.127700290291794e-07, + "loss": 0.6636, + "num_input_tokens_seen": 3732930560, + "step": 1780 + }, + { + "epoch": 0.2660699755899105, + "grad_norm": 16.643007278442383, + "learning_rate": 6.123702620570005e-07, + "loss": 0.4753, + "num_input_tokens_seen": 3735027712, + "step": 1781 + }, + { + "epoch": 0.26634119880661783, + "grad_norm": 21.29400062561035, + "learning_rate": 6.11970444891608e-07, + "loss": 0.4912, + "num_input_tokens_seen": 3737124864, + "step": 1782 + }, + { + "epoch": 0.2666124220233252, + "grad_norm": 14.493866920471191, + "learning_rate": 6.115705778547597e-07, + "loss": 0.2683, + "num_input_tokens_seen": 3739222016, + "step": 1783 + }, + { + "epoch": 0.26688364524003255, + "grad_norm": 19.871868133544922, + "learning_rate": 6.111706612682532e-07, + "loss": 0.6509, + "num_input_tokens_seen": 3741319168, + "step": 1784 + }, + { + "epoch": 0.2671548684567399, + "grad_norm": 15.187943458557129, + "learning_rate": 6.107706954539261e-07, + "loss": 0.3895, + "num_input_tokens_seen": 3743416320, + "step": 1785 + }, + { + "epoch": 0.2674260916734473, + "grad_norm": 15.838203430175781, + "learning_rate": 6.103706807336559e-07, + "loss": 0.54, + "num_input_tokens_seen": 3745513472, + "step": 1786 + }, + { + "epoch": 0.2676973148901546, + "grad_norm": 12.953702926635742, + "learning_rate": 6.099706174293592e-07, + "loss": 0.4153, + "num_input_tokens_seen": 3747610624, + "step": 1787 + }, + { + "epoch": 0.26796853810686194, + "grad_norm": 14.383577346801758, + "learning_rate": 6.095705058629914e-07, + "loss": 0.2817, + "num_input_tokens_seen": 3749707776, + "step": 1788 + }, + { + "epoch": 0.2682397613235693, + "grad_norm": 14.063627243041992, + "learning_rate": 6.091703463565475e-07, + "loss": 0.3399, + "num_input_tokens_seen": 3751804928, + "step": 1789 + }, + { + "epoch": 0.26851098454027666, + "grad_norm": 16.237741470336914, + "learning_rate": 6.087701392320606e-07, + "loss": 0.5057, + "num_input_tokens_seen": 3753902080, + "step": 1790 + }, + { + "epoch": 0.26878220775698397, + "grad_norm": 12.57223129272461, + "learning_rate": 6.083698848116018e-07, + "loss": 0.3234, + "num_input_tokens_seen": 3755999232, + "step": 1791 + }, + { + "epoch": 0.26905343097369133, + "grad_norm": 26.410602569580078, + "learning_rate": 6.079695834172808e-07, + "loss": 0.9309, + "num_input_tokens_seen": 3758096384, + "step": 1792 + }, + { + "epoch": 0.2693246541903987, + "grad_norm": 17.888185501098633, + "learning_rate": 6.075692353712451e-07, + "loss": 0.4636, + "num_input_tokens_seen": 3760193536, + "step": 1793 + }, + { + "epoch": 0.26959587740710605, + "grad_norm": 20.749839782714844, + "learning_rate": 6.071688409956793e-07, + "loss": 0.5735, + "num_input_tokens_seen": 3762290688, + "step": 1794 + }, + { + "epoch": 0.2698671006238134, + "grad_norm": 23.78219223022461, + "learning_rate": 6.06768400612806e-07, + "loss": 0.5885, + "num_input_tokens_seen": 3764387840, + "step": 1795 + }, + { + "epoch": 0.2701383238405207, + "grad_norm": 25.068920135498047, + "learning_rate": 6.063679145448838e-07, + "loss": 0.7039, + "num_input_tokens_seen": 3766484992, + "step": 1796 + }, + { + "epoch": 0.2704095470572281, + "grad_norm": 15.582983016967773, + "learning_rate": 6.05967383114209e-07, + "loss": 0.3652, + "num_input_tokens_seen": 3768582144, + "step": 1797 + }, + { + "epoch": 0.27068077027393544, + "grad_norm": 27.859983444213867, + "learning_rate": 6.055668066431142e-07, + "loss": 0.6668, + "num_input_tokens_seen": 3770679296, + "step": 1798 + }, + { + "epoch": 0.2709519934906428, + "grad_norm": 12.221877098083496, + "learning_rate": 6.051661854539677e-07, + "loss": 0.3411, + "num_input_tokens_seen": 3772776448, + "step": 1799 + }, + { + "epoch": 0.27122321670735017, + "grad_norm": 20.4038143157959, + "learning_rate": 6.047655198691742e-07, + "loss": 0.794, + "num_input_tokens_seen": 3774873600, + "step": 1800 + }, + { + "epoch": 0.2714944399240575, + "grad_norm": 13.717342376708984, + "learning_rate": 6.043648102111745e-07, + "loss": 0.2082, + "num_input_tokens_seen": 3776970752, + "step": 1801 + }, + { + "epoch": 0.27176566314076483, + "grad_norm": 19.058935165405273, + "learning_rate": 6.039640568024443e-07, + "loss": 0.6036, + "num_input_tokens_seen": 3779067904, + "step": 1802 + }, + { + "epoch": 0.2720368863574722, + "grad_norm": 15.670707702636719, + "learning_rate": 6.035632599654946e-07, + "loss": 0.5712, + "num_input_tokens_seen": 3781165056, + "step": 1803 + }, + { + "epoch": 0.27230810957417956, + "grad_norm": 13.817047119140625, + "learning_rate": 6.031624200228715e-07, + "loss": 0.3603, + "num_input_tokens_seen": 3783262208, + "step": 1804 + }, + { + "epoch": 0.2725793327908869, + "grad_norm": 15.026437759399414, + "learning_rate": 6.027615372971558e-07, + "loss": 0.4361, + "num_input_tokens_seen": 3785359360, + "step": 1805 + }, + { + "epoch": 0.2728505560075942, + "grad_norm": 16.934778213500977, + "learning_rate": 6.023606121109626e-07, + "loss": 0.4687, + "num_input_tokens_seen": 3787456512, + "step": 1806 + }, + { + "epoch": 0.2731217792243016, + "grad_norm": 18.090913772583008, + "learning_rate": 6.019596447869413e-07, + "loss": 0.423, + "num_input_tokens_seen": 3789553664, + "step": 1807 + }, + { + "epoch": 0.27339300244100895, + "grad_norm": 15.30374813079834, + "learning_rate": 6.015586356477749e-07, + "loss": 0.3783, + "num_input_tokens_seen": 3791650816, + "step": 1808 + }, + { + "epoch": 0.2736642256577163, + "grad_norm": 21.7882080078125, + "learning_rate": 6.011575850161805e-07, + "loss": 0.576, + "num_input_tokens_seen": 3793747968, + "step": 1809 + }, + { + "epoch": 0.27393544887442367, + "grad_norm": 17.875171661376953, + "learning_rate": 6.007564932149086e-07, + "loss": 0.5201, + "num_input_tokens_seen": 3795845120, + "step": 1810 + }, + { + "epoch": 0.274206672091131, + "grad_norm": 15.981590270996094, + "learning_rate": 6.003553605667423e-07, + "loss": 0.3927, + "num_input_tokens_seen": 3797942272, + "step": 1811 + }, + { + "epoch": 0.27447789530783834, + "grad_norm": 14.422945022583008, + "learning_rate": 5.999541873944979e-07, + "loss": 0.4337, + "num_input_tokens_seen": 3800039424, + "step": 1812 + }, + { + "epoch": 0.2747491185245457, + "grad_norm": 17.687833786010742, + "learning_rate": 5.995529740210244e-07, + "loss": 0.4126, + "num_input_tokens_seen": 3802136576, + "step": 1813 + }, + { + "epoch": 0.27502034174125306, + "grad_norm": 11.91661262512207, + "learning_rate": 5.99151720769203e-07, + "loss": 0.2607, + "num_input_tokens_seen": 3804233728, + "step": 1814 + }, + { + "epoch": 0.2752915649579604, + "grad_norm": 13.01582145690918, + "learning_rate": 5.987504279619473e-07, + "loss": 0.4029, + "num_input_tokens_seen": 3806330880, + "step": 1815 + }, + { + "epoch": 0.2755627881746677, + "grad_norm": 14.431076049804688, + "learning_rate": 5.98349095922202e-07, + "loss": 0.3179, + "num_input_tokens_seen": 3808428032, + "step": 1816 + }, + { + "epoch": 0.2758340113913751, + "grad_norm": 23.85053825378418, + "learning_rate": 5.979477249729442e-07, + "loss": 0.9278, + "num_input_tokens_seen": 3810525184, + "step": 1817 + }, + { + "epoch": 0.27610523460808245, + "grad_norm": 16.68154525756836, + "learning_rate": 5.975463154371822e-07, + "loss": 0.5163, + "num_input_tokens_seen": 3812622336, + "step": 1818 + }, + { + "epoch": 0.2763764578247898, + "grad_norm": 20.487937927246094, + "learning_rate": 5.971448676379544e-07, + "loss": 0.4715, + "num_input_tokens_seen": 3814719488, + "step": 1819 + }, + { + "epoch": 0.2766476810414972, + "grad_norm": 11.562193870544434, + "learning_rate": 5.967433818983311e-07, + "loss": 0.3283, + "num_input_tokens_seen": 3816816640, + "step": 1820 + }, + { + "epoch": 0.2769189042582045, + "grad_norm": 14.949207305908203, + "learning_rate": 5.963418585414129e-07, + "loss": 0.3784, + "num_input_tokens_seen": 3818913792, + "step": 1821 + }, + { + "epoch": 0.27719012747491184, + "grad_norm": 13.64500617980957, + "learning_rate": 5.959402978903306e-07, + "loss": 0.2296, + "num_input_tokens_seen": 3821010944, + "step": 1822 + }, + { + "epoch": 0.2774613506916192, + "grad_norm": 20.391284942626953, + "learning_rate": 5.955387002682445e-07, + "loss": 0.7141, + "num_input_tokens_seen": 3823108096, + "step": 1823 + }, + { + "epoch": 0.27773257390832656, + "grad_norm": 12.009023666381836, + "learning_rate": 5.951370659983452e-07, + "loss": 0.3659, + "num_input_tokens_seen": 3825205248, + "step": 1824 + }, + { + "epoch": 0.2780037971250339, + "grad_norm": 12.802525520324707, + "learning_rate": 5.94735395403853e-07, + "loss": 0.273, + "num_input_tokens_seen": 3827302400, + "step": 1825 + }, + { + "epoch": 0.27827502034174123, + "grad_norm": 22.112329483032227, + "learning_rate": 5.943336888080167e-07, + "loss": 0.7285, + "num_input_tokens_seen": 3829399552, + "step": 1826 + }, + { + "epoch": 0.2785462435584486, + "grad_norm": 17.80236053466797, + "learning_rate": 5.939319465341148e-07, + "loss": 0.5338, + "num_input_tokens_seen": 3831496704, + "step": 1827 + }, + { + "epoch": 0.27881746677515595, + "grad_norm": 17.097667694091797, + "learning_rate": 5.93530168905454e-07, + "loss": 0.444, + "num_input_tokens_seen": 3833593856, + "step": 1828 + }, + { + "epoch": 0.2790886899918633, + "grad_norm": 16.976318359375, + "learning_rate": 5.931283562453696e-07, + "loss": 0.4893, + "num_input_tokens_seen": 3835691008, + "step": 1829 + }, + { + "epoch": 0.2793599132085707, + "grad_norm": 14.774353981018066, + "learning_rate": 5.927265088772255e-07, + "loss": 0.4142, + "num_input_tokens_seen": 3837788160, + "step": 1830 + }, + { + "epoch": 0.279631136425278, + "grad_norm": 17.598939895629883, + "learning_rate": 5.923246271244127e-07, + "loss": 0.5785, + "num_input_tokens_seen": 3839885312, + "step": 1831 + }, + { + "epoch": 0.27990235964198534, + "grad_norm": 18.117116928100586, + "learning_rate": 5.919227113103508e-07, + "loss": 0.4257, + "num_input_tokens_seen": 3841982464, + "step": 1832 + }, + { + "epoch": 0.2801735828586927, + "grad_norm": 12.194579124450684, + "learning_rate": 5.915207617584858e-07, + "loss": 0.279, + "num_input_tokens_seen": 3844079616, + "step": 1833 + }, + { + "epoch": 0.28044480607540007, + "grad_norm": 15.989289283752441, + "learning_rate": 5.911187787922918e-07, + "loss": 0.4616, + "num_input_tokens_seen": 3846176768, + "step": 1834 + }, + { + "epoch": 0.2807160292921074, + "grad_norm": 14.382404327392578, + "learning_rate": 5.907167627352697e-07, + "loss": 0.3184, + "num_input_tokens_seen": 3848273920, + "step": 1835 + }, + { + "epoch": 0.28098725250881473, + "grad_norm": 15.597938537597656, + "learning_rate": 5.90314713910946e-07, + "loss": 0.4872, + "num_input_tokens_seen": 3850371072, + "step": 1836 + }, + { + "epoch": 0.2812584757255221, + "grad_norm": 14.058372497558594, + "learning_rate": 5.899126326428746e-07, + "loss": 0.4539, + "num_input_tokens_seen": 3852468224, + "step": 1837 + }, + { + "epoch": 0.28152969894222946, + "grad_norm": 23.597742080688477, + "learning_rate": 5.895105192546353e-07, + "loss": 0.742, + "num_input_tokens_seen": 3854565376, + "step": 1838 + }, + { + "epoch": 0.2818009221589368, + "grad_norm": 10.927760124206543, + "learning_rate": 5.891083740698337e-07, + "loss": 0.243, + "num_input_tokens_seen": 3856662528, + "step": 1839 + }, + { + "epoch": 0.2820721453756442, + "grad_norm": 19.77141571044922, + "learning_rate": 5.887061974121007e-07, + "loss": 0.4831, + "num_input_tokens_seen": 3858759680, + "step": 1840 + }, + { + "epoch": 0.2823433685923515, + "grad_norm": 18.980165481567383, + "learning_rate": 5.883039896050928e-07, + "loss": 0.6657, + "num_input_tokens_seen": 3860856832, + "step": 1841 + }, + { + "epoch": 0.28261459180905885, + "grad_norm": 14.606121063232422, + "learning_rate": 5.879017509724919e-07, + "loss": 0.4045, + "num_input_tokens_seen": 3862953984, + "step": 1842 + }, + { + "epoch": 0.2828858150257662, + "grad_norm": 18.364437103271484, + "learning_rate": 5.874994818380039e-07, + "loss": 0.5391, + "num_input_tokens_seen": 3865051136, + "step": 1843 + }, + { + "epoch": 0.28315703824247357, + "grad_norm": 16.989797592163086, + "learning_rate": 5.8709718252536e-07, + "loss": 0.4491, + "num_input_tokens_seen": 3867148288, + "step": 1844 + }, + { + "epoch": 0.28342826145918093, + "grad_norm": 14.282411575317383, + "learning_rate": 5.866948533583152e-07, + "loss": 0.338, + "num_input_tokens_seen": 3869245440, + "step": 1845 + }, + { + "epoch": 0.28369948467588824, + "grad_norm": 11.778130531311035, + "learning_rate": 5.862924946606487e-07, + "loss": 0.2293, + "num_input_tokens_seen": 3871342592, + "step": 1846 + }, + { + "epoch": 0.2839707078925956, + "grad_norm": 17.8021297454834, + "learning_rate": 5.858901067561637e-07, + "loss": 0.4903, + "num_input_tokens_seen": 3873439744, + "step": 1847 + }, + { + "epoch": 0.28424193110930296, + "grad_norm": 21.23961067199707, + "learning_rate": 5.854876899686864e-07, + "loss": 0.8826, + "num_input_tokens_seen": 3875536896, + "step": 1848 + }, + { + "epoch": 0.2845131543260103, + "grad_norm": 21.922151565551758, + "learning_rate": 5.850852446220666e-07, + "loss": 0.6914, + "num_input_tokens_seen": 3877634048, + "step": 1849 + }, + { + "epoch": 0.2847843775427177, + "grad_norm": 15.679913520812988, + "learning_rate": 5.84682771040177e-07, + "loss": 0.3711, + "num_input_tokens_seen": 3879731200, + "step": 1850 + }, + { + "epoch": 0.285055600759425, + "grad_norm": 12.335871696472168, + "learning_rate": 5.842802695469131e-07, + "loss": 0.3313, + "num_input_tokens_seen": 3881828352, + "step": 1851 + }, + { + "epoch": 0.28532682397613235, + "grad_norm": 13.751896858215332, + "learning_rate": 5.838777404661927e-07, + "loss": 0.3604, + "num_input_tokens_seen": 3883925504, + "step": 1852 + }, + { + "epoch": 0.2855980471928397, + "grad_norm": 17.7875919342041, + "learning_rate": 5.83475184121956e-07, + "loss": 0.6989, + "num_input_tokens_seen": 3886022656, + "step": 1853 + }, + { + "epoch": 0.28586927040954707, + "grad_norm": 21.652841567993164, + "learning_rate": 5.830726008381648e-07, + "loss": 0.6116, + "num_input_tokens_seen": 3888119808, + "step": 1854 + }, + { + "epoch": 0.28614049362625443, + "grad_norm": 17.218547821044922, + "learning_rate": 5.826699909388031e-07, + "loss": 0.4374, + "num_input_tokens_seen": 3890216960, + "step": 1855 + }, + { + "epoch": 0.28641171684296174, + "grad_norm": 16.56511116027832, + "learning_rate": 5.822673547478757e-07, + "loss": 0.5781, + "num_input_tokens_seen": 3892314112, + "step": 1856 + }, + { + "epoch": 0.2866829400596691, + "grad_norm": 18.990068435668945, + "learning_rate": 5.818646925894092e-07, + "loss": 0.6474, + "num_input_tokens_seen": 3894411264, + "step": 1857 + }, + { + "epoch": 0.28695416327637646, + "grad_norm": 9.938685417175293, + "learning_rate": 5.814620047874505e-07, + "loss": 0.2562, + "num_input_tokens_seen": 3896508416, + "step": 1858 + }, + { + "epoch": 0.2872253864930838, + "grad_norm": 16.420949935913086, + "learning_rate": 5.810592916660677e-07, + "loss": 0.4856, + "num_input_tokens_seen": 3898605568, + "step": 1859 + }, + { + "epoch": 0.2874966097097912, + "grad_norm": 17.336559295654297, + "learning_rate": 5.806565535493489e-07, + "loss": 0.3751, + "num_input_tokens_seen": 3900702720, + "step": 1860 + }, + { + "epoch": 0.2877678329264985, + "grad_norm": 12.965970039367676, + "learning_rate": 5.802537907614023e-07, + "loss": 0.3521, + "num_input_tokens_seen": 3902799872, + "step": 1861 + }, + { + "epoch": 0.28803905614320585, + "grad_norm": 20.108755111694336, + "learning_rate": 5.798510036263561e-07, + "loss": 0.7629, + "num_input_tokens_seen": 3904897024, + "step": 1862 + }, + { + "epoch": 0.2883102793599132, + "grad_norm": 15.406062126159668, + "learning_rate": 5.794481924683581e-07, + "loss": 0.5382, + "num_input_tokens_seen": 3906994176, + "step": 1863 + }, + { + "epoch": 0.2885815025766206, + "grad_norm": 21.75050163269043, + "learning_rate": 5.790453576115756e-07, + "loss": 0.4859, + "num_input_tokens_seen": 3909091328, + "step": 1864 + }, + { + "epoch": 0.28885272579332794, + "grad_norm": 11.159966468811035, + "learning_rate": 5.786424993801942e-07, + "loss": 0.2867, + "num_input_tokens_seen": 3911188480, + "step": 1865 + }, + { + "epoch": 0.28912394901003524, + "grad_norm": 16.282310485839844, + "learning_rate": 5.782396180984194e-07, + "loss": 0.3638, + "num_input_tokens_seen": 3913285632, + "step": 1866 + }, + { + "epoch": 0.2893951722267426, + "grad_norm": 15.254396438598633, + "learning_rate": 5.778367140904746e-07, + "loss": 0.4206, + "num_input_tokens_seen": 3915382784, + "step": 1867 + }, + { + "epoch": 0.28966639544344996, + "grad_norm": 18.541379928588867, + "learning_rate": 5.774337876806016e-07, + "loss": 0.5837, + "num_input_tokens_seen": 3917479936, + "step": 1868 + }, + { + "epoch": 0.2899376186601573, + "grad_norm": 20.998781204223633, + "learning_rate": 5.770308391930601e-07, + "loss": 0.5548, + "num_input_tokens_seen": 3919577088, + "step": 1869 + }, + { + "epoch": 0.2902088418768647, + "grad_norm": 14.681344985961914, + "learning_rate": 5.766278689521278e-07, + "loss": 0.428, + "num_input_tokens_seen": 3921674240, + "step": 1870 + }, + { + "epoch": 0.290480065093572, + "grad_norm": 11.808730125427246, + "learning_rate": 5.762248772820999e-07, + "loss": 0.3078, + "num_input_tokens_seen": 3923771392, + "step": 1871 + }, + { + "epoch": 0.29075128831027935, + "grad_norm": 12.644157409667969, + "learning_rate": 5.758218645072887e-07, + "loss": 0.3758, + "num_input_tokens_seen": 3925868544, + "step": 1872 + }, + { + "epoch": 0.2910225115269867, + "grad_norm": 18.32072639465332, + "learning_rate": 5.754188309520235e-07, + "loss": 0.5625, + "num_input_tokens_seen": 3927965696, + "step": 1873 + }, + { + "epoch": 0.2912937347436941, + "grad_norm": 20.193140029907227, + "learning_rate": 5.750157769406504e-07, + "loss": 0.6177, + "num_input_tokens_seen": 3930062848, + "step": 1874 + }, + { + "epoch": 0.2915649579604014, + "grad_norm": 20.246875762939453, + "learning_rate": 5.74612702797532e-07, + "loss": 0.7787, + "num_input_tokens_seen": 3932160000, + "step": 1875 + }, + { + "epoch": 0.29183618117710874, + "grad_norm": 44.87726974487305, + "learning_rate": 5.74209608847047e-07, + "loss": 0.7121, + "num_input_tokens_seen": 3934257152, + "step": 1876 + }, + { + "epoch": 0.2921074043938161, + "grad_norm": 22.28316879272461, + "learning_rate": 5.7380649541359e-07, + "loss": 0.6667, + "num_input_tokens_seen": 3936354304, + "step": 1877 + }, + { + "epoch": 0.29237862761052347, + "grad_norm": 12.953632354736328, + "learning_rate": 5.734033628215714e-07, + "loss": 0.3718, + "num_input_tokens_seen": 3938451456, + "step": 1878 + }, + { + "epoch": 0.29264985082723083, + "grad_norm": 18.659269332885742, + "learning_rate": 5.730002113954169e-07, + "loss": 0.6267, + "num_input_tokens_seen": 3940548608, + "step": 1879 + }, + { + "epoch": 0.29292107404393813, + "grad_norm": 24.658376693725586, + "learning_rate": 5.725970414595675e-07, + "loss": 0.7124, + "num_input_tokens_seen": 3942645760, + "step": 1880 + }, + { + "epoch": 0.2931922972606455, + "grad_norm": 15.565204620361328, + "learning_rate": 5.721938533384791e-07, + "loss": 0.5809, + "num_input_tokens_seen": 3944742912, + "step": 1881 + }, + { + "epoch": 0.29346352047735286, + "grad_norm": 16.507081985473633, + "learning_rate": 5.717906473566219e-07, + "loss": 0.4206, + "num_input_tokens_seen": 3946840064, + "step": 1882 + }, + { + "epoch": 0.2937347436940602, + "grad_norm": 19.66336441040039, + "learning_rate": 5.71387423838481e-07, + "loss": 0.6355, + "num_input_tokens_seen": 3948937216, + "step": 1883 + }, + { + "epoch": 0.2940059669107676, + "grad_norm": 13.438267707824707, + "learning_rate": 5.709841831085553e-07, + "loss": 0.3339, + "num_input_tokens_seen": 3951034368, + "step": 1884 + }, + { + "epoch": 0.2942771901274749, + "grad_norm": 12.036945343017578, + "learning_rate": 5.705809254913576e-07, + "loss": 0.3658, + "num_input_tokens_seen": 3953131520, + "step": 1885 + }, + { + "epoch": 0.29454841334418225, + "grad_norm": 19.046716690063477, + "learning_rate": 5.70177651311414e-07, + "loss": 0.6545, + "num_input_tokens_seen": 3955228672, + "step": 1886 + }, + { + "epoch": 0.2948196365608896, + "grad_norm": 16.657411575317383, + "learning_rate": 5.697743608932646e-07, + "loss": 0.3465, + "num_input_tokens_seen": 3957325824, + "step": 1887 + }, + { + "epoch": 0.29509085977759697, + "grad_norm": 28.270238876342773, + "learning_rate": 5.693710545614621e-07, + "loss": 1.0207, + "num_input_tokens_seen": 3959422976, + "step": 1888 + }, + { + "epoch": 0.29536208299430433, + "grad_norm": 18.105026245117188, + "learning_rate": 5.689677326405719e-07, + "loss": 0.6864, + "num_input_tokens_seen": 3961520128, + "step": 1889 + }, + { + "epoch": 0.29563330621101164, + "grad_norm": 22.831960678100586, + "learning_rate": 5.685643954551722e-07, + "loss": 0.8733, + "num_input_tokens_seen": 3963617280, + "step": 1890 + }, + { + "epoch": 0.295904529427719, + "grad_norm": 18.551103591918945, + "learning_rate": 5.681610433298535e-07, + "loss": 0.5093, + "num_input_tokens_seen": 3965714432, + "step": 1891 + }, + { + "epoch": 0.29617575264442636, + "grad_norm": 15.365251541137695, + "learning_rate": 5.677576765892182e-07, + "loss": 0.4713, + "num_input_tokens_seen": 3967811584, + "step": 1892 + }, + { + "epoch": 0.2964469758611337, + "grad_norm": 16.42351722717285, + "learning_rate": 5.673542955578806e-07, + "loss": 0.4017, + "num_input_tokens_seen": 3969908736, + "step": 1893 + }, + { + "epoch": 0.2967181990778411, + "grad_norm": 15.9588623046875, + "learning_rate": 5.669509005604663e-07, + "loss": 0.3471, + "num_input_tokens_seen": 3972005888, + "step": 1894 + }, + { + "epoch": 0.2969894222945484, + "grad_norm": 21.854177474975586, + "learning_rate": 5.665474919216122e-07, + "loss": 0.7455, + "num_input_tokens_seen": 3974103040, + "step": 1895 + }, + { + "epoch": 0.29726064551125575, + "grad_norm": 15.875153541564941, + "learning_rate": 5.661440699659663e-07, + "loss": 0.4565, + "num_input_tokens_seen": 3976200192, + "step": 1896 + }, + { + "epoch": 0.2975318687279631, + "grad_norm": 13.967791557312012, + "learning_rate": 5.657406350181872e-07, + "loss": 0.3644, + "num_input_tokens_seen": 3978297344, + "step": 1897 + }, + { + "epoch": 0.2978030919446705, + "grad_norm": 22.632991790771484, + "learning_rate": 5.65337187402944e-07, + "loss": 0.7488, + "num_input_tokens_seen": 3980394496, + "step": 1898 + }, + { + "epoch": 0.29807431516137783, + "grad_norm": 35.31967544555664, + "learning_rate": 5.64933727444916e-07, + "loss": 0.5608, + "num_input_tokens_seen": 3982491648, + "step": 1899 + }, + { + "epoch": 0.29834553837808514, + "grad_norm": 13.531403541564941, + "learning_rate": 5.645302554687925e-07, + "loss": 0.3702, + "num_input_tokens_seen": 3984588800, + "step": 1900 + }, + { + "epoch": 0.2986167615947925, + "grad_norm": 15.457035064697266, + "learning_rate": 5.641267717992723e-07, + "loss": 0.4089, + "num_input_tokens_seen": 3986685952, + "step": 1901 + }, + { + "epoch": 0.29888798481149986, + "grad_norm": 15.16627311706543, + "learning_rate": 5.637232767610637e-07, + "loss": 0.4136, + "num_input_tokens_seen": 3988783104, + "step": 1902 + }, + { + "epoch": 0.2991592080282072, + "grad_norm": 15.52268123626709, + "learning_rate": 5.63319770678884e-07, + "loss": 0.4861, + "num_input_tokens_seen": 3990880256, + "step": 1903 + }, + { + "epoch": 0.2994304312449146, + "grad_norm": 14.946885108947754, + "learning_rate": 5.6291625387746e-07, + "loss": 0.3985, + "num_input_tokens_seen": 3992977408, + "step": 1904 + }, + { + "epoch": 0.2997016544616219, + "grad_norm": 18.582761764526367, + "learning_rate": 5.625127266815263e-07, + "loss": 0.3807, + "num_input_tokens_seen": 3995074560, + "step": 1905 + }, + { + "epoch": 0.29997287767832925, + "grad_norm": 17.98885726928711, + "learning_rate": 5.621091894158261e-07, + "loss": 0.3393, + "num_input_tokens_seen": 3997171712, + "step": 1906 + }, + { + "epoch": 0.3002441008950366, + "grad_norm": 17.400163650512695, + "learning_rate": 5.617056424051113e-07, + "loss": 0.5337, + "num_input_tokens_seen": 3999268864, + "step": 1907 + }, + { + "epoch": 0.300515324111744, + "grad_norm": 20.89246368408203, + "learning_rate": 5.613020859741408e-07, + "loss": 0.4942, + "num_input_tokens_seen": 4001366016, + "step": 1908 + }, + { + "epoch": 0.30078654732845134, + "grad_norm": 22.6678524017334, + "learning_rate": 5.608985204476817e-07, + "loss": 0.6841, + "num_input_tokens_seen": 4003463168, + "step": 1909 + }, + { + "epoch": 0.30105777054515864, + "grad_norm": 29.596494674682617, + "learning_rate": 5.60494946150508e-07, + "loss": 0.5315, + "num_input_tokens_seen": 4005560320, + "step": 1910 + }, + { + "epoch": 0.301328993761866, + "grad_norm": 16.15056037902832, + "learning_rate": 5.600913634074009e-07, + "loss": 0.4301, + "num_input_tokens_seen": 4007657472, + "step": 1911 + }, + { + "epoch": 0.30160021697857337, + "grad_norm": 16.99407196044922, + "learning_rate": 5.596877725431487e-07, + "loss": 0.4657, + "num_input_tokens_seen": 4009754624, + "step": 1912 + }, + { + "epoch": 0.3018714401952807, + "grad_norm": 13.268138885498047, + "learning_rate": 5.592841738825457e-07, + "loss": 0.313, + "num_input_tokens_seen": 4011851776, + "step": 1913 + }, + { + "epoch": 0.3021426634119881, + "grad_norm": 16.610029220581055, + "learning_rate": 5.588805677503928e-07, + "loss": 0.3853, + "num_input_tokens_seen": 4013948928, + "step": 1914 + }, + { + "epoch": 0.3024138866286954, + "grad_norm": 14.709389686584473, + "learning_rate": 5.58476954471497e-07, + "loss": 0.4144, + "num_input_tokens_seen": 4016046080, + "step": 1915 + }, + { + "epoch": 0.30268510984540276, + "grad_norm": 18.49999237060547, + "learning_rate": 5.580733343706708e-07, + "loss": 0.6721, + "num_input_tokens_seen": 4018143232, + "step": 1916 + }, + { + "epoch": 0.3029563330621101, + "grad_norm": 17.968942642211914, + "learning_rate": 5.576697077727323e-07, + "loss": 0.4901, + "num_input_tokens_seen": 4020240384, + "step": 1917 + }, + { + "epoch": 0.3032275562788175, + "grad_norm": 18.4853515625, + "learning_rate": 5.572660750025049e-07, + "loss": 0.4983, + "num_input_tokens_seen": 4022337536, + "step": 1918 + }, + { + "epoch": 0.30349877949552484, + "grad_norm": 15.520868301391602, + "learning_rate": 5.568624363848166e-07, + "loss": 0.3819, + "num_input_tokens_seen": 4024434688, + "step": 1919 + }, + { + "epoch": 0.30377000271223215, + "grad_norm": 10.888456344604492, + "learning_rate": 5.564587922445008e-07, + "loss": 0.2858, + "num_input_tokens_seen": 4026531840, + "step": 1920 + }, + { + "epoch": 0.3040412259289395, + "grad_norm": 13.872169494628906, + "learning_rate": 5.560551429063949e-07, + "loss": 0.3409, + "num_input_tokens_seen": 4028628992, + "step": 1921 + }, + { + "epoch": 0.30431244914564687, + "grad_norm": 22.901479721069336, + "learning_rate": 5.556514886953403e-07, + "loss": 0.7254, + "num_input_tokens_seen": 4030726144, + "step": 1922 + }, + { + "epoch": 0.30458367236235423, + "grad_norm": 15.373479843139648, + "learning_rate": 5.552478299361826e-07, + "loss": 0.4352, + "num_input_tokens_seen": 4032823296, + "step": 1923 + }, + { + "epoch": 0.3048548955790616, + "grad_norm": 17.427900314331055, + "learning_rate": 5.548441669537712e-07, + "loss": 0.6943, + "num_input_tokens_seen": 4034920448, + "step": 1924 + }, + { + "epoch": 0.3051261187957689, + "grad_norm": 20.171907424926758, + "learning_rate": 5.544405000729584e-07, + "loss": 0.5922, + "num_input_tokens_seen": 4037017600, + "step": 1925 + }, + { + "epoch": 0.30539734201247626, + "grad_norm": 15.804996490478516, + "learning_rate": 5.540368296186002e-07, + "loss": 0.4632, + "num_input_tokens_seen": 4039114752, + "step": 1926 + }, + { + "epoch": 0.3056685652291836, + "grad_norm": 13.54218864440918, + "learning_rate": 5.53633155915555e-07, + "loss": 0.3587, + "num_input_tokens_seen": 4041211904, + "step": 1927 + }, + { + "epoch": 0.305939788445891, + "grad_norm": 24.575578689575195, + "learning_rate": 5.532294792886843e-07, + "loss": 0.8946, + "num_input_tokens_seen": 4043309056, + "step": 1928 + }, + { + "epoch": 0.30621101166259834, + "grad_norm": 18.244264602661133, + "learning_rate": 5.528258000628518e-07, + "loss": 0.4468, + "num_input_tokens_seen": 4045406208, + "step": 1929 + }, + { + "epoch": 0.30648223487930565, + "grad_norm": 18.432344436645508, + "learning_rate": 5.524221185629224e-07, + "loss": 0.5916, + "num_input_tokens_seen": 4047503360, + "step": 1930 + }, + { + "epoch": 0.306753458096013, + "grad_norm": 14.628120422363281, + "learning_rate": 5.520184351137646e-07, + "loss": 0.428, + "num_input_tokens_seen": 4049600512, + "step": 1931 + }, + { + "epoch": 0.30702468131272037, + "grad_norm": 15.325323104858398, + "learning_rate": 5.51614750040247e-07, + "loss": 0.3338, + "num_input_tokens_seen": 4051697664, + "step": 1932 + }, + { + "epoch": 0.30729590452942773, + "grad_norm": 18.96306037902832, + "learning_rate": 5.512110636672396e-07, + "loss": 0.5334, + "num_input_tokens_seen": 4053794816, + "step": 1933 + }, + { + "epoch": 0.3075671277461351, + "grad_norm": 16.209426879882812, + "learning_rate": 5.508073763196146e-07, + "loss": 0.405, + "num_input_tokens_seen": 4055891968, + "step": 1934 + }, + { + "epoch": 0.3078383509628424, + "grad_norm": 17.50925064086914, + "learning_rate": 5.504036883222438e-07, + "loss": 0.4682, + "num_input_tokens_seen": 4057989120, + "step": 1935 + }, + { + "epoch": 0.30810957417954976, + "grad_norm": 21.680133819580078, + "learning_rate": 5.5e-07, + "loss": 0.6966, + "num_input_tokens_seen": 4060086272, + "step": 1936 + }, + { + "epoch": 0.3083807973962571, + "grad_norm": 16.945390701293945, + "learning_rate": 5.495963116777562e-07, + "loss": 0.3213, + "num_input_tokens_seen": 4062183424, + "step": 1937 + }, + { + "epoch": 0.3086520206129645, + "grad_norm": 19.299571990966797, + "learning_rate": 5.491926236803854e-07, + "loss": 0.6936, + "num_input_tokens_seen": 4064280576, + "step": 1938 + }, + { + "epoch": 0.30892324382967185, + "grad_norm": 16.683467864990234, + "learning_rate": 5.487889363327603e-07, + "loss": 0.5167, + "num_input_tokens_seen": 4066377728, + "step": 1939 + }, + { + "epoch": 0.30919446704637915, + "grad_norm": 11.593538284301758, + "learning_rate": 5.483852499597532e-07, + "loss": 0.308, + "num_input_tokens_seen": 4068474880, + "step": 1940 + }, + { + "epoch": 0.3094656902630865, + "grad_norm": 21.42290687561035, + "learning_rate": 5.479815648862355e-07, + "loss": 0.6182, + "num_input_tokens_seen": 4070572032, + "step": 1941 + }, + { + "epoch": 0.3097369134797939, + "grad_norm": 10.256348609924316, + "learning_rate": 5.475778814370776e-07, + "loss": 0.2313, + "num_input_tokens_seen": 4072669184, + "step": 1942 + }, + { + "epoch": 0.31000813669650124, + "grad_norm": 12.884416580200195, + "learning_rate": 5.471741999371483e-07, + "loss": 0.3522, + "num_input_tokens_seen": 4074766336, + "step": 1943 + }, + { + "epoch": 0.3102793599132086, + "grad_norm": 15.000761032104492, + "learning_rate": 5.467705207113156e-07, + "loss": 0.3817, + "num_input_tokens_seen": 4076863488, + "step": 1944 + }, + { + "epoch": 0.3105505831299159, + "grad_norm": 18.647953033447266, + "learning_rate": 5.46366844084445e-07, + "loss": 0.5017, + "num_input_tokens_seen": 4078960640, + "step": 1945 + }, + { + "epoch": 0.31082180634662326, + "grad_norm": 16.23006820678711, + "learning_rate": 5.459631703813998e-07, + "loss": 0.4558, + "num_input_tokens_seen": 4081057792, + "step": 1946 + }, + { + "epoch": 0.3110930295633306, + "grad_norm": 14.229667663574219, + "learning_rate": 5.455594999270416e-07, + "loss": 0.3032, + "num_input_tokens_seen": 4083154944, + "step": 1947 + }, + { + "epoch": 0.311364252780038, + "grad_norm": 22.243261337280273, + "learning_rate": 5.451558330462289e-07, + "loss": 0.5615, + "num_input_tokens_seen": 4085252096, + "step": 1948 + }, + { + "epoch": 0.31163547599674535, + "grad_norm": 16.879133224487305, + "learning_rate": 5.447521700638174e-07, + "loss": 0.4128, + "num_input_tokens_seen": 4087349248, + "step": 1949 + }, + { + "epoch": 0.31190669921345265, + "grad_norm": 17.294158935546875, + "learning_rate": 5.443485113046597e-07, + "loss": 0.4147, + "num_input_tokens_seen": 4089446400, + "step": 1950 + }, + { + "epoch": 0.31217792243016, + "grad_norm": 12.599308967590332, + "learning_rate": 5.439448570936052e-07, + "loss": 0.3165, + "num_input_tokens_seen": 4091543552, + "step": 1951 + }, + { + "epoch": 0.3124491456468674, + "grad_norm": 16.82218360900879, + "learning_rate": 5.435412077554991e-07, + "loss": 0.4091, + "num_input_tokens_seen": 4093640704, + "step": 1952 + }, + { + "epoch": 0.31272036886357474, + "grad_norm": 15.809752464294434, + "learning_rate": 5.431375636151833e-07, + "loss": 0.4083, + "num_input_tokens_seen": 4095737856, + "step": 1953 + }, + { + "epoch": 0.3129915920802821, + "grad_norm": 24.349023818969727, + "learning_rate": 5.427339249974952e-07, + "loss": 0.8117, + "num_input_tokens_seen": 4097835008, + "step": 1954 + }, + { + "epoch": 0.3132628152969894, + "grad_norm": 15.348997116088867, + "learning_rate": 5.423302922272677e-07, + "loss": 0.4125, + "num_input_tokens_seen": 4099932160, + "step": 1955 + }, + { + "epoch": 0.31353403851369677, + "grad_norm": 21.631223678588867, + "learning_rate": 5.419266656293293e-07, + "loss": 0.6195, + "num_input_tokens_seen": 4102029312, + "step": 1956 + }, + { + "epoch": 0.31380526173040413, + "grad_norm": 17.50019645690918, + "learning_rate": 5.415230455285031e-07, + "loss": 0.4847, + "num_input_tokens_seen": 4104126464, + "step": 1957 + }, + { + "epoch": 0.3140764849471115, + "grad_norm": 12.668447494506836, + "learning_rate": 5.411194322496073e-07, + "loss": 0.3183, + "num_input_tokens_seen": 4106223616, + "step": 1958 + }, + { + "epoch": 0.3143477081638188, + "grad_norm": 14.355048179626465, + "learning_rate": 5.407158261174543e-07, + "loss": 0.2835, + "num_input_tokens_seen": 4108320768, + "step": 1959 + }, + { + "epoch": 0.31461893138052616, + "grad_norm": 13.801246643066406, + "learning_rate": 5.403122274568514e-07, + "loss": 0.2686, + "num_input_tokens_seen": 4110417920, + "step": 1960 + }, + { + "epoch": 0.3148901545972335, + "grad_norm": 23.078720092773438, + "learning_rate": 5.399086365925991e-07, + "loss": 0.8419, + "num_input_tokens_seen": 4112515072, + "step": 1961 + }, + { + "epoch": 0.3151613778139409, + "grad_norm": 13.70639419555664, + "learning_rate": 5.395050538494922e-07, + "loss": 0.3, + "num_input_tokens_seen": 4114612224, + "step": 1962 + }, + { + "epoch": 0.31543260103064824, + "grad_norm": 11.593742370605469, + "learning_rate": 5.391014795523184e-07, + "loss": 0.2514, + "num_input_tokens_seen": 4116709376, + "step": 1963 + }, + { + "epoch": 0.31570382424735555, + "grad_norm": 17.700437545776367, + "learning_rate": 5.386979140258592e-07, + "loss": 0.5374, + "num_input_tokens_seen": 4118806528, + "step": 1964 + }, + { + "epoch": 0.3159750474640629, + "grad_norm": 13.762934684753418, + "learning_rate": 5.382943575948888e-07, + "loss": 0.3823, + "num_input_tokens_seen": 4120903680, + "step": 1965 + }, + { + "epoch": 0.31624627068077027, + "grad_norm": 21.619094848632812, + "learning_rate": 5.378908105841738e-07, + "loss": 0.5349, + "num_input_tokens_seen": 4123000832, + "step": 1966 + }, + { + "epoch": 0.31651749389747763, + "grad_norm": 16.5258731842041, + "learning_rate": 5.374872733184737e-07, + "loss": 0.4939, + "num_input_tokens_seen": 4125097984, + "step": 1967 + }, + { + "epoch": 0.316788717114185, + "grad_norm": 19.478639602661133, + "learning_rate": 5.370837461225402e-07, + "loss": 0.6686, + "num_input_tokens_seen": 4127195136, + "step": 1968 + }, + { + "epoch": 0.3170599403308923, + "grad_norm": 15.19251537322998, + "learning_rate": 5.36680229321116e-07, + "loss": 0.3994, + "num_input_tokens_seen": 4129292288, + "step": 1969 + }, + { + "epoch": 0.31733116354759966, + "grad_norm": 18.16364288330078, + "learning_rate": 5.362767232389365e-07, + "loss": 0.5067, + "num_input_tokens_seen": 4131389440, + "step": 1970 + }, + { + "epoch": 0.317602386764307, + "grad_norm": 13.165485382080078, + "learning_rate": 5.358732282007278e-07, + "loss": 0.3757, + "num_input_tokens_seen": 4133486592, + "step": 1971 + }, + { + "epoch": 0.3178736099810144, + "grad_norm": 17.412708282470703, + "learning_rate": 5.354697445312074e-07, + "loss": 0.5601, + "num_input_tokens_seen": 4135583744, + "step": 1972 + }, + { + "epoch": 0.31814483319772174, + "grad_norm": 16.720489501953125, + "learning_rate": 5.35066272555084e-07, + "loss": 0.6123, + "num_input_tokens_seen": 4137680896, + "step": 1973 + }, + { + "epoch": 0.31841605641442905, + "grad_norm": 16.069841384887695, + "learning_rate": 5.346628125970562e-07, + "loss": 0.401, + "num_input_tokens_seen": 4139778048, + "step": 1974 + }, + { + "epoch": 0.3186872796311364, + "grad_norm": 25.234981536865234, + "learning_rate": 5.342593649818129e-07, + "loss": 0.6885, + "num_input_tokens_seen": 4141875200, + "step": 1975 + }, + { + "epoch": 0.3189585028478438, + "grad_norm": 14.604377746582031, + "learning_rate": 5.338559300340338e-07, + "loss": 0.4314, + "num_input_tokens_seen": 4143972352, + "step": 1976 + }, + { + "epoch": 0.31922972606455113, + "grad_norm": 17.683347702026367, + "learning_rate": 5.334525080783879e-07, + "loss": 0.4862, + "num_input_tokens_seen": 4146069504, + "step": 1977 + }, + { + "epoch": 0.3195009492812585, + "grad_norm": 15.543233871459961, + "learning_rate": 5.330490994395338e-07, + "loss": 0.2884, + "num_input_tokens_seen": 4148166656, + "step": 1978 + }, + { + "epoch": 0.3197721724979658, + "grad_norm": 16.377304077148438, + "learning_rate": 5.326457044421195e-07, + "loss": 0.5094, + "num_input_tokens_seen": 4150263808, + "step": 1979 + }, + { + "epoch": 0.32004339571467316, + "grad_norm": 14.058331489562988, + "learning_rate": 5.322423234107818e-07, + "loss": 0.4386, + "num_input_tokens_seen": 4152360960, + "step": 1980 + }, + { + "epoch": 0.3203146189313805, + "grad_norm": 20.853776931762695, + "learning_rate": 5.318389566701465e-07, + "loss": 0.5509, + "num_input_tokens_seen": 4154458112, + "step": 1981 + }, + { + "epoch": 0.3205858421480879, + "grad_norm": 17.144161224365234, + "learning_rate": 5.314356045448278e-07, + "loss": 0.403, + "num_input_tokens_seen": 4156555264, + "step": 1982 + }, + { + "epoch": 0.32085706536479525, + "grad_norm": 17.98135757446289, + "learning_rate": 5.310322673594282e-07, + "loss": 0.4065, + "num_input_tokens_seen": 4158652416, + "step": 1983 + }, + { + "epoch": 0.32112828858150255, + "grad_norm": 9.543654441833496, + "learning_rate": 5.306289454385379e-07, + "loss": 0.2229, + "num_input_tokens_seen": 4160749568, + "step": 1984 + }, + { + "epoch": 0.3213995117982099, + "grad_norm": 15.082742691040039, + "learning_rate": 5.302256391067354e-07, + "loss": 0.4597, + "num_input_tokens_seen": 4162846720, + "step": 1985 + }, + { + "epoch": 0.3216707350149173, + "grad_norm": 14.424219131469727, + "learning_rate": 5.29822348688586e-07, + "loss": 0.4349, + "num_input_tokens_seen": 4164943872, + "step": 1986 + }, + { + "epoch": 0.32194195823162464, + "grad_norm": 17.661540985107422, + "learning_rate": 5.294190745086426e-07, + "loss": 0.4877, + "num_input_tokens_seen": 4167041024, + "step": 1987 + }, + { + "epoch": 0.322213181448332, + "grad_norm": 21.022674560546875, + "learning_rate": 5.290158168914447e-07, + "loss": 0.6855, + "num_input_tokens_seen": 4169138176, + "step": 1988 + }, + { + "epoch": 0.3224844046650393, + "grad_norm": 19.004026412963867, + "learning_rate": 5.28612576161519e-07, + "loss": 0.616, + "num_input_tokens_seen": 4171235328, + "step": 1989 + }, + { + "epoch": 0.32275562788174667, + "grad_norm": 16.85509490966797, + "learning_rate": 5.282093526433781e-07, + "loss": 0.5411, + "num_input_tokens_seen": 4173332480, + "step": 1990 + }, + { + "epoch": 0.323026851098454, + "grad_norm": 14.019049644470215, + "learning_rate": 5.27806146661521e-07, + "loss": 0.2902, + "num_input_tokens_seen": 4175429632, + "step": 1991 + }, + { + "epoch": 0.3232980743151614, + "grad_norm": 16.84299087524414, + "learning_rate": 5.274029585404326e-07, + "loss": 0.3034, + "num_input_tokens_seen": 4177526784, + "step": 1992 + }, + { + "epoch": 0.32356929753186875, + "grad_norm": 13.026562690734863, + "learning_rate": 5.269997886045833e-07, + "loss": 0.3721, + "num_input_tokens_seen": 4179623936, + "step": 1993 + }, + { + "epoch": 0.32384052074857606, + "grad_norm": 16.872188568115234, + "learning_rate": 5.265966371784287e-07, + "loss": 0.5713, + "num_input_tokens_seen": 4181721088, + "step": 1994 + }, + { + "epoch": 0.3241117439652834, + "grad_norm": 16.673782348632812, + "learning_rate": 5.261935045864101e-07, + "loss": 0.2909, + "num_input_tokens_seen": 4183818240, + "step": 1995 + }, + { + "epoch": 0.3243829671819908, + "grad_norm": 14.819953918457031, + "learning_rate": 5.257903911529532e-07, + "loss": 0.4015, + "num_input_tokens_seen": 4185915392, + "step": 1996 + }, + { + "epoch": 0.32465419039869814, + "grad_norm": 13.379781723022461, + "learning_rate": 5.253872972024681e-07, + "loss": 0.2481, + "num_input_tokens_seen": 4188012544, + "step": 1997 + }, + { + "epoch": 0.3249254136154055, + "grad_norm": 14.248574256896973, + "learning_rate": 5.249842230593497e-07, + "loss": 0.376, + "num_input_tokens_seen": 4190109696, + "step": 1998 + }, + { + "epoch": 0.3251966368321128, + "grad_norm": 22.307329177856445, + "learning_rate": 5.245811690479766e-07, + "loss": 0.3937, + "num_input_tokens_seen": 4192206848, + "step": 1999 + }, + { + "epoch": 0.32546786004882017, + "grad_norm": 14.40904426574707, + "learning_rate": 5.241781354927113e-07, + "loss": 0.3849, + "num_input_tokens_seen": 4194304000, + "step": 2000 + }, + { + "epoch": 0.32573908326552753, + "grad_norm": 18.996400833129883, + "learning_rate": 5.237751227179001e-07, + "loss": 0.3802, + "num_input_tokens_seen": 4196401152, + "step": 2001 + }, + { + "epoch": 0.3260103064822349, + "grad_norm": 17.524658203125, + "learning_rate": 5.233721310478722e-07, + "loss": 0.5296, + "num_input_tokens_seen": 4198498304, + "step": 2002 + }, + { + "epoch": 0.32628152969894225, + "grad_norm": 11.911806106567383, + "learning_rate": 5.2296916080694e-07, + "loss": 0.2159, + "num_input_tokens_seen": 4200595456, + "step": 2003 + }, + { + "epoch": 0.32655275291564956, + "grad_norm": 17.387310028076172, + "learning_rate": 5.225662123193985e-07, + "loss": 0.4125, + "num_input_tokens_seen": 4202692608, + "step": 2004 + }, + { + "epoch": 0.3268239761323569, + "grad_norm": 17.1206111907959, + "learning_rate": 5.221632859095254e-07, + "loss": 0.3276, + "num_input_tokens_seen": 4204789760, + "step": 2005 + }, + { + "epoch": 0.3270951993490643, + "grad_norm": 17.467138290405273, + "learning_rate": 5.217603819015805e-07, + "loss": 0.4545, + "num_input_tokens_seen": 4206886912, + "step": 2006 + }, + { + "epoch": 0.32736642256577164, + "grad_norm": 22.18386459350586, + "learning_rate": 5.213575006198057e-07, + "loss": 0.7608, + "num_input_tokens_seen": 4208984064, + "step": 2007 + }, + { + "epoch": 0.327637645782479, + "grad_norm": 18.72940444946289, + "learning_rate": 5.209546423884246e-07, + "loss": 0.512, + "num_input_tokens_seen": 4211081216, + "step": 2008 + }, + { + "epoch": 0.3279088689991863, + "grad_norm": 15.777796745300293, + "learning_rate": 5.20551807531642e-07, + "loss": 0.3453, + "num_input_tokens_seen": 4213178368, + "step": 2009 + }, + { + "epoch": 0.32818009221589367, + "grad_norm": 20.98493194580078, + "learning_rate": 5.20148996373644e-07, + "loss": 0.6098, + "num_input_tokens_seen": 4215275520, + "step": 2010 + }, + { + "epoch": 0.32845131543260103, + "grad_norm": 13.770686149597168, + "learning_rate": 5.197462092385978e-07, + "loss": 0.3278, + "num_input_tokens_seen": 4217372672, + "step": 2011 + }, + { + "epoch": 0.3287225386493084, + "grad_norm": 10.093859672546387, + "learning_rate": 5.193434464506511e-07, + "loss": 0.1932, + "num_input_tokens_seen": 4219469824, + "step": 2012 + }, + { + "epoch": 0.32899376186601575, + "grad_norm": 16.35398292541504, + "learning_rate": 5.189407083339324e-07, + "loss": 0.3751, + "num_input_tokens_seen": 4221566976, + "step": 2013 + }, + { + "epoch": 0.32926498508272306, + "grad_norm": 10.368598937988281, + "learning_rate": 5.185379952125494e-07, + "loss": 0.2063, + "num_input_tokens_seen": 4223664128, + "step": 2014 + }, + { + "epoch": 0.3295362082994304, + "grad_norm": 15.229095458984375, + "learning_rate": 5.18135307410591e-07, + "loss": 0.3463, + "num_input_tokens_seen": 4225761280, + "step": 2015 + }, + { + "epoch": 0.3298074315161378, + "grad_norm": 21.608442306518555, + "learning_rate": 5.177326452521242e-07, + "loss": 0.3193, + "num_input_tokens_seen": 4227858432, + "step": 2016 + }, + { + "epoch": 0.33007865473284514, + "grad_norm": 16.298704147338867, + "learning_rate": 5.17330009061197e-07, + "loss": 0.3732, + "num_input_tokens_seen": 4229955584, + "step": 2017 + }, + { + "epoch": 0.3303498779495525, + "grad_norm": 17.584896087646484, + "learning_rate": 5.169273991618351e-07, + "loss": 0.3864, + "num_input_tokens_seen": 4232052736, + "step": 2018 + }, + { + "epoch": 0.3306211011662598, + "grad_norm": 17.949689865112305, + "learning_rate": 5.165248158780441e-07, + "loss": 0.3894, + "num_input_tokens_seen": 4234149888, + "step": 2019 + }, + { + "epoch": 0.3308923243829672, + "grad_norm": 22.562408447265625, + "learning_rate": 5.161222595338073e-07, + "loss": 0.7639, + "num_input_tokens_seen": 4236247040, + "step": 2020 + }, + { + "epoch": 0.33116354759967453, + "grad_norm": 15.829678535461426, + "learning_rate": 5.157197304530869e-07, + "loss": 0.3913, + "num_input_tokens_seen": 4238344192, + "step": 2021 + }, + { + "epoch": 0.3314347708163819, + "grad_norm": 19.865497589111328, + "learning_rate": 5.15317228959823e-07, + "loss": 0.4058, + "num_input_tokens_seen": 4240441344, + "step": 2022 + }, + { + "epoch": 0.33170599403308926, + "grad_norm": 20.204696655273438, + "learning_rate": 5.149147553779335e-07, + "loss": 0.61, + "num_input_tokens_seen": 4242538496, + "step": 2023 + }, + { + "epoch": 0.33197721724979656, + "grad_norm": 11.350019454956055, + "learning_rate": 5.145123100313137e-07, + "loss": 0.2718, + "num_input_tokens_seen": 4244635648, + "step": 2024 + }, + { + "epoch": 0.3322484404665039, + "grad_norm": 20.93108558654785, + "learning_rate": 5.141098932438365e-07, + "loss": 0.4187, + "num_input_tokens_seen": 4246732800, + "step": 2025 + }, + { + "epoch": 0.3325196636832113, + "grad_norm": 17.941852569580078, + "learning_rate": 5.137075053393512e-07, + "loss": 0.3879, + "num_input_tokens_seen": 4248829952, + "step": 2026 + }, + { + "epoch": 0.33279088689991865, + "grad_norm": 23.98504638671875, + "learning_rate": 5.133051466416849e-07, + "loss": 0.6891, + "num_input_tokens_seen": 4250927104, + "step": 2027 + }, + { + "epoch": 0.333062110116626, + "grad_norm": 17.1292667388916, + "learning_rate": 5.129028174746399e-07, + "loss": 0.3502, + "num_input_tokens_seen": 4253024256, + "step": 2028 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 23.580167770385742, + "learning_rate": 5.12500518161996e-07, + "loss": 0.939, + "num_input_tokens_seen": 4255121408, + "step": 2029 + }, + { + "epoch": 0.3336045565500407, + "grad_norm": 16.317161560058594, + "learning_rate": 5.120982490275081e-07, + "loss": 0.3974, + "num_input_tokens_seen": 4257218560, + "step": 2030 + }, + { + "epoch": 0.33387577976674804, + "grad_norm": 19.563129425048828, + "learning_rate": 5.116960103949071e-07, + "loss": 0.7093, + "num_input_tokens_seen": 4259315712, + "step": 2031 + }, + { + "epoch": 0.3341470029834554, + "grad_norm": 14.194764137268066, + "learning_rate": 5.112938025878994e-07, + "loss": 0.3047, + "num_input_tokens_seen": 4261412864, + "step": 2032 + }, + { + "epoch": 0.33441822620016276, + "grad_norm": 12.254755973815918, + "learning_rate": 5.108916259301663e-07, + "loss": 0.2439, + "num_input_tokens_seen": 4263510016, + "step": 2033 + }, + { + "epoch": 0.33468944941687007, + "grad_norm": 13.140419006347656, + "learning_rate": 5.104894807453647e-07, + "loss": 0.3246, + "num_input_tokens_seen": 4265607168, + "step": 2034 + }, + { + "epoch": 0.3349606726335774, + "grad_norm": 18.896413803100586, + "learning_rate": 5.100873673571253e-07, + "loss": 0.5276, + "num_input_tokens_seen": 4267704320, + "step": 2035 + }, + { + "epoch": 0.3352318958502848, + "grad_norm": 13.808533668518066, + "learning_rate": 5.096852860890541e-07, + "loss": 0.335, + "num_input_tokens_seen": 4269801472, + "step": 2036 + }, + { + "epoch": 0.33550311906699215, + "grad_norm": 21.51602554321289, + "learning_rate": 5.092832372647304e-07, + "loss": 0.6184, + "num_input_tokens_seen": 4271898624, + "step": 2037 + }, + { + "epoch": 0.3357743422836995, + "grad_norm": 18.561599731445312, + "learning_rate": 5.08881221207708e-07, + "loss": 0.519, + "num_input_tokens_seen": 4273995776, + "step": 2038 + }, + { + "epoch": 0.3360455655004068, + "grad_norm": 15.325385093688965, + "learning_rate": 5.084792382415141e-07, + "loss": 0.2667, + "num_input_tokens_seen": 4276092928, + "step": 2039 + }, + { + "epoch": 0.3363167887171142, + "grad_norm": 15.208858489990234, + "learning_rate": 5.080772886896493e-07, + "loss": 0.429, + "num_input_tokens_seen": 4278190080, + "step": 2040 + }, + { + "epoch": 0.33658801193382154, + "grad_norm": 13.985490798950195, + "learning_rate": 5.076753728755871e-07, + "loss": 0.2648, + "num_input_tokens_seen": 4280287232, + "step": 2041 + }, + { + "epoch": 0.3368592351505289, + "grad_norm": 19.45819664001465, + "learning_rate": 5.072734911227746e-07, + "loss": 0.3776, + "num_input_tokens_seen": 4282384384, + "step": 2042 + }, + { + "epoch": 0.33713045836723626, + "grad_norm": 11.178092956542969, + "learning_rate": 5.068716437546305e-07, + "loss": 0.1879, + "num_input_tokens_seen": 4284481536, + "step": 2043 + }, + { + "epoch": 0.33740168158394357, + "grad_norm": 19.116785049438477, + "learning_rate": 5.06469831094546e-07, + "loss": 0.5829, + "num_input_tokens_seen": 4286578688, + "step": 2044 + }, + { + "epoch": 0.33767290480065093, + "grad_norm": 17.29070281982422, + "learning_rate": 5.060680534658852e-07, + "loss": 0.4346, + "num_input_tokens_seen": 4288675840, + "step": 2045 + }, + { + "epoch": 0.3379441280173583, + "grad_norm": 11.40611457824707, + "learning_rate": 5.056663111919833e-07, + "loss": 0.3758, + "num_input_tokens_seen": 4290772992, + "step": 2046 + }, + { + "epoch": 0.33821535123406565, + "grad_norm": 10.537538528442383, + "learning_rate": 5.052646045961471e-07, + "loss": 0.2305, + "num_input_tokens_seen": 4292870144, + "step": 2047 + }, + { + "epoch": 0.33848657445077296, + "grad_norm": 15.972824096679688, + "learning_rate": 5.048629340016548e-07, + "loss": 0.3792, + "num_input_tokens_seen": 4294967296, + "step": 2048 + }, + { + "epoch": 0.3387577976674803, + "grad_norm": 13.639211654663086, + "learning_rate": 5.044612997317557e-07, + "loss": 0.3346, + "num_input_tokens_seen": 4297064448, + "step": 2049 + }, + { + "epoch": 0.3390290208841877, + "grad_norm": 17.853118896484375, + "learning_rate": 5.040597021096695e-07, + "loss": 0.6396, + "num_input_tokens_seen": 4299161600, + "step": 2050 + }, + { + "epoch": 0.33930024410089504, + "grad_norm": 18.226207733154297, + "learning_rate": 5.03658141458587e-07, + "loss": 0.4423, + "num_input_tokens_seen": 4301258752, + "step": 2051 + }, + { + "epoch": 0.3395714673176024, + "grad_norm": 14.695561408996582, + "learning_rate": 5.032566181016688e-07, + "loss": 0.3318, + "num_input_tokens_seen": 4303355904, + "step": 2052 + }, + { + "epoch": 0.3398426905343097, + "grad_norm": 18.532405853271484, + "learning_rate": 5.028551323620458e-07, + "loss": 0.6093, + "num_input_tokens_seen": 4305453056, + "step": 2053 + }, + { + "epoch": 0.34011391375101707, + "grad_norm": 15.558454513549805, + "learning_rate": 5.024536845628181e-07, + "loss": 0.4432, + "num_input_tokens_seen": 4307550208, + "step": 2054 + }, + { + "epoch": 0.34038513696772443, + "grad_norm": 22.573936462402344, + "learning_rate": 5.020522750270558e-07, + "loss": 0.6549, + "num_input_tokens_seen": 4309647360, + "step": 2055 + }, + { + "epoch": 0.3406563601844318, + "grad_norm": 13.960949897766113, + "learning_rate": 5.01650904077798e-07, + "loss": 0.27, + "num_input_tokens_seen": 4311744512, + "step": 2056 + }, + { + "epoch": 0.34092758340113916, + "grad_norm": 19.82161521911621, + "learning_rate": 5.012495720380527e-07, + "loss": 0.6896, + "num_input_tokens_seen": 4313841664, + "step": 2057 + }, + { + "epoch": 0.34119880661784646, + "grad_norm": 13.285265922546387, + "learning_rate": 5.008482792307968e-07, + "loss": 0.3401, + "num_input_tokens_seen": 4315938816, + "step": 2058 + }, + { + "epoch": 0.3414700298345538, + "grad_norm": 12.160155296325684, + "learning_rate": 5.004470259789757e-07, + "loss": 0.3066, + "num_input_tokens_seen": 4318035968, + "step": 2059 + }, + { + "epoch": 0.3417412530512612, + "grad_norm": 12.414044380187988, + "learning_rate": 5.000458126055021e-07, + "loss": 0.324, + "num_input_tokens_seen": 4320133120, + "step": 2060 + }, + { + "epoch": 0.34201247626796855, + "grad_norm": 18.17831802368164, + "learning_rate": 4.996446394332578e-07, + "loss": 0.5181, + "num_input_tokens_seen": 4322230272, + "step": 2061 + }, + { + "epoch": 0.3422836994846759, + "grad_norm": 14.670184135437012, + "learning_rate": 4.992435067850914e-07, + "loss": 0.3709, + "num_input_tokens_seen": 4324327424, + "step": 2062 + }, + { + "epoch": 0.3425549227013832, + "grad_norm": 17.332820892333984, + "learning_rate": 4.988424149838192e-07, + "loss": 0.5221, + "num_input_tokens_seen": 4326424576, + "step": 2063 + }, + { + "epoch": 0.3428261459180906, + "grad_norm": 22.735279083251953, + "learning_rate": 4.984413643522251e-07, + "loss": 0.7608, + "num_input_tokens_seen": 4328521728, + "step": 2064 + }, + { + "epoch": 0.34309736913479794, + "grad_norm": 14.321521759033203, + "learning_rate": 4.980403552130589e-07, + "loss": 0.3522, + "num_input_tokens_seen": 4330618880, + "step": 2065 + }, + { + "epoch": 0.3433685923515053, + "grad_norm": 16.680944442749023, + "learning_rate": 4.976393878890375e-07, + "loss": 0.4359, + "num_input_tokens_seen": 4332716032, + "step": 2066 + }, + { + "epoch": 0.34363981556821266, + "grad_norm": 15.195837020874023, + "learning_rate": 4.972384627028442e-07, + "loss": 0.4501, + "num_input_tokens_seen": 4334813184, + "step": 2067 + }, + { + "epoch": 0.34391103878491996, + "grad_norm": 13.431507110595703, + "learning_rate": 4.968375799771285e-07, + "loss": 0.3573, + "num_input_tokens_seen": 4336910336, + "step": 2068 + }, + { + "epoch": 0.3441822620016273, + "grad_norm": 12.854487419128418, + "learning_rate": 4.964367400345053e-07, + "loss": 0.3762, + "num_input_tokens_seen": 4339007488, + "step": 2069 + }, + { + "epoch": 0.3444534852183347, + "grad_norm": 17.647762298583984, + "learning_rate": 4.960359431975555e-07, + "loss": 0.7081, + "num_input_tokens_seen": 4341104640, + "step": 2070 + }, + { + "epoch": 0.34472470843504205, + "grad_norm": 21.555879592895508, + "learning_rate": 4.956351897888256e-07, + "loss": 0.7699, + "num_input_tokens_seen": 4343201792, + "step": 2071 + }, + { + "epoch": 0.3449959316517494, + "grad_norm": 17.82149887084961, + "learning_rate": 4.952344801308258e-07, + "loss": 0.5833, + "num_input_tokens_seen": 4345298944, + "step": 2072 + }, + { + "epoch": 0.3452671548684567, + "grad_norm": 13.02746295928955, + "learning_rate": 4.948338145460324e-07, + "loss": 0.3707, + "num_input_tokens_seen": 4347396096, + "step": 2073 + }, + { + "epoch": 0.3455383780851641, + "grad_norm": 18.583114624023438, + "learning_rate": 4.944331933568858e-07, + "loss": 0.6831, + "num_input_tokens_seen": 4349493248, + "step": 2074 + }, + { + "epoch": 0.34580960130187144, + "grad_norm": 18.889436721801758, + "learning_rate": 4.94032616885791e-07, + "loss": 0.618, + "num_input_tokens_seen": 4351590400, + "step": 2075 + }, + { + "epoch": 0.3460808245185788, + "grad_norm": 14.465885162353516, + "learning_rate": 4.936320854551163e-07, + "loss": 0.4065, + "num_input_tokens_seen": 4353687552, + "step": 2076 + }, + { + "epoch": 0.34635204773528616, + "grad_norm": 11.185941696166992, + "learning_rate": 4.932315993871942e-07, + "loss": 0.2518, + "num_input_tokens_seen": 4355784704, + "step": 2077 + }, + { + "epoch": 0.34662327095199347, + "grad_norm": 14.272321701049805, + "learning_rate": 4.928311590043206e-07, + "loss": 0.3067, + "num_input_tokens_seen": 4357881856, + "step": 2078 + }, + { + "epoch": 0.34689449416870083, + "grad_norm": 23.103809356689453, + "learning_rate": 4.924307646287549e-07, + "loss": 0.6866, + "num_input_tokens_seen": 4359979008, + "step": 2079 + }, + { + "epoch": 0.3471657173854082, + "grad_norm": 24.843984603881836, + "learning_rate": 4.920304165827192e-07, + "loss": 0.7426, + "num_input_tokens_seen": 4362076160, + "step": 2080 + }, + { + "epoch": 0.34743694060211555, + "grad_norm": 15.894787788391113, + "learning_rate": 4.916301151883983e-07, + "loss": 0.4618, + "num_input_tokens_seen": 4364173312, + "step": 2081 + }, + { + "epoch": 0.3477081638188229, + "grad_norm": 19.025161743164062, + "learning_rate": 4.912298607679396e-07, + "loss": 0.6288, + "num_input_tokens_seen": 4366270464, + "step": 2082 + }, + { + "epoch": 0.3479793870355302, + "grad_norm": 17.685871124267578, + "learning_rate": 4.908296536434525e-07, + "loss": 0.4496, + "num_input_tokens_seen": 4368367616, + "step": 2083 + }, + { + "epoch": 0.3482506102522376, + "grad_norm": 26.235916137695312, + "learning_rate": 4.904294941370086e-07, + "loss": 0.7546, + "num_input_tokens_seen": 4370464768, + "step": 2084 + }, + { + "epoch": 0.34852183346894494, + "grad_norm": 12.8169584274292, + "learning_rate": 4.900293825706408e-07, + "loss": 0.2447, + "num_input_tokens_seen": 4372561920, + "step": 2085 + }, + { + "epoch": 0.3487930566856523, + "grad_norm": 16.72005271911621, + "learning_rate": 4.89629319266344e-07, + "loss": 0.4664, + "num_input_tokens_seen": 4374659072, + "step": 2086 + }, + { + "epoch": 0.34906427990235966, + "grad_norm": 14.927910804748535, + "learning_rate": 4.892293045460738e-07, + "loss": 0.3813, + "num_input_tokens_seen": 4376756224, + "step": 2087 + }, + { + "epoch": 0.34933550311906697, + "grad_norm": 14.392501831054688, + "learning_rate": 4.88829338731747e-07, + "loss": 0.3661, + "num_input_tokens_seen": 4378853376, + "step": 2088 + }, + { + "epoch": 0.34960672633577433, + "grad_norm": 15.27480697631836, + "learning_rate": 4.884294221452405e-07, + "loss": 0.3709, + "num_input_tokens_seen": 4380950528, + "step": 2089 + }, + { + "epoch": 0.3498779495524817, + "grad_norm": 16.14817237854004, + "learning_rate": 4.88029555108392e-07, + "loss": 0.58, + "num_input_tokens_seen": 4383047680, + "step": 2090 + }, + { + "epoch": 0.35014917276918905, + "grad_norm": 11.918871879577637, + "learning_rate": 4.876297379429995e-07, + "loss": 0.2754, + "num_input_tokens_seen": 4385144832, + "step": 2091 + }, + { + "epoch": 0.3504203959858964, + "grad_norm": 24.65549659729004, + "learning_rate": 4.872299709708206e-07, + "loss": 0.453, + "num_input_tokens_seen": 4387241984, + "step": 2092 + }, + { + "epoch": 0.3506916192026037, + "grad_norm": 13.130111694335938, + "learning_rate": 4.868302545135725e-07, + "loss": 0.3193, + "num_input_tokens_seen": 4389339136, + "step": 2093 + }, + { + "epoch": 0.3509628424193111, + "grad_norm": 22.114221572875977, + "learning_rate": 4.864305888929318e-07, + "loss": 0.9149, + "num_input_tokens_seen": 4391436288, + "step": 2094 + }, + { + "epoch": 0.35123406563601844, + "grad_norm": 22.851913452148438, + "learning_rate": 4.860309744305339e-07, + "loss": 0.7968, + "num_input_tokens_seen": 4393533440, + "step": 2095 + }, + { + "epoch": 0.3515052888527258, + "grad_norm": 19.785030364990234, + "learning_rate": 4.856314114479738e-07, + "loss": 0.7142, + "num_input_tokens_seen": 4395630592, + "step": 2096 + }, + { + "epoch": 0.35177651206943317, + "grad_norm": 13.180719375610352, + "learning_rate": 4.852319002668044e-07, + "loss": 0.3404, + "num_input_tokens_seen": 4397727744, + "step": 2097 + }, + { + "epoch": 0.3520477352861405, + "grad_norm": 18.110231399536133, + "learning_rate": 4.848324412085367e-07, + "loss": 0.5252, + "num_input_tokens_seen": 4399824896, + "step": 2098 + }, + { + "epoch": 0.35231895850284783, + "grad_norm": 18.176883697509766, + "learning_rate": 4.84433034594641e-07, + "loss": 0.4975, + "num_input_tokens_seen": 4401922048, + "step": 2099 + }, + { + "epoch": 0.3525901817195552, + "grad_norm": 16.192501068115234, + "learning_rate": 4.840336807465439e-07, + "loss": 0.4609, + "num_input_tokens_seen": 4404019200, + "step": 2100 + }, + { + "epoch": 0.35286140493626256, + "grad_norm": 15.730474472045898, + "learning_rate": 4.836343799856302e-07, + "loss": 0.4662, + "num_input_tokens_seen": 4406116352, + "step": 2101 + }, + { + "epoch": 0.3531326281529699, + "grad_norm": 9.630202293395996, + "learning_rate": 4.83235132633242e-07, + "loss": 0.2086, + "num_input_tokens_seen": 4408213504, + "step": 2102 + }, + { + "epoch": 0.3534038513696772, + "grad_norm": 10.681892395019531, + "learning_rate": 4.828359390106786e-07, + "loss": 0.2325, + "num_input_tokens_seen": 4410310656, + "step": 2103 + }, + { + "epoch": 0.3536750745863846, + "grad_norm": 15.098684310913086, + "learning_rate": 4.824367994391958e-07, + "loss": 0.4176, + "num_input_tokens_seen": 4412407808, + "step": 2104 + }, + { + "epoch": 0.35394629780309195, + "grad_norm": 11.902873039245605, + "learning_rate": 4.820377142400058e-07, + "loss": 0.2685, + "num_input_tokens_seen": 4414504960, + "step": 2105 + }, + { + "epoch": 0.3542175210197993, + "grad_norm": 19.398834228515625, + "learning_rate": 4.816386837342771e-07, + "loss": 0.5198, + "num_input_tokens_seen": 4416602112, + "step": 2106 + }, + { + "epoch": 0.35448874423650667, + "grad_norm": 21.086822509765625, + "learning_rate": 4.812397082431345e-07, + "loss": 0.5469, + "num_input_tokens_seen": 4418699264, + "step": 2107 + }, + { + "epoch": 0.354759967453214, + "grad_norm": 21.071117401123047, + "learning_rate": 4.808407880876581e-07, + "loss": 0.6044, + "num_input_tokens_seen": 4420796416, + "step": 2108 + }, + { + "epoch": 0.35503119066992134, + "grad_norm": 17.908489227294922, + "learning_rate": 4.804419235888838e-07, + "loss": 0.5796, + "num_input_tokens_seen": 4422893568, + "step": 2109 + }, + { + "epoch": 0.3553024138866287, + "grad_norm": 15.03857135772705, + "learning_rate": 4.800431150678026e-07, + "loss": 0.4045, + "num_input_tokens_seen": 4424990720, + "step": 2110 + }, + { + "epoch": 0.35557363710333606, + "grad_norm": 22.05478858947754, + "learning_rate": 4.796443628453603e-07, + "loss": 0.6847, + "num_input_tokens_seen": 4427087872, + "step": 2111 + }, + { + "epoch": 0.3558448603200434, + "grad_norm": 18.630102157592773, + "learning_rate": 4.792456672424574e-07, + "loss": 0.5185, + "num_input_tokens_seen": 4429185024, + "step": 2112 + }, + { + "epoch": 0.3561160835367507, + "grad_norm": 10.600038528442383, + "learning_rate": 4.788470285799492e-07, + "loss": 0.2514, + "num_input_tokens_seen": 4431282176, + "step": 2113 + }, + { + "epoch": 0.3563873067534581, + "grad_norm": 15.612627029418945, + "learning_rate": 4.784484471786447e-07, + "loss": 0.3868, + "num_input_tokens_seen": 4433379328, + "step": 2114 + }, + { + "epoch": 0.35665852997016545, + "grad_norm": 17.270404815673828, + "learning_rate": 4.780499233593073e-07, + "loss": 0.5039, + "num_input_tokens_seen": 4435476480, + "step": 2115 + }, + { + "epoch": 0.3569297531868728, + "grad_norm": 17.62727928161621, + "learning_rate": 4.776514574426538e-07, + "loss": 0.4082, + "num_input_tokens_seen": 4437573632, + "step": 2116 + }, + { + "epoch": 0.3572009764035802, + "grad_norm": 15.866716384887695, + "learning_rate": 4.772530497493541e-07, + "loss": 0.3864, + "num_input_tokens_seen": 4439670784, + "step": 2117 + }, + { + "epoch": 0.3574721996202875, + "grad_norm": 17.658111572265625, + "learning_rate": 4.768547006000317e-07, + "loss": 0.505, + "num_input_tokens_seen": 4441767936, + "step": 2118 + }, + { + "epoch": 0.35774342283699484, + "grad_norm": 17.13514518737793, + "learning_rate": 4.76456410315263e-07, + "loss": 0.5577, + "num_input_tokens_seen": 4443865088, + "step": 2119 + }, + { + "epoch": 0.3580146460537022, + "grad_norm": 12.680811882019043, + "learning_rate": 4.7605817921557666e-07, + "loss": 0.3055, + "num_input_tokens_seen": 4445962240, + "step": 2120 + }, + { + "epoch": 0.35828586927040956, + "grad_norm": 10.903252601623535, + "learning_rate": 4.756600076214541e-07, + "loss": 0.3165, + "num_input_tokens_seen": 4448059392, + "step": 2121 + }, + { + "epoch": 0.3585570924871169, + "grad_norm": 20.912986755371094, + "learning_rate": 4.7526189585332855e-07, + "loss": 0.489, + "num_input_tokens_seen": 4450156544, + "step": 2122 + }, + { + "epoch": 0.35882831570382423, + "grad_norm": 19.84836196899414, + "learning_rate": 4.748638442315851e-07, + "loss": 0.7097, + "num_input_tokens_seen": 4452253696, + "step": 2123 + }, + { + "epoch": 0.3590995389205316, + "grad_norm": 14.485566139221191, + "learning_rate": 4.7446585307656074e-07, + "loss": 0.4103, + "num_input_tokens_seen": 4454350848, + "step": 2124 + }, + { + "epoch": 0.35937076213723895, + "grad_norm": 17.28524398803711, + "learning_rate": 4.740679227085436e-07, + "loss": 0.6003, + "num_input_tokens_seen": 4456448000, + "step": 2125 + }, + { + "epoch": 0.3596419853539463, + "grad_norm": 17.543121337890625, + "learning_rate": 4.7367005344777255e-07, + "loss": 0.4924, + "num_input_tokens_seen": 4458545152, + "step": 2126 + }, + { + "epoch": 0.3599132085706537, + "grad_norm": 15.53244400024414, + "learning_rate": 4.732722456144381e-07, + "loss": 0.4224, + "num_input_tokens_seen": 4460642304, + "step": 2127 + }, + { + "epoch": 0.360184431787361, + "grad_norm": 15.329564094543457, + "learning_rate": 4.7287449952868084e-07, + "loss": 0.4343, + "num_input_tokens_seen": 4462739456, + "step": 2128 + }, + { + "epoch": 0.36045565500406834, + "grad_norm": 11.179370880126953, + "learning_rate": 4.724768155105914e-07, + "loss": 0.2533, + "num_input_tokens_seen": 4464836608, + "step": 2129 + }, + { + "epoch": 0.3607268782207757, + "grad_norm": 19.77837371826172, + "learning_rate": 4.7207919388021076e-07, + "loss": 0.6616, + "num_input_tokens_seen": 4466933760, + "step": 2130 + }, + { + "epoch": 0.36099810143748307, + "grad_norm": 18.568798065185547, + "learning_rate": 4.7168163495752977e-07, + "loss": 0.4892, + "num_input_tokens_seen": 4469030912, + "step": 2131 + }, + { + "epoch": 0.36126932465419037, + "grad_norm": 21.5096492767334, + "learning_rate": 4.7128413906248885e-07, + "loss": 0.5263, + "num_input_tokens_seen": 4471128064, + "step": 2132 + }, + { + "epoch": 0.36154054787089773, + "grad_norm": 14.280765533447266, + "learning_rate": 4.708867065149775e-07, + "loss": 0.3734, + "num_input_tokens_seen": 4473225216, + "step": 2133 + }, + { + "epoch": 0.3618117710876051, + "grad_norm": 21.687335968017578, + "learning_rate": 4.704893376348342e-07, + "loss": 0.5905, + "num_input_tokens_seen": 4475322368, + "step": 2134 + }, + { + "epoch": 0.36208299430431246, + "grad_norm": 18.599130630493164, + "learning_rate": 4.700920327418463e-07, + "loss": 0.6312, + "num_input_tokens_seen": 4477419520, + "step": 2135 + }, + { + "epoch": 0.3623542175210198, + "grad_norm": 15.053811073303223, + "learning_rate": 4.696947921557498e-07, + "loss": 0.3828, + "num_input_tokens_seen": 4479516672, + "step": 2136 + }, + { + "epoch": 0.3626254407377271, + "grad_norm": 15.113408088684082, + "learning_rate": 4.6929761619622866e-07, + "loss": 0.443, + "num_input_tokens_seen": 4481613824, + "step": 2137 + }, + { + "epoch": 0.3628966639544345, + "grad_norm": 14.625730514526367, + "learning_rate": 4.68900505182915e-07, + "loss": 0.3773, + "num_input_tokens_seen": 4483710976, + "step": 2138 + }, + { + "epoch": 0.36316788717114185, + "grad_norm": 13.605332374572754, + "learning_rate": 4.6850345943538896e-07, + "loss": 0.2264, + "num_input_tokens_seen": 4485808128, + "step": 2139 + }, + { + "epoch": 0.3634391103878492, + "grad_norm": 18.40673065185547, + "learning_rate": 4.6810647927317735e-07, + "loss": 0.5183, + "num_input_tokens_seen": 4487905280, + "step": 2140 + }, + { + "epoch": 0.36371033360455657, + "grad_norm": 17.36908721923828, + "learning_rate": 4.677095650157551e-07, + "loss": 0.4819, + "num_input_tokens_seen": 4490002432, + "step": 2141 + }, + { + "epoch": 0.3639815568212639, + "grad_norm": 17.948476791381836, + "learning_rate": 4.6731271698254326e-07, + "loss": 0.5503, + "num_input_tokens_seen": 4492099584, + "step": 2142 + }, + { + "epoch": 0.36425278003797124, + "grad_norm": 22.524932861328125, + "learning_rate": 4.669159354929105e-07, + "loss": 0.9079, + "num_input_tokens_seen": 4494196736, + "step": 2143 + }, + { + "epoch": 0.3645240032546786, + "grad_norm": 11.158336639404297, + "learning_rate": 4.6651922086617134e-07, + "loss": 0.2876, + "num_input_tokens_seen": 4496293888, + "step": 2144 + }, + { + "epoch": 0.36479522647138596, + "grad_norm": 16.561214447021484, + "learning_rate": 4.661225734215867e-07, + "loss": 0.4339, + "num_input_tokens_seen": 4498391040, + "step": 2145 + }, + { + "epoch": 0.3650664496880933, + "grad_norm": 11.833779335021973, + "learning_rate": 4.657259934783628e-07, + "loss": 0.2669, + "num_input_tokens_seen": 4500488192, + "step": 2146 + }, + { + "epoch": 0.3653376729048006, + "grad_norm": 15.685547828674316, + "learning_rate": 4.6532948135565264e-07, + "loss": 0.5366, + "num_input_tokens_seen": 4502585344, + "step": 2147 + }, + { + "epoch": 0.365608896121508, + "grad_norm": 17.699525833129883, + "learning_rate": 4.6493303737255397e-07, + "loss": 0.4777, + "num_input_tokens_seen": 4504682496, + "step": 2148 + }, + { + "epoch": 0.36588011933821535, + "grad_norm": 18.16535186767578, + "learning_rate": 4.645366618481098e-07, + "loss": 0.6268, + "num_input_tokens_seen": 4506779648, + "step": 2149 + }, + { + "epoch": 0.3661513425549227, + "grad_norm": 18.208988189697266, + "learning_rate": 4.641403551013081e-07, + "loss": 0.729, + "num_input_tokens_seen": 4508876800, + "step": 2150 + }, + { + "epoch": 0.36642256577163007, + "grad_norm": 14.158181190490723, + "learning_rate": 4.637441174510813e-07, + "loss": 0.3639, + "num_input_tokens_seen": 4510973952, + "step": 2151 + }, + { + "epoch": 0.3666937889883374, + "grad_norm": 15.75287914276123, + "learning_rate": 4.633479492163066e-07, + "loss": 0.4579, + "num_input_tokens_seen": 4513071104, + "step": 2152 + }, + { + "epoch": 0.36696501220504474, + "grad_norm": 14.739073753356934, + "learning_rate": 4.6295185071580487e-07, + "loss": 0.413, + "num_input_tokens_seen": 4515168256, + "step": 2153 + }, + { + "epoch": 0.3672362354217521, + "grad_norm": 14.379277229309082, + "learning_rate": 4.6255582226834133e-07, + "loss": 0.4131, + "num_input_tokens_seen": 4517265408, + "step": 2154 + }, + { + "epoch": 0.36750745863845946, + "grad_norm": 19.29395294189453, + "learning_rate": 4.6215986419262444e-07, + "loss": 0.5776, + "num_input_tokens_seen": 4519362560, + "step": 2155 + }, + { + "epoch": 0.3677786818551668, + "grad_norm": 22.25945281982422, + "learning_rate": 4.617639768073066e-07, + "loss": 0.9065, + "num_input_tokens_seen": 4521459712, + "step": 2156 + }, + { + "epoch": 0.36804990507187413, + "grad_norm": 11.623411178588867, + "learning_rate": 4.613681604309824e-07, + "loss": 0.302, + "num_input_tokens_seen": 4523556864, + "step": 2157 + }, + { + "epoch": 0.3683211282885815, + "grad_norm": 19.57111358642578, + "learning_rate": 4.609724153821898e-07, + "loss": 0.5756, + "num_input_tokens_seen": 4525654016, + "step": 2158 + }, + { + "epoch": 0.36859235150528885, + "grad_norm": 19.20432472229004, + "learning_rate": 4.6057674197940974e-07, + "loss": 0.4861, + "num_input_tokens_seen": 4527751168, + "step": 2159 + }, + { + "epoch": 0.3688635747219962, + "grad_norm": 12.728202819824219, + "learning_rate": 4.6018114054106494e-07, + "loss": 0.3036, + "num_input_tokens_seen": 4529848320, + "step": 2160 + }, + { + "epoch": 0.3691347979387036, + "grad_norm": 15.91971492767334, + "learning_rate": 4.597856113855203e-07, + "loss": 0.3981, + "num_input_tokens_seen": 4531945472, + "step": 2161 + }, + { + "epoch": 0.3694060211554109, + "grad_norm": 20.145366668701172, + "learning_rate": 4.593901548310828e-07, + "loss": 0.6073, + "num_input_tokens_seen": 4534042624, + "step": 2162 + }, + { + "epoch": 0.36967724437211824, + "grad_norm": 18.48992156982422, + "learning_rate": 4.5899477119600073e-07, + "loss": 0.4431, + "num_input_tokens_seen": 4536139776, + "step": 2163 + }, + { + "epoch": 0.3699484675888256, + "grad_norm": 20.453821182250977, + "learning_rate": 4.585994607984637e-07, + "loss": 0.4718, + "num_input_tokens_seen": 4538236928, + "step": 2164 + }, + { + "epoch": 0.37021969080553296, + "grad_norm": 19.20899772644043, + "learning_rate": 4.582042239566026e-07, + "loss": 0.555, + "num_input_tokens_seen": 4540334080, + "step": 2165 + }, + { + "epoch": 0.3704909140222403, + "grad_norm": 14.162814140319824, + "learning_rate": 4.578090609884889e-07, + "loss": 0.3494, + "num_input_tokens_seen": 4542431232, + "step": 2166 + }, + { + "epoch": 0.37076213723894763, + "grad_norm": 16.07235336303711, + "learning_rate": 4.57413972212135e-07, + "loss": 0.3046, + "num_input_tokens_seen": 4544528384, + "step": 2167 + }, + { + "epoch": 0.371033360455655, + "grad_norm": 18.295516967773438, + "learning_rate": 4.5701895794549293e-07, + "loss": 0.403, + "num_input_tokens_seen": 4546625536, + "step": 2168 + }, + { + "epoch": 0.37130458367236235, + "grad_norm": 19.14270782470703, + "learning_rate": 4.566240185064554e-07, + "loss": 0.3592, + "num_input_tokens_seen": 4548722688, + "step": 2169 + }, + { + "epoch": 0.3715758068890697, + "grad_norm": 17.336620330810547, + "learning_rate": 4.5622915421285446e-07, + "loss": 0.5408, + "num_input_tokens_seen": 4550819840, + "step": 2170 + }, + { + "epoch": 0.3718470301057771, + "grad_norm": 7.247825622558594, + "learning_rate": 4.558343653824619e-07, + "loss": 0.0982, + "num_input_tokens_seen": 4552916992, + "step": 2171 + }, + { + "epoch": 0.3721182533224844, + "grad_norm": 14.099793434143066, + "learning_rate": 4.55439652332989e-07, + "loss": 0.3333, + "num_input_tokens_seen": 4555014144, + "step": 2172 + }, + { + "epoch": 0.37238947653919174, + "grad_norm": 17.59613800048828, + "learning_rate": 4.550450153820857e-07, + "loss": 0.4548, + "num_input_tokens_seen": 4557111296, + "step": 2173 + }, + { + "epoch": 0.3726606997558991, + "grad_norm": 12.305207252502441, + "learning_rate": 4.5465045484734044e-07, + "loss": 0.3559, + "num_input_tokens_seen": 4559208448, + "step": 2174 + }, + { + "epoch": 0.37293192297260647, + "grad_norm": 13.379070281982422, + "learning_rate": 4.5425597104628086e-07, + "loss": 0.3236, + "num_input_tokens_seen": 4561305600, + "step": 2175 + }, + { + "epoch": 0.37320314618931383, + "grad_norm": 15.527801513671875, + "learning_rate": 4.5386156429637256e-07, + "loss": 0.4341, + "num_input_tokens_seen": 4563402752, + "step": 2176 + }, + { + "epoch": 0.37347436940602113, + "grad_norm": 22.03895378112793, + "learning_rate": 4.534672349150189e-07, + "loss": 0.3883, + "num_input_tokens_seen": 4565499904, + "step": 2177 + }, + { + "epoch": 0.3737455926227285, + "grad_norm": 15.303971290588379, + "learning_rate": 4.530729832195612e-07, + "loss": 0.311, + "num_input_tokens_seen": 4567597056, + "step": 2178 + }, + { + "epoch": 0.37401681583943586, + "grad_norm": 15.900908470153809, + "learning_rate": 4.5267880952727845e-07, + "loss": 0.3807, + "num_input_tokens_seen": 4569694208, + "step": 2179 + }, + { + "epoch": 0.3742880390561432, + "grad_norm": 26.075777053833008, + "learning_rate": 4.5228471415538637e-07, + "loss": 0.9865, + "num_input_tokens_seen": 4571791360, + "step": 2180 + }, + { + "epoch": 0.3745592622728506, + "grad_norm": 18.148820877075195, + "learning_rate": 4.518906974210379e-07, + "loss": 0.3947, + "num_input_tokens_seen": 4573888512, + "step": 2181 + }, + { + "epoch": 0.3748304854895579, + "grad_norm": 32.52322006225586, + "learning_rate": 4.514967596413228e-07, + "loss": 0.4087, + "num_input_tokens_seen": 4575985664, + "step": 2182 + }, + { + "epoch": 0.37510170870626525, + "grad_norm": 14.72100830078125, + "learning_rate": 4.511029011332672e-07, + "loss": 0.3748, + "num_input_tokens_seen": 4578082816, + "step": 2183 + }, + { + "epoch": 0.3753729319229726, + "grad_norm": 18.871889114379883, + "learning_rate": 4.507091222138335e-07, + "loss": 0.5725, + "num_input_tokens_seen": 4580179968, + "step": 2184 + }, + { + "epoch": 0.37564415513967997, + "grad_norm": 15.662956237792969, + "learning_rate": 4.503154231999202e-07, + "loss": 0.2177, + "num_input_tokens_seen": 4582277120, + "step": 2185 + }, + { + "epoch": 0.37591537835638733, + "grad_norm": 23.3785343170166, + "learning_rate": 4.499218044083608e-07, + "loss": 0.799, + "num_input_tokens_seen": 4584374272, + "step": 2186 + }, + { + "epoch": 0.37618660157309464, + "grad_norm": 15.690712928771973, + "learning_rate": 4.495282661559248e-07, + "loss": 0.4064, + "num_input_tokens_seen": 4586471424, + "step": 2187 + }, + { + "epoch": 0.376457824789802, + "grad_norm": 17.365020751953125, + "learning_rate": 4.491348087593171e-07, + "loss": 0.5982, + "num_input_tokens_seen": 4588568576, + "step": 2188 + }, + { + "epoch": 0.37672904800650936, + "grad_norm": 12.830045700073242, + "learning_rate": 4.4874143253517706e-07, + "loss": 0.2989, + "num_input_tokens_seen": 4590665728, + "step": 2189 + }, + { + "epoch": 0.3770002712232167, + "grad_norm": 21.898000717163086, + "learning_rate": 4.483481378000791e-07, + "loss": 0.7782, + "num_input_tokens_seen": 4592762880, + "step": 2190 + }, + { + "epoch": 0.3772714944399241, + "grad_norm": 17.4901065826416, + "learning_rate": 4.4795492487053155e-07, + "loss": 0.5265, + "num_input_tokens_seen": 4594860032, + "step": 2191 + }, + { + "epoch": 0.3775427176566314, + "grad_norm": 11.635929107666016, + "learning_rate": 4.4756179406297744e-07, + "loss": 0.3187, + "num_input_tokens_seen": 4596957184, + "step": 2192 + }, + { + "epoch": 0.37781394087333875, + "grad_norm": 16.157140731811523, + "learning_rate": 4.4716874569379336e-07, + "loss": 0.3864, + "num_input_tokens_seen": 4599054336, + "step": 2193 + }, + { + "epoch": 0.3780851640900461, + "grad_norm": 15.90643310546875, + "learning_rate": 4.467757800792897e-07, + "loss": 0.3689, + "num_input_tokens_seen": 4601151488, + "step": 2194 + }, + { + "epoch": 0.37835638730675347, + "grad_norm": 20.70597267150879, + "learning_rate": 4.4638289753571025e-07, + "loss": 0.6203, + "num_input_tokens_seen": 4603248640, + "step": 2195 + }, + { + "epoch": 0.37862761052346083, + "grad_norm": 13.205007553100586, + "learning_rate": 4.459900983792321e-07, + "loss": 0.3584, + "num_input_tokens_seen": 4605345792, + "step": 2196 + }, + { + "epoch": 0.37889883374016814, + "grad_norm": 15.028583526611328, + "learning_rate": 4.455973829259648e-07, + "loss": 0.2914, + "num_input_tokens_seen": 4607442944, + "step": 2197 + }, + { + "epoch": 0.3791700569568755, + "grad_norm": 14.172557830810547, + "learning_rate": 4.4520475149195093e-07, + "loss": 0.4295, + "num_input_tokens_seen": 4609540096, + "step": 2198 + }, + { + "epoch": 0.37944128017358286, + "grad_norm": 11.871790885925293, + "learning_rate": 4.4481220439316514e-07, + "loss": 0.2735, + "num_input_tokens_seen": 4611637248, + "step": 2199 + }, + { + "epoch": 0.3797125033902902, + "grad_norm": 17.767791748046875, + "learning_rate": 4.444197419455147e-07, + "loss": 0.6647, + "num_input_tokens_seen": 4613734400, + "step": 2200 + }, + { + "epoch": 0.3799837266069976, + "grad_norm": 15.653825759887695, + "learning_rate": 4.4402736446483824e-07, + "loss": 0.4934, + "num_input_tokens_seen": 4615831552, + "step": 2201 + }, + { + "epoch": 0.3802549498237049, + "grad_norm": 16.383865356445312, + "learning_rate": 4.436350722669065e-07, + "loss": 0.4065, + "num_input_tokens_seen": 4617928704, + "step": 2202 + }, + { + "epoch": 0.38052617304041225, + "grad_norm": 17.470500946044922, + "learning_rate": 4.43242865667421e-07, + "loss": 0.5536, + "num_input_tokens_seen": 4620025856, + "step": 2203 + }, + { + "epoch": 0.3807973962571196, + "grad_norm": 17.186763763427734, + "learning_rate": 4.428507449820147e-07, + "loss": 0.4773, + "num_input_tokens_seen": 4622123008, + "step": 2204 + }, + { + "epoch": 0.381068619473827, + "grad_norm": 18.592941284179688, + "learning_rate": 4.424587105262516e-07, + "loss": 0.6064, + "num_input_tokens_seen": 4624220160, + "step": 2205 + }, + { + "epoch": 0.38133984269053434, + "grad_norm": 18.531604766845703, + "learning_rate": 4.4206676261562603e-07, + "loss": 0.6243, + "num_input_tokens_seen": 4626317312, + "step": 2206 + }, + { + "epoch": 0.38161106590724164, + "grad_norm": 18.356502532958984, + "learning_rate": 4.4167490156556274e-07, + "loss": 0.3981, + "num_input_tokens_seen": 4628414464, + "step": 2207 + }, + { + "epoch": 0.381882289123949, + "grad_norm": 16.479177474975586, + "learning_rate": 4.4128312769141664e-07, + "loss": 0.4111, + "num_input_tokens_seen": 4630511616, + "step": 2208 + }, + { + "epoch": 0.38215351234065636, + "grad_norm": 13.391749382019043, + "learning_rate": 4.408914413084722e-07, + "loss": 0.3192, + "num_input_tokens_seen": 4632608768, + "step": 2209 + }, + { + "epoch": 0.3824247355573637, + "grad_norm": 24.691661834716797, + "learning_rate": 4.4049984273194406e-07, + "loss": 0.7114, + "num_input_tokens_seen": 4634705920, + "step": 2210 + }, + { + "epoch": 0.3826959587740711, + "grad_norm": 27.44068717956543, + "learning_rate": 4.401083322769756e-07, + "loss": 0.7462, + "num_input_tokens_seen": 4636803072, + "step": 2211 + }, + { + "epoch": 0.3829671819907784, + "grad_norm": 14.547937393188477, + "learning_rate": 4.3971691025863964e-07, + "loss": 0.4238, + "num_input_tokens_seen": 4638900224, + "step": 2212 + }, + { + "epoch": 0.38323840520748575, + "grad_norm": 16.86309242248535, + "learning_rate": 4.3932557699193794e-07, + "loss": 0.5338, + "num_input_tokens_seen": 4640997376, + "step": 2213 + }, + { + "epoch": 0.3835096284241931, + "grad_norm": 13.788909912109375, + "learning_rate": 4.389343327918004e-07, + "loss": 0.3702, + "num_input_tokens_seen": 4643094528, + "step": 2214 + }, + { + "epoch": 0.3837808516409005, + "grad_norm": 18.939809799194336, + "learning_rate": 4.3854317797308515e-07, + "loss": 0.4389, + "num_input_tokens_seen": 4645191680, + "step": 2215 + }, + { + "epoch": 0.3840520748576078, + "grad_norm": 13.284666061401367, + "learning_rate": 4.381521128505791e-07, + "loss": 0.3214, + "num_input_tokens_seen": 4647288832, + "step": 2216 + }, + { + "epoch": 0.38432329807431515, + "grad_norm": 10.879389762878418, + "learning_rate": 4.377611377389966e-07, + "loss": 0.2278, + "num_input_tokens_seen": 4649385984, + "step": 2217 + }, + { + "epoch": 0.3845945212910225, + "grad_norm": 12.832559585571289, + "learning_rate": 4.3737025295297937e-07, + "loss": 0.3289, + "num_input_tokens_seen": 4651483136, + "step": 2218 + }, + { + "epoch": 0.38486574450772987, + "grad_norm": 15.852336883544922, + "learning_rate": 4.3697945880709686e-07, + "loss": 0.5121, + "num_input_tokens_seen": 4653580288, + "step": 2219 + }, + { + "epoch": 0.38513696772443723, + "grad_norm": 19.836444854736328, + "learning_rate": 4.3658875561584494e-07, + "loss": 0.5452, + "num_input_tokens_seen": 4655677440, + "step": 2220 + }, + { + "epoch": 0.38540819094114454, + "grad_norm": 18.720237731933594, + "learning_rate": 4.36198143693647e-07, + "loss": 0.1762, + "num_input_tokens_seen": 4657774592, + "step": 2221 + }, + { + "epoch": 0.3856794141578519, + "grad_norm": 12.053540229797363, + "learning_rate": 4.3580762335485253e-07, + "loss": 0.2582, + "num_input_tokens_seen": 4659871744, + "step": 2222 + }, + { + "epoch": 0.38595063737455926, + "grad_norm": 21.45743751525879, + "learning_rate": 4.3541719491373743e-07, + "loss": 0.7018, + "num_input_tokens_seen": 4661968896, + "step": 2223 + }, + { + "epoch": 0.3862218605912666, + "grad_norm": 18.61931610107422, + "learning_rate": 4.350268586845035e-07, + "loss": 0.686, + "num_input_tokens_seen": 4664066048, + "step": 2224 + }, + { + "epoch": 0.386493083807974, + "grad_norm": 15.874135971069336, + "learning_rate": 4.34636614981279e-07, + "loss": 0.4922, + "num_input_tokens_seen": 4666163200, + "step": 2225 + }, + { + "epoch": 0.3867643070246813, + "grad_norm": 17.982471466064453, + "learning_rate": 4.342464641181166e-07, + "loss": 0.4819, + "num_input_tokens_seen": 4668260352, + "step": 2226 + }, + { + "epoch": 0.38703553024138865, + "grad_norm": 13.340551376342773, + "learning_rate": 4.3385640640899524e-07, + "loss": 0.3803, + "num_input_tokens_seen": 4670357504, + "step": 2227 + }, + { + "epoch": 0.387306753458096, + "grad_norm": 15.015870094299316, + "learning_rate": 4.3346644216781823e-07, + "loss": 0.4039, + "num_input_tokens_seen": 4672454656, + "step": 2228 + }, + { + "epoch": 0.38757797667480337, + "grad_norm": 13.041723251342773, + "learning_rate": 4.3307657170841417e-07, + "loss": 0.2899, + "num_input_tokens_seen": 4674551808, + "step": 2229 + }, + { + "epoch": 0.38784919989151073, + "grad_norm": 21.264972686767578, + "learning_rate": 4.326867953445362e-07, + "loss": 0.7029, + "num_input_tokens_seen": 4676648960, + "step": 2230 + }, + { + "epoch": 0.38812042310821804, + "grad_norm": 17.26335906982422, + "learning_rate": 4.322971133898611e-07, + "loss": 0.4447, + "num_input_tokens_seen": 4678746112, + "step": 2231 + }, + { + "epoch": 0.3883916463249254, + "grad_norm": 13.549259185791016, + "learning_rate": 4.319075261579901e-07, + "loss": 0.3625, + "num_input_tokens_seen": 4680843264, + "step": 2232 + }, + { + "epoch": 0.38866286954163276, + "grad_norm": 17.16545295715332, + "learning_rate": 4.3151803396244833e-07, + "loss": 0.5931, + "num_input_tokens_seen": 4682940416, + "step": 2233 + }, + { + "epoch": 0.3889340927583401, + "grad_norm": 20.503673553466797, + "learning_rate": 4.3112863711668435e-07, + "loss": 0.3044, + "num_input_tokens_seen": 4685037568, + "step": 2234 + }, + { + "epoch": 0.3892053159750475, + "grad_norm": 24.06279182434082, + "learning_rate": 4.3073933593406974e-07, + "loss": 0.9052, + "num_input_tokens_seen": 4687134720, + "step": 2235 + }, + { + "epoch": 0.3894765391917548, + "grad_norm": 13.339831352233887, + "learning_rate": 4.303501307278994e-07, + "loss": 0.3166, + "num_input_tokens_seen": 4689231872, + "step": 2236 + }, + { + "epoch": 0.38974776240846215, + "grad_norm": 14.58847713470459, + "learning_rate": 4.299610218113908e-07, + "loss": 0.4647, + "num_input_tokens_seen": 4691329024, + "step": 2237 + }, + { + "epoch": 0.3900189856251695, + "grad_norm": 16.615280151367188, + "learning_rate": 4.2957200949768414e-07, + "loss": 0.3683, + "num_input_tokens_seen": 4693426176, + "step": 2238 + }, + { + "epoch": 0.3902902088418769, + "grad_norm": 25.591224670410156, + "learning_rate": 4.2918309409984145e-07, + "loss": 0.6683, + "num_input_tokens_seen": 4695523328, + "step": 2239 + }, + { + "epoch": 0.39056143205858423, + "grad_norm": 9.293283462524414, + "learning_rate": 4.2879427593084714e-07, + "loss": 0.1616, + "num_input_tokens_seen": 4697620480, + "step": 2240 + }, + { + "epoch": 0.39083265527529154, + "grad_norm": 16.830848693847656, + "learning_rate": 4.2840555530360756e-07, + "loss": 0.3878, + "num_input_tokens_seen": 4699717632, + "step": 2241 + }, + { + "epoch": 0.3911038784919989, + "grad_norm": 13.508627891540527, + "learning_rate": 4.280169325309502e-07, + "loss": 0.3504, + "num_input_tokens_seen": 4701814784, + "step": 2242 + }, + { + "epoch": 0.39137510170870626, + "grad_norm": 12.19189739227295, + "learning_rate": 4.276284079256235e-07, + "loss": 0.2894, + "num_input_tokens_seen": 4703911936, + "step": 2243 + }, + { + "epoch": 0.3916463249254136, + "grad_norm": 24.510799407958984, + "learning_rate": 4.272399818002974e-07, + "loss": 0.5448, + "num_input_tokens_seen": 4706009088, + "step": 2244 + }, + { + "epoch": 0.391917548142121, + "grad_norm": 17.75606918334961, + "learning_rate": 4.268516544675628e-07, + "loss": 0.2909, + "num_input_tokens_seen": 4708106240, + "step": 2245 + }, + { + "epoch": 0.3921887713588283, + "grad_norm": 16.183746337890625, + "learning_rate": 4.2646342623993035e-07, + "loss": 0.3659, + "num_input_tokens_seen": 4710203392, + "step": 2246 + }, + { + "epoch": 0.39245999457553565, + "grad_norm": 12.006651878356934, + "learning_rate": 4.2607529742983174e-07, + "loss": 0.3025, + "num_input_tokens_seen": 4712300544, + "step": 2247 + }, + { + "epoch": 0.392731217792243, + "grad_norm": 22.091772079467773, + "learning_rate": 4.2568726834961797e-07, + "loss": 0.6958, + "num_input_tokens_seen": 4714397696, + "step": 2248 + }, + { + "epoch": 0.3930024410089504, + "grad_norm": 13.590865135192871, + "learning_rate": 4.252993393115601e-07, + "loss": 0.2623, + "num_input_tokens_seen": 4716494848, + "step": 2249 + }, + { + "epoch": 0.39327366422565774, + "grad_norm": 18.128318786621094, + "learning_rate": 4.249115106278489e-07, + "loss": 0.4497, + "num_input_tokens_seen": 4718592000, + "step": 2250 + }, + { + "epoch": 0.39354488744236504, + "grad_norm": 17.97321319580078, + "learning_rate": 4.2452378261059397e-07, + "loss": 0.5144, + "num_input_tokens_seen": 4720689152, + "step": 2251 + }, + { + "epoch": 0.3938161106590724, + "grad_norm": 17.074485778808594, + "learning_rate": 4.24136155571824e-07, + "loss": 0.4629, + "num_input_tokens_seen": 4722786304, + "step": 2252 + }, + { + "epoch": 0.39408733387577977, + "grad_norm": 16.58645248413086, + "learning_rate": 4.2374862982348657e-07, + "loss": 0.4546, + "num_input_tokens_seen": 4724883456, + "step": 2253 + }, + { + "epoch": 0.3943585570924871, + "grad_norm": 20.560569763183594, + "learning_rate": 4.233612056774477e-07, + "loss": 0.8026, + "num_input_tokens_seen": 4726980608, + "step": 2254 + }, + { + "epoch": 0.3946297803091945, + "grad_norm": 13.33758544921875, + "learning_rate": 4.2297388344549146e-07, + "loss": 0.2919, + "num_input_tokens_seen": 4729077760, + "step": 2255 + }, + { + "epoch": 0.3949010035259018, + "grad_norm": 17.15109634399414, + "learning_rate": 4.2258666343932004e-07, + "loss": 0.4084, + "num_input_tokens_seen": 4731174912, + "step": 2256 + }, + { + "epoch": 0.39517222674260916, + "grad_norm": 16.682199478149414, + "learning_rate": 4.2219954597055354e-07, + "loss": 0.365, + "num_input_tokens_seen": 4733272064, + "step": 2257 + }, + { + "epoch": 0.3954434499593165, + "grad_norm": 13.481849670410156, + "learning_rate": 4.2181253135072925e-07, + "loss": 0.3236, + "num_input_tokens_seen": 4735369216, + "step": 2258 + }, + { + "epoch": 0.3957146731760239, + "grad_norm": 27.910301208496094, + "learning_rate": 4.21425619891302e-07, + "loss": 0.5844, + "num_input_tokens_seen": 4737466368, + "step": 2259 + }, + { + "epoch": 0.39598589639273124, + "grad_norm": 24.167097091674805, + "learning_rate": 4.21038811903643e-07, + "loss": 0.6685, + "num_input_tokens_seen": 4739563520, + "step": 2260 + }, + { + "epoch": 0.39625711960943855, + "grad_norm": 17.239904403686523, + "learning_rate": 4.206521076990409e-07, + "loss": 0.5551, + "num_input_tokens_seen": 4741660672, + "step": 2261 + }, + { + "epoch": 0.3965283428261459, + "grad_norm": 21.12584114074707, + "learning_rate": 4.202655075887005e-07, + "loss": 0.4008, + "num_input_tokens_seen": 4743757824, + "step": 2262 + }, + { + "epoch": 0.39679956604285327, + "grad_norm": 12.652347564697266, + "learning_rate": 4.1987901188374286e-07, + "loss": 0.2911, + "num_input_tokens_seen": 4745854976, + "step": 2263 + }, + { + "epoch": 0.39707078925956063, + "grad_norm": 13.351825714111328, + "learning_rate": 4.194926208952051e-07, + "loss": 0.414, + "num_input_tokens_seen": 4747952128, + "step": 2264 + }, + { + "epoch": 0.397342012476268, + "grad_norm": 13.11346435546875, + "learning_rate": 4.191063349340397e-07, + "loss": 0.3764, + "num_input_tokens_seen": 4750049280, + "step": 2265 + }, + { + "epoch": 0.3976132356929753, + "grad_norm": 13.020018577575684, + "learning_rate": 4.1872015431111505e-07, + "loss": 0.3352, + "num_input_tokens_seen": 4752146432, + "step": 2266 + }, + { + "epoch": 0.39788445890968266, + "grad_norm": 20.551406860351562, + "learning_rate": 4.1833407933721476e-07, + "loss": 0.6615, + "num_input_tokens_seen": 4754243584, + "step": 2267 + }, + { + "epoch": 0.39815568212639, + "grad_norm": 18.05826759338379, + "learning_rate": 4.17948110323037e-07, + "loss": 0.3388, + "num_input_tokens_seen": 4756340736, + "step": 2268 + }, + { + "epoch": 0.3984269053430974, + "grad_norm": 13.186254501342773, + "learning_rate": 4.1756224757919513e-07, + "loss": 0.3536, + "num_input_tokens_seen": 4758437888, + "step": 2269 + }, + { + "epoch": 0.39869812855980474, + "grad_norm": 19.050817489624023, + "learning_rate": 4.17176491416217e-07, + "loss": 0.5714, + "num_input_tokens_seen": 4760535040, + "step": 2270 + }, + { + "epoch": 0.39896935177651205, + "grad_norm": 13.171993255615234, + "learning_rate": 4.1679084214454405e-07, + "loss": 0.2979, + "num_input_tokens_seen": 4762632192, + "step": 2271 + }, + { + "epoch": 0.3992405749932194, + "grad_norm": 19.698951721191406, + "learning_rate": 4.1640530007453245e-07, + "loss": 0.4981, + "num_input_tokens_seen": 4764729344, + "step": 2272 + }, + { + "epoch": 0.39951179820992677, + "grad_norm": 14.195073127746582, + "learning_rate": 4.1601986551645163e-07, + "loss": 0.3392, + "num_input_tokens_seen": 4766826496, + "step": 2273 + }, + { + "epoch": 0.39978302142663413, + "grad_norm": 14.913055419921875, + "learning_rate": 4.156345387804847e-07, + "loss": 0.3579, + "num_input_tokens_seen": 4768923648, + "step": 2274 + }, + { + "epoch": 0.4000542446433415, + "grad_norm": 14.513870239257812, + "learning_rate": 4.152493201767281e-07, + "loss": 0.3177, + "num_input_tokens_seen": 4771020800, + "step": 2275 + }, + { + "epoch": 0.4003254678600488, + "grad_norm": 18.92793083190918, + "learning_rate": 4.1486421001519087e-07, + "loss": 0.6674, + "num_input_tokens_seen": 4773117952, + "step": 2276 + }, + { + "epoch": 0.40059669107675616, + "grad_norm": 14.322976112365723, + "learning_rate": 4.1447920860579524e-07, + "loss": 0.3269, + "num_input_tokens_seen": 4775215104, + "step": 2277 + }, + { + "epoch": 0.4008679142934635, + "grad_norm": 20.151758193969727, + "learning_rate": 4.1409431625837545e-07, + "loss": 0.5806, + "num_input_tokens_seen": 4777312256, + "step": 2278 + }, + { + "epoch": 0.4011391375101709, + "grad_norm": 17.131452560424805, + "learning_rate": 4.137095332826784e-07, + "loss": 0.3868, + "num_input_tokens_seen": 4779409408, + "step": 2279 + }, + { + "epoch": 0.40141036072687825, + "grad_norm": 15.69462776184082, + "learning_rate": 4.1332485998836277e-07, + "loss": 0.4183, + "num_input_tokens_seen": 4781506560, + "step": 2280 + }, + { + "epoch": 0.40168158394358555, + "grad_norm": 13.978616714477539, + "learning_rate": 4.129402966849987e-07, + "loss": 0.265, + "num_input_tokens_seen": 4783603712, + "step": 2281 + }, + { + "epoch": 0.4019528071602929, + "grad_norm": 17.568675994873047, + "learning_rate": 4.1255584368206877e-07, + "loss": 0.5206, + "num_input_tokens_seen": 4785700864, + "step": 2282 + }, + { + "epoch": 0.4022240303770003, + "grad_norm": 26.41534423828125, + "learning_rate": 4.121715012889655e-07, + "loss": 0.3761, + "num_input_tokens_seen": 4787798016, + "step": 2283 + }, + { + "epoch": 0.40249525359370764, + "grad_norm": 14.635917663574219, + "learning_rate": 4.1178726981499313e-07, + "loss": 0.3851, + "num_input_tokens_seen": 4789895168, + "step": 2284 + }, + { + "epoch": 0.402766476810415, + "grad_norm": 18.9488582611084, + "learning_rate": 4.1140314956936663e-07, + "loss": 0.6027, + "num_input_tokens_seen": 4791992320, + "step": 2285 + }, + { + "epoch": 0.4030377000271223, + "grad_norm": 17.327831268310547, + "learning_rate": 4.1101914086121137e-07, + "loss": 0.269, + "num_input_tokens_seen": 4794089472, + "step": 2286 + }, + { + "epoch": 0.40330892324382966, + "grad_norm": 15.44780445098877, + "learning_rate": 4.106352439995632e-07, + "loss": 0.4427, + "num_input_tokens_seen": 4796186624, + "step": 2287 + }, + { + "epoch": 0.403580146460537, + "grad_norm": 21.893918991088867, + "learning_rate": 4.102514592933671e-07, + "loss": 0.3452, + "num_input_tokens_seen": 4798283776, + "step": 2288 + }, + { + "epoch": 0.4038513696772444, + "grad_norm": 15.465867042541504, + "learning_rate": 4.098677870514788e-07, + "loss": 0.4563, + "num_input_tokens_seen": 4800380928, + "step": 2289 + }, + { + "epoch": 0.40412259289395175, + "grad_norm": 18.96814727783203, + "learning_rate": 4.094842275826631e-07, + "loss": 0.4338, + "num_input_tokens_seen": 4802478080, + "step": 2290 + }, + { + "epoch": 0.40439381611065905, + "grad_norm": 18.4038143157959, + "learning_rate": 4.091007811955941e-07, + "loss": 0.4135, + "num_input_tokens_seen": 4804575232, + "step": 2291 + }, + { + "epoch": 0.4046650393273664, + "grad_norm": 16.829374313354492, + "learning_rate": 4.0871744819885467e-07, + "loss": 0.5425, + "num_input_tokens_seen": 4806672384, + "step": 2292 + }, + { + "epoch": 0.4049362625440738, + "grad_norm": 16.508983612060547, + "learning_rate": 4.0833422890093684e-07, + "loss": 0.5627, + "num_input_tokens_seen": 4808769536, + "step": 2293 + }, + { + "epoch": 0.40520748576078114, + "grad_norm": 19.47481918334961, + "learning_rate": 4.0795112361024075e-07, + "loss": 0.5495, + "num_input_tokens_seen": 4810866688, + "step": 2294 + }, + { + "epoch": 0.4054787089774885, + "grad_norm": 12.910174369812012, + "learning_rate": 4.0756813263507496e-07, + "loss": 0.3162, + "num_input_tokens_seen": 4812963840, + "step": 2295 + }, + { + "epoch": 0.4057499321941958, + "grad_norm": 15.603672981262207, + "learning_rate": 4.0718525628365617e-07, + "loss": 0.3257, + "num_input_tokens_seen": 4815060992, + "step": 2296 + }, + { + "epoch": 0.40602115541090317, + "grad_norm": 17.36051368713379, + "learning_rate": 4.0680249486410845e-07, + "loss": 0.4128, + "num_input_tokens_seen": 4817158144, + "step": 2297 + }, + { + "epoch": 0.40629237862761053, + "grad_norm": 20.466211318969727, + "learning_rate": 4.0641984868446386e-07, + "loss": 0.5455, + "num_input_tokens_seen": 4819255296, + "step": 2298 + }, + { + "epoch": 0.4065636018443179, + "grad_norm": 15.06020736694336, + "learning_rate": 4.0603731805266175e-07, + "loss": 0.3718, + "num_input_tokens_seen": 4821352448, + "step": 2299 + }, + { + "epoch": 0.40683482506102525, + "grad_norm": 16.064971923828125, + "learning_rate": 4.056549032765476e-07, + "loss": 0.4691, + "num_input_tokens_seen": 4823449600, + "step": 2300 + }, + { + "epoch": 0.40710604827773256, + "grad_norm": 24.106367111206055, + "learning_rate": 4.0527260466387446e-07, + "loss": 0.7604, + "num_input_tokens_seen": 4825546752, + "step": 2301 + }, + { + "epoch": 0.4073772714944399, + "grad_norm": 17.96955680847168, + "learning_rate": 4.0489042252230197e-07, + "loss": 0.4701, + "num_input_tokens_seen": 4827643904, + "step": 2302 + }, + { + "epoch": 0.4076484947111473, + "grad_norm": 13.112820625305176, + "learning_rate": 4.045083571593955e-07, + "loss": 0.36, + "num_input_tokens_seen": 4829741056, + "step": 2303 + }, + { + "epoch": 0.40791971792785464, + "grad_norm": 19.882753372192383, + "learning_rate": 4.0412640888262697e-07, + "loss": 0.7148, + "num_input_tokens_seen": 4831838208, + "step": 2304 + }, + { + "epoch": 0.40819094114456195, + "grad_norm": 15.251856803894043, + "learning_rate": 4.0374457799937354e-07, + "loss": 0.3871, + "num_input_tokens_seen": 4833935360, + "step": 2305 + }, + { + "epoch": 0.4084621643612693, + "grad_norm": 18.7488956451416, + "learning_rate": 4.033628648169184e-07, + "loss": 0.3569, + "num_input_tokens_seen": 4836032512, + "step": 2306 + }, + { + "epoch": 0.40873338757797667, + "grad_norm": 20.533279418945312, + "learning_rate": 4.0298126964244947e-07, + "loss": 0.561, + "num_input_tokens_seen": 4838129664, + "step": 2307 + }, + { + "epoch": 0.40900461079468403, + "grad_norm": 13.255075454711914, + "learning_rate": 4.025997927830604e-07, + "loss": 0.3199, + "num_input_tokens_seen": 4840226816, + "step": 2308 + }, + { + "epoch": 0.4092758340113914, + "grad_norm": 18.75166130065918, + "learning_rate": 4.022184345457492e-07, + "loss": 0.3569, + "num_input_tokens_seen": 4842323968, + "step": 2309 + }, + { + "epoch": 0.4095470572280987, + "grad_norm": 18.911184310913086, + "learning_rate": 4.0183719523741846e-07, + "loss": 0.5254, + "num_input_tokens_seen": 4844421120, + "step": 2310 + }, + { + "epoch": 0.40981828044480606, + "grad_norm": 18.690418243408203, + "learning_rate": 4.014560751648747e-07, + "loss": 0.4653, + "num_input_tokens_seen": 4846518272, + "step": 2311 + }, + { + "epoch": 0.4100895036615134, + "grad_norm": 19.980985641479492, + "learning_rate": 4.0107507463482924e-07, + "loss": 0.4713, + "num_input_tokens_seen": 4848615424, + "step": 2312 + }, + { + "epoch": 0.4103607268782208, + "grad_norm": 18.602523803710938, + "learning_rate": 4.0069419395389657e-07, + "loss": 0.5677, + "num_input_tokens_seen": 4850712576, + "step": 2313 + }, + { + "epoch": 0.41063195009492814, + "grad_norm": 14.569324493408203, + "learning_rate": 4.0031343342859526e-07, + "loss": 0.4518, + "num_input_tokens_seen": 4852809728, + "step": 2314 + }, + { + "epoch": 0.41090317331163545, + "grad_norm": 17.369930267333984, + "learning_rate": 3.9993279336534667e-07, + "loss": 0.5966, + "num_input_tokens_seen": 4854906880, + "step": 2315 + }, + { + "epoch": 0.4111743965283428, + "grad_norm": 16.838441848754883, + "learning_rate": 3.995522740704758e-07, + "loss": 0.568, + "num_input_tokens_seen": 4857004032, + "step": 2316 + }, + { + "epoch": 0.4114456197450502, + "grad_norm": 16.305984497070312, + "learning_rate": 3.991718758502094e-07, + "loss": 0.3684, + "num_input_tokens_seen": 4859101184, + "step": 2317 + }, + { + "epoch": 0.41171684296175753, + "grad_norm": 27.06982421875, + "learning_rate": 3.987915990106779e-07, + "loss": 0.5516, + "num_input_tokens_seen": 4861198336, + "step": 2318 + }, + { + "epoch": 0.4119880661784649, + "grad_norm": 15.275781631469727, + "learning_rate": 3.984114438579137e-07, + "loss": 0.417, + "num_input_tokens_seen": 4863295488, + "step": 2319 + }, + { + "epoch": 0.4122592893951722, + "grad_norm": 14.07772159576416, + "learning_rate": 3.980314106978512e-07, + "loss": 0.3962, + "num_input_tokens_seen": 4865392640, + "step": 2320 + }, + { + "epoch": 0.41253051261187956, + "grad_norm": 14.107834815979004, + "learning_rate": 3.976514998363265e-07, + "loss": 0.3945, + "num_input_tokens_seen": 4867489792, + "step": 2321 + }, + { + "epoch": 0.4128017358285869, + "grad_norm": 15.831769943237305, + "learning_rate": 3.972717115790773e-07, + "loss": 0.3535, + "num_input_tokens_seen": 4869586944, + "step": 2322 + }, + { + "epoch": 0.4130729590452943, + "grad_norm": 16.84862518310547, + "learning_rate": 3.968920462317431e-07, + "loss": 0.4851, + "num_input_tokens_seen": 4871684096, + "step": 2323 + }, + { + "epoch": 0.41334418226200165, + "grad_norm": 12.755271911621094, + "learning_rate": 3.965125040998637e-07, + "loss": 0.3473, + "num_input_tokens_seen": 4873781248, + "step": 2324 + }, + { + "epoch": 0.41361540547870895, + "grad_norm": 15.923316955566406, + "learning_rate": 3.961330854888805e-07, + "loss": 0.4099, + "num_input_tokens_seen": 4875878400, + "step": 2325 + }, + { + "epoch": 0.4138866286954163, + "grad_norm": 11.70615291595459, + "learning_rate": 3.9575379070413485e-07, + "loss": 0.2539, + "num_input_tokens_seen": 4877975552, + "step": 2326 + }, + { + "epoch": 0.4141578519121237, + "grad_norm": 27.65726661682129, + "learning_rate": 3.953746200508693e-07, + "loss": 0.7379, + "num_input_tokens_seen": 4880072704, + "step": 2327 + }, + { + "epoch": 0.41442907512883104, + "grad_norm": 20.938331604003906, + "learning_rate": 3.9499557383422534e-07, + "loss": 0.7167, + "num_input_tokens_seen": 4882169856, + "step": 2328 + }, + { + "epoch": 0.4147002983455384, + "grad_norm": 13.526237487792969, + "learning_rate": 3.94616652359245e-07, + "loss": 0.2987, + "num_input_tokens_seen": 4884267008, + "step": 2329 + }, + { + "epoch": 0.4149715215622457, + "grad_norm": 10.629281997680664, + "learning_rate": 3.942378559308703e-07, + "loss": 0.2497, + "num_input_tokens_seen": 4886364160, + "step": 2330 + }, + { + "epoch": 0.41524274477895307, + "grad_norm": 15.359818458557129, + "learning_rate": 3.93859184853942e-07, + "loss": 0.3717, + "num_input_tokens_seen": 4888461312, + "step": 2331 + }, + { + "epoch": 0.4155139679956604, + "grad_norm": 13.921889305114746, + "learning_rate": 3.934806394332001e-07, + "loss": 0.3598, + "num_input_tokens_seen": 4890558464, + "step": 2332 + }, + { + "epoch": 0.4157851912123678, + "grad_norm": 21.2630558013916, + "learning_rate": 3.9310221997328363e-07, + "loss": 0.5743, + "num_input_tokens_seen": 4892655616, + "step": 2333 + }, + { + "epoch": 0.41605641442907515, + "grad_norm": 20.155832290649414, + "learning_rate": 3.9272392677873e-07, + "loss": 0.6111, + "num_input_tokens_seen": 4894752768, + "step": 2334 + }, + { + "epoch": 0.41632763764578246, + "grad_norm": 24.92179298400879, + "learning_rate": 3.9234576015397536e-07, + "loss": 0.8151, + "num_input_tokens_seen": 4896849920, + "step": 2335 + }, + { + "epoch": 0.4165988608624898, + "grad_norm": 14.339614868164062, + "learning_rate": 3.9196772040335367e-07, + "loss": 0.325, + "num_input_tokens_seen": 4898947072, + "step": 2336 + }, + { + "epoch": 0.4168700840791972, + "grad_norm": 24.353408813476562, + "learning_rate": 3.915898078310972e-07, + "loss": 0.6659, + "num_input_tokens_seen": 4901044224, + "step": 2337 + }, + { + "epoch": 0.41714130729590454, + "grad_norm": 20.132877349853516, + "learning_rate": 3.9121202274133525e-07, + "loss": 0.5632, + "num_input_tokens_seen": 4903141376, + "step": 2338 + }, + { + "epoch": 0.4174125305126119, + "grad_norm": 17.58623695373535, + "learning_rate": 3.9083436543809536e-07, + "loss": 0.6138, + "num_input_tokens_seen": 4905238528, + "step": 2339 + }, + { + "epoch": 0.4176837537293192, + "grad_norm": 14.02330493927002, + "learning_rate": 3.904568362253011e-07, + "loss": 0.3483, + "num_input_tokens_seen": 4907335680, + "step": 2340 + }, + { + "epoch": 0.41795497694602657, + "grad_norm": 15.86801528930664, + "learning_rate": 3.9007943540677426e-07, + "loss": 0.4415, + "num_input_tokens_seen": 4909432832, + "step": 2341 + }, + { + "epoch": 0.41822620016273393, + "grad_norm": 14.43091869354248, + "learning_rate": 3.897021632862321e-07, + "loss": 0.3732, + "num_input_tokens_seen": 4911529984, + "step": 2342 + }, + { + "epoch": 0.4184974233794413, + "grad_norm": 15.3654203414917, + "learning_rate": 3.893250201672893e-07, + "loss": 0.4415, + "num_input_tokens_seen": 4913627136, + "step": 2343 + }, + { + "epoch": 0.41876864659614865, + "grad_norm": 15.816032409667969, + "learning_rate": 3.889480063534563e-07, + "loss": 0.4123, + "num_input_tokens_seen": 4915724288, + "step": 2344 + }, + { + "epoch": 0.41903986981285596, + "grad_norm": 12.658620834350586, + "learning_rate": 3.88571122148139e-07, + "loss": 0.2871, + "num_input_tokens_seen": 4917821440, + "step": 2345 + }, + { + "epoch": 0.4193110930295633, + "grad_norm": 14.271703720092773, + "learning_rate": 3.881943678546399e-07, + "loss": 0.4812, + "num_input_tokens_seen": 4919918592, + "step": 2346 + }, + { + "epoch": 0.4195823162462707, + "grad_norm": 25.85860824584961, + "learning_rate": 3.878177437761564e-07, + "loss": 0.9452, + "num_input_tokens_seen": 4922015744, + "step": 2347 + }, + { + "epoch": 0.41985353946297804, + "grad_norm": 17.978992462158203, + "learning_rate": 3.8744125021578123e-07, + "loss": 0.47, + "num_input_tokens_seen": 4924112896, + "step": 2348 + }, + { + "epoch": 0.4201247626796854, + "grad_norm": 21.466154098510742, + "learning_rate": 3.87064887476502e-07, + "loss": 0.6245, + "num_input_tokens_seen": 4926210048, + "step": 2349 + }, + { + "epoch": 0.4203959858963927, + "grad_norm": 17.588224411010742, + "learning_rate": 3.8668865586120124e-07, + "loss": 0.3928, + "num_input_tokens_seen": 4928307200, + "step": 2350 + }, + { + "epoch": 0.42066720911310007, + "grad_norm": 19.852426528930664, + "learning_rate": 3.8631255567265573e-07, + "loss": 0.5021, + "num_input_tokens_seen": 4930404352, + "step": 2351 + }, + { + "epoch": 0.42093843232980743, + "grad_norm": 17.130460739135742, + "learning_rate": 3.859365872135367e-07, + "loss": 0.3786, + "num_input_tokens_seen": 4932501504, + "step": 2352 + }, + { + "epoch": 0.4212096555465148, + "grad_norm": 15.021489143371582, + "learning_rate": 3.8556075078640925e-07, + "loss": 0.4072, + "num_input_tokens_seen": 4934598656, + "step": 2353 + }, + { + "epoch": 0.42148087876322216, + "grad_norm": 18.32381820678711, + "learning_rate": 3.85185046693732e-07, + "loss": 0.5125, + "num_input_tokens_seen": 4936695808, + "step": 2354 + }, + { + "epoch": 0.42175210197992946, + "grad_norm": 14.185894966125488, + "learning_rate": 3.8480947523785767e-07, + "loss": 0.4148, + "num_input_tokens_seen": 4938792960, + "step": 2355 + }, + { + "epoch": 0.4220233251966368, + "grad_norm": 17.715118408203125, + "learning_rate": 3.844340367210318e-07, + "loss": 0.497, + "num_input_tokens_seen": 4940890112, + "step": 2356 + }, + { + "epoch": 0.4222945484133442, + "grad_norm": 13.911568641662598, + "learning_rate": 3.840587314453928e-07, + "loss": 0.3361, + "num_input_tokens_seen": 4942987264, + "step": 2357 + }, + { + "epoch": 0.42256577163005155, + "grad_norm": 16.71425437927246, + "learning_rate": 3.8368355971297204e-07, + "loss": 0.3143, + "num_input_tokens_seen": 4945084416, + "step": 2358 + }, + { + "epoch": 0.4228369948467589, + "grad_norm": 12.690278053283691, + "learning_rate": 3.8330852182569374e-07, + "loss": 0.2808, + "num_input_tokens_seen": 4947181568, + "step": 2359 + }, + { + "epoch": 0.4231082180634662, + "grad_norm": 17.643646240234375, + "learning_rate": 3.8293361808537404e-07, + "loss": 0.532, + "num_input_tokens_seen": 4949278720, + "step": 2360 + }, + { + "epoch": 0.4233794412801736, + "grad_norm": 19.790626525878906, + "learning_rate": 3.825588487937211e-07, + "loss": 0.5061, + "num_input_tokens_seen": 4951375872, + "step": 2361 + }, + { + "epoch": 0.42365066449688094, + "grad_norm": 17.743762969970703, + "learning_rate": 3.821842142523352e-07, + "loss": 0.2747, + "num_input_tokens_seen": 4953473024, + "step": 2362 + }, + { + "epoch": 0.4239218877135883, + "grad_norm": 18.460596084594727, + "learning_rate": 3.818097147627076e-07, + "loss": 0.4853, + "num_input_tokens_seen": 4955570176, + "step": 2363 + }, + { + "epoch": 0.42419311093029566, + "grad_norm": 20.445358276367188, + "learning_rate": 3.8143535062622154e-07, + "loss": 0.563, + "num_input_tokens_seen": 4957667328, + "step": 2364 + }, + { + "epoch": 0.42446433414700296, + "grad_norm": 21.970741271972656, + "learning_rate": 3.8106112214415087e-07, + "loss": 0.6022, + "num_input_tokens_seen": 4959764480, + "step": 2365 + }, + { + "epoch": 0.4247355573637103, + "grad_norm": 15.904183387756348, + "learning_rate": 3.8068702961766053e-07, + "loss": 0.4444, + "num_input_tokens_seen": 4961861632, + "step": 2366 + }, + { + "epoch": 0.4250067805804177, + "grad_norm": 9.053658485412598, + "learning_rate": 3.8031307334780594e-07, + "loss": 0.1744, + "num_input_tokens_seen": 4963958784, + "step": 2367 + }, + { + "epoch": 0.42527800379712505, + "grad_norm": 14.599189758300781, + "learning_rate": 3.7993925363553294e-07, + "loss": 0.3963, + "num_input_tokens_seen": 4966055936, + "step": 2368 + }, + { + "epoch": 0.4255492270138324, + "grad_norm": 18.069921493530273, + "learning_rate": 3.795655707816772e-07, + "loss": 0.5241, + "num_input_tokens_seen": 4968153088, + "step": 2369 + }, + { + "epoch": 0.4258204502305397, + "grad_norm": 19.06882095336914, + "learning_rate": 3.791920250869645e-07, + "loss": 0.515, + "num_input_tokens_seen": 4970250240, + "step": 2370 + }, + { + "epoch": 0.4260916734472471, + "grad_norm": 26.823545455932617, + "learning_rate": 3.7881861685201033e-07, + "loss": 0.94, + "num_input_tokens_seen": 4972347392, + "step": 2371 + }, + { + "epoch": 0.42636289666395444, + "grad_norm": 12.676380157470703, + "learning_rate": 3.784453463773194e-07, + "loss": 0.2774, + "num_input_tokens_seen": 4974444544, + "step": 2372 + }, + { + "epoch": 0.4266341198806618, + "grad_norm": 19.58039665222168, + "learning_rate": 3.780722139632858e-07, + "loss": 0.5369, + "num_input_tokens_seen": 4976541696, + "step": 2373 + }, + { + "epoch": 0.42690534309736916, + "grad_norm": 23.31465721130371, + "learning_rate": 3.776992199101918e-07, + "loss": 0.7374, + "num_input_tokens_seen": 4978638848, + "step": 2374 + }, + { + "epoch": 0.42717656631407647, + "grad_norm": 18.41512107849121, + "learning_rate": 3.773263645182091e-07, + "loss": 0.5331, + "num_input_tokens_seen": 4980736000, + "step": 2375 + }, + { + "epoch": 0.42744778953078383, + "grad_norm": 15.421567916870117, + "learning_rate": 3.769536480873976e-07, + "loss": 0.3192, + "num_input_tokens_seen": 4982833152, + "step": 2376 + }, + { + "epoch": 0.4277190127474912, + "grad_norm": 19.701927185058594, + "learning_rate": 3.765810709177052e-07, + "loss": 0.6033, + "num_input_tokens_seen": 4984930304, + "step": 2377 + }, + { + "epoch": 0.42799023596419855, + "grad_norm": 20.730276107788086, + "learning_rate": 3.762086333089678e-07, + "loss": 0.6628, + "num_input_tokens_seen": 4987027456, + "step": 2378 + }, + { + "epoch": 0.4282614591809059, + "grad_norm": 17.648303985595703, + "learning_rate": 3.758363355609092e-07, + "loss": 0.4378, + "num_input_tokens_seen": 4989124608, + "step": 2379 + }, + { + "epoch": 0.4285326823976132, + "grad_norm": 14.124812126159668, + "learning_rate": 3.7546417797314023e-07, + "loss": 0.3577, + "num_input_tokens_seen": 4991221760, + "step": 2380 + }, + { + "epoch": 0.4288039056143206, + "grad_norm": 16.618432998657227, + "learning_rate": 3.7509216084515916e-07, + "loss": 0.4176, + "num_input_tokens_seen": 4993318912, + "step": 2381 + }, + { + "epoch": 0.42907512883102794, + "grad_norm": 18.444599151611328, + "learning_rate": 3.747202844763514e-07, + "loss": 0.553, + "num_input_tokens_seen": 4995416064, + "step": 2382 + }, + { + "epoch": 0.4293463520477353, + "grad_norm": 21.762187957763672, + "learning_rate": 3.743485491659887e-07, + "loss": 0.6629, + "num_input_tokens_seen": 4997513216, + "step": 2383 + }, + { + "epoch": 0.42961757526444266, + "grad_norm": 16.13730239868164, + "learning_rate": 3.739769552132298e-07, + "loss": 0.4474, + "num_input_tokens_seen": 4999610368, + "step": 2384 + }, + { + "epoch": 0.42988879848114997, + "grad_norm": 12.197819709777832, + "learning_rate": 3.73605502917119e-07, + "loss": 0.2769, + "num_input_tokens_seen": 5001707520, + "step": 2385 + }, + { + "epoch": 0.43016002169785733, + "grad_norm": 16.63471031188965, + "learning_rate": 3.7323419257658716e-07, + "loss": 0.5748, + "num_input_tokens_seen": 5003804672, + "step": 2386 + }, + { + "epoch": 0.4304312449145647, + "grad_norm": 13.66933822631836, + "learning_rate": 3.728630244904507e-07, + "loss": 0.3293, + "num_input_tokens_seen": 5005901824, + "step": 2387 + }, + { + "epoch": 0.43070246813127205, + "grad_norm": 15.115860939025879, + "learning_rate": 3.724919989574116e-07, + "loss": 0.4252, + "num_input_tokens_seen": 5007998976, + "step": 2388 + }, + { + "epoch": 0.43097369134797936, + "grad_norm": 21.96157455444336, + "learning_rate": 3.7212111627605704e-07, + "loss": 0.7401, + "num_input_tokens_seen": 5010096128, + "step": 2389 + }, + { + "epoch": 0.4312449145646867, + "grad_norm": 17.18273162841797, + "learning_rate": 3.717503767448593e-07, + "loss": 0.4966, + "num_input_tokens_seen": 5012193280, + "step": 2390 + }, + { + "epoch": 0.4315161377813941, + "grad_norm": 23.42146110534668, + "learning_rate": 3.7137978066217555e-07, + "loss": 0.4974, + "num_input_tokens_seen": 5014290432, + "step": 2391 + }, + { + "epoch": 0.43178736099810144, + "grad_norm": 21.250967025756836, + "learning_rate": 3.710093283262472e-07, + "loss": 0.7683, + "num_input_tokens_seen": 5016387584, + "step": 2392 + }, + { + "epoch": 0.4320585842148088, + "grad_norm": 14.832980155944824, + "learning_rate": 3.706390200352003e-07, + "loss": 0.347, + "num_input_tokens_seen": 5018484736, + "step": 2393 + }, + { + "epoch": 0.4323298074315161, + "grad_norm": 20.22023582458496, + "learning_rate": 3.7026885608704494e-07, + "loss": 0.4751, + "num_input_tokens_seen": 5020581888, + "step": 2394 + }, + { + "epoch": 0.4326010306482235, + "grad_norm": 15.793986320495605, + "learning_rate": 3.6989883677967483e-07, + "loss": 0.3332, + "num_input_tokens_seen": 5022679040, + "step": 2395 + }, + { + "epoch": 0.43287225386493083, + "grad_norm": 15.396735191345215, + "learning_rate": 3.6952896241086783e-07, + "loss": 0.4048, + "num_input_tokens_seen": 5024776192, + "step": 2396 + }, + { + "epoch": 0.4331434770816382, + "grad_norm": 16.177085876464844, + "learning_rate": 3.6915923327828423e-07, + "loss": 0.3603, + "num_input_tokens_seen": 5026873344, + "step": 2397 + }, + { + "epoch": 0.43341470029834556, + "grad_norm": 12.686070442199707, + "learning_rate": 3.6878964967946813e-07, + "loss": 0.3433, + "num_input_tokens_seen": 5028970496, + "step": 2398 + }, + { + "epoch": 0.43368592351505286, + "grad_norm": 16.852243423461914, + "learning_rate": 3.6842021191184636e-07, + "loss": 0.4353, + "num_input_tokens_seen": 5031067648, + "step": 2399 + }, + { + "epoch": 0.4339571467317602, + "grad_norm": 25.399932861328125, + "learning_rate": 3.6805092027272853e-07, + "loss": 0.8005, + "num_input_tokens_seen": 5033164800, + "step": 2400 + }, + { + "epoch": 0.00027122321670735016, + "grad_norm": 17.1519832611084, + "learning_rate": 3.6768177505930655e-07, + "loss": 0.4364, + "num_input_tokens_seen": 5035261952, + "step": 2401 + }, + { + "epoch": 0.0005424464334147003, + "grad_norm": 19.901262283325195, + "learning_rate": 3.67312776568654e-07, + "loss": 0.5551, + "num_input_tokens_seen": 5037359104, + "step": 2402 + }, + { + "epoch": 0.0008136696501220504, + "grad_norm": 20.185373306274414, + "learning_rate": 3.669439250977272e-07, + "loss": 0.624, + "num_input_tokens_seen": 5039456256, + "step": 2403 + }, + { + "epoch": 0.0010848928668294006, + "grad_norm": 17.927940368652344, + "learning_rate": 3.665752209433637e-07, + "loss": 0.5393, + "num_input_tokens_seen": 5041553408, + "step": 2404 + }, + { + "epoch": 0.0013561160835367507, + "grad_norm": 13.626057624816895, + "learning_rate": 3.6620666440228254e-07, + "loss": 0.3492, + "num_input_tokens_seen": 5043650560, + "step": 2405 + }, + { + "epoch": 0.0016273393002441008, + "grad_norm": 15.946044921875, + "learning_rate": 3.6583825577108397e-07, + "loss": 0.2869, + "num_input_tokens_seen": 5045747712, + "step": 2406 + }, + { + "epoch": 0.001898562516951451, + "grad_norm": 24.774717330932617, + "learning_rate": 3.654699953462494e-07, + "loss": 0.544, + "num_input_tokens_seen": 5047844864, + "step": 2407 + }, + { + "epoch": 0.0021697857336588013, + "grad_norm": 14.22808837890625, + "learning_rate": 3.651018834241406e-07, + "loss": 0.34, + "num_input_tokens_seen": 5049942016, + "step": 2408 + }, + { + "epoch": 0.0024410089503661514, + "grad_norm": 13.739176750183105, + "learning_rate": 3.6473392030100014e-07, + "loss": 0.3746, + "num_input_tokens_seen": 5052039168, + "step": 2409 + }, + { + "epoch": 0.0027122321670735015, + "grad_norm": 16.19430160522461, + "learning_rate": 3.6436610627295074e-07, + "loss": 0.4212, + "num_input_tokens_seen": 5054136320, + "step": 2410 + }, + { + "epoch": 0.0029834553837808516, + "grad_norm": 18.812156677246094, + "learning_rate": 3.639984416359949e-07, + "loss": 0.6517, + "num_input_tokens_seen": 5056233472, + "step": 2411 + }, + { + "epoch": 0.0032546786004882017, + "grad_norm": 10.88464069366455, + "learning_rate": 3.636309266860156e-07, + "loss": 0.301, + "num_input_tokens_seen": 5058330624, + "step": 2412 + }, + { + "epoch": 0.003525901817195552, + "grad_norm": 17.772342681884766, + "learning_rate": 3.6326356171877483e-07, + "loss": 0.5711, + "num_input_tokens_seen": 5060427776, + "step": 2413 + }, + { + "epoch": 0.003797125033902902, + "grad_norm": 18.594541549682617, + "learning_rate": 3.6289634702991343e-07, + "loss": 0.3508, + "num_input_tokens_seen": 5062524928, + "step": 2414 + }, + { + "epoch": 0.0040683482506102524, + "grad_norm": 12.95483112335205, + "learning_rate": 3.625292829149521e-07, + "loss": 0.2594, + "num_input_tokens_seen": 5064622080, + "step": 2415 + }, + { + "epoch": 0.0043395714673176026, + "grad_norm": 23.838085174560547, + "learning_rate": 3.6216236966929015e-07, + "loss": 0.7946, + "num_input_tokens_seen": 5066719232, + "step": 2416 + }, + { + "epoch": 0.004610794684024953, + "grad_norm": 17.340089797973633, + "learning_rate": 3.6179560758820527e-07, + "loss": 0.359, + "num_input_tokens_seen": 5068816384, + "step": 2417 + }, + { + "epoch": 0.004882017900732303, + "grad_norm": 30.172731399536133, + "learning_rate": 3.61428996966854e-07, + "loss": 0.5987, + "num_input_tokens_seen": 5070913536, + "step": 2418 + }, + { + "epoch": 0.005153241117439653, + "grad_norm": 16.280479431152344, + "learning_rate": 3.610625381002701e-07, + "loss": 0.311, + "num_input_tokens_seen": 5073010688, + "step": 2419 + }, + { + "epoch": 0.005424464334147003, + "grad_norm": 17.15519142150879, + "learning_rate": 3.606962312833659e-07, + "loss": 0.495, + "num_input_tokens_seen": 5075107840, + "step": 2420 + }, + { + "epoch": 0.005695687550854353, + "grad_norm": 16.51264190673828, + "learning_rate": 3.603300768109311e-07, + "loss": 0.3901, + "num_input_tokens_seen": 5077204992, + "step": 2421 + }, + { + "epoch": 0.005966910767561703, + "grad_norm": 22.195186614990234, + "learning_rate": 3.5996407497763305e-07, + "loss": 0.6579, + "num_input_tokens_seen": 5079302144, + "step": 2422 + }, + { + "epoch": 0.006238133984269053, + "grad_norm": 14.603880882263184, + "learning_rate": 3.5959822607801617e-07, + "loss": 0.3808, + "num_input_tokens_seen": 5081399296, + "step": 2423 + }, + { + "epoch": 0.006509357200976403, + "grad_norm": 19.53980255126953, + "learning_rate": 3.592325304065018e-07, + "loss": 0.5294, + "num_input_tokens_seen": 5083496448, + "step": 2424 + }, + { + "epoch": 0.0067805804176837535, + "grad_norm": 17.005908966064453, + "learning_rate": 3.588669882573875e-07, + "loss": 0.4601, + "num_input_tokens_seen": 5085593600, + "step": 2425 + }, + { + "epoch": 0.007051803634391104, + "grad_norm": 26.571182250976562, + "learning_rate": 3.5850159992484787e-07, + "loss": 0.9515, + "num_input_tokens_seen": 5087690752, + "step": 2426 + }, + { + "epoch": 0.007323026851098454, + "grad_norm": 15.586816787719727, + "learning_rate": 3.581363657029336e-07, + "loss": 0.3284, + "num_input_tokens_seen": 5089787904, + "step": 2427 + }, + { + "epoch": 0.007594250067805804, + "grad_norm": 9.308566093444824, + "learning_rate": 3.5777128588557126e-07, + "loss": 0.1959, + "num_input_tokens_seen": 5091885056, + "step": 2428 + }, + { + "epoch": 0.007865473284513154, + "grad_norm": 18.024173736572266, + "learning_rate": 3.574063607665633e-07, + "loss": 0.6265, + "num_input_tokens_seen": 5093982208, + "step": 2429 + }, + { + "epoch": 0.008136696501220505, + "grad_norm": 17.78046417236328, + "learning_rate": 3.570415906395873e-07, + "loss": 0.4991, + "num_input_tokens_seen": 5096079360, + "step": 2430 + }, + { + "epoch": 0.008407919717927854, + "grad_norm": 16.535097122192383, + "learning_rate": 3.5667697579819655e-07, + "loss": 0.455, + "num_input_tokens_seen": 5098176512, + "step": 2431 + }, + { + "epoch": 0.008679142934635205, + "grad_norm": 18.70627784729004, + "learning_rate": 3.563125165358193e-07, + "loss": 0.5008, + "num_input_tokens_seen": 5100273664, + "step": 2432 + }, + { + "epoch": 0.008950366151342554, + "grad_norm": 17.156047821044922, + "learning_rate": 3.559482131457583e-07, + "loss": 0.4694, + "num_input_tokens_seen": 5102370816, + "step": 2433 + }, + { + "epoch": 0.009221589368049905, + "grad_norm": 15.62370777130127, + "learning_rate": 3.5558406592119115e-07, + "loss": 0.4304, + "num_input_tokens_seen": 5104467968, + "step": 2434 + }, + { + "epoch": 0.009492812584757255, + "grad_norm": 14.981012344360352, + "learning_rate": 3.552200751551697e-07, + "loss": 0.4884, + "num_input_tokens_seen": 5106565120, + "step": 2435 + }, + { + "epoch": 0.009764035801464606, + "grad_norm": 16.954042434692383, + "learning_rate": 3.548562411406201e-07, + "loss": 0.4192, + "num_input_tokens_seen": 5108662272, + "step": 2436 + }, + { + "epoch": 0.010035259018171955, + "grad_norm": 20.573305130004883, + "learning_rate": 3.544925641703413e-07, + "loss": 0.4403, + "num_input_tokens_seen": 5110759424, + "step": 2437 + }, + { + "epoch": 0.010306482234879306, + "grad_norm": 22.3472957611084, + "learning_rate": 3.5412904453700754e-07, + "loss": 0.6544, + "num_input_tokens_seen": 5112856576, + "step": 2438 + }, + { + "epoch": 0.010577705451586655, + "grad_norm": 33.12949752807617, + "learning_rate": 3.537656825331653e-07, + "loss": 0.6427, + "num_input_tokens_seen": 5114953728, + "step": 2439 + }, + { + "epoch": 0.010848928668294006, + "grad_norm": 18.552413940429688, + "learning_rate": 3.534024784512345e-07, + "loss": 0.501, + "num_input_tokens_seen": 5117050880, + "step": 2440 + }, + { + "epoch": 0.011120151885001357, + "grad_norm": 15.270557403564453, + "learning_rate": 3.5303943258350813e-07, + "loss": 0.3836, + "num_input_tokens_seen": 5119148032, + "step": 2441 + }, + { + "epoch": 0.011391375101708706, + "grad_norm": 32.084510803222656, + "learning_rate": 3.526765452221512e-07, + "loss": 0.3691, + "num_input_tokens_seen": 5121245184, + "step": 2442 + }, + { + "epoch": 0.011662598318416057, + "grad_norm": 15.427001953125, + "learning_rate": 3.523138166592021e-07, + "loss": 0.4435, + "num_input_tokens_seen": 5123342336, + "step": 2443 + }, + { + "epoch": 0.011933821535123406, + "grad_norm": 21.460124969482422, + "learning_rate": 3.5195124718657075e-07, + "loss": 0.725, + "num_input_tokens_seen": 5125439488, + "step": 2444 + }, + { + "epoch": 0.012205044751830757, + "grad_norm": 15.456875801086426, + "learning_rate": 3.5158883709603946e-07, + "loss": 0.2977, + "num_input_tokens_seen": 5127536640, + "step": 2445 + }, + { + "epoch": 0.012476267968538107, + "grad_norm": 16.195775985717773, + "learning_rate": 3.5122658667926177e-07, + "loss": 0.5319, + "num_input_tokens_seen": 5129633792, + "step": 2446 + }, + { + "epoch": 0.012747491185245458, + "grad_norm": 16.876602172851562, + "learning_rate": 3.5086449622776346e-07, + "loss": 0.5982, + "num_input_tokens_seen": 5131730944, + "step": 2447 + }, + { + "epoch": 0.013018714401952807, + "grad_norm": 16.981779098510742, + "learning_rate": 3.505025660329408e-07, + "loss": 0.4243, + "num_input_tokens_seen": 5133828096, + "step": 2448 + }, + { + "epoch": 0.013289937618660158, + "grad_norm": 13.226367950439453, + "learning_rate": 3.5014079638606164e-07, + "loss": 0.3373, + "num_input_tokens_seen": 5135925248, + "step": 2449 + }, + { + "epoch": 0.013561160835367507, + "grad_norm": 14.811175346374512, + "learning_rate": 3.497791875782643e-07, + "loss": 0.3153, + "num_input_tokens_seen": 5138022400, + "step": 2450 + }, + { + "epoch": 0.013832384052074858, + "grad_norm": 17.1142578125, + "learning_rate": 3.4941773990055777e-07, + "loss": 0.4019, + "num_input_tokens_seen": 5140119552, + "step": 2451 + }, + { + "epoch": 0.014103607268782207, + "grad_norm": 24.696069717407227, + "learning_rate": 3.490564536438215e-07, + "loss": 0.7408, + "num_input_tokens_seen": 5142216704, + "step": 2452 + }, + { + "epoch": 0.014374830485489558, + "grad_norm": 15.740670204162598, + "learning_rate": 3.4869532909880485e-07, + "loss": 0.4626, + "num_input_tokens_seen": 5144313856, + "step": 2453 + }, + { + "epoch": 0.014646053702196907, + "grad_norm": 17.356019973754883, + "learning_rate": 3.483343665561271e-07, + "loss": 0.3852, + "num_input_tokens_seen": 5146411008, + "step": 2454 + }, + { + "epoch": 0.014917276918904258, + "grad_norm": 19.49802589416504, + "learning_rate": 3.479735663062773e-07, + "loss": 0.5229, + "num_input_tokens_seen": 5148508160, + "step": 2455 + }, + { + "epoch": 0.015188500135611608, + "grad_norm": 11.227045059204102, + "learning_rate": 3.4761292863961354e-07, + "loss": 0.3333, + "num_input_tokens_seen": 5150605312, + "step": 2456 + }, + { + "epoch": 0.015459723352318959, + "grad_norm": 17.022945404052734, + "learning_rate": 3.4725245384636347e-07, + "loss": 0.4247, + "num_input_tokens_seen": 5152702464, + "step": 2457 + }, + { + "epoch": 0.015730946569026308, + "grad_norm": 12.745018005371094, + "learning_rate": 3.4689214221662364e-07, + "loss": 0.2605, + "num_input_tokens_seen": 5154799616, + "step": 2458 + }, + { + "epoch": 0.01600216978573366, + "grad_norm": 16.132686614990234, + "learning_rate": 3.465319940403587e-07, + "loss": 0.5055, + "num_input_tokens_seen": 5156896768, + "step": 2459 + }, + { + "epoch": 0.01627339300244101, + "grad_norm": 15.20864486694336, + "learning_rate": 3.4617200960740247e-07, + "loss": 0.3678, + "num_input_tokens_seen": 5158993920, + "step": 2460 + }, + { + "epoch": 0.01654461621914836, + "grad_norm": 12.260754585266113, + "learning_rate": 3.4581218920745663e-07, + "loss": 0.3402, + "num_input_tokens_seen": 5161091072, + "step": 2461 + }, + { + "epoch": 0.016815839435855708, + "grad_norm": 13.886555671691895, + "learning_rate": 3.454525331300908e-07, + "loss": 0.3149, + "num_input_tokens_seen": 5163188224, + "step": 2462 + }, + { + "epoch": 0.01708706265256306, + "grad_norm": 22.19728660583496, + "learning_rate": 3.450930416647429e-07, + "loss": 0.4094, + "num_input_tokens_seen": 5165285376, + "step": 2463 + }, + { + "epoch": 0.01735828586927041, + "grad_norm": 14.57243824005127, + "learning_rate": 3.4473371510071795e-07, + "loss": 0.4385, + "num_input_tokens_seen": 5167382528, + "step": 2464 + }, + { + "epoch": 0.01762950908597776, + "grad_norm": 20.04316520690918, + "learning_rate": 3.4437455372718795e-07, + "loss": 0.5753, + "num_input_tokens_seen": 5169479680, + "step": 2465 + }, + { + "epoch": 0.01790073230268511, + "grad_norm": 18.932300567626953, + "learning_rate": 3.440155578331925e-07, + "loss": 0.3834, + "num_input_tokens_seen": 5171576832, + "step": 2466 + }, + { + "epoch": 0.01817195551939246, + "grad_norm": 15.249947547912598, + "learning_rate": 3.4365672770763783e-07, + "loss": 0.3425, + "num_input_tokens_seen": 5173673984, + "step": 2467 + }, + { + "epoch": 0.01844317873609981, + "grad_norm": 16.69132423400879, + "learning_rate": 3.432980636392967e-07, + "loss": 0.3937, + "num_input_tokens_seen": 5175771136, + "step": 2468 + }, + { + "epoch": 0.01871440195280716, + "grad_norm": 14.121512413024902, + "learning_rate": 3.429395659168084e-07, + "loss": 0.3516, + "num_input_tokens_seen": 5177868288, + "step": 2469 + }, + { + "epoch": 0.01898562516951451, + "grad_norm": 19.293235778808594, + "learning_rate": 3.425812348286782e-07, + "loss": 0.5579, + "num_input_tokens_seen": 5179965440, + "step": 2470 + }, + { + "epoch": 0.01925684838622186, + "grad_norm": 26.16200065612793, + "learning_rate": 3.422230706632774e-07, + "loss": 0.8517, + "num_input_tokens_seen": 5182062592, + "step": 2471 + }, + { + "epoch": 0.01952807160292921, + "grad_norm": 16.774105072021484, + "learning_rate": 3.418650737088427e-07, + "loss": 0.4796, + "num_input_tokens_seen": 5184159744, + "step": 2472 + }, + { + "epoch": 0.019799294819636562, + "grad_norm": 16.8173885345459, + "learning_rate": 3.415072442534767e-07, + "loss": 0.3243, + "num_input_tokens_seen": 5186256896, + "step": 2473 + }, + { + "epoch": 0.02007051803634391, + "grad_norm": 14.54384994506836, + "learning_rate": 3.411495825851467e-07, + "loss": 0.3466, + "num_input_tokens_seen": 5188354048, + "step": 2474 + }, + { + "epoch": 0.02034174125305126, + "grad_norm": 16.708059310913086, + "learning_rate": 3.4079208899168545e-07, + "loss": 0.3709, + "num_input_tokens_seen": 5190451200, + "step": 2475 + }, + { + "epoch": 0.02061296446975861, + "grad_norm": 18.377099990844727, + "learning_rate": 3.404347637607899e-07, + "loss": 0.4063, + "num_input_tokens_seen": 5192548352, + "step": 2476 + }, + { + "epoch": 0.020884187686465962, + "grad_norm": 21.520936965942383, + "learning_rate": 3.400776071800219e-07, + "loss": 0.4975, + "num_input_tokens_seen": 5194645504, + "step": 2477 + }, + { + "epoch": 0.02115541090317331, + "grad_norm": 12.55679702758789, + "learning_rate": 3.3972061953680734e-07, + "loss": 0.2697, + "num_input_tokens_seen": 5196742656, + "step": 2478 + }, + { + "epoch": 0.02142663411988066, + "grad_norm": 16.432504653930664, + "learning_rate": 3.3936380111843666e-07, + "loss": 0.4419, + "num_input_tokens_seen": 5198839808, + "step": 2479 + }, + { + "epoch": 0.021697857336588012, + "grad_norm": 12.978721618652344, + "learning_rate": 3.390071522120635e-07, + "loss": 0.3704, + "num_input_tokens_seen": 5200936960, + "step": 2480 + }, + { + "epoch": 0.021969080553295363, + "grad_norm": 13.821551322937012, + "learning_rate": 3.3865067310470554e-07, + "loss": 0.3597, + "num_input_tokens_seen": 5203034112, + "step": 2481 + }, + { + "epoch": 0.022240303770002714, + "grad_norm": 13.253721237182617, + "learning_rate": 3.3829436408324316e-07, + "loss": 0.3515, + "num_input_tokens_seen": 5205131264, + "step": 2482 + }, + { + "epoch": 0.02251152698671006, + "grad_norm": 15.398584365844727, + "learning_rate": 3.3793822543442074e-07, + "loss": 0.4505, + "num_input_tokens_seen": 5207228416, + "step": 2483 + }, + { + "epoch": 0.022782750203417412, + "grad_norm": 17.346776962280273, + "learning_rate": 3.3758225744484483e-07, + "loss": 0.4072, + "num_input_tokens_seen": 5209325568, + "step": 2484 + }, + { + "epoch": 0.023053973420124763, + "grad_norm": 14.590849876403809, + "learning_rate": 3.372264604009851e-07, + "loss": 0.4183, + "num_input_tokens_seen": 5211422720, + "step": 2485 + }, + { + "epoch": 0.023325196636832114, + "grad_norm": 16.34534454345703, + "learning_rate": 3.3687083458917344e-07, + "loss": 0.4176, + "num_input_tokens_seen": 5213519872, + "step": 2486 + }, + { + "epoch": 0.023596419853539462, + "grad_norm": 15.394360542297363, + "learning_rate": 3.3651538029560377e-07, + "loss": 0.3908, + "num_input_tokens_seen": 5215617024, + "step": 2487 + }, + { + "epoch": 0.023867643070246813, + "grad_norm": 15.60722541809082, + "learning_rate": 3.361600978063325e-07, + "loss": 0.4385, + "num_input_tokens_seen": 5217714176, + "step": 2488 + }, + { + "epoch": 0.024138866286954164, + "grad_norm": 18.028072357177734, + "learning_rate": 3.358049874072771e-07, + "loss": 0.4593, + "num_input_tokens_seen": 5219811328, + "step": 2489 + }, + { + "epoch": 0.024410089503661515, + "grad_norm": 18.258270263671875, + "learning_rate": 3.3545004938421734e-07, + "loss": 0.4059, + "num_input_tokens_seen": 5221908480, + "step": 2490 + }, + { + "epoch": 0.024681312720368862, + "grad_norm": 21.254680633544922, + "learning_rate": 3.3509528402279357e-07, + "loss": 0.7823, + "num_input_tokens_seen": 5224005632, + "step": 2491 + }, + { + "epoch": 0.024952535937076213, + "grad_norm": 37.15273666381836, + "learning_rate": 3.347406916085074e-07, + "loss": 0.9546, + "num_input_tokens_seen": 5226102784, + "step": 2492 + }, + { + "epoch": 0.025223759153783564, + "grad_norm": 19.634872436523438, + "learning_rate": 3.3438627242672164e-07, + "loss": 0.7096, + "num_input_tokens_seen": 5228199936, + "step": 2493 + }, + { + "epoch": 0.025494982370490915, + "grad_norm": 18.12146759033203, + "learning_rate": 3.3403202676265875e-07, + "loss": 0.4221, + "num_input_tokens_seen": 5230297088, + "step": 2494 + }, + { + "epoch": 0.025766205587198263, + "grad_norm": 11.500085830688477, + "learning_rate": 3.336779549014026e-07, + "loss": 0.2631, + "num_input_tokens_seen": 5232394240, + "step": 2495 + }, + { + "epoch": 0.026037428803905614, + "grad_norm": 13.041985511779785, + "learning_rate": 3.333240571278968e-07, + "loss": 0.3061, + "num_input_tokens_seen": 5234491392, + "step": 2496 + }, + { + "epoch": 0.026308652020612965, + "grad_norm": 9.483246803283691, + "learning_rate": 3.3297033372694473e-07, + "loss": 0.2412, + "num_input_tokens_seen": 5236588544, + "step": 2497 + }, + { + "epoch": 0.026579875237320316, + "grad_norm": 20.214548110961914, + "learning_rate": 3.3261678498320954e-07, + "loss": 0.8186, + "num_input_tokens_seen": 5238685696, + "step": 2498 + }, + { + "epoch": 0.026851098454027666, + "grad_norm": 16.735645294189453, + "learning_rate": 3.3226341118121367e-07, + "loss": 0.5537, + "num_input_tokens_seen": 5240782848, + "step": 2499 + }, + { + "epoch": 0.027122321670735014, + "grad_norm": 11.246064186096191, + "learning_rate": 3.319102126053389e-07, + "loss": 0.3094, + "num_input_tokens_seen": 5242880000, + "step": 2500 + }, + { + "epoch": 0.027393544887442365, + "grad_norm": 16.4909610748291, + "learning_rate": 3.315571895398261e-07, + "loss": 0.2749, + "num_input_tokens_seen": 5244977152, + "step": 2501 + }, + { + "epoch": 0.027664768104149716, + "grad_norm": 14.262845993041992, + "learning_rate": 3.312043422687749e-07, + "loss": 0.3926, + "num_input_tokens_seen": 5247074304, + "step": 2502 + }, + { + "epoch": 0.027935991320857067, + "grad_norm": 11.372760772705078, + "learning_rate": 3.3085167107614297e-07, + "loss": 0.2012, + "num_input_tokens_seen": 5249171456, + "step": 2503 + }, + { + "epoch": 0.028207214537564414, + "grad_norm": 21.803176879882812, + "learning_rate": 3.3049917624574737e-07, + "loss": 0.7148, + "num_input_tokens_seen": 5251268608, + "step": 2504 + }, + { + "epoch": 0.028478437754271765, + "grad_norm": 14.963140487670898, + "learning_rate": 3.301468580612619e-07, + "loss": 0.3626, + "num_input_tokens_seen": 5253365760, + "step": 2505 + }, + { + "epoch": 0.028749660970979116, + "grad_norm": 16.734357833862305, + "learning_rate": 3.2979471680621903e-07, + "loss": 0.325, + "num_input_tokens_seen": 5255462912, + "step": 2506 + }, + { + "epoch": 0.029020884187686467, + "grad_norm": 18.527103424072266, + "learning_rate": 3.2944275276400857e-07, + "loss": 0.5537, + "num_input_tokens_seen": 5257560064, + "step": 2507 + }, + { + "epoch": 0.029292107404393815, + "grad_norm": 17.63797950744629, + "learning_rate": 3.290909662178779e-07, + "loss": 0.4739, + "num_input_tokens_seen": 5259657216, + "step": 2508 + }, + { + "epoch": 0.029563330621101166, + "grad_norm": 11.94525146484375, + "learning_rate": 3.2873935745093145e-07, + "loss": 0.1794, + "num_input_tokens_seen": 5261754368, + "step": 2509 + }, + { + "epoch": 0.029834553837808517, + "grad_norm": 15.735551834106445, + "learning_rate": 3.283879267461305e-07, + "loss": 0.3807, + "num_input_tokens_seen": 5263851520, + "step": 2510 + }, + { + "epoch": 0.030105777054515868, + "grad_norm": 14.347485542297363, + "learning_rate": 3.280366743862931e-07, + "loss": 0.2682, + "num_input_tokens_seen": 5265948672, + "step": 2511 + }, + { + "epoch": 0.030377000271223215, + "grad_norm": 16.711332321166992, + "learning_rate": 3.276856006540939e-07, + "loss": 0.4171, + "num_input_tokens_seen": 5268045824, + "step": 2512 + }, + { + "epoch": 0.030648223487930566, + "grad_norm": 22.094377517700195, + "learning_rate": 3.2733470583206357e-07, + "loss": 0.7826, + "num_input_tokens_seen": 5270142976, + "step": 2513 + }, + { + "epoch": 0.030919446704637917, + "grad_norm": 24.63713264465332, + "learning_rate": 3.2698399020258895e-07, + "loss": 0.6522, + "num_input_tokens_seen": 5272240128, + "step": 2514 + }, + { + "epoch": 0.031190669921345268, + "grad_norm": 14.236292839050293, + "learning_rate": 3.266334540479128e-07, + "loss": 0.3224, + "num_input_tokens_seen": 5274337280, + "step": 2515 + }, + { + "epoch": 0.031461893138052616, + "grad_norm": 17.167448043823242, + "learning_rate": 3.262830976501329e-07, + "loss": 0.4438, + "num_input_tokens_seen": 5276434432, + "step": 2516 + }, + { + "epoch": 0.03173311635475997, + "grad_norm": 14.156777381896973, + "learning_rate": 3.2593292129120295e-07, + "loss": 0.3641, + "num_input_tokens_seen": 5278531584, + "step": 2517 + }, + { + "epoch": 0.03200433957146732, + "grad_norm": 16.016916275024414, + "learning_rate": 3.2558292525293156e-07, + "loss": 0.4017, + "num_input_tokens_seen": 5280628736, + "step": 2518 + }, + { + "epoch": 0.032275562788174665, + "grad_norm": 17.823566436767578, + "learning_rate": 3.2523310981698213e-07, + "loss": 0.4632, + "num_input_tokens_seen": 5282725888, + "step": 2519 + }, + { + "epoch": 0.03254678600488202, + "grad_norm": 21.00741958618164, + "learning_rate": 3.24883475264873e-07, + "loss": 0.6673, + "num_input_tokens_seen": 5284823040, + "step": 2520 + }, + { + "epoch": 0.03281800922158937, + "grad_norm": 14.62010669708252, + "learning_rate": 3.2453402187797684e-07, + "loss": 0.4991, + "num_input_tokens_seen": 5286920192, + "step": 2521 + }, + { + "epoch": 0.03308923243829672, + "grad_norm": 17.261287689208984, + "learning_rate": 3.241847499375201e-07, + "loss": 0.4899, + "num_input_tokens_seen": 5289017344, + "step": 2522 + }, + { + "epoch": 0.03336045565500407, + "grad_norm": 14.625133514404297, + "learning_rate": 3.238356597245837e-07, + "loss": 0.2637, + "num_input_tokens_seen": 5291114496, + "step": 2523 + }, + { + "epoch": 0.033631678871711417, + "grad_norm": 17.17628288269043, + "learning_rate": 3.2348675152010217e-07, + "loss": 0.4681, + "num_input_tokens_seen": 5293211648, + "step": 2524 + }, + { + "epoch": 0.03390290208841877, + "grad_norm": 21.153200149536133, + "learning_rate": 3.2313802560486353e-07, + "loss": 0.5437, + "num_input_tokens_seen": 5295308800, + "step": 2525 + }, + { + "epoch": 0.03417412530512612, + "grad_norm": 21.99086570739746, + "learning_rate": 3.2278948225950916e-07, + "loss": 0.607, + "num_input_tokens_seen": 5297405952, + "step": 2526 + }, + { + "epoch": 0.034445348521833466, + "grad_norm": 16.050336837768555, + "learning_rate": 3.2244112176453343e-07, + "loss": 0.4772, + "num_input_tokens_seen": 5299503104, + "step": 2527 + }, + { + "epoch": 0.03471657173854082, + "grad_norm": 19.21502685546875, + "learning_rate": 3.2209294440028366e-07, + "loss": 0.6767, + "num_input_tokens_seen": 5301600256, + "step": 2528 + }, + { + "epoch": 0.03498779495524817, + "grad_norm": 14.736310958862305, + "learning_rate": 3.2174495044695973e-07, + "loss": 0.3405, + "num_input_tokens_seen": 5303697408, + "step": 2529 + }, + { + "epoch": 0.03525901817195552, + "grad_norm": 17.85927391052246, + "learning_rate": 3.2139714018461396e-07, + "loss": 0.4872, + "num_input_tokens_seen": 5305794560, + "step": 2530 + }, + { + "epoch": 0.03553024138866287, + "grad_norm": 16.15865707397461, + "learning_rate": 3.2104951389315073e-07, + "loss": 0.5331, + "num_input_tokens_seen": 5307891712, + "step": 2531 + }, + { + "epoch": 0.03580146460537022, + "grad_norm": 21.816003799438477, + "learning_rate": 3.207020718523266e-07, + "loss": 0.6059, + "num_input_tokens_seen": 5309988864, + "step": 2532 + }, + { + "epoch": 0.03607268782207757, + "grad_norm": 15.507525444030762, + "learning_rate": 3.2035481434174966e-07, + "loss": 0.3919, + "num_input_tokens_seen": 5312086016, + "step": 2533 + }, + { + "epoch": 0.03634391103878492, + "grad_norm": 15.8746919631958, + "learning_rate": 3.200077416408794e-07, + "loss": 0.4488, + "num_input_tokens_seen": 5314183168, + "step": 2534 + }, + { + "epoch": 0.03661513425549227, + "grad_norm": 20.78430938720703, + "learning_rate": 3.196608540290266e-07, + "loss": 0.627, + "num_input_tokens_seen": 5316280320, + "step": 2535 + }, + { + "epoch": 0.03688635747219962, + "grad_norm": 21.117902755737305, + "learning_rate": 3.193141517853536e-07, + "loss": 0.5527, + "num_input_tokens_seen": 5318377472, + "step": 2536 + }, + { + "epoch": 0.03715758068890697, + "grad_norm": 14.952088356018066, + "learning_rate": 3.1896763518887305e-07, + "loss": 0.4613, + "num_input_tokens_seen": 5320474624, + "step": 2537 + }, + { + "epoch": 0.03742880390561432, + "grad_norm": 25.334875106811523, + "learning_rate": 3.186213045184484e-07, + "loss": 0.8986, + "num_input_tokens_seen": 5322571776, + "step": 2538 + }, + { + "epoch": 0.03770002712232167, + "grad_norm": 12.866583824157715, + "learning_rate": 3.1827516005279306e-07, + "loss": 0.3267, + "num_input_tokens_seen": 5324668928, + "step": 2539 + }, + { + "epoch": 0.03797125033902902, + "grad_norm": 14.773965835571289, + "learning_rate": 3.1792920207047114e-07, + "loss": 0.3742, + "num_input_tokens_seen": 5326766080, + "step": 2540 + }, + { + "epoch": 0.03824247355573637, + "grad_norm": 17.13579559326172, + "learning_rate": 3.175834308498964e-07, + "loss": 0.4086, + "num_input_tokens_seen": 5328863232, + "step": 2541 + }, + { + "epoch": 0.03851369677244372, + "grad_norm": 16.100215911865234, + "learning_rate": 3.172378466693325e-07, + "loss": 0.3569, + "num_input_tokens_seen": 5330960384, + "step": 2542 + }, + { + "epoch": 0.038784919989151075, + "grad_norm": 22.463777542114258, + "learning_rate": 3.168924498068923e-07, + "loss": 0.9546, + "num_input_tokens_seen": 5333057536, + "step": 2543 + }, + { + "epoch": 0.03905614320585842, + "grad_norm": 17.804658889770508, + "learning_rate": 3.1654724054053805e-07, + "loss": 0.4266, + "num_input_tokens_seen": 5335154688, + "step": 2544 + }, + { + "epoch": 0.03932736642256577, + "grad_norm": 16.298038482666016, + "learning_rate": 3.1620221914808115e-07, + "loss": 0.4095, + "num_input_tokens_seen": 5337251840, + "step": 2545 + }, + { + "epoch": 0.039598589639273124, + "grad_norm": 18.965286254882812, + "learning_rate": 3.1585738590718157e-07, + "loss": 0.3647, + "num_input_tokens_seen": 5339348992, + "step": 2546 + }, + { + "epoch": 0.03986981285598047, + "grad_norm": 22.938982009887695, + "learning_rate": 3.1551274109534805e-07, + "loss": 0.6584, + "num_input_tokens_seen": 5341446144, + "step": 2547 + }, + { + "epoch": 0.04014103607268782, + "grad_norm": 20.005109786987305, + "learning_rate": 3.151682849899376e-07, + "loss": 0.6629, + "num_input_tokens_seen": 5343543296, + "step": 2548 + }, + { + "epoch": 0.040412259289395173, + "grad_norm": 16.92494773864746, + "learning_rate": 3.148240178681553e-07, + "loss": 0.4334, + "num_input_tokens_seen": 5345640448, + "step": 2549 + }, + { + "epoch": 0.04068348250610252, + "grad_norm": 20.46824073791504, + "learning_rate": 3.1447994000705456e-07, + "loss": 0.6027, + "num_input_tokens_seen": 5347737600, + "step": 2550 + }, + { + "epoch": 0.040954705722809875, + "grad_norm": 13.558602333068848, + "learning_rate": 3.141360516835356e-07, + "loss": 0.3774, + "num_input_tokens_seen": 5349834752, + "step": 2551 + }, + { + "epoch": 0.04122592893951722, + "grad_norm": 8.050308227539062, + "learning_rate": 3.1379235317434703e-07, + "loss": 0.1835, + "num_input_tokens_seen": 5351931904, + "step": 2552 + }, + { + "epoch": 0.04149715215622457, + "grad_norm": 15.542895317077637, + "learning_rate": 3.134488447560843e-07, + "loss": 0.3894, + "num_input_tokens_seen": 5354029056, + "step": 2553 + }, + { + "epoch": 0.041768375372931925, + "grad_norm": 25.710311889648438, + "learning_rate": 3.1310552670518987e-07, + "loss": 0.7527, + "num_input_tokens_seen": 5356126208, + "step": 2554 + }, + { + "epoch": 0.04203959858963927, + "grad_norm": 19.23603057861328, + "learning_rate": 3.127623992979532e-07, + "loss": 0.4483, + "num_input_tokens_seen": 5358223360, + "step": 2555 + }, + { + "epoch": 0.04231082180634662, + "grad_norm": 16.744998931884766, + "learning_rate": 3.124194628105098e-07, + "loss": 0.3604, + "num_input_tokens_seen": 5360320512, + "step": 2556 + }, + { + "epoch": 0.042582045023053974, + "grad_norm": 19.892236709594727, + "learning_rate": 3.120767175188422e-07, + "loss": 0.5432, + "num_input_tokens_seen": 5362417664, + "step": 2557 + }, + { + "epoch": 0.04285326823976132, + "grad_norm": 21.464439392089844, + "learning_rate": 3.1173416369877864e-07, + "loss": 0.6695, + "num_input_tokens_seen": 5364514816, + "step": 2558 + }, + { + "epoch": 0.043124491456468676, + "grad_norm": 18.040634155273438, + "learning_rate": 3.1139180162599346e-07, + "loss": 0.4755, + "num_input_tokens_seen": 5366611968, + "step": 2559 + }, + { + "epoch": 0.043395714673176024, + "grad_norm": 16.184511184692383, + "learning_rate": 3.110496315760065e-07, + "loss": 0.3587, + "num_input_tokens_seen": 5368709120, + "step": 2560 + }, + { + "epoch": 0.04366693788988337, + "grad_norm": 14.748477935791016, + "learning_rate": 3.107076538241835e-07, + "loss": 0.4959, + "num_input_tokens_seen": 5370806272, + "step": 2561 + }, + { + "epoch": 0.043938161106590726, + "grad_norm": 15.208799362182617, + "learning_rate": 3.103658686457349e-07, + "loss": 0.3855, + "num_input_tokens_seen": 5372903424, + "step": 2562 + }, + { + "epoch": 0.04420938432329807, + "grad_norm": 14.134151458740234, + "learning_rate": 3.1002427631571646e-07, + "loss": 0.3119, + "num_input_tokens_seen": 5375000576, + "step": 2563 + }, + { + "epoch": 0.04448060754000543, + "grad_norm": 14.675880432128906, + "learning_rate": 3.0968287710902866e-07, + "loss": 0.3008, + "num_input_tokens_seen": 5377097728, + "step": 2564 + }, + { + "epoch": 0.044751830756712775, + "grad_norm": 18.92854881286621, + "learning_rate": 3.0934167130041666e-07, + "loss": 0.638, + "num_input_tokens_seen": 5379194880, + "step": 2565 + }, + { + "epoch": 0.04502305397342012, + "grad_norm": 17.33856773376465, + "learning_rate": 3.090006591644698e-07, + "loss": 0.5573, + "num_input_tokens_seen": 5381292032, + "step": 2566 + }, + { + "epoch": 0.04529427719012748, + "grad_norm": 10.942505836486816, + "learning_rate": 3.0865984097562183e-07, + "loss": 0.2354, + "num_input_tokens_seen": 5383389184, + "step": 2567 + }, + { + "epoch": 0.045565500406834825, + "grad_norm": 17.489700317382812, + "learning_rate": 3.083192170081501e-07, + "loss": 0.5418, + "num_input_tokens_seen": 5385486336, + "step": 2568 + }, + { + "epoch": 0.04583672362354217, + "grad_norm": 17.590167999267578, + "learning_rate": 3.079787875361759e-07, + "loss": 0.4361, + "num_input_tokens_seen": 5387583488, + "step": 2569 + }, + { + "epoch": 0.04610794684024953, + "grad_norm": 20.286165237426758, + "learning_rate": 3.0763855283366386e-07, + "loss": 0.4253, + "num_input_tokens_seen": 5389680640, + "step": 2570 + }, + { + "epoch": 0.046379170056956874, + "grad_norm": 13.312159538269043, + "learning_rate": 3.07298513174422e-07, + "loss": 0.2986, + "num_input_tokens_seen": 5391777792, + "step": 2571 + }, + { + "epoch": 0.04665039327366423, + "grad_norm": 17.008472442626953, + "learning_rate": 3.0695866883210143e-07, + "loss": 0.5234, + "num_input_tokens_seen": 5393874944, + "step": 2572 + }, + { + "epoch": 0.046921616490371576, + "grad_norm": 25.539731979370117, + "learning_rate": 3.0661902008019556e-07, + "loss": 0.6498, + "num_input_tokens_seen": 5395972096, + "step": 2573 + }, + { + "epoch": 0.047192839707078924, + "grad_norm": 23.586692810058594, + "learning_rate": 3.062795671920411e-07, + "loss": 0.8046, + "num_input_tokens_seen": 5398069248, + "step": 2574 + }, + { + "epoch": 0.04746406292378628, + "grad_norm": 13.434320449829102, + "learning_rate": 3.059403104408166e-07, + "loss": 0.3061, + "num_input_tokens_seen": 5400166400, + "step": 2575 + }, + { + "epoch": 0.047735286140493625, + "grad_norm": 11.639397621154785, + "learning_rate": 3.0560125009954296e-07, + "loss": 0.2875, + "num_input_tokens_seen": 5402263552, + "step": 2576 + }, + { + "epoch": 0.04800650935720097, + "grad_norm": 23.569503784179688, + "learning_rate": 3.0526238644108335e-07, + "loss": 0.5249, + "num_input_tokens_seen": 5404360704, + "step": 2577 + }, + { + "epoch": 0.04827773257390833, + "grad_norm": 17.796600341796875, + "learning_rate": 3.049237197381424e-07, + "loss": 0.5019, + "num_input_tokens_seen": 5406457856, + "step": 2578 + }, + { + "epoch": 0.048548955790615675, + "grad_norm": 18.38736915588379, + "learning_rate": 3.045852502632657e-07, + "loss": 0.5809, + "num_input_tokens_seen": 5408555008, + "step": 2579 + }, + { + "epoch": 0.04882017900732303, + "grad_norm": 10.763379096984863, + "learning_rate": 3.042469782888409e-07, + "loss": 0.2202, + "num_input_tokens_seen": 5410652160, + "step": 2580 + }, + { + "epoch": 0.04909140222403038, + "grad_norm": 31.74101448059082, + "learning_rate": 3.0390890408709645e-07, + "loss": 0.4292, + "num_input_tokens_seen": 5412749312, + "step": 2581 + }, + { + "epoch": 0.049362625440737724, + "grad_norm": 14.116605758666992, + "learning_rate": 3.0357102793010145e-07, + "loss": 0.3474, + "num_input_tokens_seen": 5414846464, + "step": 2582 + }, + { + "epoch": 0.04963384865744508, + "grad_norm": 11.441993713378906, + "learning_rate": 3.032333500897659e-07, + "loss": 0.1974, + "num_input_tokens_seen": 5416943616, + "step": 2583 + }, + { + "epoch": 0.049905071874152426, + "grad_norm": 17.127378463745117, + "learning_rate": 3.0289587083784e-07, + "loss": 0.4156, + "num_input_tokens_seen": 5419040768, + "step": 2584 + }, + { + "epoch": 0.05017629509085978, + "grad_norm": 13.827837944030762, + "learning_rate": 3.0255859044591425e-07, + "loss": 0.3239, + "num_input_tokens_seen": 5421137920, + "step": 2585 + }, + { + "epoch": 0.05044751830756713, + "grad_norm": 11.431439399719238, + "learning_rate": 3.02221509185419e-07, + "loss": 0.2857, + "num_input_tokens_seen": 5423235072, + "step": 2586 + }, + { + "epoch": 0.050718741524274476, + "grad_norm": 23.875659942626953, + "learning_rate": 3.0188462732762457e-07, + "loss": 0.3933, + "num_input_tokens_seen": 5425332224, + "step": 2587 + }, + { + "epoch": 0.05098996474098183, + "grad_norm": 16.398712158203125, + "learning_rate": 3.015479451436406e-07, + "loss": 0.3873, + "num_input_tokens_seen": 5427429376, + "step": 2588 + }, + { + "epoch": 0.05126118795768918, + "grad_norm": 17.132213592529297, + "learning_rate": 3.0121146290441593e-07, + "loss": 0.4602, + "num_input_tokens_seen": 5429526528, + "step": 2589 + }, + { + "epoch": 0.051532411174396525, + "grad_norm": 21.008514404296875, + "learning_rate": 3.0087518088073905e-07, + "loss": 0.6185, + "num_input_tokens_seen": 5431623680, + "step": 2590 + }, + { + "epoch": 0.05180363439110388, + "grad_norm": 19.426393508911133, + "learning_rate": 3.005390993432366e-07, + "loss": 0.4214, + "num_input_tokens_seen": 5433720832, + "step": 2591 + }, + { + "epoch": 0.05207485760781123, + "grad_norm": 15.54585075378418, + "learning_rate": 3.00203218562374e-07, + "loss": 0.3765, + "num_input_tokens_seen": 5435817984, + "step": 2592 + }, + { + "epoch": 0.05234608082451858, + "grad_norm": 15.346580505371094, + "learning_rate": 2.9986753880845596e-07, + "loss": 0.2658, + "num_input_tokens_seen": 5437915136, + "step": 2593 + }, + { + "epoch": 0.05261730404122593, + "grad_norm": 11.858630180358887, + "learning_rate": 2.9953206035162433e-07, + "loss": 0.2507, + "num_input_tokens_seen": 5440012288, + "step": 2594 + }, + { + "epoch": 0.05288852725793328, + "grad_norm": 14.775229454040527, + "learning_rate": 2.991967834618597e-07, + "loss": 0.3481, + "num_input_tokens_seen": 5442109440, + "step": 2595 + }, + { + "epoch": 0.05315975047464063, + "grad_norm": 12.877067565917969, + "learning_rate": 2.9886170840897977e-07, + "loss": 0.3183, + "num_input_tokens_seen": 5444206592, + "step": 2596 + }, + { + "epoch": 0.05343097369134798, + "grad_norm": 20.334400177001953, + "learning_rate": 2.9852683546264047e-07, + "loss": 0.4371, + "num_input_tokens_seen": 5446303744, + "step": 2597 + }, + { + "epoch": 0.05370219690805533, + "grad_norm": 24.98883819580078, + "learning_rate": 2.9819216489233467e-07, + "loss": 0.7361, + "num_input_tokens_seen": 5448400896, + "step": 2598 + }, + { + "epoch": 0.05397342012476268, + "grad_norm": 12.599200248718262, + "learning_rate": 2.978576969673926e-07, + "loss": 0.3259, + "num_input_tokens_seen": 5450498048, + "step": 2599 + }, + { + "epoch": 0.05424464334147003, + "grad_norm": 17.292268753051758, + "learning_rate": 2.9752343195698125e-07, + "loss": 0.463, + "num_input_tokens_seen": 5452595200, + "step": 2600 + }, + { + "epoch": 0.05451586655817738, + "grad_norm": 18.126787185668945, + "learning_rate": 2.9718937013010444e-07, + "loss": 0.3436, + "num_input_tokens_seen": 5454692352, + "step": 2601 + }, + { + "epoch": 0.05478708977488473, + "grad_norm": 9.295369148254395, + "learning_rate": 2.9685551175560235e-07, + "loss": 0.192, + "num_input_tokens_seen": 5456789504, + "step": 2602 + }, + { + "epoch": 0.05505831299159208, + "grad_norm": 21.22579574584961, + "learning_rate": 2.965218571021516e-07, + "loss": 0.7479, + "num_input_tokens_seen": 5458886656, + "step": 2603 + }, + { + "epoch": 0.05532953620829943, + "grad_norm": 18.786785125732422, + "learning_rate": 2.9618840643826464e-07, + "loss": 0.4214, + "num_input_tokens_seen": 5460983808, + "step": 2604 + }, + { + "epoch": 0.05560075942500678, + "grad_norm": 20.445255279541016, + "learning_rate": 2.958551600322899e-07, + "loss": 0.5877, + "num_input_tokens_seen": 5463080960, + "step": 2605 + }, + { + "epoch": 0.055871982641714134, + "grad_norm": 19.826480865478516, + "learning_rate": 2.9552211815241156e-07, + "loss": 0.5026, + "num_input_tokens_seen": 5465178112, + "step": 2606 + }, + { + "epoch": 0.05614320585842148, + "grad_norm": 14.695202827453613, + "learning_rate": 2.9518928106664897e-07, + "loss": 0.35, + "num_input_tokens_seen": 5467275264, + "step": 2607 + }, + { + "epoch": 0.05641442907512883, + "grad_norm": 16.892658233642578, + "learning_rate": 2.9485664904285643e-07, + "loss": 0.4947, + "num_input_tokens_seen": 5469372416, + "step": 2608 + }, + { + "epoch": 0.05668565229183618, + "grad_norm": 19.319557189941406, + "learning_rate": 2.94524222348724e-07, + "loss": 0.5193, + "num_input_tokens_seen": 5471469568, + "step": 2609 + }, + { + "epoch": 0.05695687550854353, + "grad_norm": 21.24896240234375, + "learning_rate": 2.9419200125177585e-07, + "loss": 0.3985, + "num_input_tokens_seen": 5473566720, + "step": 2610 + }, + { + "epoch": 0.05722809872525088, + "grad_norm": 11.423199653625488, + "learning_rate": 2.938599860193709e-07, + "loss": 0.2865, + "num_input_tokens_seen": 5475663872, + "step": 2611 + }, + { + "epoch": 0.05749932194195823, + "grad_norm": 11.481456756591797, + "learning_rate": 2.935281769187025e-07, + "loss": 0.2112, + "num_input_tokens_seen": 5477761024, + "step": 2612 + }, + { + "epoch": 0.05777054515866558, + "grad_norm": 19.210725784301758, + "learning_rate": 2.9319657421679757e-07, + "loss": 0.4367, + "num_input_tokens_seen": 5479858176, + "step": 2613 + }, + { + "epoch": 0.058041768375372935, + "grad_norm": 17.227354049682617, + "learning_rate": 2.928651781805177e-07, + "loss": 0.3199, + "num_input_tokens_seen": 5481955328, + "step": 2614 + }, + { + "epoch": 0.05831299159208028, + "grad_norm": 11.751642227172852, + "learning_rate": 2.9253398907655775e-07, + "loss": 0.2538, + "num_input_tokens_seen": 5484052480, + "step": 2615 + }, + { + "epoch": 0.05858421480878763, + "grad_norm": 11.568734169006348, + "learning_rate": 2.9220300717144597e-07, + "loss": 0.2633, + "num_input_tokens_seen": 5486149632, + "step": 2616 + }, + { + "epoch": 0.058855438025494984, + "grad_norm": 12.480401992797852, + "learning_rate": 2.9187223273154406e-07, + "loss": 0.2719, + "num_input_tokens_seen": 5488246784, + "step": 2617 + }, + { + "epoch": 0.05912666124220233, + "grad_norm": 21.550291061401367, + "learning_rate": 2.91541666023047e-07, + "loss": 0.5558, + "num_input_tokens_seen": 5490343936, + "step": 2618 + }, + { + "epoch": 0.059397884458909686, + "grad_norm": 20.7867374420166, + "learning_rate": 2.9121130731198204e-07, + "loss": 0.5337, + "num_input_tokens_seen": 5492441088, + "step": 2619 + }, + { + "epoch": 0.059669107675617034, + "grad_norm": 15.338828086853027, + "learning_rate": 2.9088115686420917e-07, + "loss": 0.4042, + "num_input_tokens_seen": 5494538240, + "step": 2620 + }, + { + "epoch": 0.05994033089232438, + "grad_norm": 17.88315200805664, + "learning_rate": 2.9055121494542115e-07, + "loss": 0.3717, + "num_input_tokens_seen": 5496635392, + "step": 2621 + }, + { + "epoch": 0.060211554109031735, + "grad_norm": 12.472522735595703, + "learning_rate": 2.9022148182114247e-07, + "loss": 0.2657, + "num_input_tokens_seen": 5498732544, + "step": 2622 + }, + { + "epoch": 0.06048277732573908, + "grad_norm": 23.858972549438477, + "learning_rate": 2.8989195775673e-07, + "loss": 0.5611, + "num_input_tokens_seen": 5500829696, + "step": 2623 + }, + { + "epoch": 0.06075400054244643, + "grad_norm": 12.803933143615723, + "learning_rate": 2.89562643017372e-07, + "loss": 0.3324, + "num_input_tokens_seen": 5502926848, + "step": 2624 + }, + { + "epoch": 0.061025223759153785, + "grad_norm": 26.708951950073242, + "learning_rate": 2.8923353786808857e-07, + "loss": 0.4797, + "num_input_tokens_seen": 5505024000, + "step": 2625 + }, + { + "epoch": 0.06129644697586113, + "grad_norm": 25.36354637145996, + "learning_rate": 2.8890464257373105e-07, + "loss": 0.8212, + "num_input_tokens_seen": 5507121152, + "step": 2626 + }, + { + "epoch": 0.06156767019256849, + "grad_norm": 13.080056190490723, + "learning_rate": 2.8857595739898164e-07, + "loss": 0.3743, + "num_input_tokens_seen": 5509218304, + "step": 2627 + }, + { + "epoch": 0.061838893409275834, + "grad_norm": 19.789167404174805, + "learning_rate": 2.8824748260835386e-07, + "loss": 0.652, + "num_input_tokens_seen": 5511315456, + "step": 2628 + }, + { + "epoch": 0.06211011662598318, + "grad_norm": 28.163496017456055, + "learning_rate": 2.879192184661918e-07, + "loss": 0.6427, + "num_input_tokens_seen": 5513412608, + "step": 2629 + }, + { + "epoch": 0.062381339842690536, + "grad_norm": 15.175286293029785, + "learning_rate": 2.8759116523666973e-07, + "loss": 0.353, + "num_input_tokens_seen": 5515509760, + "step": 2630 + }, + { + "epoch": 0.06265256305939788, + "grad_norm": 18.326745986938477, + "learning_rate": 2.872633231837923e-07, + "loss": 0.5979, + "num_input_tokens_seen": 5517606912, + "step": 2631 + }, + { + "epoch": 0.06292378627610523, + "grad_norm": 14.984691619873047, + "learning_rate": 2.869356925713946e-07, + "loss": 0.2892, + "num_input_tokens_seen": 5519704064, + "step": 2632 + }, + { + "epoch": 0.06319500949281258, + "grad_norm": 20.763242721557617, + "learning_rate": 2.86608273663141e-07, + "loss": 0.6904, + "num_input_tokens_seen": 5521801216, + "step": 2633 + }, + { + "epoch": 0.06346623270951994, + "grad_norm": 19.28316307067871, + "learning_rate": 2.8628106672252614e-07, + "loss": 0.7099, + "num_input_tokens_seen": 5523898368, + "step": 2634 + }, + { + "epoch": 0.06373745592622729, + "grad_norm": 14.820695877075195, + "learning_rate": 2.859540720128737e-07, + "loss": 0.3436, + "num_input_tokens_seen": 5525995520, + "step": 2635 + }, + { + "epoch": 0.06400867914293464, + "grad_norm": 17.229780197143555, + "learning_rate": 2.856272897973362e-07, + "loss": 0.397, + "num_input_tokens_seen": 5528092672, + "step": 2636 + }, + { + "epoch": 0.06427990235964198, + "grad_norm": 19.926528930664062, + "learning_rate": 2.853007203388958e-07, + "loss": 0.5819, + "num_input_tokens_seen": 5530189824, + "step": 2637 + }, + { + "epoch": 0.06455112557634933, + "grad_norm": 12.162901878356934, + "learning_rate": 2.849743639003631e-07, + "loss": 0.3037, + "num_input_tokens_seen": 5532286976, + "step": 2638 + }, + { + "epoch": 0.06482234879305669, + "grad_norm": 20.921995162963867, + "learning_rate": 2.846482207443773e-07, + "loss": 0.4711, + "num_input_tokens_seen": 5534384128, + "step": 2639 + }, + { + "epoch": 0.06509357200976404, + "grad_norm": 16.384029388427734, + "learning_rate": 2.843222911334061e-07, + "loss": 0.3534, + "num_input_tokens_seen": 5536481280, + "step": 2640 + }, + { + "epoch": 0.06536479522647139, + "grad_norm": 22.67833137512207, + "learning_rate": 2.839965753297452e-07, + "loss": 0.6336, + "num_input_tokens_seen": 5538578432, + "step": 2641 + }, + { + "epoch": 0.06563601844317873, + "grad_norm": 12.454771995544434, + "learning_rate": 2.8367107359551835e-07, + "loss": 0.3167, + "num_input_tokens_seen": 5540675584, + "step": 2642 + }, + { + "epoch": 0.06590724165988608, + "grad_norm": 18.632858276367188, + "learning_rate": 2.8334578619267683e-07, + "loss": 0.5943, + "num_input_tokens_seen": 5542772736, + "step": 2643 + }, + { + "epoch": 0.06617846487659344, + "grad_norm": 10.916572570800781, + "learning_rate": 2.830207133829997e-07, + "loss": 0.2241, + "num_input_tokens_seen": 5544869888, + "step": 2644 + }, + { + "epoch": 0.06644968809330079, + "grad_norm": 20.393115997314453, + "learning_rate": 2.8269585542809305e-07, + "loss": 0.5303, + "num_input_tokens_seen": 5546967040, + "step": 2645 + }, + { + "epoch": 0.06672091131000814, + "grad_norm": 16.837371826171875, + "learning_rate": 2.8237121258939036e-07, + "loss": 0.4077, + "num_input_tokens_seen": 5549064192, + "step": 2646 + }, + { + "epoch": 0.06699213452671549, + "grad_norm": 25.16261863708496, + "learning_rate": 2.8204678512815185e-07, + "loss": 0.518, + "num_input_tokens_seen": 5551161344, + "step": 2647 + }, + { + "epoch": 0.06726335774342283, + "grad_norm": 21.067689895629883, + "learning_rate": 2.8172257330546414e-07, + "loss": 0.572, + "num_input_tokens_seen": 5553258496, + "step": 2648 + }, + { + "epoch": 0.06753458096013018, + "grad_norm": 16.49931526184082, + "learning_rate": 2.8139857738224055e-07, + "loss": 0.3443, + "num_input_tokens_seen": 5555355648, + "step": 2649 + }, + { + "epoch": 0.06780580417683754, + "grad_norm": 15.077832221984863, + "learning_rate": 2.81074797619221e-07, + "loss": 0.3556, + "num_input_tokens_seen": 5557452800, + "step": 2650 + }, + { + "epoch": 0.06807702739354489, + "grad_norm": 14.75187873840332, + "learning_rate": 2.8075123427697093e-07, + "loss": 0.3608, + "num_input_tokens_seen": 5559549952, + "step": 2651 + }, + { + "epoch": 0.06834825061025224, + "grad_norm": 11.628194808959961, + "learning_rate": 2.8042788761588204e-07, + "loss": 0.2241, + "num_input_tokens_seen": 5561647104, + "step": 2652 + }, + { + "epoch": 0.06861947382695958, + "grad_norm": 17.89313316345215, + "learning_rate": 2.8010475789617105e-07, + "loss": 0.5842, + "num_input_tokens_seen": 5563744256, + "step": 2653 + }, + { + "epoch": 0.06889069704366693, + "grad_norm": 17.917509078979492, + "learning_rate": 2.797818453778806e-07, + "loss": 0.5279, + "num_input_tokens_seen": 5565841408, + "step": 2654 + }, + { + "epoch": 0.0691619202603743, + "grad_norm": 14.405590057373047, + "learning_rate": 2.794591503208785e-07, + "loss": 0.3903, + "num_input_tokens_seen": 5567938560, + "step": 2655 + }, + { + "epoch": 0.06943314347708164, + "grad_norm": 22.006885528564453, + "learning_rate": 2.791366729848574e-07, + "loss": 0.6449, + "num_input_tokens_seen": 5570035712, + "step": 2656 + }, + { + "epoch": 0.06970436669378899, + "grad_norm": 23.890174865722656, + "learning_rate": 2.7881441362933464e-07, + "loss": 0.6725, + "num_input_tokens_seen": 5572132864, + "step": 2657 + }, + { + "epoch": 0.06997558991049634, + "grad_norm": 19.865018844604492, + "learning_rate": 2.784923725136525e-07, + "loss": 0.5287, + "num_input_tokens_seen": 5574230016, + "step": 2658 + }, + { + "epoch": 0.07024681312720368, + "grad_norm": 9.786247253417969, + "learning_rate": 2.781705498969773e-07, + "loss": 0.2702, + "num_input_tokens_seen": 5576327168, + "step": 2659 + }, + { + "epoch": 0.07051803634391104, + "grad_norm": 13.455928802490234, + "learning_rate": 2.7784894603829966e-07, + "loss": 0.3915, + "num_input_tokens_seen": 5578424320, + "step": 2660 + }, + { + "epoch": 0.07078925956061839, + "grad_norm": 15.833970069885254, + "learning_rate": 2.7752756119643416e-07, + "loss": 0.4046, + "num_input_tokens_seen": 5580521472, + "step": 2661 + }, + { + "epoch": 0.07106048277732574, + "grad_norm": 13.186774253845215, + "learning_rate": 2.77206395630019e-07, + "loss": 0.233, + "num_input_tokens_seen": 5582618624, + "step": 2662 + }, + { + "epoch": 0.07133170599403309, + "grad_norm": 16.846803665161133, + "learning_rate": 2.7688544959751615e-07, + "loss": 0.3992, + "num_input_tokens_seen": 5584715776, + "step": 2663 + }, + { + "epoch": 0.07160292921074043, + "grad_norm": 16.44211769104004, + "learning_rate": 2.765647233572108e-07, + "loss": 0.454, + "num_input_tokens_seen": 5586812928, + "step": 2664 + }, + { + "epoch": 0.0718741524274478, + "grad_norm": 21.622333526611328, + "learning_rate": 2.7624421716721086e-07, + "loss": 0.6375, + "num_input_tokens_seen": 5588910080, + "step": 2665 + }, + { + "epoch": 0.07214537564415514, + "grad_norm": 13.752047538757324, + "learning_rate": 2.7592393128544784e-07, + "loss": 0.3815, + "num_input_tokens_seen": 5591007232, + "step": 2666 + }, + { + "epoch": 0.07241659886086249, + "grad_norm": 15.634025573730469, + "learning_rate": 2.7560386596967553e-07, + "loss": 0.3299, + "num_input_tokens_seen": 5593104384, + "step": 2667 + }, + { + "epoch": 0.07268782207756984, + "grad_norm": 22.741350173950195, + "learning_rate": 2.7528402147747045e-07, + "loss": 0.8418, + "num_input_tokens_seen": 5595201536, + "step": 2668 + }, + { + "epoch": 0.07295904529427719, + "grad_norm": 18.136112213134766, + "learning_rate": 2.7496439806623124e-07, + "loss": 0.5166, + "num_input_tokens_seen": 5597298688, + "step": 2669 + }, + { + "epoch": 0.07323026851098453, + "grad_norm": 13.299650192260742, + "learning_rate": 2.746449959931784e-07, + "loss": 0.2879, + "num_input_tokens_seen": 5599395840, + "step": 2670 + }, + { + "epoch": 0.0735014917276919, + "grad_norm": 12.364994049072266, + "learning_rate": 2.743258155153548e-07, + "loss": 0.2716, + "num_input_tokens_seen": 5601492992, + "step": 2671 + }, + { + "epoch": 0.07377271494439924, + "grad_norm": 22.236961364746094, + "learning_rate": 2.740068568896247e-07, + "loss": 0.6404, + "num_input_tokens_seen": 5603590144, + "step": 2672 + }, + { + "epoch": 0.07404393816110659, + "grad_norm": 18.271963119506836, + "learning_rate": 2.7368812037267387e-07, + "loss": 0.646, + "num_input_tokens_seen": 5605687296, + "step": 2673 + }, + { + "epoch": 0.07431516137781394, + "grad_norm": 22.82843017578125, + "learning_rate": 2.7336960622100907e-07, + "loss": 0.7859, + "num_input_tokens_seen": 5607784448, + "step": 2674 + }, + { + "epoch": 0.07458638459452128, + "grad_norm": 11.289409637451172, + "learning_rate": 2.7305131469095906e-07, + "loss": 0.1964, + "num_input_tokens_seen": 5609881600, + "step": 2675 + }, + { + "epoch": 0.07485760781122865, + "grad_norm": 20.22431182861328, + "learning_rate": 2.7273324603867203e-07, + "loss": 0.5049, + "num_input_tokens_seen": 5611978752, + "step": 2676 + }, + { + "epoch": 0.075128831027936, + "grad_norm": 21.272485733032227, + "learning_rate": 2.7241540052011787e-07, + "loss": 0.6532, + "num_input_tokens_seen": 5614075904, + "step": 2677 + }, + { + "epoch": 0.07540005424464334, + "grad_norm": 10.40255069732666, + "learning_rate": 2.720977783910865e-07, + "loss": 0.2299, + "num_input_tokens_seen": 5616173056, + "step": 2678 + }, + { + "epoch": 0.07567127746135069, + "grad_norm": 18.323423385620117, + "learning_rate": 2.717803799071881e-07, + "loss": 0.4719, + "num_input_tokens_seen": 5618270208, + "step": 2679 + }, + { + "epoch": 0.07594250067805804, + "grad_norm": 16.696556091308594, + "learning_rate": 2.714632053238529e-07, + "loss": 0.5637, + "num_input_tokens_seen": 5620367360, + "step": 2680 + }, + { + "epoch": 0.0762137238947654, + "grad_norm": 17.217605590820312, + "learning_rate": 2.711462548963309e-07, + "loss": 0.3831, + "num_input_tokens_seen": 5622464512, + "step": 2681 + }, + { + "epoch": 0.07648494711147275, + "grad_norm": 17.2362003326416, + "learning_rate": 2.708295288796918e-07, + "loss": 0.4221, + "num_input_tokens_seen": 5624561664, + "step": 2682 + }, + { + "epoch": 0.07675617032818009, + "grad_norm": 19.473421096801758, + "learning_rate": 2.7051302752882467e-07, + "loss": 0.5495, + "num_input_tokens_seen": 5626658816, + "step": 2683 + }, + { + "epoch": 0.07702739354488744, + "grad_norm": 18.977439880371094, + "learning_rate": 2.7019675109843777e-07, + "loss": 0.4145, + "num_input_tokens_seen": 5628755968, + "step": 2684 + }, + { + "epoch": 0.07729861676159479, + "grad_norm": 22.364627838134766, + "learning_rate": 2.6988069984305817e-07, + "loss": 0.5008, + "num_input_tokens_seen": 5630853120, + "step": 2685 + }, + { + "epoch": 0.07756983997830215, + "grad_norm": 19.7006893157959, + "learning_rate": 2.6956487401703207e-07, + "loss": 0.5733, + "num_input_tokens_seen": 5632950272, + "step": 2686 + }, + { + "epoch": 0.0778410631950095, + "grad_norm": 22.176889419555664, + "learning_rate": 2.692492738745241e-07, + "loss": 0.662, + "num_input_tokens_seen": 5635047424, + "step": 2687 + }, + { + "epoch": 0.07811228641171684, + "grad_norm": 22.010507583618164, + "learning_rate": 2.689338996695171e-07, + "loss": 0.7364, + "num_input_tokens_seen": 5637144576, + "step": 2688 + }, + { + "epoch": 0.07838350962842419, + "grad_norm": 13.696541786193848, + "learning_rate": 2.686187516558122e-07, + "loss": 0.3851, + "num_input_tokens_seen": 5639241728, + "step": 2689 + }, + { + "epoch": 0.07865473284513154, + "grad_norm": 27.36061668395996, + "learning_rate": 2.683038300870285e-07, + "loss": 0.571, + "num_input_tokens_seen": 5641338880, + "step": 2690 + }, + { + "epoch": 0.07892595606183889, + "grad_norm": 14.405625343322754, + "learning_rate": 2.679891352166032e-07, + "loss": 0.3617, + "num_input_tokens_seen": 5643436032, + "step": 2691 + }, + { + "epoch": 0.07919717927854625, + "grad_norm": 19.312789916992188, + "learning_rate": 2.6767466729779074e-07, + "loss": 0.552, + "num_input_tokens_seen": 5645533184, + "step": 2692 + }, + { + "epoch": 0.0794684024952536, + "grad_norm": 23.497774124145508, + "learning_rate": 2.6736042658366255e-07, + "loss": 0.5278, + "num_input_tokens_seen": 5647630336, + "step": 2693 + }, + { + "epoch": 0.07973962571196094, + "grad_norm": 18.922243118286133, + "learning_rate": 2.6704641332710783e-07, + "loss": 0.5603, + "num_input_tokens_seen": 5649727488, + "step": 2694 + }, + { + "epoch": 0.08001084892866829, + "grad_norm": 18.824480056762695, + "learning_rate": 2.6673262778083246e-07, + "loss": 0.4909, + "num_input_tokens_seen": 5651824640, + "step": 2695 + }, + { + "epoch": 0.08028207214537564, + "grad_norm": 18.664011001586914, + "learning_rate": 2.6641907019735914e-07, + "loss": 0.4259, + "num_input_tokens_seen": 5653921792, + "step": 2696 + }, + { + "epoch": 0.080553295362083, + "grad_norm": 12.166492462158203, + "learning_rate": 2.6610574082902704e-07, + "loss": 0.2075, + "num_input_tokens_seen": 5656018944, + "step": 2697 + }, + { + "epoch": 0.08082451857879035, + "grad_norm": 17.41253662109375, + "learning_rate": 2.657926399279918e-07, + "loss": 0.5047, + "num_input_tokens_seen": 5658116096, + "step": 2698 + }, + { + "epoch": 0.0810957417954977, + "grad_norm": 12.47386646270752, + "learning_rate": 2.65479767746225e-07, + "loss": 0.2702, + "num_input_tokens_seen": 5660213248, + "step": 2699 + }, + { + "epoch": 0.08136696501220504, + "grad_norm": 15.2111234664917, + "learning_rate": 2.651671245355144e-07, + "loss": 0.2429, + "num_input_tokens_seen": 5662310400, + "step": 2700 + }, + { + "epoch": 0.08163818822891239, + "grad_norm": 20.175670623779297, + "learning_rate": 2.6485471054746315e-07, + "loss": 0.4939, + "num_input_tokens_seen": 5664407552, + "step": 2701 + }, + { + "epoch": 0.08190941144561975, + "grad_norm": 15.249000549316406, + "learning_rate": 2.645425260334904e-07, + "loss": 0.4021, + "num_input_tokens_seen": 5666504704, + "step": 2702 + }, + { + "epoch": 0.0821806346623271, + "grad_norm": 23.084077835083008, + "learning_rate": 2.6423057124483015e-07, + "loss": 0.7402, + "num_input_tokens_seen": 5668601856, + "step": 2703 + }, + { + "epoch": 0.08245185787903445, + "grad_norm": 11.628067016601562, + "learning_rate": 2.6391884643253197e-07, + "loss": 0.2521, + "num_input_tokens_seen": 5670699008, + "step": 2704 + }, + { + "epoch": 0.0827230810957418, + "grad_norm": 14.505975723266602, + "learning_rate": 2.6360735184745984e-07, + "loss": 0.3268, + "num_input_tokens_seen": 5672796160, + "step": 2705 + }, + { + "epoch": 0.08299430431244914, + "grad_norm": 16.887388229370117, + "learning_rate": 2.6329608774029285e-07, + "loss": 0.4597, + "num_input_tokens_seen": 5674893312, + "step": 2706 + }, + { + "epoch": 0.0832655275291565, + "grad_norm": 24.89675521850586, + "learning_rate": 2.6298505436152457e-07, + "loss": 0.6038, + "num_input_tokens_seen": 5676990464, + "step": 2707 + }, + { + "epoch": 0.08353675074586385, + "grad_norm": 16.737106323242188, + "learning_rate": 2.626742519614629e-07, + "loss": 0.3616, + "num_input_tokens_seen": 5679087616, + "step": 2708 + }, + { + "epoch": 0.0838079739625712, + "grad_norm": 15.908254623413086, + "learning_rate": 2.623636807902298e-07, + "loss": 0.4107, + "num_input_tokens_seen": 5681184768, + "step": 2709 + }, + { + "epoch": 0.08407919717927854, + "grad_norm": 12.272135734558105, + "learning_rate": 2.620533410977609e-07, + "loss": 0.2693, + "num_input_tokens_seen": 5683281920, + "step": 2710 + }, + { + "epoch": 0.08435042039598589, + "grad_norm": 18.592920303344727, + "learning_rate": 2.617432331338059e-07, + "loss": 0.5119, + "num_input_tokens_seen": 5685379072, + "step": 2711 + }, + { + "epoch": 0.08462164361269324, + "grad_norm": 21.81186294555664, + "learning_rate": 2.614333571479279e-07, + "loss": 0.6332, + "num_input_tokens_seen": 5687476224, + "step": 2712 + }, + { + "epoch": 0.0848928668294006, + "grad_norm": 13.03216552734375, + "learning_rate": 2.6112371338950325e-07, + "loss": 0.2644, + "num_input_tokens_seen": 5689573376, + "step": 2713 + }, + { + "epoch": 0.08516409004610795, + "grad_norm": 21.6326904296875, + "learning_rate": 2.608143021077215e-07, + "loss": 0.5831, + "num_input_tokens_seen": 5691670528, + "step": 2714 + }, + { + "epoch": 0.0854353132628153, + "grad_norm": 13.137306213378906, + "learning_rate": 2.6050512355158503e-07, + "loss": 0.3289, + "num_input_tokens_seen": 5693767680, + "step": 2715 + }, + { + "epoch": 0.08570653647952264, + "grad_norm": 20.455364227294922, + "learning_rate": 2.601961779699091e-07, + "loss": 0.687, + "num_input_tokens_seen": 5695864832, + "step": 2716 + }, + { + "epoch": 0.08597775969622999, + "grad_norm": 14.189478874206543, + "learning_rate": 2.5988746561132136e-07, + "loss": 0.3623, + "num_input_tokens_seen": 5697961984, + "step": 2717 + }, + { + "epoch": 0.08624898291293735, + "grad_norm": 14.868339538574219, + "learning_rate": 2.595789867242616e-07, + "loss": 0.3914, + "num_input_tokens_seen": 5700059136, + "step": 2718 + }, + { + "epoch": 0.0865202061296447, + "grad_norm": 16.712444305419922, + "learning_rate": 2.5927074155698203e-07, + "loss": 0.3688, + "num_input_tokens_seen": 5702156288, + "step": 2719 + }, + { + "epoch": 0.08679142934635205, + "grad_norm": 23.333152770996094, + "learning_rate": 2.589627303575465e-07, + "loss": 0.7762, + "num_input_tokens_seen": 5704253440, + "step": 2720 + }, + { + "epoch": 0.0870626525630594, + "grad_norm": 21.890504837036133, + "learning_rate": 2.5865495337383103e-07, + "loss": 0.4798, + "num_input_tokens_seen": 5706350592, + "step": 2721 + }, + { + "epoch": 0.08733387577976674, + "grad_norm": 17.646825790405273, + "learning_rate": 2.5834741085352223e-07, + "loss": 0.4479, + "num_input_tokens_seen": 5708447744, + "step": 2722 + }, + { + "epoch": 0.0876050989964741, + "grad_norm": 20.30681800842285, + "learning_rate": 2.5804010304411914e-07, + "loss": 0.6757, + "num_input_tokens_seen": 5710544896, + "step": 2723 + }, + { + "epoch": 0.08787632221318145, + "grad_norm": 9.638075828552246, + "learning_rate": 2.5773303019293123e-07, + "loss": 0.2059, + "num_input_tokens_seen": 5712642048, + "step": 2724 + }, + { + "epoch": 0.0881475454298888, + "grad_norm": 11.49023723602295, + "learning_rate": 2.5742619254707905e-07, + "loss": 0.1925, + "num_input_tokens_seen": 5714739200, + "step": 2725 + }, + { + "epoch": 0.08841876864659615, + "grad_norm": 18.779539108276367, + "learning_rate": 2.5711959035349396e-07, + "loss": 0.4855, + "num_input_tokens_seen": 5716836352, + "step": 2726 + }, + { + "epoch": 0.0886899918633035, + "grad_norm": 11.857897758483887, + "learning_rate": 2.568132238589175e-07, + "loss": 0.2486, + "num_input_tokens_seen": 5718933504, + "step": 2727 + }, + { + "epoch": 0.08896121508001086, + "grad_norm": 20.438987731933594, + "learning_rate": 2.565070933099019e-07, + "loss": 0.4293, + "num_input_tokens_seen": 5721030656, + "step": 2728 + }, + { + "epoch": 0.0892324382967182, + "grad_norm": 15.0324125289917, + "learning_rate": 2.5620119895280935e-07, + "loss": 0.3677, + "num_input_tokens_seen": 5723127808, + "step": 2729 + }, + { + "epoch": 0.08950366151342555, + "grad_norm": 15.559118270874023, + "learning_rate": 2.5589554103381195e-07, + "loss": 0.3675, + "num_input_tokens_seen": 5725224960, + "step": 2730 + }, + { + "epoch": 0.0897748847301329, + "grad_norm": 19.351423263549805, + "learning_rate": 2.5559011979889155e-07, + "loss": 0.425, + "num_input_tokens_seen": 5727322112, + "step": 2731 + }, + { + "epoch": 0.09004610794684025, + "grad_norm": 13.77000904083252, + "learning_rate": 2.552849354938399e-07, + "loss": 0.3441, + "num_input_tokens_seen": 5729419264, + "step": 2732 + }, + { + "epoch": 0.09031733116354759, + "grad_norm": 13.648260116577148, + "learning_rate": 2.549799883642573e-07, + "loss": 0.3423, + "num_input_tokens_seen": 5731516416, + "step": 2733 + }, + { + "epoch": 0.09058855438025495, + "grad_norm": 12.919384002685547, + "learning_rate": 2.5467527865555366e-07, + "loss": 0.2816, + "num_input_tokens_seen": 5733613568, + "step": 2734 + }, + { + "epoch": 0.0908597775969623, + "grad_norm": 20.780710220336914, + "learning_rate": 2.5437080661294785e-07, + "loss": 0.6129, + "num_input_tokens_seen": 5735710720, + "step": 2735 + }, + { + "epoch": 0.09113100081366965, + "grad_norm": 22.01446533203125, + "learning_rate": 2.5406657248146735e-07, + "loss": 0.5003, + "num_input_tokens_seen": 5737807872, + "step": 2736 + }, + { + "epoch": 0.091402224030377, + "grad_norm": 19.538734436035156, + "learning_rate": 2.5376257650594823e-07, + "loss": 0.4947, + "num_input_tokens_seen": 5739905024, + "step": 2737 + }, + { + "epoch": 0.09167344724708434, + "grad_norm": 13.193866729736328, + "learning_rate": 2.5345881893103484e-07, + "loss": 0.2547, + "num_input_tokens_seen": 5742002176, + "step": 2738 + }, + { + "epoch": 0.0919446704637917, + "grad_norm": 23.290098190307617, + "learning_rate": 2.5315530000117973e-07, + "loss": 0.7383, + "num_input_tokens_seen": 5744099328, + "step": 2739 + }, + { + "epoch": 0.09221589368049905, + "grad_norm": 22.168949127197266, + "learning_rate": 2.528520199606434e-07, + "loss": 0.7584, + "num_input_tokens_seen": 5746196480, + "step": 2740 + }, + { + "epoch": 0.0924871168972064, + "grad_norm": 20.419174194335938, + "learning_rate": 2.525489790534941e-07, + "loss": 0.6092, + "num_input_tokens_seen": 5748293632, + "step": 2741 + }, + { + "epoch": 0.09275834011391375, + "grad_norm": 23.462411880493164, + "learning_rate": 2.5224617752360766e-07, + "loss": 0.6697, + "num_input_tokens_seen": 5750390784, + "step": 2742 + }, + { + "epoch": 0.0930295633306211, + "grad_norm": 15.432395935058594, + "learning_rate": 2.519436156146671e-07, + "loss": 0.4207, + "num_input_tokens_seen": 5752487936, + "step": 2743 + }, + { + "epoch": 0.09330078654732846, + "grad_norm": 17.80773162841797, + "learning_rate": 2.51641293570163e-07, + "loss": 0.5068, + "num_input_tokens_seen": 5754585088, + "step": 2744 + }, + { + "epoch": 0.0935720097640358, + "grad_norm": 13.011022567749023, + "learning_rate": 2.513392116333922e-07, + "loss": 0.2518, + "num_input_tokens_seen": 5756682240, + "step": 2745 + }, + { + "epoch": 0.09384323298074315, + "grad_norm": 16.054059982299805, + "learning_rate": 2.510373700474592e-07, + "loss": 0.4552, + "num_input_tokens_seen": 5758779392, + "step": 2746 + }, + { + "epoch": 0.0941144561974505, + "grad_norm": 13.27901840209961, + "learning_rate": 2.5073576905527407e-07, + "loss": 0.2891, + "num_input_tokens_seen": 5760876544, + "step": 2747 + }, + { + "epoch": 0.09438567941415785, + "grad_norm": 18.663637161254883, + "learning_rate": 2.5043440889955434e-07, + "loss": 0.5392, + "num_input_tokens_seen": 5762973696, + "step": 2748 + }, + { + "epoch": 0.09465690263086521, + "grad_norm": 17.319080352783203, + "learning_rate": 2.501332898228232e-07, + "loss": 0.4573, + "num_input_tokens_seen": 5765070848, + "step": 2749 + }, + { + "epoch": 0.09492812584757256, + "grad_norm": 17.932674407958984, + "learning_rate": 2.4983241206740945e-07, + "loss": 0.3974, + "num_input_tokens_seen": 5767168000, + "step": 2750 + }, + { + "epoch": 0.0951993490642799, + "grad_norm": 16.8289737701416, + "learning_rate": 2.495317758754483e-07, + "loss": 0.5203, + "num_input_tokens_seen": 5769265152, + "step": 2751 + }, + { + "epoch": 0.09547057228098725, + "grad_norm": 18.582918167114258, + "learning_rate": 2.492313814888802e-07, + "loss": 0.4871, + "num_input_tokens_seen": 5771362304, + "step": 2752 + }, + { + "epoch": 0.0957417954976946, + "grad_norm": 13.084208488464355, + "learning_rate": 2.4893122914945124e-07, + "loss": 0.3114, + "num_input_tokens_seen": 5773459456, + "step": 2753 + }, + { + "epoch": 0.09601301871440195, + "grad_norm": 23.180992126464844, + "learning_rate": 2.4863131909871247e-07, + "loss": 0.4209, + "num_input_tokens_seen": 5775556608, + "step": 2754 + }, + { + "epoch": 0.09628424193110931, + "grad_norm": 19.62271499633789, + "learning_rate": 2.483316515780202e-07, + "loss": 0.5382, + "num_input_tokens_seen": 5777653760, + "step": 2755 + }, + { + "epoch": 0.09655546514781665, + "grad_norm": 11.799657821655273, + "learning_rate": 2.480322268285354e-07, + "loss": 0.2602, + "num_input_tokens_seen": 5779750912, + "step": 2756 + }, + { + "epoch": 0.096826688364524, + "grad_norm": 14.03824234008789, + "learning_rate": 2.4773304509122374e-07, + "loss": 0.3326, + "num_input_tokens_seen": 5781848064, + "step": 2757 + }, + { + "epoch": 0.09709791158123135, + "grad_norm": 18.534460067749023, + "learning_rate": 2.474341066068553e-07, + "loss": 0.4274, + "num_input_tokens_seen": 5783945216, + "step": 2758 + }, + { + "epoch": 0.0973691347979387, + "grad_norm": 22.347103118896484, + "learning_rate": 2.4713541161600434e-07, + "loss": 0.681, + "num_input_tokens_seen": 5786042368, + "step": 2759 + }, + { + "epoch": 0.09764035801464606, + "grad_norm": 17.200597763061523, + "learning_rate": 2.4683696035904926e-07, + "loss": 0.4139, + "num_input_tokens_seen": 5788139520, + "step": 2760 + }, + { + "epoch": 0.0979115812313534, + "grad_norm": 24.39967918395996, + "learning_rate": 2.465387530761724e-07, + "loss": 0.2853, + "num_input_tokens_seen": 5790236672, + "step": 2761 + }, + { + "epoch": 0.09818280444806075, + "grad_norm": 15.185997009277344, + "learning_rate": 2.462407900073594e-07, + "loss": 0.3234, + "num_input_tokens_seen": 5792333824, + "step": 2762 + }, + { + "epoch": 0.0984540276647681, + "grad_norm": 19.6497745513916, + "learning_rate": 2.459430713923995e-07, + "loss": 0.6504, + "num_input_tokens_seen": 5794430976, + "step": 2763 + }, + { + "epoch": 0.09872525088147545, + "grad_norm": 11.902987480163574, + "learning_rate": 2.4564559747088573e-07, + "loss": 0.2229, + "num_input_tokens_seen": 5796528128, + "step": 2764 + }, + { + "epoch": 0.09899647409818281, + "grad_norm": 34.26702880859375, + "learning_rate": 2.4534836848221355e-07, + "loss": 0.4959, + "num_input_tokens_seen": 5798625280, + "step": 2765 + }, + { + "epoch": 0.09926769731489016, + "grad_norm": 16.864425659179688, + "learning_rate": 2.450513846655817e-07, + "loss": 0.4531, + "num_input_tokens_seen": 5800722432, + "step": 2766 + }, + { + "epoch": 0.0995389205315975, + "grad_norm": 15.229866981506348, + "learning_rate": 2.4475464625999113e-07, + "loss": 0.4869, + "num_input_tokens_seen": 5802819584, + "step": 2767 + }, + { + "epoch": 0.09981014374830485, + "grad_norm": 13.635716438293457, + "learning_rate": 2.444581535042459e-07, + "loss": 0.3666, + "num_input_tokens_seen": 5804916736, + "step": 2768 + }, + { + "epoch": 0.1000813669650122, + "grad_norm": 24.428754806518555, + "learning_rate": 2.441619066369519e-07, + "loss": 0.9287, + "num_input_tokens_seen": 5807013888, + "step": 2769 + }, + { + "epoch": 0.10035259018171956, + "grad_norm": 19.71752166748047, + "learning_rate": 2.438659058965175e-07, + "loss": 0.5323, + "num_input_tokens_seen": 5809111040, + "step": 2770 + }, + { + "epoch": 0.10062381339842691, + "grad_norm": 17.558313369750977, + "learning_rate": 2.435701515211527e-07, + "loss": 0.3992, + "num_input_tokens_seen": 5811208192, + "step": 2771 + }, + { + "epoch": 0.10089503661513426, + "grad_norm": 15.69237995147705, + "learning_rate": 2.4327464374886955e-07, + "loss": 0.3217, + "num_input_tokens_seen": 5813305344, + "step": 2772 + }, + { + "epoch": 0.1011662598318416, + "grad_norm": 20.842071533203125, + "learning_rate": 2.4297938281748124e-07, + "loss": 0.459, + "num_input_tokens_seen": 5815402496, + "step": 2773 + }, + { + "epoch": 0.10143748304854895, + "grad_norm": 19.98575782775879, + "learning_rate": 2.4268436896460267e-07, + "loss": 0.5564, + "num_input_tokens_seen": 5817499648, + "step": 2774 + }, + { + "epoch": 0.10170870626525631, + "grad_norm": 18.332754135131836, + "learning_rate": 2.423896024276497e-07, + "loss": 0.6737, + "num_input_tokens_seen": 5819596800, + "step": 2775 + }, + { + "epoch": 0.10197992948196366, + "grad_norm": 22.827144622802734, + "learning_rate": 2.4209508344383926e-07, + "loss": 0.553, + "num_input_tokens_seen": 5821693952, + "step": 2776 + }, + { + "epoch": 0.10225115269867101, + "grad_norm": 21.232194900512695, + "learning_rate": 2.4180081225018906e-07, + "loss": 0.6379, + "num_input_tokens_seen": 5823791104, + "step": 2777 + }, + { + "epoch": 0.10252237591537836, + "grad_norm": 19.167123794555664, + "learning_rate": 2.4150678908351744e-07, + "loss": 0.7381, + "num_input_tokens_seen": 5825888256, + "step": 2778 + }, + { + "epoch": 0.1027935991320857, + "grad_norm": 15.559260368347168, + "learning_rate": 2.4121301418044264e-07, + "loss": 0.3415, + "num_input_tokens_seen": 5827985408, + "step": 2779 + }, + { + "epoch": 0.10306482234879305, + "grad_norm": 12.264792442321777, + "learning_rate": 2.409194877773839e-07, + "loss": 0.274, + "num_input_tokens_seen": 5830082560, + "step": 2780 + }, + { + "epoch": 0.10333604556550041, + "grad_norm": 17.90496826171875, + "learning_rate": 2.4062621011056006e-07, + "loss": 0.4599, + "num_input_tokens_seen": 5832179712, + "step": 2781 + }, + { + "epoch": 0.10360726878220776, + "grad_norm": 12.02820110321045, + "learning_rate": 2.4033318141598977e-07, + "loss": 0.3293, + "num_input_tokens_seen": 5834276864, + "step": 2782 + }, + { + "epoch": 0.1038784919989151, + "grad_norm": 15.775694847106934, + "learning_rate": 2.4004040192949155e-07, + "loss": 0.4602, + "num_input_tokens_seen": 5836374016, + "step": 2783 + }, + { + "epoch": 0.10414971521562245, + "grad_norm": 18.71192741394043, + "learning_rate": 2.3974787188668284e-07, + "loss": 0.3823, + "num_input_tokens_seen": 5838471168, + "step": 2784 + }, + { + "epoch": 0.1044209384323298, + "grad_norm": 20.695791244506836, + "learning_rate": 2.3945559152298085e-07, + "loss": 0.5613, + "num_input_tokens_seen": 5840568320, + "step": 2785 + }, + { + "epoch": 0.10469216164903716, + "grad_norm": 14.568918228149414, + "learning_rate": 2.3916356107360174e-07, + "loss": 0.3668, + "num_input_tokens_seen": 5842665472, + "step": 2786 + }, + { + "epoch": 0.10496338486574451, + "grad_norm": 18.7004337310791, + "learning_rate": 2.388717807735605e-07, + "loss": 0.518, + "num_input_tokens_seen": 5844762624, + "step": 2787 + }, + { + "epoch": 0.10523460808245186, + "grad_norm": 11.687546730041504, + "learning_rate": 2.385802508576706e-07, + "loss": 0.2456, + "num_input_tokens_seen": 5846859776, + "step": 2788 + }, + { + "epoch": 0.1055058312991592, + "grad_norm": 14.417143821716309, + "learning_rate": 2.382889715605447e-07, + "loss": 0.2584, + "num_input_tokens_seen": 5848956928, + "step": 2789 + }, + { + "epoch": 0.10577705451586655, + "grad_norm": 14.506068229675293, + "learning_rate": 2.3799794311659286e-07, + "loss": 0.2735, + "num_input_tokens_seen": 5851054080, + "step": 2790 + }, + { + "epoch": 0.10604827773257391, + "grad_norm": 30.506330490112305, + "learning_rate": 2.3770716576002383e-07, + "loss": 0.5316, + "num_input_tokens_seen": 5853151232, + "step": 2791 + }, + { + "epoch": 0.10631950094928126, + "grad_norm": 18.837438583374023, + "learning_rate": 2.3741663972484416e-07, + "loss": 0.5209, + "num_input_tokens_seen": 5855248384, + "step": 2792 + }, + { + "epoch": 0.10659072416598861, + "grad_norm": 13.7747163772583, + "learning_rate": 2.3712636524485817e-07, + "loss": 0.2876, + "num_input_tokens_seen": 5857345536, + "step": 2793 + }, + { + "epoch": 0.10686194738269596, + "grad_norm": 13.69284439086914, + "learning_rate": 2.3683634255366773e-07, + "loss": 0.2466, + "num_input_tokens_seen": 5859442688, + "step": 2794 + }, + { + "epoch": 0.1071331705994033, + "grad_norm": 23.49444007873535, + "learning_rate": 2.3654657188467203e-07, + "loss": 0.5751, + "num_input_tokens_seen": 5861539840, + "step": 2795 + }, + { + "epoch": 0.10740439381611067, + "grad_norm": 32.023529052734375, + "learning_rate": 2.3625705347106748e-07, + "loss": 0.4901, + "num_input_tokens_seen": 5863636992, + "step": 2796 + }, + { + "epoch": 0.10767561703281801, + "grad_norm": 18.634899139404297, + "learning_rate": 2.3596778754584752e-07, + "loss": 0.4605, + "num_input_tokens_seen": 5865734144, + "step": 2797 + }, + { + "epoch": 0.10794684024952536, + "grad_norm": 17.947357177734375, + "learning_rate": 2.356787743418023e-07, + "loss": 0.4397, + "num_input_tokens_seen": 5867831296, + "step": 2798 + }, + { + "epoch": 0.10821806346623271, + "grad_norm": 10.587057113647461, + "learning_rate": 2.3539001409151867e-07, + "loss": 0.2498, + "num_input_tokens_seen": 5869928448, + "step": 2799 + }, + { + "epoch": 0.10848928668294006, + "grad_norm": 15.883231163024902, + "learning_rate": 2.351015070273799e-07, + "loss": 0.4346, + "num_input_tokens_seen": 5872025600, + "step": 2800 + }, + { + "epoch": 0.1087605098996474, + "grad_norm": 22.052234649658203, + "learning_rate": 2.3481325338156568e-07, + "loss": 0.7427, + "num_input_tokens_seen": 5874122752, + "step": 2801 + }, + { + "epoch": 0.10903173311635476, + "grad_norm": 15.02530288696289, + "learning_rate": 2.3452525338605126e-07, + "loss": 0.2588, + "num_input_tokens_seen": 5876219904, + "step": 2802 + }, + { + "epoch": 0.10930295633306211, + "grad_norm": 15.054925918579102, + "learning_rate": 2.3423750727260813e-07, + "loss": 0.2948, + "num_input_tokens_seen": 5878317056, + "step": 2803 + }, + { + "epoch": 0.10957417954976946, + "grad_norm": 12.671406745910645, + "learning_rate": 2.3395001527280343e-07, + "loss": 0.2361, + "num_input_tokens_seen": 5880414208, + "step": 2804 + }, + { + "epoch": 0.10984540276647681, + "grad_norm": 20.93232536315918, + "learning_rate": 2.3366277761799993e-07, + "loss": 0.663, + "num_input_tokens_seen": 5882511360, + "step": 2805 + }, + { + "epoch": 0.11011662598318415, + "grad_norm": 18.05968475341797, + "learning_rate": 2.333757945393557e-07, + "loss": 0.5558, + "num_input_tokens_seen": 5884608512, + "step": 2806 + }, + { + "epoch": 0.11038784919989152, + "grad_norm": 15.030449867248535, + "learning_rate": 2.3308906626782333e-07, + "loss": 0.4106, + "num_input_tokens_seen": 5886705664, + "step": 2807 + }, + { + "epoch": 0.11065907241659886, + "grad_norm": 20.000356674194336, + "learning_rate": 2.3280259303415115e-07, + "loss": 0.5442, + "num_input_tokens_seen": 5888802816, + "step": 2808 + }, + { + "epoch": 0.11093029563330621, + "grad_norm": 10.596423149108887, + "learning_rate": 2.3251637506888178e-07, + "loss": 0.1988, + "num_input_tokens_seen": 5890899968, + "step": 2809 + }, + { + "epoch": 0.11120151885001356, + "grad_norm": 13.657599449157715, + "learning_rate": 2.3223041260235265e-07, + "loss": 0.3359, + "num_input_tokens_seen": 5892997120, + "step": 2810 + }, + { + "epoch": 0.1114727420667209, + "grad_norm": 12.095483779907227, + "learning_rate": 2.3194470586469543e-07, + "loss": 0.218, + "num_input_tokens_seen": 5895094272, + "step": 2811 + }, + { + "epoch": 0.11174396528342827, + "grad_norm": 16.444936752319336, + "learning_rate": 2.3165925508583598e-07, + "loss": 0.3819, + "num_input_tokens_seen": 5897191424, + "step": 2812 + }, + { + "epoch": 0.11201518850013562, + "grad_norm": 20.455156326293945, + "learning_rate": 2.313740604954943e-07, + "loss": 0.5105, + "num_input_tokens_seen": 5899288576, + "step": 2813 + }, + { + "epoch": 0.11228641171684296, + "grad_norm": 15.869861602783203, + "learning_rate": 2.3108912232318413e-07, + "loss": 0.3829, + "num_input_tokens_seen": 5901385728, + "step": 2814 + }, + { + "epoch": 0.11255763493355031, + "grad_norm": 13.574760437011719, + "learning_rate": 2.3080444079821284e-07, + "loss": 0.4095, + "num_input_tokens_seen": 5903482880, + "step": 2815 + }, + { + "epoch": 0.11282885815025766, + "grad_norm": 27.635112762451172, + "learning_rate": 2.305200161496813e-07, + "loss": 0.8766, + "num_input_tokens_seen": 5905580032, + "step": 2816 + }, + { + "epoch": 0.11310008136696502, + "grad_norm": 20.717670440673828, + "learning_rate": 2.3023584860648364e-07, + "loss": 0.6796, + "num_input_tokens_seen": 5907677184, + "step": 2817 + }, + { + "epoch": 0.11337130458367237, + "grad_norm": 17.95741081237793, + "learning_rate": 2.2995193839730727e-07, + "loss": 0.469, + "num_input_tokens_seen": 5909774336, + "step": 2818 + }, + { + "epoch": 0.11364252780037971, + "grad_norm": 15.287071228027344, + "learning_rate": 2.2966828575063196e-07, + "loss": 0.3544, + "num_input_tokens_seen": 5911871488, + "step": 2819 + }, + { + "epoch": 0.11391375101708706, + "grad_norm": 17.377723693847656, + "learning_rate": 2.2938489089473065e-07, + "loss": 0.4254, + "num_input_tokens_seen": 5913968640, + "step": 2820 + }, + { + "epoch": 0.11418497423379441, + "grad_norm": 21.972118377685547, + "learning_rate": 2.2910175405766896e-07, + "loss": 0.7153, + "num_input_tokens_seen": 5916065792, + "step": 2821 + }, + { + "epoch": 0.11445619745050176, + "grad_norm": 21.45004653930664, + "learning_rate": 2.288188754673045e-07, + "loss": 0.4351, + "num_input_tokens_seen": 5918162944, + "step": 2822 + }, + { + "epoch": 0.11472742066720912, + "grad_norm": 20.235944747924805, + "learning_rate": 2.2853625535128735e-07, + "loss": 0.6523, + "num_input_tokens_seen": 5920260096, + "step": 2823 + }, + { + "epoch": 0.11499864388391647, + "grad_norm": 12.907526016235352, + "learning_rate": 2.2825389393705922e-07, + "loss": 0.2849, + "num_input_tokens_seen": 5922357248, + "step": 2824 + }, + { + "epoch": 0.11526986710062381, + "grad_norm": 19.846851348876953, + "learning_rate": 2.2797179145185384e-07, + "loss": 0.5741, + "num_input_tokens_seen": 5924454400, + "step": 2825 + }, + { + "epoch": 0.11554109031733116, + "grad_norm": 14.583490371704102, + "learning_rate": 2.2768994812269666e-07, + "loss": 0.2806, + "num_input_tokens_seen": 5926551552, + "step": 2826 + }, + { + "epoch": 0.11581231353403851, + "grad_norm": 16.440160751342773, + "learning_rate": 2.2740836417640432e-07, + "loss": 0.3765, + "num_input_tokens_seen": 5928648704, + "step": 2827 + }, + { + "epoch": 0.11608353675074587, + "grad_norm": 22.054208755493164, + "learning_rate": 2.271270398395851e-07, + "loss": 0.6938, + "num_input_tokens_seen": 5930745856, + "step": 2828 + }, + { + "epoch": 0.11635475996745322, + "grad_norm": 13.32007122039795, + "learning_rate": 2.2684597533863793e-07, + "loss": 0.406, + "num_input_tokens_seen": 5932843008, + "step": 2829 + }, + { + "epoch": 0.11662598318416056, + "grad_norm": 13.135971069335938, + "learning_rate": 2.2656517089975286e-07, + "loss": 0.2882, + "num_input_tokens_seen": 5934940160, + "step": 2830 + }, + { + "epoch": 0.11689720640086791, + "grad_norm": 23.185508728027344, + "learning_rate": 2.2628462674891062e-07, + "loss": 0.4794, + "num_input_tokens_seen": 5937037312, + "step": 2831 + }, + { + "epoch": 0.11716842961757526, + "grad_norm": 16.763158798217773, + "learning_rate": 2.2600434311188253e-07, + "loss": 0.4259, + "num_input_tokens_seen": 5939134464, + "step": 2832 + }, + { + "epoch": 0.11743965283428262, + "grad_norm": 28.8845272064209, + "learning_rate": 2.2572432021423018e-07, + "loss": 0.5011, + "num_input_tokens_seen": 5941231616, + "step": 2833 + }, + { + "epoch": 0.11771087605098997, + "grad_norm": 21.761306762695312, + "learning_rate": 2.2544455828130528e-07, + "loss": 0.6964, + "num_input_tokens_seen": 5943328768, + "step": 2834 + }, + { + "epoch": 0.11798209926769732, + "grad_norm": 18.940256118774414, + "learning_rate": 2.251650575382498e-07, + "loss": 0.5045, + "num_input_tokens_seen": 5945425920, + "step": 2835 + }, + { + "epoch": 0.11825332248440466, + "grad_norm": 16.255231857299805, + "learning_rate": 2.2488581820999481e-07, + "loss": 0.4833, + "num_input_tokens_seen": 5947523072, + "step": 2836 + }, + { + "epoch": 0.11852454570111201, + "grad_norm": 19.86233139038086, + "learning_rate": 2.2460684052126195e-07, + "loss": 0.678, + "num_input_tokens_seen": 5949620224, + "step": 2837 + }, + { + "epoch": 0.11879576891781937, + "grad_norm": 17.203176498413086, + "learning_rate": 2.243281246965617e-07, + "loss": 0.5081, + "num_input_tokens_seen": 5951717376, + "step": 2838 + }, + { + "epoch": 0.11906699213452672, + "grad_norm": 11.493597984313965, + "learning_rate": 2.24049670960194e-07, + "loss": 0.2694, + "num_input_tokens_seen": 5953814528, + "step": 2839 + }, + { + "epoch": 0.11933821535123407, + "grad_norm": 15.567400932312012, + "learning_rate": 2.2377147953624776e-07, + "loss": 0.3385, + "num_input_tokens_seen": 5955911680, + "step": 2840 + }, + { + "epoch": 0.11960943856794141, + "grad_norm": 20.829069137573242, + "learning_rate": 2.2349355064860104e-07, + "loss": 0.5221, + "num_input_tokens_seen": 5958008832, + "step": 2841 + }, + { + "epoch": 0.11988066178464876, + "grad_norm": 25.062725067138672, + "learning_rate": 2.2321588452092006e-07, + "loss": 0.6818, + "num_input_tokens_seen": 5960105984, + "step": 2842 + }, + { + "epoch": 0.12015188500135611, + "grad_norm": 12.187406539916992, + "learning_rate": 2.229384813766602e-07, + "loss": 0.2983, + "num_input_tokens_seen": 5962203136, + "step": 2843 + }, + { + "epoch": 0.12042310821806347, + "grad_norm": 12.41606616973877, + "learning_rate": 2.2266134143906496e-07, + "loss": 0.3099, + "num_input_tokens_seen": 5964300288, + "step": 2844 + }, + { + "epoch": 0.12069433143477082, + "grad_norm": 15.00600528717041, + "learning_rate": 2.2238446493116572e-07, + "loss": 0.3705, + "num_input_tokens_seen": 5966397440, + "step": 2845 + }, + { + "epoch": 0.12096555465147817, + "grad_norm": 19.592304229736328, + "learning_rate": 2.2210785207578275e-07, + "loss": 0.5625, + "num_input_tokens_seen": 5968494592, + "step": 2846 + }, + { + "epoch": 0.12123677786818551, + "grad_norm": 11.538047790527344, + "learning_rate": 2.2183150309552308e-07, + "loss": 0.2395, + "num_input_tokens_seen": 5970591744, + "step": 2847 + }, + { + "epoch": 0.12150800108489286, + "grad_norm": 17.946090698242188, + "learning_rate": 2.2155541821278196e-07, + "loss": 0.4497, + "num_input_tokens_seen": 5972688896, + "step": 2848 + }, + { + "epoch": 0.12177922430160022, + "grad_norm": 13.891773223876953, + "learning_rate": 2.2127959764974203e-07, + "loss": 0.2957, + "num_input_tokens_seen": 5974786048, + "step": 2849 + }, + { + "epoch": 0.12205044751830757, + "grad_norm": 17.93677520751953, + "learning_rate": 2.2100404162837317e-07, + "loss": 0.4703, + "num_input_tokens_seen": 5976883200, + "step": 2850 + }, + { + "epoch": 0.12232167073501492, + "grad_norm": 21.524486541748047, + "learning_rate": 2.2072875037043232e-07, + "loss": 0.4836, + "num_input_tokens_seen": 5978980352, + "step": 2851 + }, + { + "epoch": 0.12259289395172226, + "grad_norm": 12.988191604614258, + "learning_rate": 2.204537240974636e-07, + "loss": 0.3129, + "num_input_tokens_seen": 5981077504, + "step": 2852 + }, + { + "epoch": 0.12286411716842961, + "grad_norm": 20.595354080200195, + "learning_rate": 2.2017896303079743e-07, + "loss": 0.5311, + "num_input_tokens_seen": 5983174656, + "step": 2853 + }, + { + "epoch": 0.12313534038513697, + "grad_norm": 22.666582107543945, + "learning_rate": 2.1990446739155128e-07, + "loss": 0.4373, + "num_input_tokens_seen": 5985271808, + "step": 2854 + }, + { + "epoch": 0.12340656360184432, + "grad_norm": 11.09825611114502, + "learning_rate": 2.1963023740062864e-07, + "loss": 0.2112, + "num_input_tokens_seen": 5987368960, + "step": 2855 + }, + { + "epoch": 0.12367778681855167, + "grad_norm": 11.390056610107422, + "learning_rate": 2.1935627327871948e-07, + "loss": 0.2525, + "num_input_tokens_seen": 5989466112, + "step": 2856 + }, + { + "epoch": 0.12394901003525902, + "grad_norm": 12.461858749389648, + "learning_rate": 2.1908257524629963e-07, + "loss": 0.4179, + "num_input_tokens_seen": 5991563264, + "step": 2857 + }, + { + "epoch": 0.12422023325196636, + "grad_norm": 17.467121124267578, + "learning_rate": 2.1880914352363106e-07, + "loss": 0.5047, + "num_input_tokens_seen": 5993660416, + "step": 2858 + }, + { + "epoch": 0.12449145646867373, + "grad_norm": 18.427640914916992, + "learning_rate": 2.1853597833076088e-07, + "loss": 0.5304, + "num_input_tokens_seen": 5995757568, + "step": 2859 + }, + { + "epoch": 0.12476267968538107, + "grad_norm": 13.977510452270508, + "learning_rate": 2.1826307988752212e-07, + "loss": 0.4055, + "num_input_tokens_seen": 5997854720, + "step": 2860 + }, + { + "epoch": 0.1250339029020884, + "grad_norm": 15.963011741638184, + "learning_rate": 2.179904484135331e-07, + "loss": 0.3931, + "num_input_tokens_seen": 5999951872, + "step": 2861 + }, + { + "epoch": 0.12530512611879577, + "grad_norm": 17.05299186706543, + "learning_rate": 2.177180841281974e-07, + "loss": 0.4642, + "num_input_tokens_seen": 6002049024, + "step": 2862 + }, + { + "epoch": 0.12557634933550313, + "grad_norm": 15.496081352233887, + "learning_rate": 2.1744598725070347e-07, + "loss": 0.425, + "num_input_tokens_seen": 6004146176, + "step": 2863 + }, + { + "epoch": 0.12584757255221046, + "grad_norm": 18.021099090576172, + "learning_rate": 2.1717415800002425e-07, + "loss": 0.5045, + "num_input_tokens_seen": 6006243328, + "step": 2864 + }, + { + "epoch": 0.12611879576891782, + "grad_norm": 16.96639633178711, + "learning_rate": 2.1690259659491768e-07, + "loss": 0.5096, + "num_input_tokens_seen": 6008340480, + "step": 2865 + }, + { + "epoch": 0.12639001898562516, + "grad_norm": 14.744760513305664, + "learning_rate": 2.166313032539261e-07, + "loss": 0.3753, + "num_input_tokens_seen": 6010437632, + "step": 2866 + }, + { + "epoch": 0.12666124220233252, + "grad_norm": 16.412687301635742, + "learning_rate": 2.1636027819537605e-07, + "loss": 0.4849, + "num_input_tokens_seen": 6012534784, + "step": 2867 + }, + { + "epoch": 0.12693246541903988, + "grad_norm": 14.01508617401123, + "learning_rate": 2.160895216373781e-07, + "loss": 0.2672, + "num_input_tokens_seen": 6014631936, + "step": 2868 + }, + { + "epoch": 0.12720368863574721, + "grad_norm": 9.786606788635254, + "learning_rate": 2.15819033797827e-07, + "loss": 0.2023, + "num_input_tokens_seen": 6016729088, + "step": 2869 + }, + { + "epoch": 0.12747491185245458, + "grad_norm": 15.194939613342285, + "learning_rate": 2.1554881489440092e-07, + "loss": 0.4128, + "num_input_tokens_seen": 6018826240, + "step": 2870 + }, + { + "epoch": 0.1277461350691619, + "grad_norm": 24.29368019104004, + "learning_rate": 2.1527886514456178e-07, + "loss": 0.5414, + "num_input_tokens_seen": 6020923392, + "step": 2871 + }, + { + "epoch": 0.12801735828586927, + "grad_norm": 14.61825180053711, + "learning_rate": 2.150091847655549e-07, + "loss": 0.3288, + "num_input_tokens_seen": 6023020544, + "step": 2872 + }, + { + "epoch": 0.12828858150257663, + "grad_norm": 10.37586784362793, + "learning_rate": 2.1473977397440878e-07, + "loss": 0.1956, + "num_input_tokens_seen": 6025117696, + "step": 2873 + }, + { + "epoch": 0.12855980471928397, + "grad_norm": 17.528772354125977, + "learning_rate": 2.14470632987935e-07, + "loss": 0.5343, + "num_input_tokens_seen": 6027214848, + "step": 2874 + }, + { + "epoch": 0.12883102793599133, + "grad_norm": 13.622241020202637, + "learning_rate": 2.1420176202272805e-07, + "loss": 0.193, + "num_input_tokens_seen": 6029312000, + "step": 2875 + }, + { + "epoch": 0.12910225115269866, + "grad_norm": 17.937719345092773, + "learning_rate": 2.1393316129516482e-07, + "loss": 0.448, + "num_input_tokens_seen": 6031409152, + "step": 2876 + }, + { + "epoch": 0.12937347436940602, + "grad_norm": 19.698455810546875, + "learning_rate": 2.1366483102140497e-07, + "loss": 0.3603, + "num_input_tokens_seen": 6033506304, + "step": 2877 + }, + { + "epoch": 0.12964469758611338, + "grad_norm": 15.03511905670166, + "learning_rate": 2.1339677141739074e-07, + "loss": 0.3772, + "num_input_tokens_seen": 6035603456, + "step": 2878 + }, + { + "epoch": 0.12991592080282072, + "grad_norm": 17.960357666015625, + "learning_rate": 2.1312898269884616e-07, + "loss": 0.5342, + "num_input_tokens_seen": 6037700608, + "step": 2879 + }, + { + "epoch": 0.13018714401952808, + "grad_norm": 12.057799339294434, + "learning_rate": 2.128614650812775e-07, + "loss": 0.2405, + "num_input_tokens_seen": 6039797760, + "step": 2880 + }, + { + "epoch": 0.1304583672362354, + "grad_norm": 27.220239639282227, + "learning_rate": 2.125942187799726e-07, + "loss": 0.6078, + "num_input_tokens_seen": 6041894912, + "step": 2881 + }, + { + "epoch": 0.13072959045294277, + "grad_norm": 27.639814376831055, + "learning_rate": 2.123272440100012e-07, + "loss": 0.7583, + "num_input_tokens_seen": 6043992064, + "step": 2882 + }, + { + "epoch": 0.13100081366965013, + "grad_norm": 22.158348083496094, + "learning_rate": 2.1206054098621434e-07, + "loss": 0.6885, + "num_input_tokens_seen": 6046089216, + "step": 2883 + }, + { + "epoch": 0.13127203688635747, + "grad_norm": 26.35175895690918, + "learning_rate": 2.117941099232446e-07, + "loss": 0.7587, + "num_input_tokens_seen": 6048186368, + "step": 2884 + }, + { + "epoch": 0.13154326010306483, + "grad_norm": 14.008932113647461, + "learning_rate": 2.1152795103550547e-07, + "loss": 0.2805, + "num_input_tokens_seen": 6050283520, + "step": 2885 + }, + { + "epoch": 0.13181448331977216, + "grad_norm": 21.59432029724121, + "learning_rate": 2.1126206453719157e-07, + "loss": 0.6467, + "num_input_tokens_seen": 6052380672, + "step": 2886 + }, + { + "epoch": 0.13208570653647952, + "grad_norm": 16.01569938659668, + "learning_rate": 2.1099645064227817e-07, + "loss": 0.3664, + "num_input_tokens_seen": 6054477824, + "step": 2887 + }, + { + "epoch": 0.13235692975318689, + "grad_norm": 15.261370658874512, + "learning_rate": 2.1073110956452126e-07, + "loss": 0.2808, + "num_input_tokens_seen": 6056574976, + "step": 2888 + }, + { + "epoch": 0.13262815296989422, + "grad_norm": 14.421086311340332, + "learning_rate": 2.1046604151745723e-07, + "loss": 0.3063, + "num_input_tokens_seen": 6058672128, + "step": 2889 + }, + { + "epoch": 0.13289937618660158, + "grad_norm": 15.327587127685547, + "learning_rate": 2.1020124671440274e-07, + "loss": 0.3161, + "num_input_tokens_seen": 6060769280, + "step": 2890 + }, + { + "epoch": 0.13317059940330891, + "grad_norm": 18.59784507751465, + "learning_rate": 2.0993672536845458e-07, + "loss": 0.3972, + "num_input_tokens_seen": 6062866432, + "step": 2891 + }, + { + "epoch": 0.13344182262001628, + "grad_norm": 16.773000717163086, + "learning_rate": 2.0967247769248968e-07, + "loss": 0.4275, + "num_input_tokens_seen": 6064963584, + "step": 2892 + }, + { + "epoch": 0.13371304583672364, + "grad_norm": 14.18702220916748, + "learning_rate": 2.0940850389916398e-07, + "loss": 0.2471, + "num_input_tokens_seen": 6067060736, + "step": 2893 + }, + { + "epoch": 0.13398426905343097, + "grad_norm": 11.325303077697754, + "learning_rate": 2.09144804200914e-07, + "loss": 0.176, + "num_input_tokens_seen": 6069157888, + "step": 2894 + }, + { + "epoch": 0.13425549227013833, + "grad_norm": 13.870182037353516, + "learning_rate": 2.0888137880995514e-07, + "loss": 0.2562, + "num_input_tokens_seen": 6071255040, + "step": 2895 + }, + { + "epoch": 0.13452671548684567, + "grad_norm": 21.369096755981445, + "learning_rate": 2.086182279382821e-07, + "loss": 0.5646, + "num_input_tokens_seen": 6073352192, + "step": 2896 + }, + { + "epoch": 0.13479793870355303, + "grad_norm": 17.719070434570312, + "learning_rate": 2.083553517976686e-07, + "loss": 0.4004, + "num_input_tokens_seen": 6075449344, + "step": 2897 + }, + { + "epoch": 0.13506916192026036, + "grad_norm": 16.208087921142578, + "learning_rate": 2.0809275059966764e-07, + "loss": 0.4522, + "num_input_tokens_seen": 6077546496, + "step": 2898 + }, + { + "epoch": 0.13534038513696772, + "grad_norm": 11.811439514160156, + "learning_rate": 2.0783042455561023e-07, + "loss": 0.2981, + "num_input_tokens_seen": 6079643648, + "step": 2899 + }, + { + "epoch": 0.13561160835367508, + "grad_norm": 14.665522575378418, + "learning_rate": 2.0756837387660676e-07, + "loss": 0.3378, + "num_input_tokens_seen": 6081740800, + "step": 2900 + }, + { + "epoch": 0.13588283157038242, + "grad_norm": 18.123260498046875, + "learning_rate": 2.073065987735454e-07, + "loss": 0.3911, + "num_input_tokens_seen": 6083837952, + "step": 2901 + }, + { + "epoch": 0.13615405478708978, + "grad_norm": 15.937180519104004, + "learning_rate": 2.070450994570928e-07, + "loss": 0.4768, + "num_input_tokens_seen": 6085935104, + "step": 2902 + }, + { + "epoch": 0.1364252780037971, + "grad_norm": 18.534635543823242, + "learning_rate": 2.0678387613769397e-07, + "loss": 0.4538, + "num_input_tokens_seen": 6088032256, + "step": 2903 + }, + { + "epoch": 0.13669650122050447, + "grad_norm": 19.72084617614746, + "learning_rate": 2.0652292902557117e-07, + "loss": 0.5692, + "num_input_tokens_seen": 6090129408, + "step": 2904 + }, + { + "epoch": 0.13696772443721184, + "grad_norm": 23.22869110107422, + "learning_rate": 2.0626225833072487e-07, + "loss": 0.7825, + "num_input_tokens_seen": 6092226560, + "step": 2905 + }, + { + "epoch": 0.13723894765391917, + "grad_norm": 16.263364791870117, + "learning_rate": 2.0600186426293282e-07, + "loss": 0.3674, + "num_input_tokens_seen": 6094323712, + "step": 2906 + }, + { + "epoch": 0.13751017087062653, + "grad_norm": 15.390429496765137, + "learning_rate": 2.0574174703175034e-07, + "loss": 0.2301, + "num_input_tokens_seen": 6096420864, + "step": 2907 + }, + { + "epoch": 0.13778139408733386, + "grad_norm": 19.23081398010254, + "learning_rate": 2.0548190684650981e-07, + "loss": 0.4732, + "num_input_tokens_seen": 6098518016, + "step": 2908 + }, + { + "epoch": 0.13805261730404123, + "grad_norm": 16.774734497070312, + "learning_rate": 2.052223439163208e-07, + "loss": 0.5119, + "num_input_tokens_seen": 6100615168, + "step": 2909 + }, + { + "epoch": 0.1383238405207486, + "grad_norm": 11.67673110961914, + "learning_rate": 2.049630584500696e-07, + "loss": 0.2929, + "num_input_tokens_seen": 6102712320, + "step": 2910 + }, + { + "epoch": 0.13859506373745592, + "grad_norm": 22.138757705688477, + "learning_rate": 2.0470405065641938e-07, + "loss": 0.5621, + "num_input_tokens_seen": 6104809472, + "step": 2911 + }, + { + "epoch": 0.13886628695416328, + "grad_norm": 18.90143585205078, + "learning_rate": 2.0444532074380973e-07, + "loss": 0.5335, + "num_input_tokens_seen": 6106906624, + "step": 2912 + }, + { + "epoch": 0.13913751017087062, + "grad_norm": 13.576014518737793, + "learning_rate": 2.0418686892045654e-07, + "loss": 0.237, + "num_input_tokens_seen": 6109003776, + "step": 2913 + }, + { + "epoch": 0.13940873338757798, + "grad_norm": 16.365047454833984, + "learning_rate": 2.039286953943521e-07, + "loss": 0.4888, + "num_input_tokens_seen": 6111100928, + "step": 2914 + }, + { + "epoch": 0.13967995660428534, + "grad_norm": 15.33917236328125, + "learning_rate": 2.0367080037326472e-07, + "loss": 0.3311, + "num_input_tokens_seen": 6113198080, + "step": 2915 + }, + { + "epoch": 0.13995117982099267, + "grad_norm": 14.788158416748047, + "learning_rate": 2.0341318406473833e-07, + "loss": 0.3541, + "num_input_tokens_seen": 6115295232, + "step": 2916 + }, + { + "epoch": 0.14022240303770003, + "grad_norm": 18.039945602416992, + "learning_rate": 2.031558466760927e-07, + "loss": 0.4563, + "num_input_tokens_seen": 6117392384, + "step": 2917 + }, + { + "epoch": 0.14049362625440737, + "grad_norm": 20.71190071105957, + "learning_rate": 2.0289878841442325e-07, + "loss": 0.5077, + "num_input_tokens_seen": 6119489536, + "step": 2918 + }, + { + "epoch": 0.14076484947111473, + "grad_norm": 19.41553497314453, + "learning_rate": 2.0264200948660076e-07, + "loss": 0.4001, + "num_input_tokens_seen": 6121586688, + "step": 2919 + }, + { + "epoch": 0.1410360726878221, + "grad_norm": 17.448558807373047, + "learning_rate": 2.0238551009927125e-07, + "loss": 0.4873, + "num_input_tokens_seen": 6123683840, + "step": 2920 + }, + { + "epoch": 0.14130729590452942, + "grad_norm": 14.663477897644043, + "learning_rate": 2.021292904588553e-07, + "loss": 0.3583, + "num_input_tokens_seen": 6125780992, + "step": 2921 + }, + { + "epoch": 0.14157851912123678, + "grad_norm": 19.37281608581543, + "learning_rate": 2.0187335077154905e-07, + "loss": 0.4864, + "num_input_tokens_seen": 6127878144, + "step": 2922 + }, + { + "epoch": 0.14184974233794412, + "grad_norm": 16.566606521606445, + "learning_rate": 2.0161769124332278e-07, + "loss": 0.5179, + "num_input_tokens_seen": 6129975296, + "step": 2923 + }, + { + "epoch": 0.14212096555465148, + "grad_norm": 14.152243614196777, + "learning_rate": 2.013623120799217e-07, + "loss": 0.3427, + "num_input_tokens_seen": 6132072448, + "step": 2924 + }, + { + "epoch": 0.14239218877135884, + "grad_norm": 17.58519744873047, + "learning_rate": 2.0110721348686523e-07, + "loss": 0.4143, + "num_input_tokens_seen": 6134169600, + "step": 2925 + }, + { + "epoch": 0.14266341198806617, + "grad_norm": 11.517704010009766, + "learning_rate": 2.0085239566944702e-07, + "loss": 0.1861, + "num_input_tokens_seen": 6136266752, + "step": 2926 + }, + { + "epoch": 0.14293463520477354, + "grad_norm": 19.695545196533203, + "learning_rate": 2.0059785883273463e-07, + "loss": 0.4736, + "num_input_tokens_seen": 6138363904, + "step": 2927 + }, + { + "epoch": 0.14320585842148087, + "grad_norm": 17.48176383972168, + "learning_rate": 2.003436031815698e-07, + "loss": 0.4326, + "num_input_tokens_seen": 6140461056, + "step": 2928 + }, + { + "epoch": 0.14347708163818823, + "grad_norm": 15.76534652709961, + "learning_rate": 2.0008962892056762e-07, + "loss": 0.3545, + "num_input_tokens_seen": 6142558208, + "step": 2929 + }, + { + "epoch": 0.1437483048548956, + "grad_norm": 16.841144561767578, + "learning_rate": 1.998359362541171e-07, + "loss": 0.4996, + "num_input_tokens_seen": 6144655360, + "step": 2930 + }, + { + "epoch": 0.14401952807160293, + "grad_norm": 20.11952781677246, + "learning_rate": 1.9958252538638032e-07, + "loss": 0.4388, + "num_input_tokens_seen": 6146752512, + "step": 2931 + }, + { + "epoch": 0.1442907512883103, + "grad_norm": 14.645740509033203, + "learning_rate": 1.9932939652129283e-07, + "loss": 0.2301, + "num_input_tokens_seen": 6148849664, + "step": 2932 + }, + { + "epoch": 0.14456197450501762, + "grad_norm": 20.628215789794922, + "learning_rate": 1.990765498625629e-07, + "loss": 0.651, + "num_input_tokens_seen": 6150946816, + "step": 2933 + }, + { + "epoch": 0.14483319772172498, + "grad_norm": 13.291131019592285, + "learning_rate": 1.988239856136719e-07, + "loss": 0.3269, + "num_input_tokens_seen": 6153043968, + "step": 2934 + }, + { + "epoch": 0.14510442093843234, + "grad_norm": 16.319238662719727, + "learning_rate": 1.9857170397787415e-07, + "loss": 0.4404, + "num_input_tokens_seen": 6155141120, + "step": 2935 + }, + { + "epoch": 0.14537564415513968, + "grad_norm": 14.976720809936523, + "learning_rate": 1.9831970515819625e-07, + "loss": 0.3358, + "num_input_tokens_seen": 6157238272, + "step": 2936 + }, + { + "epoch": 0.14564686737184704, + "grad_norm": 10.872328758239746, + "learning_rate": 1.980679893574373e-07, + "loss": 0.2542, + "num_input_tokens_seen": 6159335424, + "step": 2937 + }, + { + "epoch": 0.14591809058855437, + "grad_norm": 19.26199722290039, + "learning_rate": 1.9781655677816838e-07, + "loss": 0.4478, + "num_input_tokens_seen": 6161432576, + "step": 2938 + }, + { + "epoch": 0.14618931380526173, + "grad_norm": 12.88478946685791, + "learning_rate": 1.97565407622733e-07, + "loss": 0.3474, + "num_input_tokens_seen": 6163529728, + "step": 2939 + }, + { + "epoch": 0.14646053702196907, + "grad_norm": 21.489870071411133, + "learning_rate": 1.9731454209324644e-07, + "loss": 0.6245, + "num_input_tokens_seen": 6165626880, + "step": 2940 + }, + { + "epoch": 0.14673176023867643, + "grad_norm": 18.655330657958984, + "learning_rate": 1.9706396039159568e-07, + "loss": 0.5171, + "num_input_tokens_seen": 6167724032, + "step": 2941 + }, + { + "epoch": 0.1470029834553838, + "grad_norm": 15.826581001281738, + "learning_rate": 1.9681366271943945e-07, + "loss": 0.309, + "num_input_tokens_seen": 6169821184, + "step": 2942 + }, + { + "epoch": 0.14727420667209112, + "grad_norm": 16.819740295410156, + "learning_rate": 1.965636492782077e-07, + "loss": 0.2265, + "num_input_tokens_seen": 6171918336, + "step": 2943 + }, + { + "epoch": 0.14754542988879848, + "grad_norm": 27.977834701538086, + "learning_rate": 1.963139202691017e-07, + "loss": 0.8497, + "num_input_tokens_seen": 6174015488, + "step": 2944 + }, + { + "epoch": 0.14781665310550582, + "grad_norm": 29.797565460205078, + "learning_rate": 1.9606447589309397e-07, + "loss": 0.7042, + "num_input_tokens_seen": 6176112640, + "step": 2945 + }, + { + "epoch": 0.14808787632221318, + "grad_norm": 14.23031234741211, + "learning_rate": 1.9581531635092773e-07, + "loss": 0.2392, + "num_input_tokens_seen": 6178209792, + "step": 2946 + }, + { + "epoch": 0.14835909953892054, + "grad_norm": 14.467376708984375, + "learning_rate": 1.9556644184311728e-07, + "loss": 0.3154, + "num_input_tokens_seen": 6180306944, + "step": 2947 + }, + { + "epoch": 0.14863032275562787, + "grad_norm": 17.21703338623047, + "learning_rate": 1.9531785256994716e-07, + "loss": 0.4227, + "num_input_tokens_seen": 6182404096, + "step": 2948 + }, + { + "epoch": 0.14890154597233524, + "grad_norm": 19.657089233398438, + "learning_rate": 1.9506954873147276e-07, + "loss": 0.5356, + "num_input_tokens_seen": 6184501248, + "step": 2949 + }, + { + "epoch": 0.14917276918904257, + "grad_norm": 16.794424057006836, + "learning_rate": 1.9482153052751921e-07, + "loss": 0.4391, + "num_input_tokens_seen": 6186598400, + "step": 2950 + }, + { + "epoch": 0.14944399240574993, + "grad_norm": 18.118986129760742, + "learning_rate": 1.9457379815768245e-07, + "loss": 0.5088, + "num_input_tokens_seen": 6188695552, + "step": 2951 + }, + { + "epoch": 0.1497152156224573, + "grad_norm": 17.5322265625, + "learning_rate": 1.9432635182132795e-07, + "loss": 0.4736, + "num_input_tokens_seen": 6190792704, + "step": 2952 + }, + { + "epoch": 0.14998643883916463, + "grad_norm": 15.88696575164795, + "learning_rate": 1.9407919171759112e-07, + "loss": 0.3153, + "num_input_tokens_seen": 6192889856, + "step": 2953 + }, + { + "epoch": 0.150257662055872, + "grad_norm": 30.527063369750977, + "learning_rate": 1.9383231804537704e-07, + "loss": 0.4207, + "num_input_tokens_seen": 6194987008, + "step": 2954 + }, + { + "epoch": 0.15052888527257932, + "grad_norm": 14.228474617004395, + "learning_rate": 1.9358573100336034e-07, + "loss": 0.3108, + "num_input_tokens_seen": 6197084160, + "step": 2955 + }, + { + "epoch": 0.15080010848928668, + "grad_norm": 17.981746673583984, + "learning_rate": 1.933394307899846e-07, + "loss": 0.3895, + "num_input_tokens_seen": 6199181312, + "step": 2956 + }, + { + "epoch": 0.15107133170599404, + "grad_norm": 27.902082443237305, + "learning_rate": 1.9309341760346304e-07, + "loss": 0.7074, + "num_input_tokens_seen": 6201278464, + "step": 2957 + }, + { + "epoch": 0.15134255492270138, + "grad_norm": 14.604863166809082, + "learning_rate": 1.9284769164177767e-07, + "loss": 0.2475, + "num_input_tokens_seen": 6203375616, + "step": 2958 + }, + { + "epoch": 0.15161377813940874, + "grad_norm": 23.595659255981445, + "learning_rate": 1.9260225310267925e-07, + "loss": 0.4611, + "num_input_tokens_seen": 6205472768, + "step": 2959 + }, + { + "epoch": 0.15188500135611607, + "grad_norm": 20.186174392700195, + "learning_rate": 1.9235710218368784e-07, + "loss": 0.5274, + "num_input_tokens_seen": 6207569920, + "step": 2960 + }, + { + "epoch": 0.15215622457282343, + "grad_norm": 15.750053405761719, + "learning_rate": 1.9211223908209114e-07, + "loss": 0.4078, + "num_input_tokens_seen": 6209667072, + "step": 2961 + }, + { + "epoch": 0.1524274477895308, + "grad_norm": 17.78439712524414, + "learning_rate": 1.9186766399494581e-07, + "loss": 0.3896, + "num_input_tokens_seen": 6211764224, + "step": 2962 + }, + { + "epoch": 0.15269867100623813, + "grad_norm": 15.00125503540039, + "learning_rate": 1.9162337711907657e-07, + "loss": 0.3784, + "num_input_tokens_seen": 6213861376, + "step": 2963 + }, + { + "epoch": 0.1529698942229455, + "grad_norm": 10.61737060546875, + "learning_rate": 1.9137937865107606e-07, + "loss": 0.1549, + "num_input_tokens_seen": 6215958528, + "step": 2964 + }, + { + "epoch": 0.15324111743965282, + "grad_norm": 15.642570495605469, + "learning_rate": 1.9113566878730515e-07, + "loss": 0.2978, + "num_input_tokens_seen": 6218055680, + "step": 2965 + }, + { + "epoch": 0.15351234065636019, + "grad_norm": 24.957725524902344, + "learning_rate": 1.9089224772389223e-07, + "loss": 0.9192, + "num_input_tokens_seen": 6220152832, + "step": 2966 + }, + { + "epoch": 0.15378356387306755, + "grad_norm": 12.823304176330566, + "learning_rate": 1.9064911565673328e-07, + "loss": 0.2818, + "num_input_tokens_seen": 6222249984, + "step": 2967 + }, + { + "epoch": 0.15405478708977488, + "grad_norm": 18.57845687866211, + "learning_rate": 1.9040627278149168e-07, + "loss": 0.4243, + "num_input_tokens_seen": 6224347136, + "step": 2968 + }, + { + "epoch": 0.15432601030648224, + "grad_norm": 27.991865158081055, + "learning_rate": 1.9016371929359824e-07, + "loss": 0.901, + "num_input_tokens_seen": 6226444288, + "step": 2969 + }, + { + "epoch": 0.15459723352318958, + "grad_norm": 40.3478889465332, + "learning_rate": 1.8992145538825066e-07, + "loss": 0.6081, + "num_input_tokens_seen": 6228541440, + "step": 2970 + }, + { + "epoch": 0.15486845673989694, + "grad_norm": 16.422760009765625, + "learning_rate": 1.8967948126041383e-07, + "loss": 0.4356, + "num_input_tokens_seen": 6230638592, + "step": 2971 + }, + { + "epoch": 0.1551396799566043, + "grad_norm": 17.511781692504883, + "learning_rate": 1.8943779710481938e-07, + "loss": 0.4776, + "num_input_tokens_seen": 6232735744, + "step": 2972 + }, + { + "epoch": 0.15541090317331163, + "grad_norm": 30.01336097717285, + "learning_rate": 1.891964031159653e-07, + "loss": 0.8041, + "num_input_tokens_seen": 6234832896, + "step": 2973 + }, + { + "epoch": 0.155682126390019, + "grad_norm": 21.783706665039062, + "learning_rate": 1.8895529948811638e-07, + "loss": 0.5671, + "num_input_tokens_seen": 6236930048, + "step": 2974 + }, + { + "epoch": 0.15595334960672633, + "grad_norm": 20.175209045410156, + "learning_rate": 1.8871448641530353e-07, + "loss": 0.411, + "num_input_tokens_seen": 6239027200, + "step": 2975 + }, + { + "epoch": 0.1562245728234337, + "grad_norm": 14.450340270996094, + "learning_rate": 1.8847396409132423e-07, + "loss": 0.4136, + "num_input_tokens_seen": 6241124352, + "step": 2976 + }, + { + "epoch": 0.15649579604014105, + "grad_norm": 16.498634338378906, + "learning_rate": 1.8823373270974164e-07, + "loss": 0.4456, + "num_input_tokens_seen": 6243221504, + "step": 2977 + }, + { + "epoch": 0.15676701925684838, + "grad_norm": 16.770217895507812, + "learning_rate": 1.8799379246388463e-07, + "loss": 0.2795, + "num_input_tokens_seen": 6245318656, + "step": 2978 + }, + { + "epoch": 0.15703824247355574, + "grad_norm": 25.7247371673584, + "learning_rate": 1.8775414354684804e-07, + "loss": 0.8234, + "num_input_tokens_seen": 6247415808, + "step": 2979 + }, + { + "epoch": 0.15730946569026308, + "grad_norm": 19.52436065673828, + "learning_rate": 1.875147861514923e-07, + "loss": 0.1999, + "num_input_tokens_seen": 6249512960, + "step": 2980 + }, + { + "epoch": 0.15758068890697044, + "grad_norm": 13.377266883850098, + "learning_rate": 1.8727572047044308e-07, + "loss": 0.3077, + "num_input_tokens_seen": 6251610112, + "step": 2981 + }, + { + "epoch": 0.15785191212367777, + "grad_norm": 15.139887809753418, + "learning_rate": 1.8703694669609133e-07, + "loss": 0.3646, + "num_input_tokens_seen": 6253707264, + "step": 2982 + }, + { + "epoch": 0.15812313534038513, + "grad_norm": 16.056617736816406, + "learning_rate": 1.8679846502059306e-07, + "loss": 0.4268, + "num_input_tokens_seen": 6255804416, + "step": 2983 + }, + { + "epoch": 0.1583943585570925, + "grad_norm": 15.632225036621094, + "learning_rate": 1.8656027563586934e-07, + "loss": 0.3879, + "num_input_tokens_seen": 6257901568, + "step": 2984 + }, + { + "epoch": 0.15866558177379983, + "grad_norm": 9.523789405822754, + "learning_rate": 1.863223787336059e-07, + "loss": 0.1322, + "num_input_tokens_seen": 6259998720, + "step": 2985 + }, + { + "epoch": 0.1589368049905072, + "grad_norm": 19.751140594482422, + "learning_rate": 1.8608477450525308e-07, + "loss": 0.5607, + "num_input_tokens_seen": 6262095872, + "step": 2986 + }, + { + "epoch": 0.15920802820721452, + "grad_norm": 13.901773452758789, + "learning_rate": 1.8584746314202574e-07, + "loss": 0.2927, + "num_input_tokens_seen": 6264193024, + "step": 2987 + }, + { + "epoch": 0.1594792514239219, + "grad_norm": 19.8528995513916, + "learning_rate": 1.8561044483490301e-07, + "loss": 0.5888, + "num_input_tokens_seen": 6266290176, + "step": 2988 + }, + { + "epoch": 0.15975047464062925, + "grad_norm": 15.824024200439453, + "learning_rate": 1.8537371977462835e-07, + "loss": 0.4884, + "num_input_tokens_seen": 6268387328, + "step": 2989 + }, + { + "epoch": 0.16002169785733658, + "grad_norm": 23.57025718688965, + "learning_rate": 1.8513728815170885e-07, + "loss": 0.6673, + "num_input_tokens_seen": 6270484480, + "step": 2990 + }, + { + "epoch": 0.16029292107404394, + "grad_norm": 24.425518035888672, + "learning_rate": 1.8490115015641566e-07, + "loss": 0.546, + "num_input_tokens_seen": 6272581632, + "step": 2991 + }, + { + "epoch": 0.16056414429075128, + "grad_norm": 27.48299217224121, + "learning_rate": 1.846653059787839e-07, + "loss": 0.8573, + "num_input_tokens_seen": 6274678784, + "step": 2992 + }, + { + "epoch": 0.16083536750745864, + "grad_norm": 26.690526962280273, + "learning_rate": 1.8442975580861185e-07, + "loss": 0.7799, + "num_input_tokens_seen": 6276775936, + "step": 2993 + }, + { + "epoch": 0.161106590724166, + "grad_norm": 17.080846786499023, + "learning_rate": 1.8419449983546136e-07, + "loss": 0.3413, + "num_input_tokens_seen": 6278873088, + "step": 2994 + }, + { + "epoch": 0.16137781394087333, + "grad_norm": 17.67878532409668, + "learning_rate": 1.8395953824865762e-07, + "loss": 0.285, + "num_input_tokens_seen": 6280970240, + "step": 2995 + }, + { + "epoch": 0.1616490371575807, + "grad_norm": 23.767698287963867, + "learning_rate": 1.8372487123728844e-07, + "loss": 0.468, + "num_input_tokens_seen": 6283067392, + "step": 2996 + }, + { + "epoch": 0.16192026037428803, + "grad_norm": 16.759511947631836, + "learning_rate": 1.8349049899020514e-07, + "loss": 0.4408, + "num_input_tokens_seen": 6285164544, + "step": 2997 + }, + { + "epoch": 0.1621914835909954, + "grad_norm": 13.694295883178711, + "learning_rate": 1.832564216960215e-07, + "loss": 0.3345, + "num_input_tokens_seen": 6287261696, + "step": 2998 + }, + { + "epoch": 0.16246270680770275, + "grad_norm": 15.297809600830078, + "learning_rate": 1.8302263954311408e-07, + "loss": 0.3348, + "num_input_tokens_seen": 6289358848, + "step": 2999 + }, + { + "epoch": 0.16273393002441008, + "grad_norm": 20.984760284423828, + "learning_rate": 1.827891527196218e-07, + "loss": 0.7217, + "num_input_tokens_seen": 6291456000, + "step": 3000 + }, + { + "epoch": 0.16300515324111745, + "grad_norm": 16.016870498657227, + "learning_rate": 1.8255596141344605e-07, + "loss": 0.2591, + "num_input_tokens_seen": 6293553152, + "step": 3001 + }, + { + "epoch": 0.16327637645782478, + "grad_norm": 15.872649192810059, + "learning_rate": 1.823230658122502e-07, + "loss": 0.4054, + "num_input_tokens_seen": 6295650304, + "step": 3002 + }, + { + "epoch": 0.16354759967453214, + "grad_norm": 17.43962287902832, + "learning_rate": 1.820904661034599e-07, + "loss": 0.4199, + "num_input_tokens_seen": 6297747456, + "step": 3003 + }, + { + "epoch": 0.1638188228912395, + "grad_norm": 16.44698715209961, + "learning_rate": 1.8185816247426245e-07, + "loss": 0.3451, + "num_input_tokens_seen": 6299844608, + "step": 3004 + }, + { + "epoch": 0.16409004610794684, + "grad_norm": 22.08724021911621, + "learning_rate": 1.8162615511160701e-07, + "loss": 0.6525, + "num_input_tokens_seen": 6301941760, + "step": 3005 + }, + { + "epoch": 0.1643612693246542, + "grad_norm": 13.724467277526855, + "learning_rate": 1.8139444420220438e-07, + "loss": 0.297, + "num_input_tokens_seen": 6304038912, + "step": 3006 + }, + { + "epoch": 0.16463249254136153, + "grad_norm": 24.55242156982422, + "learning_rate": 1.8116302993252636e-07, + "loss": 0.8337, + "num_input_tokens_seen": 6306136064, + "step": 3007 + }, + { + "epoch": 0.1649037157580689, + "grad_norm": 12.129997253417969, + "learning_rate": 1.809319124888067e-07, + "loss": 0.2984, + "num_input_tokens_seen": 6308233216, + "step": 3008 + }, + { + "epoch": 0.16517493897477625, + "grad_norm": 12.80477523803711, + "learning_rate": 1.8070109205703972e-07, + "loss": 0.2444, + "num_input_tokens_seen": 6310330368, + "step": 3009 + }, + { + "epoch": 0.1654461621914836, + "grad_norm": 21.404075622558594, + "learning_rate": 1.80470568822981e-07, + "loss": 0.5638, + "num_input_tokens_seen": 6312427520, + "step": 3010 + }, + { + "epoch": 0.16571738540819095, + "grad_norm": 24.48054313659668, + "learning_rate": 1.8024034297214686e-07, + "loss": 0.8146, + "num_input_tokens_seen": 6314524672, + "step": 3011 + }, + { + "epoch": 0.16598860862489828, + "grad_norm": 16.312585830688477, + "learning_rate": 1.8001041468981442e-07, + "loss": 0.4131, + "num_input_tokens_seen": 6316621824, + "step": 3012 + }, + { + "epoch": 0.16625983184160564, + "grad_norm": 14.948566436767578, + "learning_rate": 1.79780784161021e-07, + "loss": 0.4172, + "num_input_tokens_seen": 6318718976, + "step": 3013 + }, + { + "epoch": 0.166531055058313, + "grad_norm": 23.79063606262207, + "learning_rate": 1.7955145157056456e-07, + "loss": 0.6344, + "num_input_tokens_seen": 6320816128, + "step": 3014 + }, + { + "epoch": 0.16680227827502034, + "grad_norm": 19.02938461303711, + "learning_rate": 1.793224171030033e-07, + "loss": 0.565, + "num_input_tokens_seen": 6322913280, + "step": 3015 + }, + { + "epoch": 0.1670735014917277, + "grad_norm": 16.11639976501465, + "learning_rate": 1.7909368094265532e-07, + "loss": 0.4183, + "num_input_tokens_seen": 6325010432, + "step": 3016 + }, + { + "epoch": 0.16734472470843503, + "grad_norm": 14.722761154174805, + "learning_rate": 1.788652432735991e-07, + "loss": 0.2774, + "num_input_tokens_seen": 6327107584, + "step": 3017 + }, + { + "epoch": 0.1676159479251424, + "grad_norm": 11.639079093933105, + "learning_rate": 1.7863710427967222e-07, + "loss": 0.2382, + "num_input_tokens_seen": 6329204736, + "step": 3018 + }, + { + "epoch": 0.16788717114184976, + "grad_norm": 18.875831604003906, + "learning_rate": 1.784092641444725e-07, + "loss": 0.5038, + "num_input_tokens_seen": 6331301888, + "step": 3019 + }, + { + "epoch": 0.1681583943585571, + "grad_norm": 21.543594360351562, + "learning_rate": 1.7818172305135683e-07, + "loss": 0.5799, + "num_input_tokens_seen": 6333399040, + "step": 3020 + }, + { + "epoch": 0.16842961757526445, + "grad_norm": 17.74955177307129, + "learning_rate": 1.7795448118344174e-07, + "loss": 0.4241, + "num_input_tokens_seen": 6335496192, + "step": 3021 + }, + { + "epoch": 0.16870084079197178, + "grad_norm": 18.136749267578125, + "learning_rate": 1.7772753872360286e-07, + "loss": 0.4974, + "num_input_tokens_seen": 6337593344, + "step": 3022 + }, + { + "epoch": 0.16897206400867915, + "grad_norm": 16.745140075683594, + "learning_rate": 1.7750089585447473e-07, + "loss": 0.4158, + "num_input_tokens_seen": 6339690496, + "step": 3023 + }, + { + "epoch": 0.16924328722538648, + "grad_norm": 13.123845100402832, + "learning_rate": 1.77274552758451e-07, + "loss": 0.3088, + "num_input_tokens_seen": 6341787648, + "step": 3024 + }, + { + "epoch": 0.16951451044209384, + "grad_norm": 13.37322998046875, + "learning_rate": 1.770485096176839e-07, + "loss": 0.351, + "num_input_tokens_seen": 6343884800, + "step": 3025 + }, + { + "epoch": 0.1697857336588012, + "grad_norm": 20.165077209472656, + "learning_rate": 1.7682276661408448e-07, + "loss": 0.6836, + "num_input_tokens_seen": 6345981952, + "step": 3026 + }, + { + "epoch": 0.17005695687550854, + "grad_norm": 15.843926429748535, + "learning_rate": 1.7659732392932202e-07, + "loss": 0.4345, + "num_input_tokens_seen": 6348079104, + "step": 3027 + }, + { + "epoch": 0.1703281800922159, + "grad_norm": 12.77824878692627, + "learning_rate": 1.7637218174482422e-07, + "loss": 0.3054, + "num_input_tokens_seen": 6350176256, + "step": 3028 + }, + { + "epoch": 0.17059940330892323, + "grad_norm": 15.227442741394043, + "learning_rate": 1.7614734024177707e-07, + "loss": 0.4641, + "num_input_tokens_seen": 6352273408, + "step": 3029 + }, + { + "epoch": 0.1708706265256306, + "grad_norm": 20.998775482177734, + "learning_rate": 1.7592279960112424e-07, + "loss": 0.649, + "num_input_tokens_seen": 6354370560, + "step": 3030 + }, + { + "epoch": 0.17114184974233795, + "grad_norm": 14.684011459350586, + "learning_rate": 1.7569856000356758e-07, + "loss": 0.3258, + "num_input_tokens_seen": 6356467712, + "step": 3031 + }, + { + "epoch": 0.1714130729590453, + "grad_norm": 22.120227813720703, + "learning_rate": 1.754746216295665e-07, + "loss": 0.6493, + "num_input_tokens_seen": 6358564864, + "step": 3032 + }, + { + "epoch": 0.17168429617575265, + "grad_norm": 25.33750343322754, + "learning_rate": 1.7525098465933834e-07, + "loss": 0.6875, + "num_input_tokens_seen": 6360662016, + "step": 3033 + }, + { + "epoch": 0.17195551939245998, + "grad_norm": 14.21183967590332, + "learning_rate": 1.750276492728575e-07, + "loss": 0.2934, + "num_input_tokens_seen": 6362759168, + "step": 3034 + }, + { + "epoch": 0.17222674260916734, + "grad_norm": 13.687496185302734, + "learning_rate": 1.748046156498557e-07, + "loss": 0.337, + "num_input_tokens_seen": 6364856320, + "step": 3035 + }, + { + "epoch": 0.1724979658258747, + "grad_norm": 16.938383102416992, + "learning_rate": 1.74581883969822e-07, + "loss": 0.5263, + "num_input_tokens_seen": 6366953472, + "step": 3036 + }, + { + "epoch": 0.17276918904258204, + "grad_norm": 15.311517715454102, + "learning_rate": 1.7435945441200232e-07, + "loss": 0.4494, + "num_input_tokens_seen": 6369050624, + "step": 3037 + }, + { + "epoch": 0.1730404122592894, + "grad_norm": 15.489421844482422, + "learning_rate": 1.7413732715539954e-07, + "loss": 0.3876, + "num_input_tokens_seen": 6371147776, + "step": 3038 + }, + { + "epoch": 0.17331163547599673, + "grad_norm": 21.3696231842041, + "learning_rate": 1.7391550237877326e-07, + "loss": 0.6567, + "num_input_tokens_seen": 6373244928, + "step": 3039 + }, + { + "epoch": 0.1735828586927041, + "grad_norm": 16.473154067993164, + "learning_rate": 1.7369398026063958e-07, + "loss": 0.4457, + "num_input_tokens_seen": 6375342080, + "step": 3040 + }, + { + "epoch": 0.17385408190941146, + "grad_norm": 17.838611602783203, + "learning_rate": 1.7347276097927105e-07, + "loss": 0.6155, + "num_input_tokens_seen": 6377439232, + "step": 3041 + }, + { + "epoch": 0.1741253051261188, + "grad_norm": 17.602170944213867, + "learning_rate": 1.732518447126966e-07, + "loss": 0.4533, + "num_input_tokens_seen": 6379536384, + "step": 3042 + }, + { + "epoch": 0.17439652834282615, + "grad_norm": 25.11285400390625, + "learning_rate": 1.730312316387012e-07, + "loss": 0.4754, + "num_input_tokens_seen": 6381633536, + "step": 3043 + }, + { + "epoch": 0.17466775155953349, + "grad_norm": 15.209177017211914, + "learning_rate": 1.7281092193482582e-07, + "loss": 0.4263, + "num_input_tokens_seen": 6383730688, + "step": 3044 + }, + { + "epoch": 0.17493897477624085, + "grad_norm": 27.183820724487305, + "learning_rate": 1.7259091577836737e-07, + "loss": 0.5498, + "num_input_tokens_seen": 6385827840, + "step": 3045 + }, + { + "epoch": 0.1752101979929482, + "grad_norm": 17.119733810424805, + "learning_rate": 1.723712133463785e-07, + "loss": 0.5436, + "num_input_tokens_seen": 6387924992, + "step": 3046 + }, + { + "epoch": 0.17548142120965554, + "grad_norm": 14.52611255645752, + "learning_rate": 1.7215181481566727e-07, + "loss": 0.2955, + "num_input_tokens_seen": 6390022144, + "step": 3047 + }, + { + "epoch": 0.1757526444263629, + "grad_norm": 11.77927303314209, + "learning_rate": 1.719327203627971e-07, + "loss": 0.2517, + "num_input_tokens_seen": 6392119296, + "step": 3048 + }, + { + "epoch": 0.17602386764307024, + "grad_norm": 16.277713775634766, + "learning_rate": 1.7171393016408715e-07, + "loss": 0.3709, + "num_input_tokens_seen": 6394216448, + "step": 3049 + }, + { + "epoch": 0.1762950908597776, + "grad_norm": 18.34269142150879, + "learning_rate": 1.7149544439561135e-07, + "loss": 0.3434, + "num_input_tokens_seen": 6396313600, + "step": 3050 + }, + { + "epoch": 0.17656631407648496, + "grad_norm": 15.846407890319824, + "learning_rate": 1.7127726323319874e-07, + "loss": 0.3524, + "num_input_tokens_seen": 6398410752, + "step": 3051 + }, + { + "epoch": 0.1768375372931923, + "grad_norm": 13.641855239868164, + "learning_rate": 1.7105938685243339e-07, + "loss": 0.2759, + "num_input_tokens_seen": 6400507904, + "step": 3052 + }, + { + "epoch": 0.17710876050989965, + "grad_norm": 16.64573097229004, + "learning_rate": 1.7084181542865355e-07, + "loss": 0.506, + "num_input_tokens_seen": 6402605056, + "step": 3053 + }, + { + "epoch": 0.177379983726607, + "grad_norm": 21.56814193725586, + "learning_rate": 1.7062454913695264e-07, + "loss": 0.5754, + "num_input_tokens_seen": 6404702208, + "step": 3054 + }, + { + "epoch": 0.17765120694331435, + "grad_norm": 13.835966110229492, + "learning_rate": 1.7040758815217827e-07, + "loss": 0.3337, + "num_input_tokens_seen": 6406799360, + "step": 3055 + }, + { + "epoch": 0.1779224301600217, + "grad_norm": 19.247650146484375, + "learning_rate": 1.7019093264893236e-07, + "loss": 0.625, + "num_input_tokens_seen": 6408896512, + "step": 3056 + }, + { + "epoch": 0.17819365337672904, + "grad_norm": 13.838505744934082, + "learning_rate": 1.6997458280157113e-07, + "loss": 0.3558, + "num_input_tokens_seen": 6410993664, + "step": 3057 + }, + { + "epoch": 0.1784648765934364, + "grad_norm": 19.8231143951416, + "learning_rate": 1.6975853878420457e-07, + "loss": 0.5823, + "num_input_tokens_seen": 6413090816, + "step": 3058 + }, + { + "epoch": 0.17873609981014374, + "grad_norm": 16.521181106567383, + "learning_rate": 1.6954280077069676e-07, + "loss": 0.3463, + "num_input_tokens_seen": 6415187968, + "step": 3059 + }, + { + "epoch": 0.1790073230268511, + "grad_norm": 20.04188346862793, + "learning_rate": 1.6932736893466536e-07, + "loss": 0.4292, + "num_input_tokens_seen": 6417285120, + "step": 3060 + }, + { + "epoch": 0.17927854624355846, + "grad_norm": 15.362913131713867, + "learning_rate": 1.691122434494818e-07, + "loss": 0.3455, + "num_input_tokens_seen": 6419382272, + "step": 3061 + }, + { + "epoch": 0.1795497694602658, + "grad_norm": 17.9241943359375, + "learning_rate": 1.688974244882709e-07, + "loss": 0.5172, + "num_input_tokens_seen": 6421479424, + "step": 3062 + }, + { + "epoch": 0.17982099267697316, + "grad_norm": 16.651180267333984, + "learning_rate": 1.686829122239108e-07, + "loss": 0.4723, + "num_input_tokens_seen": 6423576576, + "step": 3063 + }, + { + "epoch": 0.1800922158936805, + "grad_norm": 21.878660202026367, + "learning_rate": 1.684687068290326e-07, + "loss": 0.5919, + "num_input_tokens_seen": 6425673728, + "step": 3064 + }, + { + "epoch": 0.18036343911038785, + "grad_norm": 16.2543888092041, + "learning_rate": 1.682548084760208e-07, + "loss": 0.3433, + "num_input_tokens_seen": 6427770880, + "step": 3065 + }, + { + "epoch": 0.18063466232709519, + "grad_norm": 14.593696594238281, + "learning_rate": 1.6804121733701277e-07, + "loss": 0.3266, + "num_input_tokens_seen": 6429868032, + "step": 3066 + }, + { + "epoch": 0.18090588554380255, + "grad_norm": 30.589252471923828, + "learning_rate": 1.678279335838983e-07, + "loss": 1.0168, + "num_input_tokens_seen": 6431965184, + "step": 3067 + }, + { + "epoch": 0.1811771087605099, + "grad_norm": 16.187679290771484, + "learning_rate": 1.676149573883202e-07, + "loss": 0.2938, + "num_input_tokens_seen": 6434062336, + "step": 3068 + }, + { + "epoch": 0.18144833197721724, + "grad_norm": 20.22064208984375, + "learning_rate": 1.674022889216737e-07, + "loss": 0.4911, + "num_input_tokens_seen": 6436159488, + "step": 3069 + }, + { + "epoch": 0.1817195551939246, + "grad_norm": 15.452803611755371, + "learning_rate": 1.67189928355106e-07, + "loss": 0.3566, + "num_input_tokens_seen": 6438256640, + "step": 3070 + }, + { + "epoch": 0.18199077841063194, + "grad_norm": 21.338388442993164, + "learning_rate": 1.66977875859517e-07, + "loss": 0.4931, + "num_input_tokens_seen": 6440353792, + "step": 3071 + }, + { + "epoch": 0.1822620016273393, + "grad_norm": 15.379291534423828, + "learning_rate": 1.6676613160555846e-07, + "loss": 0.3819, + "num_input_tokens_seen": 6442450944, + "step": 3072 + }, + { + "epoch": 0.18253322484404666, + "grad_norm": 18.543752670288086, + "learning_rate": 1.665546957636339e-07, + "loss": 0.5989, + "num_input_tokens_seen": 6444548096, + "step": 3073 + }, + { + "epoch": 0.182804448060754, + "grad_norm": 13.839360237121582, + "learning_rate": 1.6634356850389926e-07, + "loss": 0.3388, + "num_input_tokens_seen": 6446645248, + "step": 3074 + }, + { + "epoch": 0.18307567127746135, + "grad_norm": 14.323488235473633, + "learning_rate": 1.6613274999626134e-07, + "loss": 0.4291, + "num_input_tokens_seen": 6448742400, + "step": 3075 + }, + { + "epoch": 0.1833468944941687, + "grad_norm": 19.329299926757812, + "learning_rate": 1.65922240410379e-07, + "loss": 0.5632, + "num_input_tokens_seen": 6450839552, + "step": 3076 + }, + { + "epoch": 0.18361811771087605, + "grad_norm": 14.438604354858398, + "learning_rate": 1.6571203991566234e-07, + "loss": 0.3609, + "num_input_tokens_seen": 6452936704, + "step": 3077 + }, + { + "epoch": 0.1838893409275834, + "grad_norm": 16.972644805908203, + "learning_rate": 1.6550214868127276e-07, + "loss": 0.2715, + "num_input_tokens_seen": 6455033856, + "step": 3078 + }, + { + "epoch": 0.18416056414429074, + "grad_norm": 19.990781784057617, + "learning_rate": 1.6529256687612264e-07, + "loss": 0.5057, + "num_input_tokens_seen": 6457131008, + "step": 3079 + }, + { + "epoch": 0.1844317873609981, + "grad_norm": 12.420730590820312, + "learning_rate": 1.6508329466887548e-07, + "loss": 0.32, + "num_input_tokens_seen": 6459228160, + "step": 3080 + }, + { + "epoch": 0.18470301057770544, + "grad_norm": 18.54897689819336, + "learning_rate": 1.6487433222794566e-07, + "loss": 0.531, + "num_input_tokens_seen": 6461325312, + "step": 3081 + }, + { + "epoch": 0.1849742337944128, + "grad_norm": 13.507473945617676, + "learning_rate": 1.6466567972149806e-07, + "loss": 0.2632, + "num_input_tokens_seen": 6463422464, + "step": 3082 + }, + { + "epoch": 0.18524545701112016, + "grad_norm": 13.425512313842773, + "learning_rate": 1.6445733731744842e-07, + "loss": 0.3308, + "num_input_tokens_seen": 6465519616, + "step": 3083 + }, + { + "epoch": 0.1855166802278275, + "grad_norm": 14.376126289367676, + "learning_rate": 1.642493051834627e-07, + "loss": 0.3104, + "num_input_tokens_seen": 6467616768, + "step": 3084 + }, + { + "epoch": 0.18578790344453486, + "grad_norm": 13.848085403442383, + "learning_rate": 1.6404158348695729e-07, + "loss": 0.3902, + "num_input_tokens_seen": 6469713920, + "step": 3085 + }, + { + "epoch": 0.1860591266612422, + "grad_norm": 17.45004653930664, + "learning_rate": 1.6383417239509878e-07, + "loss": 0.5748, + "num_input_tokens_seen": 6471811072, + "step": 3086 + }, + { + "epoch": 0.18633034987794955, + "grad_norm": 19.52586555480957, + "learning_rate": 1.6362707207480347e-07, + "loss": 0.6891, + "num_input_tokens_seen": 6473908224, + "step": 3087 + }, + { + "epoch": 0.18660157309465691, + "grad_norm": 24.07430076599121, + "learning_rate": 1.6342028269273802e-07, + "loss": 0.6888, + "num_input_tokens_seen": 6476005376, + "step": 3088 + }, + { + "epoch": 0.18687279631136425, + "grad_norm": 19.478736877441406, + "learning_rate": 1.6321380441531852e-07, + "loss": 0.6213, + "num_input_tokens_seen": 6478102528, + "step": 3089 + }, + { + "epoch": 0.1871440195280716, + "grad_norm": 13.465045928955078, + "learning_rate": 1.63007637408711e-07, + "loss": 0.3439, + "num_input_tokens_seen": 6480199680, + "step": 3090 + }, + { + "epoch": 0.18741524274477894, + "grad_norm": 16.045684814453125, + "learning_rate": 1.6280178183883081e-07, + "loss": 0.4287, + "num_input_tokens_seen": 6482296832, + "step": 3091 + }, + { + "epoch": 0.1876864659614863, + "grad_norm": 18.508325576782227, + "learning_rate": 1.6259623787134258e-07, + "loss": 0.4006, + "num_input_tokens_seen": 6484393984, + "step": 3092 + }, + { + "epoch": 0.18795768917819367, + "grad_norm": 16.57149314880371, + "learning_rate": 1.6239100567166026e-07, + "loss": 0.392, + "num_input_tokens_seen": 6486491136, + "step": 3093 + }, + { + "epoch": 0.188228912394901, + "grad_norm": 13.103677749633789, + "learning_rate": 1.6218608540494693e-07, + "loss": 0.3385, + "num_input_tokens_seen": 6488588288, + "step": 3094 + }, + { + "epoch": 0.18850013561160836, + "grad_norm": 17.10627555847168, + "learning_rate": 1.619814772361147e-07, + "loss": 0.3766, + "num_input_tokens_seen": 6490685440, + "step": 3095 + }, + { + "epoch": 0.1887713588283157, + "grad_norm": 18.120052337646484, + "learning_rate": 1.6177718132982441e-07, + "loss": 0.5857, + "num_input_tokens_seen": 6492782592, + "step": 3096 + }, + { + "epoch": 0.18904258204502306, + "grad_norm": 19.000185012817383, + "learning_rate": 1.6157319785048555e-07, + "loss": 0.5742, + "num_input_tokens_seen": 6494879744, + "step": 3097 + }, + { + "epoch": 0.18931380526173042, + "grad_norm": 18.840848922729492, + "learning_rate": 1.6136952696225634e-07, + "loss": 0.3558, + "num_input_tokens_seen": 6496976896, + "step": 3098 + }, + { + "epoch": 0.18958502847843775, + "grad_norm": 19.136682510375977, + "learning_rate": 1.6116616882904332e-07, + "loss": 0.5141, + "num_input_tokens_seen": 6499074048, + "step": 3099 + }, + { + "epoch": 0.1898562516951451, + "grad_norm": 17.346790313720703, + "learning_rate": 1.6096312361450142e-07, + "loss": 0.434, + "num_input_tokens_seen": 6501171200, + "step": 3100 + }, + { + "epoch": 0.19012747491185245, + "grad_norm": 15.207001686096191, + "learning_rate": 1.6076039148203373e-07, + "loss": 0.4427, + "num_input_tokens_seen": 6503268352, + "step": 3101 + }, + { + "epoch": 0.1903986981285598, + "grad_norm": 10.525275230407715, + "learning_rate": 1.6055797259479125e-07, + "loss": 0.253, + "num_input_tokens_seen": 6505365504, + "step": 3102 + }, + { + "epoch": 0.19066992134526717, + "grad_norm": 27.057727813720703, + "learning_rate": 1.6035586711567318e-07, + "loss": 0.6012, + "num_input_tokens_seen": 6507462656, + "step": 3103 + }, + { + "epoch": 0.1909411445619745, + "grad_norm": 29.3339786529541, + "learning_rate": 1.601540752073261e-07, + "loss": 0.673, + "num_input_tokens_seen": 6509559808, + "step": 3104 + }, + { + "epoch": 0.19121236777868186, + "grad_norm": 12.424835205078125, + "learning_rate": 1.5995259703214445e-07, + "loss": 0.2948, + "num_input_tokens_seen": 6511656960, + "step": 3105 + }, + { + "epoch": 0.1914835909953892, + "grad_norm": 14.519540786743164, + "learning_rate": 1.5975143275227039e-07, + "loss": 0.3906, + "num_input_tokens_seen": 6513754112, + "step": 3106 + }, + { + "epoch": 0.19175481421209656, + "grad_norm": 18.084651947021484, + "learning_rate": 1.5955058252959318e-07, + "loss": 0.4539, + "num_input_tokens_seen": 6515851264, + "step": 3107 + }, + { + "epoch": 0.1920260374288039, + "grad_norm": 23.673160552978516, + "learning_rate": 1.5935004652574947e-07, + "loss": 0.5936, + "num_input_tokens_seen": 6517948416, + "step": 3108 + }, + { + "epoch": 0.19229726064551125, + "grad_norm": 13.00130844116211, + "learning_rate": 1.591498249021231e-07, + "loss": 0.2686, + "num_input_tokens_seen": 6520045568, + "step": 3109 + }, + { + "epoch": 0.19256848386221861, + "grad_norm": 21.13102912902832, + "learning_rate": 1.5894991781984456e-07, + "loss": 0.6705, + "num_input_tokens_seen": 6522142720, + "step": 3110 + }, + { + "epoch": 0.19283970707892595, + "grad_norm": 10.354960441589355, + "learning_rate": 1.587503254397916e-07, + "loss": 0.1552, + "num_input_tokens_seen": 6524239872, + "step": 3111 + }, + { + "epoch": 0.1931109302956333, + "grad_norm": 16.40614128112793, + "learning_rate": 1.585510479225886e-07, + "loss": 0.2692, + "num_input_tokens_seen": 6526337024, + "step": 3112 + }, + { + "epoch": 0.19338215351234064, + "grad_norm": 14.614860534667969, + "learning_rate": 1.5835208542860648e-07, + "loss": 0.3284, + "num_input_tokens_seen": 6528434176, + "step": 3113 + }, + { + "epoch": 0.193653376729048, + "grad_norm": 11.804844856262207, + "learning_rate": 1.581534381179627e-07, + "loss": 0.3218, + "num_input_tokens_seen": 6530531328, + "step": 3114 + }, + { + "epoch": 0.19392459994575537, + "grad_norm": 19.765899658203125, + "learning_rate": 1.5795510615052104e-07, + "loss": 0.5768, + "num_input_tokens_seen": 6532628480, + "step": 3115 + }, + { + "epoch": 0.1941958231624627, + "grad_norm": 18.440826416015625, + "learning_rate": 1.5775708968589155e-07, + "loss": 0.4025, + "num_input_tokens_seen": 6534725632, + "step": 3116 + }, + { + "epoch": 0.19446704637917006, + "grad_norm": 27.12670135498047, + "learning_rate": 1.575593888834303e-07, + "loss": 0.4297, + "num_input_tokens_seen": 6536822784, + "step": 3117 + }, + { + "epoch": 0.1947382695958774, + "grad_norm": 22.037416458129883, + "learning_rate": 1.5736200390223942e-07, + "loss": 0.7779, + "num_input_tokens_seen": 6538919936, + "step": 3118 + }, + { + "epoch": 0.19500949281258476, + "grad_norm": 15.226699829101562, + "learning_rate": 1.5716493490116684e-07, + "loss": 0.3269, + "num_input_tokens_seen": 6541017088, + "step": 3119 + }, + { + "epoch": 0.19528071602929212, + "grad_norm": 19.05426788330078, + "learning_rate": 1.5696818203880624e-07, + "loss": 0.4225, + "num_input_tokens_seen": 6543114240, + "step": 3120 + }, + { + "epoch": 0.19555193924599945, + "grad_norm": 15.81242847442627, + "learning_rate": 1.5677174547349655e-07, + "loss": 0.561, + "num_input_tokens_seen": 6545211392, + "step": 3121 + }, + { + "epoch": 0.1958231624627068, + "grad_norm": 24.574951171875, + "learning_rate": 1.565756253633228e-07, + "loss": 0.5197, + "num_input_tokens_seen": 6547308544, + "step": 3122 + }, + { + "epoch": 0.19609438567941415, + "grad_norm": 21.462337493896484, + "learning_rate": 1.5637982186611481e-07, + "loss": 0.4524, + "num_input_tokens_seen": 6549405696, + "step": 3123 + }, + { + "epoch": 0.1963656088961215, + "grad_norm": 18.190275192260742, + "learning_rate": 1.5618433513944787e-07, + "loss": 0.2997, + "num_input_tokens_seen": 6551502848, + "step": 3124 + }, + { + "epoch": 0.19663683211282887, + "grad_norm": 18.96411895751953, + "learning_rate": 1.5598916534064216e-07, + "loss": 0.5017, + "num_input_tokens_seen": 6553600000, + "step": 3125 + }, + { + "epoch": 0.1969080553295362, + "grad_norm": 16.543214797973633, + "learning_rate": 1.557943126267631e-07, + "loss": 0.4162, + "num_input_tokens_seen": 6555697152, + "step": 3126 + }, + { + "epoch": 0.19717927854624356, + "grad_norm": 13.697303771972656, + "learning_rate": 1.5559977715462044e-07, + "loss": 0.287, + "num_input_tokens_seen": 6557794304, + "step": 3127 + }, + { + "epoch": 0.1974505017629509, + "grad_norm": 19.32088279724121, + "learning_rate": 1.5540555908076914e-07, + "loss": 0.6185, + "num_input_tokens_seen": 6559891456, + "step": 3128 + }, + { + "epoch": 0.19772172497965826, + "grad_norm": 21.070152282714844, + "learning_rate": 1.5521165856150841e-07, + "loss": 0.4217, + "num_input_tokens_seen": 6561988608, + "step": 3129 + }, + { + "epoch": 0.19799294819636562, + "grad_norm": 27.660669326782227, + "learning_rate": 1.55018075752882e-07, + "loss": 0.5169, + "num_input_tokens_seen": 6564085760, + "step": 3130 + }, + { + "epoch": 0.19826417141307295, + "grad_norm": 17.022554397583008, + "learning_rate": 1.5482481081067825e-07, + "loss": 0.4193, + "num_input_tokens_seen": 6566182912, + "step": 3131 + }, + { + "epoch": 0.19853539462978032, + "grad_norm": 18.43861198425293, + "learning_rate": 1.546318638904291e-07, + "loss": 0.4522, + "num_input_tokens_seen": 6568280064, + "step": 3132 + }, + { + "epoch": 0.19880661784648765, + "grad_norm": 15.052275657653809, + "learning_rate": 1.5443923514741103e-07, + "loss": 0.2211, + "num_input_tokens_seen": 6570377216, + "step": 3133 + }, + { + "epoch": 0.199077841063195, + "grad_norm": 21.32815933227539, + "learning_rate": 1.5424692473664429e-07, + "loss": 0.3405, + "num_input_tokens_seen": 6572474368, + "step": 3134 + }, + { + "epoch": 0.19934906427990237, + "grad_norm": 20.877336502075195, + "learning_rate": 1.5405493281289302e-07, + "loss": 0.6306, + "num_input_tokens_seen": 6574571520, + "step": 3135 + }, + { + "epoch": 0.1996202874966097, + "grad_norm": 21.885692596435547, + "learning_rate": 1.5386325953066494e-07, + "loss": 0.6523, + "num_input_tokens_seen": 6576668672, + "step": 3136 + }, + { + "epoch": 0.19989151071331707, + "grad_norm": 13.137401580810547, + "learning_rate": 1.5367190504421157e-07, + "loss": 0.3159, + "num_input_tokens_seen": 6578765824, + "step": 3137 + }, + { + "epoch": 0.2001627339300244, + "grad_norm": 15.956317901611328, + "learning_rate": 1.5348086950752753e-07, + "loss": 0.3431, + "num_input_tokens_seen": 6580862976, + "step": 3138 + }, + { + "epoch": 0.20043395714673176, + "grad_norm": 16.24814796447754, + "learning_rate": 1.532901530743511e-07, + "loss": 0.4469, + "num_input_tokens_seen": 6582960128, + "step": 3139 + }, + { + "epoch": 0.20070518036343912, + "grad_norm": 13.850639343261719, + "learning_rate": 1.530997558981635e-07, + "loss": 0.4047, + "num_input_tokens_seen": 6585057280, + "step": 3140 + }, + { + "epoch": 0.20097640358014646, + "grad_norm": 21.258054733276367, + "learning_rate": 1.5290967813218919e-07, + "loss": 0.7333, + "num_input_tokens_seen": 6587154432, + "step": 3141 + }, + { + "epoch": 0.20124762679685382, + "grad_norm": 24.493776321411133, + "learning_rate": 1.527199199293955e-07, + "loss": 0.3701, + "num_input_tokens_seen": 6589251584, + "step": 3142 + }, + { + "epoch": 0.20151885001356115, + "grad_norm": 25.23069190979004, + "learning_rate": 1.525304814424927e-07, + "loss": 0.7751, + "num_input_tokens_seen": 6591348736, + "step": 3143 + }, + { + "epoch": 0.2017900732302685, + "grad_norm": 16.206256866455078, + "learning_rate": 1.5234136282393356e-07, + "loss": 0.3786, + "num_input_tokens_seen": 6593445888, + "step": 3144 + }, + { + "epoch": 0.20206129644697587, + "grad_norm": 18.445281982421875, + "learning_rate": 1.5215256422591357e-07, + "loss": 0.5419, + "num_input_tokens_seen": 6595543040, + "step": 3145 + }, + { + "epoch": 0.2023325196636832, + "grad_norm": 13.487834930419922, + "learning_rate": 1.5196408580037058e-07, + "loss": 0.225, + "num_input_tokens_seen": 6597640192, + "step": 3146 + }, + { + "epoch": 0.20260374288039057, + "grad_norm": 18.042003631591797, + "learning_rate": 1.51775927698985e-07, + "loss": 0.2773, + "num_input_tokens_seen": 6599737344, + "step": 3147 + }, + { + "epoch": 0.2028749660970979, + "grad_norm": 14.105890274047852, + "learning_rate": 1.5158809007317926e-07, + "loss": 0.3948, + "num_input_tokens_seen": 6601834496, + "step": 3148 + }, + { + "epoch": 0.20314618931380526, + "grad_norm": 22.028852462768555, + "learning_rate": 1.514005730741181e-07, + "loss": 0.7842, + "num_input_tokens_seen": 6603931648, + "step": 3149 + }, + { + "epoch": 0.20341741253051263, + "grad_norm": 18.55408477783203, + "learning_rate": 1.512133768527077e-07, + "loss": 0.4541, + "num_input_tokens_seen": 6606028800, + "step": 3150 + }, + { + "epoch": 0.20368863574721996, + "grad_norm": 15.917207717895508, + "learning_rate": 1.510265015595967e-07, + "loss": 0.3687, + "num_input_tokens_seen": 6608125952, + "step": 3151 + }, + { + "epoch": 0.20395985896392732, + "grad_norm": 25.865598678588867, + "learning_rate": 1.5083994734517518e-07, + "loss": 0.6256, + "num_input_tokens_seen": 6610223104, + "step": 3152 + }, + { + "epoch": 0.20423108218063465, + "grad_norm": 17.131990432739258, + "learning_rate": 1.5065371435957484e-07, + "loss": 0.4382, + "num_input_tokens_seen": 6612320256, + "step": 3153 + }, + { + "epoch": 0.20450230539734202, + "grad_norm": 24.471385955810547, + "learning_rate": 1.5046780275266897e-07, + "loss": 0.7498, + "num_input_tokens_seen": 6614417408, + "step": 3154 + }, + { + "epoch": 0.20477352861404935, + "grad_norm": 17.423646926879883, + "learning_rate": 1.5028221267407207e-07, + "loss": 0.4527, + "num_input_tokens_seen": 6616514560, + "step": 3155 + }, + { + "epoch": 0.2050447518307567, + "grad_norm": 13.580658912658691, + "learning_rate": 1.5009694427314007e-07, + "loss": 0.2501, + "num_input_tokens_seen": 6618611712, + "step": 3156 + }, + { + "epoch": 0.20531597504746407, + "grad_norm": 16.855371475219727, + "learning_rate": 1.4991199769896983e-07, + "loss": 0.33, + "num_input_tokens_seen": 6620708864, + "step": 3157 + }, + { + "epoch": 0.2055871982641714, + "grad_norm": 14.530333518981934, + "learning_rate": 1.497273731003994e-07, + "loss": 0.3885, + "num_input_tokens_seen": 6622806016, + "step": 3158 + }, + { + "epoch": 0.20585842148087877, + "grad_norm": 13.189663887023926, + "learning_rate": 1.4954307062600758e-07, + "loss": 0.3511, + "num_input_tokens_seen": 6624903168, + "step": 3159 + }, + { + "epoch": 0.2061296446975861, + "grad_norm": 21.233369827270508, + "learning_rate": 1.4935909042411412e-07, + "loss": 0.6974, + "num_input_tokens_seen": 6627000320, + "step": 3160 + }, + { + "epoch": 0.20640086791429346, + "grad_norm": 19.892419815063477, + "learning_rate": 1.4917543264277901e-07, + "loss": 0.498, + "num_input_tokens_seen": 6629097472, + "step": 3161 + }, + { + "epoch": 0.20667209113100082, + "grad_norm": 14.9336519241333, + "learning_rate": 1.4899209742980317e-07, + "loss": 0.4197, + "num_input_tokens_seen": 6631194624, + "step": 3162 + }, + { + "epoch": 0.20694331434770816, + "grad_norm": 19.203628540039062, + "learning_rate": 1.488090849327279e-07, + "loss": 0.5148, + "num_input_tokens_seen": 6633291776, + "step": 3163 + }, + { + "epoch": 0.20721453756441552, + "grad_norm": 14.907687187194824, + "learning_rate": 1.4862639529883463e-07, + "loss": 0.3568, + "num_input_tokens_seen": 6635388928, + "step": 3164 + }, + { + "epoch": 0.20748576078112285, + "grad_norm": 16.64046287536621, + "learning_rate": 1.4844402867514503e-07, + "loss": 0.3888, + "num_input_tokens_seen": 6637486080, + "step": 3165 + }, + { + "epoch": 0.2077569839978302, + "grad_norm": 12.00853157043457, + "learning_rate": 1.4826198520842093e-07, + "loss": 0.2793, + "num_input_tokens_seen": 6639583232, + "step": 3166 + }, + { + "epoch": 0.20802820721453757, + "grad_norm": 15.106518745422363, + "learning_rate": 1.4808026504516374e-07, + "loss": 0.4134, + "num_input_tokens_seen": 6641680384, + "step": 3167 + }, + { + "epoch": 0.2082994304312449, + "grad_norm": 18.982995986938477, + "learning_rate": 1.4789886833161506e-07, + "loss": 0.5532, + "num_input_tokens_seen": 6643777536, + "step": 3168 + }, + { + "epoch": 0.20857065364795227, + "grad_norm": 12.5213041305542, + "learning_rate": 1.477177952137561e-07, + "loss": 0.3146, + "num_input_tokens_seen": 6645874688, + "step": 3169 + }, + { + "epoch": 0.2088418768646596, + "grad_norm": 18.02132797241211, + "learning_rate": 1.4753704583730754e-07, + "loss": 0.5158, + "num_input_tokens_seen": 6647971840, + "step": 3170 + }, + { + "epoch": 0.20911310008136696, + "grad_norm": 16.98950958251953, + "learning_rate": 1.4735662034772962e-07, + "loss": 0.3841, + "num_input_tokens_seen": 6650068992, + "step": 3171 + }, + { + "epoch": 0.20938432329807433, + "grad_norm": 15.854743003845215, + "learning_rate": 1.47176518890222e-07, + "loss": 0.327, + "num_input_tokens_seen": 6652166144, + "step": 3172 + }, + { + "epoch": 0.20965554651478166, + "grad_norm": 14.455223083496094, + "learning_rate": 1.4699674160972337e-07, + "loss": 0.3407, + "num_input_tokens_seen": 6654263296, + "step": 3173 + }, + { + "epoch": 0.20992676973148902, + "grad_norm": 12.59441089630127, + "learning_rate": 1.4681728865091165e-07, + "loss": 0.2415, + "num_input_tokens_seen": 6656360448, + "step": 3174 + }, + { + "epoch": 0.21019799294819635, + "grad_norm": 21.2336483001709, + "learning_rate": 1.466381601582038e-07, + "loss": 0.529, + "num_input_tokens_seen": 6658457600, + "step": 3175 + }, + { + "epoch": 0.21046921616490372, + "grad_norm": 15.553447723388672, + "learning_rate": 1.4645935627575562e-07, + "loss": 0.2731, + "num_input_tokens_seen": 6660554752, + "step": 3176 + }, + { + "epoch": 0.21074043938161108, + "grad_norm": 19.533782958984375, + "learning_rate": 1.462808771474617e-07, + "loss": 0.626, + "num_input_tokens_seen": 6662651904, + "step": 3177 + }, + { + "epoch": 0.2110116625983184, + "grad_norm": 11.384428024291992, + "learning_rate": 1.4610272291695503e-07, + "loss": 0.2152, + "num_input_tokens_seen": 6664749056, + "step": 3178 + }, + { + "epoch": 0.21128288581502577, + "grad_norm": 17.49107551574707, + "learning_rate": 1.4592489372760764e-07, + "loss": 0.3965, + "num_input_tokens_seen": 6666846208, + "step": 3179 + }, + { + "epoch": 0.2115541090317331, + "grad_norm": 14.759407043457031, + "learning_rate": 1.4574738972252953e-07, + "loss": 0.3181, + "num_input_tokens_seen": 6668943360, + "step": 3180 + }, + { + "epoch": 0.21182533224844047, + "grad_norm": 15.087119102478027, + "learning_rate": 1.4557021104456923e-07, + "loss": 0.4154, + "num_input_tokens_seen": 6671040512, + "step": 3181 + }, + { + "epoch": 0.21209655546514783, + "grad_norm": 17.55137062072754, + "learning_rate": 1.4539335783631346e-07, + "loss": 0.3897, + "num_input_tokens_seen": 6673137664, + "step": 3182 + }, + { + "epoch": 0.21236777868185516, + "grad_norm": 14.304892539978027, + "learning_rate": 1.4521683024008687e-07, + "loss": 0.4131, + "num_input_tokens_seen": 6675234816, + "step": 3183 + }, + { + "epoch": 0.21263900189856252, + "grad_norm": 17.37781524658203, + "learning_rate": 1.450406283979521e-07, + "loss": 0.4752, + "num_input_tokens_seen": 6677331968, + "step": 3184 + }, + { + "epoch": 0.21291022511526986, + "grad_norm": 11.948087692260742, + "learning_rate": 1.4486475245170977e-07, + "loss": 0.2446, + "num_input_tokens_seen": 6679429120, + "step": 3185 + }, + { + "epoch": 0.21318144833197722, + "grad_norm": 17.137893676757812, + "learning_rate": 1.446892025428981e-07, + "loss": 0.3559, + "num_input_tokens_seen": 6681526272, + "step": 3186 + }, + { + "epoch": 0.21345267154868458, + "grad_norm": 8.884380340576172, + "learning_rate": 1.4451397881279298e-07, + "loss": 0.1706, + "num_input_tokens_seen": 6683623424, + "step": 3187 + }, + { + "epoch": 0.21372389476539191, + "grad_norm": 16.802011489868164, + "learning_rate": 1.4433908140240792e-07, + "loss": 0.3805, + "num_input_tokens_seen": 6685720576, + "step": 3188 + }, + { + "epoch": 0.21399511798209928, + "grad_norm": 20.294599533081055, + "learning_rate": 1.4416451045249354e-07, + "loss": 0.3829, + "num_input_tokens_seen": 6687817728, + "step": 3189 + }, + { + "epoch": 0.2142663411988066, + "grad_norm": 16.205463409423828, + "learning_rate": 1.43990266103538e-07, + "loss": 0.3615, + "num_input_tokens_seen": 6689914880, + "step": 3190 + }, + { + "epoch": 0.21453756441551397, + "grad_norm": 17.31773567199707, + "learning_rate": 1.4381634849576644e-07, + "loss": 0.3264, + "num_input_tokens_seen": 6692012032, + "step": 3191 + }, + { + "epoch": 0.21480878763222133, + "grad_norm": 13.5274658203125, + "learning_rate": 1.4364275776914112e-07, + "loss": 0.3105, + "num_input_tokens_seen": 6694109184, + "step": 3192 + }, + { + "epoch": 0.21508001084892867, + "grad_norm": 12.372538566589355, + "learning_rate": 1.4346949406336136e-07, + "loss": 0.3113, + "num_input_tokens_seen": 6696206336, + "step": 3193 + }, + { + "epoch": 0.21535123406563603, + "grad_norm": 12.951509475708008, + "learning_rate": 1.4329655751786318e-07, + "loss": 0.2954, + "num_input_tokens_seen": 6698303488, + "step": 3194 + }, + { + "epoch": 0.21562245728234336, + "grad_norm": 16.507877349853516, + "learning_rate": 1.4312394827181925e-07, + "loss": 0.4798, + "num_input_tokens_seen": 6700400640, + "step": 3195 + }, + { + "epoch": 0.21589368049905072, + "grad_norm": 15.880234718322754, + "learning_rate": 1.4295166646413898e-07, + "loss": 0.3777, + "num_input_tokens_seen": 6702497792, + "step": 3196 + }, + { + "epoch": 0.21616490371575806, + "grad_norm": 15.432538986206055, + "learning_rate": 1.4277971223346825e-07, + "loss": 0.3242, + "num_input_tokens_seen": 6704594944, + "step": 3197 + }, + { + "epoch": 0.21643612693246542, + "grad_norm": 18.914257049560547, + "learning_rate": 1.4260808571818917e-07, + "loss": 0.4032, + "num_input_tokens_seen": 6706692096, + "step": 3198 + }, + { + "epoch": 0.21670735014917278, + "grad_norm": 17.466224670410156, + "learning_rate": 1.4243678705642027e-07, + "loss": 0.3723, + "num_input_tokens_seen": 6708789248, + "step": 3199 + }, + { + "epoch": 0.2169785733658801, + "grad_norm": 19.855295181274414, + "learning_rate": 1.4226581638601635e-07, + "loss": 0.4615, + "num_input_tokens_seen": 6710886400, + "step": 3200 + }, + { + "epoch": 0.00027122321670735016, + "grad_norm": 16.647418975830078, + "learning_rate": 1.4209517384456786e-07, + "loss": 0.4811, + "num_input_tokens_seen": 6712983552, + "step": 3201 + }, + { + "epoch": 0.0005424464334147003, + "grad_norm": 10.773716926574707, + "learning_rate": 1.4192485956940153e-07, + "loss": 0.2869, + "num_input_tokens_seen": 6715080704, + "step": 3202 + }, + { + "epoch": 0.0008136696501220504, + "grad_norm": 10.768502235412598, + "learning_rate": 1.4175487369757973e-07, + "loss": 0.2319, + "num_input_tokens_seen": 6717177856, + "step": 3203 + }, + { + "epoch": 0.0010848928668294006, + "grad_norm": 20.341934204101562, + "learning_rate": 1.4158521636590079e-07, + "loss": 0.5739, + "num_input_tokens_seen": 6719275008, + "step": 3204 + }, + { + "epoch": 0.0013561160835367507, + "grad_norm": 21.294525146484375, + "learning_rate": 1.4141588771089838e-07, + "loss": 0.7022, + "num_input_tokens_seen": 6721372160, + "step": 3205 + }, + { + "epoch": 0.0016273393002441008, + "grad_norm": 12.000500679016113, + "learning_rate": 1.412468878688418e-07, + "loss": 0.2761, + "num_input_tokens_seen": 6723469312, + "step": 3206 + }, + { + "epoch": 0.001898562516951451, + "grad_norm": 11.831374168395996, + "learning_rate": 1.410782169757356e-07, + "loss": 0.2413, + "num_input_tokens_seen": 6725566464, + "step": 3207 + }, + { + "epoch": 0.0021697857336588013, + "grad_norm": 15.416332244873047, + "learning_rate": 1.4090987516731977e-07, + "loss": 0.4252, + "num_input_tokens_seen": 6727663616, + "step": 3208 + }, + { + "epoch": 0.0024410089503661514, + "grad_norm": 15.296663284301758, + "learning_rate": 1.4074186257906934e-07, + "loss": 0.3768, + "num_input_tokens_seen": 6729760768, + "step": 3209 + }, + { + "epoch": 0.0027122321670735015, + "grad_norm": 17.990812301635742, + "learning_rate": 1.4057417934619453e-07, + "loss": 0.5308, + "num_input_tokens_seen": 6731857920, + "step": 3210 + }, + { + "epoch": 0.0029834553837808516, + "grad_norm": 20.814661026000977, + "learning_rate": 1.404068256036403e-07, + "loss": 0.498, + "num_input_tokens_seen": 6733955072, + "step": 3211 + }, + { + "epoch": 0.0032546786004882017, + "grad_norm": 17.336063385009766, + "learning_rate": 1.4023980148608667e-07, + "loss": 0.363, + "num_input_tokens_seen": 6736052224, + "step": 3212 + }, + { + "epoch": 0.003525901817195552, + "grad_norm": 9.499157905578613, + "learning_rate": 1.4007310712794827e-07, + "loss": 0.1552, + "num_input_tokens_seen": 6738149376, + "step": 3213 + }, + { + "epoch": 0.003797125033902902, + "grad_norm": 22.045595169067383, + "learning_rate": 1.3990674266337442e-07, + "loss": 0.5238, + "num_input_tokens_seen": 6740246528, + "step": 3214 + }, + { + "epoch": 0.0040683482506102524, + "grad_norm": 12.480164527893066, + "learning_rate": 1.3974070822624883e-07, + "loss": 0.311, + "num_input_tokens_seen": 6742343680, + "step": 3215 + }, + { + "epoch": 0.0043395714673176026, + "grad_norm": 17.63022232055664, + "learning_rate": 1.3957500395018977e-07, + "loss": 0.508, + "num_input_tokens_seen": 6744440832, + "step": 3216 + }, + { + "epoch": 0.004610794684024953, + "grad_norm": 19.386255264282227, + "learning_rate": 1.394096299685497e-07, + "loss": 0.5619, + "num_input_tokens_seen": 6746537984, + "step": 3217 + }, + { + "epoch": 0.004882017900732303, + "grad_norm": 13.394591331481934, + "learning_rate": 1.3924458641441532e-07, + "loss": 0.371, + "num_input_tokens_seen": 6748635136, + "step": 3218 + }, + { + "epoch": 0.005153241117439653, + "grad_norm": 18.7851505279541, + "learning_rate": 1.3907987342060725e-07, + "loss": 0.5936, + "num_input_tokens_seen": 6750732288, + "step": 3219 + }, + { + "epoch": 0.005424464334147003, + "grad_norm": 20.079437255859375, + "learning_rate": 1.389154911196805e-07, + "loss": 0.5704, + "num_input_tokens_seen": 6752829440, + "step": 3220 + }, + { + "epoch": 0.005695687550854353, + "grad_norm": 11.2096586227417, + "learning_rate": 1.3875143964392355e-07, + "loss": 0.2784, + "num_input_tokens_seen": 6754926592, + "step": 3221 + }, + { + "epoch": 0.005966910767561703, + "grad_norm": 23.216882705688477, + "learning_rate": 1.3858771912535877e-07, + "loss": 0.8026, + "num_input_tokens_seen": 6757023744, + "step": 3222 + }, + { + "epoch": 0.006238133984269053, + "grad_norm": 10.511091232299805, + "learning_rate": 1.3842432969574238e-07, + "loss": 0.1695, + "num_input_tokens_seen": 6759120896, + "step": 3223 + }, + { + "epoch": 0.006509357200976403, + "grad_norm": 14.655221939086914, + "learning_rate": 1.382612714865637e-07, + "loss": 0.3329, + "num_input_tokens_seen": 6761218048, + "step": 3224 + }, + { + "epoch": 0.0067805804176837535, + "grad_norm": 18.49831771850586, + "learning_rate": 1.3809854462904596e-07, + "loss": 0.5104, + "num_input_tokens_seen": 6763315200, + "step": 3225 + }, + { + "epoch": 0.007051803634391104, + "grad_norm": 9.733176231384277, + "learning_rate": 1.3793614925414542e-07, + "loss": 0.1723, + "num_input_tokens_seen": 6765412352, + "step": 3226 + }, + { + "epoch": 0.007323026851098454, + "grad_norm": 16.505550384521484, + "learning_rate": 1.3777408549255182e-07, + "loss": 0.3004, + "num_input_tokens_seen": 6767509504, + "step": 3227 + }, + { + "epoch": 0.007594250067805804, + "grad_norm": 12.750898361206055, + "learning_rate": 1.3761235347468784e-07, + "loss": 0.1774, + "num_input_tokens_seen": 6769606656, + "step": 3228 + }, + { + "epoch": 0.007865473284513154, + "grad_norm": 11.027116775512695, + "learning_rate": 1.3745095333070926e-07, + "loss": 0.2317, + "num_input_tokens_seen": 6771703808, + "step": 3229 + }, + { + "epoch": 0.008136696501220505, + "grad_norm": 26.27062225341797, + "learning_rate": 1.3728988519050476e-07, + "loss": 0.9626, + "num_input_tokens_seen": 6773800960, + "step": 3230 + }, + { + "epoch": 0.008407919717927854, + "grad_norm": 15.877495765686035, + "learning_rate": 1.3712914918369595e-07, + "loss": 0.4503, + "num_input_tokens_seen": 6775898112, + "step": 3231 + }, + { + "epoch": 0.008679142934635205, + "grad_norm": 21.709930419921875, + "learning_rate": 1.369687454396369e-07, + "loss": 0.4794, + "num_input_tokens_seen": 6777995264, + "step": 3232 + }, + { + "epoch": 0.008950366151342554, + "grad_norm": 12.499728202819824, + "learning_rate": 1.3680867408741457e-07, + "loss": 0.2759, + "num_input_tokens_seen": 6780092416, + "step": 3233 + }, + { + "epoch": 0.009221589368049905, + "grad_norm": 22.60646629333496, + "learning_rate": 1.366489352558483e-07, + "loss": 0.5445, + "num_input_tokens_seen": 6782189568, + "step": 3234 + }, + { + "epoch": 0.009492812584757255, + "grad_norm": 17.723960876464844, + "learning_rate": 1.3648952907348957e-07, + "loss": 0.4128, + "num_input_tokens_seen": 6784286720, + "step": 3235 + }, + { + "epoch": 0.009764035801464606, + "grad_norm": 16.3974552154541, + "learning_rate": 1.3633045566862268e-07, + "loss": 0.4623, + "num_input_tokens_seen": 6786383872, + "step": 3236 + }, + { + "epoch": 0.010035259018171955, + "grad_norm": 13.922554016113281, + "learning_rate": 1.3617171516926382e-07, + "loss": 0.3435, + "num_input_tokens_seen": 6788481024, + "step": 3237 + }, + { + "epoch": 0.010306482234879306, + "grad_norm": 10.876972198486328, + "learning_rate": 1.3601330770316122e-07, + "loss": 0.2045, + "num_input_tokens_seen": 6790578176, + "step": 3238 + }, + { + "epoch": 0.010577705451586655, + "grad_norm": 17.26404571533203, + "learning_rate": 1.3585523339779523e-07, + "loss": 0.3805, + "num_input_tokens_seen": 6792675328, + "step": 3239 + }, + { + "epoch": 0.010848928668294006, + "grad_norm": 24.898168563842773, + "learning_rate": 1.356974923803781e-07, + "loss": 0.8073, + "num_input_tokens_seen": 6794772480, + "step": 3240 + }, + { + "epoch": 0.011120151885001357, + "grad_norm": 16.236896514892578, + "learning_rate": 1.3554008477785367e-07, + "loss": 0.4327, + "num_input_tokens_seen": 6796869632, + "step": 3241 + }, + { + "epoch": 0.011391375101708706, + "grad_norm": 18.53495216369629, + "learning_rate": 1.3538301071689756e-07, + "loss": 0.3856, + "num_input_tokens_seen": 6798966784, + "step": 3242 + }, + { + "epoch": 0.011662598318416057, + "grad_norm": 18.698938369750977, + "learning_rate": 1.3522627032391715e-07, + "loss": 0.4498, + "num_input_tokens_seen": 6801063936, + "step": 3243 + }, + { + "epoch": 0.011933821535123406, + "grad_norm": 14.774138450622559, + "learning_rate": 1.3506986372505098e-07, + "loss": 0.3556, + "num_input_tokens_seen": 6803161088, + "step": 3244 + }, + { + "epoch": 0.012205044751830757, + "grad_norm": 22.23430633544922, + "learning_rate": 1.3491379104616938e-07, + "loss": 0.5173, + "num_input_tokens_seen": 6805258240, + "step": 3245 + }, + { + "epoch": 0.012476267968538107, + "grad_norm": 12.263484954833984, + "learning_rate": 1.3475805241287345e-07, + "loss": 0.2426, + "num_input_tokens_seen": 6807355392, + "step": 3246 + }, + { + "epoch": 0.012747491185245458, + "grad_norm": 22.76350212097168, + "learning_rate": 1.3460264795049577e-07, + "loss": 0.6469, + "num_input_tokens_seen": 6809452544, + "step": 3247 + }, + { + "epoch": 0.013018714401952807, + "grad_norm": 16.658710479736328, + "learning_rate": 1.344475777841e-07, + "loss": 0.3359, + "num_input_tokens_seen": 6811549696, + "step": 3248 + }, + { + "epoch": 0.013289937618660158, + "grad_norm": 12.79448127746582, + "learning_rate": 1.3429284203848074e-07, + "loss": 0.2698, + "num_input_tokens_seen": 6813646848, + "step": 3249 + }, + { + "epoch": 0.013561160835367507, + "grad_norm": 31.765995025634766, + "learning_rate": 1.3413844083816334e-07, + "loss": 1.1975, + "num_input_tokens_seen": 6815744000, + "step": 3250 + }, + { + "epoch": 0.013832384052074858, + "grad_norm": 17.021753311157227, + "learning_rate": 1.3398437430740403e-07, + "loss": 0.3363, + "num_input_tokens_seen": 6817841152, + "step": 3251 + }, + { + "epoch": 0.014103607268782207, + "grad_norm": 14.463712692260742, + "learning_rate": 1.3383064257018978e-07, + "loss": 0.3068, + "num_input_tokens_seen": 6819938304, + "step": 3252 + }, + { + "epoch": 0.014374830485489558, + "grad_norm": 16.731229782104492, + "learning_rate": 1.3367724575023786e-07, + "loss": 0.2803, + "num_input_tokens_seen": 6822035456, + "step": 3253 + }, + { + "epoch": 0.014646053702196907, + "grad_norm": 22.050737380981445, + "learning_rate": 1.3352418397099638e-07, + "loss": 0.6333, + "num_input_tokens_seen": 6824132608, + "step": 3254 + }, + { + "epoch": 0.014917276918904258, + "grad_norm": 25.376235961914062, + "learning_rate": 1.333714573556435e-07, + "loss": 0.4512, + "num_input_tokens_seen": 6826229760, + "step": 3255 + }, + { + "epoch": 0.015188500135611608, + "grad_norm": 16.40929412841797, + "learning_rate": 1.332190660270879e-07, + "loss": 0.4892, + "num_input_tokens_seen": 6828326912, + "step": 3256 + }, + { + "epoch": 0.015459723352318959, + "grad_norm": 17.429452896118164, + "learning_rate": 1.3306701010796832e-07, + "loss": 0.4432, + "num_input_tokens_seen": 6830424064, + "step": 3257 + }, + { + "epoch": 0.015730946569026308, + "grad_norm": 18.253334045410156, + "learning_rate": 1.3291528972065347e-07, + "loss": 0.5687, + "num_input_tokens_seen": 6832521216, + "step": 3258 + }, + { + "epoch": 0.01600216978573366, + "grad_norm": 15.493392944335938, + "learning_rate": 1.327639049872422e-07, + "loss": 0.3613, + "num_input_tokens_seen": 6834618368, + "step": 3259 + }, + { + "epoch": 0.01627339300244101, + "grad_norm": 17.719694137573242, + "learning_rate": 1.326128560295631e-07, + "loss": 0.3888, + "num_input_tokens_seen": 6836715520, + "step": 3260 + }, + { + "epoch": 0.01654461621914836, + "grad_norm": 17.160886764526367, + "learning_rate": 1.324621429691748e-07, + "loss": 0.4548, + "num_input_tokens_seen": 6838812672, + "step": 3261 + }, + { + "epoch": 0.016815839435855708, + "grad_norm": 19.88508415222168, + "learning_rate": 1.3231176592736528e-07, + "loss": 0.4992, + "num_input_tokens_seen": 6840909824, + "step": 3262 + }, + { + "epoch": 0.01708706265256306, + "grad_norm": 20.55489730834961, + "learning_rate": 1.3216172502515248e-07, + "loss": 0.5841, + "num_input_tokens_seen": 6843006976, + "step": 3263 + }, + { + "epoch": 0.01735828586927041, + "grad_norm": 19.093385696411133, + "learning_rate": 1.320120203832833e-07, + "loss": 0.5585, + "num_input_tokens_seen": 6845104128, + "step": 3264 + }, + { + "epoch": 0.01762950908597776, + "grad_norm": 22.638851165771484, + "learning_rate": 1.3186265212223459e-07, + "loss": 0.4794, + "num_input_tokens_seen": 6847201280, + "step": 3265 + }, + { + "epoch": 0.01790073230268511, + "grad_norm": 18.409488677978516, + "learning_rate": 1.3171362036221202e-07, + "loss": 0.4148, + "num_input_tokens_seen": 6849298432, + "step": 3266 + }, + { + "epoch": 0.01817195551939246, + "grad_norm": 25.409086227416992, + "learning_rate": 1.3156492522315088e-07, + "loss": 0.7068, + "num_input_tokens_seen": 6851395584, + "step": 3267 + }, + { + "epoch": 0.01844317873609981, + "grad_norm": 17.42267608642578, + "learning_rate": 1.314165668247153e-07, + "loss": 0.3872, + "num_input_tokens_seen": 6853492736, + "step": 3268 + }, + { + "epoch": 0.01871440195280716, + "grad_norm": 20.826908111572266, + "learning_rate": 1.3126854528629843e-07, + "loss": 0.6012, + "num_input_tokens_seen": 6855589888, + "step": 3269 + }, + { + "epoch": 0.01898562516951451, + "grad_norm": 27.992067337036133, + "learning_rate": 1.3112086072702238e-07, + "loss": 0.3769, + "num_input_tokens_seen": 6857687040, + "step": 3270 + }, + { + "epoch": 0.01925684838622186, + "grad_norm": 17.23652458190918, + "learning_rate": 1.3097351326573814e-07, + "loss": 0.4274, + "num_input_tokens_seen": 6859784192, + "step": 3271 + }, + { + "epoch": 0.01952807160292921, + "grad_norm": 27.117645263671875, + "learning_rate": 1.3082650302102524e-07, + "loss": 0.5901, + "num_input_tokens_seen": 6861881344, + "step": 3272 + }, + { + "epoch": 0.019799294819636562, + "grad_norm": 10.570221900939941, + "learning_rate": 1.30679830111192e-07, + "loss": 0.1871, + "num_input_tokens_seen": 6863978496, + "step": 3273 + }, + { + "epoch": 0.02007051803634391, + "grad_norm": 16.369604110717773, + "learning_rate": 1.3053349465427518e-07, + "loss": 0.4029, + "num_input_tokens_seen": 6866075648, + "step": 3274 + }, + { + "epoch": 0.02034174125305126, + "grad_norm": 16.295419692993164, + "learning_rate": 1.303874967680399e-07, + "loss": 0.3059, + "num_input_tokens_seen": 6868172800, + "step": 3275 + }, + { + "epoch": 0.02061296446975861, + "grad_norm": 13.2958345413208, + "learning_rate": 1.302418365699798e-07, + "loss": 0.2834, + "num_input_tokens_seen": 6870269952, + "step": 3276 + }, + { + "epoch": 0.020884187686465962, + "grad_norm": 24.707290649414062, + "learning_rate": 1.300965141773167e-07, + "loss": 0.8627, + "num_input_tokens_seen": 6872367104, + "step": 3277 + }, + { + "epoch": 0.02115541090317331, + "grad_norm": 13.416788101196289, + "learning_rate": 1.2995152970700044e-07, + "loss": 0.2876, + "num_input_tokens_seen": 6874464256, + "step": 3278 + }, + { + "epoch": 0.02142663411988066, + "grad_norm": 18.29564666748047, + "learning_rate": 1.2980688327570906e-07, + "loss": 0.5197, + "num_input_tokens_seen": 6876561408, + "step": 3279 + }, + { + "epoch": 0.021697857336588012, + "grad_norm": 14.38173770904541, + "learning_rate": 1.296625749998485e-07, + "loss": 0.336, + "num_input_tokens_seen": 6878658560, + "step": 3280 + }, + { + "epoch": 0.021969080553295363, + "grad_norm": 11.88624382019043, + "learning_rate": 1.2951860499555253e-07, + "loss": 0.228, + "num_input_tokens_seen": 6880755712, + "step": 3281 + }, + { + "epoch": 0.022240303770002714, + "grad_norm": 20.323486328125, + "learning_rate": 1.2937497337868274e-07, + "loss": 0.5354, + "num_input_tokens_seen": 6882852864, + "step": 3282 + }, + { + "epoch": 0.02251152698671006, + "grad_norm": 18.997339248657227, + "learning_rate": 1.2923168026482843e-07, + "loss": 0.4723, + "num_input_tokens_seen": 6884950016, + "step": 3283 + }, + { + "epoch": 0.022782750203417412, + "grad_norm": 16.928192138671875, + "learning_rate": 1.290887257693064e-07, + "loss": 0.3251, + "num_input_tokens_seen": 6887047168, + "step": 3284 + }, + { + "epoch": 0.023053973420124763, + "grad_norm": 18.173126220703125, + "learning_rate": 1.2894611000716103e-07, + "loss": 0.4975, + "num_input_tokens_seen": 6889144320, + "step": 3285 + }, + { + "epoch": 0.023325196636832114, + "grad_norm": 17.16127586364746, + "learning_rate": 1.28803833093164e-07, + "loss": 0.4848, + "num_input_tokens_seen": 6891241472, + "step": 3286 + }, + { + "epoch": 0.023596419853539462, + "grad_norm": 14.565631866455078, + "learning_rate": 1.286618951418144e-07, + "loss": 0.299, + "num_input_tokens_seen": 6893338624, + "step": 3287 + }, + { + "epoch": 0.023867643070246813, + "grad_norm": 16.098228454589844, + "learning_rate": 1.285202962673385e-07, + "loss": 0.4612, + "num_input_tokens_seen": 6895435776, + "step": 3288 + }, + { + "epoch": 0.024138866286954164, + "grad_norm": 19.208843231201172, + "learning_rate": 1.2837903658368965e-07, + "loss": 0.5771, + "num_input_tokens_seen": 6897532928, + "step": 3289 + }, + { + "epoch": 0.024410089503661515, + "grad_norm": 14.503118515014648, + "learning_rate": 1.2823811620454833e-07, + "loss": 0.309, + "num_input_tokens_seen": 6899630080, + "step": 3290 + }, + { + "epoch": 0.024681312720368862, + "grad_norm": 18.937856674194336, + "learning_rate": 1.2809753524332188e-07, + "loss": 0.5335, + "num_input_tokens_seen": 6901727232, + "step": 3291 + }, + { + "epoch": 0.024952535937076213, + "grad_norm": 20.493385314941406, + "learning_rate": 1.2795729381314434e-07, + "loss": 0.5826, + "num_input_tokens_seen": 6903824384, + "step": 3292 + }, + { + "epoch": 0.025223759153783564, + "grad_norm": 18.82896614074707, + "learning_rate": 1.2781739202687694e-07, + "loss": 0.5127, + "num_input_tokens_seen": 6905921536, + "step": 3293 + }, + { + "epoch": 0.025494982370490915, + "grad_norm": 22.182172775268555, + "learning_rate": 1.2767782999710715e-07, + "loss": 0.7663, + "num_input_tokens_seen": 6908018688, + "step": 3294 + }, + { + "epoch": 0.025766205587198263, + "grad_norm": 21.46929359436035, + "learning_rate": 1.2753860783614925e-07, + "loss": 0.6303, + "num_input_tokens_seen": 6910115840, + "step": 3295 + }, + { + "epoch": 0.026037428803905614, + "grad_norm": 16.43645668029785, + "learning_rate": 1.2739972565604396e-07, + "loss": 0.3882, + "num_input_tokens_seen": 6912212992, + "step": 3296 + }, + { + "epoch": 0.026308652020612965, + "grad_norm": 19.76049041748047, + "learning_rate": 1.2726118356855834e-07, + "loss": 0.5901, + "num_input_tokens_seen": 6914310144, + "step": 3297 + }, + { + "epoch": 0.026579875237320316, + "grad_norm": 18.00474739074707, + "learning_rate": 1.271229816851857e-07, + "loss": 0.4005, + "num_input_tokens_seen": 6916407296, + "step": 3298 + }, + { + "epoch": 0.026851098454027666, + "grad_norm": 19.077592849731445, + "learning_rate": 1.2698512011714578e-07, + "loss": 0.3887, + "num_input_tokens_seen": 6918504448, + "step": 3299 + }, + { + "epoch": 0.027122321670735014, + "grad_norm": 20.036828994750977, + "learning_rate": 1.2684759897538422e-07, + "loss": 0.4026, + "num_input_tokens_seen": 6920601600, + "step": 3300 + }, + { + "epoch": 0.027393544887442365, + "grad_norm": 19.472291946411133, + "learning_rate": 1.2671041837057287e-07, + "loss": 0.5011, + "num_input_tokens_seen": 6922698752, + "step": 3301 + }, + { + "epoch": 0.027664768104149716, + "grad_norm": 21.726613998413086, + "learning_rate": 1.2657357841310941e-07, + "loss": 0.4298, + "num_input_tokens_seen": 6924795904, + "step": 3302 + }, + { + "epoch": 0.027935991320857067, + "grad_norm": 14.84683895111084, + "learning_rate": 1.2643707921311765e-07, + "loss": 0.3162, + "num_input_tokens_seen": 6926893056, + "step": 3303 + }, + { + "epoch": 0.028207214537564414, + "grad_norm": 12.949082374572754, + "learning_rate": 1.2630092088044664e-07, + "loss": 0.2217, + "num_input_tokens_seen": 6928990208, + "step": 3304 + }, + { + "epoch": 0.028478437754271765, + "grad_norm": 15.199170112609863, + "learning_rate": 1.2616510352467158e-07, + "loss": 0.335, + "num_input_tokens_seen": 6931087360, + "step": 3305 + }, + { + "epoch": 0.028749660970979116, + "grad_norm": 24.491796493530273, + "learning_rate": 1.2602962725509307e-07, + "loss": 0.7631, + "num_input_tokens_seen": 6933184512, + "step": 3306 + }, + { + "epoch": 0.029020884187686467, + "grad_norm": 18.071081161499023, + "learning_rate": 1.2589449218073728e-07, + "loss": 0.4094, + "num_input_tokens_seen": 6935281664, + "step": 3307 + }, + { + "epoch": 0.029292107404393815, + "grad_norm": 13.082253456115723, + "learning_rate": 1.2575969841035578e-07, + "loss": 0.3208, + "num_input_tokens_seen": 6937378816, + "step": 3308 + }, + { + "epoch": 0.029563330621101166, + "grad_norm": 20.97207260131836, + "learning_rate": 1.256252460524255e-07, + "loss": 0.6206, + "num_input_tokens_seen": 6939475968, + "step": 3309 + }, + { + "epoch": 0.029834553837808517, + "grad_norm": 20.7148494720459, + "learning_rate": 1.2549113521514844e-07, + "loss": 0.6038, + "num_input_tokens_seen": 6941573120, + "step": 3310 + }, + { + "epoch": 0.030105777054515868, + "grad_norm": 13.58342170715332, + "learning_rate": 1.2535736600645212e-07, + "loss": 0.3072, + "num_input_tokens_seen": 6943670272, + "step": 3311 + }, + { + "epoch": 0.030377000271223215, + "grad_norm": 18.122154235839844, + "learning_rate": 1.2522393853398868e-07, + "loss": 0.4495, + "num_input_tokens_seen": 6945767424, + "step": 3312 + }, + { + "epoch": 0.030648223487930566, + "grad_norm": 11.082249641418457, + "learning_rate": 1.2509085290513563e-07, + "loss": 0.1881, + "num_input_tokens_seen": 6947864576, + "step": 3313 + }, + { + "epoch": 0.030919446704637917, + "grad_norm": 17.34503173828125, + "learning_rate": 1.2495810922699517e-07, + "loss": 0.3816, + "num_input_tokens_seen": 6949961728, + "step": 3314 + }, + { + "epoch": 0.031190669921345268, + "grad_norm": 13.956634521484375, + "learning_rate": 1.2482570760639439e-07, + "loss": 0.3201, + "num_input_tokens_seen": 6952058880, + "step": 3315 + }, + { + "epoch": 0.031461893138052616, + "grad_norm": 13.51434326171875, + "learning_rate": 1.24693648149885e-07, + "loss": 0.2497, + "num_input_tokens_seen": 6954156032, + "step": 3316 + }, + { + "epoch": 0.03173311635475997, + "grad_norm": 16.2774600982666, + "learning_rate": 1.2456193096374338e-07, + "loss": 0.2518, + "num_input_tokens_seen": 6956253184, + "step": 3317 + }, + { + "epoch": 0.03200433957146732, + "grad_norm": 21.98696517944336, + "learning_rate": 1.244305561539707e-07, + "loss": 0.5708, + "num_input_tokens_seen": 6958350336, + "step": 3318 + }, + { + "epoch": 0.032275562788174665, + "grad_norm": 20.506793975830078, + "learning_rate": 1.242995238262923e-07, + "loss": 0.4993, + "num_input_tokens_seen": 6960447488, + "step": 3319 + }, + { + "epoch": 0.03254678600488202, + "grad_norm": 14.927400588989258, + "learning_rate": 1.2416883408615809e-07, + "loss": 0.4024, + "num_input_tokens_seen": 6962544640, + "step": 3320 + }, + { + "epoch": 0.03281800922158937, + "grad_norm": 16.304428100585938, + "learning_rate": 1.2403848703874202e-07, + "loss": 0.3532, + "num_input_tokens_seen": 6964641792, + "step": 3321 + }, + { + "epoch": 0.03308923243829672, + "grad_norm": 15.736098289489746, + "learning_rate": 1.2390848278894264e-07, + "loss": 0.447, + "num_input_tokens_seen": 6966738944, + "step": 3322 + }, + { + "epoch": 0.03336045565500407, + "grad_norm": 20.3814697265625, + "learning_rate": 1.2377882144138222e-07, + "loss": 0.4226, + "num_input_tokens_seen": 6968836096, + "step": 3323 + }, + { + "epoch": 0.033631678871711417, + "grad_norm": 20.88442039489746, + "learning_rate": 1.2364950310040744e-07, + "loss": 0.5159, + "num_input_tokens_seen": 6970933248, + "step": 3324 + }, + { + "epoch": 0.03390290208841877, + "grad_norm": 13.660783767700195, + "learning_rate": 1.235205278700887e-07, + "loss": 0.2405, + "num_input_tokens_seen": 6973030400, + "step": 3325 + }, + { + "epoch": 0.03417412530512612, + "grad_norm": 22.85435676574707, + "learning_rate": 1.233918958542204e-07, + "loss": 0.7742, + "num_input_tokens_seen": 6975127552, + "step": 3326 + }, + { + "epoch": 0.034445348521833466, + "grad_norm": 13.575337409973145, + "learning_rate": 1.2326360715632069e-07, + "loss": 0.3336, + "num_input_tokens_seen": 6977224704, + "step": 3327 + }, + { + "epoch": 0.03471657173854082, + "grad_norm": 24.44587516784668, + "learning_rate": 1.231356618796314e-07, + "loss": 0.6159, + "num_input_tokens_seen": 6979321856, + "step": 3328 + }, + { + "epoch": 0.03498779495524817, + "grad_norm": 23.5541934967041, + "learning_rate": 1.2300806012711798e-07, + "loss": 0.659, + "num_input_tokens_seen": 6981419008, + "step": 3329 + }, + { + "epoch": 0.03525901817195552, + "grad_norm": 9.155745506286621, + "learning_rate": 1.228808020014696e-07, + "loss": 0.1895, + "num_input_tokens_seen": 6983516160, + "step": 3330 + }, + { + "epoch": 0.03553024138866287, + "grad_norm": 16.62866973876953, + "learning_rate": 1.227538876050987e-07, + "loss": 0.4427, + "num_input_tokens_seen": 6985613312, + "step": 3331 + }, + { + "epoch": 0.03580146460537022, + "grad_norm": 15.631593704223633, + "learning_rate": 1.226273170401411e-07, + "loss": 0.357, + "num_input_tokens_seen": 6987710464, + "step": 3332 + }, + { + "epoch": 0.03607268782207757, + "grad_norm": 20.449111938476562, + "learning_rate": 1.2250109040845589e-07, + "loss": 0.4486, + "num_input_tokens_seen": 6989807616, + "step": 3333 + }, + { + "epoch": 0.03634391103878492, + "grad_norm": 19.299829483032227, + "learning_rate": 1.2237520781162567e-07, + "loss": 0.2722, + "num_input_tokens_seen": 6991904768, + "step": 3334 + }, + { + "epoch": 0.03661513425549227, + "grad_norm": 17.439783096313477, + "learning_rate": 1.222496693509559e-07, + "loss": 0.4216, + "num_input_tokens_seen": 6994001920, + "step": 3335 + }, + { + "epoch": 0.03688635747219962, + "grad_norm": 16.625341415405273, + "learning_rate": 1.2212447512747506e-07, + "loss": 0.3902, + "num_input_tokens_seen": 6996099072, + "step": 3336 + }, + { + "epoch": 0.03715758068890697, + "grad_norm": 12.145569801330566, + "learning_rate": 1.2199962524193492e-07, + "loss": 0.2277, + "num_input_tokens_seen": 6998196224, + "step": 3337 + }, + { + "epoch": 0.03742880390561432, + "grad_norm": 19.74405860900879, + "learning_rate": 1.218751197948096e-07, + "loss": 0.4426, + "num_input_tokens_seen": 7000293376, + "step": 3338 + }, + { + "epoch": 0.03770002712232167, + "grad_norm": 20.95228385925293, + "learning_rate": 1.2175095888629657e-07, + "loss": 0.5911, + "num_input_tokens_seen": 7002390528, + "step": 3339 + }, + { + "epoch": 0.03797125033902902, + "grad_norm": 23.399641036987305, + "learning_rate": 1.2162714261631575e-07, + "loss": 0.6537, + "num_input_tokens_seen": 7004487680, + "step": 3340 + }, + { + "epoch": 0.03824247355573637, + "grad_norm": 11.986372947692871, + "learning_rate": 1.215036710845097e-07, + "loss": 0.2545, + "num_input_tokens_seen": 7006584832, + "step": 3341 + }, + { + "epoch": 0.03851369677244372, + "grad_norm": 16.912973403930664, + "learning_rate": 1.213805443902437e-07, + "loss": 0.386, + "num_input_tokens_seen": 7008681984, + "step": 3342 + }, + { + "epoch": 0.038784919989151075, + "grad_norm": 18.482542037963867, + "learning_rate": 1.212577626326054e-07, + "loss": 0.3066, + "num_input_tokens_seen": 7010779136, + "step": 3343 + }, + { + "epoch": 0.03905614320585842, + "grad_norm": 16.53171157836914, + "learning_rate": 1.2113532591040488e-07, + "loss": 0.3742, + "num_input_tokens_seen": 7012876288, + "step": 3344 + }, + { + "epoch": 0.03932736642256577, + "grad_norm": 18.05655860900879, + "learning_rate": 1.2101323432217454e-07, + "loss": 0.3583, + "num_input_tokens_seen": 7014973440, + "step": 3345 + }, + { + "epoch": 0.039598589639273124, + "grad_norm": 16.21353530883789, + "learning_rate": 1.208914879661691e-07, + "loss": 0.3726, + "num_input_tokens_seen": 7017070592, + "step": 3346 + }, + { + "epoch": 0.03986981285598047, + "grad_norm": 11.822895050048828, + "learning_rate": 1.2077008694036527e-07, + "loss": 0.2253, + "num_input_tokens_seen": 7019167744, + "step": 3347 + }, + { + "epoch": 0.04014103607268782, + "grad_norm": 19.47376823425293, + "learning_rate": 1.2064903134246221e-07, + "loss": 0.4391, + "num_input_tokens_seen": 7021264896, + "step": 3348 + }, + { + "epoch": 0.040412259289395173, + "grad_norm": 18.84421730041504, + "learning_rate": 1.2052832126988053e-07, + "loss": 0.6866, + "num_input_tokens_seen": 7023362048, + "step": 3349 + }, + { + "epoch": 0.04068348250610252, + "grad_norm": 18.107643127441406, + "learning_rate": 1.2040795681976338e-07, + "loss": 0.5646, + "num_input_tokens_seen": 7025459200, + "step": 3350 + }, + { + "epoch": 0.040954705722809875, + "grad_norm": 18.47949981689453, + "learning_rate": 1.2028793808897537e-07, + "loss": 0.5655, + "num_input_tokens_seen": 7027556352, + "step": 3351 + }, + { + "epoch": 0.04122592893951722, + "grad_norm": 18.35427474975586, + "learning_rate": 1.20168265174103e-07, + "loss": 0.3471, + "num_input_tokens_seen": 7029653504, + "step": 3352 + }, + { + "epoch": 0.04149715215622457, + "grad_norm": 13.998270034790039, + "learning_rate": 1.2004893817145456e-07, + "loss": 0.2921, + "num_input_tokens_seen": 7031750656, + "step": 3353 + }, + { + "epoch": 0.041768375372931925, + "grad_norm": 26.075870513916016, + "learning_rate": 1.1992995717705982e-07, + "loss": 0.7065, + "num_input_tokens_seen": 7033847808, + "step": 3354 + }, + { + "epoch": 0.04203959858963927, + "grad_norm": 14.8284912109375, + "learning_rate": 1.198113222866702e-07, + "loss": 0.2938, + "num_input_tokens_seen": 7035944960, + "step": 3355 + }, + { + "epoch": 0.04231082180634662, + "grad_norm": 15.237571716308594, + "learning_rate": 1.1969303359575845e-07, + "loss": 0.3828, + "num_input_tokens_seen": 7038042112, + "step": 3356 + }, + { + "epoch": 0.042582045023053974, + "grad_norm": 14.325973510742188, + "learning_rate": 1.1957509119951885e-07, + "loss": 0.3513, + "num_input_tokens_seen": 7040139264, + "step": 3357 + }, + { + "epoch": 0.04285326823976132, + "grad_norm": 14.475112915039062, + "learning_rate": 1.1945749519286694e-07, + "loss": 0.277, + "num_input_tokens_seen": 7042236416, + "step": 3358 + }, + { + "epoch": 0.043124491456468676, + "grad_norm": 16.315767288208008, + "learning_rate": 1.193402456704396e-07, + "loss": 0.383, + "num_input_tokens_seen": 7044333568, + "step": 3359 + }, + { + "epoch": 0.043395714673176024, + "grad_norm": 19.562829971313477, + "learning_rate": 1.1922334272659474e-07, + "loss": 0.4815, + "num_input_tokens_seen": 7046430720, + "step": 3360 + }, + { + "epoch": 0.04366693788988337, + "grad_norm": 19.108549118041992, + "learning_rate": 1.1910678645541137e-07, + "loss": 0.4433, + "num_input_tokens_seen": 7048527872, + "step": 3361 + }, + { + "epoch": 0.043938161106590726, + "grad_norm": 12.979419708251953, + "learning_rate": 1.1899057695068954e-07, + "loss": 0.2735, + "num_input_tokens_seen": 7050625024, + "step": 3362 + }, + { + "epoch": 0.04420938432329807, + "grad_norm": 20.514404296875, + "learning_rate": 1.1887471430595028e-07, + "loss": 0.5001, + "num_input_tokens_seen": 7052722176, + "step": 3363 + }, + { + "epoch": 0.04448060754000543, + "grad_norm": 20.933198928833008, + "learning_rate": 1.187591986144354e-07, + "loss": 0.492, + "num_input_tokens_seen": 7054819328, + "step": 3364 + }, + { + "epoch": 0.044751830756712775, + "grad_norm": 21.284048080444336, + "learning_rate": 1.1864402996910748e-07, + "loss": 0.6521, + "num_input_tokens_seen": 7056916480, + "step": 3365 + }, + { + "epoch": 0.04502305397342012, + "grad_norm": 30.391963958740234, + "learning_rate": 1.1852920846265002e-07, + "loss": 0.4218, + "num_input_tokens_seen": 7059013632, + "step": 3366 + }, + { + "epoch": 0.04529427719012748, + "grad_norm": 20.589599609375, + "learning_rate": 1.1841473418746685e-07, + "loss": 0.5948, + "num_input_tokens_seen": 7061110784, + "step": 3367 + }, + { + "epoch": 0.045565500406834825, + "grad_norm": 16.451263427734375, + "learning_rate": 1.1830060723568256e-07, + "loss": 0.3326, + "num_input_tokens_seen": 7063207936, + "step": 3368 + }, + { + "epoch": 0.04583672362354217, + "grad_norm": 18.889476776123047, + "learning_rate": 1.1818682769914226e-07, + "loss": 0.4692, + "num_input_tokens_seen": 7065305088, + "step": 3369 + }, + { + "epoch": 0.04610794684024953, + "grad_norm": 22.5378475189209, + "learning_rate": 1.1807339566941123e-07, + "loss": 0.5904, + "num_input_tokens_seen": 7067402240, + "step": 3370 + }, + { + "epoch": 0.046379170056956874, + "grad_norm": 18.752378463745117, + "learning_rate": 1.1796031123777546e-07, + "loss": 0.668, + "num_input_tokens_seen": 7069499392, + "step": 3371 + }, + { + "epoch": 0.04665039327366423, + "grad_norm": 14.500882148742676, + "learning_rate": 1.178475744952408e-07, + "loss": 0.3436, + "num_input_tokens_seen": 7071596544, + "step": 3372 + }, + { + "epoch": 0.046921616490371576, + "grad_norm": 16.3090877532959, + "learning_rate": 1.1773518553253352e-07, + "loss": 0.3595, + "num_input_tokens_seen": 7073693696, + "step": 3373 + }, + { + "epoch": 0.047192839707078924, + "grad_norm": 20.062055587768555, + "learning_rate": 1.1762314444010002e-07, + "loss": 0.4885, + "num_input_tokens_seen": 7075790848, + "step": 3374 + }, + { + "epoch": 0.04746406292378628, + "grad_norm": 13.69767951965332, + "learning_rate": 1.1751145130810675e-07, + "loss": 0.1721, + "num_input_tokens_seen": 7077888000, + "step": 3375 + }, + { + "epoch": 0.047735286140493625, + "grad_norm": 15.124690055847168, + "learning_rate": 1.1740010622644008e-07, + "loss": 0.3988, + "num_input_tokens_seen": 7079985152, + "step": 3376 + }, + { + "epoch": 0.04800650935720097, + "grad_norm": 22.62101936340332, + "learning_rate": 1.1728910928470629e-07, + "loss": 0.53, + "num_input_tokens_seen": 7082082304, + "step": 3377 + }, + { + "epoch": 0.04827773257390833, + "grad_norm": 12.13560962677002, + "learning_rate": 1.1717846057223143e-07, + "loss": 0.2442, + "num_input_tokens_seen": 7084179456, + "step": 3378 + }, + { + "epoch": 0.048548955790615675, + "grad_norm": 22.917356491088867, + "learning_rate": 1.1706816017806142e-07, + "loss": 0.548, + "num_input_tokens_seen": 7086276608, + "step": 3379 + }, + { + "epoch": 0.04882017900732303, + "grad_norm": 26.45726776123047, + "learning_rate": 1.1695820819096186e-07, + "loss": 0.4848, + "num_input_tokens_seen": 7088373760, + "step": 3380 + }, + { + "epoch": 0.04909140222403038, + "grad_norm": 25.90103530883789, + "learning_rate": 1.1684860469941785e-07, + "loss": 0.8083, + "num_input_tokens_seen": 7090470912, + "step": 3381 + }, + { + "epoch": 0.049362625440737724, + "grad_norm": 22.642337799072266, + "learning_rate": 1.1673934979163417e-07, + "loss": 0.5493, + "num_input_tokens_seen": 7092568064, + "step": 3382 + }, + { + "epoch": 0.04963384865744508, + "grad_norm": 11.539298057556152, + "learning_rate": 1.1663044355553493e-07, + "loss": 0.2542, + "num_input_tokens_seen": 7094665216, + "step": 3383 + }, + { + "epoch": 0.049905071874152426, + "grad_norm": 19.434553146362305, + "learning_rate": 1.1652188607876379e-07, + "loss": 0.4264, + "num_input_tokens_seen": 7096762368, + "step": 3384 + }, + { + "epoch": 0.05017629509085978, + "grad_norm": 28.15933609008789, + "learning_rate": 1.1641367744868372e-07, + "loss": 0.8053, + "num_input_tokens_seen": 7098859520, + "step": 3385 + }, + { + "epoch": 0.05044751830756713, + "grad_norm": 23.65896224975586, + "learning_rate": 1.1630581775237681e-07, + "loss": 0.3073, + "num_input_tokens_seen": 7100956672, + "step": 3386 + }, + { + "epoch": 0.050718741524274476, + "grad_norm": 14.19971752166748, + "learning_rate": 1.1619830707664446e-07, + "loss": 0.2546, + "num_input_tokens_seen": 7103053824, + "step": 3387 + }, + { + "epoch": 0.05098996474098183, + "grad_norm": 17.83209800720215, + "learning_rate": 1.160911455080073e-07, + "loss": 0.3369, + "num_input_tokens_seen": 7105150976, + "step": 3388 + }, + { + "epoch": 0.05126118795768918, + "grad_norm": 21.761837005615234, + "learning_rate": 1.1598433313270472e-07, + "loss": 0.5246, + "num_input_tokens_seen": 7107248128, + "step": 3389 + }, + { + "epoch": 0.051532411174396525, + "grad_norm": 26.923656463623047, + "learning_rate": 1.1587787003669523e-07, + "loss": 0.4256, + "num_input_tokens_seen": 7109345280, + "step": 3390 + }, + { + "epoch": 0.05180363439110388, + "grad_norm": 16.93385124206543, + "learning_rate": 1.1577175630565646e-07, + "loss": 0.4239, + "num_input_tokens_seen": 7111442432, + "step": 3391 + }, + { + "epoch": 0.05207485760781123, + "grad_norm": 28.331945419311523, + "learning_rate": 1.1566599202498456e-07, + "loss": 0.7451, + "num_input_tokens_seen": 7113539584, + "step": 3392 + }, + { + "epoch": 0.05234608082451858, + "grad_norm": 13.455109596252441, + "learning_rate": 1.1556057727979468e-07, + "loss": 0.3069, + "num_input_tokens_seen": 7115636736, + "step": 3393 + }, + { + "epoch": 0.05261730404122593, + "grad_norm": 22.329641342163086, + "learning_rate": 1.1545551215492055e-07, + "loss": 0.5828, + "num_input_tokens_seen": 7117733888, + "step": 3394 + }, + { + "epoch": 0.05288852725793328, + "grad_norm": 23.203813552856445, + "learning_rate": 1.1535079673491458e-07, + "loss": 0.4792, + "num_input_tokens_seen": 7119831040, + "step": 3395 + }, + { + "epoch": 0.05315975047464063, + "grad_norm": 12.3374662399292, + "learning_rate": 1.1524643110404778e-07, + "loss": 0.2758, + "num_input_tokens_seen": 7121928192, + "step": 3396 + }, + { + "epoch": 0.05343097369134798, + "grad_norm": 16.85381507873535, + "learning_rate": 1.1514241534630963e-07, + "loss": 0.3412, + "num_input_tokens_seen": 7124025344, + "step": 3397 + }, + { + "epoch": 0.05370219690805533, + "grad_norm": 18.195323944091797, + "learning_rate": 1.1503874954540804e-07, + "loss": 0.429, + "num_input_tokens_seen": 7126122496, + "step": 3398 + }, + { + "epoch": 0.05397342012476268, + "grad_norm": 15.976231575012207, + "learning_rate": 1.1493543378476927e-07, + "loss": 0.3129, + "num_input_tokens_seen": 7128219648, + "step": 3399 + }, + { + "epoch": 0.05424464334147003, + "grad_norm": 15.415371894836426, + "learning_rate": 1.1483246814753798e-07, + "loss": 0.4292, + "num_input_tokens_seen": 7130316800, + "step": 3400 + }, + { + "epoch": 0.05451586655817738, + "grad_norm": 12.561007499694824, + "learning_rate": 1.1472985271657697e-07, + "loss": 0.3157, + "num_input_tokens_seen": 7132413952, + "step": 3401 + }, + { + "epoch": 0.05478708977488473, + "grad_norm": 19.007686614990234, + "learning_rate": 1.1462758757446728e-07, + "loss": 0.515, + "num_input_tokens_seen": 7134511104, + "step": 3402 + }, + { + "epoch": 0.05505831299159208, + "grad_norm": 14.344964027404785, + "learning_rate": 1.1452567280350789e-07, + "loss": 0.2947, + "num_input_tokens_seen": 7136608256, + "step": 3403 + }, + { + "epoch": 0.05532953620829943, + "grad_norm": 14.677167892456055, + "learning_rate": 1.1442410848571602e-07, + "loss": 0.2962, + "num_input_tokens_seen": 7138705408, + "step": 3404 + }, + { + "epoch": 0.05560075942500678, + "grad_norm": 15.091597557067871, + "learning_rate": 1.1432289470282683e-07, + "loss": 0.3201, + "num_input_tokens_seen": 7140802560, + "step": 3405 + }, + { + "epoch": 0.055871982641714134, + "grad_norm": 16.691532135009766, + "learning_rate": 1.1422203153629312e-07, + "loss": 0.3313, + "num_input_tokens_seen": 7142899712, + "step": 3406 + }, + { + "epoch": 0.05614320585842148, + "grad_norm": 16.004104614257812, + "learning_rate": 1.1412151906728589e-07, + "loss": 0.3258, + "num_input_tokens_seen": 7144996864, + "step": 3407 + }, + { + "epoch": 0.05641442907512883, + "grad_norm": 20.203420639038086, + "learning_rate": 1.1402135737669372e-07, + "loss": 0.6703, + "num_input_tokens_seen": 7147094016, + "step": 3408 + }, + { + "epoch": 0.05668565229183618, + "grad_norm": 16.784305572509766, + "learning_rate": 1.1392154654512289e-07, + "loss": 0.4107, + "num_input_tokens_seen": 7149191168, + "step": 3409 + }, + { + "epoch": 0.05695687550854353, + "grad_norm": 18.371356964111328, + "learning_rate": 1.1382208665289742e-07, + "loss": 0.5004, + "num_input_tokens_seen": 7151288320, + "step": 3410 + }, + { + "epoch": 0.05722809872525088, + "grad_norm": 28.510942459106445, + "learning_rate": 1.1372297778005883e-07, + "loss": 0.7699, + "num_input_tokens_seen": 7153385472, + "step": 3411 + }, + { + "epoch": 0.05749932194195823, + "grad_norm": 20.71713638305664, + "learning_rate": 1.1362422000636609e-07, + "loss": 0.399, + "num_input_tokens_seen": 7155482624, + "step": 3412 + }, + { + "epoch": 0.05777054515866558, + "grad_norm": 17.85915184020996, + "learning_rate": 1.135258134112958e-07, + "loss": 0.4794, + "num_input_tokens_seen": 7157579776, + "step": 3413 + }, + { + "epoch": 0.058041768375372935, + "grad_norm": 16.834300994873047, + "learning_rate": 1.1342775807404177e-07, + "loss": 0.4149, + "num_input_tokens_seen": 7159676928, + "step": 3414 + }, + { + "epoch": 0.05831299159208028, + "grad_norm": 14.176383018493652, + "learning_rate": 1.1333005407351516e-07, + "loss": 0.3177, + "num_input_tokens_seen": 7161774080, + "step": 3415 + }, + { + "epoch": 0.05858421480878763, + "grad_norm": 20.5920352935791, + "learning_rate": 1.1323270148834461e-07, + "loss": 0.6466, + "num_input_tokens_seen": 7163871232, + "step": 3416 + }, + { + "epoch": 0.058855438025494984, + "grad_norm": 20.29216194152832, + "learning_rate": 1.1313570039687571e-07, + "loss": 0.5934, + "num_input_tokens_seen": 7165968384, + "step": 3417 + }, + { + "epoch": 0.05912666124220233, + "grad_norm": 19.53738784790039, + "learning_rate": 1.1303905087717111e-07, + "loss": 0.4848, + "num_input_tokens_seen": 7168065536, + "step": 3418 + }, + { + "epoch": 0.059397884458909686, + "grad_norm": 16.945344924926758, + "learning_rate": 1.1294275300701085e-07, + "loss": 0.3478, + "num_input_tokens_seen": 7170162688, + "step": 3419 + }, + { + "epoch": 0.059669107675617034, + "grad_norm": 17.19941520690918, + "learning_rate": 1.1284680686389163e-07, + "loss": 0.4514, + "num_input_tokens_seen": 7172259840, + "step": 3420 + }, + { + "epoch": 0.05994033089232438, + "grad_norm": 14.423097610473633, + "learning_rate": 1.1275121252502738e-07, + "loss": 0.3855, + "num_input_tokens_seen": 7174356992, + "step": 3421 + }, + { + "epoch": 0.060211554109031735, + "grad_norm": 13.27149772644043, + "learning_rate": 1.1265597006734872e-07, + "loss": 0.3488, + "num_input_tokens_seen": 7176454144, + "step": 3422 + }, + { + "epoch": 0.06048277732573908, + "grad_norm": 17.755043029785156, + "learning_rate": 1.1256107956750319e-07, + "loss": 0.4548, + "num_input_tokens_seen": 7178551296, + "step": 3423 + }, + { + "epoch": 0.06075400054244643, + "grad_norm": 15.190084457397461, + "learning_rate": 1.1246654110185501e-07, + "loss": 0.186, + "num_input_tokens_seen": 7180648448, + "step": 3424 + }, + { + "epoch": 0.061025223759153785, + "grad_norm": 23.48501968383789, + "learning_rate": 1.1237235474648516e-07, + "loss": 0.852, + "num_input_tokens_seen": 7182745600, + "step": 3425 + }, + { + "epoch": 0.06129644697586113, + "grad_norm": 18.804317474365234, + "learning_rate": 1.1227852057719125e-07, + "loss": 0.5567, + "num_input_tokens_seen": 7184842752, + "step": 3426 + }, + { + "epoch": 0.06156767019256849, + "grad_norm": 18.396987915039062, + "learning_rate": 1.121850386694875e-07, + "loss": 0.4162, + "num_input_tokens_seen": 7186939904, + "step": 3427 + }, + { + "epoch": 0.061838893409275834, + "grad_norm": 18.709394454956055, + "learning_rate": 1.1209190909860453e-07, + "loss": 0.5276, + "num_input_tokens_seen": 7189037056, + "step": 3428 + }, + { + "epoch": 0.06211011662598318, + "grad_norm": 25.96376609802246, + "learning_rate": 1.119991319394894e-07, + "loss": 0.5129, + "num_input_tokens_seen": 7191134208, + "step": 3429 + }, + { + "epoch": 0.062381339842690536, + "grad_norm": 24.267688751220703, + "learning_rate": 1.1190670726680579e-07, + "loss": 0.6664, + "num_input_tokens_seen": 7193231360, + "step": 3430 + }, + { + "epoch": 0.06265256305939788, + "grad_norm": 12.63032054901123, + "learning_rate": 1.1181463515493336e-07, + "loss": 0.2624, + "num_input_tokens_seen": 7195328512, + "step": 3431 + }, + { + "epoch": 0.06292378627610523, + "grad_norm": 18.082197189331055, + "learning_rate": 1.1172291567796846e-07, + "loss": 0.3512, + "num_input_tokens_seen": 7197425664, + "step": 3432 + }, + { + "epoch": 0.06319500949281258, + "grad_norm": 13.270282745361328, + "learning_rate": 1.1163154890972333e-07, + "loss": 0.3531, + "num_input_tokens_seen": 7199522816, + "step": 3433 + }, + { + "epoch": 0.06346623270951994, + "grad_norm": 11.08203411102295, + "learning_rate": 1.1154053492372654e-07, + "loss": 0.2167, + "num_input_tokens_seen": 7201619968, + "step": 3434 + }, + { + "epoch": 0.06373745592622729, + "grad_norm": 18.60203742980957, + "learning_rate": 1.1144987379322254e-07, + "loss": 0.5286, + "num_input_tokens_seen": 7203717120, + "step": 3435 + }, + { + "epoch": 0.06400867914293464, + "grad_norm": 22.605010986328125, + "learning_rate": 1.1135956559117207e-07, + "loss": 0.7746, + "num_input_tokens_seen": 7205814272, + "step": 3436 + }, + { + "epoch": 0.06427990235964198, + "grad_norm": 10.375788688659668, + "learning_rate": 1.1126961039025168e-07, + "loss": 0.1999, + "num_input_tokens_seen": 7207911424, + "step": 3437 + }, + { + "epoch": 0.06455112557634933, + "grad_norm": 16.99294662475586, + "learning_rate": 1.111800082628539e-07, + "loss": 0.3321, + "num_input_tokens_seen": 7210008576, + "step": 3438 + }, + { + "epoch": 0.06482234879305669, + "grad_norm": 18.679296493530273, + "learning_rate": 1.1109075928108715e-07, + "loss": 0.517, + "num_input_tokens_seen": 7212105728, + "step": 3439 + }, + { + "epoch": 0.06509357200976404, + "grad_norm": 25.95256233215332, + "learning_rate": 1.1100186351677567e-07, + "loss": 0.8827, + "num_input_tokens_seen": 7214202880, + "step": 3440 + }, + { + "epoch": 0.06536479522647139, + "grad_norm": 15.406830787658691, + "learning_rate": 1.1091332104145921e-07, + "loss": 0.412, + "num_input_tokens_seen": 7216300032, + "step": 3441 + }, + { + "epoch": 0.06563601844317873, + "grad_norm": 17.281946182250977, + "learning_rate": 1.1082513192639353e-07, + "loss": 0.4441, + "num_input_tokens_seen": 7218397184, + "step": 3442 + }, + { + "epoch": 0.06590724165988608, + "grad_norm": 11.579797744750977, + "learning_rate": 1.1073729624254984e-07, + "loss": 0.2542, + "num_input_tokens_seen": 7220494336, + "step": 3443 + }, + { + "epoch": 0.06617846487659344, + "grad_norm": 20.977197647094727, + "learning_rate": 1.1064981406061494e-07, + "loss": 0.6309, + "num_input_tokens_seen": 7222591488, + "step": 3444 + }, + { + "epoch": 0.06644968809330079, + "grad_norm": 15.974722862243652, + "learning_rate": 1.1056268545099117e-07, + "loss": 0.4147, + "num_input_tokens_seen": 7224688640, + "step": 3445 + }, + { + "epoch": 0.06672091131000814, + "grad_norm": 11.669343948364258, + "learning_rate": 1.1047591048379635e-07, + "loss": 0.2354, + "num_input_tokens_seen": 7226785792, + "step": 3446 + }, + { + "epoch": 0.06699213452671549, + "grad_norm": 24.988540649414062, + "learning_rate": 1.1038948922886355e-07, + "loss": 0.6374, + "num_input_tokens_seen": 7228882944, + "step": 3447 + }, + { + "epoch": 0.06726335774342283, + "grad_norm": 17.950305938720703, + "learning_rate": 1.1030342175574144e-07, + "loss": 0.4621, + "num_input_tokens_seen": 7230980096, + "step": 3448 + }, + { + "epoch": 0.06753458096013018, + "grad_norm": 23.91762351989746, + "learning_rate": 1.1021770813369378e-07, + "loss": 0.5804, + "num_input_tokens_seen": 7233077248, + "step": 3449 + }, + { + "epoch": 0.06780580417683754, + "grad_norm": 17.67766571044922, + "learning_rate": 1.1013234843169967e-07, + "loss": 0.2842, + "num_input_tokens_seen": 7235174400, + "step": 3450 + }, + { + "epoch": 0.06807702739354489, + "grad_norm": 14.409975051879883, + "learning_rate": 1.100473427184534e-07, + "loss": 0.2676, + "num_input_tokens_seen": 7237271552, + "step": 3451 + }, + { + "epoch": 0.06834825061025224, + "grad_norm": 25.468711853027344, + "learning_rate": 1.0996269106236425e-07, + "loss": 0.553, + "num_input_tokens_seen": 7239368704, + "step": 3452 + }, + { + "epoch": 0.06861947382695958, + "grad_norm": 18.466487884521484, + "learning_rate": 1.0987839353155661e-07, + "loss": 0.3635, + "num_input_tokens_seen": 7241465856, + "step": 3453 + }, + { + "epoch": 0.06889069704366693, + "grad_norm": 15.085163116455078, + "learning_rate": 1.0979445019387e-07, + "loss": 0.4023, + "num_input_tokens_seen": 7243563008, + "step": 3454 + }, + { + "epoch": 0.0691619202603743, + "grad_norm": 15.840180397033691, + "learning_rate": 1.0971086111685883e-07, + "loss": 0.3546, + "num_input_tokens_seen": 7245660160, + "step": 3455 + }, + { + "epoch": 0.06943314347708164, + "grad_norm": 20.410505294799805, + "learning_rate": 1.0962762636779235e-07, + "loss": 0.5338, + "num_input_tokens_seen": 7247757312, + "step": 3456 + }, + { + "epoch": 0.06970436669378899, + "grad_norm": 23.19679832458496, + "learning_rate": 1.0954474601365482e-07, + "loss": 0.6095, + "num_input_tokens_seen": 7249854464, + "step": 3457 + }, + { + "epoch": 0.06997558991049634, + "grad_norm": 14.401229858398438, + "learning_rate": 1.094622201211451e-07, + "loss": 0.4222, + "num_input_tokens_seen": 7251951616, + "step": 3458 + }, + { + "epoch": 0.07024681312720368, + "grad_norm": 17.72669792175293, + "learning_rate": 1.0938004875667689e-07, + "loss": 0.4721, + "num_input_tokens_seen": 7254048768, + "step": 3459 + }, + { + "epoch": 0.07051803634391104, + "grad_norm": 15.16402530670166, + "learning_rate": 1.0929823198637866e-07, + "loss": 0.3393, + "num_input_tokens_seen": 7256145920, + "step": 3460 + }, + { + "epoch": 0.07078925956061839, + "grad_norm": 17.32206153869629, + "learning_rate": 1.0921676987609335e-07, + "loss": 0.5465, + "num_input_tokens_seen": 7258243072, + "step": 3461 + }, + { + "epoch": 0.07106048277732574, + "grad_norm": 20.354280471801758, + "learning_rate": 1.0913566249137865e-07, + "loss": 0.6731, + "num_input_tokens_seen": 7260340224, + "step": 3462 + }, + { + "epoch": 0.07133170599403309, + "grad_norm": 17.26376724243164, + "learning_rate": 1.0905490989750656e-07, + "loss": 0.5397, + "num_input_tokens_seen": 7262437376, + "step": 3463 + }, + { + "epoch": 0.07160292921074043, + "grad_norm": 22.16339683532715, + "learning_rate": 1.0897451215946378e-07, + "loss": 0.6668, + "num_input_tokens_seen": 7264534528, + "step": 3464 + }, + { + "epoch": 0.0718741524274478, + "grad_norm": 25.3452091217041, + "learning_rate": 1.0889446934195141e-07, + "loss": 0.5578, + "num_input_tokens_seen": 7266631680, + "step": 3465 + }, + { + "epoch": 0.07214537564415514, + "grad_norm": 20.26123809814453, + "learning_rate": 1.0881478150938475e-07, + "loss": 0.4272, + "num_input_tokens_seen": 7268728832, + "step": 3466 + }, + { + "epoch": 0.07241659886086249, + "grad_norm": 15.355372428894043, + "learning_rate": 1.0873544872589361e-07, + "loss": 0.3703, + "num_input_tokens_seen": 7270825984, + "step": 3467 + }, + { + "epoch": 0.07268782207756984, + "grad_norm": 19.550086975097656, + "learning_rate": 1.08656471055322e-07, + "loss": 0.4287, + "num_input_tokens_seen": 7272923136, + "step": 3468 + }, + { + "epoch": 0.07295904529427719, + "grad_norm": 23.889299392700195, + "learning_rate": 1.0857784856122812e-07, + "loss": 0.8543, + "num_input_tokens_seen": 7275020288, + "step": 3469 + }, + { + "epoch": 0.07323026851098453, + "grad_norm": 17.23611831665039, + "learning_rate": 1.084995813068843e-07, + "loss": 0.3986, + "num_input_tokens_seen": 7277117440, + "step": 3470 + }, + { + "epoch": 0.0735014917276919, + "grad_norm": 20.09700584411621, + "learning_rate": 1.0842166935527716e-07, + "loss": 0.5305, + "num_input_tokens_seen": 7279214592, + "step": 3471 + }, + { + "epoch": 0.07377271494439924, + "grad_norm": 17.18355941772461, + "learning_rate": 1.0834411276910715e-07, + "loss": 0.3567, + "num_input_tokens_seen": 7281311744, + "step": 3472 + }, + { + "epoch": 0.07404393816110659, + "grad_norm": 20.5414981842041, + "learning_rate": 1.0826691161078895e-07, + "loss": 0.5759, + "num_input_tokens_seen": 7283408896, + "step": 3473 + }, + { + "epoch": 0.07431516137781394, + "grad_norm": 14.221673965454102, + "learning_rate": 1.0819006594245114e-07, + "loss": 0.3052, + "num_input_tokens_seen": 7285506048, + "step": 3474 + }, + { + "epoch": 0.07458638459452128, + "grad_norm": 21.631595611572266, + "learning_rate": 1.0811357582593613e-07, + "loss": 0.669, + "num_input_tokens_seen": 7287603200, + "step": 3475 + }, + { + "epoch": 0.07485760781122865, + "grad_norm": 23.90924644470215, + "learning_rate": 1.0803744132280025e-07, + "loss": 0.8402, + "num_input_tokens_seen": 7289700352, + "step": 3476 + }, + { + "epoch": 0.075128831027936, + "grad_norm": 19.435346603393555, + "learning_rate": 1.0796166249431371e-07, + "loss": 0.5147, + "num_input_tokens_seen": 7291797504, + "step": 3477 + }, + { + "epoch": 0.07540005424464334, + "grad_norm": 20.027788162231445, + "learning_rate": 1.0788623940146032e-07, + "loss": 0.5814, + "num_input_tokens_seen": 7293894656, + "step": 3478 + }, + { + "epoch": 0.07567127746135069, + "grad_norm": 15.927900314331055, + "learning_rate": 1.0781117210493781e-07, + "loss": 0.3769, + "num_input_tokens_seen": 7295991808, + "step": 3479 + }, + { + "epoch": 0.07594250067805804, + "grad_norm": 29.270769119262695, + "learning_rate": 1.0773646066515748e-07, + "loss": 0.6378, + "num_input_tokens_seen": 7298088960, + "step": 3480 + }, + { + "epoch": 0.0762137238947654, + "grad_norm": 15.728423118591309, + "learning_rate": 1.0766210514224419e-07, + "loss": 0.2933, + "num_input_tokens_seen": 7300186112, + "step": 3481 + }, + { + "epoch": 0.07648494711147275, + "grad_norm": 19.27528953552246, + "learning_rate": 1.0758810559603651e-07, + "loss": 0.4516, + "num_input_tokens_seen": 7302283264, + "step": 3482 + }, + { + "epoch": 0.07675617032818009, + "grad_norm": 16.80342674255371, + "learning_rate": 1.0751446208608642e-07, + "loss": 0.3451, + "num_input_tokens_seen": 7304380416, + "step": 3483 + }, + { + "epoch": 0.07702739354488744, + "grad_norm": 14.538717269897461, + "learning_rate": 1.0744117467165938e-07, + "loss": 0.4333, + "num_input_tokens_seen": 7306477568, + "step": 3484 + }, + { + "epoch": 0.07729861676159479, + "grad_norm": 15.928664207458496, + "learning_rate": 1.0736824341173442e-07, + "loss": 0.3792, + "num_input_tokens_seen": 7308574720, + "step": 3485 + }, + { + "epoch": 0.07756983997830215, + "grad_norm": 17.85283088684082, + "learning_rate": 1.0729566836500373e-07, + "loss": 0.4084, + "num_input_tokens_seen": 7310671872, + "step": 3486 + }, + { + "epoch": 0.0778410631950095, + "grad_norm": 25.531211853027344, + "learning_rate": 1.07223449589873e-07, + "loss": 0.4754, + "num_input_tokens_seen": 7312769024, + "step": 3487 + }, + { + "epoch": 0.07811228641171684, + "grad_norm": 12.72594165802002, + "learning_rate": 1.0715158714446109e-07, + "loss": 0.3349, + "num_input_tokens_seen": 7314866176, + "step": 3488 + }, + { + "epoch": 0.07838350962842419, + "grad_norm": 17.275711059570312, + "learning_rate": 1.0708008108660026e-07, + "loss": 0.3707, + "num_input_tokens_seen": 7316963328, + "step": 3489 + }, + { + "epoch": 0.07865473284513154, + "grad_norm": 13.209538459777832, + "learning_rate": 1.0700893147383582e-07, + "loss": 0.2781, + "num_input_tokens_seen": 7319060480, + "step": 3490 + }, + { + "epoch": 0.07892595606183889, + "grad_norm": 17.69983673095703, + "learning_rate": 1.069381383634263e-07, + "loss": 0.5252, + "num_input_tokens_seen": 7321157632, + "step": 3491 + }, + { + "epoch": 0.07919717927854625, + "grad_norm": 16.224292755126953, + "learning_rate": 1.0686770181234322e-07, + "loss": 0.4176, + "num_input_tokens_seen": 7323254784, + "step": 3492 + }, + { + "epoch": 0.0794684024952536, + "grad_norm": 15.399229049682617, + "learning_rate": 1.0679762187727129e-07, + "loss": 0.3494, + "num_input_tokens_seen": 7325351936, + "step": 3493 + }, + { + "epoch": 0.07973962571196094, + "grad_norm": 14.10446548461914, + "learning_rate": 1.0672789861460818e-07, + "loss": 0.2833, + "num_input_tokens_seen": 7327449088, + "step": 3494 + }, + { + "epoch": 0.08001084892866829, + "grad_norm": 20.31482696533203, + "learning_rate": 1.0665853208046449e-07, + "loss": 0.57, + "num_input_tokens_seen": 7329546240, + "step": 3495 + }, + { + "epoch": 0.08028207214537564, + "grad_norm": 18.91710090637207, + "learning_rate": 1.0658952233066381e-07, + "loss": 0.3401, + "num_input_tokens_seen": 7331643392, + "step": 3496 + }, + { + "epoch": 0.080553295362083, + "grad_norm": 18.957792282104492, + "learning_rate": 1.0652086942074255e-07, + "loss": 0.5459, + "num_input_tokens_seen": 7333740544, + "step": 3497 + }, + { + "epoch": 0.08082451857879035, + "grad_norm": 16.166566848754883, + "learning_rate": 1.0645257340594988e-07, + "loss": 0.2876, + "num_input_tokens_seen": 7335837696, + "step": 3498 + }, + { + "epoch": 0.0810957417954977, + "grad_norm": 14.657668113708496, + "learning_rate": 1.06384634341248e-07, + "loss": 0.3123, + "num_input_tokens_seen": 7337934848, + "step": 3499 + }, + { + "epoch": 0.08136696501220504, + "grad_norm": 16.774099349975586, + "learning_rate": 1.0631705228131149e-07, + "loss": 0.3603, + "num_input_tokens_seen": 7340032000, + "step": 3500 + }, + { + "epoch": 0.08163818822891239, + "grad_norm": 18.32398223876953, + "learning_rate": 1.0624982728052795e-07, + "loss": 0.261, + "num_input_tokens_seen": 7342129152, + "step": 3501 + }, + { + "epoch": 0.08190941144561975, + "grad_norm": 13.998818397521973, + "learning_rate": 1.0618295939299752e-07, + "loss": 0.3176, + "num_input_tokens_seen": 7344226304, + "step": 3502 + }, + { + "epoch": 0.0821806346623271, + "grad_norm": 20.430179595947266, + "learning_rate": 1.0611644867253284e-07, + "loss": 0.6581, + "num_input_tokens_seen": 7346323456, + "step": 3503 + }, + { + "epoch": 0.08245185787903445, + "grad_norm": 22.212148666381836, + "learning_rate": 1.0605029517265918e-07, + "loss": 0.637, + "num_input_tokens_seen": 7348420608, + "step": 3504 + }, + { + "epoch": 0.0827230810957418, + "grad_norm": 18.4092960357666, + "learning_rate": 1.0598449894661445e-07, + "loss": 0.3632, + "num_input_tokens_seen": 7350517760, + "step": 3505 + }, + { + "epoch": 0.08299430431244914, + "grad_norm": 19.119739532470703, + "learning_rate": 1.0591906004734895e-07, + "loss": 0.6235, + "num_input_tokens_seen": 7352614912, + "step": 3506 + }, + { + "epoch": 0.0832655275291565, + "grad_norm": 22.81473159790039, + "learning_rate": 1.0585397852752544e-07, + "loss": 0.8699, + "num_input_tokens_seen": 7354712064, + "step": 3507 + }, + { + "epoch": 0.08353675074586385, + "grad_norm": 17.07512855529785, + "learning_rate": 1.0578925443951895e-07, + "loss": 0.568, + "num_input_tokens_seen": 7356809216, + "step": 3508 + }, + { + "epoch": 0.0838079739625712, + "grad_norm": 10.896271705627441, + "learning_rate": 1.0572488783541702e-07, + "loss": 0.2377, + "num_input_tokens_seen": 7358906368, + "step": 3509 + }, + { + "epoch": 0.08407919717927854, + "grad_norm": 15.978910446166992, + "learning_rate": 1.0566087876701941e-07, + "loss": 0.3153, + "num_input_tokens_seen": 7361003520, + "step": 3510 + }, + { + "epoch": 0.08435042039598589, + "grad_norm": 21.776466369628906, + "learning_rate": 1.0559722728583825e-07, + "loss": 0.5863, + "num_input_tokens_seen": 7363100672, + "step": 3511 + }, + { + "epoch": 0.08462164361269324, + "grad_norm": 11.799039840698242, + "learning_rate": 1.0553393344309775e-07, + "loss": 0.3144, + "num_input_tokens_seen": 7365197824, + "step": 3512 + }, + { + "epoch": 0.0848928668294006, + "grad_norm": 14.083271980285645, + "learning_rate": 1.054709972897344e-07, + "loss": 0.3156, + "num_input_tokens_seen": 7367294976, + "step": 3513 + }, + { + "epoch": 0.08516409004610795, + "grad_norm": 20.98029899597168, + "learning_rate": 1.0540841887639698e-07, + "loss": 0.7161, + "num_input_tokens_seen": 7369392128, + "step": 3514 + }, + { + "epoch": 0.0854353132628153, + "grad_norm": 16.61429786682129, + "learning_rate": 1.0534619825344596e-07, + "loss": 0.3341, + "num_input_tokens_seen": 7371489280, + "step": 3515 + }, + { + "epoch": 0.08570653647952264, + "grad_norm": 13.247276306152344, + "learning_rate": 1.052843354709543e-07, + "loss": 0.3035, + "num_input_tokens_seen": 7373586432, + "step": 3516 + }, + { + "epoch": 0.08597775969622999, + "grad_norm": 17.826738357543945, + "learning_rate": 1.0522283057870675e-07, + "loss": 0.3785, + "num_input_tokens_seen": 7375683584, + "step": 3517 + }, + { + "epoch": 0.08624898291293735, + "grad_norm": 21.997249603271484, + "learning_rate": 1.0516168362620013e-07, + "loss": 0.4728, + "num_input_tokens_seen": 7377780736, + "step": 3518 + }, + { + "epoch": 0.0865202061296447, + "grad_norm": 18.08944320678711, + "learning_rate": 1.0510089466264321e-07, + "loss": 0.4232, + "num_input_tokens_seen": 7379877888, + "step": 3519 + }, + { + "epoch": 0.08679142934635205, + "grad_norm": 12.958785057067871, + "learning_rate": 1.0504046373695648e-07, + "loss": 0.2725, + "num_input_tokens_seen": 7381975040, + "step": 3520 + }, + { + "epoch": 0.0870626525630594, + "grad_norm": 18.204368591308594, + "learning_rate": 1.0498039089777265e-07, + "loss": 0.5077, + "num_input_tokens_seen": 7384072192, + "step": 3521 + }, + { + "epoch": 0.08733387577976674, + "grad_norm": 15.504716873168945, + "learning_rate": 1.0492067619343594e-07, + "loss": 0.4001, + "num_input_tokens_seen": 7386169344, + "step": 3522 + }, + { + "epoch": 0.0876050989964741, + "grad_norm": 12.572887420654297, + "learning_rate": 1.0486131967200254e-07, + "loss": 0.196, + "num_input_tokens_seen": 7388266496, + "step": 3523 + }, + { + "epoch": 0.08787632221318145, + "grad_norm": 12.989893913269043, + "learning_rate": 1.048023213812403e-07, + "loss": 0.2463, + "num_input_tokens_seen": 7390363648, + "step": 3524 + }, + { + "epoch": 0.0881475454298888, + "grad_norm": 18.88071632385254, + "learning_rate": 1.0474368136862876e-07, + "loss": 0.4123, + "num_input_tokens_seen": 7392460800, + "step": 3525 + }, + { + "epoch": 0.08841876864659615, + "grad_norm": 24.264707565307617, + "learning_rate": 1.0468539968135922e-07, + "loss": 0.6547, + "num_input_tokens_seen": 7394557952, + "step": 3526 + }, + { + "epoch": 0.0886899918633035, + "grad_norm": 17.752891540527344, + "learning_rate": 1.046274763663345e-07, + "loss": 0.4053, + "num_input_tokens_seen": 7396655104, + "step": 3527 + }, + { + "epoch": 0.08896121508001086, + "grad_norm": 21.44291114807129, + "learning_rate": 1.045699114701691e-07, + "loss": 0.5896, + "num_input_tokens_seen": 7398752256, + "step": 3528 + }, + { + "epoch": 0.0892324382967182, + "grad_norm": 27.565635681152344, + "learning_rate": 1.0451270503918906e-07, + "loss": 0.6174, + "num_input_tokens_seen": 7400849408, + "step": 3529 + }, + { + "epoch": 0.08950366151342555, + "grad_norm": 22.987125396728516, + "learning_rate": 1.0445585711943205e-07, + "loss": 0.6561, + "num_input_tokens_seen": 7402946560, + "step": 3530 + }, + { + "epoch": 0.0897748847301329, + "grad_norm": 17.22407341003418, + "learning_rate": 1.0439936775664699e-07, + "loss": 0.5277, + "num_input_tokens_seen": 7405043712, + "step": 3531 + }, + { + "epoch": 0.09004610794684025, + "grad_norm": 17.74474334716797, + "learning_rate": 1.043432369962943e-07, + "loss": 0.3715, + "num_input_tokens_seen": 7407140864, + "step": 3532 + }, + { + "epoch": 0.09031733116354759, + "grad_norm": 14.838858604431152, + "learning_rate": 1.0428746488354606e-07, + "loss": 0.4398, + "num_input_tokens_seen": 7409238016, + "step": 3533 + }, + { + "epoch": 0.09058855438025495, + "grad_norm": 15.398836135864258, + "learning_rate": 1.0423205146328548e-07, + "loss": 0.3574, + "num_input_tokens_seen": 7411335168, + "step": 3534 + }, + { + "epoch": 0.0908597775969623, + "grad_norm": 20.782089233398438, + "learning_rate": 1.0417699678010708e-07, + "loss": 0.539, + "num_input_tokens_seen": 7413432320, + "step": 3535 + }, + { + "epoch": 0.09113100081366965, + "grad_norm": 23.51567840576172, + "learning_rate": 1.0412230087831689e-07, + "loss": 0.595, + "num_input_tokens_seen": 7415529472, + "step": 3536 + }, + { + "epoch": 0.091402224030377, + "grad_norm": 20.366703033447266, + "learning_rate": 1.0406796380193203e-07, + "loss": 0.4471, + "num_input_tokens_seen": 7417626624, + "step": 3537 + }, + { + "epoch": 0.09167344724708434, + "grad_norm": 18.491153717041016, + "learning_rate": 1.0401398559468098e-07, + "loss": 0.5784, + "num_input_tokens_seen": 7419723776, + "step": 3538 + }, + { + "epoch": 0.0919446704637917, + "grad_norm": 17.398160934448242, + "learning_rate": 1.0396036630000324e-07, + "loss": 0.4378, + "num_input_tokens_seen": 7421820928, + "step": 3539 + }, + { + "epoch": 0.09221589368049905, + "grad_norm": 17.634761810302734, + "learning_rate": 1.0390710596104965e-07, + "loss": 0.3753, + "num_input_tokens_seen": 7423918080, + "step": 3540 + }, + { + "epoch": 0.0924871168972064, + "grad_norm": 19.80194664001465, + "learning_rate": 1.0385420462068206e-07, + "loss": 0.5528, + "num_input_tokens_seen": 7426015232, + "step": 3541 + }, + { + "epoch": 0.09275834011391375, + "grad_norm": 13.204157829284668, + "learning_rate": 1.0380166232147354e-07, + "loss": 0.2905, + "num_input_tokens_seen": 7428112384, + "step": 3542 + }, + { + "epoch": 0.0930295633306211, + "grad_norm": 13.604270935058594, + "learning_rate": 1.0374947910570805e-07, + "loss": 0.3773, + "num_input_tokens_seen": 7430209536, + "step": 3543 + }, + { + "epoch": 0.09330078654732846, + "grad_norm": 15.047932624816895, + "learning_rate": 1.0369765501538067e-07, + "loss": 0.3638, + "num_input_tokens_seen": 7432306688, + "step": 3544 + }, + { + "epoch": 0.0935720097640358, + "grad_norm": 16.72162628173828, + "learning_rate": 1.0364619009219743e-07, + "loss": 0.3592, + "num_input_tokens_seen": 7434403840, + "step": 3545 + }, + { + "epoch": 0.09384323298074315, + "grad_norm": 21.374954223632812, + "learning_rate": 1.0359508437757544e-07, + "loss": 0.3736, + "num_input_tokens_seen": 7436500992, + "step": 3546 + }, + { + "epoch": 0.0941144561974505, + "grad_norm": 16.41029930114746, + "learning_rate": 1.0354433791264255e-07, + "loss": 0.4029, + "num_input_tokens_seen": 7438598144, + "step": 3547 + }, + { + "epoch": 0.09438567941415785, + "grad_norm": 30.30008888244629, + "learning_rate": 1.0349395073823768e-07, + "loss": 0.6386, + "num_input_tokens_seen": 7440695296, + "step": 3548 + }, + { + "epoch": 0.09465690263086521, + "grad_norm": 15.258624076843262, + "learning_rate": 1.0344392289491038e-07, + "loss": 0.3967, + "num_input_tokens_seen": 7442792448, + "step": 3549 + }, + { + "epoch": 0.09492812584757256, + "grad_norm": 21.32552719116211, + "learning_rate": 1.0339425442292118e-07, + "loss": 0.7916, + "num_input_tokens_seen": 7444889600, + "step": 3550 + }, + { + "epoch": 0.0951993490642799, + "grad_norm": 17.294597625732422, + "learning_rate": 1.0334494536224146e-07, + "loss": 0.5297, + "num_input_tokens_seen": 7446986752, + "step": 3551 + }, + { + "epoch": 0.09547057228098725, + "grad_norm": 17.55919647216797, + "learning_rate": 1.0329599575255321e-07, + "loss": 0.509, + "num_input_tokens_seen": 7449083904, + "step": 3552 + }, + { + "epoch": 0.0957417954976946, + "grad_norm": 15.990652084350586, + "learning_rate": 1.0324740563324923e-07, + "loss": 0.3171, + "num_input_tokens_seen": 7451181056, + "step": 3553 + }, + { + "epoch": 0.09601301871440195, + "grad_norm": 24.02924156188965, + "learning_rate": 1.0319917504343297e-07, + "loss": 0.7525, + "num_input_tokens_seen": 7453278208, + "step": 3554 + }, + { + "epoch": 0.09628424193110931, + "grad_norm": 15.403027534484863, + "learning_rate": 1.0315130402191866e-07, + "loss": 0.3421, + "num_input_tokens_seen": 7455375360, + "step": 3555 + }, + { + "epoch": 0.09655546514781665, + "grad_norm": 16.647611618041992, + "learning_rate": 1.0310379260723094e-07, + "loss": 0.4427, + "num_input_tokens_seen": 7457472512, + "step": 3556 + }, + { + "epoch": 0.096826688364524, + "grad_norm": 20.491514205932617, + "learning_rate": 1.0305664083760532e-07, + "loss": 0.6526, + "num_input_tokens_seen": 7459569664, + "step": 3557 + }, + { + "epoch": 0.09709791158123135, + "grad_norm": 15.219748497009277, + "learning_rate": 1.0300984875098772e-07, + "loss": 0.5078, + "num_input_tokens_seen": 7461666816, + "step": 3558 + }, + { + "epoch": 0.0973691347979387, + "grad_norm": 18.49822235107422, + "learning_rate": 1.0296341638503458e-07, + "loss": 0.4535, + "num_input_tokens_seen": 7463763968, + "step": 3559 + }, + { + "epoch": 0.09764035801464606, + "grad_norm": 17.297460556030273, + "learning_rate": 1.029173437771129e-07, + "loss": 0.3632, + "num_input_tokens_seen": 7465861120, + "step": 3560 + }, + { + "epoch": 0.0979115812313534, + "grad_norm": 22.80646514892578, + "learning_rate": 1.0287163096430024e-07, + "loss": 0.7611, + "num_input_tokens_seen": 7467958272, + "step": 3561 + }, + { + "epoch": 0.09818280444806075, + "grad_norm": 15.242216110229492, + "learning_rate": 1.0282627798338444e-07, + "loss": 0.338, + "num_input_tokens_seen": 7470055424, + "step": 3562 + }, + { + "epoch": 0.0984540276647681, + "grad_norm": 17.453556060791016, + "learning_rate": 1.0278128487086387e-07, + "loss": 0.5657, + "num_input_tokens_seen": 7472152576, + "step": 3563 + }, + { + "epoch": 0.09872525088147545, + "grad_norm": 16.23352813720703, + "learning_rate": 1.0273665166294735e-07, + "loss": 0.3696, + "num_input_tokens_seen": 7474249728, + "step": 3564 + }, + { + "epoch": 0.09899647409818281, + "grad_norm": 14.19936466217041, + "learning_rate": 1.0269237839555398e-07, + "loss": 0.3618, + "num_input_tokens_seen": 7476346880, + "step": 3565 + }, + { + "epoch": 0.09926769731489016, + "grad_norm": 11.769580841064453, + "learning_rate": 1.0264846510431307e-07, + "loss": 0.2983, + "num_input_tokens_seen": 7478444032, + "step": 3566 + }, + { + "epoch": 0.0995389205315975, + "grad_norm": 22.436574935913086, + "learning_rate": 1.0260491182456453e-07, + "loss": 0.8378, + "num_input_tokens_seen": 7480541184, + "step": 3567 + }, + { + "epoch": 0.09981014374830485, + "grad_norm": 21.576202392578125, + "learning_rate": 1.0256171859135826e-07, + "loss": 0.3932, + "num_input_tokens_seen": 7482638336, + "step": 3568 + }, + { + "epoch": 0.1000813669650122, + "grad_norm": 15.524735450744629, + "learning_rate": 1.0251888543945458e-07, + "loss": 0.413, + "num_input_tokens_seen": 7484735488, + "step": 3569 + }, + { + "epoch": 0.10035259018171956, + "grad_norm": 14.626888275146484, + "learning_rate": 1.0247641240332397e-07, + "loss": 0.4167, + "num_input_tokens_seen": 7486832640, + "step": 3570 + }, + { + "epoch": 0.10062381339842691, + "grad_norm": 14.8865327835083, + "learning_rate": 1.0243429951714714e-07, + "loss": 0.42, + "num_input_tokens_seen": 7488929792, + "step": 3571 + }, + { + "epoch": 0.10089503661513426, + "grad_norm": 17.109973907470703, + "learning_rate": 1.023925468148149e-07, + "loss": 0.4622, + "num_input_tokens_seen": 7491026944, + "step": 3572 + }, + { + "epoch": 0.1011662598318416, + "grad_norm": 8.667223930358887, + "learning_rate": 1.023511543299283e-07, + "loss": 0.1671, + "num_input_tokens_seen": 7493124096, + "step": 3573 + }, + { + "epoch": 0.10143748304854895, + "grad_norm": 17.68228530883789, + "learning_rate": 1.0231012209579831e-07, + "loss": 0.4523, + "num_input_tokens_seen": 7495221248, + "step": 3574 + }, + { + "epoch": 0.10170870626525631, + "grad_norm": 21.331649780273438, + "learning_rate": 1.0226945014544624e-07, + "loss": 0.5904, + "num_input_tokens_seen": 7497318400, + "step": 3575 + }, + { + "epoch": 0.10197992948196366, + "grad_norm": 19.670902252197266, + "learning_rate": 1.0222913851160335e-07, + "loss": 0.6652, + "num_input_tokens_seen": 7499415552, + "step": 3576 + }, + { + "epoch": 0.10225115269867101, + "grad_norm": 15.4076509475708, + "learning_rate": 1.0218918722671074e-07, + "loss": 0.425, + "num_input_tokens_seen": 7501512704, + "step": 3577 + }, + { + "epoch": 0.10252237591537836, + "grad_norm": 19.192432403564453, + "learning_rate": 1.0214959632291984e-07, + "loss": 0.4974, + "num_input_tokens_seen": 7503609856, + "step": 3578 + }, + { + "epoch": 0.1027935991320857, + "grad_norm": 15.295207977294922, + "learning_rate": 1.0211036583209181e-07, + "loss": 0.3527, + "num_input_tokens_seen": 7505707008, + "step": 3579 + }, + { + "epoch": 0.10306482234879305, + "grad_norm": 13.086195945739746, + "learning_rate": 1.0207149578579789e-07, + "loss": 0.2886, + "num_input_tokens_seen": 7507804160, + "step": 3580 + }, + { + "epoch": 0.10333604556550041, + "grad_norm": 18.579437255859375, + "learning_rate": 1.0203298621531923e-07, + "loss": 0.4335, + "num_input_tokens_seen": 7509901312, + "step": 3581 + }, + { + "epoch": 0.10360726878220776, + "grad_norm": 18.218303680419922, + "learning_rate": 1.0199483715164687e-07, + "loss": 0.4943, + "num_input_tokens_seen": 7511998464, + "step": 3582 + }, + { + "epoch": 0.1038784919989151, + "grad_norm": 27.71494483947754, + "learning_rate": 1.0195704862548167e-07, + "loss": 0.8784, + "num_input_tokens_seen": 7514095616, + "step": 3583 + }, + { + "epoch": 0.10414971521562245, + "grad_norm": 17.35464096069336, + "learning_rate": 1.0191962066723448e-07, + "loss": 0.3978, + "num_input_tokens_seen": 7516192768, + "step": 3584 + }, + { + "epoch": 0.1044209384323298, + "grad_norm": 11.098041534423828, + "learning_rate": 1.0188255330702583e-07, + "loss": 0.251, + "num_input_tokens_seen": 7518289920, + "step": 3585 + }, + { + "epoch": 0.10469216164903716, + "grad_norm": 20.41391372680664, + "learning_rate": 1.0184584657468615e-07, + "loss": 0.588, + "num_input_tokens_seen": 7520387072, + "step": 3586 + }, + { + "epoch": 0.10496338486574451, + "grad_norm": 15.097143173217773, + "learning_rate": 1.018095004997556e-07, + "loss": 0.2949, + "num_input_tokens_seen": 7522484224, + "step": 3587 + }, + { + "epoch": 0.10523460808245186, + "grad_norm": 14.162942886352539, + "learning_rate": 1.0177351511148414e-07, + "loss": 0.3288, + "num_input_tokens_seen": 7524581376, + "step": 3588 + }, + { + "epoch": 0.1055058312991592, + "grad_norm": 17.567161560058594, + "learning_rate": 1.0173789043883147e-07, + "loss": 0.4861, + "num_input_tokens_seen": 7526678528, + "step": 3589 + }, + { + "epoch": 0.10577705451586655, + "grad_norm": 14.005763053894043, + "learning_rate": 1.0170262651046687e-07, + "loss": 0.3948, + "num_input_tokens_seen": 7528775680, + "step": 3590 + }, + { + "epoch": 0.10604827773257391, + "grad_norm": 17.03684425354004, + "learning_rate": 1.0166772335476951e-07, + "loss": 0.5062, + "num_input_tokens_seen": 7530872832, + "step": 3591 + }, + { + "epoch": 0.10631950094928126, + "grad_norm": 17.549358367919922, + "learning_rate": 1.0163318099982808e-07, + "loss": 0.4414, + "num_input_tokens_seen": 7532969984, + "step": 3592 + }, + { + "epoch": 0.10659072416598861, + "grad_norm": 16.84699821472168, + "learning_rate": 1.0159899947344094e-07, + "loss": 0.4222, + "num_input_tokens_seen": 7535067136, + "step": 3593 + }, + { + "epoch": 0.10686194738269596, + "grad_norm": 13.90577507019043, + "learning_rate": 1.0156517880311614e-07, + "loss": 0.3399, + "num_input_tokens_seen": 7537164288, + "step": 3594 + }, + { + "epoch": 0.1071331705994033, + "grad_norm": 14.83264446258545, + "learning_rate": 1.0153171901607118e-07, + "loss": 0.3014, + "num_input_tokens_seen": 7539261440, + "step": 3595 + }, + { + "epoch": 0.10740439381611067, + "grad_norm": 21.388154983520508, + "learning_rate": 1.0149862013923329e-07, + "loss": 0.6559, + "num_input_tokens_seen": 7541358592, + "step": 3596 + }, + { + "epoch": 0.10767561703281801, + "grad_norm": 16.71360206604004, + "learning_rate": 1.0146588219923917e-07, + "loss": 0.4451, + "num_input_tokens_seen": 7543455744, + "step": 3597 + }, + { + "epoch": 0.10794684024952536, + "grad_norm": 12.09107780456543, + "learning_rate": 1.0143350522243509e-07, + "loss": 0.3744, + "num_input_tokens_seen": 7545552896, + "step": 3598 + }, + { + "epoch": 0.10821806346623271, + "grad_norm": 16.030540466308594, + "learning_rate": 1.0140148923487675e-07, + "loss": 0.4641, + "num_input_tokens_seen": 7547650048, + "step": 3599 + }, + { + "epoch": 0.10848928668294006, + "grad_norm": 18.602020263671875, + "learning_rate": 1.0136983426232945e-07, + "loss": 0.477, + "num_input_tokens_seen": 7549747200, + "step": 3600 + }, + { + "epoch": 0.1087605098996474, + "grad_norm": 11.135970115661621, + "learning_rate": 1.0133854033026789e-07, + "loss": 0.255, + "num_input_tokens_seen": 7551844352, + "step": 3601 + }, + { + "epoch": 0.10903173311635476, + "grad_norm": 16.609188079833984, + "learning_rate": 1.0130760746387622e-07, + "loss": 0.2572, + "num_input_tokens_seen": 7553941504, + "step": 3602 + }, + { + "epoch": 0.10930295633306211, + "grad_norm": 13.517497062683105, + "learning_rate": 1.0127703568804805e-07, + "loss": 0.3303, + "num_input_tokens_seen": 7556038656, + "step": 3603 + }, + { + "epoch": 0.10957417954976946, + "grad_norm": 16.163911819458008, + "learning_rate": 1.0124682502738638e-07, + "loss": 0.3366, + "num_input_tokens_seen": 7558135808, + "step": 3604 + }, + { + "epoch": 0.10984540276647681, + "grad_norm": 19.574230194091797, + "learning_rate": 1.0121697550620365e-07, + "loss": 0.5456, + "num_input_tokens_seen": 7560232960, + "step": 3605 + }, + { + "epoch": 0.11011662598318415, + "grad_norm": 17.653837203979492, + "learning_rate": 1.0118748714852156e-07, + "loss": 0.4387, + "num_input_tokens_seen": 7562330112, + "step": 3606 + }, + { + "epoch": 0.11038784919989152, + "grad_norm": 13.368509292602539, + "learning_rate": 1.011583599780712e-07, + "loss": 0.1869, + "num_input_tokens_seen": 7564427264, + "step": 3607 + }, + { + "epoch": 0.11065907241659886, + "grad_norm": 21.617624282836914, + "learning_rate": 1.01129594018293e-07, + "loss": 0.6204, + "num_input_tokens_seen": 7566524416, + "step": 3608 + }, + { + "epoch": 0.11093029563330621, + "grad_norm": 17.730438232421875, + "learning_rate": 1.0110118929233682e-07, + "loss": 0.4534, + "num_input_tokens_seen": 7568621568, + "step": 3609 + }, + { + "epoch": 0.11120151885001356, + "grad_norm": 20.859384536743164, + "learning_rate": 1.0107314582306156e-07, + "loss": 0.6147, + "num_input_tokens_seen": 7570718720, + "step": 3610 + }, + { + "epoch": 0.1114727420667209, + "grad_norm": 21.62270164489746, + "learning_rate": 1.0104546363303566e-07, + "loss": 0.5003, + "num_input_tokens_seen": 7572815872, + "step": 3611 + }, + { + "epoch": 0.11174396528342827, + "grad_norm": 22.07628631591797, + "learning_rate": 1.0101814274453661e-07, + "loss": 0.649, + "num_input_tokens_seen": 7574913024, + "step": 3612 + }, + { + "epoch": 0.11201518850013562, + "grad_norm": 19.241708755493164, + "learning_rate": 1.0099118317955127e-07, + "loss": 0.4658, + "num_input_tokens_seen": 7577010176, + "step": 3613 + }, + { + "epoch": 0.11228641171684296, + "grad_norm": 17.93199348449707, + "learning_rate": 1.0096458495977564e-07, + "loss": 0.5632, + "num_input_tokens_seen": 7579107328, + "step": 3614 + }, + { + "epoch": 0.11255763493355031, + "grad_norm": 18.892309188842773, + "learning_rate": 1.0093834810661498e-07, + "loss": 0.3877, + "num_input_tokens_seen": 7581204480, + "step": 3615 + }, + { + "epoch": 0.11282885815025766, + "grad_norm": 14.152824401855469, + "learning_rate": 1.0091247264118372e-07, + "loss": 0.3783, + "num_input_tokens_seen": 7583301632, + "step": 3616 + }, + { + "epoch": 0.11310008136696502, + "grad_norm": 11.564390182495117, + "learning_rate": 1.0088695858430539e-07, + "loss": 0.261, + "num_input_tokens_seen": 7585398784, + "step": 3617 + }, + { + "epoch": 0.11337130458367237, + "grad_norm": 23.13407325744629, + "learning_rate": 1.0086180595651278e-07, + "loss": 0.486, + "num_input_tokens_seen": 7587495936, + "step": 3618 + }, + { + "epoch": 0.11364252780037971, + "grad_norm": 17.028270721435547, + "learning_rate": 1.0083701477804778e-07, + "loss": 0.554, + "num_input_tokens_seen": 7589593088, + "step": 3619 + }, + { + "epoch": 0.11391375101708706, + "grad_norm": 17.975419998168945, + "learning_rate": 1.0081258506886134e-07, + "loss": 0.5359, + "num_input_tokens_seen": 7591690240, + "step": 3620 + }, + { + "epoch": 0.11418497423379441, + "grad_norm": 14.600398063659668, + "learning_rate": 1.0078851684861357e-07, + "loss": 0.3641, + "num_input_tokens_seen": 7593787392, + "step": 3621 + }, + { + "epoch": 0.11445619745050176, + "grad_norm": 14.766218185424805, + "learning_rate": 1.0076481013667376e-07, + "loss": 0.2436, + "num_input_tokens_seen": 7595884544, + "step": 3622 + }, + { + "epoch": 0.11472742066720912, + "grad_norm": 15.354504585266113, + "learning_rate": 1.0074146495212001e-07, + "loss": 0.2984, + "num_input_tokens_seen": 7597981696, + "step": 3623 + }, + { + "epoch": 0.11499864388391647, + "grad_norm": 19.848752975463867, + "learning_rate": 1.0071848131373972e-07, + "loss": 0.7206, + "num_input_tokens_seen": 7600078848, + "step": 3624 + }, + { + "epoch": 0.11526986710062381, + "grad_norm": 13.409833908081055, + "learning_rate": 1.0069585924002924e-07, + "loss": 0.3718, + "num_input_tokens_seen": 7602176000, + "step": 3625 + }, + { + "epoch": 0.11554109031733116, + "grad_norm": 25.440654754638672, + "learning_rate": 1.0067359874919395e-07, + "loss": 0.7582, + "num_input_tokens_seen": 7604273152, + "step": 3626 + }, + { + "epoch": 0.11581231353403851, + "grad_norm": 12.968038558959961, + "learning_rate": 1.0065169985914826e-07, + "loss": 0.2706, + "num_input_tokens_seen": 7606370304, + "step": 3627 + }, + { + "epoch": 0.11608353675074587, + "grad_norm": 21.27517318725586, + "learning_rate": 1.0063016258751553e-07, + "loss": 0.6234, + "num_input_tokens_seen": 7608467456, + "step": 3628 + }, + { + "epoch": 0.11635475996745322, + "grad_norm": 17.9497127532959, + "learning_rate": 1.0060898695162816e-07, + "loss": 0.4209, + "num_input_tokens_seen": 7610564608, + "step": 3629 + }, + { + "epoch": 0.11662598318416056, + "grad_norm": 18.859006881713867, + "learning_rate": 1.005881729685275e-07, + "loss": 0.4628, + "num_input_tokens_seen": 7612661760, + "step": 3630 + }, + { + "epoch": 0.11689720640086791, + "grad_norm": 22.217741012573242, + "learning_rate": 1.0056772065496387e-07, + "loss": 0.8165, + "num_input_tokens_seen": 7614758912, + "step": 3631 + }, + { + "epoch": 0.11716842961757526, + "grad_norm": 22.35140037536621, + "learning_rate": 1.005476300273965e-07, + "loss": 0.6362, + "num_input_tokens_seen": 7616856064, + "step": 3632 + }, + { + "epoch": 0.11743965283428262, + "grad_norm": 16.316476821899414, + "learning_rate": 1.0052790110199348e-07, + "loss": 0.4338, + "num_input_tokens_seen": 7618953216, + "step": 3633 + }, + { + "epoch": 0.11771087605098997, + "grad_norm": 23.569665908813477, + "learning_rate": 1.0050853389463205e-07, + "loss": 0.6887, + "num_input_tokens_seen": 7621050368, + "step": 3634 + }, + { + "epoch": 0.11798209926769732, + "grad_norm": 12.867155075073242, + "learning_rate": 1.0048952842089805e-07, + "loss": 0.3326, + "num_input_tokens_seen": 7623147520, + "step": 3635 + }, + { + "epoch": 0.11825332248440466, + "grad_norm": 14.628716468811035, + "learning_rate": 1.0047088469608648e-07, + "loss": 0.3111, + "num_input_tokens_seen": 7625244672, + "step": 3636 + }, + { + "epoch": 0.11852454570111201, + "grad_norm": 12.584244728088379, + "learning_rate": 1.00452602735201e-07, + "loss": 0.2856, + "num_input_tokens_seen": 7627341824, + "step": 3637 + }, + { + "epoch": 0.11879576891781937, + "grad_norm": 19.87360954284668, + "learning_rate": 1.0043468255295435e-07, + "loss": 0.6433, + "num_input_tokens_seen": 7629438976, + "step": 3638 + }, + { + "epoch": 0.11906699213452672, + "grad_norm": 17.3902530670166, + "learning_rate": 1.0041712416376795e-07, + "loss": 0.5521, + "num_input_tokens_seen": 7631536128, + "step": 3639 + }, + { + "epoch": 0.11933821535123407, + "grad_norm": 13.591824531555176, + "learning_rate": 1.0039992758177211e-07, + "loss": 0.2688, + "num_input_tokens_seen": 7633633280, + "step": 3640 + }, + { + "epoch": 0.11960943856794141, + "grad_norm": 18.85526466369629, + "learning_rate": 1.0038309282080596e-07, + "loss": 0.2729, + "num_input_tokens_seen": 7635730432, + "step": 3641 + }, + { + "epoch": 0.11988066178464876, + "grad_norm": 16.387048721313477, + "learning_rate": 1.0036661989441755e-07, + "loss": 0.5371, + "num_input_tokens_seen": 7637827584, + "step": 3642 + }, + { + "epoch": 0.12015188500135611, + "grad_norm": 16.59734344482422, + "learning_rate": 1.0035050881586364e-07, + "loss": 0.4256, + "num_input_tokens_seen": 7639924736, + "step": 3643 + }, + { + "epoch": 0.12042310821806347, + "grad_norm": 15.033208847045898, + "learning_rate": 1.0033475959810974e-07, + "loss": 0.3089, + "num_input_tokens_seen": 7642021888, + "step": 3644 + }, + { + "epoch": 0.12069433143477082, + "grad_norm": 15.11539077758789, + "learning_rate": 1.0031937225383036e-07, + "loss": 0.4245, + "num_input_tokens_seen": 7644119040, + "step": 3645 + }, + { + "epoch": 0.12096555465147817, + "grad_norm": 13.645261764526367, + "learning_rate": 1.0030434679540853e-07, + "loss": 0.2949, + "num_input_tokens_seen": 7646216192, + "step": 3646 + }, + { + "epoch": 0.12123677786818551, + "grad_norm": 21.710878372192383, + "learning_rate": 1.0028968323493623e-07, + "loss": 0.5613, + "num_input_tokens_seen": 7648313344, + "step": 3647 + }, + { + "epoch": 0.12150800108489286, + "grad_norm": 18.812923431396484, + "learning_rate": 1.0027538158421413e-07, + "loss": 0.6029, + "num_input_tokens_seen": 7650410496, + "step": 3648 + }, + { + "epoch": 0.12177922430160022, + "grad_norm": 18.887786865234375, + "learning_rate": 1.002614418547516e-07, + "loss": 0.6727, + "num_input_tokens_seen": 7652507648, + "step": 3649 + }, + { + "epoch": 0.12205044751830757, + "grad_norm": 19.344097137451172, + "learning_rate": 1.0024786405776686e-07, + "loss": 0.5487, + "num_input_tokens_seen": 7654604800, + "step": 3650 + }, + { + "epoch": 0.12232167073501492, + "grad_norm": 17.132566452026367, + "learning_rate": 1.0023464820418676e-07, + "loss": 0.4229, + "num_input_tokens_seen": 7656701952, + "step": 3651 + }, + { + "epoch": 0.12259289395172226, + "grad_norm": 13.882312774658203, + "learning_rate": 1.00221794304647e-07, + "loss": 0.312, + "num_input_tokens_seen": 7658799104, + "step": 3652 + }, + { + "epoch": 0.12286411716842961, + "grad_norm": 16.40239143371582, + "learning_rate": 1.0020930236949182e-07, + "loss": 0.3204, + "num_input_tokens_seen": 7660896256, + "step": 3653 + }, + { + "epoch": 0.12313534038513697, + "grad_norm": 13.31047248840332, + "learning_rate": 1.0019717240877424e-07, + "loss": 0.277, + "num_input_tokens_seen": 7662993408, + "step": 3654 + }, + { + "epoch": 0.12340656360184432, + "grad_norm": 21.159040451049805, + "learning_rate": 1.001854044322561e-07, + "loss": 0.7039, + "num_input_tokens_seen": 7665090560, + "step": 3655 + }, + { + "epoch": 0.12367778681855167, + "grad_norm": 42.46569061279297, + "learning_rate": 1.0017399844940774e-07, + "loss": 0.4774, + "num_input_tokens_seen": 7667187712, + "step": 3656 + }, + { + "epoch": 0.12394901003525902, + "grad_norm": 16.749025344848633, + "learning_rate": 1.0016295446940827e-07, + "loss": 0.3837, + "num_input_tokens_seen": 7669284864, + "step": 3657 + }, + { + "epoch": 0.12422023325196636, + "grad_norm": 19.79422378540039, + "learning_rate": 1.001522725011455e-07, + "loss": 0.6936, + "num_input_tokens_seen": 7671382016, + "step": 3658 + }, + { + "epoch": 0.12449145646867373, + "grad_norm": 19.400150299072266, + "learning_rate": 1.0014195255321583e-07, + "loss": 0.415, + "num_input_tokens_seen": 7673479168, + "step": 3659 + }, + { + "epoch": 0.12476267968538107, + "grad_norm": 12.765349388122559, + "learning_rate": 1.0013199463392433e-07, + "loss": 0.3181, + "num_input_tokens_seen": 7675576320, + "step": 3660 + }, + { + "epoch": 0.1250339029020884, + "grad_norm": 15.518762588500977, + "learning_rate": 1.0012239875128484e-07, + "loss": 0.2617, + "num_input_tokens_seen": 7677673472, + "step": 3661 + }, + { + "epoch": 0.12530512611879577, + "grad_norm": 18.00017738342285, + "learning_rate": 1.0011316491301973e-07, + "loss": 0.4696, + "num_input_tokens_seen": 7679770624, + "step": 3662 + }, + { + "epoch": 0.12557634933550313, + "grad_norm": 19.197542190551758, + "learning_rate": 1.0010429312656006e-07, + "loss": 0.5292, + "num_input_tokens_seen": 7681867776, + "step": 3663 + }, + { + "epoch": 0.12584757255221046, + "grad_norm": 20.32849884033203, + "learning_rate": 1.000957833990454e-07, + "loss": 0.6401, + "num_input_tokens_seen": 7683964928, + "step": 3664 + }, + { + "epoch": 0.12611879576891782, + "grad_norm": 13.460699081420898, + "learning_rate": 1.0008763573732421e-07, + "loss": 0.2865, + "num_input_tokens_seen": 7686062080, + "step": 3665 + }, + { + "epoch": 0.12639001898562516, + "grad_norm": 16.389801025390625, + "learning_rate": 1.0007985014795331e-07, + "loss": 0.4368, + "num_input_tokens_seen": 7688159232, + "step": 3666 + }, + { + "epoch": 0.12666124220233252, + "grad_norm": 16.63237762451172, + "learning_rate": 1.0007242663719824e-07, + "loss": 0.3929, + "num_input_tokens_seen": 7690256384, + "step": 3667 + }, + { + "epoch": 0.12693246541903988, + "grad_norm": 22.51099967956543, + "learning_rate": 1.0006536521103325e-07, + "loss": 0.5468, + "num_input_tokens_seen": 7692353536, + "step": 3668 + }, + { + "epoch": 0.12720368863574721, + "grad_norm": 21.233259201049805, + "learning_rate": 1.0005866587514106e-07, + "loss": 0.5824, + "num_input_tokens_seen": 7694450688, + "step": 3669 + }, + { + "epoch": 0.12747491185245458, + "grad_norm": 18.1636962890625, + "learning_rate": 1.0005232863491297e-07, + "loss": 0.5925, + "num_input_tokens_seen": 7696547840, + "step": 3670 + }, + { + "epoch": 0.1277461350691619, + "grad_norm": 20.759021759033203, + "learning_rate": 1.0004635349544907e-07, + "loss": 0.5679, + "num_input_tokens_seen": 7698644992, + "step": 3671 + }, + { + "epoch": 0.12801735828586927, + "grad_norm": 16.30278778076172, + "learning_rate": 1.0004074046155789e-07, + "loss": 0.3588, + "num_input_tokens_seen": 7700742144, + "step": 3672 + }, + { + "epoch": 0.12828858150257663, + "grad_norm": 23.158729553222656, + "learning_rate": 1.000354895377565e-07, + "loss": 0.8094, + "num_input_tokens_seen": 7702839296, + "step": 3673 + }, + { + "epoch": 0.12855980471928397, + "grad_norm": 20.49749755859375, + "learning_rate": 1.0003060072827073e-07, + "loss": 0.5029, + "num_input_tokens_seen": 7704936448, + "step": 3674 + }, + { + "epoch": 0.12883102793599133, + "grad_norm": 12.774348258972168, + "learning_rate": 1.0002607403703492e-07, + "loss": 0.253, + "num_input_tokens_seen": 7707033600, + "step": 3675 + }, + { + "epoch": 0.12910225115269866, + "grad_norm": 22.22560691833496, + "learning_rate": 1.000219094676919e-07, + "loss": 0.7074, + "num_input_tokens_seen": 7709130752, + "step": 3676 + }, + { + "epoch": 0.12937347436940602, + "grad_norm": 15.757925033569336, + "learning_rate": 1.0001810702359326e-07, + "loss": 0.3633, + "num_input_tokens_seen": 7711227904, + "step": 3677 + }, + { + "epoch": 0.12964469758611338, + "grad_norm": 14.108972549438477, + "learning_rate": 1.0001466670779896e-07, + "loss": 0.3336, + "num_input_tokens_seen": 7713325056, + "step": 3678 + }, + { + "epoch": 0.12991592080282072, + "grad_norm": 9.747844696044922, + "learning_rate": 1.000115885230777e-07, + "loss": 0.1844, + "num_input_tokens_seen": 7715422208, + "step": 3679 + }, + { + "epoch": 0.13018714401952808, + "grad_norm": 13.866641998291016, + "learning_rate": 1.0000887247190662e-07, + "loss": 0.3654, + "num_input_tokens_seen": 7717519360, + "step": 3680 + }, + { + "epoch": 0.1304583672362354, + "grad_norm": 20.240041732788086, + "learning_rate": 1.000065185564716e-07, + "loss": 0.7339, + "num_input_tokens_seen": 7719616512, + "step": 3681 + }, + { + "epoch": 0.13072959045294277, + "grad_norm": 17.532907485961914, + "learning_rate": 1.0000452677866691e-07, + "loss": 0.52, + "num_input_tokens_seen": 7721713664, + "step": 3682 + }, + { + "epoch": 0.13100081366965013, + "grad_norm": 20.656099319458008, + "learning_rate": 1.0000289714009542e-07, + "loss": 0.5367, + "num_input_tokens_seen": 7723810816, + "step": 3683 + }, + { + "epoch": 0.13127203688635747, + "grad_norm": 19.092506408691406, + "learning_rate": 1.000016296420687e-07, + "loss": 0.4698, + "num_input_tokens_seen": 7725907968, + "step": 3684 + }, + { + "epoch": 0.13154326010306483, + "grad_norm": 19.04652976989746, + "learning_rate": 1.0000072428560674e-07, + "loss": 0.5093, + "num_input_tokens_seen": 7728005120, + "step": 3685 + }, + { + "epoch": 0.13181448331977216, + "grad_norm": 15.921603202819824, + "learning_rate": 1.000001810714381e-07, + "loss": 0.409, + "num_input_tokens_seen": 7730102272, + "step": 3686 + }, + { + "epoch": 0.13208570653647952, + "grad_norm": 14.163284301757812, + "learning_rate": 1e-07, + "loss": 0.3447, + "num_input_tokens_seen": 7732199424, + "step": 3687 + }, + { + "epoch": 0.13208570653647952, + "num_input_tokens_seen": 7732199424, + "step": 3687, + "total_flos": 4.352218066950134e+19, + "train_loss": 0.059379862848258066, + "train_runtime": 15793.1588, + "train_samples_per_second": 0.467, + "train_steps_per_second": 0.233 + } + ], + "logging_steps": 1.0, + "max_steps": 3687, + "num_input_tokens_seen": 7732199424, + "num_train_epochs": 1, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.352218066950134e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}