diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,1360 +2,2699 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.0, + "epoch": 2.0, "eval_steps": 100, - "global_step": 704, + "global_step": 1408, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007106057914372002, - "grad_norm": 48.15382880127834, - "learning_rate": 3.6363636363636366e-06, - "loss": 4.3877, - "mean_token_accuracy": 0.4402003187686205, - "num_tokens": 5475610.0, + "grad_norm": 19.49246232730451, + "learning_rate": 1.86046511627907e-06, + "loss": 2.4934, + "mean_token_accuracy": 0.5258669804781675, + "num_tokens": 4763584.0, "step": 5 }, { "epoch": 0.014212115828744005, - "grad_norm": 14.120135940330632, - "learning_rate": 8.181818181818183e-06, - "loss": 3.149, - "mean_token_accuracy": 0.49750813096761703, - "num_tokens": 10990469.0, + "grad_norm": 12.095302419910517, + "learning_rate": 4.186046511627907e-06, + "loss": 2.3185, + "mean_token_accuracy": 0.5427483215928077, + "num_tokens": 9531720.0, "step": 10 }, { "epoch": 0.021318173743116006, - "grad_norm": 4.9582246488048884, - "learning_rate": 1.2727272727272728e-05, - "loss": 1.6042, - "mean_token_accuracy": 0.6544640183448791, - "num_tokens": 16509954.0, + "grad_norm": 6.796780116822856, + "learning_rate": 6.511627906976745e-06, + "loss": 1.8082, + "mean_token_accuracy": 0.5906881660223007, + "num_tokens": 14272673.0, "step": 15 }, { "epoch": 0.02842423165748801, - "grad_norm": 2.3061348987678967, - "learning_rate": 1.7272727272727274e-05, - "loss": 1.0542, - "mean_token_accuracy": 0.7447574809193611, - "num_tokens": 22025438.0, + "grad_norm": 2.028130586531004, + "learning_rate": 8.837209302325582e-06, + "loss": 1.4468, + "mean_token_accuracy": 0.639113237708807, + "num_tokens": 19045659.0, "step": 20 }, { "epoch": 0.03553028957186001, - "grad_norm": 3.4964571027817337, - "learning_rate": 1.999961805535155e-05, - "loss": 0.8969, - "mean_token_accuracy": 0.7685870260000229, - "num_tokens": 27537030.0, + "grad_norm": 1.0588899013746833, + "learning_rate": 1.116279069767442e-05, + "loss": 1.2276, + "mean_token_accuracy": 0.6780799143016338, + "num_tokens": 23811446.0, "step": 25 }, { "epoch": 0.04263634748623201, - "grad_norm": 3.3071652925849335, - "learning_rate": 1.9995321550350065e-05, - "loss": 0.8074, - "mean_token_accuracy": 0.7807081520557404, - "num_tokens": 33069854.0, + "grad_norm": 0.6488648360367262, + "learning_rate": 1.3488372093023257e-05, + "loss": 1.088, + "mean_token_accuracy": 0.7027405865490437, + "num_tokens": 28572862.0, "step": 30 }, { "epoch": 0.04974240540060401, - "grad_norm": 3.924372429122354, - "learning_rate": 1.998625339625423e-05, - "loss": 0.7719, - "mean_token_accuracy": 0.783011856675148, - "num_tokens": 38592423.0, + "grad_norm": 0.5415057756218988, + "learning_rate": 1.5813953488372095e-05, + "loss": 1.0242, + "mean_token_accuracy": 0.7144973143935204, + "num_tokens": 33349882.0, "step": 35 }, { "epoch": 0.05684846331497602, - "grad_norm": 0.9496737705645568, - "learning_rate": 1.9972418403347817e-05, - "loss": 0.7245, - "mean_token_accuracy": 0.7904581762850285, - "num_tokens": 44102692.0, + "grad_norm": 0.48382195510047, + "learning_rate": 1.813953488372093e-05, + "loss": 0.9712, + "mean_token_accuracy": 0.7250196196138858, + "num_tokens": 38137624.0, "step": 40 }, { "epoch": 0.06395452122934801, - "grad_norm": 0.4437647446141135, - "learning_rate": 1.9953823910527057e-05, - "loss": 0.6875, - "mean_token_accuracy": 0.8015773832798004, - "num_tokens": 49606158.0, + "grad_norm": 0.49727864530874677, + "learning_rate": 1.99999761632652e-05, + "loss": 0.9247, + "mean_token_accuracy": 0.7344170436263084, + "num_tokens": 42901997.0, "step": 45 }, { "epoch": 0.07106057914372002, - "grad_norm": 0.36104953638366394, - "learning_rate": 1.993047978140764e-05, - "loss": 0.6681, - "mean_token_accuracy": 0.8048019059002399, - "num_tokens": 55124799.0, + "grad_norm": 0.4396836808821335, + "learning_rate": 1.999914189080485e-05, + "loss": 0.8892, + "mean_token_accuracy": 0.7411722339689731, + "num_tokens": 47660479.0, "step": 50 }, { "epoch": 0.07816663705809203, - "grad_norm": 0.3451124617137935, - "learning_rate": 1.9902398399092494e-05, - "loss": 0.6356, - "mean_token_accuracy": 0.8128186449408531, - "num_tokens": 60636117.0, + "grad_norm": 0.4277947889677102, + "learning_rate": 1.9997115907865857e-05, + "loss": 0.8745, + "mean_token_accuracy": 0.7443610817193985, + "num_tokens": 52437558.0, "step": 55 }, { "epoch": 0.08527269497246402, - "grad_norm": 0.389660261004574, - "learning_rate": 1.9869594659603032e-05, - "loss": 0.6398, - "mean_token_accuracy": 0.8115979641675949, - "num_tokens": 66154262.0, + "grad_norm": 0.4233524880605398, + "learning_rate": 1.999389848273882e-05, + "loss": 0.8603, + "mean_token_accuracy": 0.747463022172451, + "num_tokens": 57198957.0, "step": 60 }, { "epoch": 0.09237875288683603, - "grad_norm": 0.31317999931398266, - "learning_rate": 1.9832085963977445e-05, - "loss": 0.6337, - "mean_token_accuracy": 0.8128221824765205, - "num_tokens": 71679661.0, + "grad_norm": 0.409057413495153, + "learning_rate": 1.998949004149094e-05, + "loss": 0.8537, + "mean_token_accuracy": 0.7483336836099624, + "num_tokens": 61961974.0, "step": 65 }, { "epoch": 0.09948481080120802, - "grad_norm": 0.3128254105868865, - "learning_rate": 1.978989220904016e-05, - "loss": 0.6227, - "mean_token_accuracy": 0.8144343480467796, - "num_tokens": 77204132.0, + "grad_norm": 0.46927095285141973, + "learning_rate": 1.9983891167909617e-05, + "loss": 0.8375, + "mean_token_accuracy": 0.7526809796690941, + "num_tokens": 66725536.0, "step": 70 }, { "epoch": 0.10659086871558003, - "grad_norm": 0.3000404040639256, - "learning_rate": 1.9743035776847377e-05, - "loss": 0.6166, - "mean_token_accuracy": 0.8157493658363819, - "num_tokens": 82747984.0, + "grad_norm": 0.4106777573300543, + "learning_rate": 1.9977102603425134e-05, + "loss": 0.8309, + "mean_token_accuracy": 0.7542230375111103, + "num_tokens": 71469095.0, "step": 75 }, { "epoch": 0.11369692662995204, - "grad_norm": 0.3139627449464346, - "learning_rate": 1.9691541522814327e-05, - "loss": 0.5988, - "mean_token_accuracy": 0.8197014890611172, - "num_tokens": 88267430.0, + "grad_norm": 0.40584561150068, + "learning_rate": 1.996912524701247e-05, + "loss": 0.8258, + "mean_token_accuracy": 0.7558744698762894, + "num_tokens": 76221430.0, "step": 80 }, { "epoch": 0.12080298454432403, - "grad_norm": 0.3039876079367156, - "learning_rate": 1.963543676253048e-05, - "loss": 0.6051, - "mean_token_accuracy": 0.8184983253479003, - "num_tokens": 93790180.0, + "grad_norm": 0.4436779763048366, + "learning_rate": 1.995996015507227e-05, + "loss": 0.8152, + "mean_token_accuracy": 0.7582607261836529, + "num_tokens": 80991235.0, "step": 85 }, { "epoch": 0.12790904245869603, - "grad_norm": 0.3409273981011121, - "learning_rate": 1.9574751257269748e-05, - "loss": 0.5978, - "mean_token_accuracy": 0.8195111580193043, - "num_tokens": 99314107.0, + "grad_norm": 0.4011663933394793, + "learning_rate": 1.9949608541290924e-05, + "loss": 0.8128, + "mean_token_accuracy": 0.7592827767133713, + "num_tokens": 85760262.0, "step": 90 }, { "epoch": 0.13501510037306805, - "grad_norm": 0.2958717785828003, - "learning_rate": 1.950951719820335e-05, - "loss": 0.5902, - "mean_token_accuracy": 0.8219340682029724, - "num_tokens": 104809146.0, + "grad_norm": 0.4484975750242541, + "learning_rate": 1.9938071776479875e-05, + "loss": 0.8015, + "mean_token_accuracy": 0.7621250681579113, + "num_tokens": 90505979.0, "step": 95 }, { "epoch": 0.14212115828744004, - "grad_norm": 0.2994112787306848, - "learning_rate": 1.9439769189323727e-05, - "loss": 0.5989, - "mean_token_accuracy": 0.820228873193264, - "num_tokens": 110323602.0, + "grad_norm": 0.3973118091711915, + "learning_rate": 1.992535138839406e-05, + "loss": 0.7956, + "mean_token_accuracy": 0.7619979940354824, + "num_tokens": 95259341.0, "step": 100 }, { "epoch": 0.14212115828744004, - "eval_loss": 0.5710137486457825, - "eval_mean_token_accuracy": 0.8228120437839574, - "eval_num_tokens": 110323602.0, - "eval_runtime": 160.5102, - "eval_samples_per_second": 22.671, - "eval_steps_per_second": 0.71, + "eval_loss": 0.7710759043693542, + "eval_mean_token_accuracy": 0.7617696215186203, + "eval_num_tokens": 95259341.0, + "eval_runtime": 149.4719, + "eval_samples_per_second": 24.346, + "eval_steps_per_second": 0.763, "step": 100 }, { "epoch": 0.14922721620181204, - "grad_norm": 0.30064641734602476, - "learning_rate": 1.9365544229088517e-05, - "loss": 0.5944, - "mean_token_accuracy": 0.8212312825024128, - "num_tokens": 115858100.0, + "grad_norm": 0.4055637275946486, + "learning_rate": 1.991144906152962e-05, + "loss": 0.804, + "mean_token_accuracy": 0.7599063582718373, + "num_tokens": 100023224.0, "step": 105 }, { "epoch": 0.15633327411618406, - "grad_norm": 0.2894891012090971, - "learning_rate": 1.9286881690794425e-05, - "loss": 0.5832, - "mean_token_accuracy": 0.8244290247559547, - "num_tokens": 121393955.0, + "grad_norm": 0.5425556091455714, + "learning_rate": 1.9896366636900826e-05, + "loss": 0.7951, + "mean_token_accuracy": 0.7621362045407295, + "num_tokens": 104771415.0, "step": 110 }, { "epoch": 0.16343933203055605, - "grad_norm": 0.3177791926774768, - "learning_rate": 1.9203823301691272e-05, - "loss": 0.5946, - "mean_token_accuracy": 0.8195516988635063, - "num_tokens": 126941522.0, + "grad_norm": 0.4581521538010428, + "learning_rate": 1.9880106111796266e-05, + "loss": 0.7903, + "mean_token_accuracy": 0.7619842484593391, + "num_tokens": 109537100.0, "step": 115 }, { "epoch": 0.17054538994492804, - "grad_norm": 0.3271845261422823, - "learning_rate": 1.9116413120847425e-05, - "loss": 0.5853, - "mean_token_accuracy": 0.823321682959795, - "num_tokens": 132467661.0, + "grad_norm": 0.42123272880797796, + "learning_rate": 1.9862669639514382e-05, + "loss": 0.7886, + "mean_token_accuracy": 0.7638748176395893, + "num_tokens": 114304125.0, "step": 120 }, { "epoch": 0.17765144785930007, - "grad_norm": 0.30578663346709106, - "learning_rate": 1.902469751577826e-05, - "loss": 0.5787, - "mean_token_accuracy": 0.8247792065143585, - "num_tokens": 137980426.0, + "grad_norm": 0.3957407600605023, + "learning_rate": 1.9844059529078297e-05, + "loss": 0.7763, + "mean_token_accuracy": 0.7664125673472881, + "num_tokens": 119067772.0, "step": 125 }, { "epoch": 0.18475750577367206, - "grad_norm": 0.28728878535567604, - "learning_rate": 1.892872513785008e-05, - "loss": 0.568, - "mean_token_accuracy": 0.8274045430123806, - "num_tokens": 143490685.0, + "grad_norm": 1.1428359800811156, + "learning_rate": 1.9824278244930052e-05, + "loss": 0.7736, + "mean_token_accuracy": 0.7676285386085511, + "num_tokens": 123805148.0, "step": 130 }, { "epoch": 0.19186356368804405, - "grad_norm": 0.3081153281219037, - "learning_rate": 1.88285468964726e-05, - "loss": 0.5728, - "mean_token_accuracy": 0.8262367367744445, - "num_tokens": 149018034.0, + "grad_norm": 0.4240204964738557, + "learning_rate": 1.9803328406604252e-05, + "loss": 0.7841, + "mean_token_accuracy": 0.763801097869873, + "num_tokens": 128568367.0, "step": 135 }, { "epoch": 0.19896962160241605, - "grad_norm": 0.27248779720052707, - "learning_rate": 1.872421593209355e-05, - "loss": 0.5679, - "mean_token_accuracy": 0.8270156674087048, - "num_tokens": 154549711.0, + "grad_norm": 0.41090851408734014, + "learning_rate": 1.9781212788381177e-05, + "loss": 0.7819, + "mean_token_accuracy": 0.7644710555672646, + "num_tokens": 133343344.0, "step": 140 }, { "epoch": 0.20607567951678807, - "grad_norm": 0.2876029562240766, - "learning_rate": 1.861578758800989e-05, - "loss": 0.5739, - "mean_token_accuracy": 0.825926473736763, - "num_tokens": 160056919.0, + "grad_norm": 0.38627786124116875, + "learning_rate": 1.9757934318919386e-05, + "loss": 0.7586, + "mean_token_accuracy": 0.7707050330936909, + "num_tokens": 138078681.0, "step": 145 }, { "epoch": 0.21318173743116006, - "grad_norm": 0.2592081911074665, - "learning_rate": 1.8503319381010414e-05, - "loss": 0.5688, - "mean_token_accuracy": 0.826562087237835, - "num_tokens": 165584428.0, + "grad_norm": 0.35413336695126674, + "learning_rate": 1.973349608086791e-05, + "loss": 0.7579, + "mean_token_accuracy": 0.7715410716831684, + "num_tokens": 142812366.0, "step": 150 }, { "epoch": 0.22028779534553206, - "grad_norm": 0.2921770757113811, - "learning_rate": 1.8386870970865488e-05, - "loss": 0.5615, - "mean_token_accuracy": 0.8287016794085502, - "num_tokens": 171107691.0, + "grad_norm": 0.44320259589774924, + "learning_rate": 1.9707901310458017e-05, + "loss": 0.7649, + "mean_token_accuracy": 0.7688324272632598, + "num_tokens": 147567508.0, "step": 155 }, { "epoch": 0.22739385325990408, - "grad_norm": 0.25717865089543174, - "learning_rate": 1.8266504128679988e-05, - "loss": 0.5619, - "mean_token_accuracy": 0.8289386563003063, - "num_tokens": 176626158.0, + "grad_norm": 0.4272220313623565, + "learning_rate": 1.9681153397074658e-05, + "loss": 0.779, + "mean_token_accuracy": 0.7649676457047463, + "num_tokens": 152348975.0, "step": 160 }, { "epoch": 0.23449991117427607, - "grad_norm": 0.27820799246988237, - "learning_rate": 1.814228270412624e-05, - "loss": 0.5772, - "mean_token_accuracy": 0.8250645868480205, - "num_tokens": 182151744.0, + "grad_norm": 0.36879913627107413, + "learning_rate": 1.9653255882807625e-05, + "loss": 0.7547, + "mean_token_accuracy": 0.7709379114210606, + "num_tokens": 157094616.0, "step": 165 }, { "epoch": 0.24160596908864806, - "grad_norm": 0.2821779366904109, - "learning_rate": 1.8014272591574405e-05, - "loss": 0.5707, - "mean_token_accuracy": 0.8270999036729336, - "num_tokens": 187670198.0, + "grad_norm": 0.39000799041219736, + "learning_rate": 1.9624212461982497e-05, + "loss": 0.7594, + "mean_token_accuracy": 0.7707360699772835, + "num_tokens": 161849805.0, "step": 170 }, { "epoch": 0.2487120270030201, - "grad_norm": 0.3275959246759839, - "learning_rate": 1.7882541695138224e-05, - "loss": 0.5579, - "mean_token_accuracy": 0.8302620410919189, - "num_tokens": 193189696.0, + "grad_norm": 0.37089862448174604, + "learning_rate": 1.9594026980671423e-05, + "loss": 0.7555, + "mean_token_accuracy": 0.7713063634932041, + "num_tokens": 166609253.0, "step": 175 }, { "epoch": 0.25581808491739205, - "grad_norm": 0.29306437619143894, - "learning_rate": 1.7747159892654646e-05, - "loss": 0.5564, - "mean_token_accuracy": 0.8296913146972656, - "num_tokens": 198706595.0, + "grad_norm": 0.3895218613587052, + "learning_rate": 1.9562703436183783e-05, + "loss": 0.7641, + "mean_token_accuracy": 0.7704252451658249, + "num_tokens": 171374935.0, "step": 180 }, { "epoch": 0.2629241428317641, - "grad_norm": 0.2977278839411146, - "learning_rate": 1.7608198998616533e-05, - "loss": 0.5635, - "mean_token_accuracy": 0.8275744579732418, - "num_tokens": 204263565.0, + "grad_norm": 0.41062410007633293, + "learning_rate": 1.953024597653688e-05, + "loss": 0.7549, + "mean_token_accuracy": 0.771364139765501, + "num_tokens": 176152999.0, "step": 185 }, { "epoch": 0.2700302007461361, - "grad_norm": 0.3317727367369204, - "learning_rate": 1.7465732726077993e-05, - "loss": 0.559, - "mean_token_accuracy": 0.8284266702830791, - "num_tokens": 209753822.0, + "grad_norm": 0.41614535212567993, + "learning_rate": 1.9496658899906605e-05, + "loss": 0.7479, + "mean_token_accuracy": 0.7711076475679874, + "num_tokens": 180913997.0, "step": 190 }, { "epoch": 0.27713625866050806, - "grad_norm": 0.27234470597159066, - "learning_rate": 1.731983664755264e-05, - "loss": 0.5613, - "mean_token_accuracy": 0.8291354134678841, - "num_tokens": 215275472.0, + "grad_norm": 0.3927226802114006, + "learning_rate": 1.946194665405828e-05, + "loss": 0.7563, + "mean_token_accuracy": 0.7716620303690434, + "num_tokens": 185674484.0, "step": 195 }, { "epoch": 0.2842423165748801, - "grad_norm": 0.26544343779539864, - "learning_rate": 1.717058815492548e-05, - "loss": 0.5613, - "mean_token_accuracy": 0.8280082412064076, - "num_tokens": 220789450.0, + "grad_norm": 0.3794392681437391, + "learning_rate": 1.9426113835757637e-05, + "loss": 0.7537, + "mean_token_accuracy": 0.7706545531749726, + "num_tokens": 190438181.0, "step": 200 }, { "epoch": 0.2842423165748801, - "eval_loss": 0.5400242805480957, - "eval_mean_token_accuracy": 0.8298156413069943, - "eval_num_tokens": 220789450.0, - "eval_runtime": 161.1614, - "eval_samples_per_second": 22.58, - "eval_steps_per_second": 0.707, + "eval_loss": 0.7250556349754333, + "eval_mean_token_accuracy": 0.7720574731366676, + "eval_num_tokens": 190438181.0, + "eval_runtime": 149.9052, + "eval_samples_per_second": 24.275, + "eval_steps_per_second": 0.76, "step": 200 }, { "epoch": 0.2913483744892521, - "grad_norm": 0.2813762656625426, - "learning_rate": 1.701806641839967e-05, - "loss": 0.5615, - "mean_token_accuracy": 0.8296020865440369, - "num_tokens": 226318335.0, + "grad_norm": 0.40984266753765214, + "learning_rate": 1.9389165190162114e-05, + "loss": 0.753, + "mean_token_accuracy": 0.7713685400784016, + "num_tokens": 195189498.0, "step": 205 }, { "epoch": 0.29845443240362407, - "grad_norm": 0.2738481182825542, - "learning_rate": 1.6862352344500004e-05, - "loss": 0.5604, - "mean_token_accuracy": 0.8292979046702385, - "num_tokens": 231830330.0, + "grad_norm": 0.39761042206791, + "learning_rate": 1.935110561019246e-05, + "loss": 0.7424, + "mean_token_accuracy": 0.7745413303375244, + "num_tokens": 199953059.0, "step": 210 }, { "epoch": 0.3055604903179961, - "grad_norm": 0.2724828799767463, - "learning_rate": 1.6703528533155283e-05, - "loss": 0.5555, - "mean_token_accuracy": 0.8297731988132, - "num_tokens": 237344871.0, + "grad_norm": 0.37462688474606465, + "learning_rate": 1.931194013588481e-05, + "loss": 0.7504, + "mean_token_accuracy": 0.7734048135578633, + "num_tokens": 204732059.0, "step": 215 }, { "epoch": 0.3126665482323681, - "grad_norm": 0.26306657580185255, - "learning_rate": 1.6541679233882477e-05, - "loss": 0.5516, - "mean_token_accuracy": 0.8313047230243683, - "num_tokens": 242868508.0, + "grad_norm": 0.39514621265905453, + "learning_rate": 1.927167395372324e-05, + "loss": 0.746, + "mean_token_accuracy": 0.7722579926252365, + "num_tokens": 209507798.0, "step": 220 }, { "epoch": 0.3197726061467401, - "grad_norm": 0.2918451339288788, - "learning_rate": 1.63768903010958e-05, - "loss": 0.5549, - "mean_token_accuracy": 0.8298390731215477, - "num_tokens": 248407721.0, + "grad_norm": 0.3734047512754032, + "learning_rate": 1.9230312395952955e-05, + "loss": 0.7444, + "mean_token_accuracy": 0.7728776805102825, + "num_tokens": 214261827.0, "step": 225 }, { "epoch": 0.3268786640611121, - "grad_norm": 0.2778992598258234, - "learning_rate": 1.6209249148564437e-05, - "loss": 0.5503, - "mean_token_accuracy": 0.831806804984808, - "num_tokens": 253911671.0, + "grad_norm": 0.39160079857564095, + "learning_rate": 1.9187860939874176e-05, + "loss": 0.7509, + "mean_token_accuracy": 0.771727342903614, + "num_tokens": 219027585.0, "step": 230 }, { "epoch": 0.3339847219754841, - "grad_norm": 0.29816424123268476, - "learning_rate": 1.603884470304318e-05, - "loss": 0.5625, - "mean_token_accuracy": 0.8286240585148335, - "num_tokens": 259442838.0, + "grad_norm": 0.38214353202928264, + "learning_rate": 1.9144325207116785e-05, + "loss": 0.7388, + "mean_token_accuracy": 0.7766963444650173, + "num_tokens": 223775141.0, "step": 235 }, { "epoch": 0.3410907798898561, - "grad_norm": 0.3434968878817897, - "learning_rate": 1.5865767357100383e-05, - "loss": 0.5454, - "mean_token_accuracy": 0.8329082369804383, - "num_tokens": 264975290.0, + "grad_norm": 0.40133338005632113, + "learning_rate": 1.909971096289591e-05, + "loss": 0.7454, + "mean_token_accuracy": 0.7735047489404678, + "num_tokens": 228541180.0, "step": 240 }, { "epoch": 0.3481968378042281, - "grad_norm": 0.28342056849119923, - "learning_rate": 1.5690108921168428e-05, - "loss": 0.5509, - "mean_token_accuracy": 0.831651521474123, - "num_tokens": 270498217.0, + "grad_norm": 0.4094942578953397, + "learning_rate": 1.9054024115248448e-05, + "loss": 0.7401, + "mean_token_accuracy": 0.7752777233719825, + "num_tokens": 233303417.0, "step": 245 }, { "epoch": 0.35530289571860013, - "grad_norm": 0.2801391925223262, - "learning_rate": 1.5511962574842073e-05, - "loss": 0.5506, - "mean_token_accuracy": 0.8316259779036045, - "num_tokens": 276014342.0, + "grad_norm": 0.37342705131252163, + "learning_rate": 1.90072707142507e-05, + "loss": 0.746, + "mean_token_accuracy": 0.7732348993420601, + "num_tokens": 238086815.0, "step": 250 }, { "epoch": 0.3624089536329721, - "grad_norm": 0.275450732541475, - "learning_rate": 1.5331422817450485e-05, - "loss": 0.5533, - "mean_token_accuracy": 0.830129113048315, - "num_tokens": 281549490.0, + "grad_norm": 0.37950791393566236, + "learning_rate": 1.8959456951217187e-05, + "loss": 0.7324, + "mean_token_accuracy": 0.7766066655516625, + "num_tokens": 242856686.0, "step": 255 }, { "epoch": 0.3695150115473441, - "grad_norm": 0.30780371017473934, - "learning_rate": 1.5148585417929212e-05, - "loss": 0.5497, - "mean_token_accuracy": 0.8321508087217808, - "num_tokens": 287067865.0, + "grad_norm": 0.36431061688841127, + "learning_rate": 1.8910589157880766e-05, + "loss": 0.7389, + "mean_token_accuracy": 0.7757058747112751, + "num_tokens": 247606311.0, "step": 260 }, { "epoch": 0.37662106946171614, - "grad_norm": 0.32141139168463606, - "learning_rate": 1.4963547364018711e-05, - "loss": 0.546, - "mean_token_accuracy": 0.8327535085380078, - "num_tokens": 292575698.0, + "grad_norm": 0.4264408595860345, + "learning_rate": 1.8860673805554167e-05, + "loss": 0.74, + "mean_token_accuracy": 0.7750592313706874, + "num_tokens": 252376279.0, "step": 265 }, { "epoch": 0.3837271273760881, - "grad_norm": 0.2941090649445171, - "learning_rate": 1.477640681081632e-05, - "loss": 0.5495, - "mean_token_accuracy": 0.8316998913884163, - "num_tokens": 298104461.0, + "grad_norm": 0.3649269356510504, + "learning_rate": 1.8809717504273e-05, + "loss": 0.7294, + "mean_token_accuracy": 0.7773133426904678, + "num_tokens": 257157622.0, "step": 270 }, { "epoch": 0.39083318529046013, - "grad_norm": 0.3156113844180264, - "learning_rate": 1.4587263028709013e-05, - "loss": 0.546, - "mean_token_accuracy": 0.8325068384408951, - "num_tokens": 303615295.0, + "grad_norm": 0.4595703905257225, + "learning_rate": 1.8757727001920446e-05, + "loss": 0.7376, + "mean_token_accuracy": 0.7763620682060719, + "num_tokens": 261918809.0, "step": 275 }, { "epoch": 0.3979392432048321, - "grad_norm": 0.27848176976199285, - "learning_rate": 1.4396216350714512e-05, - "loss": 0.5473, - "mean_token_accuracy": 0.8326819702982903, - "num_tokens": 309131939.0, + "grad_norm": 0.5280527224785482, + "learning_rate": 1.8704709183333653e-05, + "loss": 0.7329, + "mean_token_accuracy": 0.7755794525146484, + "num_tokens": 266684284.0, "step": 280 }, { "epoch": 0.4050453011192041, - "grad_norm": 0.28930577025792253, - "learning_rate": 1.4203368119258759e-05, - "loss": 0.5433, - "mean_token_accuracy": 0.8328328810632228, - "num_tokens": 314661318.0, + "grad_norm": 0.37695307187112215, + "learning_rate": 1.8650671069392034e-05, + "loss": 0.7331, + "mean_token_accuracy": 0.776630100607872, + "num_tokens": 271450936.0, "step": 285 }, { "epoch": 0.41215135903357614, - "grad_norm": 0.3660686775923689, - "learning_rate": 1.4008820632417906e-05, - "loss": 0.5401, - "mean_token_accuracy": 0.8341327331960201, - "num_tokens": 320185466.0, + "grad_norm": 0.4776029246227445, + "learning_rate": 1.85956198160875e-05, + "loss": 0.7262, + "mean_token_accuracy": 0.7792015597224236, + "num_tokens": 276196347.0, "step": 290 }, { "epoch": 0.4192574169479481, - "grad_norm": 0.30653539266099966, - "learning_rate": 1.381267708965339e-05, - "loss": 0.5437, - "mean_token_accuracy": 0.8333369344472885, - "num_tokens": 325709519.0, + "grad_norm": 0.40416881485925626, + "learning_rate": 1.853956271357685e-05, + "loss": 0.7207, + "mean_token_accuracy": 0.7793174132704734, + "num_tokens": 280956754.0, "step": 295 }, { "epoch": 0.4263634748623201, - "grad_norm": 0.2716446207789635, - "learning_rate": 1.3615041537068831e-05, - "loss": 0.5515, - "mean_token_accuracy": 0.8306186564266682, - "num_tokens": 331248121.0, + "grad_norm": 0.36640042813026624, + "learning_rate": 1.8482507185216365e-05, + "loss": 0.7417, + "mean_token_accuracy": 0.7740650460124016, + "num_tokens": 285730218.0, "step": 300 }, { "epoch": 0.4263634748623201, - "eval_loss": 0.5254271030426025, - "eval_mean_token_accuracy": 0.8335580465040708, - "eval_num_tokens": 331248121.0, - "eval_runtime": 160.8696, - "eval_samples_per_second": 22.621, - "eval_steps_per_second": 0.709, + "eval_loss": 0.7050633430480957, + "eval_mean_token_accuracy": 0.7762745759989086, + "eval_num_tokens": 285730218.0, + "eval_runtime": 149.2344, + "eval_samples_per_second": 24.384, + "eval_steps_per_second": 0.764, "step": 300 }, { "epoch": 0.43346953277669215, - "grad_norm": 0.27817622720791757, - "learning_rate": 1.3416018812217866e-05, - "loss": 0.5495, - "mean_token_accuracy": 0.8312446601688862, - "num_tokens": 336772706.0, + "grad_norm": 0.39913292298186365, + "learning_rate": 1.842446078657877e-05, + "loss": 0.7328, + "mean_token_accuracy": 0.7760866671800614, + "num_tokens": 290497109.0, "step": 305 }, { "epoch": 0.4405755906910641, - "grad_norm": 0.2770013355984933, - "learning_rate": 1.3215714488492121e-05, - "loss": 0.535, - "mean_token_accuracy": 0.8355906143784523, - "num_tokens": 342302933.0, + "grad_norm": 1.1471762547137223, + "learning_rate": 1.8365431204452683e-05, + "loss": 0.7364, + "mean_token_accuracy": 0.7759052954614163, + "num_tokens": 295276239.0, "step": 310 }, { "epoch": 0.44768164860543613, - "grad_norm": 0.2774944414170579, - "learning_rate": 1.3014234819118846e-05, - "loss": 0.533, - "mean_token_accuracy": 0.8360061995685101, - "num_tokens": 347832122.0, + "grad_norm": 0.37744675876656947, + "learning_rate": 1.8305426255824713e-05, + "loss": 0.7317, + "mean_token_accuracy": 0.7751947946846485, + "num_tokens": 300042876.0, "step": 315 }, { "epoch": 0.45478770651980815, - "grad_norm": 0.2822504430152847, - "learning_rate": 1.2811686680797942e-05, - "loss": 0.5464, - "mean_token_accuracy": 0.8318669557571411, - "num_tokens": 353357495.0, + "grad_norm": 0.47691506104424974, + "learning_rate": 1.824445388684426e-05, + "loss": 0.7277, + "mean_token_accuracy": 0.7777360931038857, + "num_tokens": 304798068.0, "step": 320 }, { "epoch": 0.4618937644341801, - "grad_norm": 0.25912651097009326, - "learning_rate": 1.2608177517008268e-05, - "loss": 0.5374, - "mean_token_accuracy": 0.8345673441886902, - "num_tokens": 358876427.0, + "grad_norm": 0.3588696331537949, + "learning_rate": 1.8182522171771293e-05, + "loss": 0.726, + "mean_token_accuracy": 0.7783321216702461, + "num_tokens": 309546102.0, "step": 325 }, { "epoch": 0.46899982234855214, - "grad_norm": 0.2899391004734366, - "learning_rate": 1.240381528101327e-05, - "loss": 0.5315, - "mean_token_accuracy": 0.8358384124934674, - "num_tokens": 364395480.0, + "grad_norm": 0.4952808278830928, + "learning_rate": 1.8119639311907074e-05, + "loss": 0.738, + "mean_token_accuracy": 0.7744948998093605, + "num_tokens": 314307721.0, "step": 330 }, { "epoch": 0.47610588026292416, - "grad_norm": 0.2885199318810423, - "learning_rate": 1.2198708378596198e-05, - "loss": 0.5276, - "mean_token_accuracy": 0.8363832160830498, - "num_tokens": 369904006.0, + "grad_norm": 0.37908339080858117, + "learning_rate": 1.805581363450813e-05, + "loss": 0.7309, + "mean_token_accuracy": 0.7773234643042087, + "num_tokens": 319073411.0, "step": 335 }, { "epoch": 0.48321193817729613, - "grad_norm": 0.2803340014077296, - "learning_rate": 1.19929656105553e-05, - "loss": 0.5313, - "mean_token_accuracy": 0.8354136534035206, - "num_tokens": 375415817.0, + "grad_norm": 0.3852822861687584, + "learning_rate": 1.7991053591683508e-05, + "loss": 0.731, + "mean_token_accuracy": 0.7765265628695488, + "num_tokens": 323831947.0, "step": 340 }, { "epoch": 0.49031799609166815, - "grad_norm": 0.2841127075165474, - "learning_rate": 1.1786696114989455e-05, - "loss": 0.5324, - "mean_token_accuracy": 0.8360840916633606, - "num_tokens": 380931196.0, + "grad_norm": 0.36165483826370953, + "learning_rate": 1.7925367759275495e-05, + "loss": 0.7232, + "mean_token_accuracy": 0.7792682178318501, + "num_tokens": 328590613.0, "step": 345 }, { "epoch": 0.4974240540060402, - "grad_norm": 0.27070169414461726, - "learning_rate": 1.1580009309404887e-05, - "loss": 0.5339, - "mean_token_accuracy": 0.8355580635368824, - "num_tokens": 386462024.0, + "grad_norm": 0.4313963783745435, + "learning_rate": 1.7858764835723984e-05, + "loss": 0.7247, + "mean_token_accuracy": 0.7771173417568207, + "num_tokens": 333348383.0, "step": 350 }, { "epoch": 0.5045301119204122, - "grad_norm": 0.27595636054491235, - "learning_rate": 1.1373014832673661e-05, - "loss": 0.5363, - "mean_token_accuracy": 0.8354052670300007, - "num_tokens": 391971484.0, + "grad_norm": 0.4013920878913357, + "learning_rate": 1.7791253640914566e-05, + "loss": 0.7236, + "mean_token_accuracy": 0.778332532197237, + "num_tokens": 338109943.0, "step": 355 }, { "epoch": 0.5116361698347841, - "grad_norm": 0.2526139742391355, - "learning_rate": 1.1165822486874773e-05, - "loss": 0.5288, - "mean_token_accuracy": 0.8361794017255306, - "num_tokens": 397503191.0, + "grad_norm": 0.4057100730404507, + "learning_rate": 1.7722843115010564e-05, + "loss": 0.7221, + "mean_token_accuracy": 0.7787548579275608, + "num_tokens": 342887490.0, "step": 360 }, { "epoch": 0.5187422277491561, - "grad_norm": 0.28153871306210954, - "learning_rate": 1.0958542179048637e-05, - "loss": 0.5307, - "mean_token_accuracy": 0.8368273265659809, - "num_tokens": 403001146.0, + "grad_norm": 0.3675393081924701, + "learning_rate": 1.7653542317269134e-05, + "loss": 0.7171, + "mean_token_accuracy": 0.7794929854571819, + "num_tokens": 347628813.0, "step": 365 }, { "epoch": 0.5258482856635281, - "grad_norm": 0.3145297412780038, - "learning_rate": 1.0751283862895914e-05, - "loss": 0.5422, - "mean_token_accuracy": 0.8336515955626964, - "num_tokens": 408526858.0, + "grad_norm": 0.3635771034527727, + "learning_rate": 1.7583360424841595e-05, + "loss": 0.7272, + "mean_token_accuracy": 0.7774313412606716, + "num_tokens": 352403713.0, "step": 370 }, { "epoch": 0.5329543435779002, - "grad_norm": 0.28928430258415916, - "learning_rate": 1.0544157480451586e-05, - "loss": 0.5402, - "mean_token_accuracy": 0.833558625727892, - "num_tokens": 414051639.0, + "grad_norm": 0.38584219141761933, + "learning_rate": 1.7512306731558133e-05, + "loss": 0.7194, + "mean_token_accuracy": 0.7801453106105327, + "num_tokens": 357150848.0, "step": 375 }, { "epoch": 0.5400604014922722, - "grad_norm": 0.26142431075193084, - "learning_rate": 1.033727290376522e-05, - "loss": 0.542, - "mean_token_accuracy": 0.8336416870355606, - "num_tokens": 419571564.0, + "grad_norm": 0.36346011493475233, + "learning_rate": 1.744039064669709e-05, + "loss": 0.7253, + "mean_token_accuracy": 0.7780666872859001, + "num_tokens": 361924581.0, "step": 380 }, { "epoch": 0.5471664594066442, - "grad_norm": 0.31125146975357, - "learning_rate": 1.013073987661834e-05, - "loss": 0.54, - "mean_token_accuracy": 0.8338716469705105, - "num_tokens": 425096738.0, + "grad_norm": 0.404515192018998, + "learning_rate": 1.7367621693738917e-05, + "loss": 0.715, + "mean_token_accuracy": 0.7817073427140713, + "num_tokens": 366676773.0, "step": 385 }, { "epoch": 0.5542725173210161, - "grad_norm": 0.2668746904615073, - "learning_rate": 9.924667956309862e-06, - "loss": 0.5312, - "mean_token_accuracy": 0.836452516913414, - "num_tokens": 430620174.0, + "grad_norm": 0.36988377045483584, + "learning_rate": 1.7294009509105052e-05, + "loss": 0.7131, + "mean_token_accuracy": 0.7806239545345306, + "num_tokens": 371452085.0, "step": 390 }, { "epoch": 0.5613785752353881, - "grad_norm": 0.40741674797587885, - "learning_rate": 9.719166455540437e-06, - "loss": 0.5374, - "mean_token_accuracy": 0.8344319149851799, - "num_tokens": 436140181.0, + "grad_norm": 0.36659851484970235, + "learning_rate": 1.7219563840881783e-05, + "loss": 0.7116, + "mean_token_accuracy": 0.782407358288765, + "num_tokens": 376207953.0, "step": 395 }, { "epoch": 0.5684846331497602, - "grad_norm": 0.27348406881195864, - "learning_rate": 9.51434438442655e-06, - "loss": 0.5348, - "mean_token_accuracy": 0.8360598988831043, - "num_tokens": 441662675.0, + "grad_norm": 0.3624183655015318, + "learning_rate": 1.71442945475294e-05, + "loss": 0.7169, + "mean_token_accuracy": 0.7801115453243256, + "num_tokens": 380979250.0, "step": 400 }, { "epoch": 0.5684846331497602, - "eval_loss": 0.5157341361045837, - "eval_mean_token_accuracy": 0.8361675456950539, - "eval_num_tokens": 441662675.0, - "eval_runtime": 161.4058, - "eval_samples_per_second": 22.546, - "eval_steps_per_second": 0.706, + "eval_loss": 0.6918764114379883, + "eval_mean_token_accuracy": 0.7797647902840062, + "eval_num_tokens": 380979250.0, + "eval_runtime": 150.2483, + "eval_samples_per_second": 24.22, + "eval_steps_per_second": 0.759, "step": 400 }, { "epoch": 0.5755906910641322, - "grad_norm": 0.27295040196213805, - "learning_rate": 9.310310392675132e-06, - "loss": 0.5232, - "mean_token_accuracy": 0.8382402293384075, - "num_tokens": 447154493.0, + "grad_norm": 0.3779480713941837, + "learning_rate": 1.7068211596576662e-05, + "loss": 0.716, + "mean_token_accuracy": 0.7807160533964634, + "num_tokens": 385752024.0, "step": 405 }, { "epoch": 0.5826967489785042, - "grad_norm": 0.26519161043459566, - "learning_rate": 9.107172711949324e-06, - "loss": 0.5382, - "mean_token_accuracy": 0.8342290692031383, - "num_tokens": 452684121.0, + "grad_norm": 0.3956915407192127, + "learning_rate": 1.699132506330086e-05, + "loss": 0.7168, + "mean_token_accuracy": 0.780977015197277, + "num_tokens": 390510208.0, "step": 410 }, { "epoch": 0.5898028068928762, - "grad_norm": 0.2629255148134165, - "learning_rate": 8.905039098456049e-06, - "loss": 0.53, - "mean_token_accuracy": 0.835949394851923, - "num_tokens": 458209573.0, + "grad_norm": 0.37756059351173565, + "learning_rate": 1.691364512939358e-05, + "loss": 0.7138, + "mean_token_accuracy": 0.7802788965404034, + "num_tokens": 395264854.0, "step": 415 }, { "epoch": 0.5969088648072481, - "grad_norm": 0.2762404944964492, - "learning_rate": 8.704016775785742e-06, - "loss": 0.5345, - "mean_token_accuracy": 0.8355151884257793, - "num_tokens": 463743007.0, + "grad_norm": 0.4087318897216221, + "learning_rate": 1.6835182081612426e-05, + "loss": 0.7136, + "mean_token_accuracy": 0.782038314640522, + "num_tokens": 400017717.0, "step": 420 }, { "epoch": 0.6040149227216202, - "grad_norm": 0.3862082589618725, - "learning_rate": 8.50421237803464e-06, - "loss": 0.529, - "mean_token_accuracy": 0.8362281493842602, - "num_tokens": 469259672.0, + "grad_norm": 0.40966025914497906, + "learning_rate": 1.6755946310418777e-05, + "loss": 0.7162, + "mean_token_accuracy": 0.7809364423155785, + "num_tokens": 404785855.0, "step": 425 }, { "epoch": 0.6111209806359922, - "grad_norm": 0.29912793555607287, - "learning_rate": 8.30573189323978e-06, - "loss": 0.523, - "mean_token_accuracy": 0.8389238156378269, - "num_tokens": 474761309.0, + "grad_norm": 0.34380470044932104, + "learning_rate": 1.6675948308601826e-05, + "loss": 0.7088, + "mean_token_accuracy": 0.7824217259883881, + "num_tokens": 409545265.0, "step": 430 }, { "epoch": 0.6182270385503642, - "grad_norm": 0.28697574980998686, - "learning_rate": 8.108680607156669e-06, - "loss": 0.5373, - "mean_token_accuracy": 0.8346886426210404, - "num_tokens": 480283239.0, + "grad_norm": 0.3999223329715891, + "learning_rate": 1.6595198669889086e-05, + "loss": 0.7178, + "mean_token_accuracy": 0.7794642865657806, + "num_tokens": 414313757.0, "step": 435 }, { "epoch": 0.6253330964647362, - "grad_norm": 0.2981386592415112, - "learning_rate": 7.913163047409533e-06, - "loss": 0.5299, - "mean_token_accuracy": 0.8359711997210979, - "num_tokens": 485803053.0, + "grad_norm": 0.4093202361120024, + "learning_rate": 1.6513708087543507e-05, + "loss": 0.7112, + "mean_token_accuracy": 0.7812661081552505, + "num_tokens": 419067741.0, "step": 440 }, { "epoch": 0.6324391543791081, - "grad_norm": 0.2811128507955913, - "learning_rate": 7.719282928043688e-06, - "loss": 0.5312, - "mean_token_accuracy": 0.8358238264918327, - "num_tokens": 491312112.0, + "grad_norm": 0.3927913861855057, + "learning_rate": 1.643148735294744e-05, + "loss": 0.7085, + "mean_token_accuracy": 0.7821477875113487, + "num_tokens": 423849699.0, "step": 445 }, { "epoch": 0.6395452122934802, - "grad_norm": 0.2538506418412949, - "learning_rate": 7.527143094509492e-06, - "loss": 0.5292, - "mean_token_accuracy": 0.8370420016348362, - "num_tokens": 496828932.0, + "grad_norm": 0.38988729195797683, + "learning_rate": 1.634854735417356e-05, + "loss": 0.7184, + "mean_token_accuracy": 0.7806262195110321, + "num_tokens": 428613216.0, "step": 450 }, { "epoch": 0.6466512702078522, - "grad_norm": 0.26664943478343317, - "learning_rate": 7.336845469107061e-06, - "loss": 0.5295, - "mean_token_accuracy": 0.8362418174743652, - "num_tokens": 502329483.0, + "grad_norm": 0.35125767095510474, + "learning_rate": 1.6264899074543038e-05, + "loss": 0.7244, + "mean_token_accuracy": 0.7782423093914985, + "num_tokens": 433373732.0, "step": 455 }, { "epoch": 0.6537573281222242, - "grad_norm": 0.2822285091829132, - "learning_rate": 7.148490996920661e-06, - "loss": 0.5315, - "mean_token_accuracy": 0.8359686724841595, - "num_tokens": 507853922.0, + "grad_norm": 0.3717296312246723, + "learning_rate": 1.6180553591171064e-05, + "loss": 0.7134, + "mean_token_accuracy": 0.7801944658160209, + "num_tokens": 438144634.0, "step": 460 }, { "epoch": 0.6608633860365962, - "grad_norm": 0.26891344059858413, - "learning_rate": 6.9621795922714805e-06, - "loss": 0.5283, - "mean_token_accuracy": 0.837210227549076, - "num_tokens": 513366033.0, + "grad_norm": 0.3514665073580472, + "learning_rate": 1.6095522073499968e-05, + "loss": 0.7094, + "mean_token_accuracy": 0.782074099034071, + "num_tokens": 442899589.0, "step": 465 }, { "epoch": 0.6679694439509682, - "grad_norm": 0.27887938299077314, - "learning_rate": 6.778010085717202e-06, - "loss": 0.5275, - "mean_token_accuracy": 0.8373380437493324, - "num_tokens": 518888798.0, + "grad_norm": 0.3635349909050601, + "learning_rate": 1.600981578182011e-05, + "loss": 0.7125, + "mean_token_accuracy": 0.7808018557727336, + "num_tokens": 447672633.0, "step": 470 }, { "epoch": 0.6750755018653402, - "grad_norm": 0.25953435317326623, - "learning_rate": 6.596080171626409e-06, - "loss": 0.5307, - "mean_token_accuracy": 0.8357150256633759, - "num_tokens": 524392285.0, + "grad_norm": 0.3548670898985585, + "learning_rate": 1.5923446065778715e-05, + "loss": 0.7162, + "mean_token_accuracy": 0.7795430406928062, + "num_tokens": 452431439.0, "step": 475 }, { "epoch": 0.6821815597797122, - "grad_norm": 0.2665111343594617, - "learning_rate": 6.416486356355769e-06, - "loss": 0.5365, - "mean_token_accuracy": 0.8344089902937413, - "num_tokens": 529904641.0, + "grad_norm": 0.34932388566686257, + "learning_rate": 1.5836424362876933e-05, + "loss": 0.6984, + "mean_token_accuracy": 0.7855889156460762, + "num_tokens": 457177703.0, "step": 480 }, { "epoch": 0.6892876176940842, - "grad_norm": 0.26605568148767894, - "learning_rate": 6.239323907057342e-06, - "loss": 0.5334, - "mean_token_accuracy": 0.8357222154736519, - "num_tokens": 535417362.0, + "grad_norm": 0.3981248379617205, + "learning_rate": 1.5748762196955198e-05, + "loss": 0.7036, + "mean_token_accuracy": 0.7827964283525943, + "num_tokens": 461930774.0, "step": 485 }, { "epoch": 0.6963936756084562, - "grad_norm": 0.2636227512736411, - "learning_rate": 6.064686801143271e-06, - "loss": 0.5166, - "mean_token_accuracy": 0.8401257589459419, - "num_tokens": 540908836.0, + "grad_norm": 0.3393463040601756, + "learning_rate": 1.5660471176667194e-05, + "loss": 0.7092, + "mean_token_accuracy": 0.7816402152180671, + "num_tokens": 466702045.0, "step": 490 }, { "epoch": 0.7034997335228282, - "grad_norm": 0.26826679232229517, - "learning_rate": 5.892667676434633e-06, - "loss": 0.5238, - "mean_token_accuracy": 0.8382793001830577, - "num_tokens": 546444796.0, + "grad_norm": 0.3747245118209944, + "learning_rate": 1.5571562993942594e-05, + "loss": 0.7063, + "mean_token_accuracy": 0.7829745762050152, + "num_tokens": 471461872.0, "step": 495 }, { "epoch": 0.7106057914372003, - "grad_norm": 0.3599829471194239, - "learning_rate": 5.723357782020867e-06, - "loss": 0.5225, - "mean_token_accuracy": 0.838120236247778, - "num_tokens": 551952947.0, + "grad_norm": 0.356673539185162, + "learning_rate": 1.5482049422438732e-05, + "loss": 0.7052, + "mean_token_accuracy": 0.7823217682540416, + "num_tokens": 476233238.0, "step": 500 }, { "epoch": 0.7106057914372003, - "eval_loss": 0.5095566511154175, - "eval_mean_token_accuracy": 0.8375082057819032, - "eval_num_tokens": 551952947.0, - "eval_runtime": 161.2359, - "eval_samples_per_second": 22.569, - "eval_steps_per_second": 0.707, + "eval_loss": 0.6809196472167969, + "eval_mean_token_accuracy": 0.7825243828589457, + "eval_num_tokens": 476233238.0, + "eval_runtime": 150.1867, + "eval_samples_per_second": 24.23, + "eval_steps_per_second": 0.759, "step": 500 }, { "epoch": 0.7177118493515722, - "grad_norm": 0.26704698034936375, - "learning_rate": 5.556846929855857e-06, - "loss": 0.5203, - "mean_token_accuracy": 0.8389924250543117, - "num_tokens": 557467246.0, + "grad_norm": 0.342268344236882, + "learning_rate": 1.5391942315981506e-05, + "loss": 0.7124, + "mean_token_accuracy": 0.7804363466799259, + "num_tokens": 481010410.0, "step": 505 }, { "epoch": 0.7248179072659442, - "grad_norm": 0.26539658794683185, - "learning_rate": 5.393223447116409e-06, - "loss": 0.5344, - "mean_token_accuracy": 0.8358133606612682, - "num_tokens": 562988077.0, + "grad_norm": 0.41188433212292186, + "learning_rate": 1.530125360699561e-05, + "loss": 0.7089, + "mean_token_accuracy": 0.7815835013985634, + "num_tokens": 485757825.0, "step": 510 }, { "epoch": 0.7319239651803162, - "grad_norm": 0.24829603150024088, - "learning_rate": 5.232574129348278e-06, - "loss": 0.5235, - "mean_token_accuracy": 0.8386510953307151, - "num_tokens": 568506583.0, + "grad_norm": 0.456891730601901, + "learning_rate": 1.520999530492441e-05, + "loss": 0.7022, + "mean_token_accuracy": 0.7851340644061565, + "num_tokens": 490512360.0, "step": 515 }, { "epoch": 0.7390300230946882, - "grad_norm": 0.2605719227696348, - "learning_rate": 5.0749841944247e-06, - "loss": 0.5338, - "mean_token_accuracy": 0.83480354398489, - "num_tokens": 574039672.0, + "grad_norm": 0.347445377971525, + "learning_rate": 1.511817949463956e-05, + "loss": 0.7066, + "mean_token_accuracy": 0.7829876273870469, + "num_tokens": 495265025.0, "step": 520 }, { "epoch": 0.7461360810090603, - "grad_norm": 0.26778891488350126, - "learning_rate": 4.92053723734182e-06, - "loss": 0.5314, - "mean_token_accuracy": 0.8356986902654171, - "num_tokens": 579544417.0, + "grad_norm": 0.3649440044406707, + "learning_rate": 1.5025818334840695e-05, + "loss": 0.7057, + "mean_token_accuracy": 0.7825053557753563, + "num_tokens": 500030371.0, "step": 525 }, { "epoch": 0.7532421389234323, - "grad_norm": 0.42601785989143315, - "learning_rate": 4.769315185874951e-06, - "loss": 0.5269, - "mean_token_accuracy": 0.8374913208186626, - "num_tokens": 585066703.0, + "grad_norm": 0.3848276798074321, + "learning_rate": 1.493292405644531e-05, + "loss": 0.6916, + "mean_token_accuracy": 0.7862150557339191, + "num_tokens": 504787581.0, "step": 530 }, { "epoch": 0.7603481968378042, - "grad_norm": 0.2541554106960908, - "learning_rate": 4.621398257119266e-06, - "loss": 0.5262, - "mean_token_accuracy": 0.837667242437601, - "num_tokens": 590596017.0, + "grad_norm": 0.3545112606618106, + "learning_rate": 1.4839508960969071e-05, + "loss": 0.7041, + "mean_token_accuracy": 0.7828620508313179, + "num_tokens": 509570758.0, "step": 535 }, { "epoch": 0.7674542547521762, - "grad_norm": 0.2695567911282263, - "learning_rate": 4.476864914937923e-06, - "loss": 0.5195, - "mean_token_accuracy": 0.8392265714704991, - "num_tokens": 596113106.0, + "grad_norm": 0.37573645816136086, + "learning_rate": 1.4745585418896799e-05, + "loss": 0.7022, + "mean_token_accuracy": 0.7837928868830204, + "num_tokens": 514321600.0, "step": 540 }, { "epoch": 0.7745603126665482, - "grad_norm": 0.25052432871031344, - "learning_rate": 4.335791828340183e-06, - "loss": 0.5296, - "mean_token_accuracy": 0.837313498556614, - "num_tokens": 601659151.0, + "grad_norm": 0.367683939418967, + "learning_rate": 1.4651165868044301e-05, + "loss": 0.6995, + "mean_token_accuracy": 0.7847208097577095, + "num_tokens": 519082348.0, "step": 545 }, { "epoch": 0.7816663705809203, - "grad_norm": 0.26020461836934194, - "learning_rate": 4.1982538308116775e-06, - "loss": 0.5251, - "mean_token_accuracy": 0.8361369468271732, - "num_tokens": 607175134.0, + "grad_norm": 0.37333173963401106, + "learning_rate": 1.45562628119113e-05, + "loss": 0.7008, + "mean_token_accuracy": 0.7843301363289357, + "num_tokens": 523847671.0, "step": 550 }, { "epoch": 0.7887724284952923, - "grad_norm": 0.24684211829794916, - "learning_rate": 4.064323880618279e-06, - "loss": 0.5274, - "mean_token_accuracy": 0.8380102440714836, - "num_tokens": 612699036.0, + "grad_norm": 0.35304534000101323, + "learning_rate": 1.446088881802566e-05, + "loss": 0.7113, + "mean_token_accuracy": 0.780696228891611, + "num_tokens": 528620044.0, "step": 555 }, { "epoch": 0.7958784864096642, - "grad_norm": 0.2745271550137196, - "learning_rate": 3.934073022104759e-06, - "loss": 0.5233, - "mean_token_accuracy": 0.8380540162324905, - "num_tokens": 618232472.0, + "grad_norm": 0.37158140908427656, + "learning_rate": 1.4365056516279126e-05, + "loss": 0.7016, + "mean_token_accuracy": 0.7839049801230431, + "num_tokens": 533367563.0, "step": 560 }, { "epoch": 0.8029845443240362, - "grad_norm": 0.2695298031436579, - "learning_rate": 3.807570348008672e-06, - "loss": 0.5243, - "mean_token_accuracy": 0.8377192810177803, - "num_tokens": 623766598.0, + "grad_norm": 0.3666636970376427, + "learning_rate": 1.426877859725482e-05, + "loss": 0.7013, + "mean_token_accuracy": 0.7832373000681401, + "num_tokens": 538117561.0, "step": 565 }, { "epoch": 0.8100906022384082, - "grad_norm": 0.2646862071287686, - "learning_rate": 3.684882962809484e-06, - "loss": 0.5304, - "mean_token_accuracy": 0.8363631062209607, - "num_tokens": 629297697.0, + "grad_norm": 0.3480002426515768, + "learning_rate": 1.4172067810546689e-05, + "loss": 0.7024, + "mean_token_accuracy": 0.7843490958213806, + "num_tokens": 542889289.0, "step": 570 }, { "epoch": 0.8171966601527803, - "grad_norm": 0.23795317969691135, - "learning_rate": 3.5660759471324037e-06, - "loss": 0.5287, - "mean_token_accuracy": 0.836943382024765, - "num_tokens": 634808630.0, + "grad_norm": 0.34159899298495605, + "learning_rate": 1.4074936963071135e-05, + "loss": 0.7034, + "mean_token_accuracy": 0.7836663112044334, + "num_tokens": 547637481.0, "step": 575 }, { "epoch": 0.8243027180671523, - "grad_norm": 0.25754712205145064, - "learning_rate": 3.451212323225786e-06, - "loss": 0.52, - "mean_token_accuracy": 0.8387916676700116, - "num_tokens": 640315396.0, + "grad_norm": 0.346312037596486, + "learning_rate": 1.3977398917371074e-05, + "loss": 0.6952, + "mean_token_accuracy": 0.7860016152262688, + "num_tokens": 552402659.0, "step": 580 }, { "epoch": 0.8314087759815243, - "grad_norm": 0.24240514105833283, - "learning_rate": 3.340353021530409e-06, - "loss": 0.5214, - "mean_token_accuracy": 0.8374654315412045, - "num_tokens": 645841425.0, + "grad_norm": 0.4021455809002062, + "learning_rate": 1.3879466589912598e-05, + "loss": 0.6938, + "mean_token_accuracy": 0.785366540402174, + "num_tokens": 557156063.0, "step": 585 }, { "epoch": 0.8385148338958962, - "grad_norm": 0.26461850588888575, - "learning_rate": 3.2335568483583708e-06, - "loss": 0.516, - "mean_token_accuracy": 0.8414874106645585, - "num_tokens": 651349089.0, + "grad_norm": 0.35612007953198216, + "learning_rate": 1.3781152949374527e-05, + "loss": 0.7012, + "mean_token_accuracy": 0.7838830970227718, + "num_tokens": 561916767.0, "step": 590 }, { "epoch": 0.8456208918102682, - "grad_norm": 0.25434414665474275, - "learning_rate": 3.1308804546987615e-06, - "loss": 0.5305, - "mean_token_accuracy": 0.8366403698921203, - "num_tokens": 656881938.0, + "grad_norm": 0.3655479416355732, + "learning_rate": 1.3682471014931031e-05, + "loss": 0.7019, + "mean_token_accuracy": 0.7831911854445934, + "num_tokens": 566684863.0, "step": 595 }, { "epoch": 0.8527269497246402, - "grad_norm": 0.25710193414257565, - "learning_rate": 3.0323783061666307e-06, - "loss": 0.5218, - "mean_token_accuracy": 0.8387804657220841, - "num_tokens": 662399294.0, + "grad_norm": 0.35487507828239484, + "learning_rate": 1.3583433854527557e-05, + "loss": 0.6967, + "mean_token_accuracy": 0.7847252510488033, + "num_tokens": 571452634.0, "step": 600 }, { "epoch": 0.8527269497246402, - "eval_loss": 0.5056362748146057, - "eval_mean_token_accuracy": 0.8385671649062842, - "eval_num_tokens": 662399294.0, - "eval_runtime": 161.5872, - "eval_samples_per_second": 22.52, - "eval_steps_per_second": 0.706, + "eval_loss": 0.672866702079773, + "eval_mean_token_accuracy": 0.784141309951481, + "eval_num_tokens": 571452634.0, + "eval_runtime": 149.8383, + "eval_samples_per_second": 24.286, + "eval_steps_per_second": 0.761, "step": 600 }, { "epoch": 0.8598330076390123, - "grad_norm": 0.24729771911584983, - "learning_rate": 2.9381026541112145e-06, - "loss": 0.5237, - "mean_token_accuracy": 0.8380155004560947, - "num_tokens": 667931430.0, + "grad_norm": 0.3456230748389429, + "learning_rate": 1.3484054583150315e-05, + "loss": 0.6906, + "mean_token_accuracy": 0.7867132879793644, + "num_tokens": 576198167.0, "step": 605 }, { "epoch": 0.8669390655533843, - "grad_norm": 0.25660228503672733, - "learning_rate": 2.848103507898745e-06, - "loss": 0.5269, - "mean_token_accuracy": 0.8368470750749111, - "num_tokens": 673461286.0, + "grad_norm": 0.4168654830920555, + "learning_rate": 1.3384346361089535e-05, + "loss": 0.6885, + "mean_token_accuracy": 0.7866604030132294, + "num_tokens": 580952899.0, "step": 610 }, { "epoch": 0.8740451234677563, - "grad_norm": 0.2856740804187853, - "learning_rate": 2.7624286083845187e-06, - "loss": 0.5218, - "mean_token_accuracy": 0.8375120624899864, - "num_tokens": 678984176.0, + "grad_norm": 0.3723864301027309, + "learning_rate": 1.3284322392196703e-05, + "loss": 0.6943, + "mean_token_accuracy": 0.7859079904854298, + "num_tokens": 585731060.0, "step": 615 }, { "epoch": 0.8811511813821282, - "grad_norm": 0.2678064141268326, - "learning_rate": 2.6811234025883457e-06, - "loss": 0.5172, - "mean_token_accuracy": 0.839438085258007, - "num_tokens": 684519341.0, + "grad_norm": 0.3470961871922244, + "learning_rate": 1.3183995922136048e-05, + "loss": 0.712, + "mean_token_accuracy": 0.7812197484076023, + "num_tokens": 590504105.0, "step": 620 }, { "epoch": 0.8882572392965002, - "grad_norm": 0.26981981962318713, - "learning_rate": 2.604231019586761e-06, - "loss": 0.5207, - "mean_token_accuracy": 0.839027612656355, - "num_tokens": 690037943.0, + "grad_norm": 0.36809607687600115, + "learning_rate": 1.308338023663049e-05, + "loss": 0.7012, + "mean_token_accuracy": 0.7837964847683907, + "num_tokens": 595263948.0, "step": 625 }, { "epoch": 0.8953632972108723, - "grad_norm": 0.2653800927885472, - "learning_rate": 2.5317922476348194e-06, - "loss": 0.523, - "mean_token_accuracy": 0.8382897555828095, - "num_tokens": 695563755.0, + "grad_norm": 0.40488523074148874, + "learning_rate": 1.2982488659702269e-05, + "loss": 0.696, + "mean_token_accuracy": 0.7849378556013107, + "num_tokens": 600009699.0, "step": 630 }, { "epoch": 0.9024693551252443, - "grad_norm": 0.2909130920485432, - "learning_rate": 2.4638455125296043e-06, - "loss": 0.5246, - "mean_token_accuracy": 0.837975486367941, - "num_tokens": 701088593.0, + "grad_norm": 0.36084599960401653, + "learning_rate": 1.2881334551908524e-05, + "loss": 0.6932, + "mean_token_accuracy": 0.785707937926054, + "num_tokens": 604750827.0, "step": 635 }, { "epoch": 0.9095754130396163, - "grad_norm": 0.2538310488730525, - "learning_rate": 2.400426857226914e-06, - "loss": 0.5181, - "mean_token_accuracy": 0.8393405571579933, - "num_tokens": 706621161.0, + "grad_norm": 0.47712658403892855, + "learning_rate": 1.2779931308572022e-05, + "loss": 0.6932, + "mean_token_accuracy": 0.7863863408565521, + "num_tokens": 609500130.0, "step": 640 }, { "epoch": 0.9166814709539882, - "grad_norm": 0.2569434114852769, - "learning_rate": 2.3415699227219517e-06, - "loss": 0.5295, - "mean_token_accuracy": 0.8363286212086678, - "num_tokens": 712135507.0, + "grad_norm": 0.3494828834988836, + "learning_rate": 1.2678292358007274e-05, + "loss": 0.6859, + "mean_token_accuracy": 0.7878520257771016, + "num_tokens": 614261653.0, "step": 645 }, { "epoch": 0.9237875288683602, - "grad_norm": 0.26172097069920536, - "learning_rate": 2.2873059302041627e-06, - "loss": 0.52, - "mean_token_accuracy": 0.8396071724593639, - "num_tokens": 717654276.0, + "grad_norm": 0.38391809182329456, + "learning_rate": 1.2576431159742298e-05, + "loss": 0.7083, + "mean_token_accuracy": 0.7823263764381408, + "num_tokens": 619054673.0, "step": 650 }, { "epoch": 0.9308935867827323, - "grad_norm": 0.26549742257601705, - "learning_rate": 2.2376636644956656e-06, - "loss": 0.5175, - "mean_token_accuracy": 0.8391963638365268, - "num_tokens": 723171964.0, + "grad_norm": 0.341786835416634, + "learning_rate": 1.247436120273624e-05, + "loss": 0.7049, + "mean_token_accuracy": 0.7822027482092381, + "num_tokens": 623817276.0, "step": 655 }, { "epoch": 0.9379996446971043, - "grad_norm": 0.2556259295868791, - "learning_rate": 2.192669458782096e-06, - "loss": 0.5255, - "mean_token_accuracy": 0.8376853354275227, - "num_tokens": 728678520.0, + "grad_norm": 0.43974221008421865, + "learning_rate": 1.237209600359311e-05, + "loss": 0.6935, + "mean_token_accuracy": 0.7854240909218788, + "num_tokens": 628574712.0, "step": 660 }, { "epoch": 0.9451057026114763, - "grad_norm": 0.2511199627055941, - "learning_rate": 2.1523471806439205e-06, - "loss": 0.5344, - "mean_token_accuracy": 0.8351106189191342, - "num_tokens": 734210332.0, + "grad_norm": 0.4040988616310412, + "learning_rate": 1.226964910477183e-05, + "loss": 0.6898, + "mean_token_accuracy": 0.7870114140212536, + "num_tokens": 633335145.0, "step": 665 }, { "epoch": 0.9522117605258483, - "grad_norm": 0.2744720323508251, - "learning_rate": 2.1167182193956738e-06, - "loss": 0.5156, - "mean_token_accuracy": 0.8407034426927567, - "num_tokens": 739702586.0, + "grad_norm": 0.38559148131636184, + "learning_rate": 1.2167034072792887e-05, + "loss": 0.6937, + "mean_token_accuracy": 0.7853186056017876, + "num_tokens": 638086757.0, "step": 670 }, { "epoch": 0.9593178184402202, - "grad_norm": 0.2996101632719976, - "learning_rate": 2.0858014747397952e-06, - "loss": 0.5245, - "mean_token_accuracy": 0.8386490792036057, - "num_tokens": 745219812.0, + "grad_norm": 0.35182778835643425, + "learning_rate": 1.2064264496441786e-05, + "loss": 0.6893, + "mean_token_accuracy": 0.7859195664525032, + "num_tokens": 642864800.0, "step": 675 }, { "epoch": 0.9664238763545923, - "grad_norm": 0.2615861429363619, - "learning_rate": 2.0596133467411213e-06, - "loss": 0.5175, - "mean_token_accuracy": 0.8394487954676151, - "num_tokens": 750717984.0, + "grad_norm": 0.37801611596609275, + "learning_rate": 1.1961353984969557e-05, + "loss": 0.689, + "mean_token_accuracy": 0.7867573000490665, + "num_tokens": 647632233.0, "step": 680 }, { "epoch": 0.9735299342689643, - "grad_norm": 0.2670223147061545, - "learning_rate": 2.0381677271273177e-06, - "loss": 0.5209, - "mean_token_accuracy": 0.8379806369543076, - "num_tokens": 756252357.0, + "grad_norm": 0.3676417856657156, + "learning_rate": 1.1858316166290542e-05, + "loss": 0.6933, + "mean_token_accuracy": 0.7860686622560025, + "num_tokens": 652408527.0, "step": 685 }, { "epoch": 0.9806359921833363, - "grad_norm": 0.260083835300812, - "learning_rate": 2.0214759919198904e-06, - "loss": 0.5149, - "mean_token_accuracy": 0.839554088562727, - "num_tokens": 761750059.0, + "grad_norm": 0.32727128259002763, + "learning_rate": 1.1755164685177733e-05, + "loss": 0.6909, + "mean_token_accuracy": 0.7855750493705272, + "num_tokens": 657175363.0, "step": 690 }, { "epoch": 0.9877420500977083, - "grad_norm": 0.3230971032122484, - "learning_rate": 2.0095469953996724e-06, - "loss": 0.5244, - "mean_token_accuracy": 0.8375246554613114, - "num_tokens": 767275567.0, + "grad_norm": 0.346480246739393, + "learning_rate": 1.1651913201455865e-05, + "loss": 0.6901, + "mean_token_accuracy": 0.78620011433959, + "num_tokens": 661940460.0, "step": 695 }, { "epoch": 0.9948481080120803, - "grad_norm": 0.26793724105887723, - "learning_rate": 2.002387065409989e-06, - "loss": 0.527, - "mean_token_accuracy": 0.8372870542109012, - "num_tokens": 772815373.0, + "grad_norm": 0.3382141590702366, + "learning_rate": 1.154857538819249e-05, + "loss": 0.6935, + "mean_token_accuracy": 0.7859153963625432, + "num_tokens": 666706092.0, "step": 700 }, { "epoch": 0.9948481080120803, - "eval_loss": 0.5037879943847656, - "eval_mean_token_accuracy": 0.8392795601434875, - "eval_num_tokens": 772815373.0, - "eval_runtime": 162.0902, - "eval_samples_per_second": 22.45, - "eval_steps_per_second": 0.703, + "eval_loss": 0.6661998629570007, + "eval_mean_token_accuracy": 0.7860168757145864, + "eval_num_tokens": 666706092.0, + "eval_runtime": 150.5473, + "eval_samples_per_second": 24.172, + "eval_steps_per_second": 0.757, "step": 700 }, { - "epoch": 1.0, - "mean_token_accuracy": 0.8380202034424091, - "num_tokens": 776817701.0, - "step": 704, - "total_flos": 6035679775555584.0, - "train_loss": 0.6108774590221319, - "train_runtime": 28363.9285, - "train_samples_per_second": 12.701, - "train_steps_per_second": 0.025 + "epoch": 1.0014212115828744, + "grad_norm": 1.0385755552019693, + "learning_rate": 1.144516492988736e-05, + "loss": 0.6823, + "mean_token_accuracy": 0.7878129852784647, + "num_tokens": 671095094.0, + "step": 705 + }, + { + "epoch": 1.0085272694972465, + "grad_norm": 0.34901946667846845, + "learning_rate": 1.134169552066023e-05, + "loss": 0.6613, + "mean_token_accuracy": 0.7918078258633614, + "num_tokens": 675850987.0, + "step": 710 + }, + { + "epoch": 1.0156333274116185, + "grad_norm": 0.35371147066198844, + "learning_rate": 1.1238180862437431e-05, + "loss": 0.6616, + "mean_token_accuracy": 0.7930883727967739, + "num_tokens": 680610147.0, + "step": 715 + }, + { + "epoch": 1.0227393853259905, + "grad_norm": 0.371067048471263, + "learning_rate": 1.1134634663137373e-05, + "loss": 0.6552, + "mean_token_accuracy": 0.7940364375710487, + "num_tokens": 685353908.0, + "step": 720 + }, + { + "epoch": 1.0298454432403623, + "grad_norm": 0.3481611518266803, + "learning_rate": 1.1031070634855314e-05, + "loss": 0.6593, + "mean_token_accuracy": 0.7930267058312893, + "num_tokens": 690111045.0, + "step": 725 + }, + { + "epoch": 1.0369515011547343, + "grad_norm": 0.35013978798256634, + "learning_rate": 1.0927502492047492e-05, + "loss": 0.6673, + "mean_token_accuracy": 0.7919997818768024, + "num_tokens": 694881554.0, + "step": 730 + }, + { + "epoch": 1.0440575590691064, + "grad_norm": 0.33993734586176455, + "learning_rate": 1.0823943949715022e-05, + "loss": 0.67, + "mean_token_accuracy": 0.7910104177892208, + "num_tokens": 699670214.0, + "step": 735 + }, + { + "epoch": 1.0511636169834784, + "grad_norm": 0.3412860380013659, + "learning_rate": 1.0720408721587671e-05, + "loss": 0.6715, + "mean_token_accuracy": 0.7910432547330857, + "num_tokens": 704426344.0, + "step": 740 + }, + { + "epoch": 1.0582696748978504, + "grad_norm": 0.3627320420498548, + "learning_rate": 1.061691051830783e-05, + "loss": 0.668, + "mean_token_accuracy": 0.7916376106441021, + "num_tokens": 709184272.0, + "step": 745 + }, + { + "epoch": 1.0653757328122224, + "grad_norm": 0.3551359244101548, + "learning_rate": 1.0513463045614873e-05, + "loss": 0.6732, + "mean_token_accuracy": 0.7899613387882709, + "num_tokens": 713964117.0, + "step": 750 + }, + { + "epoch": 1.0724817907265944, + "grad_norm": 0.35291410759778385, + "learning_rate": 1.0410080002530188e-05, + "loss": 0.6653, + "mean_token_accuracy": 0.7933160819113254, + "num_tokens": 718714498.0, + "step": 755 + }, + { + "epoch": 1.0795878486409665, + "grad_norm": 0.5183974245642026, + "learning_rate": 1.030677507954307e-05, + "loss": 0.669, + "mean_token_accuracy": 0.7922067753970623, + "num_tokens": 723480315.0, + "step": 760 + }, + { + "epoch": 1.0866939065553385, + "grad_norm": 0.36178536189786653, + "learning_rate": 1.0203561956797777e-05, + "loss": 0.6592, + "mean_token_accuracy": 0.7924857877194882, + "num_tokens": 728257943.0, + "step": 765 + }, + { + "epoch": 1.0937999644697105, + "grad_norm": 0.350161084492629, + "learning_rate": 1.0100454302281917e-05, + "loss": 0.6708, + "mean_token_accuracy": 0.7903590828180314, + "num_tokens": 733027792.0, + "step": 770 + }, + { + "epoch": 1.1009060223840825, + "grad_norm": 0.3924504778281434, + "learning_rate": 9.997465770016488e-06, + "loss": 0.665, + "mean_token_accuracy": 0.7942788422107696, + "num_tokens": 737777094.0, + "step": 775 + }, + { + "epoch": 1.1080120802984545, + "grad_norm": 0.3784959263919085, + "learning_rate": 9.894609998247735e-06, + "loss": 0.667, + "mean_token_accuracy": 0.7912828728556633, + "num_tokens": 742543159.0, + "step": 780 + }, + { + "epoch": 1.1151181382128263, + "grad_norm": 0.3498944950913003, + "learning_rate": 9.791900607641104e-06, + "loss": 0.6635, + "mean_token_accuracy": 0.7930950812995434, + "num_tokens": 747308343.0, + "step": 785 + }, + { + "epoch": 1.1222241961271984, + "grad_norm": 0.3820207111204085, + "learning_rate": 9.68935119947753e-06, + "loss": 0.668, + "mean_token_accuracy": 0.7914499528706074, + "num_tokens": 752079771.0, + "step": 790 + }, + { + "epoch": 1.1293302540415704, + "grad_norm": 0.37200401458209914, + "learning_rate": 9.586975353852284e-06, + "loss": 0.6639, + "mean_token_accuracy": 0.7913541235029697, + "num_tokens": 756847538.0, + "step": 795 + }, + { + "epoch": 1.1364363119559424, + "grad_norm": 0.3953405586571805, + "learning_rate": 9.484786627876655e-06, + "loss": 0.6697, + "mean_token_accuracy": 0.790704844892025, + "num_tokens": 761615758.0, + "step": 800 + }, + { + "epoch": 1.1364363119559424, + "eval_loss": 0.6620959043502808, + "eval_mean_token_accuracy": 0.7867940282612517, + "eval_num_tokens": 761615758.0, + "eval_runtime": 149.9826, + "eval_samples_per_second": 24.263, + "eval_steps_per_second": 0.76, + "step": 800 + }, + { + "epoch": 1.1435423698703144, + "grad_norm": 0.3501237218712183, + "learning_rate": 9.382798553882605e-06, + "loss": 0.6685, + "mean_token_accuracy": 0.789706601947546, + "num_tokens": 766396725.0, + "step": 805 + }, + { + "epoch": 1.1506484277846865, + "grad_norm": 0.35257144104430527, + "learning_rate": 9.281024637630794e-06, + "loss": 0.656, + "mean_token_accuracy": 0.7935691051185131, + "num_tokens": 771153408.0, + "step": 810 + }, + { + "epoch": 1.1577544856990585, + "grad_norm": 0.3686235797828206, + "learning_rate": 9.179478356522055e-06, + "loss": 0.6617, + "mean_token_accuracy": 0.7928701542317868, + "num_tokens": 775910085.0, + "step": 815 + }, + { + "epoch": 1.1648605436134305, + "grad_norm": 0.35944613382478946, + "learning_rate": 9.078173157812669e-06, + "loss": 0.6673, + "mean_token_accuracy": 0.7925907090306282, + "num_tokens": 780683650.0, + "step": 820 + }, + { + "epoch": 1.1719666015278025, + "grad_norm": 0.33694316783063644, + "learning_rate": 8.97712245683359e-06, + "loss": 0.6686, + "mean_token_accuracy": 0.7904776819050312, + "num_tokens": 785473480.0, + "step": 825 + }, + { + "epoch": 1.1790726594421745, + "grad_norm": 0.37743982477075316, + "learning_rate": 8.876339635213951e-06, + "loss": 0.6672, + "mean_token_accuracy": 0.7913396395742893, + "num_tokens": 790244466.0, + "step": 830 + }, + { + "epoch": 1.1861787173565466, + "grad_norm": 0.3852825297524554, + "learning_rate": 8.775838039108975e-06, + "loss": 0.6577, + "mean_token_accuracy": 0.7940163776278496, + "num_tokens": 794986608.0, + "step": 835 + }, + { + "epoch": 1.1932847752709184, + "grad_norm": 0.3584263516575822, + "learning_rate": 8.67563097743263e-06, + "loss": 0.6589, + "mean_token_accuracy": 0.7941929534077644, + "num_tokens": 799743654.0, + "step": 840 + }, + { + "epoch": 1.2003908331852904, + "grad_norm": 0.36222679284602094, + "learning_rate": 8.575731720095194e-06, + "loss": 0.6558, + "mean_token_accuracy": 0.7949050404131413, + "num_tokens": 804510301.0, + "step": 845 + }, + { + "epoch": 1.2074968910996624, + "grad_norm": 0.36601316470081835, + "learning_rate": 8.476153496245978e-06, + "loss": 0.6765, + "mean_token_accuracy": 0.7888801738619804, + "num_tokens": 809295294.0, + "step": 850 + }, + { + "epoch": 1.2146029490140344, + "grad_norm": 0.35840290876863273, + "learning_rate": 8.376909492521465e-06, + "loss": 0.6651, + "mean_token_accuracy": 0.7920581080019474, + "num_tokens": 814063402.0, + "step": 855 + }, + { + "epoch": 1.2217090069284064, + "grad_norm": 0.35918850454361245, + "learning_rate": 8.278012851299082e-06, + "loss": 0.6604, + "mean_token_accuracy": 0.793212516605854, + "num_tokens": 818822580.0, + "step": 860 + }, + { + "epoch": 1.2288150648427785, + "grad_norm": 0.36931586095521574, + "learning_rate": 8.179476668956799e-06, + "loss": 0.6697, + "mean_token_accuracy": 0.789932218939066, + "num_tokens": 823577622.0, + "step": 865 + }, + { + "epoch": 1.2359211227571505, + "grad_norm": 0.3618218146346168, + "learning_rate": 8.081313994138857e-06, + "loss": 0.6573, + "mean_token_accuracy": 0.7943486146628856, + "num_tokens": 828319732.0, + "step": 870 + }, + { + "epoch": 1.2430271806715225, + "grad_norm": 0.3947688956702076, + "learning_rate": 7.983537826027808e-06, + "loss": 0.6677, + "mean_token_accuracy": 0.7911505416035652, + "num_tokens": 833074724.0, + "step": 875 + }, + { + "epoch": 1.2501332385858945, + "grad_norm": 0.49837585254908734, + "learning_rate": 7.886161112623072e-06, + "loss": 0.6549, + "mean_token_accuracy": 0.7948375590145588, + "num_tokens": 837847437.0, + "step": 880 + }, + { + "epoch": 1.2572392965002666, + "grad_norm": 0.3875626366429437, + "learning_rate": 7.789196749026349e-06, + "loss": 0.6519, + "mean_token_accuracy": 0.7962292313575745, + "num_tokens": 842595165.0, + "step": 885 + }, + { + "epoch": 1.2643453544146386, + "grad_norm": 0.3461153977326627, + "learning_rate": 7.692657575733928e-06, + "loss": 0.6591, + "mean_token_accuracy": 0.7930607885122299, + "num_tokens": 847377566.0, + "step": 890 + }, + { + "epoch": 1.2714514123290104, + "grad_norm": 0.3539458234583982, + "learning_rate": 7.596556376936328e-06, + "loss": 0.6585, + "mean_token_accuracy": 0.7939456604421139, + "num_tokens": 852138878.0, + "step": 895 + }, + { + "epoch": 1.2785574702433826, + "grad_norm": 0.3776944250010228, + "learning_rate": 7.500905878825335e-06, + "loss": 0.6552, + "mean_token_accuracy": 0.7954832412302494, + "num_tokens": 856908856.0, + "step": 900 + }, + { + "epoch": 1.2785574702433826, + "eval_loss": 0.6587108373641968, + "eval_mean_token_accuracy": 0.7879830112582759, + "eval_num_tokens": 856908856.0, + "eval_runtime": 150.1109, + "eval_samples_per_second": 24.242, + "eval_steps_per_second": 0.759, + "step": 900 + }, + { + "epoch": 1.2856635281577544, + "grad_norm": 0.35417475896948203, + "learning_rate": 7.405718747908743e-06, + "loss": 0.6554, + "mean_token_accuracy": 0.7936457127332688, + "num_tokens": 861668793.0, + "step": 905 + }, + { + "epoch": 1.2927695860721264, + "grad_norm": 0.3717646188257627, + "learning_rate": 7.311007589332986e-06, + "loss": 0.6587, + "mean_token_accuracy": 0.7932697109878063, + "num_tokens": 866418403.0, + "step": 910 + }, + { + "epoch": 1.2998756439864985, + "grad_norm": 0.38878740217599195, + "learning_rate": 7.216784945213913e-06, + "loss": 0.6625, + "mean_token_accuracy": 0.7936202257871627, + "num_tokens": 871159945.0, + "step": 915 + }, + { + "epoch": 1.3069817019008705, + "grad_norm": 0.3422730868265931, + "learning_rate": 7.123063292975889e-06, + "loss": 0.6525, + "mean_token_accuracy": 0.794795686006546, + "num_tokens": 875924929.0, + "step": 920 + }, + { + "epoch": 1.3140877598152425, + "grad_norm": 0.3694651695886196, + "learning_rate": 7.02985504369949e-06, + "loss": 0.6547, + "mean_token_accuracy": 0.7951326429843902, + "num_tokens": 880666672.0, + "step": 925 + }, + { + "epoch": 1.3211938177296145, + "grad_norm": 0.3621570732014425, + "learning_rate": 6.937172540477944e-06, + "loss": 0.6654, + "mean_token_accuracy": 0.7919820554554462, + "num_tokens": 885436601.0, + "step": 930 + }, + { + "epoch": 1.3282998756439865, + "grad_norm": 0.7187487529688721, + "learning_rate": 6.8450280567826074e-06, + "loss": 0.6636, + "mean_token_accuracy": 0.792605972290039, + "num_tokens": 890209266.0, + "step": 935 + }, + { + "epoch": 1.3354059335583586, + "grad_norm": 0.3607746297969546, + "learning_rate": 6.753433794837663e-06, + "loss": 0.655, + "mean_token_accuracy": 0.7943654432892799, + "num_tokens": 894978447.0, + "step": 940 + }, + { + "epoch": 1.3425119914727306, + "grad_norm": 0.34656870317990024, + "learning_rate": 6.662401884004226e-06, + "loss": 0.6594, + "mean_token_accuracy": 0.7929094567894935, + "num_tokens": 899731953.0, + "step": 945 + }, + { + "epoch": 1.3496180493871024, + "grad_norm": 0.3556403292186795, + "learning_rate": 6.571944379174128e-06, + "loss": 0.6557, + "mean_token_accuracy": 0.7939096741378308, + "num_tokens": 904484204.0, + "step": 950 + }, + { + "epoch": 1.3567241073014746, + "grad_norm": 0.3481029203946856, + "learning_rate": 6.482073259173533e-06, + "loss": 0.6558, + "mean_token_accuracy": 0.795223805308342, + "num_tokens": 909254980.0, + "step": 955 + }, + { + "epoch": 1.3638301652158464, + "grad_norm": 0.36246749377385634, + "learning_rate": 6.39280042517666e-06, + "loss": 0.6576, + "mean_token_accuracy": 0.794485367834568, + "num_tokens": 914013636.0, + "step": 960 + }, + { + "epoch": 1.3709362231302185, + "grad_norm": 0.35744165552118257, + "learning_rate": 6.304137699129758e-06, + "loss": 0.6521, + "mean_token_accuracy": 0.7954901576042175, + "num_tokens": 918774652.0, + "step": 965 + }, + { + "epoch": 1.3780422810445905, + "grad_norm": 0.34190890162690535, + "learning_rate": 6.216096822185591e-06, + "loss": 0.6596, + "mean_token_accuracy": 0.7934505857527256, + "num_tokens": 923523836.0, + "step": 970 + }, + { + "epoch": 1.3851483389589625, + "grad_norm": 0.3417362920171981, + "learning_rate": 6.12868945314862e-06, + "loss": 0.6647, + "mean_token_accuracy": 0.7919908218085766, + "num_tokens": 928304038.0, + "step": 975 + }, + { + "epoch": 1.3922543968733345, + "grad_norm": 0.3435048840223176, + "learning_rate": 6.041927166931078e-06, + "loss": 0.6577, + "mean_token_accuracy": 0.7943571574985981, + "num_tokens": 933073919.0, + "step": 980 + }, + { + "epoch": 1.3993604547877065, + "grad_norm": 0.40933619958354717, + "learning_rate": 5.9558214530201784e-06, + "loss": 0.6575, + "mean_token_accuracy": 0.7943412482738494, + "num_tokens": 937846004.0, + "step": 985 + }, + { + "epoch": 1.4064665127020786, + "grad_norm": 0.3515117903938119, + "learning_rate": 5.870383713956601e-06, + "loss": 0.6599, + "mean_token_accuracy": 0.7938548773527145, + "num_tokens": 942601267.0, + "step": 990 + }, + { + "epoch": 1.4135725706164506, + "grad_norm": 0.38419237136120965, + "learning_rate": 5.785625263824531e-06, + "loss": 0.6552, + "mean_token_accuracy": 0.7948469713330268, + "num_tokens": 947375335.0, + "step": 995 + }, + { + "epoch": 1.4206786285308226, + "grad_norm": 0.38108767014595607, + "learning_rate": 5.701557326753375e-06, + "loss": 0.6504, + "mean_token_accuracy": 0.7960710853338242, + "num_tokens": 952105402.0, + "step": 1000 + }, + { + "epoch": 1.4206786285308226, + "eval_loss": 0.6552348136901855, + "eval_mean_token_accuracy": 0.7893575147578591, + "eval_num_tokens": 952105402.0, + "eval_runtime": 149.5045, + "eval_samples_per_second": 24.34, + "eval_steps_per_second": 0.763, + "step": 1000 + }, + { + "epoch": 1.4277846864451944, + "grad_norm": 0.3429174906843955, + "learning_rate": 5.6181910354314265e-06, + "loss": 0.6596, + "mean_token_accuracy": 0.7940759062767029, + "num_tokens": 956874826.0, + "step": 1005 + }, + { + "epoch": 1.4348907443595666, + "grad_norm": 0.37307240441832, + "learning_rate": 5.5355374296316e-06, + "loss": 0.6589, + "mean_token_accuracy": 0.7940549589693546, + "num_tokens": 961632193.0, + "step": 1010 + }, + { + "epoch": 1.4419968022739384, + "grad_norm": 0.3463607871324184, + "learning_rate": 5.4536074547495055e-06, + "loss": 0.6576, + "mean_token_accuracy": 0.7948333404958248, + "num_tokens": 966392410.0, + "step": 1015 + }, + { + "epoch": 1.4491028601883105, + "grad_norm": 0.34652042003246536, + "learning_rate": 5.372411960353996e-06, + "loss": 0.6636, + "mean_token_accuracy": 0.7924063883721828, + "num_tokens": 971170949.0, + "step": 1020 + }, + { + "epoch": 1.4562089181026825, + "grad_norm": 0.33480848996072166, + "learning_rate": 5.2919616987504205e-06, + "loss": 0.6436, + "mean_token_accuracy": 0.7979453206062317, + "num_tokens": 975920452.0, + "step": 1025 + }, + { + "epoch": 1.4633149760170545, + "grad_norm": 0.35444982881545845, + "learning_rate": 5.212267323556754e-06, + "loss": 0.6488, + "mean_token_accuracy": 0.7975021339952946, + "num_tokens": 980657772.0, + "step": 1030 + }, + { + "epoch": 1.4704210339314265, + "grad_norm": 0.3236146641950486, + "learning_rate": 5.1333393882927776e-06, + "loss": 0.6656, + "mean_token_accuracy": 0.7911154888570309, + "num_tokens": 985424225.0, + "step": 1035 + }, + { + "epoch": 1.4775270918457986, + "grad_norm": 0.36258575664825715, + "learning_rate": 5.055188344982549e-06, + "loss": 0.653, + "mean_token_accuracy": 0.7950214244425297, + "num_tokens": 990170268.0, + "step": 1040 + }, + { + "epoch": 1.4846331497601706, + "grad_norm": 0.3626547592700128, + "learning_rate": 4.977824542770279e-06, + "loss": 0.6645, + "mean_token_accuracy": 0.7932340361177921, + "num_tokens": 994933612.0, + "step": 1045 + }, + { + "epoch": 1.4917392076745426, + "grad_norm": 0.3424014804959087, + "learning_rate": 4.901258226549855e-06, + "loss": 0.6499, + "mean_token_accuracy": 0.7964041963219642, + "num_tokens": 999695033.0, + "step": 1050 + }, + { + "epoch": 1.4988452655889146, + "grad_norm": 0.3457300931458133, + "learning_rate": 4.825499535608169e-06, + "loss": 0.659, + "mean_token_accuracy": 0.7942204736173153, + "num_tokens": 1004453306.0, + "step": 1055 + }, + { + "epoch": 1.5059513235032864, + "grad_norm": 0.407404649365606, + "learning_rate": 4.750558502282403e-06, + "loss": 0.6466, + "mean_token_accuracy": 0.7969782948493958, + "num_tokens": 1009222958.0, + "step": 1060 + }, + { + "epoch": 1.5130573814176587, + "grad_norm": 0.36455466451492874, + "learning_rate": 4.676445050631517e-06, + "loss": 0.6669, + "mean_token_accuracy": 0.7919491566717625, + "num_tokens": 1013988411.0, + "step": 1065 + }, + { + "epoch": 1.5201634393320305, + "grad_norm": 0.3352428252596833, + "learning_rate": 4.603168995122048e-06, + "loss": 0.653, + "mean_token_accuracy": 0.7959543123841286, + "num_tokens": 1018736541.0, + "step": 1070 + }, + { + "epoch": 1.5272694972464027, + "grad_norm": 0.336146168164258, + "learning_rate": 4.530740039328427e-06, + "loss": 0.6527, + "mean_token_accuracy": 0.795566051453352, + "num_tokens": 1023492540.0, + "step": 1075 + }, + { + "epoch": 1.5343755551607745, + "grad_norm": 0.35998475713849004, + "learning_rate": 4.4591677746479935e-06, + "loss": 0.6542, + "mean_token_accuracy": 0.7954114884138107, + "num_tokens": 1028251642.0, + "step": 1080 + }, + { + "epoch": 1.5414816130751465, + "grad_norm": 0.4210390284353885, + "learning_rate": 4.38846167903085e-06, + "loss": 0.6501, + "mean_token_accuracy": 0.7963161066174507, + "num_tokens": 1033004523.0, + "step": 1085 + }, + { + "epoch": 1.5485876709895185, + "grad_norm": 0.4960482537413715, + "learning_rate": 4.318631115724741e-06, + "loss": 0.6553, + "mean_token_accuracy": 0.7946652464568615, + "num_tokens": 1037760729.0, + "step": 1090 + }, + { + "epoch": 1.5556937289038906, + "grad_norm": 0.3799332566110809, + "learning_rate": 4.2496853320351424e-06, + "loss": 0.6607, + "mean_token_accuracy": 0.7947109803557396, + "num_tokens": 1042523723.0, + "step": 1095 + }, + { + "epoch": 1.5627997868182626, + "grad_norm": 0.33946558046244235, + "learning_rate": 4.1816334581006656e-06, + "loss": 0.6651, + "mean_token_accuracy": 0.792590418457985, + "num_tokens": 1047291640.0, + "step": 1100 + }, + { + "epoch": 1.5627997868182626, + "eval_loss": 0.6525910496711731, + "eval_mean_token_accuracy": 0.7899543700510996, + "eval_num_tokens": 1047291640.0, + "eval_runtime": 150.3086, + "eval_samples_per_second": 24.21, + "eval_steps_per_second": 0.758, + "step": 1100 + }, + { + "epoch": 1.5699058447326346, + "grad_norm": 0.32287060668756534, + "learning_rate": 4.114484505684019e-06, + "loss": 0.6541, + "mean_token_accuracy": 0.7952132284641266, + "num_tokens": 1052042031.0, + "step": 1105 + }, + { + "epoch": 1.5770119026470066, + "grad_norm": 0.3370631693518313, + "learning_rate": 4.048247366978606e-06, + "loss": 0.658, + "mean_token_accuracy": 0.7935857936739922, + "num_tokens": 1056804804.0, + "step": 1110 + }, + { + "epoch": 1.5841179605613784, + "grad_norm": 0.414769500614358, + "learning_rate": 3.9829308134309995e-06, + "loss": 0.6475, + "mean_token_accuracy": 0.7969807527959347, + "num_tokens": 1061577783.0, + "step": 1115 + }, + { + "epoch": 1.5912240184757507, + "grad_norm": 0.35600756916022547, + "learning_rate": 3.9185434945793725e-06, + "loss": 0.6559, + "mean_token_accuracy": 0.7951311826705932, + "num_tokens": 1066355020.0, + "step": 1120 + }, + { + "epoch": 1.5983300763901225, + "grad_norm": 0.36188029855593157, + "learning_rate": 3.855093936908081e-06, + "loss": 0.6664, + "mean_token_accuracy": 0.7921065390110016, + "num_tokens": 1071139121.0, + "step": 1125 + }, + { + "epoch": 1.6054361343044947, + "grad_norm": 0.3632538405728989, + "learning_rate": 3.7925905427185504e-06, + "loss": 0.6569, + "mean_token_accuracy": 0.7936886362731457, + "num_tokens": 1075914044.0, + "step": 1130 + }, + { + "epoch": 1.6125421922188665, + "grad_norm": 0.3669437696101656, + "learning_rate": 3.7310415890166e-06, + "loss": 0.6512, + "mean_token_accuracy": 0.7960372731089592, + "num_tokens": 1080682478.0, + "step": 1135 + }, + { + "epoch": 1.6196482501332385, + "grad_norm": 0.3331543509156551, + "learning_rate": 3.6704552264163695e-06, + "loss": 0.6561, + "mean_token_accuracy": 0.7935027062892914, + "num_tokens": 1085456231.0, + "step": 1140 + }, + { + "epoch": 1.6267543080476106, + "grad_norm": 0.3299992523767914, + "learning_rate": 3.6108394780609513e-06, + "loss": 0.6506, + "mean_token_accuracy": 0.7957184061408042, + "num_tokens": 1090215557.0, + "step": 1145 + }, + { + "epoch": 1.6338603659619826, + "grad_norm": 0.3450779481067245, + "learning_rate": 3.552202238559953e-06, + "loss": 0.6429, + "mean_token_accuracy": 0.798128329962492, + "num_tokens": 1094959524.0, + "step": 1150 + }, + { + "epoch": 1.6409664238763546, + "grad_norm": 0.3531321257611881, + "learning_rate": 3.4945512729440413e-06, + "loss": 0.6503, + "mean_token_accuracy": 0.7954187601804733, + "num_tokens": 1099731395.0, + "step": 1155 + }, + { + "epoch": 1.6480724817907266, + "grad_norm": 0.3498741705682094, + "learning_rate": 3.437894215636661e-06, + "loss": 0.6578, + "mean_token_accuracy": 0.7941137261688709, + "num_tokens": 1104494157.0, + "step": 1160 + }, + { + "epoch": 1.6551785397050987, + "grad_norm": 0.3777101584751149, + "learning_rate": 3.382238569443045e-06, + "loss": 0.6529, + "mean_token_accuracy": 0.7957448020577431, + "num_tokens": 1109252674.0, + "step": 1165 + }, + { + "epoch": 1.6622845976194704, + "grad_norm": 0.3915263029819705, + "learning_rate": 3.3275917045566596e-06, + "loss": 0.6517, + "mean_token_accuracy": 0.7957381546497345, + "num_tokens": 1114004017.0, + "step": 1170 + }, + { + "epoch": 1.6693906555338427, + "grad_norm": 0.33568853526043657, + "learning_rate": 3.2739608575832056e-06, + "loss": 0.6412, + "mean_token_accuracy": 0.7980836987495422, + "num_tokens": 1118768157.0, + "step": 1175 + }, + { + "epoch": 1.6764967134482145, + "grad_norm": 0.34752341789739466, + "learning_rate": 3.2213531305823125e-06, + "loss": 0.6613, + "mean_token_accuracy": 0.7935202896595002, + "num_tokens": 1123535145.0, + "step": 1180 + }, + { + "epoch": 1.6836027713625867, + "grad_norm": 0.34083716495360583, + "learning_rate": 3.1697754901270477e-06, + "loss": 0.6507, + "mean_token_accuracy": 0.7964440450072289, + "num_tokens": 1128307445.0, + "step": 1185 + }, + { + "epoch": 1.6907088292769585, + "grad_norm": 0.3774093707155365, + "learning_rate": 3.1192347663813684e-06, + "loss": 0.6547, + "mean_token_accuracy": 0.7946882367134094, + "num_tokens": 1133071242.0, + "step": 1190 + }, + { + "epoch": 1.6978148871913306, + "grad_norm": 0.36333647453863616, + "learning_rate": 3.0697376521956377e-06, + "loss": 0.6526, + "mean_token_accuracy": 0.7956908911466598, + "num_tokens": 1137831284.0, + "step": 1195 + }, + { + "epoch": 1.7049209451057026, + "grad_norm": 0.33780601240351715, + "learning_rate": 3.021290702220331e-06, + "loss": 0.6561, + "mean_token_accuracy": 0.7948304824531078, + "num_tokens": 1142587626.0, + "step": 1200 + }, + { + "epoch": 1.7049209451057026, + "eval_loss": 0.6507056355476379, + "eval_mean_token_accuracy": 0.790492679466281, + "eval_num_tokens": 1142587626.0, + "eval_runtime": 149.5446, + "eval_samples_per_second": 24.334, + "eval_steps_per_second": 0.762, + "step": 1200 + }, + { + "epoch": 1.7120270030200746, + "grad_norm": 0.3537743025090112, + "learning_rate": 2.9739003320380237e-06, + "loss": 0.6624, + "mean_token_accuracy": 0.793489520996809, + "num_tokens": 1147357460.0, + "step": 1205 + }, + { + "epoch": 1.7191330609344466, + "grad_norm": 0.4133446002595886, + "learning_rate": 2.927572817313823e-06, + "loss": 0.6585, + "mean_token_accuracy": 0.7936319254338742, + "num_tokens": 1152138440.0, + "step": 1210 + }, + { + "epoch": 1.7262391188488186, + "grad_norm": 0.3563260686728207, + "learning_rate": 2.8823142929643043e-06, + "loss": 0.6426, + "mean_token_accuracy": 0.797927625477314, + "num_tokens": 1156890428.0, + "step": 1215 + }, + { + "epoch": 1.7333451767631907, + "grad_norm": 0.3989494711298427, + "learning_rate": 2.838130752345092e-06, + "loss": 0.6582, + "mean_token_accuracy": 0.7947382763028145, + "num_tokens": 1161657895.0, + "step": 1220 + }, + { + "epoch": 1.7404512346775625, + "grad_norm": 0.3432409644849126, + "learning_rate": 2.7950280464572066e-06, + "loss": 0.6541, + "mean_token_accuracy": 0.7953485876321793, + "num_tokens": 1166423043.0, + "step": 1225 + }, + { + "epoch": 1.7475572925919347, + "grad_norm": 0.3368844710605464, + "learning_rate": 2.7530118831722286e-06, + "loss": 0.6481, + "mean_token_accuracy": 0.796825060248375, + "num_tokens": 1171166643.0, + "step": 1230 + }, + { + "epoch": 1.7546633505063065, + "grad_norm": 0.45251140872957446, + "learning_rate": 2.7120878264764437e-06, + "loss": 0.6473, + "mean_token_accuracy": 0.7977175071835518, + "num_tokens": 1175924107.0, + "step": 1235 + }, + { + "epoch": 1.7617694084206788, + "grad_norm": 0.3545121531531998, + "learning_rate": 2.67226129573403e-06, + "loss": 0.6512, + "mean_token_accuracy": 0.7960038974881172, + "num_tokens": 1180681893.0, + "step": 1240 + }, + { + "epoch": 1.7688754663350506, + "grad_norm": 0.3323525365320921, + "learning_rate": 2.633537564969398e-06, + "loss": 0.6557, + "mean_token_accuracy": 0.7952632494270802, + "num_tokens": 1185447027.0, + "step": 1245 + }, + { + "epoch": 1.7759815242494228, + "grad_norm": 0.34600724842490793, + "learning_rate": 2.5959217621687823e-06, + "loss": 0.6608, + "mean_token_accuracy": 0.7938597463071346, + "num_tokens": 1190231791.0, + "step": 1250 + }, + { + "epoch": 1.7830875821637946, + "grad_norm": 0.3678379104234008, + "learning_rate": 2.5594188686011616e-06, + "loss": 0.6541, + "mean_token_accuracy": 0.7947786100208759, + "num_tokens": 1194998688.0, + "step": 1255 + }, + { + "epoch": 1.7901936400781666, + "grad_norm": 0.36986419243022695, + "learning_rate": 2.524033718158621e-06, + "loss": 0.6492, + "mean_token_accuracy": 0.7966626077890396, + "num_tokens": 1199764688.0, + "step": 1260 + }, + { + "epoch": 1.7972996979925386, + "grad_norm": 0.36634691155162535, + "learning_rate": 2.489770996716227e-06, + "loss": 0.6549, + "mean_token_accuracy": 0.7945116639137269, + "num_tokens": 1204526423.0, + "step": 1265 + }, + { + "epoch": 1.8044057559069107, + "grad_norm": 0.3592637763107603, + "learning_rate": 2.456635241511491e-06, + "loss": 0.6436, + "mean_token_accuracy": 0.7984024800360203, + "num_tokens": 1209280668.0, + "step": 1270 + }, + { + "epoch": 1.8115118138212827, + "grad_norm": 0.3637888435360703, + "learning_rate": 2.4246308405435314e-06, + "loss": 0.6503, + "mean_token_accuracy": 0.7954847238957882, + "num_tokens": 1214048139.0, + "step": 1275 + }, + { + "epoch": 1.8186178717356547, + "grad_norm": 0.3515690726134511, + "learning_rate": 2.3937620319919966e-06, + "loss": 0.6471, + "mean_token_accuracy": 0.7975172877311707, + "num_tokens": 1218805359.0, + "step": 1280 + }, + { + "epoch": 1.8257239296500267, + "grad_norm": 0.37901691417441497, + "learning_rate": 2.3640329036558167e-06, + "loss": 0.6458, + "mean_token_accuracy": 0.7973252393305301, + "num_tokens": 1223580683.0, + "step": 1285 + }, + { + "epoch": 1.8328299875643985, + "grad_norm": 0.48078408366926273, + "learning_rate": 2.3354473924118843e-06, + "loss": 0.6517, + "mean_token_accuracy": 0.7954902827739716, + "num_tokens": 1228344380.0, + "step": 1290 + }, + { + "epoch": 1.8399360454787708, + "grad_norm": 0.3487084891659478, + "learning_rate": 2.3080092836937124e-06, + "loss": 0.649, + "mean_token_accuracy": 0.7968501009047031, + "num_tokens": 1233124681.0, + "step": 1295 + }, + { + "epoch": 1.8470421033931426, + "grad_norm": 0.34640131501737065, + "learning_rate": 2.2817222109901442e-06, + "loss": 0.6448, + "mean_token_accuracy": 0.7978550389409065, + "num_tokens": 1237873166.0, + "step": 1300 + }, + { + "epoch": 1.8470421033931426, + "eval_loss": 0.6490960121154785, + "eval_mean_token_accuracy": 0.7908800155447241, + "eval_num_tokens": 1237873166.0, + "eval_runtime": 149.9569, + "eval_samples_per_second": 24.267, + "eval_steps_per_second": 0.76, + "step": 1300 + }, + { + "epoch": 1.8541481613075148, + "grad_norm": 0.3243906306128693, + "learning_rate": 2.256589655364193e-06, + "loss": 0.6593, + "mean_token_accuracy": 0.7929202131927013, + "num_tokens": 1242627340.0, + "step": 1305 + }, + { + "epoch": 1.8612542192218866, + "grad_norm": 0.37597198041798413, + "learning_rate": 2.2326149449920653e-06, + "loss": 0.6446, + "mean_token_accuracy": 0.797098808735609, + "num_tokens": 1247387461.0, + "step": 1310 + }, + { + "epoch": 1.8683602771362586, + "grad_norm": 0.35265594906604686, + "learning_rate": 2.2098012547224197e-06, + "loss": 0.6513, + "mean_token_accuracy": 0.7950267992913723, + "num_tokens": 1252135688.0, + "step": 1315 + }, + { + "epoch": 1.8754663350506307, + "grad_norm": 0.3583812845832173, + "learning_rate": 2.188151605655942e-06, + "loss": 0.6521, + "mean_token_accuracy": 0.7945805780589581, + "num_tokens": 1256903702.0, + "step": 1320 + }, + { + "epoch": 1.8825723929650027, + "grad_norm": 0.3577801661959976, + "learning_rate": 2.1676688647452795e-06, + "loss": 0.6437, + "mean_token_accuracy": 0.7986263297498226, + "num_tokens": 1261633144.0, + "step": 1325 + }, + { + "epoch": 1.8896784508793747, + "grad_norm": 0.35744367217582623, + "learning_rate": 2.1483557444153795e-06, + "loss": 0.649, + "mean_token_accuracy": 0.7966003373265267, + "num_tokens": 1266390903.0, + "step": 1330 + }, + { + "epoch": 1.8967845087937467, + "grad_norm": 0.39747974453689555, + "learning_rate": 2.1302148022042993e-06, + "loss": 0.6491, + "mean_token_accuracy": 0.7970162339508533, + "num_tokens": 1271162270.0, + "step": 1335 + }, + { + "epoch": 1.9038905667081187, + "grad_norm": 0.3547995480225708, + "learning_rate": 2.113248440424526e-06, + "loss": 0.643, + "mean_token_accuracy": 0.7987522542476654, + "num_tokens": 1275906083.0, + "step": 1340 + }, + { + "epoch": 1.9109966246224905, + "grad_norm": 0.3924659274346196, + "learning_rate": 2.0974589058448456e-06, + "loss": 0.6499, + "mean_token_accuracy": 0.7970600210130214, + "num_tokens": 1280649985.0, + "step": 1345 + }, + { + "epoch": 1.9181026825368628, + "grad_norm": 0.3450930632168253, + "learning_rate": 2.0828482893928208e-06, + "loss": 0.6525, + "mean_token_accuracy": 0.795515525341034, + "num_tokens": 1285434113.0, + "step": 1350 + }, + { + "epoch": 1.9252087404512346, + "grad_norm": 0.33567083461731984, + "learning_rate": 2.069418525877897e-06, + "loss": 0.644, + "mean_token_accuracy": 0.798255106061697, + "num_tokens": 1290191830.0, + "step": 1355 + }, + { + "epoch": 1.9323147983656068, + "grad_norm": 0.36694627449422723, + "learning_rate": 2.0571713937351834e-06, + "loss": 0.6397, + "mean_token_accuracy": 0.7977312818169594, + "num_tokens": 1294948980.0, + "step": 1360 + }, + { + "epoch": 1.9394208562799786, + "grad_norm": 0.362232508705221, + "learning_rate": 2.0461085147899497e-06, + "loss": 0.6457, + "mean_token_accuracy": 0.7973731994628906, + "num_tokens": 1299719386.0, + "step": 1365 + }, + { + "epoch": 1.9465269141943506, + "grad_norm": 0.3669523670641092, + "learning_rate": 2.0362313540428485e-06, + "loss": 0.6472, + "mean_token_accuracy": 0.797086289525032, + "num_tokens": 1304487261.0, + "step": 1370 + }, + { + "epoch": 1.9536329721087227, + "grad_norm": 0.37200879998568015, + "learning_rate": 2.027541219475922e-06, + "loss": 0.6475, + "mean_token_accuracy": 0.7960396580398083, + "num_tokens": 1309241194.0, + "step": 1375 + }, + { + "epoch": 1.9607390300230947, + "grad_norm": 0.3688539723341265, + "learning_rate": 2.020039261879382e-06, + "loss": 0.6573, + "mean_token_accuracy": 0.7950874969363213, + "num_tokens": 1314011836.0, + "step": 1380 + }, + { + "epoch": 1.9678450879374667, + "grad_norm": 0.43966459213132447, + "learning_rate": 2.013726474699225e-06, + "loss": 0.6505, + "mean_token_accuracy": 0.7958736583590508, + "num_tokens": 1318761485.0, + "step": 1385 + }, + { + "epoch": 1.9749511458518387, + "grad_norm": 0.34824004564185856, + "learning_rate": 2.008603693905673e-06, + "loss": 0.6476, + "mean_token_accuracy": 0.7972124963998795, + "num_tokens": 1323527340.0, + "step": 1390 + }, + { + "epoch": 1.9820572037662108, + "grad_norm": 0.37468360563296904, + "learning_rate": 2.0046715978824663e-06, + "loss": 0.6496, + "mean_token_accuracy": 0.7958178780972958, + "num_tokens": 1328302362.0, + "step": 1395 + }, + { + "epoch": 1.9891632616805826, + "grad_norm": 0.3561744251531493, + "learning_rate": 2.001930707337034e-06, + "loss": 0.6501, + "mean_token_accuracy": 0.7963785864412785, + "num_tokens": 1333062144.0, + "step": 1400 + }, + { + "epoch": 1.9891632616805826, + "eval_loss": 0.6482434868812561, + "eval_mean_token_accuracy": 0.7910316936802446, + "eval_num_tokens": 1333062144.0, + "eval_runtime": 149.6853, + "eval_samples_per_second": 24.311, + "eval_steps_per_second": 0.762, + "step": 1400 + }, + { + "epoch": 1.9962693195949548, + "grad_norm": 0.3552540336023921, + "learning_rate": 2.000381385231536e-06, + "loss": 0.656, + "mean_token_accuracy": 0.7951462939381599, + "num_tokens": 1337810166.0, + "step": 1405 + }, + { + "epoch": 2.0, + "mean_token_accuracy": 0.7962638111341567, + "num_tokens": 1340314150.0, + "step": 1408, + "total_flos": 1.0314062938243072e+16, + "train_loss": 0.7184352108531378, + "train_runtime": 49977.4428, + "train_samples_per_second": 14.417, + "train_steps_per_second": 0.028 } ], "logging_steps": 5, - "max_steps": 704, + "max_steps": 1408, "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 1, + "num_train_epochs": 2, + "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { @@ -1368,7 +2707,7 @@ "attributes": {} } }, - "total_flos": 6035679775555584.0, + "total_flos": 1.0314062938243072e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null