{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2846975088967972, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0035587188612099642, "grad_norm": 3.714656812928013, "learning_rate": 3.521126760563381e-06, "loss": 1.2324, "mean_token_accuracy": 0.706312108039856, "step": 5 }, { "epoch": 0.0071174377224199285, "grad_norm": 2.538884412766077, "learning_rate": 7.042253521126762e-06, "loss": 1.1399, "mean_token_accuracy": 0.7131082773208618, "step": 10 }, { "epoch": 0.010676156583629894, "grad_norm": 1.7771129703443833, "learning_rate": 1.056338028169014e-05, "loss": 0.9862, "mean_token_accuracy": 0.7257840037345886, "step": 15 }, { "epoch": 0.014234875444839857, "grad_norm": 1.1549668455493816, "learning_rate": 1.4084507042253523e-05, "loss": 0.8305, "mean_token_accuracy": 0.7594800233840943, "step": 20 }, { "epoch": 0.017793594306049824, "grad_norm": 0.8084059187346242, "learning_rate": 1.7605633802816902e-05, "loss": 0.7989, "mean_token_accuracy": 0.7640575289726257, "step": 25 }, { "epoch": 0.021352313167259787, "grad_norm": 0.8267562843107086, "learning_rate": 2.112676056338028e-05, "loss": 0.7668, "mean_token_accuracy": 0.7727443218231201, "step": 30 }, { "epoch": 0.02491103202846975, "grad_norm": 0.7851378820161544, "learning_rate": 2.4647887323943664e-05, "loss": 0.7649, "mean_token_accuracy": 0.7709716081619262, "step": 35 }, { "epoch": 0.028469750889679714, "grad_norm": 0.7637956379243507, "learning_rate": 2.8169014084507046e-05, "loss": 0.7394, "mean_token_accuracy": 0.7772023797035217, "step": 40 }, { "epoch": 0.03202846975088968, "grad_norm": 0.7732620068824387, "learning_rate": 3.1690140845070426e-05, "loss": 0.7346, "mean_token_accuracy": 0.779318380355835, "step": 45 }, { "epoch": 0.03558718861209965, "grad_norm": 0.7696301616562734, "learning_rate": 3.5211267605633805e-05, "loss": 0.7206, "mean_token_accuracy": 0.7811890006065368, "step": 50 }, { "epoch": 0.03914590747330961, "grad_norm": 0.7965650840150478, "learning_rate": 3.8732394366197184e-05, "loss": 0.7099, "mean_token_accuracy": 0.7832624912261963, "step": 55 }, { "epoch": 0.042704626334519574, "grad_norm": 0.732913212165206, "learning_rate": 4.225352112676056e-05, "loss": 0.7365, "mean_token_accuracy": 0.778208339214325, "step": 60 }, { "epoch": 0.046263345195729534, "grad_norm": 0.840463903142822, "learning_rate": 4.577464788732395e-05, "loss": 0.7158, "mean_token_accuracy": 0.7836069345474244, "step": 65 }, { "epoch": 0.0498220640569395, "grad_norm": 0.7796733146584423, "learning_rate": 4.929577464788733e-05, "loss": 0.6948, "mean_token_accuracy": 0.7882483005523682, "step": 70 }, { "epoch": 0.05338078291814947, "grad_norm": 0.7826425373484522, "learning_rate": 4.999900170848507e-05, "loss": 0.7299, "mean_token_accuracy": 0.7795511484146118, "step": 75 }, { "epoch": 0.05693950177935943, "grad_norm": 0.7664471564553723, "learning_rate": 4.9994946301028825e-05, "loss": 0.7205, "mean_token_accuracy": 0.7806410670280457, "step": 80 }, { "epoch": 0.060498220640569395, "grad_norm": 0.6968982746230619, "learning_rate": 4.99877719462654e-05, "loss": 0.7217, "mean_token_accuracy": 0.7813021302223205, "step": 85 }, { "epoch": 0.06405693950177936, "grad_norm": 0.7093605930206083, "learning_rate": 4.997747963892645e-05, "loss": 0.7337, "mean_token_accuracy": 0.7769145965576172, "step": 90 }, { "epoch": 0.06761565836298933, "grad_norm": 0.7406894880280147, "learning_rate": 4.99640708060509e-05, "loss": 0.7337, "mean_token_accuracy": 0.7780750393867493, "step": 95 }, { "epoch": 0.0711743772241993, "grad_norm": 0.7318985545441932, "learning_rate": 4.994754730678713e-05, "loss": 0.727, "mean_token_accuracy": 0.7799831032752991, "step": 100 }, { "epoch": 0.07473309608540925, "grad_norm": 0.7639016380490891, "learning_rate": 4.992791143213523e-05, "loss": 0.7247, "mean_token_accuracy": 0.7811060786247254, "step": 105 }, { "epoch": 0.07829181494661921, "grad_norm": 0.8011798663105023, "learning_rate": 4.990516590462928e-05, "loss": 0.7161, "mean_token_accuracy": 0.7826329588890075, "step": 110 }, { "epoch": 0.08185053380782918, "grad_norm": 0.7616291847324664, "learning_rate": 4.9879313877959934e-05, "loss": 0.7172, "mean_token_accuracy": 0.7815984845161438, "step": 115 }, { "epoch": 0.08540925266903915, "grad_norm": 0.6970328489587734, "learning_rate": 4.985035893653713e-05, "loss": 0.6988, "mean_token_accuracy": 0.7884742975234985, "step": 120 }, { "epoch": 0.08896797153024912, "grad_norm": 0.6887071171016819, "learning_rate": 4.9818305094993096e-05, "loss": 0.7252, "mean_token_accuracy": 0.7797473907470703, "step": 125 }, { "epoch": 0.09252669039145907, "grad_norm": 0.782855999297394, "learning_rate": 4.978315679762574e-05, "loss": 0.7182, "mean_token_accuracy": 0.7811893105506897, "step": 130 }, { "epoch": 0.09608540925266904, "grad_norm": 0.6962527878171872, "learning_rate": 4.9744918917782446e-05, "loss": 0.7274, "mean_token_accuracy": 0.7792230010032654, "step": 135 }, { "epoch": 0.099644128113879, "grad_norm": 0.6966111313941732, "learning_rate": 4.9703596757184346e-05, "loss": 0.6958, "mean_token_accuracy": 0.78841632604599, "step": 140 }, { "epoch": 0.10320284697508897, "grad_norm": 0.6965295119485749, "learning_rate": 4.965919604519125e-05, "loss": 0.7158, "mean_token_accuracy": 0.782188069820404, "step": 145 }, { "epoch": 0.10676156583629894, "grad_norm": 0.7008145512070334, "learning_rate": 4.96117229380073e-05, "loss": 0.7311, "mean_token_accuracy": 0.7790884852409363, "step": 150 }, { "epoch": 0.1103202846975089, "grad_norm": 0.7210254246058475, "learning_rate": 4.956118401782734e-05, "loss": 0.6856, "mean_token_accuracy": 0.7896694540977478, "step": 155 }, { "epoch": 0.11387900355871886, "grad_norm": 0.7230055796086301, "learning_rate": 4.950758629192433e-05, "loss": 0.7205, "mean_token_accuracy": 0.7811832308769227, "step": 160 }, { "epoch": 0.11743772241992882, "grad_norm": 0.7177778907287135, "learning_rate": 4.945093719167778e-05, "loss": 0.7033, "mean_token_accuracy": 0.7855350494384765, "step": 165 }, { "epoch": 0.12099644128113879, "grad_norm": 0.6993140429254685, "learning_rate": 4.939124457154336e-05, "loss": 0.716, "mean_token_accuracy": 0.7823803782463074, "step": 170 }, { "epoch": 0.12455516014234876, "grad_norm": 0.749036299071456, "learning_rate": 4.932851670796389e-05, "loss": 0.6944, "mean_token_accuracy": 0.78707515001297, "step": 175 }, { "epoch": 0.12811387900355872, "grad_norm": 0.8003534588080977, "learning_rate": 4.926276229822181e-05, "loss": 0.7039, "mean_token_accuracy": 0.7855878114700318, "step": 180 }, { "epoch": 0.13167259786476868, "grad_norm": 0.6511982930622267, "learning_rate": 4.919399045923326e-05, "loss": 0.7046, "mean_token_accuracy": 0.7856501579284668, "step": 185 }, { "epoch": 0.13523131672597866, "grad_norm": 0.6631202589315934, "learning_rate": 4.9122210726284046e-05, "loss": 0.6918, "mean_token_accuracy": 0.789514684677124, "step": 190 }, { "epoch": 0.1387900355871886, "grad_norm": 0.6877007898563937, "learning_rate": 4.904743305170753e-05, "loss": 0.6973, "mean_token_accuracy": 0.7876662492752076, "step": 195 }, { "epoch": 0.1423487544483986, "grad_norm": 0.6594871153113472, "learning_rate": 4.896966780350477e-05, "loss": 0.7106, "mean_token_accuracy": 0.7843179941177368, "step": 200 }, { "epoch": 0.14590747330960854, "grad_norm": 0.7077337595180323, "learning_rate": 4.888892576390694e-05, "loss": 0.7124, "mean_token_accuracy": 0.7831673264503479, "step": 205 }, { "epoch": 0.1494661921708185, "grad_norm": 0.6734240121888417, "learning_rate": 4.88052181278804e-05, "loss": 0.6822, "mean_token_accuracy": 0.7912026405334472, "step": 210 }, { "epoch": 0.15302491103202848, "grad_norm": 0.6209145189032446, "learning_rate": 4.871855650157446e-05, "loss": 0.7311, "mean_token_accuracy": 0.779719889163971, "step": 215 }, { "epoch": 0.15658362989323843, "grad_norm": 0.6857572196473167, "learning_rate": 4.8628952900712265e-05, "loss": 0.7039, "mean_token_accuracy": 0.7850899338722229, "step": 220 }, { "epoch": 0.1601423487544484, "grad_norm": 0.721269601531556, "learning_rate": 4.853641974892466e-05, "loss": 0.6751, "mean_token_accuracy": 0.7923677682876586, "step": 225 }, { "epoch": 0.16370106761565836, "grad_norm": 0.6569686267689558, "learning_rate": 4.8440969876027794e-05, "loss": 0.6902, "mean_token_accuracy": 0.7885013699531556, "step": 230 }, { "epoch": 0.16725978647686832, "grad_norm": 0.652284057104664, "learning_rate": 4.834261651624412e-05, "loss": 0.7002, "mean_token_accuracy": 0.7859378337860108, "step": 235 }, { "epoch": 0.1708185053380783, "grad_norm": 0.6985712892311762, "learning_rate": 4.824137330636756e-05, "loss": 0.6937, "mean_token_accuracy": 0.7884337782859803, "step": 240 }, { "epoch": 0.17437722419928825, "grad_norm": 0.6981291268835246, "learning_rate": 4.8137254283872696e-05, "loss": 0.706, "mean_token_accuracy": 0.7852147102355957, "step": 245 }, { "epoch": 0.17793594306049823, "grad_norm": 0.6830081055108704, "learning_rate": 4.803027388496845e-05, "loss": 0.6813, "mean_token_accuracy": 0.7909272313117981, "step": 250 }, { "epoch": 0.18149466192170818, "grad_norm": 0.6763389483738134, "learning_rate": 4.7920446942596535e-05, "loss": 0.7104, "mean_token_accuracy": 0.7841034054756164, "step": 255 }, { "epoch": 0.18505338078291814, "grad_norm": 0.6758095067389677, "learning_rate": 4.780778868437481e-05, "loss": 0.6912, "mean_token_accuracy": 0.7872965097427368, "step": 260 }, { "epoch": 0.18861209964412812, "grad_norm": 0.6654497461754171, "learning_rate": 4.769231473048598e-05, "loss": 0.7035, "mean_token_accuracy": 0.7856454253196716, "step": 265 }, { "epoch": 0.19217081850533807, "grad_norm": 0.6692448464596932, "learning_rate": 4.757404109151184e-05, "loss": 0.6956, "mean_token_accuracy": 0.7881039142608642, "step": 270 }, { "epoch": 0.19572953736654805, "grad_norm": 0.6384295872220533, "learning_rate": 4.745298416621336e-05, "loss": 0.6941, "mean_token_accuracy": 0.7865496397018432, "step": 275 }, { "epoch": 0.199288256227758, "grad_norm": 0.6917569136927582, "learning_rate": 4.7329160739257035e-05, "loss": 0.6653, "mean_token_accuracy": 0.7947100758552551, "step": 280 }, { "epoch": 0.20284697508896798, "grad_norm": 0.6237053314088369, "learning_rate": 4.720258797888762e-05, "loss": 0.6954, "mean_token_accuracy": 0.787773597240448, "step": 285 }, { "epoch": 0.20640569395017794, "grad_norm": 0.6404989768357283, "learning_rate": 4.707328343454777e-05, "loss": 0.7143, "mean_token_accuracy": 0.7825374364852905, "step": 290 }, { "epoch": 0.2099644128113879, "grad_norm": 0.6609383386205755, "learning_rate": 4.694126503444479e-05, "loss": 0.7104, "mean_token_accuracy": 0.7829306364059448, "step": 295 }, { "epoch": 0.21352313167259787, "grad_norm": 0.6390075364325144, "learning_rate": 4.680655108306484e-05, "loss": 0.7068, "mean_token_accuracy": 0.7846204996109009, "step": 300 }, { "epoch": 0.21708185053380782, "grad_norm": 0.7320785611015472, "learning_rate": 4.666916025863505e-05, "loss": 0.6866, "mean_token_accuracy": 0.7891976118087769, "step": 305 }, { "epoch": 0.2206405693950178, "grad_norm": 0.6542388529582787, "learning_rate": 4.652911161053369e-05, "loss": 0.6684, "mean_token_accuracy": 0.7946648597717285, "step": 310 }, { "epoch": 0.22419928825622776, "grad_norm": 0.6387809858585279, "learning_rate": 4.6386424556649046e-05, "loss": 0.7067, "mean_token_accuracy": 0.784406590461731, "step": 315 }, { "epoch": 0.2277580071174377, "grad_norm": 0.6511518481529157, "learning_rate": 4.624111888068704e-05, "loss": 0.6669, "mean_token_accuracy": 0.7934553146362304, "step": 320 }, { "epoch": 0.2313167259786477, "grad_norm": 0.7857269281531926, "learning_rate": 4.6093214729428236e-05, "loss": 0.6977, "mean_token_accuracy": 0.7871865391731262, "step": 325 }, { "epoch": 0.23487544483985764, "grad_norm": 0.708401390320286, "learning_rate": 4.5942732609934436e-05, "loss": 0.6917, "mean_token_accuracy": 0.7883163809776306, "step": 330 }, { "epoch": 0.23843416370106763, "grad_norm": 0.6537859721831119, "learning_rate": 4.57896933867054e-05, "loss": 0.7016, "mean_token_accuracy": 0.785762631893158, "step": 335 }, { "epoch": 0.24199288256227758, "grad_norm": 0.6322721246524066, "learning_rate": 4.563411827878591e-05, "loss": 0.6591, "mean_token_accuracy": 0.7976097822189331, "step": 340 }, { "epoch": 0.24555160142348753, "grad_norm": 0.613702823629008, "learning_rate": 4.5476028856823774e-05, "loss": 0.6831, "mean_token_accuracy": 0.7909620523452758, "step": 345 }, { "epoch": 0.2491103202846975, "grad_norm": 0.6470379557892744, "learning_rate": 4.531544704007899e-05, "loss": 0.6877, "mean_token_accuracy": 0.7888366222381592, "step": 350 }, { "epoch": 0.2526690391459075, "grad_norm": 0.6349334832549038, "learning_rate": 4.5152395093384655e-05, "loss": 0.6855, "mean_token_accuracy": 0.7899898767471314, "step": 355 }, { "epoch": 0.25622775800711745, "grad_norm": 0.6462491719466713, "learning_rate": 4.4986895624059934e-05, "loss": 0.6932, "mean_token_accuracy": 0.7873799800872803, "step": 360 }, { "epoch": 0.2597864768683274, "grad_norm": 0.630603740544669, "learning_rate": 4.481897157877545e-05, "loss": 0.709, "mean_token_accuracy": 0.7845431327819824, "step": 365 }, { "epoch": 0.26334519572953735, "grad_norm": 0.6086861709084705, "learning_rate": 4.464864624037182e-05, "loss": 0.6825, "mean_token_accuracy": 0.7913990497589112, "step": 370 }, { "epoch": 0.2669039145907473, "grad_norm": 1.2548196206715339, "learning_rate": 4.447594322463137e-05, "loss": 0.6954, "mean_token_accuracy": 0.7860358953475952, "step": 375 }, { "epoch": 0.2704626334519573, "grad_norm": 0.7181263405100091, "learning_rate": 4.4300886477003836e-05, "loss": 0.6813, "mean_token_accuracy": 0.7917158126831054, "step": 380 }, { "epoch": 0.27402135231316727, "grad_norm": 0.6711130910236095, "learning_rate": 4.412350026928628e-05, "loss": 0.6896, "mean_token_accuracy": 0.7883881092071533, "step": 385 }, { "epoch": 0.2775800711743772, "grad_norm": 0.6035405028151128, "learning_rate": 4.3943809196257794e-05, "loss": 0.6848, "mean_token_accuracy": 0.7895100355148316, "step": 390 }, { "epoch": 0.28113879003558717, "grad_norm": 0.5844919307530534, "learning_rate": 4.37618381722694e-05, "loss": 0.6668, "mean_token_accuracy": 0.7936931371688842, "step": 395 }, { "epoch": 0.2846975088967972, "grad_norm": 0.6492460475437084, "learning_rate": 4.357761242778965e-05, "loss": 0.6684, "mean_token_accuracy": 0.7950313925743103, "step": 400 }, { "epoch": 0.2846975088967972, "step": 400, "total_flos": 38010460569600.0, "train_loss": 0.0, "train_runtime": 2.1351, "train_samples_per_second": 2851.346, "train_steps_per_second": 44.962 } ], "logging_steps": 5, "max_steps": 96, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 38010460569600.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }