{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.997242140099283, "eval_steps": 500, "global_step": 339, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00882515168229454, "grad_norm": 0.40674829483032227, "learning_rate": 9.999785297426788e-05, "loss": 0.2055, "num_input_tokens_seen": 203120, "step": 1 }, { "epoch": 0.01765030336458908, "grad_norm": 0.4242195785045624, "learning_rate": 9.999141208146028e-05, "loss": 0.1902, "num_input_tokens_seen": 406048, "step": 2 }, { "epoch": 0.026475455046883617, "grad_norm": 0.3813261389732361, "learning_rate": 9.998067787472772e-05, "loss": 0.1421, "num_input_tokens_seen": 614736, "step": 3 }, { "epoch": 0.03530060672917816, "grad_norm": 0.28003761172294617, "learning_rate": 9.996565127593488e-05, "loss": 0.1102, "num_input_tokens_seen": 816416, "step": 4 }, { "epoch": 0.0441257584114727, "grad_norm": 0.25300610065460205, "learning_rate": 9.994633357558158e-05, "loss": 0.0801, "num_input_tokens_seen": 1024272, "step": 5 }, { "epoch": 0.052950910093767234, "grad_norm": 0.2328871786594391, "learning_rate": 9.99227264326918e-05, "loss": 0.0574, "num_input_tokens_seen": 1228192, "step": 6 }, { "epoch": 0.06177606177606178, "grad_norm": 0.17362241446971893, "learning_rate": 9.989483187467127e-05, "loss": 0.0401, "num_input_tokens_seen": 1434992, "step": 7 }, { "epoch": 0.07060121345835632, "grad_norm": 0.09250874817371368, "learning_rate": 9.986265229713331e-05, "loss": 0.0295, "num_input_tokens_seen": 1646560, "step": 8 }, { "epoch": 0.07942636514065085, "grad_norm": 0.08936059474945068, "learning_rate": 9.982619046369321e-05, "loss": 0.0262, "num_input_tokens_seen": 1838624, "step": 9 }, { "epoch": 0.0882515168229454, "grad_norm": 0.08603595942258835, "learning_rate": 9.978544950573074e-05, "loss": 0.0263, "num_input_tokens_seen": 2053488, "step": 10 }, { "epoch": 0.09707666850523994, "grad_norm": 0.07848804444074631, "learning_rate": 9.974043292212128e-05, "loss": 0.022, "num_input_tokens_seen": 2253680, "step": 11 }, { "epoch": 0.10590182018753447, "grad_norm": 0.06246768683195114, "learning_rate": 9.96911445789354e-05, "loss": 0.0202, "num_input_tokens_seen": 2442000, "step": 12 }, { "epoch": 0.11472697186982901, "grad_norm": 0.048259809613227844, "learning_rate": 9.963758870910671e-05, "loss": 0.0202, "num_input_tokens_seen": 2655920, "step": 13 }, { "epoch": 0.12355212355212356, "grad_norm": 0.03917853534221649, "learning_rate": 9.957976991206846e-05, "loss": 0.0178, "num_input_tokens_seen": 2874064, "step": 14 }, { "epoch": 0.13237727523441808, "grad_norm": 0.040510393679142, "learning_rate": 9.951769315335844e-05, "loss": 0.0158, "num_input_tokens_seen": 3071744, "step": 15 }, { "epoch": 0.14120242691671264, "grad_norm": 0.035558607429265976, "learning_rate": 9.945136376419259e-05, "loss": 0.0159, "num_input_tokens_seen": 3277904, "step": 16 }, { "epoch": 0.15002757859900717, "grad_norm": 0.034995947033166885, "learning_rate": 9.938078744100712e-05, "loss": 0.0147, "num_input_tokens_seen": 3493136, "step": 17 }, { "epoch": 0.1588527302813017, "grad_norm": 0.03230876475572586, "learning_rate": 9.930597024496931e-05, "loss": 0.0138, "num_input_tokens_seen": 3704288, "step": 18 }, { "epoch": 0.16767788196359626, "grad_norm": 0.028281500563025475, "learning_rate": 9.922691860145696e-05, "loss": 0.0128, "num_input_tokens_seen": 3904352, "step": 19 }, { "epoch": 0.1765030336458908, "grad_norm": 0.026264235377311707, "learning_rate": 9.914363929950659e-05, "loss": 0.0124, "num_input_tokens_seen": 4113888, "step": 20 }, { "epoch": 0.18532818532818532, "grad_norm": 0.023232094943523407, "learning_rate": 9.905613949123036e-05, "loss": 0.0116, "num_input_tokens_seen": 4323504, "step": 21 }, { "epoch": 0.19415333701047988, "grad_norm": 0.02393435873091221, "learning_rate": 9.896442669120187e-05, "loss": 0.0109, "num_input_tokens_seen": 4523008, "step": 22 }, { "epoch": 0.2029784886927744, "grad_norm": 0.024421676993370056, "learning_rate": 9.886850877581079e-05, "loss": 0.0106, "num_input_tokens_seen": 4732864, "step": 23 }, { "epoch": 0.21180364037506894, "grad_norm": 0.022869078442454338, "learning_rate": 9.876839398258641e-05, "loss": 0.0099, "num_input_tokens_seen": 4941936, "step": 24 }, { "epoch": 0.2206287920573635, "grad_norm": 0.025933578610420227, "learning_rate": 9.866409090949022e-05, "loss": 0.0109, "num_input_tokens_seen": 5143584, "step": 25 }, { "epoch": 0.22945394373965802, "grad_norm": 0.02043001353740692, "learning_rate": 9.855560851417752e-05, "loss": 0.0084, "num_input_tokens_seen": 5351024, "step": 26 }, { "epoch": 0.23827909542195255, "grad_norm": 0.02140035293996334, "learning_rate": 9.844295611322804e-05, "loss": 0.0081, "num_input_tokens_seen": 5563760, "step": 27 }, { "epoch": 0.2471042471042471, "grad_norm": 0.019948888570070267, "learning_rate": 9.832614338134595e-05, "loss": 0.0078, "num_input_tokens_seen": 5772416, "step": 28 }, { "epoch": 0.25592939878654164, "grad_norm": 0.021153336390852928, "learning_rate": 9.820518035052889e-05, "loss": 0.0081, "num_input_tokens_seen": 5974464, "step": 29 }, { "epoch": 0.26475455046883617, "grad_norm": 0.02002059668302536, "learning_rate": 9.808007740920646e-05, "loss": 0.0087, "num_input_tokens_seen": 6193520, "step": 30 }, { "epoch": 0.2735797021511307, "grad_norm": 0.029256833717226982, "learning_rate": 9.795084530134801e-05, "loss": 0.0079, "num_input_tokens_seen": 6399792, "step": 31 }, { "epoch": 0.2824048538334253, "grad_norm": 0.02395695447921753, "learning_rate": 9.781749512553999e-05, "loss": 0.0086, "num_input_tokens_seen": 6603584, "step": 32 }, { "epoch": 0.2912300055157198, "grad_norm": 0.02185678854584694, "learning_rate": 9.768003833403278e-05, "loss": 0.0079, "num_input_tokens_seen": 6810656, "step": 33 }, { "epoch": 0.30005515719801434, "grad_norm": 0.02072463184595108, "learning_rate": 9.753848673175707e-05, "loss": 0.0069, "num_input_tokens_seen": 7001792, "step": 34 }, { "epoch": 0.3088803088803089, "grad_norm": 0.018024709075689316, "learning_rate": 9.739285247531018e-05, "loss": 0.0064, "num_input_tokens_seen": 7205952, "step": 35 }, { "epoch": 0.3177054605626034, "grad_norm": 0.019729286432266235, "learning_rate": 9.724314807191195e-05, "loss": 0.006, "num_input_tokens_seen": 7406304, "step": 36 }, { "epoch": 0.32653061224489793, "grad_norm": 0.01830880530178547, "learning_rate": 9.708938637833065e-05, "loss": 0.0067, "num_input_tokens_seen": 7629568, "step": 37 }, { "epoch": 0.3353557639271925, "grad_norm": 0.021113887429237366, "learning_rate": 9.693158059977878e-05, "loss": 0.0063, "num_input_tokens_seen": 7845200, "step": 38 }, { "epoch": 0.34418091560948705, "grad_norm": 0.015138108283281326, "learning_rate": 9.676974428877901e-05, "loss": 0.0058, "num_input_tokens_seen": 8061840, "step": 39 }, { "epoch": 0.3530060672917816, "grad_norm": 0.017043087631464005, "learning_rate": 9.660389134400033e-05, "loss": 0.0061, "num_input_tokens_seen": 8279664, "step": 40 }, { "epoch": 0.3618312189740761, "grad_norm": 0.01955767348408699, "learning_rate": 9.643403600906433e-05, "loss": 0.0055, "num_input_tokens_seen": 8475376, "step": 41 }, { "epoch": 0.37065637065637064, "grad_norm": 0.014688636176288128, "learning_rate": 9.626019287132203e-05, "loss": 0.005, "num_input_tokens_seen": 8691760, "step": 42 }, { "epoch": 0.3794815223386652, "grad_norm": 0.01973150670528412, "learning_rate": 9.608237686060099e-05, "loss": 0.006, "num_input_tokens_seen": 8884736, "step": 43 }, { "epoch": 0.38830667402095975, "grad_norm": 0.01489401888102293, "learning_rate": 9.590060324792327e-05, "loss": 0.0048, "num_input_tokens_seen": 9084064, "step": 44 }, { "epoch": 0.3971318257032543, "grad_norm": 0.015995647758245468, "learning_rate": 9.571488764419381e-05, "loss": 0.0047, "num_input_tokens_seen": 9302144, "step": 45 }, { "epoch": 0.4059569773855488, "grad_norm": 0.01859475113451481, "learning_rate": 9.552524599885981e-05, "loss": 0.0053, "num_input_tokens_seen": 9517456, "step": 46 }, { "epoch": 0.41478212906784334, "grad_norm": 0.018746482208371162, "learning_rate": 9.533169459854098e-05, "loss": 0.0044, "num_input_tokens_seen": 9710768, "step": 47 }, { "epoch": 0.42360728075013787, "grad_norm": 0.017155013978481293, "learning_rate": 9.513425006563079e-05, "loss": 0.0043, "num_input_tokens_seen": 9914064, "step": 48 }, { "epoch": 0.43243243243243246, "grad_norm": 0.015938682481646538, "learning_rate": 9.493292935686895e-05, "loss": 0.0041, "num_input_tokens_seen": 10120208, "step": 49 }, { "epoch": 0.441257584114727, "grad_norm": 0.017114240676164627, "learning_rate": 9.472774976188515e-05, "loss": 0.0044, "num_input_tokens_seen": 10346304, "step": 50 }, { "epoch": 0.4500827357970215, "grad_norm": 0.014332287944853306, "learning_rate": 9.451872890171419e-05, "loss": 0.004, "num_input_tokens_seen": 10547984, "step": 51 }, { "epoch": 0.45890788747931605, "grad_norm": 0.017018554732203484, "learning_rate": 9.43058847272827e-05, "loss": 0.0045, "num_input_tokens_seen": 10754288, "step": 52 }, { "epoch": 0.4677330391616106, "grad_norm": 0.013670100830495358, "learning_rate": 9.408923551786743e-05, "loss": 0.0028, "num_input_tokens_seen": 10942704, "step": 53 }, { "epoch": 0.4765581908439051, "grad_norm": 0.016749229282140732, "learning_rate": 9.386879987952549e-05, "loss": 0.0034, "num_input_tokens_seen": 11150864, "step": 54 }, { "epoch": 0.4853833425261997, "grad_norm": 0.01554529182612896, "learning_rate": 9.364459674349641e-05, "loss": 0.0042, "num_input_tokens_seen": 11367728, "step": 55 }, { "epoch": 0.4942084942084942, "grad_norm": 0.015070905908942223, "learning_rate": 9.341664536457626e-05, "loss": 0.0028, "num_input_tokens_seen": 11575536, "step": 56 }, { "epoch": 0.5030336458907887, "grad_norm": 0.016440849751234055, "learning_rate": 9.31849653194641e-05, "loss": 0.0035, "num_input_tokens_seen": 11781328, "step": 57 }, { "epoch": 0.5118587975730833, "grad_norm": 0.014468475244939327, "learning_rate": 9.294957650508065e-05, "loss": 0.0029, "num_input_tokens_seen": 11981232, "step": 58 }, { "epoch": 0.5206839492553779, "grad_norm": 0.014588565565645695, "learning_rate": 9.27104991368596e-05, "loss": 0.0028, "num_input_tokens_seen": 12187296, "step": 59 }, { "epoch": 0.5295091009376723, "grad_norm": 0.0141281234100461, "learning_rate": 9.246775374701139e-05, "loss": 0.0027, "num_input_tokens_seen": 12385632, "step": 60 }, { "epoch": 0.5383342526199669, "grad_norm": 0.013463583774864674, "learning_rate": 9.222136118275995e-05, "loss": 0.0022, "num_input_tokens_seen": 12588928, "step": 61 }, { "epoch": 0.5471594043022614, "grad_norm": 0.014033553190529346, "learning_rate": 9.197134260455233e-05, "loss": 0.0027, "num_input_tokens_seen": 12825616, "step": 62 }, { "epoch": 0.555984555984556, "grad_norm": 0.013906535692512989, "learning_rate": 9.171771948424137e-05, "loss": 0.0025, "num_input_tokens_seen": 13044976, "step": 63 }, { "epoch": 0.5648097076668506, "grad_norm": 0.012418747879564762, "learning_rate": 9.146051360324166e-05, "loss": 0.0025, "num_input_tokens_seen": 13255280, "step": 64 }, { "epoch": 0.573634859349145, "grad_norm": 0.015126565471291542, "learning_rate": 9.119974705065901e-05, "loss": 0.0022, "num_input_tokens_seen": 13463456, "step": 65 }, { "epoch": 0.5824600110314396, "grad_norm": 0.013123284094035625, "learning_rate": 9.093544222139337e-05, "loss": 0.0023, "num_input_tokens_seen": 13667744, "step": 66 }, { "epoch": 0.5912851627137341, "grad_norm": 0.014246366918087006, "learning_rate": 9.066762181421552e-05, "loss": 0.0024, "num_input_tokens_seen": 13874240, "step": 67 }, { "epoch": 0.6001103143960287, "grad_norm": 0.011402356438338757, "learning_rate": 9.039630882981768e-05, "loss": 0.0015, "num_input_tokens_seen": 14081392, "step": 68 }, { "epoch": 0.6089354660783233, "grad_norm": 0.014725148677825928, "learning_rate": 9.012152656883823e-05, "loss": 0.0033, "num_input_tokens_seen": 14300896, "step": 69 }, { "epoch": 0.6177606177606177, "grad_norm": 0.014837515540421009, "learning_rate": 8.984329862986056e-05, "loss": 0.0021, "num_input_tokens_seen": 14523968, "step": 70 }, { "epoch": 0.6265857694429123, "grad_norm": 0.014493652619421482, "learning_rate": 8.956164890738643e-05, "loss": 0.0013, "num_input_tokens_seen": 14728960, "step": 71 }, { "epoch": 0.6354109211252068, "grad_norm": 0.011806878261268139, "learning_rate": 8.927660158978392e-05, "loss": 0.0016, "num_input_tokens_seen": 14912480, "step": 72 }, { "epoch": 0.6442360728075014, "grad_norm": 0.01818985864520073, "learning_rate": 8.898818115721008e-05, "loss": 0.0019, "num_input_tokens_seen": 15114608, "step": 73 }, { "epoch": 0.6530612244897959, "grad_norm": 0.015412255190312862, "learning_rate": 8.86964123795085e-05, "loss": 0.0017, "num_input_tokens_seen": 15326112, "step": 74 }, { "epoch": 0.6618863761720905, "grad_norm": 0.013063928112387657, "learning_rate": 8.84013203140821e-05, "loss": 0.0015, "num_input_tokens_seen": 15545248, "step": 75 }, { "epoch": 0.670711527854385, "grad_norm": 0.016336796805262566, "learning_rate": 8.810293030374126e-05, "loss": 0.0017, "num_input_tokens_seen": 15751872, "step": 76 }, { "epoch": 0.6795366795366795, "grad_norm": 0.010313590988516808, "learning_rate": 8.780126797452713e-05, "loss": 0.001, "num_input_tokens_seen": 15957872, "step": 77 }, { "epoch": 0.6883618312189741, "grad_norm": 0.015468253754079342, "learning_rate": 8.749635923351107e-05, "loss": 0.0018, "num_input_tokens_seen": 16162640, "step": 78 }, { "epoch": 0.6971869829012686, "grad_norm": 0.01543041318655014, "learning_rate": 8.71882302665696e-05, "loss": 0.001, "num_input_tokens_seen": 16352368, "step": 79 }, { "epoch": 0.7060121345835632, "grad_norm": 0.01957864873111248, "learning_rate": 8.687690753613554e-05, "loss": 0.0014, "num_input_tokens_seen": 16563920, "step": 80 }, { "epoch": 0.7148372862658577, "grad_norm": 0.012508533895015717, "learning_rate": 8.656241777892543e-05, "loss": 0.001, "num_input_tokens_seen": 16759024, "step": 81 }, { "epoch": 0.7236624379481522, "grad_norm": 0.012273616157472134, "learning_rate": 8.624478800364332e-05, "loss": 0.0013, "num_input_tokens_seen": 16973728, "step": 82 }, { "epoch": 0.7324875896304468, "grad_norm": 0.01503776852041483, "learning_rate": 8.592404548866123e-05, "loss": 0.0012, "num_input_tokens_seen": 17162752, "step": 83 }, { "epoch": 0.7413127413127413, "grad_norm": 0.014227951876819134, "learning_rate": 8.560021777967649e-05, "loss": 0.0013, "num_input_tokens_seen": 17364064, "step": 84 }, { "epoch": 0.7501378929950359, "grad_norm": 0.01252016518265009, "learning_rate": 8.527333268734606e-05, "loss": 0.0011, "num_input_tokens_seen": 17564576, "step": 85 }, { "epoch": 0.7589630446773304, "grad_norm": 0.011520475149154663, "learning_rate": 8.494341828489812e-05, "loss": 0.0037, "num_input_tokens_seen": 17778752, "step": 86 }, { "epoch": 0.7677881963596249, "grad_norm": 0.010531144216656685, "learning_rate": 8.461050290572114e-05, "loss": 0.0007, "num_input_tokens_seen": 17982448, "step": 87 }, { "epoch": 0.7766133480419195, "grad_norm": 0.010875461623072624, "learning_rate": 8.427461514093056e-05, "loss": 0.0008, "num_input_tokens_seen": 18180608, "step": 88 }, { "epoch": 0.785438499724214, "grad_norm": 0.007611530367285013, "learning_rate": 8.393578383691329e-05, "loss": 0.0006, "num_input_tokens_seen": 18384496, "step": 89 }, { "epoch": 0.7942636514065086, "grad_norm": 0.010159923695027828, "learning_rate": 8.359403809285053e-05, "loss": 0.001, "num_input_tokens_seen": 18587744, "step": 90 }, { "epoch": 0.803088803088803, "grad_norm": 0.011715343222022057, "learning_rate": 8.324940725821852e-05, "loss": 0.001, "num_input_tokens_seen": 18791056, "step": 91 }, { "epoch": 0.8119139547710976, "grad_norm": 0.012972251512110233, "learning_rate": 8.290192093026805e-05, "loss": 0.0008, "num_input_tokens_seen": 18985008, "step": 92 }, { "epoch": 0.8207391064533922, "grad_norm": 0.0135871022939682, "learning_rate": 8.255160895148263e-05, "loss": 0.0014, "num_input_tokens_seen": 19193888, "step": 93 }, { "epoch": 0.8295642581356867, "grad_norm": 0.011914449743926525, "learning_rate": 8.219850140701557e-05, "loss": 0.001, "num_input_tokens_seen": 19399552, "step": 94 }, { "epoch": 0.8383894098179813, "grad_norm": 0.009591113775968552, "learning_rate": 8.184262862210624e-05, "loss": 0.0007, "num_input_tokens_seen": 19605120, "step": 95 }, { "epoch": 0.8472145615002757, "grad_norm": 0.009942690841853619, "learning_rate": 8.148402115947571e-05, "loss": 0.0008, "num_input_tokens_seen": 19802480, "step": 96 }, { "epoch": 0.8560397131825703, "grad_norm": 0.012667879462242126, "learning_rate": 8.112270981670196e-05, "loss": 0.0011, "num_input_tokens_seen": 20009520, "step": 97 }, { "epoch": 0.8648648648648649, "grad_norm": 0.010983509942889214, "learning_rate": 8.075872562357501e-05, "loss": 0.0009, "num_input_tokens_seen": 20235888, "step": 98 }, { "epoch": 0.8736900165471594, "grad_norm": 0.011479397304356098, "learning_rate": 8.039209983943201e-05, "loss": 0.0006, "num_input_tokens_seen": 20433600, "step": 99 }, { "epoch": 0.882515168229454, "grad_norm": 0.012184002436697483, "learning_rate": 8.002286395047267e-05, "loss": 0.0009, "num_input_tokens_seen": 20631664, "step": 100 }, { "epoch": 0.8913403199117484, "grad_norm": 0.009395604953169823, "learning_rate": 7.965104966705518e-05, "loss": 0.0006, "num_input_tokens_seen": 20833056, "step": 101 }, { "epoch": 0.900165471594043, "grad_norm": 0.013585143722593784, "learning_rate": 7.927668892097289e-05, "loss": 0.0008, "num_input_tokens_seen": 21051104, "step": 102 }, { "epoch": 0.9089906232763376, "grad_norm": 0.008882119320333004, "learning_rate": 7.889981386271201e-05, "loss": 0.0005, "num_input_tokens_seen": 21246080, "step": 103 }, { "epoch": 0.9178157749586321, "grad_norm": 0.010433576069772243, "learning_rate": 7.852045685869045e-05, "loss": 0.0006, "num_input_tokens_seen": 21439696, "step": 104 }, { "epoch": 0.9266409266409267, "grad_norm": 0.01474383007735014, "learning_rate": 7.813865048847819e-05, "loss": 0.0008, "num_input_tokens_seen": 21648432, "step": 105 }, { "epoch": 0.9354660783232212, "grad_norm": 0.011113091371953487, "learning_rate": 7.775442754199928e-05, "loss": 0.0007, "num_input_tokens_seen": 21864368, "step": 106 }, { "epoch": 0.9442912300055157, "grad_norm": 0.009181715548038483, "learning_rate": 7.736782101671587e-05, "loss": 0.0006, "num_input_tokens_seen": 22061968, "step": 107 }, { "epoch": 0.9531163816878102, "grad_norm": 0.0140100521966815, "learning_rate": 7.697886411479423e-05, "loss": 0.0012, "num_input_tokens_seen": 22278128, "step": 108 }, { "epoch": 0.9619415333701048, "grad_norm": 0.007349591236561537, "learning_rate": 7.658759024025349e-05, "loss": 0.0004, "num_input_tokens_seen": 22469056, "step": 109 }, { "epoch": 0.9707666850523994, "grad_norm": 0.01252900529652834, "learning_rate": 7.619403299609668e-05, "loss": 0.0008, "num_input_tokens_seen": 22662128, "step": 110 }, { "epoch": 0.9795918367346939, "grad_norm": 0.012083148583769798, "learning_rate": 7.579822618142505e-05, "loss": 0.0007, "num_input_tokens_seen": 22883216, "step": 111 }, { "epoch": 0.9884169884169884, "grad_norm": 0.010517132468521595, "learning_rate": 7.540020378853523e-05, "loss": 0.0005, "num_input_tokens_seen": 23085888, "step": 112 }, { "epoch": 0.9972421400992829, "grad_norm": 0.01143716461956501, "learning_rate": 7.500000000000001e-05, "loss": 0.0007, "num_input_tokens_seen": 23307520, "step": 113 }, { "epoch": 1.0088251516822946, "grad_norm": 0.0287212785333395, "learning_rate": 7.459764918573264e-05, "loss": 0.0014, "num_input_tokens_seen": 23564192, "step": 114 }, { "epoch": 1.0176503033645892, "grad_norm": 0.010353313758969307, "learning_rate": 7.419318590003523e-05, "loss": 0.0007, "num_input_tokens_seen": 23768816, "step": 115 }, { "epoch": 1.0264754550468835, "grad_norm": 0.013796573504805565, "learning_rate": 7.378664487863103e-05, "loss": 0.0006, "num_input_tokens_seen": 23974096, "step": 116 }, { "epoch": 1.0353006067291781, "grad_norm": 0.006352484691888094, "learning_rate": 7.33780610356814e-05, "loss": 0.0003, "num_input_tokens_seen": 24172256, "step": 117 }, { "epoch": 1.0441257584114727, "grad_norm": 0.007957457564771175, "learning_rate": 7.296746946078736e-05, "loss": 0.0004, "num_input_tokens_seen": 24362208, "step": 118 }, { "epoch": 1.0529509100937673, "grad_norm": 0.0068214968778193, "learning_rate": 7.255490541597594e-05, "loss": 0.0003, "num_input_tokens_seen": 24562224, "step": 119 }, { "epoch": 1.0617760617760619, "grad_norm": 0.00877879373729229, "learning_rate": 7.214040433267198e-05, "loss": 0.0005, "num_input_tokens_seen": 24776528, "step": 120 }, { "epoch": 1.0706012134583562, "grad_norm": 0.007200079504400492, "learning_rate": 7.172400180865513e-05, "loss": 0.0003, "num_input_tokens_seen": 24985008, "step": 121 }, { "epoch": 1.0794263651406508, "grad_norm": 0.010829208418726921, "learning_rate": 7.130573360500276e-05, "loss": 0.0005, "num_input_tokens_seen": 25200720, "step": 122 }, { "epoch": 1.0882515168229454, "grad_norm": 0.010170291177928448, "learning_rate": 7.088563564301873e-05, "loss": 0.0004, "num_input_tokens_seen": 25413568, "step": 123 }, { "epoch": 1.09707666850524, "grad_norm": 0.007032219786196947, "learning_rate": 7.046374400114842e-05, "loss": 0.0003, "num_input_tokens_seen": 25608576, "step": 124 }, { "epoch": 1.1059018201875346, "grad_norm": 0.00843306165188551, "learning_rate": 7.004009491188022e-05, "loss": 0.0003, "num_input_tokens_seen": 25818400, "step": 125 }, { "epoch": 1.114726971869829, "grad_norm": 0.00947788916528225, "learning_rate": 6.961472475863405e-05, "loss": 0.0005, "num_input_tokens_seen": 26037424, "step": 126 }, { "epoch": 1.1235521235521235, "grad_norm": 0.009593469090759754, "learning_rate": 6.918767007263646e-05, "loss": 0.0005, "num_input_tokens_seen": 26250480, "step": 127 }, { "epoch": 1.1323772752344181, "grad_norm": 0.012611499056220055, "learning_rate": 6.875896752978344e-05, "loss": 0.0005, "num_input_tokens_seen": 26458592, "step": 128 }, { "epoch": 1.1412024269167127, "grad_norm": 0.005860932637006044, "learning_rate": 6.832865394749065e-05, "loss": 0.0004, "num_input_tokens_seen": 26680256, "step": 129 }, { "epoch": 1.150027578599007, "grad_norm": 0.008905632421374321, "learning_rate": 6.789676628153143e-05, "loss": 0.0004, "num_input_tokens_seen": 26887424, "step": 130 }, { "epoch": 1.1588527302813016, "grad_norm": 0.00839240662753582, "learning_rate": 6.746334162286307e-05, "loss": 0.0003, "num_input_tokens_seen": 27112736, "step": 131 }, { "epoch": 1.1676778819635962, "grad_norm": 0.010829194448888302, "learning_rate": 6.702841719444141e-05, "loss": 0.0004, "num_input_tokens_seen": 27320064, "step": 132 }, { "epoch": 1.1765030336458908, "grad_norm": 0.005576102528721094, "learning_rate": 6.659203034802397e-05, "loss": 0.0003, "num_input_tokens_seen": 27520544, "step": 133 }, { "epoch": 1.1853281853281854, "grad_norm": 0.008609413169324398, "learning_rate": 6.615421856096231e-05, "loss": 0.0009, "num_input_tokens_seen": 27737920, "step": 134 }, { "epoch": 1.19415333701048, "grad_norm": 0.013195198960602283, "learning_rate": 6.571501943298334e-05, "loss": 0.0014, "num_input_tokens_seen": 27947552, "step": 135 }, { "epoch": 1.2029784886927744, "grad_norm": 0.008647961542010307, "learning_rate": 6.527447068296026e-05, "loss": 0.0003, "num_input_tokens_seen": 28143808, "step": 136 }, { "epoch": 1.211803640375069, "grad_norm": 0.006975845899432898, "learning_rate": 6.483261014567311e-05, "loss": 0.0002, "num_input_tokens_seen": 28349312, "step": 137 }, { "epoch": 1.2206287920573635, "grad_norm": 0.013750969432294369, "learning_rate": 6.438947576855968e-05, "loss": 0.0002, "num_input_tokens_seen": 28560096, "step": 138 }, { "epoch": 1.229453943739658, "grad_norm": 0.009799162857234478, "learning_rate": 6.394510560845637e-05, "loss": 0.0005, "num_input_tokens_seen": 28764544, "step": 139 }, { "epoch": 1.2382790954219525, "grad_norm": 0.00819414108991623, "learning_rate": 6.349953782832991e-05, "loss": 0.0004, "num_input_tokens_seen": 28949360, "step": 140 }, { "epoch": 1.247104247104247, "grad_norm": 0.008884673938155174, "learning_rate": 6.305281069399989e-05, "loss": 0.0002, "num_input_tokens_seen": 29148112, "step": 141 }, { "epoch": 1.2559293987865416, "grad_norm": 0.009248818270862103, "learning_rate": 6.26049625708524e-05, "loss": 0.0004, "num_input_tokens_seen": 29370624, "step": 142 }, { "epoch": 1.2647545504688362, "grad_norm": 0.008902438916265965, "learning_rate": 6.215603192054522e-05, "loss": 0.0003, "num_input_tokens_seen": 29572464, "step": 143 }, { "epoch": 1.2735797021511308, "grad_norm": 0.012439709156751633, "learning_rate": 6.17060572977047e-05, "loss": 0.0006, "num_input_tokens_seen": 29771152, "step": 144 }, { "epoch": 1.2824048538334254, "grad_norm": 0.013059360906481743, "learning_rate": 6.125507734661458e-05, "loss": 0.0003, "num_input_tokens_seen": 29954960, "step": 145 }, { "epoch": 1.2912300055157198, "grad_norm": 0.011295526288449764, "learning_rate": 6.080313079789723e-05, "loss": 0.0004, "num_input_tokens_seen": 30165568, "step": 146 }, { "epoch": 1.3000551571980143, "grad_norm": 0.01000818982720375, "learning_rate": 6.035025646518746e-05, "loss": 0.0005, "num_input_tokens_seen": 30372160, "step": 147 }, { "epoch": 1.308880308880309, "grad_norm": 0.010914387181401253, "learning_rate": 5.989649324179911e-05, "loss": 0.0003, "num_input_tokens_seen": 30572752, "step": 148 }, { "epoch": 1.3177054605626033, "grad_norm": 0.009289560839533806, "learning_rate": 5.944188009738483e-05, "loss": 0.0004, "num_input_tokens_seen": 30780496, "step": 149 }, { "epoch": 1.3265306122448979, "grad_norm": 0.015559184364974499, "learning_rate": 5.8986456074589404e-05, "loss": 0.0004, "num_input_tokens_seen": 30975120, "step": 150 }, { "epoch": 1.3353557639271925, "grad_norm": 0.00643413420766592, "learning_rate": 5.853026028569667e-05, "loss": 0.0002, "num_input_tokens_seen": 31174000, "step": 151 }, { "epoch": 1.344180915609487, "grad_norm": 0.0077626509591937065, "learning_rate": 5.807333190927053e-05, "loss": 0.0003, "num_input_tokens_seen": 31387088, "step": 152 }, { "epoch": 1.3530060672917816, "grad_norm": 0.0083751380443573, "learning_rate": 5.761571018679025e-05, "loss": 0.0003, "num_input_tokens_seen": 31576400, "step": 153 }, { "epoch": 1.3618312189740762, "grad_norm": 0.007961435243487358, "learning_rate": 5.715743441928041e-05, "loss": 0.0003, "num_input_tokens_seen": 31784320, "step": 154 }, { "epoch": 1.3706563706563706, "grad_norm": 0.006737589370459318, "learning_rate": 5.669854396393559e-05, "loss": 0.0004, "num_input_tokens_seen": 31987520, "step": 155 }, { "epoch": 1.3794815223386652, "grad_norm": 0.014642222784459591, "learning_rate": 5.6239078230740436e-05, "loss": 0.0004, "num_input_tokens_seen": 32187456, "step": 156 }, { "epoch": 1.3883066740209598, "grad_norm": 0.006064648274332285, "learning_rate": 5.5779076679085054e-05, "loss": 0.0002, "num_input_tokens_seen": 32384528, "step": 157 }, { "epoch": 1.3971318257032543, "grad_norm": 0.009461612440645695, "learning_rate": 5.531857881437612e-05, "loss": 0.0004, "num_input_tokens_seen": 32593040, "step": 158 }, { "epoch": 1.4059569773855487, "grad_norm": 0.007511747535318136, "learning_rate": 5.48576241846443e-05, "loss": 0.0003, "num_input_tokens_seen": 32797952, "step": 159 }, { "epoch": 1.4147821290678433, "grad_norm": 0.02702983096241951, "learning_rate": 5.4396252377147615e-05, "loss": 0.0003, "num_input_tokens_seen": 33008800, "step": 160 }, { "epoch": 1.4236072807501379, "grad_norm": 0.008439299650490284, "learning_rate": 5.3934503014971793e-05, "loss": 0.0003, "num_input_tokens_seen": 33208352, "step": 161 }, { "epoch": 1.4324324324324325, "grad_norm": 0.0037907836958765984, "learning_rate": 5.347241575362729e-05, "loss": 0.0002, "num_input_tokens_seen": 33410208, "step": 162 }, { "epoch": 1.441257584114727, "grad_norm": 0.008237862028181553, "learning_rate": 5.30100302776438e-05, "loss": 0.0003, "num_input_tokens_seen": 33631888, "step": 163 }, { "epoch": 1.4500827357970216, "grad_norm": 0.009860441088676453, "learning_rate": 5.254738629716186e-05, "loss": 0.0004, "num_input_tokens_seen": 33825152, "step": 164 }, { "epoch": 1.458907887479316, "grad_norm": 0.007564296945929527, "learning_rate": 5.208452354452274e-05, "loss": 0.0003, "num_input_tokens_seen": 34020352, "step": 165 }, { "epoch": 1.4677330391616106, "grad_norm": 0.019607344642281532, "learning_rate": 5.162148177085604e-05, "loss": 0.0004, "num_input_tokens_seen": 34226288, "step": 166 }, { "epoch": 1.4765581908439052, "grad_norm": 0.007924061268568039, "learning_rate": 5.115830074266591e-05, "loss": 0.0016, "num_input_tokens_seen": 34426672, "step": 167 }, { "epoch": 1.4853833425261997, "grad_norm": 0.006358864717185497, "learning_rate": 5.0695020238415756e-05, "loss": 0.0002, "num_input_tokens_seen": 34636944, "step": 168 }, { "epoch": 1.494208494208494, "grad_norm": 0.010681587271392345, "learning_rate": 5.0231680045112176e-05, "loss": 0.0003, "num_input_tokens_seen": 34839456, "step": 169 }, { "epoch": 1.5030336458907887, "grad_norm": 0.01033815648406744, "learning_rate": 4.976831995488784e-05, "loss": 0.0002, "num_input_tokens_seen": 35031600, "step": 170 }, { "epoch": 1.5118587975730833, "grad_norm": 0.016812577843666077, "learning_rate": 4.9304979761584256e-05, "loss": 0.0004, "num_input_tokens_seen": 35227728, "step": 171 }, { "epoch": 1.5206839492553779, "grad_norm": 0.008957776241004467, "learning_rate": 4.884169925733409e-05, "loss": 0.0002, "num_input_tokens_seen": 35436528, "step": 172 }, { "epoch": 1.5295091009376725, "grad_norm": 0.006675931625068188, "learning_rate": 4.837851822914397e-05, "loss": 0.0002, "num_input_tokens_seen": 35628624, "step": 173 }, { "epoch": 1.538334252619967, "grad_norm": 0.006146900821477175, "learning_rate": 4.791547645547726e-05, "loss": 0.0002, "num_input_tokens_seen": 35827376, "step": 174 }, { "epoch": 1.5471594043022614, "grad_norm": 0.012180755846202374, "learning_rate": 4.745261370283817e-05, "loss": 0.0003, "num_input_tokens_seen": 36056560, "step": 175 }, { "epoch": 1.555984555984556, "grad_norm": 0.00920344889163971, "learning_rate": 4.698996972235622e-05, "loss": 0.0002, "num_input_tokens_seen": 36267568, "step": 176 }, { "epoch": 1.5648097076668506, "grad_norm": 0.010103096254169941, "learning_rate": 4.652758424637271e-05, "loss": 0.0027, "num_input_tokens_seen": 36473008, "step": 177 }, { "epoch": 1.573634859349145, "grad_norm": 0.012086655013263226, "learning_rate": 4.606549698502823e-05, "loss": 0.0004, "num_input_tokens_seen": 36670944, "step": 178 }, { "epoch": 1.5824600110314395, "grad_norm": 0.0054108137264847755, "learning_rate": 4.56037476228524e-05, "loss": 0.0001, "num_input_tokens_seen": 36882256, "step": 179 }, { "epoch": 1.591285162713734, "grad_norm": 0.014871139079332352, "learning_rate": 4.5142375815355706e-05, "loss": 0.0004, "num_input_tokens_seen": 37091392, "step": 180 }, { "epoch": 1.6001103143960287, "grad_norm": 0.005915229208767414, "learning_rate": 4.468142118562389e-05, "loss": 0.0002, "num_input_tokens_seen": 37309680, "step": 181 }, { "epoch": 1.6089354660783233, "grad_norm": 0.006937643978744745, "learning_rate": 4.4220923320914964e-05, "loss": 0.0003, "num_input_tokens_seen": 37517952, "step": 182 }, { "epoch": 1.6177606177606179, "grad_norm": 0.00866376981139183, "learning_rate": 4.376092176925958e-05, "loss": 0.0003, "num_input_tokens_seen": 37732160, "step": 183 }, { "epoch": 1.6265857694429124, "grad_norm": 0.007841500453650951, "learning_rate": 4.330145603606441e-05, "loss": 0.0004, "num_input_tokens_seen": 37940368, "step": 184 }, { "epoch": 1.6354109211252068, "grad_norm": 0.008568421937525272, "learning_rate": 4.2842565580719595e-05, "loss": 0.0004, "num_input_tokens_seen": 38135024, "step": 185 }, { "epoch": 1.6442360728075014, "grad_norm": 0.011796732433140278, "learning_rate": 4.238428981320975e-05, "loss": 0.0002, "num_input_tokens_seen": 38336176, "step": 186 }, { "epoch": 1.6530612244897958, "grad_norm": 0.00755694042891264, "learning_rate": 4.192666809072948e-05, "loss": 0.0003, "num_input_tokens_seen": 38548880, "step": 187 }, { "epoch": 1.6618863761720903, "grad_norm": 0.01243317686021328, "learning_rate": 4.146973971430333e-05, "loss": 0.0003, "num_input_tokens_seen": 38755920, "step": 188 }, { "epoch": 1.670711527854385, "grad_norm": 0.006207725498825312, "learning_rate": 4.101354392541061e-05, "loss": 0.0002, "num_input_tokens_seen": 38973328, "step": 189 }, { "epoch": 1.6795366795366795, "grad_norm": 0.008532355539500713, "learning_rate": 4.0558119902615174e-05, "loss": 0.0003, "num_input_tokens_seen": 39193232, "step": 190 }, { "epoch": 1.688361831218974, "grad_norm": 0.008602111600339413, "learning_rate": 4.010350675820091e-05, "loss": 0.0003, "num_input_tokens_seen": 39406608, "step": 191 }, { "epoch": 1.6971869829012687, "grad_norm": 0.008903734385967255, "learning_rate": 3.964974353481254e-05, "loss": 0.0004, "num_input_tokens_seen": 39620160, "step": 192 }, { "epoch": 1.7060121345835633, "grad_norm": 0.005871508736163378, "learning_rate": 3.919686920210277e-05, "loss": 0.0001, "num_input_tokens_seen": 39815952, "step": 193 }, { "epoch": 1.7148372862658579, "grad_norm": 0.008220325224101543, "learning_rate": 3.874492265338544e-05, "loss": 0.0003, "num_input_tokens_seen": 40015408, "step": 194 }, { "epoch": 1.7236624379481522, "grad_norm": 0.00940727163106203, "learning_rate": 3.829394270229531e-05, "loss": 0.0002, "num_input_tokens_seen": 40215328, "step": 195 }, { "epoch": 1.7324875896304468, "grad_norm": 0.005745697300881147, "learning_rate": 3.784396807945477e-05, "loss": 0.0002, "num_input_tokens_seen": 40414384, "step": 196 }, { "epoch": 1.7413127413127412, "grad_norm": 0.009524352848529816, "learning_rate": 3.7395037429147615e-05, "loss": 0.0002, "num_input_tokens_seen": 40620656, "step": 197 }, { "epoch": 1.7501378929950357, "grad_norm": 0.00809427909553051, "learning_rate": 3.694718930600012e-05, "loss": 0.0003, "num_input_tokens_seen": 40847008, "step": 198 }, { "epoch": 1.7589630446773303, "grad_norm": 0.0051635075360536575, "learning_rate": 3.65004621716701e-05, "loss": 0.0001, "num_input_tokens_seen": 41036368, "step": 199 }, { "epoch": 1.767788196359625, "grad_norm": 0.006504002492874861, "learning_rate": 3.6054894391543646e-05, "loss": 0.0003, "num_input_tokens_seen": 41252976, "step": 200 }, { "epoch": 1.7766133480419195, "grad_norm": 0.009855791926383972, "learning_rate": 3.561052423144032e-05, "loss": 0.0002, "num_input_tokens_seen": 41465104, "step": 201 }, { "epoch": 1.785438499724214, "grad_norm": 0.004304118454456329, "learning_rate": 3.5167389854326905e-05, "loss": 0.0002, "num_input_tokens_seen": 41670800, "step": 202 }, { "epoch": 1.7942636514065087, "grad_norm": 0.014682441018521786, "learning_rate": 3.4725529317039754e-05, "loss": 0.0013, "num_input_tokens_seen": 41883536, "step": 203 }, { "epoch": 1.803088803088803, "grad_norm": 0.0061918287537992, "learning_rate": 3.428498056701665e-05, "loss": 0.0001, "num_input_tokens_seen": 42083360, "step": 204 }, { "epoch": 1.8119139547710976, "grad_norm": 0.009490927681326866, "learning_rate": 3.38457814390377e-05, "loss": 0.0002, "num_input_tokens_seen": 42283120, "step": 205 }, { "epoch": 1.8207391064533922, "grad_norm": 0.008434086106717587, "learning_rate": 3.340796965197604e-05, "loss": 0.0003, "num_input_tokens_seen": 42499088, "step": 206 }, { "epoch": 1.8295642581356866, "grad_norm": 0.004052174277603626, "learning_rate": 3.297158280555862e-05, "loss": 0.0001, "num_input_tokens_seen": 42692976, "step": 207 }, { "epoch": 1.8383894098179812, "grad_norm": 0.007411065977066755, "learning_rate": 3.2536658377136935e-05, "loss": 0.0003, "num_input_tokens_seen": 42907216, "step": 208 }, { "epoch": 1.8472145615002757, "grad_norm": 0.006996455602347851, "learning_rate": 3.210323371846857e-05, "loss": 0.0001, "num_input_tokens_seen": 43112448, "step": 209 }, { "epoch": 1.8560397131825703, "grad_norm": 0.006998082622885704, "learning_rate": 3.167134605250938e-05, "loss": 0.0003, "num_input_tokens_seen": 43340096, "step": 210 }, { "epoch": 1.864864864864865, "grad_norm": 0.006418649572879076, "learning_rate": 3.124103247021657e-05, "loss": 0.0001, "num_input_tokens_seen": 43539664, "step": 211 }, { "epoch": 1.8736900165471595, "grad_norm": 0.009151714853942394, "learning_rate": 3.081232992736355e-05, "loss": 0.0003, "num_input_tokens_seen": 43727664, "step": 212 }, { "epoch": 1.882515168229454, "grad_norm": 0.004692760296165943, "learning_rate": 3.0385275241365962e-05, "loss": 0.0002, "num_input_tokens_seen": 43953584, "step": 213 }, { "epoch": 1.8913403199117484, "grad_norm": 0.006455820985138416, "learning_rate": 2.9959905088119776e-05, "loss": 0.0002, "num_input_tokens_seen": 44157504, "step": 214 }, { "epoch": 1.900165471594043, "grad_norm": 0.006325691007077694, "learning_rate": 2.9536255998851613e-05, "loss": 0.0001, "num_input_tokens_seen": 44350448, "step": 215 }, { "epoch": 1.9089906232763376, "grad_norm": 0.006784004159271717, "learning_rate": 2.9114364356981272e-05, "loss": 0.0002, "num_input_tokens_seen": 44561472, "step": 216 }, { "epoch": 1.917815774958632, "grad_norm": 0.008874817751348019, "learning_rate": 2.8694266394997238e-05, "loss": 0.0002, "num_input_tokens_seen": 44769936, "step": 217 }, { "epoch": 1.9266409266409266, "grad_norm": 0.006964050233364105, "learning_rate": 2.8275998191344888e-05, "loss": 0.0002, "num_input_tokens_seen": 44979344, "step": 218 }, { "epoch": 1.9354660783232212, "grad_norm": 0.014264012686908245, "learning_rate": 2.7859595667328026e-05, "loss": 0.0002, "num_input_tokens_seen": 45196944, "step": 219 }, { "epoch": 1.9442912300055157, "grad_norm": 0.005279663018882275, "learning_rate": 2.7445094584024067e-05, "loss": 0.0001, "num_input_tokens_seen": 45406832, "step": 220 }, { "epoch": 1.9531163816878103, "grad_norm": 0.0171637125313282, "learning_rate": 2.7032530539212658e-05, "loss": 0.0003, "num_input_tokens_seen": 45603120, "step": 221 }, { "epoch": 1.961941533370105, "grad_norm": 0.007687513716518879, "learning_rate": 2.6621938964318595e-05, "loss": 0.0002, "num_input_tokens_seen": 45805184, "step": 222 }, { "epoch": 1.9707666850523995, "grad_norm": 0.0034611017908900976, "learning_rate": 2.621335512136899e-05, "loss": 0.0001, "num_input_tokens_seen": 46001184, "step": 223 }, { "epoch": 1.9795918367346939, "grad_norm": 0.004358428996056318, "learning_rate": 2.5806814099964772e-05, "loss": 0.0002, "num_input_tokens_seen": 46206288, "step": 224 }, { "epoch": 1.9884169884169884, "grad_norm": 0.008765267208218575, "learning_rate": 2.540235081426736e-05, "loss": 0.0002, "num_input_tokens_seen": 46427344, "step": 225 }, { "epoch": 1.9972421400992828, "grad_norm": 0.006889387033879757, "learning_rate": 2.500000000000001e-05, "loss": 0.0003, "num_input_tokens_seen": 46627344, "step": 226 }, { "epoch": 2.0088251516822946, "grad_norm": 0.043494511395692825, "learning_rate": 2.459979621146477e-05, "loss": 0.0011, "num_input_tokens_seen": 46901504, "step": 227 }, { "epoch": 2.017650303364589, "grad_norm": 0.007718184031546116, "learning_rate": 2.4201773818574956e-05, "loss": 0.0001, "num_input_tokens_seen": 47104400, "step": 228 }, { "epoch": 2.0264754550468838, "grad_norm": 0.003912526648491621, "learning_rate": 2.3805967003903333e-05, "loss": 0.0001, "num_input_tokens_seen": 47314176, "step": 229 }, { "epoch": 2.0353006067291783, "grad_norm": 0.010783454403281212, "learning_rate": 2.3412409759746528e-05, "loss": 0.0003, "num_input_tokens_seen": 47525264, "step": 230 }, { "epoch": 2.0441257584114725, "grad_norm": 0.0026623259764164686, "learning_rate": 2.302113588520578e-05, "loss": 0.0001, "num_input_tokens_seen": 47724528, "step": 231 }, { "epoch": 2.052950910093767, "grad_norm": 0.00557671207934618, "learning_rate": 2.2632178983284153e-05, "loss": 0.0002, "num_input_tokens_seen": 47932624, "step": 232 }, { "epoch": 2.0617760617760617, "grad_norm": 0.003710981458425522, "learning_rate": 2.2245572458000712e-05, "loss": 0.0001, "num_input_tokens_seen": 48148608, "step": 233 }, { "epoch": 2.0706012134583562, "grad_norm": 0.009742701426148415, "learning_rate": 2.1861349511521815e-05, "loss": 0.0025, "num_input_tokens_seen": 48373632, "step": 234 }, { "epoch": 2.079426365140651, "grad_norm": 0.009755464270710945, "learning_rate": 2.147954314130955e-05, "loss": 0.0013, "num_input_tokens_seen": 48586512, "step": 235 }, { "epoch": 2.0882515168229454, "grad_norm": 0.002706202445551753, "learning_rate": 2.1100186137288e-05, "loss": 0.0001, "num_input_tokens_seen": 48793568, "step": 236 }, { "epoch": 2.09707666850524, "grad_norm": 0.005180325359106064, "learning_rate": 2.072331107902713e-05, "loss": 0.0001, "num_input_tokens_seen": 49006224, "step": 237 }, { "epoch": 2.1059018201875346, "grad_norm": 0.005968959536403418, "learning_rate": 2.0348950332944834e-05, "loss": 0.0002, "num_input_tokens_seen": 49217632, "step": 238 }, { "epoch": 2.114726971869829, "grad_norm": 0.0063306307420134544, "learning_rate": 1.9977136049527345e-05, "loss": 0.0001, "num_input_tokens_seen": 49426624, "step": 239 }, { "epoch": 2.1235521235521237, "grad_norm": 0.005157762672752142, "learning_rate": 1.960790016056801e-05, "loss": 0.0001, "num_input_tokens_seen": 49623376, "step": 240 }, { "epoch": 2.132377275234418, "grad_norm": 0.005218483041971922, "learning_rate": 1.9241274376425e-05, "loss": 0.0002, "num_input_tokens_seen": 49828144, "step": 241 }, { "epoch": 2.1412024269167125, "grad_norm": 0.00744604179635644, "learning_rate": 1.8877290183298057e-05, "loss": 0.0002, "num_input_tokens_seen": 50018448, "step": 242 }, { "epoch": 2.150027578599007, "grad_norm": 0.005399591755121946, "learning_rate": 1.8515978840524302e-05, "loss": 0.0001, "num_input_tokens_seen": 50218176, "step": 243 }, { "epoch": 2.1588527302813016, "grad_norm": 0.005761398002505302, "learning_rate": 1.815737137789377e-05, "loss": 0.0002, "num_input_tokens_seen": 50424896, "step": 244 }, { "epoch": 2.1676778819635962, "grad_norm": 0.006964447908103466, "learning_rate": 1.7801498592984446e-05, "loss": 0.0006, "num_input_tokens_seen": 50635088, "step": 245 }, { "epoch": 2.176503033645891, "grad_norm": 0.002962745726108551, "learning_rate": 1.7448391048517376e-05, "loss": 0.0001, "num_input_tokens_seen": 50849552, "step": 246 }, { "epoch": 2.1853281853281854, "grad_norm": 0.005332667380571365, "learning_rate": 1.7098079069731958e-05, "loss": 0.0002, "num_input_tokens_seen": 51037776, "step": 247 }, { "epoch": 2.19415333701048, "grad_norm": 0.006928949151188135, "learning_rate": 1.6750592741781497e-05, "loss": 0.0002, "num_input_tokens_seen": 51242672, "step": 248 }, { "epoch": 2.2029784886927746, "grad_norm": 0.004213888198137283, "learning_rate": 1.640596190714947e-05, "loss": 0.0001, "num_input_tokens_seen": 51437008, "step": 249 }, { "epoch": 2.211803640375069, "grad_norm": 0.010446918196976185, "learning_rate": 1.6064216163086716e-05, "loss": 0.0001, "num_input_tokens_seen": 51641264, "step": 250 }, { "epoch": 2.2206287920573633, "grad_norm": 0.004029524512588978, "learning_rate": 1.5725384859069455e-05, "loss": 0.0001, "num_input_tokens_seen": 51842592, "step": 251 }, { "epoch": 2.229453943739658, "grad_norm": 0.006790219806134701, "learning_rate": 1.538949709427886e-05, "loss": 0.0012, "num_input_tokens_seen": 52047456, "step": 252 }, { "epoch": 2.2382790954219525, "grad_norm": 0.003987099044024944, "learning_rate": 1.5056581715101886e-05, "loss": 0.0001, "num_input_tokens_seen": 52242208, "step": 253 }, { "epoch": 2.247104247104247, "grad_norm": 0.008930574171245098, "learning_rate": 1.472666731265394e-05, "loss": 0.0003, "num_input_tokens_seen": 52436800, "step": 254 }, { "epoch": 2.2559293987865416, "grad_norm": 0.004108684603124857, "learning_rate": 1.4399782220323515e-05, "loss": 0.0001, "num_input_tokens_seen": 52624752, "step": 255 }, { "epoch": 2.2647545504688362, "grad_norm": 0.00732703972607851, "learning_rate": 1.4075954511338785e-05, "loss": 0.0001, "num_input_tokens_seen": 52836384, "step": 256 }, { "epoch": 2.273579702151131, "grad_norm": 0.006608397234231234, "learning_rate": 1.3755211996356687e-05, "loss": 0.0001, "num_input_tokens_seen": 53059296, "step": 257 }, { "epoch": 2.2824048538334254, "grad_norm": 0.002376733347773552, "learning_rate": 1.3437582221074573e-05, "loss": 0.0001, "num_input_tokens_seen": 53267440, "step": 258 }, { "epoch": 2.29123000551572, "grad_norm": 0.004921163432300091, "learning_rate": 1.3123092463864456e-05, "loss": 0.0001, "num_input_tokens_seen": 53501008, "step": 259 }, { "epoch": 2.300055157198014, "grad_norm": 0.0034377635456621647, "learning_rate": 1.2811769733430406e-05, "loss": 0.0001, "num_input_tokens_seen": 53700432, "step": 260 }, { "epoch": 2.3088803088803087, "grad_norm": 0.006821690127253532, "learning_rate": 1.250364076648894e-05, "loss": 0.0002, "num_input_tokens_seen": 53919616, "step": 261 }, { "epoch": 2.3177054605626033, "grad_norm": 0.004776927176862955, "learning_rate": 1.2198732025472876e-05, "loss": 0.0001, "num_input_tokens_seen": 54130528, "step": 262 }, { "epoch": 2.326530612244898, "grad_norm": 0.004824692849069834, "learning_rate": 1.1897069696258755e-05, "loss": 0.0002, "num_input_tokens_seen": 54350560, "step": 263 }, { "epoch": 2.3353557639271925, "grad_norm": 0.005174586083739996, "learning_rate": 1.1598679685917901e-05, "loss": 0.0001, "num_input_tokens_seen": 54542224, "step": 264 }, { "epoch": 2.344180915609487, "grad_norm": 0.012352543883025646, "learning_rate": 1.1303587620491513e-05, "loss": 0.0002, "num_input_tokens_seen": 54745136, "step": 265 }, { "epoch": 2.3530060672917816, "grad_norm": 0.005056153051555157, "learning_rate": 1.1011818842789928e-05, "loss": 0.0001, "num_input_tokens_seen": 54957584, "step": 266 }, { "epoch": 2.361831218974076, "grad_norm": 0.010525842197239399, "learning_rate": 1.0723398410216084e-05, "loss": 0.0001, "num_input_tokens_seen": 55162496, "step": 267 }, { "epoch": 2.370656370656371, "grad_norm": 0.0092442212626338, "learning_rate": 1.0438351092613569e-05, "loss": 0.0002, "num_input_tokens_seen": 55376544, "step": 268 }, { "epoch": 2.3794815223386654, "grad_norm": 0.00699999462813139, "learning_rate": 1.0156701370139454e-05, "loss": 0.0001, "num_input_tokens_seen": 55583072, "step": 269 }, { "epoch": 2.38830667402096, "grad_norm": 0.007677710149437189, "learning_rate": 9.878473431161767e-06, "loss": 0.0002, "num_input_tokens_seen": 55801200, "step": 270 }, { "epoch": 2.397131825703254, "grad_norm": 0.003174175275489688, "learning_rate": 9.603691170182317e-06, "loss": 0.0001, "num_input_tokens_seen": 55998080, "step": 271 }, { "epoch": 2.4059569773855487, "grad_norm": 0.005871200002729893, "learning_rate": 9.33237818578449e-06, "loss": 0.0002, "num_input_tokens_seen": 56200448, "step": 272 }, { "epoch": 2.4147821290678433, "grad_norm": 0.00371691957116127, "learning_rate": 9.064557778606631e-06, "loss": 0.0001, "num_input_tokens_seen": 56400416, "step": 273 }, { "epoch": 2.423607280750138, "grad_norm": 0.007599337492138147, "learning_rate": 8.800252949340998e-06, "loss": 0.0002, "num_input_tokens_seen": 56606128, "step": 274 }, { "epoch": 2.4324324324324325, "grad_norm": 0.0015243644593283534, "learning_rate": 8.539486396758356e-06, "loss": 0.0, "num_input_tokens_seen": 56797824, "step": 275 }, { "epoch": 2.441257584114727, "grad_norm": 0.0030196798034012318, "learning_rate": 8.28228051575864e-06, "loss": 0.0001, "num_input_tokens_seen": 57006384, "step": 276 }, { "epoch": 2.4500827357970216, "grad_norm": 0.005347589962184429, "learning_rate": 8.02865739544767e-06, "loss": 0.0001, "num_input_tokens_seen": 57207824, "step": 277 }, { "epoch": 2.458907887479316, "grad_norm": 0.005150883924216032, "learning_rate": 7.778638817240042e-06, "loss": 0.0001, "num_input_tokens_seen": 57415152, "step": 278 }, { "epoch": 2.467733039161611, "grad_norm": 0.006857512053102255, "learning_rate": 7.532246252988617e-06, "loss": 0.0001, "num_input_tokens_seen": 57628096, "step": 279 }, { "epoch": 2.476558190843905, "grad_norm": 0.005364645272493362, "learning_rate": 7.289500863140414e-06, "loss": 0.0001, "num_input_tokens_seen": 57824064, "step": 280 }, { "epoch": 2.4853833425261995, "grad_norm": 0.007198365870863199, "learning_rate": 7.05042349491935e-06, "loss": 0.0002, "num_input_tokens_seen": 58042720, "step": 281 }, { "epoch": 2.494208494208494, "grad_norm": 0.005014900583773851, "learning_rate": 6.815034680535915e-06, "loss": 0.0001, "num_input_tokens_seen": 58255408, "step": 282 }, { "epoch": 2.5030336458907887, "grad_norm": 0.008873779326677322, "learning_rate": 6.5833546354237556e-06, "loss": 0.0001, "num_input_tokens_seen": 58464800, "step": 283 }, { "epoch": 2.5118587975730833, "grad_norm": 0.0044725253246724606, "learning_rate": 6.355403256503595e-06, "loss": 0.0001, "num_input_tokens_seen": 58672496, "step": 284 }, { "epoch": 2.520683949255378, "grad_norm": 0.0047348616644740105, "learning_rate": 6.1312001204745115e-06, "loss": 0.0002, "num_input_tokens_seen": 58898256, "step": 285 }, { "epoch": 2.5295091009376725, "grad_norm": 0.00710884016007185, "learning_rate": 5.910764482132575e-06, "loss": 0.0001, "num_input_tokens_seen": 59107152, "step": 286 }, { "epoch": 2.538334252619967, "grad_norm": 0.007686229422688484, "learning_rate": 5.6941152727173265e-06, "loss": 0.0002, "num_input_tokens_seen": 59307664, "step": 287 }, { "epoch": 2.5471594043022616, "grad_norm": 0.014555118046700954, "learning_rate": 5.481271098285817e-06, "loss": 0.0003, "num_input_tokens_seen": 59514736, "step": 288 }, { "epoch": 2.5559845559845558, "grad_norm": 0.0028200196102261543, "learning_rate": 5.272250238114856e-06, "loss": 0.0001, "num_input_tokens_seen": 59712512, "step": 289 }, { "epoch": 2.564809707666851, "grad_norm": 0.004194322973489761, "learning_rate": 5.067070643131055e-06, "loss": 0.0001, "num_input_tokens_seen": 59910000, "step": 290 }, { "epoch": 2.573634859349145, "grad_norm": 0.006987538188695908, "learning_rate": 4.865749934369223e-06, "loss": 0.0002, "num_input_tokens_seen": 60116400, "step": 291 }, { "epoch": 2.5824600110314395, "grad_norm": 0.003778768004849553, "learning_rate": 4.668305401459022e-06, "loss": 0.0002, "num_input_tokens_seen": 60320368, "step": 292 }, { "epoch": 2.591285162713734, "grad_norm": 0.003472360782325268, "learning_rate": 4.474754001140191e-06, "loss": 0.0001, "num_input_tokens_seen": 60536528, "step": 293 }, { "epoch": 2.6001103143960287, "grad_norm": 0.009052475914359093, "learning_rate": 4.285112355806192e-06, "loss": 0.001, "num_input_tokens_seen": 60743120, "step": 294 }, { "epoch": 2.6089354660783233, "grad_norm": 0.0060082292184233665, "learning_rate": 4.099396752076745e-06, "loss": 0.0001, "num_input_tokens_seen": 60942704, "step": 295 }, { "epoch": 2.617760617760618, "grad_norm": 0.0075798071920871735, "learning_rate": 3.917623139399018e-06, "loss": 0.0001, "num_input_tokens_seen": 61140128, "step": 296 }, { "epoch": 2.6265857694429124, "grad_norm": 0.0055752964690327644, "learning_rate": 3.7398071286779857e-06, "loss": 0.0001, "num_input_tokens_seen": 61334224, "step": 297 }, { "epoch": 2.6354109211252066, "grad_norm": 0.007863204926252365, "learning_rate": 3.5659639909356723e-06, "loss": 0.0001, "num_input_tokens_seen": 61543280, "step": 298 }, { "epoch": 2.6442360728075016, "grad_norm": 0.006538075394928455, "learning_rate": 3.3961086559996803e-06, "loss": 0.0002, "num_input_tokens_seen": 61750720, "step": 299 }, { "epoch": 2.6530612244897958, "grad_norm": 0.002779777627438307, "learning_rate": 3.230255711220992e-06, "loss": 0.0, "num_input_tokens_seen": 61945952, "step": 300 }, { "epoch": 2.6618863761720903, "grad_norm": 0.004271807614713907, "learning_rate": 3.0684194002212287e-06, "loss": 0.0001, "num_input_tokens_seen": 62155632, "step": 301 }, { "epoch": 2.670711527854385, "grad_norm": 0.00638817623257637, "learning_rate": 2.910613621669356e-06, "loss": 0.0001, "num_input_tokens_seen": 62353216, "step": 302 }, { "epoch": 2.6795366795366795, "grad_norm": 0.00442032516002655, "learning_rate": 2.7568519280880558e-06, "loss": 0.0001, "num_input_tokens_seen": 62544128, "step": 303 }, { "epoch": 2.688361831218974, "grad_norm": 0.008686737157404423, "learning_rate": 2.607147524689829e-06, "loss": 0.0004, "num_input_tokens_seen": 62752688, "step": 304 }, { "epoch": 2.6971869829012687, "grad_norm": 0.0059651597402989864, "learning_rate": 2.4615132682429374e-06, "loss": 0.0001, "num_input_tokens_seen": 62963296, "step": 305 }, { "epoch": 2.7060121345835633, "grad_norm": 0.0056177834048867226, "learning_rate": 2.3199616659672354e-06, "loss": 0.0002, "num_input_tokens_seen": 63161904, "step": 306 }, { "epoch": 2.714837286265858, "grad_norm": 0.0029979923274368048, "learning_rate": 2.182504874460006e-06, "loss": 0.0001, "num_input_tokens_seen": 63365744, "step": 307 }, { "epoch": 2.7236624379481524, "grad_norm": 0.004314000252634287, "learning_rate": 2.049154698651989e-06, "loss": 0.0001, "num_input_tokens_seen": 63571808, "step": 308 }, { "epoch": 2.7324875896304466, "grad_norm": 0.006837273947894573, "learning_rate": 1.919922590793549e-06, "loss": 0.0001, "num_input_tokens_seen": 63768960, "step": 309 }, { "epoch": 2.741312741312741, "grad_norm": 0.0037646403070539236, "learning_rate": 1.7948196494711188e-06, "loss": 0.0001, "num_input_tokens_seen": 63979648, "step": 310 }, { "epoch": 2.7501378929950357, "grad_norm": 0.0031723175197839737, "learning_rate": 1.6738566186540627e-06, "loss": 0.0001, "num_input_tokens_seen": 64189712, "step": 311 }, { "epoch": 2.7589630446773303, "grad_norm": 0.005477920174598694, "learning_rate": 1.5570438867719694e-06, "loss": 0.0001, "num_input_tokens_seen": 64400624, "step": 312 }, { "epoch": 2.767788196359625, "grad_norm": 0.006315939594060183, "learning_rate": 1.4443914858224938e-06, "loss": 0.0001, "num_input_tokens_seen": 64626320, "step": 313 }, { "epoch": 2.7766133480419195, "grad_norm": 0.004753002431243658, "learning_rate": 1.3359090905097848e-06, "loss": 0.0001, "num_input_tokens_seen": 64826480, "step": 314 }, { "epoch": 2.785438499724214, "grad_norm": 0.00812880601733923, "learning_rate": 1.2316060174136002e-06, "loss": 0.0003, "num_input_tokens_seen": 65031984, "step": 315 }, { "epoch": 2.7942636514065087, "grad_norm": 0.0029212606605142355, "learning_rate": 1.1314912241892183e-06, "loss": 0.0001, "num_input_tokens_seen": 65239456, "step": 316 }, { "epoch": 2.8030888030888033, "grad_norm": 0.006850802339613438, "learning_rate": 1.0355733087981378e-06, "loss": 0.0002, "num_input_tokens_seen": 65433888, "step": 317 }, { "epoch": 2.8119139547710974, "grad_norm": 0.0020711093675345182, "learning_rate": 9.43860508769645e-07, "loss": 0.0001, "num_input_tokens_seen": 65638288, "step": 318 }, { "epoch": 2.8207391064533924, "grad_norm": 0.004868640564382076, "learning_rate": 8.563607004934193e-07, "loss": 0.0002, "num_input_tokens_seen": 65855952, "step": 319 }, { "epoch": 2.8295642581356866, "grad_norm": 0.006297328509390354, "learning_rate": 7.730813985430407e-07, "loss": 0.0002, "num_input_tokens_seen": 66070192, "step": 320 }, { "epoch": 2.838389409817981, "grad_norm": 0.0036759376525878906, "learning_rate": 6.940297550306896e-07, "loss": 0.0001, "num_input_tokens_seen": 66283808, "step": 321 }, { "epoch": 2.8472145615002757, "grad_norm": 0.0120092136785388, "learning_rate": 6.192125589928821e-07, "loss": 0.0002, "num_input_tokens_seen": 66507776, "step": 322 }, { "epoch": 2.8560397131825703, "grad_norm": 0.005414010491222143, "learning_rate": 5.486362358074094e-07, "loss": 0.0002, "num_input_tokens_seen": 66708320, "step": 323 }, { "epoch": 2.864864864864865, "grad_norm": 0.007992051541805267, "learning_rate": 4.823068466415615e-07, "loss": 0.0001, "num_input_tokens_seen": 66910032, "step": 324 }, { "epoch": 2.8736900165471595, "grad_norm": 0.006493248511105776, "learning_rate": 4.202300879315446e-07, "loss": 0.0001, "num_input_tokens_seen": 67112784, "step": 325 }, { "epoch": 2.882515168229454, "grad_norm": 0.004381334874778986, "learning_rate": 3.624112908932942e-07, "loss": 0.0001, "num_input_tokens_seen": 67306464, "step": 326 }, { "epoch": 2.8913403199117482, "grad_norm": 0.00577085604891181, "learning_rate": 3.088554210646133e-07, "loss": 0.0001, "num_input_tokens_seen": 67504720, "step": 327 }, { "epoch": 2.9001654715940433, "grad_norm": 0.003793071024119854, "learning_rate": 2.595670778787196e-07, "loss": 0.0001, "num_input_tokens_seen": 67694048, "step": 328 }, { "epoch": 2.9089906232763374, "grad_norm": 0.00835067592561245, "learning_rate": 2.1455049426926666e-07, "loss": 0.0002, "num_input_tokens_seen": 67895008, "step": 329 }, { "epoch": 2.917815774958632, "grad_norm": 0.005372443702071905, "learning_rate": 1.7380953630678488e-07, "loss": 0.0001, "num_input_tokens_seen": 68093168, "step": 330 }, { "epoch": 2.9266409266409266, "grad_norm": 0.010219305753707886, "learning_rate": 1.373477028666803e-07, "loss": 0.0002, "num_input_tokens_seen": 68305568, "step": 331 }, { "epoch": 2.935466078323221, "grad_norm": 0.0038206197787076235, "learning_rate": 1.0516812532873621e-07, "loss": 0.0001, "num_input_tokens_seen": 68506384, "step": 332 }, { "epoch": 2.9442912300055157, "grad_norm": 0.007432411424815655, "learning_rate": 7.727356730820035e-08, "loss": 0.0002, "num_input_tokens_seen": 68716160, "step": 333 }, { "epoch": 2.9531163816878103, "grad_norm": 0.004036502446979284, "learning_rate": 5.3666424418413744e-08, "loss": 0.0001, "num_input_tokens_seen": 68918048, "step": 334 }, { "epoch": 2.961941533370105, "grad_norm": 0.0045955548994243145, "learning_rate": 3.4348724065119685e-08, "loss": 0.0001, "num_input_tokens_seen": 69129152, "step": 335 }, { "epoch": 2.9707666850523995, "grad_norm": 0.012164157815277576, "learning_rate": 1.9322125272297488e-08, "loss": 0.0003, "num_input_tokens_seen": 69328576, "step": 336 }, { "epoch": 2.979591836734694, "grad_norm": 0.0029640356078743935, "learning_rate": 8.587918539726402e-09, "loss": 0.0001, "num_input_tokens_seen": 69537232, "step": 337 }, { "epoch": 2.988416988416988, "grad_norm": 0.005239939782768488, "learning_rate": 2.1470257321298813e-09, "loss": 0.0001, "num_input_tokens_seen": 69761008, "step": 338 }, { "epoch": 2.997242140099283, "grad_norm": 0.0060053626075387, "learning_rate": 0.0, "loss": 0.0001, "num_input_tokens_seen": 69953200, "step": 339 }, { "epoch": 2.997242140099283, "num_input_tokens_seen": 69953200, "step": 339, "total_flos": 2.976146663409713e+18, "train_loss": 0.004280612113766934, "train_runtime": 8852.4475, "train_samples_per_second": 4.914, "train_steps_per_second": 0.038 } ], "logging_steps": 1, "max_steps": 339, "num_input_tokens_seen": 69953200, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.976146663409713e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }