{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.996101364522417,
  "eval_steps": 500,
  "global_step": 384,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005198180636777128,
      "grad_norm": 46.50929641723633,
      "learning_rate": 8.333333333333333e-08,
      "loss": 1.4958,
      "step": 1
    },
    {
      "epoch": 0.010396361273554255,
      "grad_norm": 36.618709564208984,
      "learning_rate": 1.6666666666666665e-07,
      "loss": 1.2222,
      "step": 2
    },
    {
      "epoch": 0.015594541910331383,
      "grad_norm": 42.09670639038086,
      "learning_rate": 2.5e-07,
      "loss": 1.2627,
      "step": 3
    },
    {
      "epoch": 0.02079272254710851,
      "grad_norm": 38.13116455078125,
      "learning_rate": 3.333333333333333e-07,
      "loss": 1.2579,
      "step": 4
    },
    {
      "epoch": 0.02599090318388564,
      "grad_norm": 40.15380859375,
      "learning_rate": 4.1666666666666667e-07,
      "loss": 1.2421,
      "step": 5
    },
    {
      "epoch": 0.031189083820662766,
      "grad_norm": 43.696563720703125,
      "learning_rate": 5e-07,
      "loss": 1.264,
      "step": 6
    },
    {
      "epoch": 0.036387264457439894,
      "grad_norm": 44.71561813354492,
      "learning_rate": 5.833333333333334e-07,
      "loss": 1.3881,
      "step": 7
    },
    {
      "epoch": 0.04158544509421702,
      "grad_norm": 41.07535171508789,
      "learning_rate": 6.666666666666666e-07,
      "loss": 1.1705,
      "step": 8
    },
    {
      "epoch": 0.04678362573099415,
      "grad_norm": 37.13037109375,
      "learning_rate": 7.5e-07,
      "loss": 1.1108,
      "step": 9
    },
    {
      "epoch": 0.05198180636777128,
      "grad_norm": 39.47488021850586,
      "learning_rate": 8.333333333333333e-07,
      "loss": 1.1024,
      "step": 10
    },
    {
      "epoch": 0.057179987004548405,
      "grad_norm": 35.2398681640625,
      "learning_rate": 9.166666666666665e-07,
      "loss": 1.0848,
      "step": 11
    },
    {
      "epoch": 0.06237816764132553,
      "grad_norm": 26.39617347717285,
      "learning_rate": 1e-06,
      "loss": 0.7678,
      "step": 12
    },
    {
      "epoch": 0.06757634827810266,
      "grad_norm": 23.569713592529297,
      "learning_rate": 9.999821700020548e-07,
      "loss": 0.7753,
      "step": 13
    },
    {
      "epoch": 0.07277452891487979,
      "grad_norm": 20.85965919494629,
      "learning_rate": 9.99928681279855e-07,
      "loss": 0.6663,
      "step": 14
    },
    {
      "epoch": 0.07797270955165692,
      "grad_norm": 19.964326858520508,
      "learning_rate": 9.998395376482152e-07,
      "loss": 0.5468,
      "step": 15
    },
    {
      "epoch": 0.08317089018843404,
      "grad_norm": 10.887548446655273,
      "learning_rate": 9.997147454648588e-07,
      "loss": 0.4754,
      "step": 16
    },
    {
      "epoch": 0.08836907082521117,
      "grad_norm": 10.726633071899414,
      "learning_rate": 9.995543136299635e-07,
      "loss": 0.4547,
      "step": 17
    },
    {
      "epoch": 0.0935672514619883,
      "grad_norm": 9.673012733459473,
      "learning_rate": 9.993582535855263e-07,
      "loss": 0.4634,
      "step": 18
    },
    {
      "epoch": 0.09876543209876543,
      "grad_norm": 7.258286476135254,
      "learning_rate": 9.991265793145479e-07,
      "loss": 0.3635,
      "step": 19
    },
    {
      "epoch": 0.10396361273554255,
      "grad_norm": 5.402667999267578,
      "learning_rate": 9.988593073400354e-07,
      "loss": 0.356,
      "step": 20
    },
    {
      "epoch": 0.10916179337231968,
      "grad_norm": 4.859364032745361,
      "learning_rate": 9.985564567238236e-07,
      "loss": 0.3692,
      "step": 21
    },
    {
      "epoch": 0.11435997400909681,
      "grad_norm": 3.8276686668395996,
      "learning_rate": 9.982180490652164e-07,
      "loss": 0.2976,
      "step": 22
    },
    {
      "epoch": 0.11955815464587394,
      "grad_norm": 3.0185964107513428,
      "learning_rate": 9.97844108499445e-07,
      "loss": 0.2635,
      "step": 23
    },
    {
      "epoch": 0.12475633528265107,
      "grad_norm": 2.6632726192474365,
      "learning_rate": 9.974346616959475e-07,
      "loss": 0.3086,
      "step": 24
    },
    {
      "epoch": 0.1299545159194282,
      "grad_norm": 2.5440852642059326,
      "learning_rate": 9.969897378564667e-07,
      "loss": 0.2746,
      "step": 25
    },
    {
      "epoch": 0.13515269655620532,
      "grad_norm": 2.642413377761841,
      "learning_rate": 9.965093687129667e-07,
      "loss": 0.2889,
      "step": 26
    },
    {
      "epoch": 0.14035087719298245,
      "grad_norm": 2.513338565826416,
      "learning_rate": 9.959935885253715e-07,
      "loss": 0.2778,
      "step": 27
    },
    {
      "epoch": 0.14554905782975958,
      "grad_norm": 2.2585127353668213,
      "learning_rate": 9.954424340791195e-07,
      "loss": 0.2311,
      "step": 28
    },
    {
      "epoch": 0.1507472384665367,
      "grad_norm": 2.021958351135254,
      "learning_rate": 9.948559446825411e-07,
      "loss": 0.2403,
      "step": 29
    },
    {
      "epoch": 0.15594541910331383,
      "grad_norm": 2.938659429550171,
      "learning_rate": 9.942341621640557e-07,
      "loss": 0.2984,
      "step": 30
    },
    {
      "epoch": 0.16114359974009096,
      "grad_norm": 1.9811211824417114,
      "learning_rate": 9.93577130869187e-07,
      "loss": 0.2607,
      "step": 31
    },
    {
      "epoch": 0.1663417803768681,
      "grad_norm": 1.8804433345794678,
      "learning_rate": 9.928848976574018e-07,
      "loss": 0.2236,
      "step": 32
    },
    {
      "epoch": 0.17153996101364521,
      "grad_norm": 2.2095425128936768,
      "learning_rate": 9.921575118987671e-07,
      "loss": 0.247,
      "step": 33
    },
    {
      "epoch": 0.17673814165042234,
      "grad_norm": 2.0361135005950928,
      "learning_rate": 9.91395025470429e-07,
      "loss": 0.2519,
      "step": 34
    },
    {
      "epoch": 0.18193632228719947,
      "grad_norm": 1.9882704019546509,
      "learning_rate": 9.905974927529133e-07,
      "loss": 0.2387,
      "step": 35
    },
    {
      "epoch": 0.1871345029239766,
      "grad_norm": 2.1970348358154297,
      "learning_rate": 9.897649706262473e-07,
      "loss": 0.2506,
      "step": 36
    },
    {
      "epoch": 0.19233268356075373,
      "grad_norm": 1.9535129070281982,
      "learning_rate": 9.888975184659016e-07,
      "loss": 0.2491,
      "step": 37
    },
    {
      "epoch": 0.19753086419753085,
      "grad_norm": 1.7368297576904297,
      "learning_rate": 9.879951981385577e-07,
      "loss": 0.2002,
      "step": 38
    },
    {
      "epoch": 0.20272904483430798,
      "grad_norm": 1.8394443988800049,
      "learning_rate": 9.870580739976935e-07,
      "loss": 0.2107,
      "step": 39
    },
    {
      "epoch": 0.2079272254710851,
      "grad_norm": 2.0104002952575684,
      "learning_rate": 9.860862128789952e-07,
      "loss": 0.2373,
      "step": 40
    },
    {
      "epoch": 0.21312540610786224,
      "grad_norm": 1.7731590270996094,
      "learning_rate": 9.850796840955899e-07,
      "loss": 0.1881,
      "step": 41
    },
    {
      "epoch": 0.21832358674463936,
      "grad_norm": 1.937873363494873,
      "learning_rate": 9.840385594331022e-07,
      "loss": 0.2238,
      "step": 42
    },
    {
      "epoch": 0.2235217673814165,
      "grad_norm": 1.9435638189315796,
      "learning_rate": 9.82962913144534e-07,
      "loss": 0.2237,
      "step": 43
    },
    {
      "epoch": 0.22871994801819362,
      "grad_norm": 1.786537766456604,
      "learning_rate": 9.818528219449704e-07,
      "loss": 0.1951,
      "step": 44
    },
    {
      "epoch": 0.23391812865497075,
      "grad_norm": 1.7915631532669067,
      "learning_rate": 9.807083650061062e-07,
      "loss": 0.2257,
      "step": 45
    },
    {
      "epoch": 0.23911630929174787,
      "grad_norm": 1.798353910446167,
      "learning_rate": 9.79529623950601e-07,
      "loss": 0.236,
      "step": 46
    },
    {
      "epoch": 0.244314489928525,
      "grad_norm": 1.872049331665039,
      "learning_rate": 9.783166828462572e-07,
      "loss": 0.2354,
      "step": 47
    },
    {
      "epoch": 0.24951267056530213,
      "grad_norm": 1.879210114479065,
      "learning_rate": 9.770696282000244e-07,
      "loss": 0.2229,
      "step": 48
    },
    {
      "epoch": 0.25471085120207926,
      "grad_norm": 1.9663130044937134,
      "learning_rate": 9.757885489518296e-07,
      "loss": 0.2461,
      "step": 49
    },
    {
      "epoch": 0.2599090318388564,
      "grad_norm": 1.6957286596298218,
      "learning_rate": 9.744735364682344e-07,
      "loss": 0.2065,
      "step": 50
    },
    {
      "epoch": 0.2651072124756335,
      "grad_norm": 1.7848544120788574,
      "learning_rate": 9.731246845359184e-07,
      "loss": 0.1949,
      "step": 51
    },
    {
      "epoch": 0.27030539311241064,
      "grad_norm": 1.8262349367141724,
      "learning_rate": 9.7174208935499e-07,
      "loss": 0.2135,
      "step": 52
    },
    {
      "epoch": 0.27550357374918777,
      "grad_norm": 1.6100451946258545,
      "learning_rate": 9.703258495321265e-07,
      "loss": 0.1643,
      "step": 53
    },
    {
      "epoch": 0.2807017543859649,
      "grad_norm": 1.6476277112960815,
      "learning_rate": 9.688760660735402e-07,
      "loss": 0.1796,
      "step": 54
    },
    {
      "epoch": 0.285899935022742,
      "grad_norm": 1.6926974058151245,
      "learning_rate": 9.673928423777756e-07,
      "loss": 0.2048,
      "step": 55
    },
    {
      "epoch": 0.29109811565951915,
      "grad_norm": 1.7797563076019287,
      "learning_rate": 9.658762842283341e-07,
      "loss": 0.1953,
      "step": 56
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 1.7844983339309692,
      "learning_rate": 9.643264997861312e-07,
      "loss": 0.2103,
      "step": 57
    },
    {
      "epoch": 0.3014944769330734,
      "grad_norm": 1.8552502393722534,
      "learning_rate": 9.627435995817797e-07,
      "loss": 0.1854,
      "step": 58
    },
    {
      "epoch": 0.30669265756985054,
      "grad_norm": 1.8308689594268799,
      "learning_rate": 9.611276965077097e-07,
      "loss": 0.1892,
      "step": 59
    },
    {
      "epoch": 0.31189083820662766,
      "grad_norm": 1.8636093139648438,
      "learning_rate": 9.594789058101153e-07,
      "loss": 0.216,
      "step": 60
    },
    {
      "epoch": 0.3170890188434048,
      "grad_norm": 1.5857099294662476,
      "learning_rate": 9.577973450807351e-07,
      "loss": 0.1924,
      "step": 61
    },
    {
      "epoch": 0.3222871994801819,
      "grad_norm": 1.670000433921814,
      "learning_rate": 9.560831342484666e-07,
      "loss": 0.2088,
      "step": 62
    },
    {
      "epoch": 0.32748538011695905,
      "grad_norm": 1.8758388757705688,
      "learning_rate": 9.543363955708124e-07,
      "loss": 0.1697,
      "step": 63
    },
    {
      "epoch": 0.3326835607537362,
      "grad_norm": 2.020310401916504,
      "learning_rate": 9.525572536251605e-07,
      "loss": 0.2249,
      "step": 64
    },
    {
      "epoch": 0.3378817413905133,
      "grad_norm": 1.8294882774353027,
      "learning_rate": 9.507458352999001e-07,
      "loss": 0.1884,
      "step": 65
    },
    {
      "epoch": 0.34307992202729043,
      "grad_norm": 1.606002926826477,
      "learning_rate": 9.489022697853708e-07,
      "loss": 0.1761,
      "step": 66
    },
    {
      "epoch": 0.34827810266406756,
      "grad_norm": 1.6073530912399292,
      "learning_rate": 9.470266885646503e-07,
      "loss": 0.1871,
      "step": 67
    },
    {
      "epoch": 0.3534762833008447,
      "grad_norm": 1.7087726593017578,
      "learning_rate": 9.451192254041758e-07,
      "loss": 0.173,
      "step": 68
    },
    {
      "epoch": 0.3586744639376218,
      "grad_norm": 1.7764538526535034,
      "learning_rate": 9.431800163442041e-07,
      "loss": 0.1957,
      "step": 69
    },
    {
      "epoch": 0.36387264457439894,
      "grad_norm": 1.8759775161743164,
      "learning_rate": 9.412091996891095e-07,
      "loss": 0.2154,
      "step": 70
    },
    {
      "epoch": 0.36907082521117607,
      "grad_norm": 1.8281443119049072,
      "learning_rate": 9.392069159975198e-07,
      "loss": 0.1679,
      "step": 71
    },
    {
      "epoch": 0.3742690058479532,
      "grad_norm": 1.7894129753112793,
      "learning_rate": 9.37173308072291e-07,
      "loss": 0.1679,
      "step": 72
    },
    {
      "epoch": 0.3794671864847303,
      "grad_norm": 1.6492183208465576,
      "learning_rate": 9.35108520950324e-07,
      "loss": 0.1833,
      "step": 73
    },
    {
      "epoch": 0.38466536712150745,
      "grad_norm": 1.6076239347457886,
      "learning_rate": 9.330127018922193e-07,
      "loss": 0.1507,
      "step": 74
    },
    {
      "epoch": 0.3898635477582846,
      "grad_norm": 1.8182544708251953,
      "learning_rate": 9.308860003717748e-07,
      "loss": 0.1759,
      "step": 75
    },
    {
      "epoch": 0.3950617283950617,
      "grad_norm": 2.183497667312622,
      "learning_rate": 9.287285680653254e-07,
      "loss": 0.2069,
      "step": 76
    },
    {
      "epoch": 0.40025990903183883,
      "grad_norm": 1.9281930923461914,
      "learning_rate": 9.265405588409256e-07,
      "loss": 0.1813,
      "step": 77
    },
    {
      "epoch": 0.40545808966861596,
      "grad_norm": 1.7534650564193726,
      "learning_rate": 9.243221287473755e-07,
      "loss": 0.1764,
      "step": 78
    },
    {
      "epoch": 0.4106562703053931,
      "grad_norm": 1.7174078226089478,
      "learning_rate": 9.220734360030906e-07,
      "loss": 0.1863,
      "step": 79
    },
    {
      "epoch": 0.4158544509421702,
      "grad_norm": 1.7550305128097534,
      "learning_rate": 9.197946409848194e-07,
      "loss": 0.1718,
      "step": 80
    },
    {
      "epoch": 0.42105263157894735,
      "grad_norm": 1.4776816368103027,
      "learning_rate": 9.174859062162037e-07,
      "loss": 0.156,
      "step": 81
    },
    {
      "epoch": 0.4262508122157245,
      "grad_norm": 1.7932229042053223,
      "learning_rate": 9.151473963561882e-07,
      "loss": 0.1821,
      "step": 82
    },
    {
      "epoch": 0.4314489928525016,
      "grad_norm": 1.6103583574295044,
      "learning_rate": 9.127792781872768e-07,
      "loss": 0.1749,
      "step": 83
    },
    {
      "epoch": 0.43664717348927873,
      "grad_norm": 1.8216729164123535,
      "learning_rate": 9.103817206036382e-07,
      "loss": 0.177,
      "step": 84
    },
    {
      "epoch": 0.44184535412605586,
      "grad_norm": 1.7169886827468872,
      "learning_rate": 9.079548945990592e-07,
      "loss": 0.1845,
      "step": 85
    },
    {
      "epoch": 0.447043534762833,
      "grad_norm": 1.4935150146484375,
      "learning_rate": 9.054989732547506e-07,
      "loss": 0.1518,
      "step": 86
    },
    {
      "epoch": 0.4522417153996101,
      "grad_norm": 1.7215607166290283,
      "learning_rate": 9.030141317270025e-07,
      "loss": 0.1651,
      "step": 87
    },
    {
      "epoch": 0.45743989603638724,
      "grad_norm": 1.885299801826477,
      "learning_rate": 9.005005472346923e-07,
      "loss": 0.1862,
      "step": 88
    },
    {
      "epoch": 0.46263807667316437,
      "grad_norm": 1.6924781799316406,
      "learning_rate": 8.979583990466452e-07,
      "loss": 0.1834,
      "step": 89
    },
    {
      "epoch": 0.4678362573099415,
      "grad_norm": 1.6620601415634155,
      "learning_rate": 8.953878684688492e-07,
      "loss": 0.1736,
      "step": 90
    },
    {
      "epoch": 0.4730344379467186,
      "grad_norm": 1.7256325483322144,
      "learning_rate": 8.92789138831524e-07,
      "loss": 0.1792,
      "step": 91
    },
    {
      "epoch": 0.47823261858349575,
      "grad_norm": 1.6039340496063232,
      "learning_rate": 8.901623954760459e-07,
      "loss": 0.1704,
      "step": 92
    },
    {
      "epoch": 0.4834307992202729,
      "grad_norm": 1.6422524452209473,
      "learning_rate": 8.875078257417294e-07,
      "loss": 0.1621,
      "step": 93
    },
    {
      "epoch": 0.48862897985705,
      "grad_norm": 1.6837060451507568,
      "learning_rate": 8.84825618952466e-07,
      "loss": 0.183,
      "step": 94
    },
    {
      "epoch": 0.49382716049382713,
      "grad_norm": 1.750653862953186,
      "learning_rate": 8.821159664032223e-07,
      "loss": 0.1689,
      "step": 95
    },
    {
      "epoch": 0.49902534113060426,
      "grad_norm": 1.6462229490280151,
      "learning_rate": 8.793790613463954e-07,
      "loss": 0.1394,
      "step": 96
    },
    {
      "epoch": 0.5042235217673814,
      "grad_norm": 1.7336857318878174,
      "learning_rate": 8.766150989780317e-07,
      "loss": 0.1581,
      "step": 97
    },
    {
      "epoch": 0.5094217024041585,
      "grad_norm": 1.8384933471679688,
      "learning_rate": 8.738242764239046e-07,
      "loss": 0.1918,
      "step": 98
    },
    {
      "epoch": 0.5146198830409356,
      "grad_norm": 1.723486065864563,
      "learning_rate": 8.710067927254554e-07,
      "loss": 0.1737,
      "step": 99
    },
    {
      "epoch": 0.5198180636777128,
      "grad_norm": 1.9092669486999512,
      "learning_rate": 8.681628488255986e-07,
      "loss": 0.1728,
      "step": 100
    },
    {
      "epoch": 0.5250162443144899,
      "grad_norm": 1.729762315750122,
      "learning_rate": 8.652926475543898e-07,
      "loss": 0.162,
      "step": 101
    },
    {
      "epoch": 0.530214424951267,
      "grad_norm": 1.7867392301559448,
      "learning_rate": 8.623963936145599e-07,
      "loss": 0.1658,
      "step": 102
    },
    {
      "epoch": 0.5354126055880442,
      "grad_norm": 2.0217678546905518,
      "learning_rate": 8.594742935669164e-07,
      "loss": 0.1865,
      "step": 103
    },
    {
      "epoch": 0.5406107862248213,
      "grad_norm": 1.7473349571228027,
      "learning_rate": 8.565265558156101e-07,
      "loss": 0.1535,
      "step": 104
    },
    {
      "epoch": 0.5458089668615984,
      "grad_norm": 1.5292036533355713,
      "learning_rate": 8.535533905932737e-07,
      "loss": 0.1559,
      "step": 105
    },
    {
      "epoch": 0.5510071474983755,
      "grad_norm": 1.5472049713134766,
      "learning_rate": 8.505550099460263e-07,
      "loss": 0.1423,
      "step": 106
    },
    {
      "epoch": 0.5562053281351527,
      "grad_norm": 1.636443853378296,
      "learning_rate": 8.475316277183508e-07,
      "loss": 0.1747,
      "step": 107
    },
    {
      "epoch": 0.5614035087719298,
      "grad_norm": 1.5992189645767212,
      "learning_rate": 8.444834595378433e-07,
      "loss": 0.1766,
      "step": 108
    },
    {
      "epoch": 0.5666016894087069,
      "grad_norm": 1.6766347885131836,
      "learning_rate": 8.414107227998328e-07,
      "loss": 0.1421,
      "step": 109
    },
    {
      "epoch": 0.571799870045484,
      "grad_norm": 1.7345399856567383,
      "learning_rate": 8.383136366518787e-07,
      "loss": 0.1752,
      "step": 110
    },
    {
      "epoch": 0.5769980506822612,
      "grad_norm": 1.669264793395996,
      "learning_rate": 8.351924219781392e-07,
      "loss": 0.1661,
      "step": 111
    },
    {
      "epoch": 0.5821962313190383,
      "grad_norm": 1.7636350393295288,
      "learning_rate": 8.320473013836195e-07,
      "loss": 0.1892,
      "step": 112
    },
    {
      "epoch": 0.5873944119558154,
      "grad_norm": 1.8429635763168335,
      "learning_rate": 8.288784991782945e-07,
      "loss": 0.1883,
      "step": 113
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 1.5329152345657349,
      "learning_rate": 8.256862413611112e-07,
      "loss": 0.1472,
      "step": 114
    },
    {
      "epoch": 0.5977907732293697,
      "grad_norm": 1.9208284616470337,
      "learning_rate": 8.22470755603871e-07,
      "loss": 0.1714,
      "step": 115
    },
    {
      "epoch": 0.6029889538661468,
      "grad_norm": 1.6381752490997314,
      "learning_rate": 8.192322712349917e-07,
      "loss": 0.1806,
      "step": 116
    },
    {
      "epoch": 0.6081871345029239,
      "grad_norm": 1.5502922534942627,
      "learning_rate": 8.159710192231519e-07,
      "loss": 0.1653,
      "step": 117
    },
    {
      "epoch": 0.6133853151397011,
      "grad_norm": 1.604650616645813,
      "learning_rate": 8.126872321608183e-07,
      "loss": 0.1478,
      "step": 118
    },
    {
      "epoch": 0.6185834957764782,
      "grad_norm": 1.6860443353652954,
      "learning_rate": 8.093811442476572e-07,
      "loss": 0.1639,
      "step": 119
    },
    {
      "epoch": 0.6237816764132553,
      "grad_norm": 1.5915076732635498,
      "learning_rate": 8.060529912738314e-07,
      "loss": 0.1511,
      "step": 120
    },
    {
      "epoch": 0.6289798570500325,
      "grad_norm": 1.7241225242614746,
      "learning_rate": 8.027030106031835e-07,
      "loss": 0.1848,
      "step": 121
    },
    {
      "epoch": 0.6341780376868096,
      "grad_norm": 1.7747095823287964,
      "learning_rate": 7.993314411563075e-07,
      "loss": 0.1816,
      "step": 122
    },
    {
      "epoch": 0.6393762183235867,
      "grad_norm": 1.6497771739959717,
      "learning_rate": 7.959385233935085e-07,
      "loss": 0.1696,
      "step": 123
    },
    {
      "epoch": 0.6445743989603638,
      "grad_norm": 1.4712307453155518,
      "learning_rate": 7.925244992976537e-07,
      "loss": 0.1297,
      "step": 124
    },
    {
      "epoch": 0.649772579597141,
      "grad_norm": 1.618713140487671,
      "learning_rate": 7.890896123569135e-07,
      "loss": 0.1708,
      "step": 125
    },
    {
      "epoch": 0.6549707602339181,
      "grad_norm": 1.8550593852996826,
      "learning_rate": 7.856341075473961e-07,
      "loss": 0.1646,
      "step": 126
    },
    {
      "epoch": 0.6601689408706952,
      "grad_norm": 1.7929205894470215,
      "learning_rate": 7.821582313156763e-07,
      "loss": 0.1555,
      "step": 127
    },
    {
      "epoch": 0.6653671215074723,
      "grad_norm": 1.8011633157730103,
      "learning_rate": 7.786622315612181e-07,
      "loss": 0.1882,
      "step": 128
    },
    {
      "epoch": 0.6705653021442495,
      "grad_norm": 1.642986536026001,
      "learning_rate": 7.751463576186957e-07,
      "loss": 0.1659,
      "step": 129
    },
    {
      "epoch": 0.6757634827810266,
      "grad_norm": 1.547602653503418,
      "learning_rate": 7.716108602402094e-07,
      "loss": 0.1479,
      "step": 130
    },
    {
      "epoch": 0.6809616634178037,
      "grad_norm": 1.6602659225463867,
      "learning_rate": 7.680559915774033e-07,
      "loss": 0.1627,
      "step": 131
    },
    {
      "epoch": 0.6861598440545809,
      "grad_norm": 1.8091386556625366,
      "learning_rate": 7.644820051634812e-07,
      "loss": 0.1637,
      "step": 132
    },
    {
      "epoch": 0.691358024691358,
      "grad_norm": 1.669487476348877,
      "learning_rate": 7.608891558951248e-07,
      "loss": 0.1599,
      "step": 133
    },
    {
      "epoch": 0.6965562053281351,
      "grad_norm": 1.9016000032424927,
      "learning_rate": 7.572777000143145e-07,
      "loss": 0.1654,
      "step": 134
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 1.4672502279281616,
      "learning_rate": 7.536478950900536e-07,
      "loss": 0.1482,
      "step": 135
    },
    {
      "epoch": 0.7069525666016894,
      "grad_norm": 1.4602234363555908,
      "learning_rate": 7.5e-07,
      "loss": 0.1214,
      "step": 136
    },
    {
      "epoch": 0.7121507472384665,
      "grad_norm": 1.725661277770996,
      "learning_rate": 7.463342749120013e-07,
      "loss": 0.1406,
      "step": 137
    },
    {
      "epoch": 0.7173489278752436,
      "grad_norm": 1.6164398193359375,
      "learning_rate": 7.426509812655405e-07,
      "loss": 0.1492,
      "step": 138
    },
    {
      "epoch": 0.7225471085120208,
      "grad_norm": 1.609312891960144,
      "learning_rate": 7.389503817530905e-07,
      "loss": 0.1669,
      "step": 139
    },
    {
      "epoch": 0.7277452891487979,
      "grad_norm": 1.512629508972168,
      "learning_rate": 7.352327403013779e-07,
      "loss": 0.1318,
      "step": 140
    },
    {
      "epoch": 0.732943469785575,
      "grad_norm": 1.7129087448120117,
      "learning_rate": 7.314983220525604e-07,
      "loss": 0.1762,
      "step": 141
    },
    {
      "epoch": 0.7381416504223521,
      "grad_norm": 1.6480506658554077,
      "learning_rate": 7.277473933453169e-07,
      "loss": 0.1738,
      "step": 142
    },
    {
      "epoch": 0.7433398310591293,
      "grad_norm": 1.5904552936553955,
      "learning_rate": 7.239802216958522e-07,
      "loss": 0.1558,
      "step": 143
    },
    {
      "epoch": 0.7485380116959064,
      "grad_norm": 1.6988767385482788,
      "learning_rate": 7.201970757788171e-07,
      "loss": 0.1661,
      "step": 144
    },
    {
      "epoch": 0.7537361923326835,
      "grad_norm": 1.5458639860153198,
      "learning_rate": 7.163982254081474e-07,
      "loss": 0.1338,
      "step": 145
    },
    {
      "epoch": 0.7589343729694606,
      "grad_norm": 1.5240118503570557,
      "learning_rate": 7.125839415178203e-07,
      "loss": 0.1405,
      "step": 146
    },
    {
      "epoch": 0.7641325536062378,
      "grad_norm": 1.7464412450790405,
      "learning_rate": 7.087544961425316e-07,
      "loss": 0.1682,
      "step": 147
    },
    {
      "epoch": 0.7693307342430149,
      "grad_norm": 1.7425211668014526,
      "learning_rate": 7.049101623982937e-07,
      "loss": 0.1839,
      "step": 148
    },
    {
      "epoch": 0.774528914879792,
      "grad_norm": 1.4918522834777832,
      "learning_rate": 7.010512144629579e-07,
      "loss": 0.1124,
      "step": 149
    },
    {
      "epoch": 0.7797270955165692,
      "grad_norm": 1.6756539344787598,
      "learning_rate": 6.971779275566593e-07,
      "loss": 0.1546,
      "step": 150
    },
    {
      "epoch": 0.7849252761533463,
      "grad_norm": 1.5222876071929932,
      "learning_rate": 6.93290577922188e-07,
      "loss": 0.1313,
      "step": 151
    },
    {
      "epoch": 0.7901234567901234,
      "grad_norm": 1.548453688621521,
      "learning_rate": 6.89389442805288e-07,
      "loss": 0.1349,
      "step": 152
    },
    {
      "epoch": 0.7953216374269005,
      "grad_norm": 1.6898419857025146,
      "learning_rate": 6.85474800434884e-07,
      "loss": 0.1568,
      "step": 153
    },
    {
      "epoch": 0.8005198180636777,
      "grad_norm": 1.8794304132461548,
      "learning_rate": 6.815469300032373e-07,
      "loss": 0.161,
      "step": 154
    },
    {
      "epoch": 0.8057179987004548,
      "grad_norm": 1.6816418170928955,
      "learning_rate": 6.776061116460352e-07,
      "loss": 0.1615,
      "step": 155
    },
    {
      "epoch": 0.8109161793372319,
      "grad_norm": 1.960444688796997,
      "learning_rate": 6.7365262642241e-07,
      "loss": 0.1948,
      "step": 156
    },
    {
      "epoch": 0.816114359974009,
      "grad_norm": 1.6450730562210083,
      "learning_rate": 6.696867562948962e-07,
      "loss": 0.161,
      "step": 157
    },
    {
      "epoch": 0.8213125406107862,
      "grad_norm": 1.4993230104446411,
      "learning_rate": 6.657087841093179e-07,
      "loss": 0.1476,
      "step": 158
    },
    {
      "epoch": 0.8265107212475633,
      "grad_norm": 1.856066346168518,
      "learning_rate": 6.61718993574619e-07,
      "loss": 0.1599,
      "step": 159
    },
    {
      "epoch": 0.8317089018843404,
      "grad_norm": 1.6243445873260498,
      "learning_rate": 6.577176692426278e-07,
      "loss": 0.1548,
      "step": 160
    },
    {
      "epoch": 0.8369070825211176,
      "grad_norm": 1.538219928741455,
      "learning_rate": 6.537050964877625e-07,
      "loss": 0.1428,
      "step": 161
    },
    {
      "epoch": 0.8421052631578947,
      "grad_norm": 1.429417371749878,
      "learning_rate": 6.496815614866791e-07,
      "loss": 0.1205,
      "step": 162
    },
    {
      "epoch": 0.8473034437946718,
      "grad_norm": 1.7732073068618774,
      "learning_rate": 6.456473511978606e-07,
      "loss": 0.1903,
      "step": 163
    },
    {
      "epoch": 0.852501624431449,
      "grad_norm": 1.575061321258545,
      "learning_rate": 6.416027533411519e-07,
      "loss": 0.1571,
      "step": 164
    },
    {
      "epoch": 0.8576998050682261,
      "grad_norm": 1.6352499723434448,
      "learning_rate": 6.375480563772389e-07,
      "loss": 0.1484,
      "step": 165
    },
    {
      "epoch": 0.8628979857050032,
      "grad_norm": 1.7170888185501099,
      "learning_rate": 6.334835494870758e-07,
      "loss": 0.1735,
      "step": 166
    },
    {
      "epoch": 0.8680961663417803,
      "grad_norm": 1.5450496673583984,
      "learning_rate": 6.294095225512604e-07,
      "loss": 0.1339,
      "step": 167
    },
    {
      "epoch": 0.8732943469785575,
      "grad_norm": 1.5989458560943604,
      "learning_rate": 6.253262661293602e-07,
      "loss": 0.1393,
      "step": 168
    },
    {
      "epoch": 0.8784925276153346,
      "grad_norm": 1.464534878730774,
      "learning_rate": 6.2123407143919e-07,
      "loss": 0.1421,
      "step": 169
    },
    {
      "epoch": 0.8836907082521117,
      "grad_norm": 1.6165345907211304,
      "learning_rate": 6.17133230336041e-07,
      "loss": 0.154,
      "step": 170
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 1.5105384588241577,
      "learning_rate": 6.130240352918674e-07,
      "loss": 0.1614,
      "step": 171
    },
    {
      "epoch": 0.894087069525666,
      "grad_norm": 1.6538264751434326,
      "learning_rate": 6.089067793744257e-07,
      "loss": 0.1213,
      "step": 172
    },
    {
      "epoch": 0.8992852501624431,
      "grad_norm": 1.5659717321395874,
      "learning_rate": 6.047817562263743e-07,
      "loss": 0.1349,
      "step": 173
    },
    {
      "epoch": 0.9044834307992202,
      "grad_norm": 1.6108099222183228,
      "learning_rate": 6.0064926004433e-07,
      "loss": 0.1572,
      "step": 174
    },
    {
      "epoch": 0.9096816114359974,
      "grad_norm": 1.7230148315429688,
      "learning_rate": 5.965095855578868e-07,
      "loss": 0.1376,
      "step": 175
    },
    {
      "epoch": 0.9148797920727745,
      "grad_norm": 1.7344483137130737,
      "learning_rate": 5.923630280085947e-07,
      "loss": 0.1572,
      "step": 176
    },
    {
      "epoch": 0.9200779727095516,
      "grad_norm": 1.6481879949569702,
      "learning_rate": 5.882098831289043e-07,
      "loss": 0.1626,
      "step": 177
    },
    {
      "epoch": 0.9252761533463287,
      "grad_norm": 1.7318065166473389,
      "learning_rate": 5.840504471210741e-07,
      "loss": 0.1756,
      "step": 178
    },
    {
      "epoch": 0.9304743339831059,
      "grad_norm": 1.676165223121643,
      "learning_rate": 5.79885016636046e-07,
      "loss": 0.147,
      "step": 179
    },
    {
      "epoch": 0.935672514619883,
      "grad_norm": 1.4620646238327026,
      "learning_rate": 5.757138887522883e-07,
      "loss": 0.1249,
      "step": 180
    },
    {
      "epoch": 0.9408706952566601,
      "grad_norm": 1.65927255153656,
      "learning_rate": 5.71537360954607e-07,
      "loss": 0.163,
      "step": 181
    },
    {
      "epoch": 0.9460688758934372,
      "grad_norm": 1.5536587238311768,
      "learning_rate": 5.673557311129306e-07,
      "loss": 0.1351,
      "step": 182
    },
    {
      "epoch": 0.9512670565302144,
      "grad_norm": 1.7076836824417114,
      "learning_rate": 5.631692974610647e-07,
      "loss": 0.1771,
      "step": 183
    },
    {
      "epoch": 0.9564652371669915,
      "grad_norm": 1.4979828596115112,
      "learning_rate": 5.589783585754231e-07,
      "loss": 0.121,
      "step": 184
    },
    {
      "epoch": 0.9616634178037686,
      "grad_norm": 1.5839756727218628,
      "learning_rate": 5.547832133537327e-07,
      "loss": 0.1458,
      "step": 185
    },
    {
      "epoch": 0.9668615984405458,
      "grad_norm": 1.7546137571334839,
      "learning_rate": 5.505841609937161e-07,
      "loss": 0.1671,
      "step": 186
    },
    {
      "epoch": 0.9720597790773229,
      "grad_norm": 1.7105190753936768,
      "learning_rate": 5.463815009717532e-07,
      "loss": 0.1314,
      "step": 187
    },
    {
      "epoch": 0.9772579597141,
      "grad_norm": 1.8557852506637573,
      "learning_rate": 5.421755330215223e-07,
      "loss": 0.1794,
      "step": 188
    },
    {
      "epoch": 0.9824561403508771,
      "grad_norm": 1.569214105606079,
      "learning_rate": 5.379665571126231e-07,
      "loss": 0.1307,
      "step": 189
    },
    {
      "epoch": 0.9876543209876543,
      "grad_norm": 1.6137492656707764,
      "learning_rate": 5.337548734291826e-07,
      "loss": 0.1412,
      "step": 190
    },
    {
      "epoch": 0.9928525016244314,
      "grad_norm": 1.6707996129989624,
      "learning_rate": 5.295407823484467e-07,
      "loss": 0.1627,
      "step": 191
    },
    {
      "epoch": 0.9980506822612085,
      "grad_norm": 2.3229496479034424,
      "learning_rate": 5.253245844193564e-07,
      "loss": 0.1872,
      "step": 192
    },
    {
      "epoch": 1.0032488628979856,
      "grad_norm": 1.4374881982803345,
      "learning_rate": 5.211065803411134e-07,
      "loss": 0.1118,
      "step": 193
    },
    {
      "epoch": 1.0084470435347628,
      "grad_norm": 1.7097264528274536,
      "learning_rate": 5.168870709417341e-07,
      "loss": 0.1603,
      "step": 194
    },
    {
      "epoch": 1.01364522417154,
      "grad_norm": 1.5904935598373413,
      "learning_rate": 5.126663571565939e-07,
      "loss": 0.128,
      "step": 195
    },
    {
      "epoch": 1.018843404808317,
      "grad_norm": 1.5433835983276367,
      "learning_rate": 5.084447400069654e-07,
      "loss": 0.1192,
      "step": 196
    },
    {
      "epoch": 1.0240415854450942,
      "grad_norm": 1.40073561668396,
      "learning_rate": 5.042225205785492e-07,
      "loss": 0.1188,
      "step": 197
    },
    {
      "epoch": 1.0292397660818713,
      "grad_norm": 1.6374619007110596,
      "learning_rate": 5e-07,
      "loss": 0.1486,
      "step": 198
    },
    {
      "epoch": 1.0344379467186484,
      "grad_norm": 1.4800790548324585,
      "learning_rate": 4.957774794214508e-07,
      "loss": 0.1297,
      "step": 199
    },
    {
      "epoch": 1.0396361273554255,
      "grad_norm": 1.5941686630249023,
      "learning_rate": 4.915552599930345e-07,
      "loss": 0.1466,
      "step": 200
    },
    {
      "epoch": 1.0448343079922027,
      "grad_norm": 1.5258111953735352,
      "learning_rate": 4.873336428434061e-07,
      "loss": 0.1264,
      "step": 201
    },
    {
      "epoch": 1.0500324886289798,
      "grad_norm": 1.725998878479004,
      "learning_rate": 4.831129290582659e-07,
      "loss": 0.1334,
      "step": 202
    },
    {
      "epoch": 1.055230669265757,
      "grad_norm": 1.8626656532287598,
      "learning_rate": 4.788934196588865e-07,
      "loss": 0.1503,
      "step": 203
    },
    {
      "epoch": 1.060428849902534,
      "grad_norm": 1.4593480825424194,
      "learning_rate": 4.746754155806437e-07,
      "loss": 0.1225,
      "step": 204
    },
    {
      "epoch": 1.0656270305393112,
      "grad_norm": 1.6284171342849731,
      "learning_rate": 4.7045921765155337e-07,
      "loss": 0.1397,
      "step": 205
    },
    {
      "epoch": 1.0708252111760883,
      "grad_norm": 1.4469428062438965,
      "learning_rate": 4.662451265708174e-07,
      "loss": 0.1082,
      "step": 206
    },
    {
      "epoch": 1.0760233918128654,
      "grad_norm": 1.3986436128616333,
      "learning_rate": 4.620334428873769e-07,
      "loss": 0.1025,
      "step": 207
    },
    {
      "epoch": 1.0812215724496426,
      "grad_norm": 1.6322413682937622,
      "learning_rate": 4.5782446697847764e-07,
      "loss": 0.126,
      "step": 208
    },
    {
      "epoch": 1.0864197530864197,
      "grad_norm": 1.4022878408432007,
      "learning_rate": 4.536184990282467e-07,
      "loss": 0.0932,
      "step": 209
    },
    {
      "epoch": 1.0916179337231968,
      "grad_norm": 1.8021215200424194,
      "learning_rate": 4.4941583900628393e-07,
      "loss": 0.1662,
      "step": 210
    },
    {
      "epoch": 1.096816114359974,
      "grad_norm": 1.806060552597046,
      "learning_rate": 4.4521678664626745e-07,
      "loss": 0.1574,
      "step": 211
    },
    {
      "epoch": 1.102014294996751,
      "grad_norm": 1.7470866441726685,
      "learning_rate": 4.4102164142457705e-07,
      "loss": 0.1467,
      "step": 212
    },
    {
      "epoch": 1.1072124756335282,
      "grad_norm": 1.5723119974136353,
      "learning_rate": 4.368307025389355e-07,
      "loss": 0.1084,
      "step": 213
    },
    {
      "epoch": 1.1124106562703053,
      "grad_norm": 1.5377353429794312,
      "learning_rate": 4.326442688870696e-07,
      "loss": 0.116,
      "step": 214
    },
    {
      "epoch": 1.1176088369070825,
      "grad_norm": 1.7077683210372925,
      "learning_rate": 4.2846263904539303e-07,
      "loss": 0.1483,
      "step": 215
    },
    {
      "epoch": 1.1228070175438596,
      "grad_norm": 1.8333083391189575,
      "learning_rate": 4.242861112477118e-07,
      "loss": 0.1527,
      "step": 216
    },
    {
      "epoch": 1.1280051981806367,
      "grad_norm": 1.5769239664077759,
      "learning_rate": 4.201149833639539e-07,
      "loss": 0.121,
      "step": 217
    },
    {
      "epoch": 1.1332033788174138,
      "grad_norm": 1.919921636581421,
      "learning_rate": 4.15949552878926e-07,
      "loss": 0.173,
      "step": 218
    },
    {
      "epoch": 1.138401559454191,
      "grad_norm": 1.5571967363357544,
      "learning_rate": 4.117901168710959e-07,
      "loss": 0.1227,
      "step": 219
    },
    {
      "epoch": 1.143599740090968,
      "grad_norm": 1.6683669090270996,
      "learning_rate": 4.0763697199140546e-07,
      "loss": 0.1422,
      "step": 220
    },
    {
      "epoch": 1.1487979207277452,
      "grad_norm": 1.6401057243347168,
      "learning_rate": 4.034904144421134e-07,
      "loss": 0.1256,
      "step": 221
    },
    {
      "epoch": 1.1539961013645224,
      "grad_norm": 1.5419056415557861,
      "learning_rate": 3.9935073995566987e-07,
      "loss": 0.1302,
      "step": 222
    },
    {
      "epoch": 1.1591942820012995,
      "grad_norm": 1.650795817375183,
      "learning_rate": 3.952182437736256e-07,
      "loss": 0.1471,
      "step": 223
    },
    {
      "epoch": 1.1643924626380766,
      "grad_norm": 1.672743797302246,
      "learning_rate": 3.910932206255742e-07,
      "loss": 0.1298,
      "step": 224
    },
    {
      "epoch": 1.1695906432748537,
      "grad_norm": 1.6902425289154053,
      "learning_rate": 3.869759647081325e-07,
      "loss": 0.1414,
      "step": 225
    },
    {
      "epoch": 1.1747888239116309,
      "grad_norm": 1.607485055923462,
      "learning_rate": 3.828667696639589e-07,
      "loss": 0.1032,
      "step": 226
    },
    {
      "epoch": 1.179987004548408,
      "grad_norm": 1.5336533784866333,
      "learning_rate": 3.7876592856081e-07,
      "loss": 0.1116,
      "step": 227
    },
    {
      "epoch": 1.1851851851851851,
      "grad_norm": 1.5528396368026733,
      "learning_rate": 3.7467373387063964e-07,
      "loss": 0.1243,
      "step": 228
    },
    {
      "epoch": 1.1903833658219622,
      "grad_norm": 1.743318796157837,
      "learning_rate": 3.7059047744873955e-07,
      "loss": 0.1437,
      "step": 229
    },
    {
      "epoch": 1.1955815464587394,
      "grad_norm": 1.6484525203704834,
      "learning_rate": 3.665164505129241e-07,
      "loss": 0.131,
      "step": 230
    },
    {
      "epoch": 1.2007797270955165,
      "grad_norm": 1.6531956195831299,
      "learning_rate": 3.6245194362276094e-07,
      "loss": 0.1268,
      "step": 231
    },
    {
      "epoch": 1.2059779077322936,
      "grad_norm": 1.491297960281372,
      "learning_rate": 3.5839724665884795e-07,
      "loss": 0.1261,
      "step": 232
    },
    {
      "epoch": 1.2111760883690708,
      "grad_norm": 1.5535728931427002,
      "learning_rate": 3.5435264880213937e-07,
      "loss": 0.1233,
      "step": 233
    },
    {
      "epoch": 1.2163742690058479,
      "grad_norm": 1.6621695756912231,
      "learning_rate": 3.50318438513321e-07,
      "loss": 0.1331,
      "step": 234
    },
    {
      "epoch": 1.221572449642625,
      "grad_norm": 1.5829371213912964,
      "learning_rate": 3.462949035122376e-07,
      "loss": 0.1229,
      "step": 235
    },
    {
      "epoch": 1.2267706302794021,
      "grad_norm": 1.7693301439285278,
      "learning_rate": 3.4228233075737223e-07,
      "loss": 0.1434,
      "step": 236
    },
    {
      "epoch": 1.2319688109161793,
      "grad_norm": 1.6113789081573486,
      "learning_rate": 3.3828100642538093e-07,
      "loss": 0.1213,
      "step": 237
    },
    {
      "epoch": 1.2371669915529564,
      "grad_norm": 1.5799354314804077,
      "learning_rate": 3.342912158906821e-07,
      "loss": 0.1191,
      "step": 238
    },
    {
      "epoch": 1.2423651721897335,
      "grad_norm": 1.5467253923416138,
      "learning_rate": 3.3031324370510396e-07,
      "loss": 0.1133,
      "step": 239
    },
    {
      "epoch": 1.2475633528265107,
      "grad_norm": 1.8147982358932495,
      "learning_rate": 3.263473735775899e-07,
      "loss": 0.1391,
      "step": 240
    },
    {
      "epoch": 1.2527615334632878,
      "grad_norm": 1.702359676361084,
      "learning_rate": 3.2239388835396484e-07,
      "loss": 0.1339,
      "step": 241
    },
    {
      "epoch": 1.257959714100065,
      "grad_norm": 1.7504138946533203,
      "learning_rate": 3.184530699967627e-07,
      "loss": 0.1565,
      "step": 242
    },
    {
      "epoch": 1.263157894736842,
      "grad_norm": 1.7226463556289673,
      "learning_rate": 3.1452519956511614e-07,
      "loss": 0.1266,
      "step": 243
    },
    {
      "epoch": 1.2683560753736192,
      "grad_norm": 1.8186461925506592,
      "learning_rate": 3.1061055719471197e-07,
      "loss": 0.1347,
      "step": 244
    },
    {
      "epoch": 1.2735542560103963,
      "grad_norm": 1.6384614706039429,
      "learning_rate": 3.0670942207781204e-07,
      "loss": 0.1115,
      "step": 245
    },
    {
      "epoch": 1.2787524366471734,
      "grad_norm": 1.7369823455810547,
      "learning_rate": 3.028220724433408e-07,
      "loss": 0.129,
      "step": 246
    },
    {
      "epoch": 1.2839506172839505,
      "grad_norm": 1.6780822277069092,
      "learning_rate": 2.989487855370421e-07,
      "loss": 0.1385,
      "step": 247
    },
    {
      "epoch": 1.2891487979207277,
      "grad_norm": 1.87428879737854,
      "learning_rate": 2.9508983760170634e-07,
      "loss": 0.1435,
      "step": 248
    },
    {
      "epoch": 1.2943469785575048,
      "grad_norm": 1.7940468788146973,
      "learning_rate": 2.9124550385746856e-07,
      "loss": 0.1491,
      "step": 249
    },
    {
      "epoch": 1.299545159194282,
      "grad_norm": 1.712099552154541,
      "learning_rate": 2.8741605848217976e-07,
      "loss": 0.131,
      "step": 250
    },
    {
      "epoch": 1.304743339831059,
      "grad_norm": 1.6154824495315552,
      "learning_rate": 2.8360177459185263e-07,
      "loss": 0.1145,
      "step": 251
    },
    {
      "epoch": 1.3099415204678362,
      "grad_norm": 1.6371185779571533,
      "learning_rate": 2.7980292422118277e-07,
      "loss": 0.1232,
      "step": 252
    },
    {
      "epoch": 1.3151397011046133,
      "grad_norm": 1.8156847953796387,
      "learning_rate": 2.7601977830414766e-07,
      "loss": 0.1274,
      "step": 253
    },
    {
      "epoch": 1.3203378817413904,
      "grad_norm": 1.6596229076385498,
      "learning_rate": 2.72252606654683e-07,
      "loss": 0.1168,
      "step": 254
    },
    {
      "epoch": 1.3255360623781676,
      "grad_norm": 1.6106423139572144,
      "learning_rate": 2.685016779474396e-07,
      "loss": 0.1139,
      "step": 255
    },
    {
      "epoch": 1.3307342430149447,
      "grad_norm": 1.6363728046417236,
      "learning_rate": 2.6476725969862226e-07,
      "loss": 0.1297,
      "step": 256
    },
    {
      "epoch": 1.3359324236517218,
      "grad_norm": 1.4978957176208496,
      "learning_rate": 2.6104961824690964e-07,
      "loss": 0.1191,
      "step": 257
    },
    {
      "epoch": 1.341130604288499,
      "grad_norm": 1.5889379978179932,
      "learning_rate": 2.5734901873445956e-07,
      "loss": 0.1236,
      "step": 258
    },
    {
      "epoch": 1.346328784925276,
      "grad_norm": 1.534178376197815,
      "learning_rate": 2.536657250879988e-07,
      "loss": 0.1053,
      "step": 259
    },
    {
      "epoch": 1.3515269655620532,
      "grad_norm": 1.8409833908081055,
      "learning_rate": 2.500000000000001e-07,
      "loss": 0.1427,
      "step": 260
    },
    {
      "epoch": 1.3567251461988303,
      "grad_norm": 1.7446589469909668,
      "learning_rate": 2.4635210490994647e-07,
      "loss": 0.1194,
      "step": 261
    },
    {
      "epoch": 1.3619233268356075,
      "grad_norm": 1.7886688709259033,
      "learning_rate": 2.427222999856857e-07,
      "loss": 0.1351,
      "step": 262
    },
    {
      "epoch": 1.3671215074723846,
      "grad_norm": 1.6462031602859497,
      "learning_rate": 2.391108441048753e-07,
      "loss": 0.1249,
      "step": 263
    },
    {
      "epoch": 1.3723196881091617,
      "grad_norm": 1.8700019121170044,
      "learning_rate": 2.355179948365189e-07,
      "loss": 0.1482,
      "step": 264
    },
    {
      "epoch": 1.3775178687459388,
      "grad_norm": 1.8244132995605469,
      "learning_rate": 2.3194400842259687e-07,
      "loss": 0.134,
      "step": 265
    },
    {
      "epoch": 1.382716049382716,
      "grad_norm": 1.8189918994903564,
      "learning_rate": 2.283891397597908e-07,
      "loss": 0.1258,
      "step": 266
    },
    {
      "epoch": 1.387914230019493,
      "grad_norm": 1.5552480220794678,
      "learning_rate": 2.2485364238130433e-07,
      "loss": 0.1131,
      "step": 267
    },
    {
      "epoch": 1.3931124106562702,
      "grad_norm": 1.659328579902649,
      "learning_rate": 2.2133776843878183e-07,
      "loss": 0.1119,
      "step": 268
    },
    {
      "epoch": 1.3983105912930474,
      "grad_norm": 1.3867595195770264,
      "learning_rate": 2.1784176868432375e-07,
      "loss": 0.0851,
      "step": 269
    },
    {
      "epoch": 1.4035087719298245,
      "grad_norm": 1.7647099494934082,
      "learning_rate": 2.1436589245260372e-07,
      "loss": 0.1158,
      "step": 270
    },
    {
      "epoch": 1.4087069525666016,
      "grad_norm": 1.8926544189453125,
      "learning_rate": 2.109103876430864e-07,
      "loss": 0.1527,
      "step": 271
    },
    {
      "epoch": 1.4139051332033787,
      "grad_norm": 1.839390516281128,
      "learning_rate": 2.074755007023461e-07,
      "loss": 0.1108,
      "step": 272
    },
    {
      "epoch": 1.4191033138401559,
      "grad_norm": 1.797500729560852,
      "learning_rate": 2.040614766064913e-07,
      "loss": 0.1508,
      "step": 273
    },
    {
      "epoch": 1.424301494476933,
      "grad_norm": 1.7864497900009155,
      "learning_rate": 2.0066855884369243e-07,
      "loss": 0.1242,
      "step": 274
    },
    {
      "epoch": 1.4294996751137101,
      "grad_norm": 1.853615641593933,
      "learning_rate": 1.9729698939681644e-07,
      "loss": 0.122,
      "step": 275
    },
    {
      "epoch": 1.4346978557504872,
      "grad_norm": 1.6054631471633911,
      "learning_rate": 1.9394700872616853e-07,
      "loss": 0.1212,
      "step": 276
    },
    {
      "epoch": 1.4398960363872644,
      "grad_norm": 1.632055640220642,
      "learning_rate": 1.906188557523427e-07,
      "loss": 0.1101,
      "step": 277
    },
    {
      "epoch": 1.4450942170240415,
      "grad_norm": 1.6560664176940918,
      "learning_rate": 1.873127678391816e-07,
      "loss": 0.1217,
      "step": 278
    },
    {
      "epoch": 1.4502923976608186,
      "grad_norm": 1.4159197807312012,
      "learning_rate": 1.8402898077684803e-07,
      "loss": 0.0988,
      "step": 279
    },
    {
      "epoch": 1.4554905782975958,
      "grad_norm": 1.6314151287078857,
      "learning_rate": 1.8076772876500828e-07,
      "loss": 0.1293,
      "step": 280
    },
    {
      "epoch": 1.4606887589343729,
      "grad_norm": 1.7430942058563232,
      "learning_rate": 1.775292443961291e-07,
      "loss": 0.1401,
      "step": 281
    },
    {
      "epoch": 1.46588693957115,
      "grad_norm": 1.7857812643051147,
      "learning_rate": 1.7431375863888898e-07,
      "loss": 0.1275,
      "step": 282
    },
    {
      "epoch": 1.4710851202079271,
      "grad_norm": 1.5308443307876587,
      "learning_rate": 1.7112150082170568e-07,
      "loss": 0.1061,
      "step": 283
    },
    {
      "epoch": 1.4762833008447043,
      "grad_norm": 1.6288737058639526,
      "learning_rate": 1.679526986163804e-07,
      "loss": 0.1119,
      "step": 284
    },
    {
      "epoch": 1.4814814814814814,
      "grad_norm": 1.704690933227539,
      "learning_rate": 1.6480757802186068e-07,
      "loss": 0.1166,
      "step": 285
    },
    {
      "epoch": 1.4866796621182585,
      "grad_norm": 1.5033763647079468,
      "learning_rate": 1.6168636334812125e-07,
      "loss": 0.1045,
      "step": 286
    },
    {
      "epoch": 1.4918778427550357,
      "grad_norm": 1.4401872158050537,
      "learning_rate": 1.5858927720016706e-07,
      "loss": 0.0959,
      "step": 287
    },
    {
      "epoch": 1.4970760233918128,
      "grad_norm": 1.6706205606460571,
      "learning_rate": 1.555165404621567e-07,
      "loss": 0.1124,
      "step": 288
    },
    {
      "epoch": 1.50227420402859,
      "grad_norm": 1.8483508825302124,
      "learning_rate": 1.5246837228164905e-07,
      "loss": 0.1146,
      "step": 289
    },
    {
      "epoch": 1.507472384665367,
      "grad_norm": 1.8255398273468018,
      "learning_rate": 1.494449900539737e-07,
      "loss": 0.1413,
      "step": 290
    },
    {
      "epoch": 1.5126705653021442,
      "grad_norm": 1.8132373094558716,
      "learning_rate": 1.4644660940672627e-07,
      "loss": 0.1316,
      "step": 291
    },
    {
      "epoch": 1.5178687459389213,
      "grad_norm": 1.7580801248550415,
      "learning_rate": 1.434734441843899e-07,
      "loss": 0.1252,
      "step": 292
    },
    {
      "epoch": 1.5230669265756984,
      "grad_norm": 1.4802451133728027,
      "learning_rate": 1.4052570643308375e-07,
      "loss": 0.1087,
      "step": 293
    },
    {
      "epoch": 1.5282651072124755,
      "grad_norm": 1.595434308052063,
      "learning_rate": 1.376036063854401e-07,
      "loss": 0.1063,
      "step": 294
    },
    {
      "epoch": 1.5334632878492527,
      "grad_norm": 1.5052251815795898,
      "learning_rate": 1.3470735244561027e-07,
      "loss": 0.1071,
      "step": 295
    },
    {
      "epoch": 1.5386614684860298,
      "grad_norm": 1.4966932535171509,
      "learning_rate": 1.3183715117440142e-07,
      "loss": 0.1003,
      "step": 296
    },
    {
      "epoch": 1.543859649122807,
      "grad_norm": 1.5577093362808228,
      "learning_rate": 1.2899320727454472e-07,
      "loss": 0.1147,
      "step": 297
    },
    {
      "epoch": 1.549057829759584,
      "grad_norm": 2.081566572189331,
      "learning_rate": 1.2617572357609562e-07,
      "loss": 0.1479,
      "step": 298
    },
    {
      "epoch": 1.5542560103963612,
      "grad_norm": 1.5348504781723022,
      "learning_rate": 1.2338490102196825e-07,
      "loss": 0.1061,
      "step": 299
    },
    {
      "epoch": 1.5594541910331383,
      "grad_norm": 1.7641793489456177,
      "learning_rate": 1.2062093865360457e-07,
      "loss": 0.1359,
      "step": 300
    },
    {
      "epoch": 1.5646523716699154,
      "grad_norm": 1.5747112035751343,
      "learning_rate": 1.1788403359677767e-07,
      "loss": 0.1069,
      "step": 301
    },
    {
      "epoch": 1.5698505523066926,
      "grad_norm": 1.6333017349243164,
      "learning_rate": 1.1517438104753385e-07,
      "loss": 0.1077,
      "step": 302
    },
    {
      "epoch": 1.5750487329434697,
      "grad_norm": 1.5666186809539795,
      "learning_rate": 1.1249217425827062e-07,
      "loss": 0.1118,
      "step": 303
    },
    {
      "epoch": 1.5802469135802468,
      "grad_norm": 1.5051664113998413,
      "learning_rate": 1.0983760452395413e-07,
      "loss": 0.1043,
      "step": 304
    },
    {
      "epoch": 1.585445094217024,
      "grad_norm": 1.7494423389434814,
      "learning_rate": 1.07210861168476e-07,
      "loss": 0.1327,
      "step": 305
    },
    {
      "epoch": 1.590643274853801,
      "grad_norm": 1.5114164352416992,
      "learning_rate": 1.0461213153115079e-07,
      "loss": 0.0938,
      "step": 306
    },
    {
      "epoch": 1.5958414554905782,
      "grad_norm": 1.562703251838684,
      "learning_rate": 1.0204160095335479e-07,
      "loss": 0.1056,
      "step": 307
    },
    {
      "epoch": 1.6010396361273553,
      "grad_norm": 1.6739459037780762,
      "learning_rate": 9.94994527653078e-08,
      "loss": 0.1223,
      "step": 308
    },
    {
      "epoch": 1.6062378167641325,
      "grad_norm": 1.6659190654754639,
      "learning_rate": 9.69858682729976e-08,
      "loss": 0.1201,
      "step": 309
    },
    {
      "epoch": 1.6114359974009096,
      "grad_norm": 1.6198205947875977,
      "learning_rate": 9.45010267452495e-08,
      "loss": 0.1041,
      "step": 310
    },
    {
      "epoch": 1.6166341780376867,
      "grad_norm": 1.6569786071777344,
      "learning_rate": 9.204510540094095e-08,
      "loss": 0.1153,
      "step": 311
    },
    {
      "epoch": 1.6218323586744638,
      "grad_norm": 1.6825376749038696,
      "learning_rate": 8.961827939636196e-08,
      "loss": 0.1195,
      "step": 312
    },
    {
      "epoch": 1.627030539311241,
      "grad_norm": 1.6692954301834106,
      "learning_rate": 8.722072181272311e-08,
      "loss": 0.14,
      "step": 313
    },
    {
      "epoch": 1.632228719948018,
      "grad_norm": 1.86336350440979,
      "learning_rate": 8.485260364381186e-08,
      "loss": 0.1154,
      "step": 314
    },
    {
      "epoch": 1.6374269005847952,
      "grad_norm": 1.6375104188919067,
      "learning_rate": 8.251409378379637e-08,
      "loss": 0.1087,
      "step": 315
    },
    {
      "epoch": 1.6426250812215724,
      "grad_norm": 1.6260446310043335,
      "learning_rate": 8.02053590151805e-08,
      "loss": 0.1099,
      "step": 316
    },
    {
      "epoch": 1.6478232618583495,
      "grad_norm": 1.7146011590957642,
      "learning_rate": 7.792656399690922e-08,
      "loss": 0.1167,
      "step": 317
    },
    {
      "epoch": 1.6530214424951266,
      "grad_norm": 1.6423556804656982,
      "learning_rate": 7.567787125262449e-08,
      "loss": 0.1171,
      "step": 318
    },
    {
      "epoch": 1.6582196231319037,
      "grad_norm": 1.5510940551757812,
      "learning_rate": 7.345944115907421e-08,
      "loss": 0.1013,
      "step": 319
    },
    {
      "epoch": 1.6634178037686809,
      "grad_norm": 1.863503336906433,
      "learning_rate": 7.127143193467445e-08,
      "loss": 0.1423,
      "step": 320
    },
    {
      "epoch": 1.668615984405458,
      "grad_norm": 1.7284936904907227,
      "learning_rate": 6.911399962822518e-08,
      "loss": 0.1112,
      "step": 321
    },
    {
      "epoch": 1.6738141650422351,
      "grad_norm": 1.7205549478530884,
      "learning_rate": 6.698729810778064e-08,
      "loss": 0.136,
      "step": 322
    },
    {
      "epoch": 1.6790123456790123,
      "grad_norm": 1.4762628078460693,
      "learning_rate": 6.48914790496759e-08,
      "loss": 0.1144,
      "step": 323
    },
    {
      "epoch": 1.6842105263157894,
      "grad_norm": 1.8362925052642822,
      "learning_rate": 6.282669192770895e-08,
      "loss": 0.1328,
      "step": 324
    },
    {
      "epoch": 1.6894087069525665,
      "grad_norm": 1.4527249336242676,
      "learning_rate": 6.079308400248029e-08,
      "loss": 0.1055,
      "step": 325
    },
    {
      "epoch": 1.6946068875893436,
      "grad_norm": 1.7803164720535278,
      "learning_rate": 5.8790800310890456e-08,
      "loss": 0.1451,
      "step": 326
    },
    {
      "epoch": 1.6998050682261208,
      "grad_norm": 2.059589147567749,
      "learning_rate": 5.6819983655795936e-08,
      "loss": 0.1458,
      "step": 327
    },
    {
      "epoch": 1.705003248862898,
      "grad_norm": 1.7053784132003784,
      "learning_rate": 5.4880774595824245e-08,
      "loss": 0.1257,
      "step": 328
    },
    {
      "epoch": 1.710201429499675,
      "grad_norm": 1.97287118434906,
      "learning_rate": 5.297331143534972e-08,
      "loss": 0.1381,
      "step": 329
    },
    {
      "epoch": 1.7153996101364521,
      "grad_norm": 1.6465667486190796,
      "learning_rate": 5.109773021462921e-08,
      "loss": 0.1155,
      "step": 330
    },
    {
      "epoch": 1.7205977907732293,
      "grad_norm": 1.6381739377975464,
      "learning_rate": 4.925416470009991e-08,
      "loss": 0.1224,
      "step": 331
    },
    {
      "epoch": 1.7257959714100064,
      "grad_norm": 1.5278061628341675,
      "learning_rate": 4.744274637483936e-08,
      "loss": 0.1203,
      "step": 332
    },
    {
      "epoch": 1.7309941520467835,
      "grad_norm": 1.7712794542312622,
      "learning_rate": 4.566360442918754e-08,
      "loss": 0.1334,
      "step": 333
    },
    {
      "epoch": 1.7361923326835607,
      "grad_norm": 1.912316083908081,
      "learning_rate": 4.3916865751533306e-08,
      "loss": 0.1475,
      "step": 334
    },
    {
      "epoch": 1.7413905133203378,
      "grad_norm": 1.7500931024551392,
      "learning_rate": 4.220265491926489e-08,
      "loss": 0.1165,
      "step": 335
    },
    {
      "epoch": 1.746588693957115,
      "grad_norm": 1.7582532167434692,
      "learning_rate": 4.0521094189884696e-08,
      "loss": 0.119,
      "step": 336
    },
    {
      "epoch": 1.751786874593892,
      "grad_norm": 1.6794097423553467,
      "learning_rate": 3.887230349229015e-08,
      "loss": 0.1093,
      "step": 337
    },
    {
      "epoch": 1.7569850552306692,
      "grad_norm": 1.797913908958435,
      "learning_rate": 3.7256400418220256e-08,
      "loss": 0.1235,
      "step": 338
    },
    {
      "epoch": 1.7621832358674463,
      "grad_norm": 1.6042097806930542,
      "learning_rate": 3.567350021386895e-08,
      "loss": 0.096,
      "step": 339
    },
    {
      "epoch": 1.7673814165042234,
      "grad_norm": 1.7623355388641357,
      "learning_rate": 3.412371577166578e-08,
      "loss": 0.1153,
      "step": 340
    },
    {
      "epoch": 1.7725795971410006,
      "grad_norm": 1.737052321434021,
      "learning_rate": 3.260715762222449e-08,
      "loss": 0.1327,
      "step": 341
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 1.61103355884552,
      "learning_rate": 3.1123933926459845e-08,
| "loss": 0.1213, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 1.7829759584145548, | |
| "grad_norm": 1.78020441532135, | |
| "learning_rate": 2.9674150467873527e-08, | |
| "loss": 0.1262, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 1.788174139051332, | |
| "grad_norm": 1.7625352144241333, | |
| "learning_rate": 2.825791064500993e-08, | |
| "loss": 0.1465, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 1.793372319688109, | |
| "grad_norm": 1.7474111318588257, | |
| "learning_rate": 2.6875315464081562e-08, | |
| "loss": 0.1289, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.7985705003248862, | |
| "grad_norm": 1.7776579856872559, | |
| "learning_rate": 2.5526463531765463e-08, | |
| "loss": 0.1219, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 1.8037686809616633, | |
| "grad_norm": 1.7461159229278564, | |
| "learning_rate": 2.4211451048170296e-08, | |
| "loss": 0.1361, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 1.8089668615984404, | |
| "grad_norm": 1.701060175895691, | |
| "learning_rate": 2.293037179997559e-08, | |
| "loss": 0.1463, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.8141650422352176, | |
| "grad_norm": 1.7974237203598022, | |
| "learning_rate": 2.1683317153742775e-08, | |
| "loss": 0.14, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 1.8193632228719947, | |
| "grad_norm": 1.5575218200683594, | |
| "learning_rate": 2.047037604939894e-08, | |
| "loss": 0.1234, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.8245614035087718, | |
| "grad_norm": 1.6900442838668823, | |
| "learning_rate": 1.92916349938938e-08, | |
| "loss": 0.1119, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 1.829759584145549, | |
| "grad_norm": 1.7725763320922852, | |
| "learning_rate": 1.8147178055029577e-08, | |
| "loss": 0.1456, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 1.834957764782326, | |
| "grad_norm": 1.677262544631958, | |
| "learning_rate": 1.7037086855465898e-08, | |
| "loss": 0.1041, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 1.8401559454191032, | |
| "grad_norm": 1.5990703105926514, | |
| "learning_rate": 1.596144056689791e-08, | |
| "loss": 0.1149, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 1.8453541260558803, | |
| "grad_norm": 2.083341598510742, | |
| "learning_rate": 1.4920315904410064e-08, | |
| "loss": 0.1375, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.8505523066926575, | |
| "grad_norm": 1.7379205226898193, | |
| "learning_rate": 1.3913787121004716e-08, | |
| "loss": 0.1365, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 1.8557504873294346, | |
| "grad_norm": 1.6175756454467773, | |
| "learning_rate": 1.2941926002306536e-08, | |
| "loss": 0.1348, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 1.8609486679662117, | |
| "grad_norm": 1.7680426836013794, | |
| "learning_rate": 1.200480186144237e-08, | |
| "loss": 0.1232, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 1.8661468486029889, | |
| "grad_norm": 1.3798372745513916, | |
| "learning_rate": 1.1102481534098374e-08, | |
| "loss": 0.0966, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 1.871345029239766, | |
| "grad_norm": 1.584373950958252, | |
| "learning_rate": 1.0235029373752757e-08, | |
| "loss": 0.1082, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.876543209876543, | |
| "grad_norm": 1.7235833406448364, | |
| "learning_rate": 9.402507247086578e-09, | |
| "loss": 0.1348, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 1.8817413905133202, | |
| "grad_norm": 1.6077182292938232, | |
| "learning_rate": 8.60497452957104e-09, | |
| "loss": 0.1234, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 1.8869395711500974, | |
| "grad_norm": 1.5776941776275635, | |
| "learning_rate": 7.842488101232891e-09, | |
| "loss": 0.1177, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 1.8921377517868745, | |
| "grad_norm": 1.5909714698791504, | |
| "learning_rate": 7.115102342598101e-09, | |
| "loss": 0.1133, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 1.8973359324236516, | |
| "grad_norm": 1.7370619773864746, | |
| "learning_rate": 6.422869130812913e-09, | |
| "loss": 0.1269, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.9025341130604287, | |
| "grad_norm": 1.7405683994293213, | |
| "learning_rate": 5.765837835944309e-09, | |
| "loss": 0.1307, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 1.9077322936972059, | |
| "grad_norm": 1.9631272554397583, | |
| "learning_rate": 5.144055317458817e-09, | |
| "loss": 0.1546, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 1.912930474333983, | |
| "grad_norm": 1.6291249990463257, | |
| "learning_rate": 4.55756592088058e-09, | |
| "loss": 0.1249, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 1.9181286549707601, | |
| "grad_norm": 1.528748869895935, | |
| "learning_rate": 4.0064114746284905e-09, | |
| "loss": 0.1027, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 1.9233268356075373, | |
| "grad_norm": 1.8311703205108643, | |
| "learning_rate": 3.4906312870331965e-09, | |
| "loss": 0.1128, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.9285250162443144, | |
| "grad_norm": 1.8863190412521362, | |
| "learning_rate": 3.010262143533393e-09, | |
| "loss": 0.1441, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 1.9337231968810915, | |
| "grad_norm": 1.5785506963729858, | |
| "learning_rate": 2.5653383040524224e-09, | |
| "loss": 0.1198, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 1.9389213775178686, | |
| "grad_norm": 1.6531542539596558, | |
| "learning_rate": 2.155891500554896e-09, | |
| "loss": 0.1224, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 1.9441195581546458, | |
| "grad_norm": 1.5542500019073486, | |
| "learning_rate": 1.7819509347835049e-09, | |
| "loss": 0.1149, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 1.949317738791423, | |
| "grad_norm": 1.5082885026931763, | |
| "learning_rate": 1.4435432761762955e-09, | |
| "loss": 0.1061, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.9545159194282, | |
| "grad_norm": 1.5856765508651733, | |
| "learning_rate": 1.1406926599646372e-09, | |
| "loss": 0.1327, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 1.9597141000649771, | |
| "grad_norm": 1.6566141843795776, | |
| "learning_rate": 8.73420685452042e-10, | |
| "loss": 0.1105, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 1.9649122807017543, | |
| "grad_norm": 1.7047754526138306, | |
| "learning_rate": 6.417464144736207e-10, | |
| "loss": 0.1301, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 1.9701104613385314, | |
| "grad_norm": 1.7699244022369385, | |
| "learning_rate": 4.4568637003633556e-10, | |
| "loss": 0.1412, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 1.9753086419753085, | |
| "grad_norm": 1.8177380561828613, | |
| "learning_rate": 2.852545351409996e-10, | |
| "loss": 0.1571, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.9805068226120857, | |
| "grad_norm": 1.5334748029708862, | |
| "learning_rate": 1.6046235178474033e-10, | |
| "loss": 0.104, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 1.9857050032488628, | |
| "grad_norm": 1.7260781526565552, | |
| "learning_rate": 7.13187201450971e-11, | |
| "loss": 0.1263, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 1.99090318388564, | |
| "grad_norm": 1.6607623100280762, | |
| "learning_rate": 1.7829997945084662e-11, | |
| "loss": 0.1135, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 1.996101364522417, | |
| "grad_norm": 1.7004939317703247, | |
| "learning_rate": 0.0, | |
| "loss": 0.1315, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 1.996101364522417, | |
| "step": 384, | |
| "total_flos": 3.0550374682743276e+18, | |
| "train_loss": 0.19105235013800362, | |
| "train_runtime": 6828.3197, | |
| "train_samples_per_second": 7.211, | |
| "train_steps_per_second": 0.056 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 384, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.0550374682743276e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
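
The learning-rate trace in log_history follows the standard warmup-plus-cosine shape: it decays smoothly to exactly 0.0 at step 384, and the step-383 value of 1.7829997945084662e-11 equals 1e-06 * 0.5 * (1 + cos(pi * 371/372)), which pins down a peak of 1e-06 and a 12-step linear warmup (384 - 372). The sketch below rechecks every logged value against that formula. It is a minimal sketch, assuming the file is saved locally as trainer_state.json; the warmup length and peak are inferred from the logged values, not stored as fields.

```python
# Sketch: verify the logged learning rates against linear warmup followed
# by cosine decay to zero. PEAK_LR and WARMUP_STEPS are inferences from
# the log itself, not fields in trainer_state.json.
import json
import math

with open("trainer_state.json") as f:  # assumed local filename
    state = json.load(f)

PEAK_LR = 1e-06       # inferred peak learning rate
WARMUP_STEPS = 12     # inferred linear-warmup length
MAX_STEPS = state["max_steps"]  # 384

def expected_lr(step: int) -> float:
    if step <= WARMUP_STEPS:
        return PEAK_LR * step / WARMUP_STEPS
    progress = (step - WARMUP_STEPS) / (MAX_STEPS - WARMUP_STEPS)
    return PEAK_LR * 0.5 * (1.0 + math.cos(math.pi * progress))

checked = 0
for entry in state["log_history"]:
    if "learning_rate" not in entry:
        continue  # skip the final summary entry, which has no learning_rate
    got, want = entry["learning_rate"], expected_lr(entry["step"])
    assert math.isclose(got, want, rel_tol=1e-6), (entry["step"], got, want)
    checked += 1
print(f"all {checked} logged learning rates match warmup + cosine decay")
```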
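
With logging_steps at 1.0 there is one entry per optimizer step, so the loss curve can be read straight out of log_history: by the tail of epoch 2 the per-step loss sits around 0.10-0.15, while the run-wide train_loss of 0.191 averages in the much higher early-training losses. A minimal plotting sketch, again assuming a local trainer_state.json and that matplotlib is installed:

```python
# Sketch: plot the per-step training loss from log_history.
import json
import matplotlib.pyplot as plt

with open("trainer_state.json") as f:  # assumed local filename
    state = json.load(f)

# The final summary entry carries "train_loss" rather than "loss",
# so filtering on "loss" keeps only the per-step records.
points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
steps, losses = zip(*points)

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("loss per optimizer step (logging_steps = 1)")
plt.savefig("loss_curve.png")
```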
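
The closing summary block is internally consistent and lets a few unstated quantities be inferred: 384 steps in 6828.3 s is about 0.0562 steps/s (logged as 0.056 after rounding), and dividing 7.211 samples/s by the unrounded step rate gives an effective batch of roughly 128 sequences per optimizer step. Since train_batch_size is 1 per device, the remaining factor of ~128 would have to come from gradient accumulation and/or data parallelism, neither of which this file records; likewise 7.211 samples/s over the full runtime and num_train_epochs = 2 implies a dataset of roughly 24.6k examples. A quick arithmetic check (all inferences, not stored fields):

```python
# Back-of-envelope checks on the summary block. Effective batch size and
# dataset size are inferences from the logged throughput, not stored fields.
runtime = 6828.3197        # train_runtime, seconds
steps = 384                # global_step / max_steps
samples_per_sec = 7.211    # train_samples_per_second
epochs = 2                 # num_train_epochs

steps_per_sec = steps / runtime
print(f"steps/sec:            {steps_per_sec:.4f}")                       # ~0.0562
print(f"effective batch size: {samples_per_sec / steps_per_sec:.0f}")     # ~128
print(f"examples per epoch:   {samples_per_sec * runtime / epochs:.0f}")  # ~24620
```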