|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.975767366720517, |
|
"eval_steps": 500, |
|
"global_step": 616, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006462035541195477, |
|
"grad_norm": 15.905838012695312, |
|
"learning_rate": 1.6129032258064518e-07, |
|
"loss": 0.1599, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.012924071082390954, |
|
"grad_norm": 27.293901443481445, |
|
"learning_rate": 3.2258064516129035e-07, |
|
"loss": 0.2199, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.01938610662358643, |
|
"grad_norm": 18.97646141052246, |
|
"learning_rate": 4.838709677419355e-07, |
|
"loss": 0.2004, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.025848142164781908, |
|
"grad_norm": 26.008861541748047, |
|
"learning_rate": 6.451612903225807e-07, |
|
"loss": 0.2176, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.03231017770597738, |
|
"grad_norm": 50.078155517578125, |
|
"learning_rate": 8.064516129032258e-07, |
|
"loss": 0.1862, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.03877221324717286, |
|
"grad_norm": 3807.063720703125, |
|
"learning_rate": 9.67741935483871e-07, |
|
"loss": 0.2138, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.045234248788368334, |
|
"grad_norm": 14.361485481262207, |
|
"learning_rate": 1.1290322580645162e-06, |
|
"loss": 0.1844, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.051696284329563816, |
|
"grad_norm": 8.04658031463623, |
|
"learning_rate": 1.2903225806451614e-06, |
|
"loss": 0.1894, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.05815831987075929, |
|
"grad_norm": 7.152590274810791, |
|
"learning_rate": 1.4516129032258066e-06, |
|
"loss": 0.1658, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.06462035541195477, |
|
"grad_norm": 4.961307525634766, |
|
"learning_rate": 1.6129032258064516e-06, |
|
"loss": 0.1739, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07108239095315025, |
|
"grad_norm": 6.286808013916016, |
|
"learning_rate": 1.774193548387097e-06, |
|
"loss": 0.1784, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.07754442649434572, |
|
"grad_norm": 3.9918179512023926, |
|
"learning_rate": 1.935483870967742e-06, |
|
"loss": 0.1839, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0840064620355412, |
|
"grad_norm": 4.566269874572754, |
|
"learning_rate": 2.096774193548387e-06, |
|
"loss": 0.1794, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.09046849757673667, |
|
"grad_norm": 2.4546995162963867, |
|
"learning_rate": 2.2580645161290324e-06, |
|
"loss": 0.155, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.09693053311793215, |
|
"grad_norm": 19.017515182495117, |
|
"learning_rate": 2.4193548387096776e-06, |
|
"loss": 0.1428, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.10339256865912763, |
|
"grad_norm": 1.8924509286880493, |
|
"learning_rate": 2.580645161290323e-06, |
|
"loss": 0.1424, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.1098546042003231, |
|
"grad_norm": 1.193750023841858, |
|
"learning_rate": 2.7419354838709676e-06, |
|
"loss": 0.1369, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.11631663974151858, |
|
"grad_norm": 0.9094203114509583, |
|
"learning_rate": 2.903225806451613e-06, |
|
"loss": 0.1269, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.12277867528271405, |
|
"grad_norm": 1.336350440979004, |
|
"learning_rate": 3.0645161290322584e-06, |
|
"loss": 0.1243, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.12924071082390953, |
|
"grad_norm": 26.0351505279541, |
|
"learning_rate": 3.225806451612903e-06, |
|
"loss": 0.1165, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.13570274636510501, |
|
"grad_norm": 1.1493116617202759, |
|
"learning_rate": 3.3870967741935484e-06, |
|
"loss": 0.1543, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.1421647819063005, |
|
"grad_norm": 0.8133248686790466, |
|
"learning_rate": 3.548387096774194e-06, |
|
"loss": 0.1064, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.14862681744749595, |
|
"grad_norm": 0.8336551785469055, |
|
"learning_rate": 3.7096774193548392e-06, |
|
"loss": 0.1192, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.15508885298869143, |
|
"grad_norm": 0.7685548067092896, |
|
"learning_rate": 3.870967741935484e-06, |
|
"loss": 0.1051, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.16155088852988692, |
|
"grad_norm": 1.217137336730957, |
|
"learning_rate": 4.032258064516129e-06, |
|
"loss": 0.1377, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.1680129240710824, |
|
"grad_norm": 1.1432764530181885, |
|
"learning_rate": 4.193548387096774e-06, |
|
"loss": 0.1151, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.17447495961227788, |
|
"grad_norm": 2.054145097732544, |
|
"learning_rate": 4.35483870967742e-06, |
|
"loss": 0.1178, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.18093699515347333, |
|
"grad_norm": 0.9552474021911621, |
|
"learning_rate": 4.516129032258065e-06, |
|
"loss": 0.1363, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.18739903069466882, |
|
"grad_norm": 1.1134074926376343, |
|
"learning_rate": 4.67741935483871e-06, |
|
"loss": 0.0971, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.1938610662358643, |
|
"grad_norm": 1.2744982242584229, |
|
"learning_rate": 4.838709677419355e-06, |
|
"loss": 0.1177, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.20032310177705978, |
|
"grad_norm": 0.8065481185913086, |
|
"learning_rate": 5e-06, |
|
"loss": 0.1089, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.20678513731825526, |
|
"grad_norm": 0.8988919854164124, |
|
"learning_rate": 5.161290322580646e-06, |
|
"loss": 0.1324, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.21324717285945072, |
|
"grad_norm": 0.7514846324920654, |
|
"learning_rate": 5.322580645161291e-06, |
|
"loss": 0.1045, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.2197092084006462, |
|
"grad_norm": 0.8767650723457336, |
|
"learning_rate": 5.483870967741935e-06, |
|
"loss": 0.1188, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.22617124394184168, |
|
"grad_norm": 0.9772207736968994, |
|
"learning_rate": 5.645161290322582e-06, |
|
"loss": 0.0941, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.23263327948303716, |
|
"grad_norm": 1.004211664199829, |
|
"learning_rate": 5.806451612903226e-06, |
|
"loss": 0.1063, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.23909531502423265, |
|
"grad_norm": 0.753398597240448, |
|
"learning_rate": 5.967741935483872e-06, |
|
"loss": 0.0908, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.2455573505654281, |
|
"grad_norm": 0.8244940042495728, |
|
"learning_rate": 6.129032258064517e-06, |
|
"loss": 0.1037, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.2520193861066236, |
|
"grad_norm": 0.9269906282424927, |
|
"learning_rate": 6.290322580645162e-06, |
|
"loss": 0.121, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.25848142164781907, |
|
"grad_norm": 0.8812072277069092, |
|
"learning_rate": 6.451612903225806e-06, |
|
"loss": 0.1229, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2649434571890145, |
|
"grad_norm": 1.0441511869430542, |
|
"learning_rate": 6.612903225806452e-06, |
|
"loss": 0.1201, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.27140549273021003, |
|
"grad_norm": 0.819930374622345, |
|
"learning_rate": 6.774193548387097e-06, |
|
"loss": 0.1175, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.2778675282714055, |
|
"grad_norm": 0.7328965067863464, |
|
"learning_rate": 6.935483870967743e-06, |
|
"loss": 0.0865, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.284329563812601, |
|
"grad_norm": 0.7510896325111389, |
|
"learning_rate": 7.096774193548388e-06, |
|
"loss": 0.0874, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.29079159935379645, |
|
"grad_norm": 0.8940995335578918, |
|
"learning_rate": 7.258064516129033e-06, |
|
"loss": 0.1069, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2972536348949919, |
|
"grad_norm": 1.0601608753204346, |
|
"learning_rate": 7.4193548387096784e-06, |
|
"loss": 0.1034, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.3037156704361874, |
|
"grad_norm": 0.6812165379524231, |
|
"learning_rate": 7.580645161290323e-06, |
|
"loss": 0.083, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.31017770597738287, |
|
"grad_norm": 0.7888991236686707, |
|
"learning_rate": 7.741935483870968e-06, |
|
"loss": 0.1021, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3166397415185784, |
|
"grad_norm": 0.7519361972808838, |
|
"learning_rate": 7.903225806451613e-06, |
|
"loss": 0.095, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.32310177705977383, |
|
"grad_norm": 0.8417134881019592, |
|
"learning_rate": 8.064516129032258e-06, |
|
"loss": 0.0904, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3295638126009693, |
|
"grad_norm": 0.7418419122695923, |
|
"learning_rate": 8.225806451612904e-06, |
|
"loss": 0.1081, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.3360258481421648, |
|
"grad_norm": 0.7588947415351868, |
|
"learning_rate": 8.387096774193549e-06, |
|
"loss": 0.106, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.34248788368336025, |
|
"grad_norm": 0.5819141864776611, |
|
"learning_rate": 8.548387096774194e-06, |
|
"loss": 0.0767, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.34894991922455576, |
|
"grad_norm": 0.7023757100105286, |
|
"learning_rate": 8.70967741935484e-06, |
|
"loss": 0.1041, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.3554119547657512, |
|
"grad_norm": 0.6532196402549744, |
|
"learning_rate": 8.870967741935484e-06, |
|
"loss": 0.0815, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.36187399030694667, |
|
"grad_norm": 0.7267298102378845, |
|
"learning_rate": 9.03225806451613e-06, |
|
"loss": 0.0994, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.3683360258481422, |
|
"grad_norm": 0.6799736618995667, |
|
"learning_rate": 9.193548387096775e-06, |
|
"loss": 0.0867, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.37479806138933763, |
|
"grad_norm": 0.6375831961631775, |
|
"learning_rate": 9.35483870967742e-06, |
|
"loss": 0.082, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.38126009693053314, |
|
"grad_norm": 0.5718384385108948, |
|
"learning_rate": 9.516129032258065e-06, |
|
"loss": 0.0769, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.3877221324717286, |
|
"grad_norm": 0.682659924030304, |
|
"learning_rate": 9.67741935483871e-06, |
|
"loss": 0.0744, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.39418416801292405, |
|
"grad_norm": 0.7422767877578735, |
|
"learning_rate": 9.838709677419356e-06, |
|
"loss": 0.0838, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.40064620355411956, |
|
"grad_norm": 0.6545633673667908, |
|
"learning_rate": 1e-05, |
|
"loss": 0.083, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.407108239095315, |
|
"grad_norm": 0.6176275014877319, |
|
"learning_rate": 9.99991960684721e-06, |
|
"loss": 0.0819, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.4135702746365105, |
|
"grad_norm": 0.6768634915351868, |
|
"learning_rate": 9.999678429974063e-06, |
|
"loss": 0.0742, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.420032310177706, |
|
"grad_norm": 0.727836549282074, |
|
"learning_rate": 9.999276477136145e-06, |
|
"loss": 0.0915, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.42649434571890144, |
|
"grad_norm": 0.6889824867248535, |
|
"learning_rate": 9.998713761259157e-06, |
|
"loss": 0.0805, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.43295638126009695, |
|
"grad_norm": 0.7547912001609802, |
|
"learning_rate": 9.997990300438505e-06, |
|
"loss": 0.0852, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.4394184168012924, |
|
"grad_norm": 0.5975009202957153, |
|
"learning_rate": 9.997106117938704e-06, |
|
"loss": 0.0818, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.4458804523424879, |
|
"grad_norm": 1.1194217205047607, |
|
"learning_rate": 9.996061242192645e-06, |
|
"loss": 0.0961, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.45234248788368336, |
|
"grad_norm": 0.6122073531150818, |
|
"learning_rate": 9.994855706800666e-06, |
|
"loss": 0.0714, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4588045234248788, |
|
"grad_norm": 19.22555923461914, |
|
"learning_rate": 9.993489550529486e-06, |
|
"loss": 0.0691, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.46526655896607433, |
|
"grad_norm": 0.468181848526001, |
|
"learning_rate": 9.991962817310947e-06, |
|
"loss": 0.0661, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.4717285945072698, |
|
"grad_norm": 13.211414337158203, |
|
"learning_rate": 9.990275556240612e-06, |
|
"loss": 0.0849, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.4781906300484653, |
|
"grad_norm": 0.6043105125427246, |
|
"learning_rate": 9.98842782157617e-06, |
|
"loss": 0.0803, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.48465266558966075, |
|
"grad_norm": 0.6836904883384705, |
|
"learning_rate": 9.986419672735712e-06, |
|
"loss": 0.0984, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.4911147011308562, |
|
"grad_norm": 0.7549227476119995, |
|
"learning_rate": 9.9842511742958e-06, |
|
"loss": 0.0682, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.4975767366720517, |
|
"grad_norm": 0.671994149684906, |
|
"learning_rate": 9.981922395989409e-06, |
|
"loss": 0.07, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.5040387722132472, |
|
"grad_norm": 0.8292804956436157, |
|
"learning_rate": 9.97943341270367e-06, |
|
"loss": 0.0918, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.5105008077544426, |
|
"grad_norm": 0.5530771017074585, |
|
"learning_rate": 9.976784304477467e-06, |
|
"loss": 0.0768, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.5169628432956381, |
|
"grad_norm": 0.5503768920898438, |
|
"learning_rate": 9.973975156498866e-06, |
|
"loss": 0.0923, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5234248788368336, |
|
"grad_norm": 0.46011969447135925, |
|
"learning_rate": 9.971006059102369e-06, |
|
"loss": 0.063, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.529886914378029, |
|
"grad_norm": 0.5449568033218384, |
|
"learning_rate": 9.96787710776602e-06, |
|
"loss": 0.0667, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.5363489499192245, |
|
"grad_norm": 0.5621317028999329, |
|
"learning_rate": 9.964588403108324e-06, |
|
"loss": 0.0568, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.5428109854604201, |
|
"grad_norm": 0.4867885410785675, |
|
"learning_rate": 9.961140050885014e-06, |
|
"loss": 0.0681, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.5492730210016155, |
|
"grad_norm": 0.5386512279510498, |
|
"learning_rate": 9.957532161985654e-06, |
|
"loss": 0.0711, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.555735056542811, |
|
"grad_norm": 0.4690256416797638, |
|
"learning_rate": 9.95376485243007e-06, |
|
"loss": 0.0667, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.5621970920840065, |
|
"grad_norm": 0.5720282793045044, |
|
"learning_rate": 9.949838243364614e-06, |
|
"loss": 0.0939, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.568659127625202, |
|
"grad_norm": 0.6215516924858093, |
|
"learning_rate": 9.945752461058286e-06, |
|
"loss": 0.0691, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.5751211631663974, |
|
"grad_norm": 0.4669437110424042, |
|
"learning_rate": 9.941507636898651e-06, |
|
"loss": 0.0685, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.5815831987075929, |
|
"grad_norm": 0.4649712145328522, |
|
"learning_rate": 9.937103907387626e-06, |
|
"loss": 0.074, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5880452342487884, |
|
"grad_norm": 0.7192949056625366, |
|
"learning_rate": 9.932541414137096e-06, |
|
"loss": 0.0965, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.5945072697899838, |
|
"grad_norm": 13.985039710998535, |
|
"learning_rate": 9.927820303864342e-06, |
|
"loss": 0.0651, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.6009693053311793, |
|
"grad_norm": 0.5609593987464905, |
|
"learning_rate": 9.922940728387345e-06, |
|
"loss": 0.0796, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.6074313408723748, |
|
"grad_norm": 1.4020167589187622, |
|
"learning_rate": 9.917902844619885e-06, |
|
"loss": 0.0657, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.6138933764135702, |
|
"grad_norm": 0.5554618239402771, |
|
"learning_rate": 9.912706814566504e-06, |
|
"loss": 0.0676, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.6203554119547657, |
|
"grad_norm": 0.6547884345054626, |
|
"learning_rate": 9.907352805317301e-06, |
|
"loss": 0.109, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.6268174474959612, |
|
"grad_norm": 0.6062126755714417, |
|
"learning_rate": 9.901840989042547e-06, |
|
"loss": 0.0864, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.6332794830371568, |
|
"grad_norm": 0.4889056980609894, |
|
"learning_rate": 9.896171542987158e-06, |
|
"loss": 0.0632, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.6397415185783522, |
|
"grad_norm": 0.6889820098876953, |
|
"learning_rate": 9.890344649464992e-06, |
|
"loss": 0.0782, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.6462035541195477, |
|
"grad_norm": 1.3493472337722778, |
|
"learning_rate": 9.884360495852984e-06, |
|
"loss": 0.0779, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6526655896607432, |
|
"grad_norm": 0.5208078026771545, |
|
"learning_rate": 9.878219274585125e-06, |
|
"loss": 0.072, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.6591276252019386, |
|
"grad_norm": 0.6799489259719849, |
|
"learning_rate": 9.871921183146272e-06, |
|
"loss": 0.0718, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.6655896607431341, |
|
"grad_norm": 0.5299189686775208, |
|
"learning_rate": 9.865466424065792e-06, |
|
"loss": 0.0687, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.6720516962843296, |
|
"grad_norm": 0.6351456046104431, |
|
"learning_rate": 9.858855204911065e-06, |
|
"loss": 0.0892, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.678513731825525, |
|
"grad_norm": 0.567482054233551, |
|
"learning_rate": 9.852087738280784e-06, |
|
"loss": 0.0778, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.6849757673667205, |
|
"grad_norm": 0.5811858773231506, |
|
"learning_rate": 9.845164241798143e-06, |
|
"loss": 0.0709, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.691437802907916, |
|
"grad_norm": 0.600745677947998, |
|
"learning_rate": 9.838084938103832e-06, |
|
"loss": 0.0721, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.6978998384491115, |
|
"grad_norm": 0.5404493808746338, |
|
"learning_rate": 9.830850054848859e-06, |
|
"loss": 0.0651, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.7043618739903069, |
|
"grad_norm": 0.6496622562408447, |
|
"learning_rate": 9.823459824687262e-06, |
|
"loss": 0.0794, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.7108239095315024, |
|
"grad_norm": 0.5033782124519348, |
|
"learning_rate": 9.815914485268598e-06, |
|
"loss": 0.0866, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7172859450726979, |
|
"grad_norm": 0.48423922061920166, |
|
"learning_rate": 9.808214279230317e-06, |
|
"loss": 0.0681, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.7237479806138933, |
|
"grad_norm": 0.5603338479995728, |
|
"learning_rate": 9.800359454189955e-06, |
|
"loss": 0.0894, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.7302100161550888, |
|
"grad_norm": 0.554610550403595, |
|
"learning_rate": 9.792350262737173e-06, |
|
"loss": 0.0735, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.7366720516962844, |
|
"grad_norm": 0.4603743553161621, |
|
"learning_rate": 9.784186962425633e-06, |
|
"loss": 0.0546, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.7431340872374798, |
|
"grad_norm": 0.4438991844654083, |
|
"learning_rate": 9.775869815764713e-06, |
|
"loss": 0.0571, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7495961227786753, |
|
"grad_norm": 0.6239739060401917, |
|
"learning_rate": 9.76739909021107e-06, |
|
"loss": 0.0997, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.7560581583198708, |
|
"grad_norm": 0.47505640983581543, |
|
"learning_rate": 9.758775058160037e-06, |
|
"loss": 0.0755, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.7625201938610663, |
|
"grad_norm": 0.5345163345336914, |
|
"learning_rate": 9.749997996936866e-06, |
|
"loss": 0.0754, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.7689822294022617, |
|
"grad_norm": 0.4881122410297394, |
|
"learning_rate": 9.741068188787806e-06, |
|
"loss": 0.0787, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.7754442649434572, |
|
"grad_norm": 0.6172494888305664, |
|
"learning_rate": 9.731985920871028e-06, |
|
"loss": 0.0737, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7819063004846527, |
|
"grad_norm": 0.4486066401004791, |
|
"learning_rate": 9.722751485247393e-06, |
|
"loss": 0.0702, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.7883683360258481, |
|
"grad_norm": 1.1322523355484009, |
|
"learning_rate": 9.713365178871061e-06, |
|
"loss": 0.0766, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.7948303715670436, |
|
"grad_norm": 0.44906798005104065, |
|
"learning_rate": 9.703827303579936e-06, |
|
"loss": 0.0635, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.8012924071082391, |
|
"grad_norm": 0.5776321887969971, |
|
"learning_rate": 9.694138166085964e-06, |
|
"loss": 0.0733, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.8077544426494345, |
|
"grad_norm": 0.45236676931381226, |
|
"learning_rate": 9.684298077965269e-06, |
|
"loss": 0.0653, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.81421647819063, |
|
"grad_norm": 0.4732246696949005, |
|
"learning_rate": 9.674307355648136e-06, |
|
"loss": 0.0647, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.8206785137318255, |
|
"grad_norm": 0.5663241744041443, |
|
"learning_rate": 9.664166320408828e-06, |
|
"loss": 0.0634, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.827140549273021, |
|
"grad_norm": 0.6197142004966736, |
|
"learning_rate": 9.653875298355264e-06, |
|
"loss": 0.0854, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.8336025848142165, |
|
"grad_norm": 0.6459212899208069, |
|
"learning_rate": 9.64343462041853e-06, |
|
"loss": 0.0744, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.840064620355412, |
|
"grad_norm": 0.6052826046943665, |
|
"learning_rate": 9.63284462234223e-06, |
|
"loss": 0.0882, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8465266558966075, |
|
"grad_norm": 0.6480473875999451, |
|
"learning_rate": 9.622105644671698e-06, |
|
"loss": 0.0721, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.8529886914378029, |
|
"grad_norm": 0.5728505849838257, |
|
"learning_rate": 9.611218032743044e-06, |
|
"loss": 0.0929, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.8594507269789984, |
|
"grad_norm": 0.5130248665809631, |
|
"learning_rate": 9.600182136672048e-06, |
|
"loss": 0.0738, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.8659127625201939, |
|
"grad_norm": 0.4673842787742615, |
|
"learning_rate": 9.5889983113429e-06, |
|
"loss": 0.0631, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.8723747980613893, |
|
"grad_norm": 0.4138599932193756, |
|
"learning_rate": 9.57766691639679e-06, |
|
"loss": 0.0639, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.8788368336025848, |
|
"grad_norm": 0.520550549030304, |
|
"learning_rate": 9.56618831622035e-06, |
|
"loss": 0.0615, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.8852988691437803, |
|
"grad_norm": 0.5906829833984375, |
|
"learning_rate": 9.554562879933917e-06, |
|
"loss": 0.0767, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.8917609046849758, |
|
"grad_norm": 0.528221070766449, |
|
"learning_rate": 9.542790981379683e-06, |
|
"loss": 0.0668, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.8982229402261712, |
|
"grad_norm": 0.5368471741676331, |
|
"learning_rate": 9.530872999109665e-06, |
|
"loss": 0.068, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.9046849757673667, |
|
"grad_norm": 0.5090873837471008, |
|
"learning_rate": 9.51880931637353e-06, |
|
"loss": 0.0615, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.9111470113085622, |
|
"grad_norm": 0.4326649308204651, |
|
"learning_rate": 9.506600321106273e-06, |
|
"loss": 0.0612, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.9176090468497576, |
|
"grad_norm": 0.5169522762298584, |
|
"learning_rate": 9.494246405915743e-06, |
|
"loss": 0.0707, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.9240710823909531, |
|
"grad_norm": 0.5026911497116089, |
|
"learning_rate": 9.481747968070018e-06, |
|
"loss": 0.064, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.9305331179321487, |
|
"grad_norm": 0.45077645778656006, |
|
"learning_rate": 9.469105409484628e-06, |
|
"loss": 0.053, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.9369951534733441, |
|
"grad_norm": 0.46447330713272095, |
|
"learning_rate": 9.456319136709628e-06, |
|
"loss": 0.0679, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.9434571890145396, |
|
"grad_norm": 0.43717750906944275, |
|
"learning_rate": 9.443389560916532e-06, |
|
"loss": 0.0734, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.9499192245557351, |
|
"grad_norm": 0.4999764561653137, |
|
"learning_rate": 9.430317097885082e-06, |
|
"loss": 0.0764, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.9563812600969306, |
|
"grad_norm": 0.5041486024856567, |
|
"learning_rate": 9.417102167989888e-06, |
|
"loss": 0.0776, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.962843295638126, |
|
"grad_norm": 0.5824252367019653, |
|
"learning_rate": 9.403745196186904e-06, |
|
"loss": 0.0871, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.9693053311793215, |
|
"grad_norm": 0.44613441824913025, |
|
"learning_rate": 9.390246611999754e-06, |
|
"loss": 0.0753, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.975767366720517, |
|
"grad_norm": 0.5553467273712158, |
|
"learning_rate": 9.376606849505939e-06, |
|
"loss": 0.0788, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.9822294022617124, |
|
"grad_norm": 0.47406890988349915, |
|
"learning_rate": 9.362826347322857e-06, |
|
"loss": 0.0612, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.9886914378029079, |
|
"grad_norm": 0.36964505910873413, |
|
"learning_rate": 9.348905548593722e-06, |
|
"loss": 0.0683, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.9951534733441034, |
|
"grad_norm": 0.4598817229270935, |
|
"learning_rate": 9.334844900973292e-06, |
|
"loss": 0.0718, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.4598817229270935, |
|
"learning_rate": 9.320644856613482e-06, |
|
"loss": 0.0629, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.0064620355411955, |
|
"grad_norm": 0.4689626395702362, |
|
"learning_rate": 9.306305872148826e-06, |
|
"loss": 0.0471, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.012924071082391, |
|
"grad_norm": 0.37304380536079407, |
|
"learning_rate": 9.291828408681796e-06, |
|
"loss": 0.0471, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.0193861066235865, |
|
"grad_norm": 0.45355430245399475, |
|
"learning_rate": 9.277212931767958e-06, |
|
"loss": 0.0497, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.0258481421647818, |
|
"grad_norm": 0.4865805506706238, |
|
"learning_rate": 9.262459911401025e-06, |
|
"loss": 0.0662, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.0323101777059773, |
|
"grad_norm": 0.35557398200035095, |
|
"learning_rate": 9.247569821997724e-06, |
|
"loss": 0.0397, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.0387722132471728, |
|
"grad_norm": 0.34070008993148804, |
|
"learning_rate": 9.232543142382546e-06, |
|
"loss": 0.0614, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.0452342487883683, |
|
"grad_norm": 0.4690254330635071, |
|
"learning_rate": 9.217380355772353e-06, |
|
"loss": 0.0484, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.0516962843295639, |
|
"grad_norm": 0.3816601634025574, |
|
"learning_rate": 9.202081949760833e-06, |
|
"loss": 0.045, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.0581583198707594, |
|
"grad_norm": 0.7278222441673279, |
|
"learning_rate": 9.186648416302823e-06, |
|
"loss": 0.0497, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.0646203554119547, |
|
"grad_norm": 0.3829418420791626, |
|
"learning_rate": 9.171080251698488e-06, |
|
"loss": 0.0371, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.0710823909531502, |
|
"grad_norm": 0.33008721470832825, |
|
"learning_rate": 9.155377956577363e-06, |
|
"loss": 0.0561, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.0775444264943457, |
|
"grad_norm": 0.47766488790512085, |
|
"learning_rate": 9.13954203588225e-06, |
|
"loss": 0.0528, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.0840064620355412, |
|
"grad_norm": 0.3429339528083801, |
|
"learning_rate": 9.123572998852988e-06, |
|
"loss": 0.0524, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.0904684975767367, |
|
"grad_norm": 0.4901416003704071, |
|
"learning_rate": 9.107471359010069e-06, |
|
"loss": 0.0512, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.0969305331179322, |
|
"grad_norm": 0.4330100119113922, |
|
"learning_rate": 9.091237634138133e-06, |
|
"loss": 0.054, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.1033925686591277, |
|
"grad_norm": 0.3829297721385956, |
|
"learning_rate": 9.074872346269305e-06, |
|
"loss": 0.0414, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.109854604200323, |
|
"grad_norm": 0.3282840847969055, |
|
"learning_rate": 9.058376021666424e-06, |
|
"loss": 0.0476, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.1163166397415185, |
|
"grad_norm": 0.39873006939888, |
|
"learning_rate": 9.041749190806105e-06, |
|
"loss": 0.0514, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.122778675282714, |
|
"grad_norm": 0.44557517766952515, |
|
"learning_rate": 9.024992388361691e-06, |
|
"loss": 0.0435, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.1292407108239095, |
|
"grad_norm": 3.866434097290039, |
|
"learning_rate": 9.008106153186055e-06, |
|
"loss": 0.0497, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.135702746365105, |
|
"grad_norm": 0.3858250677585602, |
|
"learning_rate": 8.991091028294268e-06, |
|
"loss": 0.0464, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.1421647819063006, |
|
"grad_norm": 0.47651711106300354, |
|
"learning_rate": 8.973947560846146e-06, |
|
"loss": 0.0376, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.148626817447496, |
|
"grad_norm": 0.7432534694671631, |
|
"learning_rate": 8.956676302128646e-06, |
|
"loss": 0.0436, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.1550888529886914, |
|
"grad_norm": 0.523420512676239, |
|
"learning_rate": 8.939277807538147e-06, |
|
"loss": 0.0619, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.1615508885298869, |
|
"grad_norm": 0.463724821805954, |
|
"learning_rate": 8.921752636562582e-06, |
|
"loss": 0.0455, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.1680129240710824, |
|
"grad_norm": 0.45723769068717957, |
|
"learning_rate": 8.90410135276345e-06, |
|
"loss": 0.0421, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.1744749596122779, |
|
"grad_norm": 0.38577011227607727, |
|
"learning_rate": 8.886324523757692e-06, |
|
"loss": 0.0531, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.1809369951534734, |
|
"grad_norm": 0.5292965173721313, |
|
"learning_rate": 8.868422721199442e-06, |
|
"loss": 0.0635, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.187399030694669, |
|
"grad_norm": 0.3577275574207306, |
|
"learning_rate": 8.850396520761636e-06, |
|
"loss": 0.0458, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.1938610662358644, |
|
"grad_norm": 0.4251251518726349, |
|
"learning_rate": 8.832246502117512e-06, |
|
"loss": 0.0363, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.2003231017770597, |
|
"grad_norm": 3.846379041671753, |
|
"learning_rate": 8.813973248921958e-06, |
|
"loss": 0.0512, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.2067851373182552, |
|
"grad_norm": 0.41917675733566284, |
|
"learning_rate": 8.795577348792748e-06, |
|
"loss": 0.0421, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.2132471728594507, |
|
"grad_norm": 0.5150820016860962, |
|
"learning_rate": 8.777059393291645e-06, |
|
"loss": 0.0413, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.2197092084006462, |
|
"grad_norm": 0.3357498049736023, |
|
"learning_rate": 8.75841997790538e-06, |
|
"loss": 0.0422, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.2261712439418417, |
|
"grad_norm": 0.37925323843955994, |
|
"learning_rate": 8.739659702026502e-06, |
|
"loss": 0.0448, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.2326332794830372, |
|
"grad_norm": 0.8641751408576965, |
|
"learning_rate": 8.7207791689341e-06, |
|
"loss": 0.0599, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.2390953150242328, |
|
"grad_norm": 0.4052288234233856, |
|
"learning_rate": 8.701778985774405e-06, |
|
"loss": 0.0503, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.245557350565428, |
|
"grad_norm": 0.45619818568229675, |
|
"learning_rate": 8.68265976354127e-06, |
|
"loss": 0.0483, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.2520193861066236, |
|
"grad_norm": 0.35979220271110535, |
|
"learning_rate": 8.663422117056519e-06, |
|
"loss": 0.0413, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.258481421647819, |
|
"grad_norm": 0.3667192757129669, |
|
"learning_rate": 8.644066664950169e-06, |
|
"loss": 0.0383, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.2649434571890146, |
|
"grad_norm": 0.3911844491958618, |
|
"learning_rate": 8.62459402964055e-06, |
|
"loss": 0.0461, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.27140549273021, |
|
"grad_norm": 0.5011119842529297, |
|
"learning_rate": 8.605004837314277e-06, |
|
"loss": 0.0492, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.2778675282714054, |
|
"grad_norm": 0.562304675579071, |
|
"learning_rate": 8.585299717906127e-06, |
|
"loss": 0.0668, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.284329563812601, |
|
"grad_norm": 0.5580583214759827, |
|
"learning_rate": 8.565479305078767e-06, |
|
"loss": 0.0352, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.2907915993537964, |
|
"grad_norm": 0.5646708011627197, |
|
"learning_rate": 8.54554423620239e-06, |
|
"loss": 0.0489, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.297253634894992, |
|
"grad_norm": 0.5223414897918701, |
|
"learning_rate": 8.525495152334211e-06, |
|
"loss": 0.0447, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.3037156704361874, |
|
"grad_norm": 0.4284669756889343, |
|
"learning_rate": 8.505332698197853e-06, |
|
"loss": 0.0518, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.310177705977383, |
|
"grad_norm": 0.4862108826637268, |
|
"learning_rate": 8.48505752216262e-06, |
|
"loss": 0.0389, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.3166397415185784, |
|
"grad_norm": 0.4296407103538513, |
|
"learning_rate": 8.464670276222642e-06, |
|
"loss": 0.0509, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.3231017770597737, |
|
"grad_norm": 0.40503671765327454, |
|
"learning_rate": 8.444171615975909e-06, |
|
"loss": 0.0493, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.3295638126009692, |
|
"grad_norm": 0.39369431138038635, |
|
"learning_rate": 8.423562200603192e-06, |
|
"loss": 0.0594, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.3360258481421647, |
|
"grad_norm": 0.5622990727424622, |
|
"learning_rate": 8.402842692846842e-06, |
|
"loss": 0.039, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.3424878836833603, |
|
"grad_norm": 0.4047924280166626, |
|
"learning_rate": 8.38201375898948e-06, |
|
"loss": 0.0363, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.3489499192245558, |
|
"grad_norm": 0.4414857029914856, |
|
"learning_rate": 8.361076068832574e-06, |
|
"loss": 0.0582, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.3554119547657513, |
|
"grad_norm": 0.43151915073394775, |
|
"learning_rate": 8.340030295674887e-06, |
|
"loss": 0.0433, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.3618739903069468, |
|
"grad_norm": 0.49384137988090515, |
|
"learning_rate": 8.31887711629085e-06, |
|
"loss": 0.0514, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.368336025848142, |
|
"grad_norm": 0.4050130844116211, |
|
"learning_rate": 8.29761721090877e-06, |
|
"loss": 0.0448, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.3747980613893376, |
|
"grad_norm": 0.351788192987442, |
|
"learning_rate": 8.276251263188976e-06, |
|
"loss": 0.0415, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.381260096930533, |
|
"grad_norm": 0.3712752163410187, |
|
"learning_rate": 8.254779960201831e-06, |
|
"loss": 0.0428, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.3877221324717286, |
|
"grad_norm": 0.408925861120224, |
|
"learning_rate": 8.23320399240563e-06, |
|
"loss": 0.0527, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.394184168012924, |
|
"grad_norm": 0.3616960942745209, |
|
"learning_rate": 8.2115240536244e-06, |
|
"loss": 0.0323, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.4006462035541196, |
|
"grad_norm": 0.3473566472530365, |
|
"learning_rate": 8.1897408410256e-06, |
|
"loss": 0.0463, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.4071082390953151, |
|
"grad_norm": 0.477464497089386, |
|
"learning_rate": 8.16785505509768e-06, |
|
"loss": 0.0521, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.4135702746365104, |
|
"grad_norm": 0.6549597978591919, |
|
"learning_rate": 8.145867399627575e-06, |
|
"loss": 0.0467, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.420032310177706, |
|
"grad_norm": 0.46223175525665283, |
|
"learning_rate": 8.123778581678064e-06, |
|
"loss": 0.0444, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.4264943457189014, |
|
"grad_norm": 0.5096275210380554, |
|
"learning_rate": 8.10158931156503e-06, |
|
"loss": 0.0451, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.432956381260097, |
|
"grad_norm": 0.37000322341918945, |
|
"learning_rate": 8.079300302834632e-06, |
|
"loss": 0.051, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.4394184168012925, |
|
"grad_norm": 0.671576738357544, |
|
"learning_rate": 8.056912272240338e-06, |
|
"loss": 0.0466, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.445880452342488, |
|
"grad_norm": 0.35391512513160706, |
|
"learning_rate": 8.034425939719896e-06, |
|
"loss": 0.0528, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.4523424878836835, |
|
"grad_norm": 0.35577550530433655, |
|
"learning_rate": 8.011842028372175e-06, |
|
"loss": 0.047, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.4588045234248788, |
|
"grad_norm": 0.4128228425979614, |
|
"learning_rate": 7.989161264433904e-06, |
|
"loss": 0.04, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.4652665589660743, |
|
"grad_norm": 3.7340619564056396, |
|
"learning_rate": 7.966384377256335e-06, |
|
"loss": 0.0509, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.4717285945072698, |
|
"grad_norm": 0.4408370852470398, |
|
"learning_rate": 7.943512099281776e-06, |
|
"loss": 0.0533, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.4781906300484653, |
|
"grad_norm": 0.3726482093334198, |
|
"learning_rate": 7.92054516602004e-06, |
|
"loss": 0.0395, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.4846526655896608, |
|
"grad_norm": 0.40403544902801514, |
|
"learning_rate": 7.897484316024799e-06, |
|
"loss": 0.0582, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.491114701130856, |
|
"grad_norm": 0.4779585301876068, |
|
"learning_rate": 7.874330290869829e-06, |
|
"loss": 0.0422, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.4975767366720518, |
|
"grad_norm": 0.46821513772010803, |
|
"learning_rate": 7.85108383512516e-06, |
|
"loss": 0.0551, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.504038772213247, |
|
"grad_norm": 0.3851401209831238, |
|
"learning_rate": 7.827745696333139e-06, |
|
"loss": 0.0542, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.5105008077544426, |
|
"grad_norm": 0.3976247310638428, |
|
"learning_rate": 7.804316624984391e-06, |
|
"loss": 0.0444, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.5169628432956381, |
|
"grad_norm": 0.3913721740245819, |
|
"learning_rate": 7.780797374493683e-06, |
|
"loss": 0.0421, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.5234248788368336, |
|
"grad_norm": 0.38424035906791687, |
|
"learning_rate": 7.757188701175688e-06, |
|
"loss": 0.057, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.5298869143780292, |
|
"grad_norm": 0.36402031779289246, |
|
"learning_rate": 7.733491364220686e-06, |
|
"loss": 0.0599, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.5363489499192244, |
|
"grad_norm": 0.5822838544845581, |
|
"learning_rate": 7.709706125670124e-06, |
|
"loss": 0.0418, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.5428109854604202, |
|
"grad_norm": 0.38915562629699707, |
|
"learning_rate": 7.685833750392131e-06, |
|
"loss": 0.0462, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.5492730210016155, |
|
"grad_norm": 0.41611573100090027, |
|
"learning_rate": 7.661875006056914e-06, |
|
"loss": 0.0363, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.555735056542811, |
|
"grad_norm": 0.3718394637107849, |
|
"learning_rate": 7.637830663112064e-06, |
|
"loss": 0.0562, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.5621970920840065, |
|
"grad_norm": 0.5133814811706543, |
|
"learning_rate": 7.613701494757803e-06, |
|
"loss": 0.0566, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.568659127625202, |
|
"grad_norm": 0.496139794588089, |
|
"learning_rate": 7.589488276922095e-06, |
|
"loss": 0.053, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.5751211631663975, |
|
"grad_norm": 0.43612685799598694, |
|
"learning_rate": 7.5651917882357075e-06, |
|
"loss": 0.0371, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.5815831987075928, |
|
"grad_norm": 0.47427716851234436, |
|
"learning_rate": 7.540812810007172e-06, |
|
"loss": 0.0652, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.5880452342487885, |
|
"grad_norm": 0.48456844687461853, |
|
"learning_rate": 7.516352126197658e-06, |
|
"loss": 0.05, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.5945072697899838, |
|
"grad_norm": 0.4757959246635437, |
|
"learning_rate": 7.491810523395762e-06, |
|
"loss": 0.051, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.6009693053311793, |
|
"grad_norm": 0.44984814524650574, |
|
"learning_rate": 7.467188790792213e-06, |
|
"loss": 0.0469, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.6074313408723748, |
|
"grad_norm": 0.3804328739643097, |
|
"learning_rate": 7.442487720154494e-06, |
|
"loss": 0.0438, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.6138933764135701, |
|
"grad_norm": 0.3791749179363251, |
|
"learning_rate": 7.417708105801386e-06, |
|
"loss": 0.0488, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.6203554119547658, |
|
"grad_norm": 0.3565092980861664, |
|
"learning_rate": 7.392850744577416e-06, |
|
"loss": 0.0369, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.6268174474959611, |
|
"grad_norm": 0.3572937250137329, |
|
"learning_rate": 7.36791643582724e-06, |
|
"loss": 0.0368, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.6332794830371569, |
|
"grad_norm": 3.2710509300231934, |
|
"learning_rate": 7.342905981369937e-06, |
|
"loss": 0.061, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.6397415185783522, |
|
"grad_norm": 0.4295857548713684, |
|
"learning_rate": 7.31782018547322e-06, |
|
"loss": 0.0428, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.6462035541195477, |
|
"grad_norm": 0.40736424922943115, |
|
"learning_rate": 7.2926598548275765e-06, |
|
"loss": 0.0419, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.6526655896607432, |
|
"grad_norm": 0.3768783211708069, |
|
"learning_rate": 7.267425798520333e-06, |
|
"loss": 0.0417, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.6591276252019385, |
|
"grad_norm": 0.39212289452552795, |
|
"learning_rate": 7.242118828009622e-06, |
|
"loss": 0.0538, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.6655896607431342, |
|
"grad_norm": 0.4783058166503906, |
|
"learning_rate": 7.2167397570983075e-06, |
|
"loss": 0.0402, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.6720516962843295, |
|
"grad_norm": 0.4783688485622406, |
|
"learning_rate": 7.191289401907796e-06, |
|
"loss": 0.0435, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.678513731825525, |
|
"grad_norm": 0.44153332710266113, |
|
"learning_rate": 7.165768580851806e-06, |
|
"loss": 0.0429, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.6849757673667205, |
|
"grad_norm": 0.37209975719451904, |
|
"learning_rate": 7.140178114610045e-06, |
|
"loss": 0.0613, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.691437802907916, |
|
"grad_norm": 0.4744909703731537, |
|
"learning_rate": 7.114518826101815e-06, |
|
"loss": 0.048, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.6978998384491115, |
|
"grad_norm": 0.5942760705947876, |
|
"learning_rate": 7.088791540459562e-06, |
|
"loss": 0.043, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.7043618739903068, |
|
"grad_norm": 0.43296632170677185, |
|
"learning_rate": 7.062997085002322e-06, |
|
"loss": 0.0486, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.7108239095315025, |
|
"grad_norm": 0.4319445788860321, |
|
"learning_rate": 7.03713628920914e-06, |
|
"loss": 0.0417, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.7172859450726978, |
|
"grad_norm": 2.305532932281494, |
|
"learning_rate": 7.011209984692375e-06, |
|
"loss": 0.04, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.7237479806138933, |
|
"grad_norm": 0.3852980434894562, |
|
"learning_rate": 6.985219005170973e-06, |
|
"loss": 0.0432, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.7302100161550888, |
|
"grad_norm": 0.43154263496398926, |
|
"learning_rate": 6.959164186443648e-06, |
|
"loss": 0.0457, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.7366720516962844, |
|
"grad_norm": 0.4496629238128662, |
|
"learning_rate": 6.933046366362011e-06, |
|
"loss": 0.0557, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.7431340872374799, |
|
"grad_norm": 0.5219531059265137, |
|
"learning_rate": 6.90686638480362e-06, |
|
"loss": 0.0364, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.7495961227786752, |
|
"grad_norm": 0.525452196598053, |
|
"learning_rate": 6.88062508364498e-06, |
|
"loss": 0.0567, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.7560581583198709, |
|
"grad_norm": 0.3721238672733307, |
|
"learning_rate": 6.8543233067344625e-06, |
|
"loss": 0.0569, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.7625201938610662, |
|
"grad_norm": 0.49607154726982117, |
|
"learning_rate": 6.827961899865178e-06, |
|
"loss": 0.0419, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.7689822294022617, |
|
"grad_norm": 0.3797336220741272, |
|
"learning_rate": 6.801541710747767e-06, |
|
"loss": 0.0427, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.7754442649434572, |
|
"grad_norm": 0.3607645332813263, |
|
"learning_rate": 6.775063588983153e-06, |
|
"loss": 0.0411, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.7819063004846527, |
|
"grad_norm": 0.3796041011810303, |
|
"learning_rate": 6.748528386035209e-06, |
|
"loss": 0.0448, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.7883683360258482, |
|
"grad_norm": 0.39435023069381714, |
|
"learning_rate": 6.7219369552033865e-06, |
|
"loss": 0.0429, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.7948303715670435, |
|
"grad_norm": 0.6904699802398682, |
|
"learning_rate": 6.695290151595271e-06, |
|
"loss": 0.0507, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.8012924071082392, |
|
"grad_norm": 0.45726367831230164, |
|
"learning_rate": 6.668588832099081e-06, |
|
"loss": 0.0378, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.8077544426494345, |
|
"grad_norm": 0.31050002574920654, |
|
"learning_rate": 6.6418338553561225e-06, |
|
"loss": 0.0496, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.81421647819063, |
|
"grad_norm": 0.42013099789619446, |
|
"learning_rate": 6.615026081733168e-06, |
|
"loss": 0.0467, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.8206785137318255, |
|
"grad_norm": 0.3072008490562439, |
|
"learning_rate": 6.5881663732947935e-06, |
|
"loss": 0.0458, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.827140549273021, |
|
"grad_norm": 3.501995801925659, |
|
"learning_rate": 6.561255593775656e-06, |
|
"loss": 0.0385, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.8336025848142166, |
|
"grad_norm": 0.3589681386947632, |
|
"learning_rate": 6.5342946085527205e-06, |
|
"loss": 0.0368, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.8400646203554119, |
|
"grad_norm": 0.405106782913208, |
|
"learning_rate": 6.507284284617427e-06, |
|
"loss": 0.0499, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.8465266558966076, |
|
"grad_norm": 0.41938844323158264, |
|
"learning_rate": 6.480225490547821e-06, |
|
"loss": 0.056, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.8529886914378029, |
|
"grad_norm": 0.43309280276298523, |
|
"learning_rate": 6.4531190964806005e-06, |
|
"loss": 0.0434, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.8594507269789984, |
|
"grad_norm": 0.3927607238292694, |
|
"learning_rate": 6.425965974083164e-06, |
|
"loss": 0.0461, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.865912762520194, |
|
"grad_norm": 0.38023263216018677, |
|
"learning_rate": 6.398766996525554e-06, |
|
"loss": 0.0373, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.8723747980613892, |
|
"grad_norm": 0.3614664673805237, |
|
"learning_rate": 6.371523038452398e-06, |
|
"loss": 0.0421, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.878836833602585, |
|
"grad_norm": 0.40119874477386475, |
|
"learning_rate": 6.344234975954765e-06, |
|
"loss": 0.0433, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.8852988691437802, |
|
"grad_norm": 0.5142848491668701, |
|
"learning_rate": 6.316903686542011e-06, |
|
"loss": 0.0411, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.891760904684976, |
|
"grad_norm": 0.40000125765800476, |
|
"learning_rate": 6.289530049113543e-06, |
|
"loss": 0.0544, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.8982229402261712, |
|
"grad_norm": 3.5423994064331055, |
|
"learning_rate": 6.262114943930566e-06, |
|
"loss": 0.0466, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.9046849757673667, |
|
"grad_norm": 0.42096999287605286, |
|
"learning_rate": 6.234659252587782e-06, |
|
"loss": 0.0488, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.9111470113085622, |
|
"grad_norm": 0.45695486664772034, |
|
"learning_rate": 6.20716385798502e-06, |
|
"loss": 0.053, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.9176090468497575, |
|
"grad_norm": 0.42890581488609314, |
|
"learning_rate": 6.17962964429887e-06, |
|
"loss": 0.0476, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.9240710823909533, |
|
"grad_norm": 0.4009229242801666, |
|
"learning_rate": 6.152057496954225e-06, |
|
"loss": 0.0429, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.9305331179321485, |
|
"grad_norm": 0.7172302007675171, |
|
"learning_rate": 6.12444830259583e-06, |
|
"loss": 0.0504, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.936995153473344, |
|
"grad_norm": 0.4833920896053314, |
|
"learning_rate": 6.096802949059757e-06, |
|
"loss": 0.0538, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.9434571890145396, |
|
"grad_norm": 3.44974684715271, |
|
"learning_rate": 6.069122325344857e-06, |
|
"loss": 0.0408, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.949919224555735, |
|
"grad_norm": 0.7173548340797424, |
|
"learning_rate": 6.041407321584178e-06, |
|
"loss": 0.0485, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.9563812600969306, |
|
"grad_norm": 0.4045400023460388, |
|
"learning_rate": 6.013658829016328e-06, |
|
"loss": 0.0433, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.9628432956381259, |
|
"grad_norm": 0.38726726174354553, |
|
"learning_rate": 5.9858777399568325e-06, |
|
"loss": 0.044, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.9693053311793216, |
|
"grad_norm": 0.4238860011100769, |
|
"learning_rate": 5.958064947769423e-06, |
|
"loss": 0.0488, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.975767366720517, |
|
"grad_norm": 0.41494399309158325, |
|
"learning_rate": 5.930221346837324e-06, |
|
"loss": 0.0444, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.9822294022617124, |
|
"grad_norm": 1.2054787874221802, |
|
"learning_rate": 5.902347832534475e-06, |
|
"loss": 0.0639, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.988691437802908, |
|
"grad_norm": 0.4015323519706726, |
|
"learning_rate": 5.874445301196761e-06, |
|
"loss": 0.0392, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.9951534733441034, |
|
"grad_norm": 0.4661884307861328, |
|
"learning_rate": 5.846514650093162e-06, |
|
"loss": 0.0487, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.43507838249206543, |
|
"learning_rate": 5.818556777396923e-06, |
|
"loss": 0.0566, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.0064620355411953, |
|
"grad_norm": 0.52005934715271, |
|
"learning_rate": 5.790572582156654e-06, |
|
"loss": 0.0266, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.012924071082391, |
|
"grad_norm": 0.3060389757156372, |
|
"learning_rate": 5.76256296426743e-06, |
|
"loss": 0.0255, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.0193861066235863, |
|
"grad_norm": 0.26340675354003906, |
|
"learning_rate": 5.734528824441845e-06, |
|
"loss": 0.0226, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.025848142164782, |
|
"grad_norm": 0.29653921723365784, |
|
"learning_rate": 5.706471064181055e-06, |
|
"loss": 0.0246, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.0323101777059773, |
|
"grad_norm": 0.2679958939552307, |
|
"learning_rate": 5.678390585745784e-06, |
|
"loss": 0.019, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.038772213247173, |
|
"grad_norm": 0.40780436992645264, |
|
"learning_rate": 5.6502882921273084e-06, |
|
"loss": 0.0223, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.0452342487883683, |
|
"grad_norm": 4.025135517120361, |
|
"learning_rate": 5.6221650870184215e-06, |
|
"loss": 0.0289, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.0516962843295636, |
|
"grad_norm": 0.3368605971336365, |
|
"learning_rate": 5.594021874784376e-06, |
|
"loss": 0.0216, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.0581583198707594, |
|
"grad_norm": 0.37941357493400574, |
|
"learning_rate": 5.565859560433792e-06, |
|
"loss": 0.028, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.0646203554119547, |
|
"grad_norm": 0.39574962854385376, |
|
"learning_rate": 5.537679049589568e-06, |
|
"loss": 0.0359, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.0710823909531504, |
|
"grad_norm": 0.3605610728263855, |
|
"learning_rate": 5.50948124845975e-06, |
|
"loss": 0.0228, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.0775444264943457, |
|
"grad_norm": 0.3201454281806946, |
|
"learning_rate": 5.481267063808392e-06, |
|
"loss": 0.0241, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.0840064620355414, |
|
"grad_norm": 0.32988935708999634, |
|
"learning_rate": 5.453037402926397e-06, |
|
"loss": 0.0232, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.0904684975767367, |
|
"grad_norm": 0.43368834257125854, |
|
"learning_rate": 5.4247931736023385e-06, |
|
"loss": 0.0219, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.096930533117932, |
|
"grad_norm": 0.37438321113586426, |
|
"learning_rate": 5.396535284093278e-06, |
|
"loss": 0.0216, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.1033925686591277, |
|
"grad_norm": 0.41498661041259766, |
|
"learning_rate": 5.368264643095543e-06, |
|
"loss": 0.0214, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.109854604200323, |
|
"grad_norm": 0.43471306562423706, |
|
"learning_rate": 5.3399821597155225e-06, |
|
"loss": 0.0194, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 2.1163166397415187, |
|
"grad_norm": 0.550999104976654, |
|
"learning_rate": 5.3116887434404155e-06, |
|
"loss": 0.0264, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.122778675282714, |
|
"grad_norm": 0.4273984730243683, |
|
"learning_rate": 5.283385304109e-06, |
|
"loss": 0.0238, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 2.1292407108239093, |
|
"grad_norm": 0.34016546607017517, |
|
"learning_rate": 5.255072751882363e-06, |
|
"loss": 0.0221, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.135702746365105, |
|
"grad_norm": 0.43326249718666077, |
|
"learning_rate": 5.22675199721464e-06, |
|
"loss": 0.0273, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 2.1421647819063003, |
|
"grad_norm": 0.3180527985095978, |
|
"learning_rate": 5.198423950823734e-06, |
|
"loss": 0.0243, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.148626817447496, |
|
"grad_norm": 0.28859448432922363, |
|
"learning_rate": 5.170089523662028e-06, |
|
"loss": 0.0272, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 2.1550888529886914, |
|
"grad_norm": 11.164939880371094, |
|
"learning_rate": 5.141749626887101e-06, |
|
"loss": 0.0306, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 2.161550888529887, |
|
"grad_norm": 0.3972926139831543, |
|
"learning_rate": 5.113405171832404e-06, |
|
"loss": 0.0209, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.1680129240710824, |
|
"grad_norm": 0.32792770862579346, |
|
"learning_rate": 5.0850570699779875e-06, |
|
"loss": 0.029, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.1744749596122777, |
|
"grad_norm": 0.5258669853210449, |
|
"learning_rate": 5.05670623292116e-06, |
|
"loss": 0.0469, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 2.1809369951534734, |
|
"grad_norm": 4.376863479614258, |
|
"learning_rate": 5.028353572347195e-06, |
|
"loss": 0.0329, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 2.1873990306946687, |
|
"grad_norm": 0.38787195086479187, |
|
"learning_rate": 5e-06, |
|
"loss": 0.025, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 2.1938610662358644, |
|
"grad_norm": 0.3971186578273773, |
|
"learning_rate": 4.971646427652806e-06, |
|
"loss": 0.022, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.2003231017770597, |
|
"grad_norm": 0.4147299528121948, |
|
"learning_rate": 4.94329376707884e-06, |
|
"loss": 0.0353, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 2.2067851373182554, |
|
"grad_norm": 0.3218820095062256, |
|
"learning_rate": 4.914942930022014e-06, |
|
"loss": 0.0223, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 2.2132471728594507, |
|
"grad_norm": 0.3497336208820343, |
|
"learning_rate": 4.8865948281675976e-06, |
|
"loss": 0.0245, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 2.219709208400646, |
|
"grad_norm": 0.5387941002845764, |
|
"learning_rate": 4.858250373112901e-06, |
|
"loss": 0.0375, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 2.2261712439418417, |
|
"grad_norm": 0.3683622181415558, |
|
"learning_rate": 4.829910476337972e-06, |
|
"loss": 0.0174, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.232633279483037, |
|
"grad_norm": 0.40947481989860535, |
|
"learning_rate": 4.801576049176269e-06, |
|
"loss": 0.0211, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 2.2390953150242328, |
|
"grad_norm": 0.38499361276626587, |
|
"learning_rate": 4.773248002785362e-06, |
|
"loss": 0.0229, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 2.245557350565428, |
|
"grad_norm": 0.45877766609191895, |
|
"learning_rate": 4.744927248117639e-06, |
|
"loss": 0.0271, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.2520193861066238, |
|
"grad_norm": 0.6704347133636475, |
|
"learning_rate": 4.716614695891002e-06, |
|
"loss": 0.0223, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 2.258481421647819, |
|
"grad_norm": 21.48920440673828, |
|
"learning_rate": 4.688311256559587e-06, |
|
"loss": 0.0235, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.2649434571890144, |
|
"grad_norm": 0.33492201566696167, |
|
"learning_rate": 4.66001784028448e-06, |
|
"loss": 0.0269, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 2.27140549273021, |
|
"grad_norm": 0.39123740792274475, |
|
"learning_rate": 4.631735356904458e-06, |
|
"loss": 0.0222, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.2778675282714054, |
|
"grad_norm": 0.3031889796257019, |
|
"learning_rate": 4.6034647159067234e-06, |
|
"loss": 0.0225, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 2.284329563812601, |
|
"grad_norm": 0.3864370584487915, |
|
"learning_rate": 4.575206826397662e-06, |
|
"loss": 0.0222, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 2.2907915993537964, |
|
"grad_norm": 0.6137961149215698, |
|
"learning_rate": 4.546962597073607e-06, |
|
"loss": 0.0276, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.297253634894992, |
|
"grad_norm": 0.4349140226840973, |
|
"learning_rate": 4.5187329361916095e-06, |
|
"loss": 0.0248, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 2.3037156704361874, |
|
"grad_norm": 0.3713863492012024, |
|
"learning_rate": 4.490518751540251e-06, |
|
"loss": 0.0268, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 2.3101777059773827, |
|
"grad_norm": 0.39676621556282043, |
|
"learning_rate": 4.462320950410432e-06, |
|
"loss": 0.0262, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 2.3166397415185784, |
|
"grad_norm": 0.37068402767181396, |
|
"learning_rate": 4.4341404395662105e-06, |
|
"loss": 0.0211, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 2.3231017770597737, |
|
"grad_norm": 0.29026004672050476, |
|
"learning_rate": 4.405978125215627e-06, |
|
"loss": 0.0208, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.3295638126009695, |
|
"grad_norm": 0.46004900336265564, |
|
"learning_rate": 4.377834912981579e-06, |
|
"loss": 0.0267, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 2.3360258481421647, |
|
"grad_norm": 0.38983336091041565, |
|
"learning_rate": 4.3497117078726915e-06, |
|
"loss": 0.026, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 2.3424878836833605, |
|
"grad_norm": 0.485213965177536, |
|
"learning_rate": 4.321609414254217e-06, |
|
"loss": 0.0224, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 2.3489499192245558, |
|
"grad_norm": 0.3934685289859772, |
|
"learning_rate": 4.2935289358189454e-06, |
|
"loss": 0.022, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 2.355411954765751, |
|
"grad_norm": 0.4222012460231781, |
|
"learning_rate": 4.265471175558156e-06, |
|
"loss": 0.0198, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.361873990306947, |
|
"grad_norm": 0.3215023875236511, |
|
"learning_rate": 4.237437035732572e-06, |
|
"loss": 0.0252, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 2.368336025848142, |
|
"grad_norm": 0.38228940963745117, |
|
"learning_rate": 4.2094274178433455e-06, |
|
"loss": 0.026, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 2.374798061389338, |
|
"grad_norm": 0.44622451066970825, |
|
"learning_rate": 4.18144322260308e-06, |
|
"loss": 0.0323, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 2.381260096930533, |
|
"grad_norm": 0.42110568284988403, |
|
"learning_rate": 4.153485349906839e-06, |
|
"loss": 0.0175, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 2.387722132471729, |
|
"grad_norm": 0.4341752827167511, |
|
"learning_rate": 4.125554698803241e-06, |
|
"loss": 0.0224, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.394184168012924, |
|
"grad_norm": 0.6297674775123596, |
|
"learning_rate": 4.0976521674655255e-06, |
|
"loss": 0.0223, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 2.4006462035541194, |
|
"grad_norm": 0.34877192974090576, |
|
"learning_rate": 4.069778653162679e-06, |
|
"loss": 0.0223, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 2.407108239095315, |
|
"grad_norm": 0.3711933493614197, |
|
"learning_rate": 4.041935052230579e-06, |
|
"loss": 0.018, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 2.4135702746365104, |
|
"grad_norm": 0.2846461236476898, |
|
"learning_rate": 4.014122260043169e-06, |
|
"loss": 0.0196, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 2.420032310177706, |
|
"grad_norm": 0.3798210024833679, |
|
"learning_rate": 3.986341170983672e-06, |
|
"loss": 0.0252, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.4264943457189014, |
|
"grad_norm": 0.45135289430618286, |
|
"learning_rate": 3.958592678415825e-06, |
|
"loss": 0.0272, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 2.432956381260097, |
|
"grad_norm": 0.3350200355052948, |
|
"learning_rate": 3.9308776746551444e-06, |
|
"loss": 0.0208, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 2.4394184168012925, |
|
"grad_norm": 0.3789970576763153, |
|
"learning_rate": 3.903197050940244e-06, |
|
"loss": 0.0206, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 2.4458804523424877, |
|
"grad_norm": 0.40493834018707275, |
|
"learning_rate": 3.875551697404172e-06, |
|
"loss": 0.0225, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 2.4523424878836835, |
|
"grad_norm": 0.45063942670822144, |
|
"learning_rate": 3.847942503045776e-06, |
|
"loss": 0.0246, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.4588045234248788, |
|
"grad_norm": 0.36469295620918274, |
|
"learning_rate": 3.820370355701133e-06, |
|
"loss": 0.0288, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 2.4652665589660745, |
|
"grad_norm": 0.3883582055568695, |
|
"learning_rate": 3.792836142014981e-06, |
|
"loss": 0.0305, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 2.47172859450727, |
|
"grad_norm": 0.41473427414894104, |
|
"learning_rate": 3.7653407474122195e-06, |
|
"loss": 0.0216, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 2.4781906300484655, |
|
"grad_norm": 0.44368648529052734, |
|
"learning_rate": 3.7378850560694337e-06, |
|
"loss": 0.0297, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 2.484652665589661, |
|
"grad_norm": 0.5202524662017822, |
|
"learning_rate": 3.7104699508864606e-06, |
|
"loss": 0.0258, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.491114701130856, |
|
"grad_norm": 0.41755107045173645, |
|
"learning_rate": 3.683096313457991e-06, |
|
"loss": 0.0319, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 2.497576736672052, |
|
"grad_norm": 0.4544621706008911, |
|
"learning_rate": 3.6557650240452358e-06, |
|
"loss": 0.027, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 2.504038772213247, |
|
"grad_norm": 0.3956614136695862, |
|
"learning_rate": 3.6284769615476045e-06, |
|
"loss": 0.018, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 2.5105008077544424, |
|
"grad_norm": 0.3755435049533844, |
|
"learning_rate": 3.601233003474448e-06, |
|
"loss": 0.0223, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 2.516962843295638, |
|
"grad_norm": 0.36021357774734497, |
|
"learning_rate": 3.5740340259168383e-06, |
|
"loss": 0.0203, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.523424878836834, |
|
"grad_norm": 0.35905206203460693, |
|
"learning_rate": 3.5468809035194008e-06, |
|
"loss": 0.0213, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 2.529886914378029, |
|
"grad_norm": 0.35023632645606995, |
|
"learning_rate": 3.519774509452181e-06, |
|
"loss": 0.0194, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 2.5363489499192244, |
|
"grad_norm": 0.3874582052230835, |
|
"learning_rate": 3.4927157153825717e-06, |
|
"loss": 0.0255, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 2.54281098546042, |
|
"grad_norm": 0.3666871190071106, |
|
"learning_rate": 3.4657053914472816e-06, |
|
"loss": 0.0205, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 2.5492730210016155, |
|
"grad_norm": 0.47673851251602173, |
|
"learning_rate": 3.4387444062243453e-06, |
|
"loss": 0.0271, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.5557350565428107, |
|
"grad_norm": 0.3491911292076111, |
|
"learning_rate": 3.4118336267052086e-06, |
|
"loss": 0.0191, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 2.5621970920840065, |
|
"grad_norm": 0.3430013656616211, |
|
"learning_rate": 3.384973918266834e-06, |
|
"loss": 0.019, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 2.568659127625202, |
|
"grad_norm": 0.3667357265949249, |
|
"learning_rate": 3.3581661446438796e-06, |
|
"loss": 0.0241, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 2.5751211631663975, |
|
"grad_norm": 0.395435094833374, |
|
"learning_rate": 3.3314111679009203e-06, |
|
"loss": 0.0295, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 2.581583198707593, |
|
"grad_norm": 0.4434990882873535, |
|
"learning_rate": 3.3047098484047314e-06, |
|
"loss": 0.023, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.5880452342487885, |
|
"grad_norm": 0.40480467677116394, |
|
"learning_rate": 3.2780630447966135e-06, |
|
"loss": 0.0245, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 2.594507269789984, |
|
"grad_norm": 0.39888110756874084, |
|
"learning_rate": 3.251471613964793e-06, |
|
"loss": 0.0356, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 2.600969305331179, |
|
"grad_norm": 0.3975083529949188, |
|
"learning_rate": 3.224936411016849e-06, |
|
"loss": 0.0212, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 2.607431340872375, |
|
"grad_norm": 0.31860825419425964, |
|
"learning_rate": 3.198458289252234e-06, |
|
"loss": 0.0148, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 2.61389337641357, |
|
"grad_norm": 0.5908383727073669, |
|
"learning_rate": 3.172038100134823e-06, |
|
"loss": 0.0215, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.620355411954766, |
|
"grad_norm": 0.32025986909866333, |
|
"learning_rate": 3.145676693265537e-06, |
|
"loss": 0.0189, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 2.626817447495961, |
|
"grad_norm": 0.3670276403427124, |
|
"learning_rate": 3.1193749163550226e-06, |
|
"loss": 0.0229, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 2.633279483037157, |
|
"grad_norm": 0.3176894187927246, |
|
"learning_rate": 3.093133615196381e-06, |
|
"loss": 0.0212, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 2.639741518578352, |
|
"grad_norm": 0.41281652450561523, |
|
"learning_rate": 3.0669536336379906e-06, |
|
"loss": 0.0191, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 2.6462035541195474, |
|
"grad_norm": 0.38180580735206604, |
|
"learning_rate": 3.040835813556352e-06, |
|
"loss": 0.02, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.652665589660743, |
|
"grad_norm": 0.2951391041278839, |
|
"learning_rate": 3.014780994829029e-06, |
|
"loss": 0.0162, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 2.6591276252019385, |
|
"grad_norm": 0.45817264914512634, |
|
"learning_rate": 2.988790015307627e-06, |
|
"loss": 0.0215, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 2.665589660743134, |
|
"grad_norm": 0.4247450828552246, |
|
"learning_rate": 2.9628637107908614e-06, |
|
"loss": 0.0271, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 2.6720516962843295, |
|
"grad_norm": 0.3811360001564026, |
|
"learning_rate": 2.937002914997679e-06, |
|
"loss": 0.0276, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 2.678513731825525, |
|
"grad_norm": 0.3762235641479492, |
|
"learning_rate": 2.911208459540442e-06, |
|
"loss": 0.021, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.6849757673667205, |
|
"grad_norm": 0.4333343207836151, |
|
"learning_rate": 2.8854811738981848e-06, |
|
"loss": 0.0247, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 2.691437802907916, |
|
"grad_norm": 0.36235716938972473, |
|
"learning_rate": 2.859821885389957e-06, |
|
"loss": 0.024, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 2.6978998384491115, |
|
"grad_norm": 0.3801264762878418, |
|
"learning_rate": 2.8342314191481952e-06, |
|
"loss": 0.0243, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 2.704361873990307, |
|
"grad_norm": 0.37116238474845886, |
|
"learning_rate": 2.808710598092206e-06, |
|
"loss": 0.0202, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 2.7108239095315025, |
|
"grad_norm": 0.36050671339035034, |
|
"learning_rate": 2.783260242901694e-06, |
|
"loss": 0.023, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.717285945072698, |
|
"grad_norm": 0.4219389855861664, |
|
"learning_rate": 2.7578811719903788e-06, |
|
"loss": 0.0201, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 2.7237479806138936, |
|
"grad_norm": 0.40774789452552795, |
|
"learning_rate": 2.7325742014796695e-06, |
|
"loss": 0.0272, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 2.730210016155089, |
|
"grad_norm": 0.3500642478466034, |
|
"learning_rate": 2.707340145172423e-06, |
|
"loss": 0.0209, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 2.736672051696284, |
|
"grad_norm": 0.40537238121032715, |
|
"learning_rate": 2.682179814526783e-06, |
|
"loss": 0.0192, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 2.74313408723748, |
|
"grad_norm": 0.346743643283844, |
|
"learning_rate": 2.6570940186300655e-06, |
|
"loss": 0.0208, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.749596122778675, |
|
"grad_norm": 0.504565954208374, |
|
"learning_rate": 2.6320835641727615e-06, |
|
"loss": 0.026, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 2.756058158319871, |
|
"grad_norm": 0.4314068853855133, |
|
"learning_rate": 2.607149255422584e-06, |
|
"loss": 0.0193, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 2.762520193861066, |
|
"grad_norm": 0.32916414737701416, |
|
"learning_rate": 2.582291894198617e-06, |
|
"loss": 0.0196, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 2.768982229402262, |
|
"grad_norm": 1.6414940357208252, |
|
"learning_rate": 2.557512279845509e-06, |
|
"loss": 0.0211, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 2.775444264943457, |
|
"grad_norm": 0.4827597141265869, |
|
"learning_rate": 2.5328112092077882e-06, |
|
"loss": 0.0234, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.7819063004846525, |
|
"grad_norm": 0.45988279581069946, |
|
"learning_rate": 2.5081894766042393e-06, |
|
"loss": 0.0282, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 2.788368336025848, |
|
"grad_norm": 0.5003206133842468, |
|
"learning_rate": 2.4836478738023424e-06, |
|
"loss": 0.0236, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 2.7948303715670435, |
|
"grad_norm": 0.44246071577072144, |
|
"learning_rate": 2.4591871899928286e-06, |
|
"loss": 0.0374, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 2.8012924071082392, |
|
"grad_norm": 0.4109882712364197, |
|
"learning_rate": 2.434808211764294e-06, |
|
"loss": 0.0218, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 2.8077544426494345, |
|
"grad_norm": 0.46064233779907227, |
|
"learning_rate": 2.410511723077907e-06, |
|
"loss": 0.0178, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.8142164781906303, |
|
"grad_norm": 0.33899685740470886, |
|
"learning_rate": 2.386298505242198e-06, |
|
"loss": 0.0296, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 2.8206785137318255, |
|
"grad_norm": 0.37225639820098877, |
|
"learning_rate": 2.3621693368879363e-06, |
|
"loss": 0.0268, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 2.827140549273021, |
|
"grad_norm": 0.49789291620254517, |
|
"learning_rate": 2.3381249939430882e-06, |
|
"loss": 0.0288, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 2.8336025848142166, |
|
"grad_norm": 0.4957059919834137, |
|
"learning_rate": 2.3141662496078695e-06, |
|
"loss": 0.0194, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 2.840064620355412, |
|
"grad_norm": 0.36811667680740356, |
|
"learning_rate": 2.2902938743298765e-06, |
|
"loss": 0.0219, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.8465266558966076, |
|
"grad_norm": 0.3737165927886963, |
|
"learning_rate": 2.2665086357793155e-06, |
|
"loss": 0.0217, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 2.852988691437803, |
|
"grad_norm": 0.46944284439086914, |
|
"learning_rate": 2.242811298824312e-06, |
|
"loss": 0.0218, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 2.8594507269789986, |
|
"grad_norm": 2.569209098815918, |
|
"learning_rate": 2.21920262550632e-06, |
|
"loss": 0.0226, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 2.865912762520194, |
|
"grad_norm": 0.46464183926582336, |
|
"learning_rate": 2.1956833750156086e-06, |
|
"loss": 0.0273, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 2.872374798061389, |
|
"grad_norm": 0.36627525091171265, |
|
"learning_rate": 2.1722543036668613e-06, |
|
"loss": 0.0237, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.878836833602585, |
|
"grad_norm": 0.42264869809150696, |
|
"learning_rate": 2.1489161648748436e-06, |
|
"loss": 0.0207, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 2.88529886914378, |
|
"grad_norm": 0.32069268822669983, |
|
"learning_rate": 2.125669709130174e-06, |
|
"loss": 0.0187, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 2.891760904684976, |
|
"grad_norm": 0.3201678693294525, |
|
"learning_rate": 2.102515683975201e-06, |
|
"loss": 0.0184, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 2.898222940226171, |
|
"grad_norm": 0.3414503037929535, |
|
"learning_rate": 2.0794548339799605e-06, |
|
"loss": 0.0191, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 2.904684975767367, |
|
"grad_norm": 0.33603090047836304, |
|
"learning_rate": 2.056487900718227e-06, |
|
"loss": 0.0186, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.9111470113085622, |
|
"grad_norm": 0.40074023604393005, |
|
"learning_rate": 2.0336156227436653e-06, |
|
"loss": 0.0184, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 2.9176090468497575, |
|
"grad_norm": 0.37722983956336975, |
|
"learning_rate": 2.010838735566096e-06, |
|
"loss": 0.0257, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 2.9240710823909533, |
|
"grad_norm": 0.3689229488372803, |
|
"learning_rate": 1.9881579716278267e-06, |
|
"loss": 0.0182, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 2.9305331179321485, |
|
"grad_norm": 0.4667937755584717, |
|
"learning_rate": 1.9655740602801055e-06, |
|
"loss": 0.0373, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 2.936995153473344, |
|
"grad_norm": 0.3728683888912201, |
|
"learning_rate": 1.943087727759663e-06, |
|
"loss": 0.0221, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.9434571890145396, |
|
"grad_norm": 0.38799959421157837, |
|
"learning_rate": 1.92069969716537e-06, |
|
"loss": 0.0238, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 2.9499192245557353, |
|
"grad_norm": 0.5497745275497437, |
|
"learning_rate": 1.8984106884349702e-06, |
|
"loss": 0.0291, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 2.9563812600969306, |
|
"grad_norm": 0.3938653767108917, |
|
"learning_rate": 1.8762214183219379e-06, |
|
"loss": 0.0215, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 2.962843295638126, |
|
"grad_norm": 0.35690999031066895, |
|
"learning_rate": 1.8541326003724258e-06, |
|
"loss": 0.0191, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 2.9693053311793216, |
|
"grad_norm": 0.3347409665584564, |
|
"learning_rate": 1.8321449449023215e-06, |
|
"loss": 0.0205, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.975767366720517, |
|
"grad_norm": 0.3588216006755829, |
|
"learning_rate": 1.8102591589744016e-06, |
|
"loss": 0.0246, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 2.982229402261712, |
|
"grad_norm": 0.4010322391986847, |
|
"learning_rate": 1.7884759463755984e-06, |
|
"loss": 0.0196, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 2.988691437802908, |
|
"grad_norm": 0.3672601878643036, |
|
"learning_rate": 1.7667960075943723e-06, |
|
"loss": 0.023, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 2.9951534733441036, |
|
"grad_norm": 0.5553978085517883, |
|
"learning_rate": 1.7452200397981706e-06, |
|
"loss": 0.0167, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.5560592412948608, |
|
"learning_rate": 1.723748736811025e-06, |
|
"loss": 0.0229, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 3.0064620355411953, |
|
"grad_norm": 0.32263022661209106, |
|
"learning_rate": 1.7023827890912302e-06, |
|
"loss": 0.016, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 3.012924071082391, |
|
"grad_norm": 0.3601260185241699, |
|
"learning_rate": 1.681122883709152e-06, |
|
"loss": 0.013, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 3.0193861066235863, |
|
"grad_norm": 2.2087621688842773, |
|
"learning_rate": 1.6599697043251128e-06, |
|
"loss": 0.0129, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 3.025848142164782, |
|
"grad_norm": 0.4122757315635681, |
|
"learning_rate": 1.638923931167427e-06, |
|
"loss": 0.0119, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 3.0323101777059773, |
|
"grad_norm": 0.24214933812618256, |
|
"learning_rate": 1.6179862410105197e-06, |
|
"loss": 0.0079, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.038772213247173, |
|
"grad_norm": 0.2189481109380722, |
|
"learning_rate": 1.5971573071531588e-06, |
|
"loss": 0.0124, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 3.0452342487883683, |
|
"grad_norm": 1.217540979385376, |
|
"learning_rate": 1.5764377993968094e-06, |
|
"loss": 0.011, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 3.0516962843295636, |
|
"grad_norm": 0.3144000768661499, |
|
"learning_rate": 1.5558283840240924e-06, |
|
"loss": 0.0115, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 3.0581583198707594, |
|
"grad_norm": 0.23900973796844482, |
|
"learning_rate": 1.5353297237773595e-06, |
|
"loss": 0.0088, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 3.0646203554119547, |
|
"grad_norm": 0.2415008693933487, |
|
"learning_rate": 1.5149424778373811e-06, |
|
"loss": 0.0097, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 3.0710823909531504, |
|
"grad_norm": 0.24851197004318237, |
|
"learning_rate": 1.4946673018021484e-06, |
|
"loss": 0.0114, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 3.0775444264943457, |
|
"grad_norm": 0.32280173897743225, |
|
"learning_rate": 1.474504847665791e-06, |
|
"loss": 0.0113, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 3.0840064620355414, |
|
"grad_norm": 0.2650688588619232, |
|
"learning_rate": 1.4544557637976108e-06, |
|
"loss": 0.0092, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 3.0904684975767367, |
|
"grad_norm": 0.39706680178642273, |
|
"learning_rate": 1.4345206949212338e-06, |
|
"loss": 0.0107, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 3.096930533117932, |
|
"grad_norm": 0.27723875641822815, |
|
"learning_rate": 1.4147002820938743e-06, |
|
"loss": 0.0124, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.1033925686591277, |
|
"grad_norm": 0.35606086254119873, |
|
"learning_rate": 1.3949951626857244e-06, |
|
"loss": 0.0092, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 3.109854604200323, |
|
"grad_norm": 0.3037882447242737, |
|
"learning_rate": 1.375405970359453e-06, |
|
"loss": 0.0111, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 3.1163166397415187, |
|
"grad_norm": 0.3035484254360199, |
|
"learning_rate": 1.3559333350498332e-06, |
|
"loss": 0.0118, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 3.122778675282714, |
|
"grad_norm": 0.32289376854896545, |
|
"learning_rate": 1.3365778829434834e-06, |
|
"loss": 0.0123, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 3.1292407108239093, |
|
"grad_norm": 0.3762624263763428, |
|
"learning_rate": 1.3173402364587307e-06, |
|
"loss": 0.0087, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 3.135702746365105, |
|
"grad_norm": 0.21013927459716797, |
|
"learning_rate": 1.298221014225597e-06, |
|
"loss": 0.0065, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 3.1421647819063003, |
|
"grad_norm": 0.352093368768692, |
|
"learning_rate": 1.2792208310659015e-06, |
|
"loss": 0.0135, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 3.148626817447496, |
|
"grad_norm": 0.25764092803001404, |
|
"learning_rate": 1.2603402979734992e-06, |
|
"loss": 0.0092, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 3.1550888529886914, |
|
"grad_norm": 0.26913541555404663, |
|
"learning_rate": 1.2415800220946223e-06, |
|
"loss": 0.0057, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 3.161550888529887, |
|
"grad_norm": 0.3840892016887665, |
|
"learning_rate": 1.2229406067083566e-06, |
|
"loss": 0.0126, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.1680129240710824, |
|
"grad_norm": 0.6032068133354187, |
|
"learning_rate": 1.2044226512072537e-06, |
|
"loss": 0.0129, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 3.1744749596122777, |
|
"grad_norm": 0.2559524178504944, |
|
"learning_rate": 1.1860267510780432e-06, |
|
"loss": 0.0078, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 3.1809369951534734, |
|
"grad_norm": 1.6579912900924683, |
|
"learning_rate": 1.1677534978824906e-06, |
|
"loss": 0.0126, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 3.1873990306946687, |
|
"grad_norm": 0.2948991060256958, |
|
"learning_rate": 1.1496034792383654e-06, |
|
"loss": 0.0087, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 3.1938610662358644, |
|
"grad_norm": 0.32418084144592285, |
|
"learning_rate": 1.1315772788005603e-06, |
|
"loss": 0.0093, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 3.2003231017770597, |
|
"grad_norm": 0.42377397418022156, |
|
"learning_rate": 1.1136754762423097e-06, |
|
"loss": 0.0102, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 3.2067851373182554, |
|
"grad_norm": 0.34062591195106506, |
|
"learning_rate": 1.0958986472365518e-06, |
|
"loss": 0.0176, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 3.2132471728594507, |
|
"grad_norm": 0.43844881653785706, |
|
"learning_rate": 1.0782473634374191e-06, |
|
"loss": 0.0132, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 3.219709208400646, |
|
"grad_norm": 0.4194018244743347, |
|
"learning_rate": 1.0607221924618533e-06, |
|
"loss": 0.0103, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 3.2261712439418417, |
|
"grad_norm": 0.27430325746536255, |
|
"learning_rate": 1.0433236978713546e-06, |
|
"loss": 0.0085, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.232633279483037, |
|
"grad_norm": 0.1858188509941101, |
|
"learning_rate": 1.0260524391538546e-06, |
|
"loss": 0.0092, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 3.2390953150242328, |
|
"grad_norm": 0.43904629349708557, |
|
"learning_rate": 1.0089089717057337e-06, |
|
"loss": 0.0107, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 3.245557350565428, |
|
"grad_norm": 0.3002469539642334, |
|
"learning_rate": 9.91893846813947e-07, |
|
"loss": 0.0086, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 3.2520193861066238, |
|
"grad_norm": 0.2562623918056488, |
|
"learning_rate": 9.7500761163831e-07, |
|
"loss": 0.007, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 3.258481421647819, |
|
"grad_norm": 0.29067540168762207, |
|
"learning_rate": 9.582508091938953e-07, |
|
"loss": 0.0081, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 3.2649434571890144, |
|
"grad_norm": 0.2679020166397095, |
|
"learning_rate": 9.416239783335785e-07, |
|
"loss": 0.0098, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 3.27140549273021, |
|
"grad_norm": 0.32440176606178284, |
|
"learning_rate": 9.251276537306969e-07, |
|
"loss": 0.0074, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 3.2778675282714054, |
|
"grad_norm": 0.2440427541732788, |
|
"learning_rate": 9.087623658618682e-07, |
|
"loss": 0.009, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 3.284329563812601, |
|
"grad_norm": 0.3681286871433258, |
|
"learning_rate": 8.925286409899308e-07, |
|
"loss": 0.0071, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 3.2907915993537964, |
|
"grad_norm": 0.2806476950645447, |
|
"learning_rate": 8.764270011470144e-07, |
|
"loss": 0.0099, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.297253634894992, |
|
"grad_norm": 0.4697900414466858, |
|
"learning_rate": 8.604579641177524e-07, |
|
"loss": 0.0152, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 3.3037156704361874, |
|
"grad_norm": 0.3158174753189087, |
|
"learning_rate": 8.446220434226382e-07, |
|
"loss": 0.0085, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 3.3101777059773827, |
|
"grad_norm": 0.42739158868789673, |
|
"learning_rate": 8.289197483015127e-07, |
|
"loss": 0.0132, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 3.3166397415185784, |
|
"grad_norm": 0.2981080114841461, |
|
"learning_rate": 8.133515836971773e-07, |
|
"loss": 0.0064, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 3.3231017770597737, |
|
"grad_norm": 0.41693729162216187, |
|
"learning_rate": 7.97918050239167e-07, |
|
"loss": 0.0114, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 3.3295638126009695, |
|
"grad_norm": 0.44874924421310425, |
|
"learning_rate": 7.826196442276473e-07, |
|
"loss": 0.0099, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 3.3360258481421647, |
|
"grad_norm": 0.2718696892261505, |
|
"learning_rate": 7.674568576174546e-07, |
|
"loss": 0.0118, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 3.3424878836833605, |
|
"grad_norm": 0.39589688181877136, |
|
"learning_rate": 7.524301780022774e-07, |
|
"loss": 0.0143, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 3.3489499192245558, |
|
"grad_norm": 0.31575649976730347, |
|
"learning_rate": 7.375400885989758e-07, |
|
"loss": 0.0131, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 3.355411954765751, |
|
"grad_norm": 0.4461822807788849, |
|
"learning_rate": 7.227870682320432e-07, |
|
"loss": 0.0086, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.361873990306947, |
|
"grad_norm": 0.34174418449401855, |
|
"learning_rate": 7.081715913182069e-07, |
|
"loss": 0.0069, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 3.368336025848142, |
|
"grad_norm": 0.2855561375617981, |
|
"learning_rate": 6.936941278511744e-07, |
|
"loss": 0.008, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 3.374798061389338, |
|
"grad_norm": 0.3220924139022827, |
|
"learning_rate": 6.793551433865198e-07, |
|
"loss": 0.009, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 3.381260096930533, |
|
"grad_norm": 0.28071683645248413, |
|
"learning_rate": 6.651550990267091e-07, |
|
"loss": 0.0069, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 3.387722132471729, |
|
"grad_norm": 0.26843127608299255, |
|
"learning_rate": 6.510944514062784e-07, |
|
"loss": 0.0074, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 3.394184168012924, |
|
"grad_norm": 0.3113328218460083, |
|
"learning_rate": 6.371736526771421e-07, |
|
"loss": 0.0091, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 3.4006462035541194, |
|
"grad_norm": 0.22835861146450043, |
|
"learning_rate": 6.233931504940633e-07, |
|
"loss": 0.0072, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 3.407108239095315, |
|
"grad_norm": 8068.888671875, |
|
"learning_rate": 6.097533880002476e-07, |
|
"loss": 0.0093, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 3.4135702746365104, |
|
"grad_norm": 0.39374470710754395, |
|
"learning_rate": 5.962548038130972e-07, |
|
"loss": 0.012, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 3.420032310177706, |
|
"grad_norm": 0.4547136723995209, |
|
"learning_rate": 5.828978320101109e-07, |
|
"loss": 0.0098, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.4264943457189014, |
|
"grad_norm": 0.4039766490459442, |
|
"learning_rate": 5.696829021149181e-07, |
|
"loss": 0.0077, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 3.432956381260097, |
|
"grad_norm": 0.32658788561820984, |
|
"learning_rate": 5.566104390834709e-07, |
|
"loss": 0.008, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 3.4394184168012925, |
|
"grad_norm": 0.3402605950832367, |
|
"learning_rate": 5.436808632903729e-07, |
|
"loss": 0.0082, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 3.4458804523424877, |
|
"grad_norm": 0.31616881489753723, |
|
"learning_rate": 5.308945905153729e-07, |
|
"loss": 0.0129, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 3.4523424878836835, |
|
"grad_norm": 0.43558764457702637, |
|
"learning_rate": 5.182520319299816e-07, |
|
"loss": 0.0117, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 3.4588045234248788, |
|
"grad_norm": 0.337622731924057, |
|
"learning_rate": 5.057535940842567e-07, |
|
"loss": 0.007, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 3.4652665589660745, |
|
"grad_norm": 0.3466811180114746, |
|
"learning_rate": 4.933996788937279e-07, |
|
"loss": 0.011, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 3.47172859450727, |
|
"grad_norm": 0.35212457180023193, |
|
"learning_rate": 4.811906836264718e-07, |
|
"loss": 0.0107, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 3.4781906300484655, |
|
"grad_norm": 0.3255941867828369, |
|
"learning_rate": 4.691270008903365e-07, |
|
"loss": 0.0113, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 3.484652665589661, |
|
"grad_norm": 0.3242311477661133, |
|
"learning_rate": 4.572090186203171e-07, |
|
"loss": 0.0116, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.491114701130856, |
|
"grad_norm": 0.35152673721313477, |
|
"learning_rate": 4.4543712006608507e-07, |
|
"loss": 0.0101, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 3.497576736672052, |
|
"grad_norm": 0.31033825874328613, |
|
"learning_rate": 4.338116837796519e-07, |
|
"loss": 0.0101, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 3.504038772213247, |
|
"grad_norm": 0.3278428316116333, |
|
"learning_rate": 4.2233308360321024e-07, |
|
"loss": 0.0077, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 3.5105008077544424, |
|
"grad_norm": 1.6630736589431763, |
|
"learning_rate": 4.110016886571011e-07, |
|
"loss": 0.0087, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 3.516962843295638, |
|
"grad_norm": 0.49623093008995056, |
|
"learning_rate": 3.998178633279537e-07, |
|
"loss": 0.0075, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 3.523424878836834, |
|
"grad_norm": 0.24495165050029755, |
|
"learning_rate": 3.887819672569565e-07, |
|
"loss": 0.0076, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 3.529886914378029, |
|
"grad_norm": 1.0713497400283813, |
|
"learning_rate": 3.778943553283015e-07, |
|
"loss": 0.0087, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 3.5363489499192244, |
|
"grad_norm": 0.35764917731285095, |
|
"learning_rate": 3.671553776577702e-07, |
|
"loss": 0.0145, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 3.54281098546042, |
|
"grad_norm": 0.4270109534263611, |
|
"learning_rate": 3.5656537958147164e-07, |
|
"loss": 0.0196, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 3.5492730210016155, |
|
"grad_norm": 0.36601418256759644, |
|
"learning_rate": 3.461247016447372e-07, |
|
"loss": 0.0128, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.5557350565428107, |
|
"grad_norm": 0.2740163505077362, |
|
"learning_rate": 3.3583367959117374e-07, |
|
"loss": 0.0073, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 3.5621970920840065, |
|
"grad_norm": 0.30819642543792725, |
|
"learning_rate": 3.2569264435186597e-07, |
|
"loss": 0.011, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 3.568659127625202, |
|
"grad_norm": 0.33172082901000977, |
|
"learning_rate": 3.1570192203473183e-07, |
|
"loss": 0.0092, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 3.5751211631663975, |
|
"grad_norm": 0.23010392487049103, |
|
"learning_rate": 3.058618339140368e-07, |
|
"loss": 0.0035, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 3.581583198707593, |
|
"grad_norm": 0.22640886902809143, |
|
"learning_rate": 2.961726964200645e-07, |
|
"loss": 0.0056, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 3.5880452342487885, |
|
"grad_norm": 0.21879248321056366, |
|
"learning_rate": 2.8663482112893936e-07, |
|
"loss": 0.0066, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 3.594507269789984, |
|
"grad_norm": 0.2790137827396393, |
|
"learning_rate": 2.772485147526077e-07, |
|
"loss": 0.0082, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 3.600969305331179, |
|
"grad_norm": 2.688298225402832, |
|
"learning_rate": 2.680140791289737e-07, |
|
"loss": 0.015, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 3.607431340872375, |
|
"grad_norm": 0.29820549488067627, |
|
"learning_rate": 2.5893181121219637e-07, |
|
"loss": 0.0097, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 3.61389337641357, |
|
"grad_norm": 0.33826717734336853, |
|
"learning_rate": 2.500020030631356e-07, |
|
"loss": 0.0087, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.620355411954766, |
|
"grad_norm": 0.43633323907852173, |
|
"learning_rate": 2.4122494183996426e-07, |
|
"loss": 0.0127, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 3.626817447495961, |
|
"grad_norm": 0.2826526165008545, |
|
"learning_rate": 2.3260090978893146e-07, |
|
"loss": 0.0089, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 3.633279483037157, |
|
"grad_norm": 0.27118533849716187, |
|
"learning_rate": 2.2413018423528832e-07, |
|
"loss": 0.0059, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 3.639741518578352, |
|
"grad_norm": 2.0000076293945312, |
|
"learning_rate": 2.1581303757436778e-07, |
|
"loss": 0.0088, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 3.6462035541195474, |
|
"grad_norm": 0.2869947850704193, |
|
"learning_rate": 2.076497372628261e-07, |
|
"loss": 0.0084, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 3.652665589660743, |
|
"grad_norm": 0.35098886489868164, |
|
"learning_rate": 1.9964054581004476e-07, |
|
"loss": 0.0133, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 3.6591276252019385, |
|
"grad_norm": 0.32275232672691345, |
|
"learning_rate": 1.9178572076968437e-07, |
|
"loss": 0.0109, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 3.665589660743134, |
|
"grad_norm": 0.44486019015312195, |
|
"learning_rate": 1.84085514731403e-07, |
|
"loss": 0.0093, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 3.6720516962843295, |
|
"grad_norm": 0.30504077672958374, |
|
"learning_rate": 1.7654017531273882e-07, |
|
"loss": 0.0097, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 3.678513731825525, |
|
"grad_norm": 0.4125590920448303, |
|
"learning_rate": 1.6914994515114082e-07, |
|
"loss": 0.01, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 3.6849757673667205, |
|
"grad_norm": 0.44932204484939575, |
|
"learning_rate": 1.619150618961701e-07, |
|
"loss": 0.01, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 3.691437802907916, |
|
"grad_norm": 0.39718618988990784, |
|
"learning_rate": 1.5483575820185615e-07, |
|
"loss": 0.0096, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 3.6978998384491115, |
|
"grad_norm": 0.2970835864543915, |
|
"learning_rate": 1.4791226171921748e-07, |
|
"loss": 0.0076, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 3.704361873990307, |
|
"grad_norm": 0.33279949426651, |
|
"learning_rate": 1.411447950889372e-07, |
|
"loss": 0.0108, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 3.7108239095315025, |
|
"grad_norm": 0.33126136660575867, |
|
"learning_rate": 1.3453357593420757e-07, |
|
"loss": 0.0084, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 3.717285945072698, |
|
"grad_norm": 0.41318628191947937, |
|
"learning_rate": 1.2807881685372947e-07, |
|
"loss": 0.007, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 3.7237479806138936, |
|
"grad_norm": 0.3176984488964081, |
|
"learning_rate": 1.2178072541487508e-07, |
|
"loss": 0.0103, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 3.730210016155089, |
|
"grad_norm": 0.35345304012298584, |
|
"learning_rate": 1.1563950414701653e-07, |
|
"loss": 0.0082, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 3.736672051696284, |
|
"grad_norm": 0.37485456466674805, |
|
"learning_rate": 1.0965535053500843e-07, |
|
"loss": 0.0072, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 3.74313408723748, |
|
"grad_norm": 0.426248699426651, |
|
"learning_rate": 1.0382845701284228e-07, |
|
"loss": 0.0107, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 3.749596122778675, |
|
"grad_norm": 0.7327535152435303, |
|
"learning_rate": 9.815901095745373e-08, |
|
"loss": 0.0108, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 3.756058158319871, |
|
"grad_norm": 0.39146602153778076, |
|
"learning_rate": 9.264719468270011e-08, |
|
"loss": 0.0165, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 3.762520193861066, |
|
"grad_norm": 0.35452187061309814, |
|
"learning_rate": 8.729318543349685e-08, |
|
"loss": 0.0068, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 3.768982229402262, |
|
"grad_norm": 0.3429497480392456, |
|
"learning_rate": 8.209715538011753e-08, |
|
"loss": 0.0101, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 3.775444264943457, |
|
"grad_norm": 1.697080373764038, |
|
"learning_rate": 7.70592716126567e-08, |
|
"loss": 0.0105, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 3.7819063004846525, |
|
"grad_norm": 0.27320510149002075, |
|
"learning_rate": 7.217969613565856e-08, |
|
"loss": 0.0105, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 3.788368336025848, |
|
"grad_norm": 0.288004070520401, |
|
"learning_rate": 6.745858586290566e-08, |
|
"loss": 0.0083, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 3.7948303715670435, |
|
"grad_norm": 0.42086732387542725, |
|
"learning_rate": 6.28960926123745e-08, |
|
"loss": 0.0143, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 3.8012924071082392, |
|
"grad_norm": 0.30742815136909485, |
|
"learning_rate": 5.84923631013512e-08, |
|
"loss": 0.0148, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 3.8077544426494345, |
|
"grad_norm": 0.4420062005519867, |
|
"learning_rate": 5.424753894171519e-08, |
|
"loss": 0.0116, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.8142164781906303, |
|
"grad_norm": 0.2861470580101013, |
|
"learning_rate": 5.016175663538625e-08, |
|
"loss": 0.0043, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 3.8206785137318255, |
|
"grad_norm": 0.26148203015327454, |
|
"learning_rate": 4.623514756993241e-08, |
|
"loss": 0.0062, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 3.827140549273021, |
|
"grad_norm": 0.2247193306684494, |
|
"learning_rate": 4.246783801434617e-08, |
|
"loss": 0.0068, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 3.8336025848142166, |
|
"grad_norm": 0.2534724175930023, |
|
"learning_rate": 3.885994911498603e-08, |
|
"loss": 0.0065, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 3.840064620355412, |
|
"grad_norm": 0.31165772676467896, |
|
"learning_rate": 3.541159689167628e-08, |
|
"loss": 0.0097, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 3.8465266558966076, |
|
"grad_norm": 0.47996920347213745, |
|
"learning_rate": 3.212289223398002e-08, |
|
"loss": 0.0057, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 3.852988691437803, |
|
"grad_norm": 0.2576350271701813, |
|
"learning_rate": 2.8993940897631412e-08, |
|
"loss": 0.0073, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 3.8594507269789986, |
|
"grad_norm": 0.3661513030529022, |
|
"learning_rate": 2.602484350113621e-08, |
|
"loss": 0.0085, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 3.865912762520194, |
|
"grad_norm": 0.4161849915981293, |
|
"learning_rate": 2.321569552253433e-08, |
|
"loss": 0.0116, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 3.872374798061389, |
|
"grad_norm": 0.25652387738227844, |
|
"learning_rate": 2.056658729633121e-08, |
|
"loss": 0.0078, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.878836833602585, |
|
"grad_norm": 0.31865638494491577, |
|
"learning_rate": 1.807760401059122e-08, |
|
"loss": 0.0111, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 3.88529886914378, |
|
"grad_norm": 0.4232560396194458, |
|
"learning_rate": 1.5748825704199887e-08, |
|
"loss": 0.0146, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 3.891760904684976, |
|
"grad_norm": 0.3084033727645874, |
|
"learning_rate": 1.3580327264289261e-08, |
|
"loss": 0.0066, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 3.898222940226171, |
|
"grad_norm": 0.2703307271003723, |
|
"learning_rate": 1.1572178423830405e-08, |
|
"loss": 0.0091, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 3.904684975767367, |
|
"grad_norm": 0.34035852551460266, |
|
"learning_rate": 9.724443759389635e-09, |
|
"loss": 0.0059, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 3.9111470113085622, |
|
"grad_norm": 0.5055537819862366, |
|
"learning_rate": 8.037182689052958e-09, |
|
"loss": 0.0087, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 3.9176090468497575, |
|
"grad_norm": 0.4783738851547241, |
|
"learning_rate": 6.510449470514824e-09, |
|
"loss": 0.0115, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 3.9240710823909533, |
|
"grad_norm": 0.24573703110218048, |
|
"learning_rate": 5.1442931993350705e-09, |
|
"loss": 0.007, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 3.9305331179321485, |
|
"grad_norm": 0.2814759314060211, |
|
"learning_rate": 3.9387578073563086e-09, |
|
"loss": 0.0105, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 3.936995153473344, |
|
"grad_norm": 0.30954548716545105, |
|
"learning_rate": 2.8938820612961494e-09, |
|
"loss": 0.0057, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 3.9434571890145396, |
|
"grad_norm": 0.25791114568710327, |
|
"learning_rate": 2.0096995614959924e-09, |
|
"loss": 0.0094, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 3.9499192245557353, |
|
"grad_norm": 0.2954118549823761, |
|
"learning_rate": 1.2862387408435483e-09, |
|
"loss": 0.0053, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 3.9563812600969306, |
|
"grad_norm": 0.3213749825954437, |
|
"learning_rate": 7.235228638574621e-10, |
|
"loss": 0.0145, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 3.962843295638126, |
|
"grad_norm": 0.2650403380393982, |
|
"learning_rate": 3.2157002593902196e-10, |
|
"loss": 0.0072, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 3.9693053311793216, |
|
"grad_norm": 0.32353588938713074, |
|
"learning_rate": 8.039315279040338e-11, |
|
"loss": 0.0067, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 3.975767366720517, |
|
"grad_norm": 0.2813875377178192, |
|
"learning_rate": 0.0, |
|
"loss": 0.007, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 3.975767366720517, |
|
"step": 616, |
|
"total_flos": 8.773760444520202e+17, |
|
"train_loss": 0.04414510081163849, |
|
"train_runtime": 2220.3071, |
|
"train_samples_per_second": 8.921, |
|
"train_steps_per_second": 0.277 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 616, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.773760444520202e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|