{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.975767366720517,
"eval_steps": 500,
"global_step": 616,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006462035541195477,
"grad_norm": 15.905838012695312,
"learning_rate": 1.6129032258064518e-07,
"loss": 0.1599,
"step": 1
},
{
"epoch": 0.012924071082390954,
"grad_norm": 27.293901443481445,
"learning_rate": 3.2258064516129035e-07,
"loss": 0.2199,
"step": 2
},
{
"epoch": 0.01938610662358643,
"grad_norm": 18.97646141052246,
"learning_rate": 4.838709677419355e-07,
"loss": 0.2004,
"step": 3
},
{
"epoch": 0.025848142164781908,
"grad_norm": 26.008861541748047,
"learning_rate": 6.451612903225807e-07,
"loss": 0.2176,
"step": 4
},
{
"epoch": 0.03231017770597738,
"grad_norm": 50.078155517578125,
"learning_rate": 8.064516129032258e-07,
"loss": 0.1862,
"step": 5
},
{
"epoch": 0.03877221324717286,
"grad_norm": 3807.063720703125,
"learning_rate": 9.67741935483871e-07,
"loss": 0.2138,
"step": 6
},
{
"epoch": 0.045234248788368334,
"grad_norm": 14.361485481262207,
"learning_rate": 1.1290322580645162e-06,
"loss": 0.1844,
"step": 7
},
{
"epoch": 0.051696284329563816,
"grad_norm": 8.04658031463623,
"learning_rate": 1.2903225806451614e-06,
"loss": 0.1894,
"step": 8
},
{
"epoch": 0.05815831987075929,
"grad_norm": 7.152590274810791,
"learning_rate": 1.4516129032258066e-06,
"loss": 0.1658,
"step": 9
},
{
"epoch": 0.06462035541195477,
"grad_norm": 4.961307525634766,
"learning_rate": 1.6129032258064516e-06,
"loss": 0.1739,
"step": 10
},
{
"epoch": 0.07108239095315025,
"grad_norm": 6.286808013916016,
"learning_rate": 1.774193548387097e-06,
"loss": 0.1784,
"step": 11
},
{
"epoch": 0.07754442649434572,
"grad_norm": 3.9918179512023926,
"learning_rate": 1.935483870967742e-06,
"loss": 0.1839,
"step": 12
},
{
"epoch": 0.0840064620355412,
"grad_norm": 4.566269874572754,
"learning_rate": 2.096774193548387e-06,
"loss": 0.1794,
"step": 13
},
{
"epoch": 0.09046849757673667,
"grad_norm": 2.4546995162963867,
"learning_rate": 2.2580645161290324e-06,
"loss": 0.155,
"step": 14
},
{
"epoch": 0.09693053311793215,
"grad_norm": 19.017515182495117,
"learning_rate": 2.4193548387096776e-06,
"loss": 0.1428,
"step": 15
},
{
"epoch": 0.10339256865912763,
"grad_norm": 1.8924509286880493,
"learning_rate": 2.580645161290323e-06,
"loss": 0.1424,
"step": 16
},
{
"epoch": 0.1098546042003231,
"grad_norm": 1.193750023841858,
"learning_rate": 2.7419354838709676e-06,
"loss": 0.1369,
"step": 17
},
{
"epoch": 0.11631663974151858,
"grad_norm": 0.9094203114509583,
"learning_rate": 2.903225806451613e-06,
"loss": 0.1269,
"step": 18
},
{
"epoch": 0.12277867528271405,
"grad_norm": 1.336350440979004,
"learning_rate": 3.0645161290322584e-06,
"loss": 0.1243,
"step": 19
},
{
"epoch": 0.12924071082390953,
"grad_norm": 26.0351505279541,
"learning_rate": 3.225806451612903e-06,
"loss": 0.1165,
"step": 20
},
{
"epoch": 0.13570274636510501,
"grad_norm": 1.1493116617202759,
"learning_rate": 3.3870967741935484e-06,
"loss": 0.1543,
"step": 21
},
{
"epoch": 0.1421647819063005,
"grad_norm": 0.8133248686790466,
"learning_rate": 3.548387096774194e-06,
"loss": 0.1064,
"step": 22
},
{
"epoch": 0.14862681744749595,
"grad_norm": 0.8336551785469055,
"learning_rate": 3.7096774193548392e-06,
"loss": 0.1192,
"step": 23
},
{
"epoch": 0.15508885298869143,
"grad_norm": 0.7685548067092896,
"learning_rate": 3.870967741935484e-06,
"loss": 0.1051,
"step": 24
},
{
"epoch": 0.16155088852988692,
"grad_norm": 1.217137336730957,
"learning_rate": 4.032258064516129e-06,
"loss": 0.1377,
"step": 25
},
{
"epoch": 0.1680129240710824,
"grad_norm": 1.1432764530181885,
"learning_rate": 4.193548387096774e-06,
"loss": 0.1151,
"step": 26
},
{
"epoch": 0.17447495961227788,
"grad_norm": 2.054145097732544,
"learning_rate": 4.35483870967742e-06,
"loss": 0.1178,
"step": 27
},
{
"epoch": 0.18093699515347333,
"grad_norm": 0.9552474021911621,
"learning_rate": 4.516129032258065e-06,
"loss": 0.1363,
"step": 28
},
{
"epoch": 0.18739903069466882,
"grad_norm": 1.1134074926376343,
"learning_rate": 4.67741935483871e-06,
"loss": 0.0971,
"step": 29
},
{
"epoch": 0.1938610662358643,
"grad_norm": 1.2744982242584229,
"learning_rate": 4.838709677419355e-06,
"loss": 0.1177,
"step": 30
},
{
"epoch": 0.20032310177705978,
"grad_norm": 0.8065481185913086,
"learning_rate": 5e-06,
"loss": 0.1089,
"step": 31
},
{
"epoch": 0.20678513731825526,
"grad_norm": 0.8988919854164124,
"learning_rate": 5.161290322580646e-06,
"loss": 0.1324,
"step": 32
},
{
"epoch": 0.21324717285945072,
"grad_norm": 0.7514846324920654,
"learning_rate": 5.322580645161291e-06,
"loss": 0.1045,
"step": 33
},
{
"epoch": 0.2197092084006462,
"grad_norm": 0.8767650723457336,
"learning_rate": 5.483870967741935e-06,
"loss": 0.1188,
"step": 34
},
{
"epoch": 0.22617124394184168,
"grad_norm": 0.9772207736968994,
"learning_rate": 5.645161290322582e-06,
"loss": 0.0941,
"step": 35
},
{
"epoch": 0.23263327948303716,
"grad_norm": 1.004211664199829,
"learning_rate": 5.806451612903226e-06,
"loss": 0.1063,
"step": 36
},
{
"epoch": 0.23909531502423265,
"grad_norm": 0.753398597240448,
"learning_rate": 5.967741935483872e-06,
"loss": 0.0908,
"step": 37
},
{
"epoch": 0.2455573505654281,
"grad_norm": 0.8244940042495728,
"learning_rate": 6.129032258064517e-06,
"loss": 0.1037,
"step": 38
},
{
"epoch": 0.2520193861066236,
"grad_norm": 0.9269906282424927,
"learning_rate": 6.290322580645162e-06,
"loss": 0.121,
"step": 39
},
{
"epoch": 0.25848142164781907,
"grad_norm": 0.8812072277069092,
"learning_rate": 6.451612903225806e-06,
"loss": 0.1229,
"step": 40
},
{
"epoch": 0.2649434571890145,
"grad_norm": 1.0441511869430542,
"learning_rate": 6.612903225806452e-06,
"loss": 0.1201,
"step": 41
},
{
"epoch": 0.27140549273021003,
"grad_norm": 0.819930374622345,
"learning_rate": 6.774193548387097e-06,
"loss": 0.1175,
"step": 42
},
{
"epoch": 0.2778675282714055,
"grad_norm": 0.7328965067863464,
"learning_rate": 6.935483870967743e-06,
"loss": 0.0865,
"step": 43
},
{
"epoch": 0.284329563812601,
"grad_norm": 0.7510896325111389,
"learning_rate": 7.096774193548388e-06,
"loss": 0.0874,
"step": 44
},
{
"epoch": 0.29079159935379645,
"grad_norm": 0.8940995335578918,
"learning_rate": 7.258064516129033e-06,
"loss": 0.1069,
"step": 45
},
{
"epoch": 0.2972536348949919,
"grad_norm": 1.0601608753204346,
"learning_rate": 7.4193548387096784e-06,
"loss": 0.1034,
"step": 46
},
{
"epoch": 0.3037156704361874,
"grad_norm": 0.6812165379524231,
"learning_rate": 7.580645161290323e-06,
"loss": 0.083,
"step": 47
},
{
"epoch": 0.31017770597738287,
"grad_norm": 0.7888991236686707,
"learning_rate": 7.741935483870968e-06,
"loss": 0.1021,
"step": 48
},
{
"epoch": 0.3166397415185784,
"grad_norm": 0.7519361972808838,
"learning_rate": 7.903225806451613e-06,
"loss": 0.095,
"step": 49
},
{
"epoch": 0.32310177705977383,
"grad_norm": 0.8417134881019592,
"learning_rate": 8.064516129032258e-06,
"loss": 0.0904,
"step": 50
},
{
"epoch": 0.3295638126009693,
"grad_norm": 0.7418419122695923,
"learning_rate": 8.225806451612904e-06,
"loss": 0.1081,
"step": 51
},
{
"epoch": 0.3360258481421648,
"grad_norm": 0.7588947415351868,
"learning_rate": 8.387096774193549e-06,
"loss": 0.106,
"step": 52
},
{
"epoch": 0.34248788368336025,
"grad_norm": 0.5819141864776611,
"learning_rate": 8.548387096774194e-06,
"loss": 0.0767,
"step": 53
},
{
"epoch": 0.34894991922455576,
"grad_norm": 0.7023757100105286,
"learning_rate": 8.70967741935484e-06,
"loss": 0.1041,
"step": 54
},
{
"epoch": 0.3554119547657512,
"grad_norm": 0.6532196402549744,
"learning_rate": 8.870967741935484e-06,
"loss": 0.0815,
"step": 55
},
{
"epoch": 0.36187399030694667,
"grad_norm": 0.7267298102378845,
"learning_rate": 9.03225806451613e-06,
"loss": 0.0994,
"step": 56
},
{
"epoch": 0.3683360258481422,
"grad_norm": 0.6799736618995667,
"learning_rate": 9.193548387096775e-06,
"loss": 0.0867,
"step": 57
},
{
"epoch": 0.37479806138933763,
"grad_norm": 0.6375831961631775,
"learning_rate": 9.35483870967742e-06,
"loss": 0.082,
"step": 58
},
{
"epoch": 0.38126009693053314,
"grad_norm": 0.5718384385108948,
"learning_rate": 9.516129032258065e-06,
"loss": 0.0769,
"step": 59
},
{
"epoch": 0.3877221324717286,
"grad_norm": 0.682659924030304,
"learning_rate": 9.67741935483871e-06,
"loss": 0.0744,
"step": 60
},
{
"epoch": 0.39418416801292405,
"grad_norm": 0.7422767877578735,
"learning_rate": 9.838709677419356e-06,
"loss": 0.0838,
"step": 61
},
{
"epoch": 0.40064620355411956,
"grad_norm": 0.6545633673667908,
"learning_rate": 1e-05,
"loss": 0.083,
"step": 62
},
{
"epoch": 0.407108239095315,
"grad_norm": 0.6176275014877319,
"learning_rate": 9.99991960684721e-06,
"loss": 0.0819,
"step": 63
},
{
"epoch": 0.4135702746365105,
"grad_norm": 0.6768634915351868,
"learning_rate": 9.999678429974063e-06,
"loss": 0.0742,
"step": 64
},
{
"epoch": 0.420032310177706,
"grad_norm": 0.727836549282074,
"learning_rate": 9.999276477136145e-06,
"loss": 0.0915,
"step": 65
},
{
"epoch": 0.42649434571890144,
"grad_norm": 0.6889824867248535,
"learning_rate": 9.998713761259157e-06,
"loss": 0.0805,
"step": 66
},
{
"epoch": 0.43295638126009695,
"grad_norm": 0.7547912001609802,
"learning_rate": 9.997990300438505e-06,
"loss": 0.0852,
"step": 67
},
{
"epoch": 0.4394184168012924,
"grad_norm": 0.5975009202957153,
"learning_rate": 9.997106117938704e-06,
"loss": 0.0818,
"step": 68
},
{
"epoch": 0.4458804523424879,
"grad_norm": 1.1194217205047607,
"learning_rate": 9.996061242192645e-06,
"loss": 0.0961,
"step": 69
},
{
"epoch": 0.45234248788368336,
"grad_norm": 0.6122073531150818,
"learning_rate": 9.994855706800666e-06,
"loss": 0.0714,
"step": 70
},
{
"epoch": 0.4588045234248788,
"grad_norm": 19.22555923461914,
"learning_rate": 9.993489550529486e-06,
"loss": 0.0691,
"step": 71
},
{
"epoch": 0.46526655896607433,
"grad_norm": 0.468181848526001,
"learning_rate": 9.991962817310947e-06,
"loss": 0.0661,
"step": 72
},
{
"epoch": 0.4717285945072698,
"grad_norm": 13.211414337158203,
"learning_rate": 9.990275556240612e-06,
"loss": 0.0849,
"step": 73
},
{
"epoch": 0.4781906300484653,
"grad_norm": 0.6043105125427246,
"learning_rate": 9.98842782157617e-06,
"loss": 0.0803,
"step": 74
},
{
"epoch": 0.48465266558966075,
"grad_norm": 0.6836904883384705,
"learning_rate": 9.986419672735712e-06,
"loss": 0.0984,
"step": 75
},
{
"epoch": 0.4911147011308562,
"grad_norm": 0.7549227476119995,
"learning_rate": 9.9842511742958e-06,
"loss": 0.0682,
"step": 76
},
{
"epoch": 0.4975767366720517,
"grad_norm": 0.671994149684906,
"learning_rate": 9.981922395989409e-06,
"loss": 0.07,
"step": 77
},
{
"epoch": 0.5040387722132472,
"grad_norm": 0.8292804956436157,
"learning_rate": 9.97943341270367e-06,
"loss": 0.0918,
"step": 78
},
{
"epoch": 0.5105008077544426,
"grad_norm": 0.5530771017074585,
"learning_rate": 9.976784304477467e-06,
"loss": 0.0768,
"step": 79
},
{
"epoch": 0.5169628432956381,
"grad_norm": 0.5503768920898438,
"learning_rate": 9.973975156498866e-06,
"loss": 0.0923,
"step": 80
},
{
"epoch": 0.5234248788368336,
"grad_norm": 0.46011969447135925,
"learning_rate": 9.971006059102369e-06,
"loss": 0.063,
"step": 81
},
{
"epoch": 0.529886914378029,
"grad_norm": 0.5449568033218384,
"learning_rate": 9.96787710776602e-06,
"loss": 0.0667,
"step": 82
},
{
"epoch": 0.5363489499192245,
"grad_norm": 0.5621317028999329,
"learning_rate": 9.964588403108324e-06,
"loss": 0.0568,
"step": 83
},
{
"epoch": 0.5428109854604201,
"grad_norm": 0.4867885410785675,
"learning_rate": 9.961140050885014e-06,
"loss": 0.0681,
"step": 84
},
{
"epoch": 0.5492730210016155,
"grad_norm": 0.5386512279510498,
"learning_rate": 9.957532161985654e-06,
"loss": 0.0711,
"step": 85
},
{
"epoch": 0.555735056542811,
"grad_norm": 0.4690256416797638,
"learning_rate": 9.95376485243007e-06,
"loss": 0.0667,
"step": 86
},
{
"epoch": 0.5621970920840065,
"grad_norm": 0.5720282793045044,
"learning_rate": 9.949838243364614e-06,
"loss": 0.0939,
"step": 87
},
{
"epoch": 0.568659127625202,
"grad_norm": 0.6215516924858093,
"learning_rate": 9.945752461058286e-06,
"loss": 0.0691,
"step": 88
},
{
"epoch": 0.5751211631663974,
"grad_norm": 0.4669437110424042,
"learning_rate": 9.941507636898651e-06,
"loss": 0.0685,
"step": 89
},
{
"epoch": 0.5815831987075929,
"grad_norm": 0.4649712145328522,
"learning_rate": 9.937103907387626e-06,
"loss": 0.074,
"step": 90
},
{
"epoch": 0.5880452342487884,
"grad_norm": 0.7192949056625366,
"learning_rate": 9.932541414137096e-06,
"loss": 0.0965,
"step": 91
},
{
"epoch": 0.5945072697899838,
"grad_norm": 13.985039710998535,
"learning_rate": 9.927820303864342e-06,
"loss": 0.0651,
"step": 92
},
{
"epoch": 0.6009693053311793,
"grad_norm": 0.5609593987464905,
"learning_rate": 9.922940728387345e-06,
"loss": 0.0796,
"step": 93
},
{
"epoch": 0.6074313408723748,
"grad_norm": 1.4020167589187622,
"learning_rate": 9.917902844619885e-06,
"loss": 0.0657,
"step": 94
},
{
"epoch": 0.6138933764135702,
"grad_norm": 0.5554618239402771,
"learning_rate": 9.912706814566504e-06,
"loss": 0.0676,
"step": 95
},
{
"epoch": 0.6203554119547657,
"grad_norm": 0.6547884345054626,
"learning_rate": 9.907352805317301e-06,
"loss": 0.109,
"step": 96
},
{
"epoch": 0.6268174474959612,
"grad_norm": 0.6062126755714417,
"learning_rate": 9.901840989042547e-06,
"loss": 0.0864,
"step": 97
},
{
"epoch": 0.6332794830371568,
"grad_norm": 0.4889056980609894,
"learning_rate": 9.896171542987158e-06,
"loss": 0.0632,
"step": 98
},
{
"epoch": 0.6397415185783522,
"grad_norm": 0.6889820098876953,
"learning_rate": 9.890344649464992e-06,
"loss": 0.0782,
"step": 99
},
{
"epoch": 0.6462035541195477,
"grad_norm": 1.3493472337722778,
"learning_rate": 9.884360495852984e-06,
"loss": 0.0779,
"step": 100
},
{
"epoch": 0.6526655896607432,
"grad_norm": 0.5208078026771545,
"learning_rate": 9.878219274585125e-06,
"loss": 0.072,
"step": 101
},
{
"epoch": 0.6591276252019386,
"grad_norm": 0.6799489259719849,
"learning_rate": 9.871921183146272e-06,
"loss": 0.0718,
"step": 102
},
{
"epoch": 0.6655896607431341,
"grad_norm": 0.5299189686775208,
"learning_rate": 9.865466424065792e-06,
"loss": 0.0687,
"step": 103
},
{
"epoch": 0.6720516962843296,
"grad_norm": 0.6351456046104431,
"learning_rate": 9.858855204911065e-06,
"loss": 0.0892,
"step": 104
},
{
"epoch": 0.678513731825525,
"grad_norm": 0.567482054233551,
"learning_rate": 9.852087738280784e-06,
"loss": 0.0778,
"step": 105
},
{
"epoch": 0.6849757673667205,
"grad_norm": 0.5811858773231506,
"learning_rate": 9.845164241798143e-06,
"loss": 0.0709,
"step": 106
},
{
"epoch": 0.691437802907916,
"grad_norm": 0.600745677947998,
"learning_rate": 9.838084938103832e-06,
"loss": 0.0721,
"step": 107
},
{
"epoch": 0.6978998384491115,
"grad_norm": 0.5404493808746338,
"learning_rate": 9.830850054848859e-06,
"loss": 0.0651,
"step": 108
},
{
"epoch": 0.7043618739903069,
"grad_norm": 0.6496622562408447,
"learning_rate": 9.823459824687262e-06,
"loss": 0.0794,
"step": 109
},
{
"epoch": 0.7108239095315024,
"grad_norm": 0.5033782124519348,
"learning_rate": 9.815914485268598e-06,
"loss": 0.0866,
"step": 110
},
{
"epoch": 0.7172859450726979,
"grad_norm": 0.48423922061920166,
"learning_rate": 9.808214279230317e-06,
"loss": 0.0681,
"step": 111
},
{
"epoch": 0.7237479806138933,
"grad_norm": 0.5603338479995728,
"learning_rate": 9.800359454189955e-06,
"loss": 0.0894,
"step": 112
},
{
"epoch": 0.7302100161550888,
"grad_norm": 0.554610550403595,
"learning_rate": 9.792350262737173e-06,
"loss": 0.0735,
"step": 113
},
{
"epoch": 0.7366720516962844,
"grad_norm": 0.4603743553161621,
"learning_rate": 9.784186962425633e-06,
"loss": 0.0546,
"step": 114
},
{
"epoch": 0.7431340872374798,
"grad_norm": 0.4438991844654083,
"learning_rate": 9.775869815764713e-06,
"loss": 0.0571,
"step": 115
},
{
"epoch": 0.7495961227786753,
"grad_norm": 0.6239739060401917,
"learning_rate": 9.76739909021107e-06,
"loss": 0.0997,
"step": 116
},
{
"epoch": 0.7560581583198708,
"grad_norm": 0.47505640983581543,
"learning_rate": 9.758775058160037e-06,
"loss": 0.0755,
"step": 117
},
{
"epoch": 0.7625201938610663,
"grad_norm": 0.5345163345336914,
"learning_rate": 9.749997996936866e-06,
"loss": 0.0754,
"step": 118
},
{
"epoch": 0.7689822294022617,
"grad_norm": 0.4881122410297394,
"learning_rate": 9.741068188787806e-06,
"loss": 0.0787,
"step": 119
},
{
"epoch": 0.7754442649434572,
"grad_norm": 0.6172494888305664,
"learning_rate": 9.731985920871028e-06,
"loss": 0.0737,
"step": 120
},
{
"epoch": 0.7819063004846527,
"grad_norm": 0.4486066401004791,
"learning_rate": 9.722751485247393e-06,
"loss": 0.0702,
"step": 121
},
{
"epoch": 0.7883683360258481,
"grad_norm": 1.1322523355484009,
"learning_rate": 9.713365178871061e-06,
"loss": 0.0766,
"step": 122
},
{
"epoch": 0.7948303715670436,
"grad_norm": 0.44906798005104065,
"learning_rate": 9.703827303579936e-06,
"loss": 0.0635,
"step": 123
},
{
"epoch": 0.8012924071082391,
"grad_norm": 0.5776321887969971,
"learning_rate": 9.694138166085964e-06,
"loss": 0.0733,
"step": 124
},
{
"epoch": 0.8077544426494345,
"grad_norm": 0.45236676931381226,
"learning_rate": 9.684298077965269e-06,
"loss": 0.0653,
"step": 125
},
{
"epoch": 0.81421647819063,
"grad_norm": 0.4732246696949005,
"learning_rate": 9.674307355648136e-06,
"loss": 0.0647,
"step": 126
},
{
"epoch": 0.8206785137318255,
"grad_norm": 0.5663241744041443,
"learning_rate": 9.664166320408828e-06,
"loss": 0.0634,
"step": 127
},
{
"epoch": 0.827140549273021,
"grad_norm": 0.6197142004966736,
"learning_rate": 9.653875298355264e-06,
"loss": 0.0854,
"step": 128
},
{
"epoch": 0.8336025848142165,
"grad_norm": 0.6459212899208069,
"learning_rate": 9.64343462041853e-06,
"loss": 0.0744,
"step": 129
},
{
"epoch": 0.840064620355412,
"grad_norm": 0.6052826046943665,
"learning_rate": 9.63284462234223e-06,
"loss": 0.0882,
"step": 130
},
{
"epoch": 0.8465266558966075,
"grad_norm": 0.6480473875999451,
"learning_rate": 9.622105644671698e-06,
"loss": 0.0721,
"step": 131
},
{
"epoch": 0.8529886914378029,
"grad_norm": 0.5728505849838257,
"learning_rate": 9.611218032743044e-06,
"loss": 0.0929,
"step": 132
},
{
"epoch": 0.8594507269789984,
"grad_norm": 0.5130248665809631,
"learning_rate": 9.600182136672048e-06,
"loss": 0.0738,
"step": 133
},
{
"epoch": 0.8659127625201939,
"grad_norm": 0.4673842787742615,
"learning_rate": 9.5889983113429e-06,
"loss": 0.0631,
"step": 134
},
{
"epoch": 0.8723747980613893,
"grad_norm": 0.4138599932193756,
"learning_rate": 9.57766691639679e-06,
"loss": 0.0639,
"step": 135
},
{
"epoch": 0.8788368336025848,
"grad_norm": 0.520550549030304,
"learning_rate": 9.56618831622035e-06,
"loss": 0.0615,
"step": 136
},
{
"epoch": 0.8852988691437803,
"grad_norm": 0.5906829833984375,
"learning_rate": 9.554562879933917e-06,
"loss": 0.0767,
"step": 137
},
{
"epoch": 0.8917609046849758,
"grad_norm": 0.528221070766449,
"learning_rate": 9.542790981379683e-06,
"loss": 0.0668,
"step": 138
},
{
"epoch": 0.8982229402261712,
"grad_norm": 0.5368471741676331,
"learning_rate": 9.530872999109665e-06,
"loss": 0.068,
"step": 139
},
{
"epoch": 0.9046849757673667,
"grad_norm": 0.5090873837471008,
"learning_rate": 9.51880931637353e-06,
"loss": 0.0615,
"step": 140
},
{
"epoch": 0.9111470113085622,
"grad_norm": 0.4326649308204651,
"learning_rate": 9.506600321106273e-06,
"loss": 0.0612,
"step": 141
},
{
"epoch": 0.9176090468497576,
"grad_norm": 0.5169522762298584,
"learning_rate": 9.494246405915743e-06,
"loss": 0.0707,
"step": 142
},
{
"epoch": 0.9240710823909531,
"grad_norm": 0.5026911497116089,
"learning_rate": 9.481747968070018e-06,
"loss": 0.064,
"step": 143
},
{
"epoch": 0.9305331179321487,
"grad_norm": 0.45077645778656006,
"learning_rate": 9.469105409484628e-06,
"loss": 0.053,
"step": 144
},
{
"epoch": 0.9369951534733441,
"grad_norm": 0.46447330713272095,
"learning_rate": 9.456319136709628e-06,
"loss": 0.0679,
"step": 145
},
{
"epoch": 0.9434571890145396,
"grad_norm": 0.43717750906944275,
"learning_rate": 9.443389560916532e-06,
"loss": 0.0734,
"step": 146
},
{
"epoch": 0.9499192245557351,
"grad_norm": 0.4999764561653137,
"learning_rate": 9.430317097885082e-06,
"loss": 0.0764,
"step": 147
},
{
"epoch": 0.9563812600969306,
"grad_norm": 0.5041486024856567,
"learning_rate": 9.417102167989888e-06,
"loss": 0.0776,
"step": 148
},
{
"epoch": 0.962843295638126,
"grad_norm": 0.5824252367019653,
"learning_rate": 9.403745196186904e-06,
"loss": 0.0871,
"step": 149
},
{
"epoch": 0.9693053311793215,
"grad_norm": 0.44613441824913025,
"learning_rate": 9.390246611999754e-06,
"loss": 0.0753,
"step": 150
},
{
"epoch": 0.975767366720517,
"grad_norm": 0.5553467273712158,
"learning_rate": 9.376606849505939e-06,
"loss": 0.0788,
"step": 151
},
{
"epoch": 0.9822294022617124,
"grad_norm": 0.47406890988349915,
"learning_rate": 9.362826347322857e-06,
"loss": 0.0612,
"step": 152
},
{
"epoch": 0.9886914378029079,
"grad_norm": 0.36964505910873413,
"learning_rate": 9.348905548593722e-06,
"loss": 0.0683,
"step": 153
},
{
"epoch": 0.9951534733441034,
"grad_norm": 0.4598817229270935,
"learning_rate": 9.334844900973292e-06,
"loss": 0.0718,
"step": 154
},
{
"epoch": 1.0,
"grad_norm": 0.4598817229270935,
"learning_rate": 9.320644856613482e-06,
"loss": 0.0629,
"step": 155
},
{
"epoch": 1.0064620355411955,
"grad_norm": 0.4689626395702362,
"learning_rate": 9.306305872148826e-06,
"loss": 0.0471,
"step": 156
},
{
"epoch": 1.012924071082391,
"grad_norm": 0.37304380536079407,
"learning_rate": 9.291828408681796e-06,
"loss": 0.0471,
"step": 157
},
{
"epoch": 1.0193861066235865,
"grad_norm": 0.45355430245399475,
"learning_rate": 9.277212931767958e-06,
"loss": 0.0497,
"step": 158
},
{
"epoch": 1.0258481421647818,
"grad_norm": 0.4865805506706238,
"learning_rate": 9.262459911401025e-06,
"loss": 0.0662,
"step": 159
},
{
"epoch": 1.0323101777059773,
"grad_norm": 0.35557398200035095,
"learning_rate": 9.247569821997724e-06,
"loss": 0.0397,
"step": 160
},
{
"epoch": 1.0387722132471728,
"grad_norm": 0.34070008993148804,
"learning_rate": 9.232543142382546e-06,
"loss": 0.0614,
"step": 161
},
{
"epoch": 1.0452342487883683,
"grad_norm": 0.4690254330635071,
"learning_rate": 9.217380355772353e-06,
"loss": 0.0484,
"step": 162
},
{
"epoch": 1.0516962843295639,
"grad_norm": 0.3816601634025574,
"learning_rate": 9.202081949760833e-06,
"loss": 0.045,
"step": 163
},
{
"epoch": 1.0581583198707594,
"grad_norm": 0.7278222441673279,
"learning_rate": 9.186648416302823e-06,
"loss": 0.0497,
"step": 164
},
{
"epoch": 1.0646203554119547,
"grad_norm": 0.3829418420791626,
"learning_rate": 9.171080251698488e-06,
"loss": 0.0371,
"step": 165
},
{
"epoch": 1.0710823909531502,
"grad_norm": 0.33008721470832825,
"learning_rate": 9.155377956577363e-06,
"loss": 0.0561,
"step": 166
},
{
"epoch": 1.0775444264943457,
"grad_norm": 0.47766488790512085,
"learning_rate": 9.13954203588225e-06,
"loss": 0.0528,
"step": 167
},
{
"epoch": 1.0840064620355412,
"grad_norm": 0.3429339528083801,
"learning_rate": 9.123572998852988e-06,
"loss": 0.0524,
"step": 168
},
{
"epoch": 1.0904684975767367,
"grad_norm": 0.4901416003704071,
"learning_rate": 9.107471359010069e-06,
"loss": 0.0512,
"step": 169
},
{
"epoch": 1.0969305331179322,
"grad_norm": 0.4330100119113922,
"learning_rate": 9.091237634138133e-06,
"loss": 0.054,
"step": 170
},
{
"epoch": 1.1033925686591277,
"grad_norm": 0.3829297721385956,
"learning_rate": 9.074872346269305e-06,
"loss": 0.0414,
"step": 171
},
{
"epoch": 1.109854604200323,
"grad_norm": 0.3282840847969055,
"learning_rate": 9.058376021666424e-06,
"loss": 0.0476,
"step": 172
},
{
"epoch": 1.1163166397415185,
"grad_norm": 0.39873006939888,
"learning_rate": 9.041749190806105e-06,
"loss": 0.0514,
"step": 173
},
{
"epoch": 1.122778675282714,
"grad_norm": 0.44557517766952515,
"learning_rate": 9.024992388361691e-06,
"loss": 0.0435,
"step": 174
},
{
"epoch": 1.1292407108239095,
"grad_norm": 3.866434097290039,
"learning_rate": 9.008106153186055e-06,
"loss": 0.0497,
"step": 175
},
{
"epoch": 1.135702746365105,
"grad_norm": 0.3858250677585602,
"learning_rate": 8.991091028294268e-06,
"loss": 0.0464,
"step": 176
},
{
"epoch": 1.1421647819063006,
"grad_norm": 0.47651711106300354,
"learning_rate": 8.973947560846146e-06,
"loss": 0.0376,
"step": 177
},
{
"epoch": 1.148626817447496,
"grad_norm": 0.7432534694671631,
"learning_rate": 8.956676302128646e-06,
"loss": 0.0436,
"step": 178
},
{
"epoch": 1.1550888529886914,
"grad_norm": 0.523420512676239,
"learning_rate": 8.939277807538147e-06,
"loss": 0.0619,
"step": 179
},
{
"epoch": 1.1615508885298869,
"grad_norm": 0.463724821805954,
"learning_rate": 8.921752636562582e-06,
"loss": 0.0455,
"step": 180
},
{
"epoch": 1.1680129240710824,
"grad_norm": 0.45723769068717957,
"learning_rate": 8.90410135276345e-06,
"loss": 0.0421,
"step": 181
},
{
"epoch": 1.1744749596122779,
"grad_norm": 0.38577011227607727,
"learning_rate": 8.886324523757692e-06,
"loss": 0.0531,
"step": 182
},
{
"epoch": 1.1809369951534734,
"grad_norm": 0.5292965173721313,
"learning_rate": 8.868422721199442e-06,
"loss": 0.0635,
"step": 183
},
{
"epoch": 1.187399030694669,
"grad_norm": 0.3577275574207306,
"learning_rate": 8.850396520761636e-06,
"loss": 0.0458,
"step": 184
},
{
"epoch": 1.1938610662358644,
"grad_norm": 0.4251251518726349,
"learning_rate": 8.832246502117512e-06,
"loss": 0.0363,
"step": 185
},
{
"epoch": 1.2003231017770597,
"grad_norm": 3.846379041671753,
"learning_rate": 8.813973248921958e-06,
"loss": 0.0512,
"step": 186
},
{
"epoch": 1.2067851373182552,
"grad_norm": 0.41917675733566284,
"learning_rate": 8.795577348792748e-06,
"loss": 0.0421,
"step": 187
},
{
"epoch": 1.2132471728594507,
"grad_norm": 0.5150820016860962,
"learning_rate": 8.777059393291645e-06,
"loss": 0.0413,
"step": 188
},
{
"epoch": 1.2197092084006462,
"grad_norm": 0.3357498049736023,
"learning_rate": 8.75841997790538e-06,
"loss": 0.0422,
"step": 189
},
{
"epoch": 1.2261712439418417,
"grad_norm": 0.37925323843955994,
"learning_rate": 8.739659702026502e-06,
"loss": 0.0448,
"step": 190
},
{
"epoch": 1.2326332794830372,
"grad_norm": 0.8641751408576965,
"learning_rate": 8.7207791689341e-06,
"loss": 0.0599,
"step": 191
},
{
"epoch": 1.2390953150242328,
"grad_norm": 0.4052288234233856,
"learning_rate": 8.701778985774405e-06,
"loss": 0.0503,
"step": 192
},
{
"epoch": 1.245557350565428,
"grad_norm": 0.45619818568229675,
"learning_rate": 8.68265976354127e-06,
"loss": 0.0483,
"step": 193
},
{
"epoch": 1.2520193861066236,
"grad_norm": 0.35979220271110535,
"learning_rate": 8.663422117056519e-06,
"loss": 0.0413,
"step": 194
},
{
"epoch": 1.258481421647819,
"grad_norm": 0.3667192757129669,
"learning_rate": 8.644066664950169e-06,
"loss": 0.0383,
"step": 195
},
{
"epoch": 1.2649434571890146,
"grad_norm": 0.3911844491958618,
"learning_rate": 8.62459402964055e-06,
"loss": 0.0461,
"step": 196
},
{
"epoch": 1.27140549273021,
"grad_norm": 0.5011119842529297,
"learning_rate": 8.605004837314277e-06,
"loss": 0.0492,
"step": 197
},
{
"epoch": 1.2778675282714054,
"grad_norm": 0.562304675579071,
"learning_rate": 8.585299717906127e-06,
"loss": 0.0668,
"step": 198
},
{
"epoch": 1.284329563812601,
"grad_norm": 0.5580583214759827,
"learning_rate": 8.565479305078767e-06,
"loss": 0.0352,
"step": 199
},
{
"epoch": 1.2907915993537964,
"grad_norm": 0.5646708011627197,
"learning_rate": 8.54554423620239e-06,
"loss": 0.0489,
"step": 200
},
{
"epoch": 1.297253634894992,
"grad_norm": 0.5223414897918701,
"learning_rate": 8.525495152334211e-06,
"loss": 0.0447,
"step": 201
},
{
"epoch": 1.3037156704361874,
"grad_norm": 0.4284669756889343,
"learning_rate": 8.505332698197853e-06,
"loss": 0.0518,
"step": 202
},
{
"epoch": 1.310177705977383,
"grad_norm": 0.4862108826637268,
"learning_rate": 8.48505752216262e-06,
"loss": 0.0389,
"step": 203
},
{
"epoch": 1.3166397415185784,
"grad_norm": 0.4296407103538513,
"learning_rate": 8.464670276222642e-06,
"loss": 0.0509,
"step": 204
},
{
"epoch": 1.3231017770597737,
"grad_norm": 0.40503671765327454,
"learning_rate": 8.444171615975909e-06,
"loss": 0.0493,
"step": 205
},
{
"epoch": 1.3295638126009692,
"grad_norm": 0.39369431138038635,
"learning_rate": 8.423562200603192e-06,
"loss": 0.0594,
"step": 206
},
{
"epoch": 1.3360258481421647,
"grad_norm": 0.5622990727424622,
"learning_rate": 8.402842692846842e-06,
"loss": 0.039,
"step": 207
},
{
"epoch": 1.3424878836833603,
"grad_norm": 0.4047924280166626,
"learning_rate": 8.38201375898948e-06,
"loss": 0.0363,
"step": 208
},
{
"epoch": 1.3489499192245558,
"grad_norm": 0.4414857029914856,
"learning_rate": 8.361076068832574e-06,
"loss": 0.0582,
"step": 209
},
{
"epoch": 1.3554119547657513,
"grad_norm": 0.43151915073394775,
"learning_rate": 8.340030295674887e-06,
"loss": 0.0433,
"step": 210
},
{
"epoch": 1.3618739903069468,
"grad_norm": 0.49384137988090515,
"learning_rate": 8.31887711629085e-06,
"loss": 0.0514,
"step": 211
},
{
"epoch": 1.368336025848142,
"grad_norm": 0.4050130844116211,
"learning_rate": 8.29761721090877e-06,
"loss": 0.0448,
"step": 212
},
{
"epoch": 1.3747980613893376,
"grad_norm": 0.351788192987442,
"learning_rate": 8.276251263188976e-06,
"loss": 0.0415,
"step": 213
},
{
"epoch": 1.381260096930533,
"grad_norm": 0.3712752163410187,
"learning_rate": 8.254779960201831e-06,
"loss": 0.0428,
"step": 214
},
{
"epoch": 1.3877221324717286,
"grad_norm": 0.408925861120224,
"learning_rate": 8.23320399240563e-06,
"loss": 0.0527,
"step": 215
},
{
"epoch": 1.394184168012924,
"grad_norm": 0.3616960942745209,
"learning_rate": 8.2115240536244e-06,
"loss": 0.0323,
"step": 216
},
{
"epoch": 1.4006462035541196,
"grad_norm": 0.3473566472530365,
"learning_rate": 8.1897408410256e-06,
"loss": 0.0463,
"step": 217
},
{
"epoch": 1.4071082390953151,
"grad_norm": 0.477464497089386,
"learning_rate": 8.16785505509768e-06,
"loss": 0.0521,
"step": 218
},
{
"epoch": 1.4135702746365104,
"grad_norm": 0.6549597978591919,
"learning_rate": 8.145867399627575e-06,
"loss": 0.0467,
"step": 219
},
{
"epoch": 1.420032310177706,
"grad_norm": 0.46223175525665283,
"learning_rate": 8.123778581678064e-06,
"loss": 0.0444,
"step": 220
},
{
"epoch": 1.4264943457189014,
"grad_norm": 0.5096275210380554,
"learning_rate": 8.10158931156503e-06,
"loss": 0.0451,
"step": 221
},
{
"epoch": 1.432956381260097,
"grad_norm": 0.37000322341918945,
"learning_rate": 8.079300302834632e-06,
"loss": 0.051,
"step": 222
},
{
"epoch": 1.4394184168012925,
"grad_norm": 0.671576738357544,
"learning_rate": 8.056912272240338e-06,
"loss": 0.0466,
"step": 223
},
{
"epoch": 1.445880452342488,
"grad_norm": 0.35391512513160706,
"learning_rate": 8.034425939719896e-06,
"loss": 0.0528,
"step": 224
},
{
"epoch": 1.4523424878836835,
"grad_norm": 0.35577550530433655,
"learning_rate": 8.011842028372175e-06,
"loss": 0.047,
"step": 225
},
{
"epoch": 1.4588045234248788,
"grad_norm": 0.4128228425979614,
"learning_rate": 7.989161264433904e-06,
"loss": 0.04,
"step": 226
},
{
"epoch": 1.4652665589660743,
"grad_norm": 3.7340619564056396,
"learning_rate": 7.966384377256335e-06,
"loss": 0.0509,
"step": 227
},
{
"epoch": 1.4717285945072698,
"grad_norm": 0.4408370852470398,
"learning_rate": 7.943512099281776e-06,
"loss": 0.0533,
"step": 228
},
{
"epoch": 1.4781906300484653,
"grad_norm": 0.3726482093334198,
"learning_rate": 7.92054516602004e-06,
"loss": 0.0395,
"step": 229
},
{
"epoch": 1.4846526655896608,
"grad_norm": 0.40403544902801514,
"learning_rate": 7.897484316024799e-06,
"loss": 0.0582,
"step": 230
},
{
"epoch": 1.491114701130856,
"grad_norm": 0.4779585301876068,
"learning_rate": 7.874330290869829e-06,
"loss": 0.0422,
"step": 231
},
{
"epoch": 1.4975767366720518,
"grad_norm": 0.46821513772010803,
"learning_rate": 7.85108383512516e-06,
"loss": 0.0551,
"step": 232
},
{
"epoch": 1.504038772213247,
"grad_norm": 0.3851401209831238,
"learning_rate": 7.827745696333139e-06,
"loss": 0.0542,
"step": 233
},
{
"epoch": 1.5105008077544426,
"grad_norm": 0.3976247310638428,
"learning_rate": 7.804316624984391e-06,
"loss": 0.0444,
"step": 234
},
{
"epoch": 1.5169628432956381,
"grad_norm": 0.3913721740245819,
"learning_rate": 7.780797374493683e-06,
"loss": 0.0421,
"step": 235
},
{
"epoch": 1.5234248788368336,
"grad_norm": 0.38424035906791687,
"learning_rate": 7.757188701175688e-06,
"loss": 0.057,
"step": 236
},
{
"epoch": 1.5298869143780292,
"grad_norm": 0.36402031779289246,
"learning_rate": 7.733491364220686e-06,
"loss": 0.0599,
"step": 237
},
{
"epoch": 1.5363489499192244,
"grad_norm": 0.5822838544845581,
"learning_rate": 7.709706125670124e-06,
"loss": 0.0418,
"step": 238
},
{
"epoch": 1.5428109854604202,
"grad_norm": 0.38915562629699707,
"learning_rate": 7.685833750392131e-06,
"loss": 0.0462,
"step": 239
},
{
"epoch": 1.5492730210016155,
"grad_norm": 0.41611573100090027,
"learning_rate": 7.661875006056914e-06,
"loss": 0.0363,
"step": 240
},
{
"epoch": 1.555735056542811,
"grad_norm": 0.3718394637107849,
"learning_rate": 7.637830663112064e-06,
"loss": 0.0562,
"step": 241
},
{
"epoch": 1.5621970920840065,
"grad_norm": 0.5133814811706543,
"learning_rate": 7.613701494757803e-06,
"loss": 0.0566,
"step": 242
},
{
"epoch": 1.568659127625202,
"grad_norm": 0.496139794588089,
"learning_rate": 7.589488276922095e-06,
"loss": 0.053,
"step": 243
},
{
"epoch": 1.5751211631663975,
"grad_norm": 0.43612685799598694,
"learning_rate": 7.5651917882357075e-06,
"loss": 0.0371,
"step": 244
},
{
"epoch": 1.5815831987075928,
"grad_norm": 0.47427716851234436,
"learning_rate": 7.540812810007172e-06,
"loss": 0.0652,
"step": 245
},
{
"epoch": 1.5880452342487885,
"grad_norm": 0.48456844687461853,
"learning_rate": 7.516352126197658e-06,
"loss": 0.05,
"step": 246
},
{
"epoch": 1.5945072697899838,
"grad_norm": 0.4757959246635437,
"learning_rate": 7.491810523395762e-06,
"loss": 0.051,
"step": 247
},
{
"epoch": 1.6009693053311793,
"grad_norm": 0.44984814524650574,
"learning_rate": 7.467188790792213e-06,
"loss": 0.0469,
"step": 248
},
{
"epoch": 1.6074313408723748,
"grad_norm": 0.3804328739643097,
"learning_rate": 7.442487720154494e-06,
"loss": 0.0438,
"step": 249
},
{
"epoch": 1.6138933764135701,
"grad_norm": 0.3791749179363251,
"learning_rate": 7.417708105801386e-06,
"loss": 0.0488,
"step": 250
},
{
"epoch": 1.6203554119547658,
"grad_norm": 0.3565092980861664,
"learning_rate": 7.392850744577416e-06,
"loss": 0.0369,
"step": 251
},
{
"epoch": 1.6268174474959611,
"grad_norm": 0.3572937250137329,
"learning_rate": 7.36791643582724e-06,
"loss": 0.0368,
"step": 252
},
{
"epoch": 1.6332794830371569,
"grad_norm": 3.2710509300231934,
"learning_rate": 7.342905981369937e-06,
"loss": 0.061,
"step": 253
},
{
"epoch": 1.6397415185783522,
"grad_norm": 0.4295857548713684,
"learning_rate": 7.31782018547322e-06,
"loss": 0.0428,
"step": 254
},
{
"epoch": 1.6462035541195477,
"grad_norm": 0.40736424922943115,
"learning_rate": 7.2926598548275765e-06,
"loss": 0.0419,
"step": 255
},
{
"epoch": 1.6526655896607432,
"grad_norm": 0.3768783211708069,
"learning_rate": 7.267425798520333e-06,
"loss": 0.0417,
"step": 256
},
{
"epoch": 1.6591276252019385,
"grad_norm": 0.39212289452552795,
"learning_rate": 7.242118828009622e-06,
"loss": 0.0538,
"step": 257
},
{
"epoch": 1.6655896607431342,
"grad_norm": 0.4783058166503906,
"learning_rate": 7.2167397570983075e-06,
"loss": 0.0402,
"step": 258
},
{
"epoch": 1.6720516962843295,
"grad_norm": 0.4783688485622406,
"learning_rate": 7.191289401907796e-06,
"loss": 0.0435,
"step": 259
},
{
"epoch": 1.678513731825525,
"grad_norm": 0.44153332710266113,
"learning_rate": 7.165768580851806e-06,
"loss": 0.0429,
"step": 260
},
{
"epoch": 1.6849757673667205,
"grad_norm": 0.37209975719451904,
"learning_rate": 7.140178114610045e-06,
"loss": 0.0613,
"step": 261
},
{
"epoch": 1.691437802907916,
"grad_norm": 0.4744909703731537,
"learning_rate": 7.114518826101815e-06,
"loss": 0.048,
"step": 262
},
{
"epoch": 1.6978998384491115,
"grad_norm": 0.5942760705947876,
"learning_rate": 7.088791540459562e-06,
"loss": 0.043,
"step": 263
},
{
"epoch": 1.7043618739903068,
"grad_norm": 0.43296632170677185,
"learning_rate": 7.062997085002322e-06,
"loss": 0.0486,
"step": 264
},
{
"epoch": 1.7108239095315025,
"grad_norm": 0.4319445788860321,
"learning_rate": 7.03713628920914e-06,
"loss": 0.0417,
"step": 265
},
{
"epoch": 1.7172859450726978,
"grad_norm": 2.305532932281494,
"learning_rate": 7.011209984692375e-06,
"loss": 0.04,
"step": 266
},
{
"epoch": 1.7237479806138933,
"grad_norm": 0.3852980434894562,
"learning_rate": 6.985219005170973e-06,
"loss": 0.0432,
"step": 267
},
{
"epoch": 1.7302100161550888,
"grad_norm": 0.43154263496398926,
"learning_rate": 6.959164186443648e-06,
"loss": 0.0457,
"step": 268
},
{
"epoch": 1.7366720516962844,
"grad_norm": 0.4496629238128662,
"learning_rate": 6.933046366362011e-06,
"loss": 0.0557,
"step": 269
},
{
"epoch": 1.7431340872374799,
"grad_norm": 0.5219531059265137,
"learning_rate": 6.90686638480362e-06,
"loss": 0.0364,
"step": 270
},
{
"epoch": 1.7495961227786752,
"grad_norm": 0.525452196598053,
"learning_rate": 6.88062508364498e-06,
"loss": 0.0567,
"step": 271
},
{
"epoch": 1.7560581583198709,
"grad_norm": 0.3721238672733307,
"learning_rate": 6.8543233067344625e-06,
"loss": 0.0569,
"step": 272
},
{
"epoch": 1.7625201938610662,
"grad_norm": 0.49607154726982117,
"learning_rate": 6.827961899865178e-06,
"loss": 0.0419,
"step": 273
},
{
"epoch": 1.7689822294022617,
"grad_norm": 0.3797336220741272,
"learning_rate": 6.801541710747767e-06,
"loss": 0.0427,
"step": 274
},
{
"epoch": 1.7754442649434572,
"grad_norm": 0.3607645332813263,
"learning_rate": 6.775063588983153e-06,
"loss": 0.0411,
"step": 275
},
{
"epoch": 1.7819063004846527,
"grad_norm": 0.3796041011810303,
"learning_rate": 6.748528386035209e-06,
"loss": 0.0448,
"step": 276
},
{
"epoch": 1.7883683360258482,
"grad_norm": 0.39435023069381714,
"learning_rate": 6.7219369552033865e-06,
"loss": 0.0429,
"step": 277
},
{
"epoch": 1.7948303715670435,
"grad_norm": 0.6904699802398682,
"learning_rate": 6.695290151595271e-06,
"loss": 0.0507,
"step": 278
},
{
"epoch": 1.8012924071082392,
"grad_norm": 0.45726367831230164,
"learning_rate": 6.668588832099081e-06,
"loss": 0.0378,
"step": 279
},
{
"epoch": 1.8077544426494345,
"grad_norm": 0.31050002574920654,
"learning_rate": 6.6418338553561225e-06,
"loss": 0.0496,
"step": 280
},
{
"epoch": 1.81421647819063,
"grad_norm": 0.42013099789619446,
"learning_rate": 6.615026081733168e-06,
"loss": 0.0467,
"step": 281
},
{
"epoch": 1.8206785137318255,
"grad_norm": 0.3072008490562439,
"learning_rate": 6.5881663732947935e-06,
"loss": 0.0458,
"step": 282
},
{
"epoch": 1.827140549273021,
"grad_norm": 3.501995801925659,
"learning_rate": 6.561255593775656e-06,
"loss": 0.0385,
"step": 283
},
{
"epoch": 1.8336025848142166,
"grad_norm": 0.3589681386947632,
"learning_rate": 6.5342946085527205e-06,
"loss": 0.0368,
"step": 284
},
{
"epoch": 1.8400646203554119,
"grad_norm": 0.405106782913208,
"learning_rate": 6.507284284617427e-06,
"loss": 0.0499,
"step": 285
},
{
"epoch": 1.8465266558966076,
"grad_norm": 0.41938844323158264,
"learning_rate": 6.480225490547821e-06,
"loss": 0.056,
"step": 286
},
{
"epoch": 1.8529886914378029,
"grad_norm": 0.43309280276298523,
"learning_rate": 6.4531190964806005e-06,
"loss": 0.0434,
"step": 287
},
{
"epoch": 1.8594507269789984,
"grad_norm": 0.3927607238292694,
"learning_rate": 6.425965974083164e-06,
"loss": 0.0461,
"step": 288
},
{
"epoch": 1.865912762520194,
"grad_norm": 0.38023263216018677,
"learning_rate": 6.398766996525554e-06,
"loss": 0.0373,
"step": 289
},
{
"epoch": 1.8723747980613892,
"grad_norm": 0.3614664673805237,
"learning_rate": 6.371523038452398e-06,
"loss": 0.0421,
"step": 290
},
{
"epoch": 1.878836833602585,
"grad_norm": 0.40119874477386475,
"learning_rate": 6.344234975954765e-06,
"loss": 0.0433,
"step": 291
},
{
"epoch": 1.8852988691437802,
"grad_norm": 0.5142848491668701,
"learning_rate": 6.316903686542011e-06,
"loss": 0.0411,
"step": 292
},
{
"epoch": 1.891760904684976,
"grad_norm": 0.40000125765800476,
"learning_rate": 6.289530049113543e-06,
"loss": 0.0544,
"step": 293
},
{
"epoch": 1.8982229402261712,
"grad_norm": 3.5423994064331055,
"learning_rate": 6.262114943930566e-06,
"loss": 0.0466,
"step": 294
},
{
"epoch": 1.9046849757673667,
"grad_norm": 0.42096999287605286,
"learning_rate": 6.234659252587782e-06,
"loss": 0.0488,
"step": 295
},
{
"epoch": 1.9111470113085622,
"grad_norm": 0.45695486664772034,
"learning_rate": 6.20716385798502e-06,
"loss": 0.053,
"step": 296
},
{
"epoch": 1.9176090468497575,
"grad_norm": 0.42890581488609314,
"learning_rate": 6.17962964429887e-06,
"loss": 0.0476,
"step": 297
},
{
"epoch": 1.9240710823909533,
"grad_norm": 0.4009229242801666,
"learning_rate": 6.152057496954225e-06,
"loss": 0.0429,
"step": 298
},
{
"epoch": 1.9305331179321485,
"grad_norm": 0.7172302007675171,
"learning_rate": 6.12444830259583e-06,
"loss": 0.0504,
"step": 299
},
{
"epoch": 1.936995153473344,
"grad_norm": 0.4833920896053314,
"learning_rate": 6.096802949059757e-06,
"loss": 0.0538,
"step": 300
},
{
"epoch": 1.9434571890145396,
"grad_norm": 3.44974684715271,
"learning_rate": 6.069122325344857e-06,
"loss": 0.0408,
"step": 301
},
{
"epoch": 1.949919224555735,
"grad_norm": 0.7173548340797424,
"learning_rate": 6.041407321584178e-06,
"loss": 0.0485,
"step": 302
},
{
"epoch": 1.9563812600969306,
"grad_norm": 0.4045400023460388,
"learning_rate": 6.013658829016328e-06,
"loss": 0.0433,
"step": 303
},
{
"epoch": 1.9628432956381259,
"grad_norm": 0.38726726174354553,
"learning_rate": 5.9858777399568325e-06,
"loss": 0.044,
"step": 304
},
{
"epoch": 1.9693053311793216,
"grad_norm": 0.4238860011100769,
"learning_rate": 5.958064947769423e-06,
"loss": 0.0488,
"step": 305
},
{
"epoch": 1.975767366720517,
"grad_norm": 0.41494399309158325,
"learning_rate": 5.930221346837324e-06,
"loss": 0.0444,
"step": 306
},
{
"epoch": 1.9822294022617124,
"grad_norm": 1.2054787874221802,
"learning_rate": 5.902347832534475e-06,
"loss": 0.0639,
"step": 307
},
{
"epoch": 1.988691437802908,
"grad_norm": 0.4015323519706726,
"learning_rate": 5.874445301196761e-06,
"loss": 0.0392,
"step": 308
},
{
"epoch": 1.9951534733441034,
"grad_norm": 0.4661884307861328,
"learning_rate": 5.846514650093162e-06,
"loss": 0.0487,
"step": 309
},
{
"epoch": 2.0,
"grad_norm": 0.43507838249206543,
"learning_rate": 5.818556777396923e-06,
"loss": 0.0566,
"step": 310
},
{
"epoch": 2.0064620355411953,
"grad_norm": 0.52005934715271,
"learning_rate": 5.790572582156654e-06,
"loss": 0.0266,
"step": 311
},
{
"epoch": 2.012924071082391,
"grad_norm": 0.3060389757156372,
"learning_rate": 5.76256296426743e-06,
"loss": 0.0255,
"step": 312
},
{
"epoch": 2.0193861066235863,
"grad_norm": 0.26340675354003906,
"learning_rate": 5.734528824441845e-06,
"loss": 0.0226,
"step": 313
},
{
"epoch": 2.025848142164782,
"grad_norm": 0.29653921723365784,
"learning_rate": 5.706471064181055e-06,
"loss": 0.0246,
"step": 314
},
{
"epoch": 2.0323101777059773,
"grad_norm": 0.2679958939552307,
"learning_rate": 5.678390585745784e-06,
"loss": 0.019,
"step": 315
},
{
"epoch": 2.038772213247173,
"grad_norm": 0.40780436992645264,
"learning_rate": 5.6502882921273084e-06,
"loss": 0.0223,
"step": 316
},
{
"epoch": 2.0452342487883683,
"grad_norm": 4.025135517120361,
"learning_rate": 5.6221650870184215e-06,
"loss": 0.0289,
"step": 317
},
{
"epoch": 2.0516962843295636,
"grad_norm": 0.3368605971336365,
"learning_rate": 5.594021874784376e-06,
"loss": 0.0216,
"step": 318
},
{
"epoch": 2.0581583198707594,
"grad_norm": 0.37941357493400574,
"learning_rate": 5.565859560433792e-06,
"loss": 0.028,
"step": 319
},
{
"epoch": 2.0646203554119547,
"grad_norm": 0.39574962854385376,
"learning_rate": 5.537679049589568e-06,
"loss": 0.0359,
"step": 320
},
{
"epoch": 2.0710823909531504,
"grad_norm": 0.3605610728263855,
"learning_rate": 5.50948124845975e-06,
"loss": 0.0228,
"step": 321
},
{
"epoch": 2.0775444264943457,
"grad_norm": 0.3201454281806946,
"learning_rate": 5.481267063808392e-06,
"loss": 0.0241,
"step": 322
},
{
"epoch": 2.0840064620355414,
"grad_norm": 0.32988935708999634,
"learning_rate": 5.453037402926397e-06,
"loss": 0.0232,
"step": 323
},
{
"epoch": 2.0904684975767367,
"grad_norm": 0.43368834257125854,
"learning_rate": 5.4247931736023385e-06,
"loss": 0.0219,
"step": 324
},
{
"epoch": 2.096930533117932,
"grad_norm": 0.37438321113586426,
"learning_rate": 5.396535284093278e-06,
"loss": 0.0216,
"step": 325
},
{
"epoch": 2.1033925686591277,
"grad_norm": 0.41498661041259766,
"learning_rate": 5.368264643095543e-06,
"loss": 0.0214,
"step": 326
},
{
"epoch": 2.109854604200323,
"grad_norm": 0.43471306562423706,
"learning_rate": 5.3399821597155225e-06,
"loss": 0.0194,
"step": 327
},
{
"epoch": 2.1163166397415187,
"grad_norm": 0.550999104976654,
"learning_rate": 5.3116887434404155e-06,
"loss": 0.0264,
"step": 328
},
{
"epoch": 2.122778675282714,
"grad_norm": 0.4273984730243683,
"learning_rate": 5.283385304109e-06,
"loss": 0.0238,
"step": 329
},
{
"epoch": 2.1292407108239093,
"grad_norm": 0.34016546607017517,
"learning_rate": 5.255072751882363e-06,
"loss": 0.0221,
"step": 330
},
{
"epoch": 2.135702746365105,
"grad_norm": 0.43326249718666077,
"learning_rate": 5.22675199721464e-06,
"loss": 0.0273,
"step": 331
},
{
"epoch": 2.1421647819063003,
"grad_norm": 0.3180527985095978,
"learning_rate": 5.198423950823734e-06,
"loss": 0.0243,
"step": 332
},
{
"epoch": 2.148626817447496,
"grad_norm": 0.28859448432922363,
"learning_rate": 5.170089523662028e-06,
"loss": 0.0272,
"step": 333
},
{
"epoch": 2.1550888529886914,
"grad_norm": 11.164939880371094,
"learning_rate": 5.141749626887101e-06,
"loss": 0.0306,
"step": 334
},
{
"epoch": 2.161550888529887,
"grad_norm": 0.3972926139831543,
"learning_rate": 5.113405171832404e-06,
"loss": 0.0209,
"step": 335
},
{
"epoch": 2.1680129240710824,
"grad_norm": 0.32792770862579346,
"learning_rate": 5.0850570699779875e-06,
"loss": 0.029,
"step": 336
},
{
"epoch": 2.1744749596122777,
"grad_norm": 0.5258669853210449,
"learning_rate": 5.05670623292116e-06,
"loss": 0.0469,
"step": 337
},
{
"epoch": 2.1809369951534734,
"grad_norm": 4.376863479614258,
"learning_rate": 5.028353572347195e-06,
"loss": 0.0329,
"step": 338
},
{
"epoch": 2.1873990306946687,
"grad_norm": 0.38787195086479187,
"learning_rate": 5e-06,
"loss": 0.025,
"step": 339
},
{
"epoch": 2.1938610662358644,
"grad_norm": 0.3971186578273773,
"learning_rate": 4.971646427652806e-06,
"loss": 0.022,
"step": 340
},
{
"epoch": 2.2003231017770597,
"grad_norm": 0.4147299528121948,
"learning_rate": 4.94329376707884e-06,
"loss": 0.0353,
"step": 341
},
{
"epoch": 2.2067851373182554,
"grad_norm": 0.3218820095062256,
"learning_rate": 4.914942930022014e-06,
"loss": 0.0223,
"step": 342
},
{
"epoch": 2.2132471728594507,
"grad_norm": 0.3497336208820343,
"learning_rate": 4.8865948281675976e-06,
"loss": 0.0245,
"step": 343
},
{
"epoch": 2.219709208400646,
"grad_norm": 0.5387941002845764,
"learning_rate": 4.858250373112901e-06,
"loss": 0.0375,
"step": 344
},
{
"epoch": 2.2261712439418417,
"grad_norm": 0.3683622181415558,
"learning_rate": 4.829910476337972e-06,
"loss": 0.0174,
"step": 345
},
{
"epoch": 2.232633279483037,
"grad_norm": 0.40947481989860535,
"learning_rate": 4.801576049176269e-06,
"loss": 0.0211,
"step": 346
},
{
"epoch": 2.2390953150242328,
"grad_norm": 0.38499361276626587,
"learning_rate": 4.773248002785362e-06,
"loss": 0.0229,
"step": 347
},
{
"epoch": 2.245557350565428,
"grad_norm": 0.45877766609191895,
"learning_rate": 4.744927248117639e-06,
"loss": 0.0271,
"step": 348
},
{
"epoch": 2.2520193861066238,
"grad_norm": 0.6704347133636475,
"learning_rate": 4.716614695891002e-06,
"loss": 0.0223,
"step": 349
},
{
"epoch": 2.258481421647819,
"grad_norm": 21.48920440673828,
"learning_rate": 4.688311256559587e-06,
"loss": 0.0235,
"step": 350
},
{
"epoch": 2.2649434571890144,
"grad_norm": 0.33492201566696167,
"learning_rate": 4.66001784028448e-06,
"loss": 0.0269,
"step": 351
},
{
"epoch": 2.27140549273021,
"grad_norm": 0.39123740792274475,
"learning_rate": 4.631735356904458e-06,
"loss": 0.0222,
"step": 352
},
{
"epoch": 2.2778675282714054,
"grad_norm": 0.3031889796257019,
"learning_rate": 4.6034647159067234e-06,
"loss": 0.0225,
"step": 353
},
{
"epoch": 2.284329563812601,
"grad_norm": 0.3864370584487915,
"learning_rate": 4.575206826397662e-06,
"loss": 0.0222,
"step": 354
},
{
"epoch": 2.2907915993537964,
"grad_norm": 0.6137961149215698,
"learning_rate": 4.546962597073607e-06,
"loss": 0.0276,
"step": 355
},
{
"epoch": 2.297253634894992,
"grad_norm": 0.4349140226840973,
"learning_rate": 4.5187329361916095e-06,
"loss": 0.0248,
"step": 356
},
{
"epoch": 2.3037156704361874,
"grad_norm": 0.3713863492012024,
"learning_rate": 4.490518751540251e-06,
"loss": 0.0268,
"step": 357
},
{
"epoch": 2.3101777059773827,
"grad_norm": 0.39676621556282043,
"learning_rate": 4.462320950410432e-06,
"loss": 0.0262,
"step": 358
},
{
"epoch": 2.3166397415185784,
"grad_norm": 0.37068402767181396,
"learning_rate": 4.4341404395662105e-06,
"loss": 0.0211,
"step": 359
},
{
"epoch": 2.3231017770597737,
"grad_norm": 0.29026004672050476,
"learning_rate": 4.405978125215627e-06,
"loss": 0.0208,
"step": 360
},
{
"epoch": 2.3295638126009695,
"grad_norm": 0.46004900336265564,
"learning_rate": 4.377834912981579e-06,
"loss": 0.0267,
"step": 361
},
{
"epoch": 2.3360258481421647,
"grad_norm": 0.38983336091041565,
"learning_rate": 4.3497117078726915e-06,
"loss": 0.026,
"step": 362
},
{
"epoch": 2.3424878836833605,
"grad_norm": 0.485213965177536,
"learning_rate": 4.321609414254217e-06,
"loss": 0.0224,
"step": 363
},
{
"epoch": 2.3489499192245558,
"grad_norm": 0.3934685289859772,
"learning_rate": 4.2935289358189454e-06,
"loss": 0.022,
"step": 364
},
{
"epoch": 2.355411954765751,
"grad_norm": 0.4222012460231781,
"learning_rate": 4.265471175558156e-06,
"loss": 0.0198,
"step": 365
},
{
"epoch": 2.361873990306947,
"grad_norm": 0.3215023875236511,
"learning_rate": 4.237437035732572e-06,
"loss": 0.0252,
"step": 366
},
{
"epoch": 2.368336025848142,
"grad_norm": 0.38228940963745117,
"learning_rate": 4.2094274178433455e-06,
"loss": 0.026,
"step": 367
},
{
"epoch": 2.374798061389338,
"grad_norm": 0.44622451066970825,
"learning_rate": 4.18144322260308e-06,
"loss": 0.0323,
"step": 368
},
{
"epoch": 2.381260096930533,
"grad_norm": 0.42110568284988403,
"learning_rate": 4.153485349906839e-06,
"loss": 0.0175,
"step": 369
},
{
"epoch": 2.387722132471729,
"grad_norm": 0.4341752827167511,
"learning_rate": 4.125554698803241e-06,
"loss": 0.0224,
"step": 370
},
{
"epoch": 2.394184168012924,
"grad_norm": 0.6297674775123596,
"learning_rate": 4.0976521674655255e-06,
"loss": 0.0223,
"step": 371
},
{
"epoch": 2.4006462035541194,
"grad_norm": 0.34877192974090576,
"learning_rate": 4.069778653162679e-06,
"loss": 0.0223,
"step": 372
},
{
"epoch": 2.407108239095315,
"grad_norm": 0.3711933493614197,
"learning_rate": 4.041935052230579e-06,
"loss": 0.018,
"step": 373
},
{
"epoch": 2.4135702746365104,
"grad_norm": 0.2846461236476898,
"learning_rate": 4.014122260043169e-06,
"loss": 0.0196,
"step": 374
},
{
"epoch": 2.420032310177706,
"grad_norm": 0.3798210024833679,
"learning_rate": 3.986341170983672e-06,
"loss": 0.0252,
"step": 375
},
{
"epoch": 2.4264943457189014,
"grad_norm": 0.45135289430618286,
"learning_rate": 3.958592678415825e-06,
"loss": 0.0272,
"step": 376
},
{
"epoch": 2.432956381260097,
"grad_norm": 0.3350200355052948,
"learning_rate": 3.9308776746551444e-06,
"loss": 0.0208,
"step": 377
},
{
"epoch": 2.4394184168012925,
"grad_norm": 0.3789970576763153,
"learning_rate": 3.903197050940244e-06,
"loss": 0.0206,
"step": 378
},
{
"epoch": 2.4458804523424877,
"grad_norm": 0.40493834018707275,
"learning_rate": 3.875551697404172e-06,
"loss": 0.0225,
"step": 379
},
{
"epoch": 2.4523424878836835,
"grad_norm": 0.45063942670822144,
"learning_rate": 3.847942503045776e-06,
"loss": 0.0246,
"step": 380
},
{
"epoch": 2.4588045234248788,
"grad_norm": 0.36469295620918274,
"learning_rate": 3.820370355701133e-06,
"loss": 0.0288,
"step": 381
},
{
"epoch": 2.4652665589660745,
"grad_norm": 0.3883582055568695,
"learning_rate": 3.792836142014981e-06,
"loss": 0.0305,
"step": 382
},
{
"epoch": 2.47172859450727,
"grad_norm": 0.41473427414894104,
"learning_rate": 3.7653407474122195e-06,
"loss": 0.0216,
"step": 383
},
{
"epoch": 2.4781906300484655,
"grad_norm": 0.44368648529052734,
"learning_rate": 3.7378850560694337e-06,
"loss": 0.0297,
"step": 384
},
{
"epoch": 2.484652665589661,
"grad_norm": 0.5202524662017822,
"learning_rate": 3.7104699508864606e-06,
"loss": 0.0258,
"step": 385
},
{
"epoch": 2.491114701130856,
"grad_norm": 0.41755107045173645,
"learning_rate": 3.683096313457991e-06,
"loss": 0.0319,
"step": 386
},
{
"epoch": 2.497576736672052,
"grad_norm": 0.4544621706008911,
"learning_rate": 3.6557650240452358e-06,
"loss": 0.027,
"step": 387
},
{
"epoch": 2.504038772213247,
"grad_norm": 0.3956614136695862,
"learning_rate": 3.6284769615476045e-06,
"loss": 0.018,
"step": 388
},
{
"epoch": 2.5105008077544424,
"grad_norm": 0.3755435049533844,
"learning_rate": 3.601233003474448e-06,
"loss": 0.0223,
"step": 389
},
{
"epoch": 2.516962843295638,
"grad_norm": 0.36021357774734497,
"learning_rate": 3.5740340259168383e-06,
"loss": 0.0203,
"step": 390
},
{
"epoch": 2.523424878836834,
"grad_norm": 0.35905206203460693,
"learning_rate": 3.5468809035194008e-06,
"loss": 0.0213,
"step": 391
},
{
"epoch": 2.529886914378029,
"grad_norm": 0.35023632645606995,
"learning_rate": 3.519774509452181e-06,
"loss": 0.0194,
"step": 392
},
{
"epoch": 2.5363489499192244,
"grad_norm": 0.3874582052230835,
"learning_rate": 3.4927157153825717e-06,
"loss": 0.0255,
"step": 393
},
{
"epoch": 2.54281098546042,
"grad_norm": 0.3666871190071106,
"learning_rate": 3.4657053914472816e-06,
"loss": 0.0205,
"step": 394
},
{
"epoch": 2.5492730210016155,
"grad_norm": 0.47673851251602173,
"learning_rate": 3.4387444062243453e-06,
"loss": 0.0271,
"step": 395
},
{
"epoch": 2.5557350565428107,
"grad_norm": 0.3491911292076111,
"learning_rate": 3.4118336267052086e-06,
"loss": 0.0191,
"step": 396
},
{
"epoch": 2.5621970920840065,
"grad_norm": 0.3430013656616211,
"learning_rate": 3.384973918266834e-06,
"loss": 0.019,
"step": 397
},
{
"epoch": 2.568659127625202,
"grad_norm": 0.3667357265949249,
"learning_rate": 3.3581661446438796e-06,
"loss": 0.0241,
"step": 398
},
{
"epoch": 2.5751211631663975,
"grad_norm": 0.395435094833374,
"learning_rate": 3.3314111679009203e-06,
"loss": 0.0295,
"step": 399
},
{
"epoch": 2.581583198707593,
"grad_norm": 0.4434990882873535,
"learning_rate": 3.3047098484047314e-06,
"loss": 0.023,
"step": 400
},
{
"epoch": 2.5880452342487885,
"grad_norm": 0.40480467677116394,
"learning_rate": 3.2780630447966135e-06,
"loss": 0.0245,
"step": 401
},
{
"epoch": 2.594507269789984,
"grad_norm": 0.39888110756874084,
"learning_rate": 3.251471613964793e-06,
"loss": 0.0356,
"step": 402
},
{
"epoch": 2.600969305331179,
"grad_norm": 0.3975083529949188,
"learning_rate": 3.224936411016849e-06,
"loss": 0.0212,
"step": 403
},
{
"epoch": 2.607431340872375,
"grad_norm": 0.31860825419425964,
"learning_rate": 3.198458289252234e-06,
"loss": 0.0148,
"step": 404
},
{
"epoch": 2.61389337641357,
"grad_norm": 0.5908383727073669,
"learning_rate": 3.172038100134823e-06,
"loss": 0.0215,
"step": 405
},
{
"epoch": 2.620355411954766,
"grad_norm": 0.32025986909866333,
"learning_rate": 3.145676693265537e-06,
"loss": 0.0189,
"step": 406
},
{
"epoch": 2.626817447495961,
"grad_norm": 0.3670276403427124,
"learning_rate": 3.1193749163550226e-06,
"loss": 0.0229,
"step": 407
},
{
"epoch": 2.633279483037157,
"grad_norm": 0.3176894187927246,
"learning_rate": 3.093133615196381e-06,
"loss": 0.0212,
"step": 408
},
{
"epoch": 2.639741518578352,
"grad_norm": 0.41281652450561523,
"learning_rate": 3.0669536336379906e-06,
"loss": 0.0191,
"step": 409
},
{
"epoch": 2.6462035541195474,
"grad_norm": 0.38180580735206604,
"learning_rate": 3.040835813556352e-06,
"loss": 0.02,
"step": 410
},
{
"epoch": 2.652665589660743,
"grad_norm": 0.2951391041278839,
"learning_rate": 3.014780994829029e-06,
"loss": 0.0162,
"step": 411
},
{
"epoch": 2.6591276252019385,
"grad_norm": 0.45817264914512634,
"learning_rate": 2.988790015307627e-06,
"loss": 0.0215,
"step": 412
},
{
"epoch": 2.665589660743134,
"grad_norm": 0.4247450828552246,
"learning_rate": 2.9628637107908614e-06,
"loss": 0.0271,
"step": 413
},
{
"epoch": 2.6720516962843295,
"grad_norm": 0.3811360001564026,
"learning_rate": 2.937002914997679e-06,
"loss": 0.0276,
"step": 414
},
{
"epoch": 2.678513731825525,
"grad_norm": 0.3762235641479492,
"learning_rate": 2.911208459540442e-06,
"loss": 0.021,
"step": 415
},
{
"epoch": 2.6849757673667205,
"grad_norm": 0.4333343207836151,
"learning_rate": 2.8854811738981848e-06,
"loss": 0.0247,
"step": 416
},
{
"epoch": 2.691437802907916,
"grad_norm": 0.36235716938972473,
"learning_rate": 2.859821885389957e-06,
"loss": 0.024,
"step": 417
},
{
"epoch": 2.6978998384491115,
"grad_norm": 0.3801264762878418,
"learning_rate": 2.8342314191481952e-06,
"loss": 0.0243,
"step": 418
},
{
"epoch": 2.704361873990307,
"grad_norm": 0.37116238474845886,
"learning_rate": 2.808710598092206e-06,
"loss": 0.0202,
"step": 419
},
{
"epoch": 2.7108239095315025,
"grad_norm": 0.36050671339035034,
"learning_rate": 2.783260242901694e-06,
"loss": 0.023,
"step": 420
},
{
"epoch": 2.717285945072698,
"grad_norm": 0.4219389855861664,
"learning_rate": 2.7578811719903788e-06,
"loss": 0.0201,
"step": 421
},
{
"epoch": 2.7237479806138936,
"grad_norm": 0.40774789452552795,
"learning_rate": 2.7325742014796695e-06,
"loss": 0.0272,
"step": 422
},
{
"epoch": 2.730210016155089,
"grad_norm": 0.3500642478466034,
"learning_rate": 2.707340145172423e-06,
"loss": 0.0209,
"step": 423
},
{
"epoch": 2.736672051696284,
"grad_norm": 0.40537238121032715,
"learning_rate": 2.682179814526783e-06,
"loss": 0.0192,
"step": 424
},
{
"epoch": 2.74313408723748,
"grad_norm": 0.346743643283844,
"learning_rate": 2.6570940186300655e-06,
"loss": 0.0208,
"step": 425
},
{
"epoch": 2.749596122778675,
"grad_norm": 0.504565954208374,
"learning_rate": 2.6320835641727615e-06,
"loss": 0.026,
"step": 426
},
{
"epoch": 2.756058158319871,
"grad_norm": 0.4314068853855133,
"learning_rate": 2.607149255422584e-06,
"loss": 0.0193,
"step": 427
},
{
"epoch": 2.762520193861066,
"grad_norm": 0.32916414737701416,
"learning_rate": 2.582291894198617e-06,
"loss": 0.0196,
"step": 428
},
{
"epoch": 2.768982229402262,
"grad_norm": 1.6414940357208252,
"learning_rate": 2.557512279845509e-06,
"loss": 0.0211,
"step": 429
},
{
"epoch": 2.775444264943457,
"grad_norm": 0.4827597141265869,
"learning_rate": 2.5328112092077882e-06,
"loss": 0.0234,
"step": 430
},
{
"epoch": 2.7819063004846525,
"grad_norm": 0.45988279581069946,
"learning_rate": 2.5081894766042393e-06,
"loss": 0.0282,
"step": 431
},
{
"epoch": 2.788368336025848,
"grad_norm": 0.5003206133842468,
"learning_rate": 2.4836478738023424e-06,
"loss": 0.0236,
"step": 432
},
{
"epoch": 2.7948303715670435,
"grad_norm": 0.44246071577072144,
"learning_rate": 2.4591871899928286e-06,
"loss": 0.0374,
"step": 433
},
{
"epoch": 2.8012924071082392,
"grad_norm": 0.4109882712364197,
"learning_rate": 2.434808211764294e-06,
"loss": 0.0218,
"step": 434
},
{
"epoch": 2.8077544426494345,
"grad_norm": 0.46064233779907227,
"learning_rate": 2.410511723077907e-06,
"loss": 0.0178,
"step": 435
},
{
"epoch": 2.8142164781906303,
"grad_norm": 0.33899685740470886,
"learning_rate": 2.386298505242198e-06,
"loss": 0.0296,
"step": 436
},
{
"epoch": 2.8206785137318255,
"grad_norm": 0.37225639820098877,
"learning_rate": 2.3621693368879363e-06,
"loss": 0.0268,
"step": 437
},
{
"epoch": 2.827140549273021,
"grad_norm": 0.49789291620254517,
"learning_rate": 2.3381249939430882e-06,
"loss": 0.0288,
"step": 438
},
{
"epoch": 2.8336025848142166,
"grad_norm": 0.4957059919834137,
"learning_rate": 2.3141662496078695e-06,
"loss": 0.0194,
"step": 439
},
{
"epoch": 2.840064620355412,
"grad_norm": 0.36811667680740356,
"learning_rate": 2.2902938743298765e-06,
"loss": 0.0219,
"step": 440
},
{
"epoch": 2.8465266558966076,
"grad_norm": 0.3737165927886963,
"learning_rate": 2.2665086357793155e-06,
"loss": 0.0217,
"step": 441
},
{
"epoch": 2.852988691437803,
"grad_norm": 0.46944284439086914,
"learning_rate": 2.242811298824312e-06,
"loss": 0.0218,
"step": 442
},
{
"epoch": 2.8594507269789986,
"grad_norm": 2.569209098815918,
"learning_rate": 2.21920262550632e-06,
"loss": 0.0226,
"step": 443
},
{
"epoch": 2.865912762520194,
"grad_norm": 0.46464183926582336,
"learning_rate": 2.1956833750156086e-06,
"loss": 0.0273,
"step": 444
},
{
"epoch": 2.872374798061389,
"grad_norm": 0.36627525091171265,
"learning_rate": 2.1722543036668613e-06,
"loss": 0.0237,
"step": 445
},
{
"epoch": 2.878836833602585,
"grad_norm": 0.42264869809150696,
"learning_rate": 2.1489161648748436e-06,
"loss": 0.0207,
"step": 446
},
{
"epoch": 2.88529886914378,
"grad_norm": 0.32069268822669983,
"learning_rate": 2.125669709130174e-06,
"loss": 0.0187,
"step": 447
},
{
"epoch": 2.891760904684976,
"grad_norm": 0.3201678693294525,
"learning_rate": 2.102515683975201e-06,
"loss": 0.0184,
"step": 448
},
{
"epoch": 2.898222940226171,
"grad_norm": 0.3414503037929535,
"learning_rate": 2.0794548339799605e-06,
"loss": 0.0191,
"step": 449
},
{
"epoch": 2.904684975767367,
"grad_norm": 0.33603090047836304,
"learning_rate": 2.056487900718227e-06,
"loss": 0.0186,
"step": 450
},
{
"epoch": 2.9111470113085622,
"grad_norm": 0.40074023604393005,
"learning_rate": 2.0336156227436653e-06,
"loss": 0.0184,
"step": 451
},
{
"epoch": 2.9176090468497575,
"grad_norm": 0.37722983956336975,
"learning_rate": 2.010838735566096e-06,
"loss": 0.0257,
"step": 452
},
{
"epoch": 2.9240710823909533,
"grad_norm": 0.3689229488372803,
"learning_rate": 1.9881579716278267e-06,
"loss": 0.0182,
"step": 453
},
{
"epoch": 2.9305331179321485,
"grad_norm": 0.4667937755584717,
"learning_rate": 1.9655740602801055e-06,
"loss": 0.0373,
"step": 454
},
{
"epoch": 2.936995153473344,
"grad_norm": 0.3728683888912201,
"learning_rate": 1.943087727759663e-06,
"loss": 0.0221,
"step": 455
},
{
"epoch": 2.9434571890145396,
"grad_norm": 0.38799959421157837,
"learning_rate": 1.92069969716537e-06,
"loss": 0.0238,
"step": 456
},
{
"epoch": 2.9499192245557353,
"grad_norm": 0.5497745275497437,
"learning_rate": 1.8984106884349702e-06,
"loss": 0.0291,
"step": 457
},
{
"epoch": 2.9563812600969306,
"grad_norm": 0.3938653767108917,
"learning_rate": 1.8762214183219379e-06,
"loss": 0.0215,
"step": 458
},
{
"epoch": 2.962843295638126,
"grad_norm": 0.35690999031066895,
"learning_rate": 1.8541326003724258e-06,
"loss": 0.0191,
"step": 459
},
{
"epoch": 2.9693053311793216,
"grad_norm": 0.3347409665584564,
"learning_rate": 1.8321449449023215e-06,
"loss": 0.0205,
"step": 460
},
{
"epoch": 2.975767366720517,
"grad_norm": 0.3588216006755829,
"learning_rate": 1.8102591589744016e-06,
"loss": 0.0246,
"step": 461
},
{
"epoch": 2.982229402261712,
"grad_norm": 0.4010322391986847,
"learning_rate": 1.7884759463755984e-06,
"loss": 0.0196,
"step": 462
},
{
"epoch": 2.988691437802908,
"grad_norm": 0.3672601878643036,
"learning_rate": 1.7667960075943723e-06,
"loss": 0.023,
"step": 463
},
{
"epoch": 2.9951534733441036,
"grad_norm": 0.5553978085517883,
"learning_rate": 1.7452200397981706e-06,
"loss": 0.0167,
"step": 464
},
{
"epoch": 3.0,
"grad_norm": 0.5560592412948608,
"learning_rate": 1.723748736811025e-06,
"loss": 0.0229,
"step": 465
},
{
"epoch": 3.0064620355411953,
"grad_norm": 0.32263022661209106,
"learning_rate": 1.7023827890912302e-06,
"loss": 0.016,
"step": 466
},
{
"epoch": 3.012924071082391,
"grad_norm": 0.3601260185241699,
"learning_rate": 1.681122883709152e-06,
"loss": 0.013,
"step": 467
},
{
"epoch": 3.0193861066235863,
"grad_norm": 2.2087621688842773,
"learning_rate": 1.6599697043251128e-06,
"loss": 0.0129,
"step": 468
},
{
"epoch": 3.025848142164782,
"grad_norm": 0.4122757315635681,
"learning_rate": 1.638923931167427e-06,
"loss": 0.0119,
"step": 469
},
{
"epoch": 3.0323101777059773,
"grad_norm": 0.24214933812618256,
"learning_rate": 1.6179862410105197e-06,
"loss": 0.0079,
"step": 470
},
{
"epoch": 3.038772213247173,
"grad_norm": 0.2189481109380722,
"learning_rate": 1.5971573071531588e-06,
"loss": 0.0124,
"step": 471
},
{
"epoch": 3.0452342487883683,
"grad_norm": 1.217540979385376,
"learning_rate": 1.5764377993968094e-06,
"loss": 0.011,
"step": 472
},
{
"epoch": 3.0516962843295636,
"grad_norm": 0.3144000768661499,
"learning_rate": 1.5558283840240924e-06,
"loss": 0.0115,
"step": 473
},
{
"epoch": 3.0581583198707594,
"grad_norm": 0.23900973796844482,
"learning_rate": 1.5353297237773595e-06,
"loss": 0.0088,
"step": 474
},
{
"epoch": 3.0646203554119547,
"grad_norm": 0.2415008693933487,
"learning_rate": 1.5149424778373811e-06,
"loss": 0.0097,
"step": 475
},
{
"epoch": 3.0710823909531504,
"grad_norm": 0.24851197004318237,
"learning_rate": 1.4946673018021484e-06,
"loss": 0.0114,
"step": 476
},
{
"epoch": 3.0775444264943457,
"grad_norm": 0.32280173897743225,
"learning_rate": 1.474504847665791e-06,
"loss": 0.0113,
"step": 477
},
{
"epoch": 3.0840064620355414,
"grad_norm": 0.2650688588619232,
"learning_rate": 1.4544557637976108e-06,
"loss": 0.0092,
"step": 478
},
{
"epoch": 3.0904684975767367,
"grad_norm": 0.39706680178642273,
"learning_rate": 1.4345206949212338e-06,
"loss": 0.0107,
"step": 479
},
{
"epoch": 3.096930533117932,
"grad_norm": 0.27723875641822815,
"learning_rate": 1.4147002820938743e-06,
"loss": 0.0124,
"step": 480
},
{
"epoch": 3.1033925686591277,
"grad_norm": 0.35606086254119873,
"learning_rate": 1.3949951626857244e-06,
"loss": 0.0092,
"step": 481
},
{
"epoch": 3.109854604200323,
"grad_norm": 0.3037882447242737,
"learning_rate": 1.375405970359453e-06,
"loss": 0.0111,
"step": 482
},
{
"epoch": 3.1163166397415187,
"grad_norm": 0.3035484254360199,
"learning_rate": 1.3559333350498332e-06,
"loss": 0.0118,
"step": 483
},
{
"epoch": 3.122778675282714,
"grad_norm": 0.32289376854896545,
"learning_rate": 1.3365778829434834e-06,
"loss": 0.0123,
"step": 484
},
{
"epoch": 3.1292407108239093,
"grad_norm": 0.3762624263763428,
"learning_rate": 1.3173402364587307e-06,
"loss": 0.0087,
"step": 485
},
{
"epoch": 3.135702746365105,
"grad_norm": 0.21013927459716797,
"learning_rate": 1.298221014225597e-06,
"loss": 0.0065,
"step": 486
},
{
"epoch": 3.1421647819063003,
"grad_norm": 0.352093368768692,
"learning_rate": 1.2792208310659015e-06,
"loss": 0.0135,
"step": 487
},
{
"epoch": 3.148626817447496,
"grad_norm": 0.25764092803001404,
"learning_rate": 1.2603402979734992e-06,
"loss": 0.0092,
"step": 488
},
{
"epoch": 3.1550888529886914,
"grad_norm": 0.26913541555404663,
"learning_rate": 1.2415800220946223e-06,
"loss": 0.0057,
"step": 489
},
{
"epoch": 3.161550888529887,
"grad_norm": 0.3840892016887665,
"learning_rate": 1.2229406067083566e-06,
"loss": 0.0126,
"step": 490
},
{
"epoch": 3.1680129240710824,
"grad_norm": 0.6032068133354187,
"learning_rate": 1.2044226512072537e-06,
"loss": 0.0129,
"step": 491
},
{
"epoch": 3.1744749596122777,
"grad_norm": 0.2559524178504944,
"learning_rate": 1.1860267510780432e-06,
"loss": 0.0078,
"step": 492
},
{
"epoch": 3.1809369951534734,
"grad_norm": 1.6579912900924683,
"learning_rate": 1.1677534978824906e-06,
"loss": 0.0126,
"step": 493
},
{
"epoch": 3.1873990306946687,
"grad_norm": 0.2948991060256958,
"learning_rate": 1.1496034792383654e-06,
"loss": 0.0087,
"step": 494
},
{
"epoch": 3.1938610662358644,
"grad_norm": 0.32418084144592285,
"learning_rate": 1.1315772788005603e-06,
"loss": 0.0093,
"step": 495
},
{
"epoch": 3.2003231017770597,
"grad_norm": 0.42377397418022156,
"learning_rate": 1.1136754762423097e-06,
"loss": 0.0102,
"step": 496
},
{
"epoch": 3.2067851373182554,
"grad_norm": 0.34062591195106506,
"learning_rate": 1.0958986472365518e-06,
"loss": 0.0176,
"step": 497
},
{
"epoch": 3.2132471728594507,
"grad_norm": 0.43844881653785706,
"learning_rate": 1.0782473634374191e-06,
"loss": 0.0132,
"step": 498
},
{
"epoch": 3.219709208400646,
"grad_norm": 0.4194018244743347,
"learning_rate": 1.0607221924618533e-06,
"loss": 0.0103,
"step": 499
},
{
"epoch": 3.2261712439418417,
"grad_norm": 0.27430325746536255,
"learning_rate": 1.0433236978713546e-06,
"loss": 0.0085,
"step": 500
},
{
"epoch": 3.232633279483037,
"grad_norm": 0.1858188509941101,
"learning_rate": 1.0260524391538546e-06,
"loss": 0.0092,
"step": 501
},
{
"epoch": 3.2390953150242328,
"grad_norm": 0.43904629349708557,
"learning_rate": 1.0089089717057337e-06,
"loss": 0.0107,
"step": 502
},
{
"epoch": 3.245557350565428,
"grad_norm": 0.3002469539642334,
"learning_rate": 9.91893846813947e-07,
"loss": 0.0086,
"step": 503
},
{
"epoch": 3.2520193861066238,
"grad_norm": 0.2562623918056488,
"learning_rate": 9.7500761163831e-07,
"loss": 0.007,
"step": 504
},
{
"epoch": 3.258481421647819,
"grad_norm": 0.29067540168762207,
"learning_rate": 9.582508091938953e-07,
"loss": 0.0081,
"step": 505
},
{
"epoch": 3.2649434571890144,
"grad_norm": 0.2679020166397095,
"learning_rate": 9.416239783335785e-07,
"loss": 0.0098,
"step": 506
},
{
"epoch": 3.27140549273021,
"grad_norm": 0.32440176606178284,
"learning_rate": 9.251276537306969e-07,
"loss": 0.0074,
"step": 507
},
{
"epoch": 3.2778675282714054,
"grad_norm": 0.2440427541732788,
"learning_rate": 9.087623658618682e-07,
"loss": 0.009,
"step": 508
},
{
"epoch": 3.284329563812601,
"grad_norm": 0.3681286871433258,
"learning_rate": 8.925286409899308e-07,
"loss": 0.0071,
"step": 509
},
{
"epoch": 3.2907915993537964,
"grad_norm": 0.2806476950645447,
"learning_rate": 8.764270011470144e-07,
"loss": 0.0099,
"step": 510
},
{
"epoch": 3.297253634894992,
"grad_norm": 0.4697900414466858,
"learning_rate": 8.604579641177524e-07,
"loss": 0.0152,
"step": 511
},
{
"epoch": 3.3037156704361874,
"grad_norm": 0.3158174753189087,
"learning_rate": 8.446220434226382e-07,
"loss": 0.0085,
"step": 512
},
{
"epoch": 3.3101777059773827,
"grad_norm": 0.42739158868789673,
"learning_rate": 8.289197483015127e-07,
"loss": 0.0132,
"step": 513
},
{
"epoch": 3.3166397415185784,
"grad_norm": 0.2981080114841461,
"learning_rate": 8.133515836971773e-07,
"loss": 0.0064,
"step": 514
},
{
"epoch": 3.3231017770597737,
"grad_norm": 0.41693729162216187,
"learning_rate": 7.97918050239167e-07,
"loss": 0.0114,
"step": 515
},
{
"epoch": 3.3295638126009695,
"grad_norm": 0.44874924421310425,
"learning_rate": 7.826196442276473e-07,
"loss": 0.0099,
"step": 516
},
{
"epoch": 3.3360258481421647,
"grad_norm": 0.2718696892261505,
"learning_rate": 7.674568576174546e-07,
"loss": 0.0118,
"step": 517
},
{
"epoch": 3.3424878836833605,
"grad_norm": 0.39589688181877136,
"learning_rate": 7.524301780022774e-07,
"loss": 0.0143,
"step": 518
},
{
"epoch": 3.3489499192245558,
"grad_norm": 0.31575649976730347,
"learning_rate": 7.375400885989758e-07,
"loss": 0.0131,
"step": 519
},
{
"epoch": 3.355411954765751,
"grad_norm": 0.4461822807788849,
"learning_rate": 7.227870682320432e-07,
"loss": 0.0086,
"step": 520
},
{
"epoch": 3.361873990306947,
"grad_norm": 0.34174418449401855,
"learning_rate": 7.081715913182069e-07,
"loss": 0.0069,
"step": 521
},
{
"epoch": 3.368336025848142,
"grad_norm": 0.2855561375617981,
"learning_rate": 6.936941278511744e-07,
"loss": 0.008,
"step": 522
},
{
"epoch": 3.374798061389338,
"grad_norm": 0.3220924139022827,
"learning_rate": 6.793551433865198e-07,
"loss": 0.009,
"step": 523
},
{
"epoch": 3.381260096930533,
"grad_norm": 0.28071683645248413,
"learning_rate": 6.651550990267091e-07,
"loss": 0.0069,
"step": 524
},
{
"epoch": 3.387722132471729,
"grad_norm": 0.26843127608299255,
"learning_rate": 6.510944514062784e-07,
"loss": 0.0074,
"step": 525
},
{
"epoch": 3.394184168012924,
"grad_norm": 0.3113328218460083,
"learning_rate": 6.371736526771421e-07,
"loss": 0.0091,
"step": 526
},
{
"epoch": 3.4006462035541194,
"grad_norm": 0.22835861146450043,
"learning_rate": 6.233931504940633e-07,
"loss": 0.0072,
"step": 527
},
{
"epoch": 3.407108239095315,
"grad_norm": 8068.888671875,
"learning_rate": 6.097533880002476e-07,
"loss": 0.0093,
"step": 528
},
{
"epoch": 3.4135702746365104,
"grad_norm": 0.39374470710754395,
"learning_rate": 5.962548038130972e-07,
"loss": 0.012,
"step": 529
},
{
"epoch": 3.420032310177706,
"grad_norm": 0.4547136723995209,
"learning_rate": 5.828978320101109e-07,
"loss": 0.0098,
"step": 530
},
{
"epoch": 3.4264943457189014,
"grad_norm": 0.4039766490459442,
"learning_rate": 5.696829021149181e-07,
"loss": 0.0077,
"step": 531
},
{
"epoch": 3.432956381260097,
"grad_norm": 0.32658788561820984,
"learning_rate": 5.566104390834709e-07,
"loss": 0.008,
"step": 532
},
{
"epoch": 3.4394184168012925,
"grad_norm": 0.3402605950832367,
"learning_rate": 5.436808632903729e-07,
"loss": 0.0082,
"step": 533
},
{
"epoch": 3.4458804523424877,
"grad_norm": 0.31616881489753723,
"learning_rate": 5.308945905153729e-07,
"loss": 0.0129,
"step": 534
},
{
"epoch": 3.4523424878836835,
"grad_norm": 0.43558764457702637,
"learning_rate": 5.182520319299816e-07,
"loss": 0.0117,
"step": 535
},
{
"epoch": 3.4588045234248788,
"grad_norm": 0.337622731924057,
"learning_rate": 5.057535940842567e-07,
"loss": 0.007,
"step": 536
},
{
"epoch": 3.4652665589660745,
"grad_norm": 0.3466811180114746,
"learning_rate": 4.933996788937279e-07,
"loss": 0.011,
"step": 537
},
{
"epoch": 3.47172859450727,
"grad_norm": 0.35212457180023193,
"learning_rate": 4.811906836264718e-07,
"loss": 0.0107,
"step": 538
},
{
"epoch": 3.4781906300484655,
"grad_norm": 0.3255941867828369,
"learning_rate": 4.691270008903365e-07,
"loss": 0.0113,
"step": 539
},
{
"epoch": 3.484652665589661,
"grad_norm": 0.3242311477661133,
"learning_rate": 4.572090186203171e-07,
"loss": 0.0116,
"step": 540
},
{
"epoch": 3.491114701130856,
"grad_norm": 0.35152673721313477,
"learning_rate": 4.4543712006608507e-07,
"loss": 0.0101,
"step": 541
},
{
"epoch": 3.497576736672052,
"grad_norm": 0.31033825874328613,
"learning_rate": 4.338116837796519e-07,
"loss": 0.0101,
"step": 542
},
{
"epoch": 3.504038772213247,
"grad_norm": 0.3278428316116333,
"learning_rate": 4.2233308360321024e-07,
"loss": 0.0077,
"step": 543
},
{
"epoch": 3.5105008077544424,
"grad_norm": 1.6630736589431763,
"learning_rate": 4.110016886571011e-07,
"loss": 0.0087,
"step": 544
},
{
"epoch": 3.516962843295638,
"grad_norm": 0.49623093008995056,
"learning_rate": 3.998178633279537e-07,
"loss": 0.0075,
"step": 545
},
{
"epoch": 3.523424878836834,
"grad_norm": 0.24495165050029755,
"learning_rate": 3.887819672569565e-07,
"loss": 0.0076,
"step": 546
},
{
"epoch": 3.529886914378029,
"grad_norm": 1.0713497400283813,
"learning_rate": 3.778943553283015e-07,
"loss": 0.0087,
"step": 547
},
{
"epoch": 3.5363489499192244,
"grad_norm": 0.35764917731285095,
"learning_rate": 3.671553776577702e-07,
"loss": 0.0145,
"step": 548
},
{
"epoch": 3.54281098546042,
"grad_norm": 0.4270109534263611,
"learning_rate": 3.5656537958147164e-07,
"loss": 0.0196,
"step": 549
},
{
"epoch": 3.5492730210016155,
"grad_norm": 0.36601418256759644,
"learning_rate": 3.461247016447372e-07,
"loss": 0.0128,
"step": 550
},
{
"epoch": 3.5557350565428107,
"grad_norm": 0.2740163505077362,
"learning_rate": 3.3583367959117374e-07,
"loss": 0.0073,
"step": 551
},
{
"epoch": 3.5621970920840065,
"grad_norm": 0.30819642543792725,
"learning_rate": 3.2569264435186597e-07,
"loss": 0.011,
"step": 552
},
{
"epoch": 3.568659127625202,
"grad_norm": 0.33172082901000977,
"learning_rate": 3.1570192203473183e-07,
"loss": 0.0092,
"step": 553
},
{
"epoch": 3.5751211631663975,
"grad_norm": 0.23010392487049103,
"learning_rate": 3.058618339140368e-07,
"loss": 0.0035,
"step": 554
},
{
"epoch": 3.581583198707593,
"grad_norm": 0.22640886902809143,
"learning_rate": 2.961726964200645e-07,
"loss": 0.0056,
"step": 555
},
{
"epoch": 3.5880452342487885,
"grad_norm": 0.21879248321056366,
"learning_rate": 2.8663482112893936e-07,
"loss": 0.0066,
"step": 556
},
{
"epoch": 3.594507269789984,
"grad_norm": 0.2790137827396393,
"learning_rate": 2.772485147526077e-07,
"loss": 0.0082,
"step": 557
},
{
"epoch": 3.600969305331179,
"grad_norm": 2.688298225402832,
"learning_rate": 2.680140791289737e-07,
"loss": 0.015,
"step": 558
},
{
"epoch": 3.607431340872375,
"grad_norm": 0.29820549488067627,
"learning_rate": 2.5893181121219637e-07,
"loss": 0.0097,
"step": 559
},
{
"epoch": 3.61389337641357,
"grad_norm": 0.33826717734336853,
"learning_rate": 2.500020030631356e-07,
"loss": 0.0087,
"step": 560
},
{
"epoch": 3.620355411954766,
"grad_norm": 0.43633323907852173,
"learning_rate": 2.4122494183996426e-07,
"loss": 0.0127,
"step": 561
},
{
"epoch": 3.626817447495961,
"grad_norm": 0.2826526165008545,
"learning_rate": 2.3260090978893146e-07,
"loss": 0.0089,
"step": 562
},
{
"epoch": 3.633279483037157,
"grad_norm": 0.27118533849716187,
"learning_rate": 2.2413018423528832e-07,
"loss": 0.0059,
"step": 563
},
{
"epoch": 3.639741518578352,
"grad_norm": 2.0000076293945312,
"learning_rate": 2.1581303757436778e-07,
"loss": 0.0088,
"step": 564
},
{
"epoch": 3.6462035541195474,
"grad_norm": 0.2869947850704193,
"learning_rate": 2.076497372628261e-07,
"loss": 0.0084,
"step": 565
},
{
"epoch": 3.652665589660743,
"grad_norm": 0.35098886489868164,
"learning_rate": 1.9964054581004476e-07,
"loss": 0.0133,
"step": 566
},
{
"epoch": 3.6591276252019385,
"grad_norm": 0.32275232672691345,
"learning_rate": 1.9178572076968437e-07,
"loss": 0.0109,
"step": 567
},
{
"epoch": 3.665589660743134,
"grad_norm": 0.44486019015312195,
"learning_rate": 1.84085514731403e-07,
"loss": 0.0093,
"step": 568
},
{
"epoch": 3.6720516962843295,
"grad_norm": 0.30504077672958374,
"learning_rate": 1.7654017531273882e-07,
"loss": 0.0097,
"step": 569
},
{
"epoch": 3.678513731825525,
"grad_norm": 0.4125590920448303,
"learning_rate": 1.6914994515114082e-07,
"loss": 0.01,
"step": 570
},
{
"epoch": 3.6849757673667205,
"grad_norm": 0.44932204484939575,
"learning_rate": 1.619150618961701e-07,
"loss": 0.01,
"step": 571
},
{
"epoch": 3.691437802907916,
"grad_norm": 0.39718618988990784,
"learning_rate": 1.5483575820185615e-07,
"loss": 0.0096,
"step": 572
},
{
"epoch": 3.6978998384491115,
"grad_norm": 0.2970835864543915,
"learning_rate": 1.4791226171921748e-07,
"loss": 0.0076,
"step": 573
},
{
"epoch": 3.704361873990307,
"grad_norm": 0.33279949426651,
"learning_rate": 1.411447950889372e-07,
"loss": 0.0108,
"step": 574
},
{
"epoch": 3.7108239095315025,
"grad_norm": 0.33126136660575867,
"learning_rate": 1.3453357593420757e-07,
"loss": 0.0084,
"step": 575
},
{
"epoch": 3.717285945072698,
"grad_norm": 0.41318628191947937,
"learning_rate": 1.2807881685372947e-07,
"loss": 0.007,
"step": 576
},
{
"epoch": 3.7237479806138936,
"grad_norm": 0.3176984488964081,
"learning_rate": 1.2178072541487508e-07,
"loss": 0.0103,
"step": 577
},
{
"epoch": 3.730210016155089,
"grad_norm": 0.35345304012298584,
"learning_rate": 1.1563950414701653e-07,
"loss": 0.0082,
"step": 578
},
{
"epoch": 3.736672051696284,
"grad_norm": 0.37485456466674805,
"learning_rate": 1.0965535053500843e-07,
"loss": 0.0072,
"step": 579
},
{
"epoch": 3.74313408723748,
"grad_norm": 0.426248699426651,
"learning_rate": 1.0382845701284228e-07,
"loss": 0.0107,
"step": 580
},
{
"epoch": 3.749596122778675,
"grad_norm": 0.7327535152435303,
"learning_rate": 9.815901095745373e-08,
"loss": 0.0108,
"step": 581
},
{
"epoch": 3.756058158319871,
"grad_norm": 0.39146602153778076,
"learning_rate": 9.264719468270011e-08,
"loss": 0.0165,
"step": 582
},
{
"epoch": 3.762520193861066,
"grad_norm": 0.35452187061309814,
"learning_rate": 8.729318543349685e-08,
"loss": 0.0068,
"step": 583
},
{
"epoch": 3.768982229402262,
"grad_norm": 0.3429497480392456,
"learning_rate": 8.209715538011753e-08,
"loss": 0.0101,
"step": 584
},
{
"epoch": 3.775444264943457,
"grad_norm": 1.697080373764038,
"learning_rate": 7.70592716126567e-08,
"loss": 0.0105,
"step": 585
},
{
"epoch": 3.7819063004846525,
"grad_norm": 0.27320510149002075,
"learning_rate": 7.217969613565856e-08,
"loss": 0.0105,
"step": 586
},
{
"epoch": 3.788368336025848,
"grad_norm": 0.288004070520401,
"learning_rate": 6.745858586290566e-08,
"loss": 0.0083,
"step": 587
},
{
"epoch": 3.7948303715670435,
"grad_norm": 0.42086732387542725,
"learning_rate": 6.28960926123745e-08,
"loss": 0.0143,
"step": 588
},
{
"epoch": 3.8012924071082392,
"grad_norm": 0.30742815136909485,
"learning_rate": 5.84923631013512e-08,
"loss": 0.0148,
"step": 589
},
{
"epoch": 3.8077544426494345,
"grad_norm": 0.4420062005519867,
"learning_rate": 5.424753894171519e-08,
"loss": 0.0116,
"step": 590
},
{
"epoch": 3.8142164781906303,
"grad_norm": 0.2861470580101013,
"learning_rate": 5.016175663538625e-08,
"loss": 0.0043,
"step": 591
},
{
"epoch": 3.8206785137318255,
"grad_norm": 0.26148203015327454,
"learning_rate": 4.623514756993241e-08,
"loss": 0.0062,
"step": 592
},
{
"epoch": 3.827140549273021,
"grad_norm": 0.2247193306684494,
"learning_rate": 4.246783801434617e-08,
"loss": 0.0068,
"step": 593
},
{
"epoch": 3.8336025848142166,
"grad_norm": 0.2534724175930023,
"learning_rate": 3.885994911498603e-08,
"loss": 0.0065,
"step": 594
},
{
"epoch": 3.840064620355412,
"grad_norm": 0.31165772676467896,
"learning_rate": 3.541159689167628e-08,
"loss": 0.0097,
"step": 595
},
{
"epoch": 3.8465266558966076,
"grad_norm": 0.47996920347213745,
"learning_rate": 3.212289223398002e-08,
"loss": 0.0057,
"step": 596
},
{
"epoch": 3.852988691437803,
"grad_norm": 0.2576350271701813,
"learning_rate": 2.8993940897631412e-08,
"loss": 0.0073,
"step": 597
},
{
"epoch": 3.8594507269789986,
"grad_norm": 0.3661513030529022,
"learning_rate": 2.602484350113621e-08,
"loss": 0.0085,
"step": 598
},
{
"epoch": 3.865912762520194,
"grad_norm": 0.4161849915981293,
"learning_rate": 2.321569552253433e-08,
"loss": 0.0116,
"step": 599
},
{
"epoch": 3.872374798061389,
"grad_norm": 0.25652387738227844,
"learning_rate": 2.056658729633121e-08,
"loss": 0.0078,
"step": 600
},
{
"epoch": 3.878836833602585,
"grad_norm": 0.31865638494491577,
"learning_rate": 1.807760401059122e-08,
"loss": 0.0111,
"step": 601
},
{
"epoch": 3.88529886914378,
"grad_norm": 0.4232560396194458,
"learning_rate": 1.5748825704199887e-08,
"loss": 0.0146,
"step": 602
},
{
"epoch": 3.891760904684976,
"grad_norm": 0.3084033727645874,
"learning_rate": 1.3580327264289261e-08,
"loss": 0.0066,
"step": 603
},
{
"epoch": 3.898222940226171,
"grad_norm": 0.2703307271003723,
"learning_rate": 1.1572178423830405e-08,
"loss": 0.0091,
"step": 604
},
{
"epoch": 3.904684975767367,
"grad_norm": 0.34035852551460266,
"learning_rate": 9.724443759389635e-09,
"loss": 0.0059,
"step": 605
},
{
"epoch": 3.9111470113085622,
"grad_norm": 0.5055537819862366,
"learning_rate": 8.037182689052958e-09,
"loss": 0.0087,
"step": 606
},
{
"epoch": 3.9176090468497575,
"grad_norm": 0.4783738851547241,
"learning_rate": 6.510449470514824e-09,
"loss": 0.0115,
"step": 607
},
{
"epoch": 3.9240710823909533,
"grad_norm": 0.24573703110218048,
"learning_rate": 5.1442931993350705e-09,
"loss": 0.007,
"step": 608
},
{
"epoch": 3.9305331179321485,
"grad_norm": 0.2814759314060211,
"learning_rate": 3.9387578073563086e-09,
"loss": 0.0105,
"step": 609
},
{
"epoch": 3.936995153473344,
"grad_norm": 0.30954548716545105,
"learning_rate": 2.8938820612961494e-09,
"loss": 0.0057,
"step": 610
},
{
"epoch": 3.9434571890145396,
"grad_norm": 0.25791114568710327,
"learning_rate": 2.0096995614959924e-09,
"loss": 0.0094,
"step": 611
},
{
"epoch": 3.9499192245557353,
"grad_norm": 0.2954118549823761,
"learning_rate": 1.2862387408435483e-09,
"loss": 0.0053,
"step": 612
},
{
"epoch": 3.9563812600969306,
"grad_norm": 0.3213749825954437,
"learning_rate": 7.235228638574621e-10,
"loss": 0.0145,
"step": 613
},
{
"epoch": 3.962843295638126,
"grad_norm": 0.2650403380393982,
"learning_rate": 3.2157002593902196e-10,
"loss": 0.0072,
"step": 614
},
{
"epoch": 3.9693053311793216,
"grad_norm": 0.32353588938713074,
"learning_rate": 8.039315279040338e-11,
"loss": 0.0067,
"step": 615
},
{
"epoch": 3.975767366720517,
"grad_norm": 0.2813875377178192,
"learning_rate": 0.0,
"loss": 0.007,
"step": 616
},
{
"epoch": 3.975767366720517,
"step": 616,
"total_flos": 8.773760444520202e+17,
"train_loss": 0.04414510081163849,
"train_runtime": 2220.3071,
"train_samples_per_second": 8.921,
"train_steps_per_second": 0.277
}
],
"logging_steps": 1,
"max_steps": 616,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.773760444520202e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}