TLDR_code_release / trainer_state.json
lizhongzhi2022's picture
Upload folder using huggingface_hub
c70e138 verified
raw
history blame
46 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 8,
"global_step": 256,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00390625,
"grad_norm": 3.380525042530954,
"learning_rate": 1e-05,
"loss": 0.2859,
"step": 1
},
{
"epoch": 0.0078125,
"grad_norm": 2.6901012326349156,
"learning_rate": 1e-05,
"loss": 0.2117,
"step": 2
},
{
"epoch": 0.01171875,
"grad_norm": 3.191447237922227,
"learning_rate": 1e-05,
"loss": 0.2602,
"step": 3
},
{
"epoch": 0.015625,
"grad_norm": 2.204083519446381,
"learning_rate": 1e-05,
"loss": 0.1972,
"step": 4
},
{
"epoch": 0.01953125,
"grad_norm": 2.0481149317155687,
"learning_rate": 1e-05,
"loss": 0.2338,
"step": 5
},
{
"epoch": 0.0234375,
"grad_norm": 1.6269814174466988,
"learning_rate": 1e-05,
"loss": 0.214,
"step": 6
},
{
"epoch": 0.02734375,
"grad_norm": 1.6908703624878527,
"learning_rate": 1e-05,
"loss": 0.2088,
"step": 7
},
{
"epoch": 0.03125,
"grad_norm": 1.2059719622160197,
"learning_rate": 1e-05,
"loss": 0.1975,
"step": 8
},
{
"epoch": 0.03125,
"eval_dev_acc": 0.515625,
"eval_dev_token": 4849.7578125,
"eval_runtime": 168.4394,
"eval_samples_per_second": 0.095,
"eval_steps_per_second": 0.006,
"step": 8
},
{
"epoch": 0.03515625,
"grad_norm": 1.6837720712641369,
"learning_rate": 1e-05,
"loss": 0.1873,
"step": 9
},
{
"epoch": 0.0390625,
"grad_norm": 1.2504651087103098,
"learning_rate": 1e-05,
"loss": 0.1959,
"step": 10
},
{
"epoch": 0.04296875,
"grad_norm": 1.3187603751382884,
"learning_rate": 1e-05,
"loss": 0.2135,
"step": 11
},
{
"epoch": 0.046875,
"grad_norm": 1.3545446581007174,
"learning_rate": 1e-05,
"loss": 0.2428,
"step": 12
},
{
"epoch": 0.05078125,
"grad_norm": 1.6286051945906104,
"learning_rate": 1e-05,
"loss": 0.1708,
"step": 13
},
{
"epoch": 0.0546875,
"grad_norm": 1.6081830921647842,
"learning_rate": 1e-05,
"loss": 0.1471,
"step": 14
},
{
"epoch": 0.05859375,
"grad_norm": 1.4305460955933824,
"learning_rate": 1e-05,
"loss": 0.1837,
"step": 15
},
{
"epoch": 0.0625,
"grad_norm": 1.3961670104174644,
"learning_rate": 1e-05,
"loss": 0.1352,
"step": 16
},
{
"epoch": 0.0625,
"eval_dev_acc": 0.4296875,
"eval_dev_token": 5067.265625,
"eval_runtime": 167.2848,
"eval_samples_per_second": 0.096,
"eval_steps_per_second": 0.006,
"step": 16
},
{
"epoch": 0.06640625,
"grad_norm": 1.5507019702345457,
"learning_rate": 1e-05,
"loss": 0.1657,
"step": 17
},
{
"epoch": 0.0703125,
"grad_norm": 1.3395286968352729,
"learning_rate": 1e-05,
"loss": 0.1824,
"step": 18
},
{
"epoch": 0.07421875,
"grad_norm": 2.201219146342779,
"learning_rate": 1e-05,
"loss": 0.1391,
"step": 19
},
{
"epoch": 0.078125,
"grad_norm": 1.75559779570709,
"learning_rate": 1e-05,
"loss": 0.1351,
"step": 20
},
{
"epoch": 0.08203125,
"grad_norm": 2.0359121335172428,
"learning_rate": 1e-05,
"loss": 0.1748,
"step": 21
},
{
"epoch": 0.0859375,
"grad_norm": 1.6822343317370052,
"learning_rate": 1e-05,
"loss": 0.1582,
"step": 22
},
{
"epoch": 0.08984375,
"grad_norm": 1.9664935447837442,
"learning_rate": 1e-05,
"loss": 0.1338,
"step": 23
},
{
"epoch": 0.09375,
"grad_norm": 1.1463903797363937,
"learning_rate": 1e-05,
"loss": 0.1139,
"step": 24
},
{
"epoch": 0.09375,
"eval_dev_acc": 0.4296875,
"eval_dev_token": 4994.296875,
"eval_runtime": 168.4043,
"eval_samples_per_second": 0.095,
"eval_steps_per_second": 0.006,
"step": 24
},
{
"epoch": 0.09765625,
"grad_norm": 2.1728621095149627,
"learning_rate": 1e-05,
"loss": 0.1471,
"step": 25
},
{
"epoch": 0.1015625,
"grad_norm": 1.6714738223766954,
"learning_rate": 1e-05,
"loss": 0.1349,
"step": 26
},
{
"epoch": 0.10546875,
"grad_norm": 1.5574316583381629,
"learning_rate": 1e-05,
"loss": 0.1356,
"step": 27
},
{
"epoch": 0.109375,
"grad_norm": 1.4728847084572547,
"learning_rate": 1e-05,
"loss": 0.1509,
"step": 28
},
{
"epoch": 0.11328125,
"grad_norm": 1.4769394661942852,
"learning_rate": 1e-05,
"loss": 0.1294,
"step": 29
},
{
"epoch": 0.1171875,
"grad_norm": 1.8550097520759188,
"learning_rate": 1e-05,
"loss": 0.1208,
"step": 30
},
{
"epoch": 0.12109375,
"grad_norm": 1.75157088447911,
"learning_rate": 1e-05,
"loss": 0.0993,
"step": 31
},
{
"epoch": 0.125,
"grad_norm": 1.6233472727407252,
"learning_rate": 1e-05,
"loss": 0.1412,
"step": 32
},
{
"epoch": 0.125,
"eval_dev_acc": 0.4609375,
"eval_dev_token": 4228.15625,
"eval_runtime": 159.0398,
"eval_samples_per_second": 0.101,
"eval_steps_per_second": 0.006,
"step": 32
},
{
"epoch": 0.12890625,
"grad_norm": 1.5246001678514782,
"learning_rate": 1e-05,
"loss": 0.1268,
"step": 33
},
{
"epoch": 0.1328125,
"grad_norm": 1.020147996755851,
"learning_rate": 1e-05,
"loss": 0.166,
"step": 34
},
{
"epoch": 0.13671875,
"grad_norm": 0.9795032964583498,
"learning_rate": 1e-05,
"loss": 0.1223,
"step": 35
},
{
"epoch": 0.140625,
"grad_norm": 1.0328587053324862,
"learning_rate": 1e-05,
"loss": 0.0889,
"step": 36
},
{
"epoch": 0.14453125,
"grad_norm": 0.8587530858129762,
"learning_rate": 1e-05,
"loss": 0.1618,
"step": 37
},
{
"epoch": 0.1484375,
"grad_norm": 1.0451234874371433,
"learning_rate": 1e-05,
"loss": 0.1973,
"step": 38
},
{
"epoch": 0.15234375,
"grad_norm": 1.032741287831154,
"learning_rate": 1e-05,
"loss": 0.1999,
"step": 39
},
{
"epoch": 0.15625,
"grad_norm": 1.0128010813738295,
"learning_rate": 1e-05,
"loss": 0.1314,
"step": 40
},
{
"epoch": 0.15625,
"eval_dev_acc": 0.40625,
"eval_dev_token": 5015.7421875,
"eval_runtime": 167.9354,
"eval_samples_per_second": 0.095,
"eval_steps_per_second": 0.006,
"step": 40
},
{
"epoch": 0.16015625,
"grad_norm": 0.7085331860395175,
"learning_rate": 1e-05,
"loss": 0.1424,
"step": 41
},
{
"epoch": 0.1640625,
"grad_norm": 0.8522197113830303,
"learning_rate": 1e-05,
"loss": 0.1523,
"step": 42
},
{
"epoch": 0.16796875,
"grad_norm": 0.9700458234990689,
"learning_rate": 1e-05,
"loss": 0.1655,
"step": 43
},
{
"epoch": 0.171875,
"grad_norm": 2.0713947251278855,
"learning_rate": 1e-05,
"loss": 0.2946,
"step": 44
},
{
"epoch": 0.17578125,
"grad_norm": 1.6441862242379885,
"learning_rate": 1e-05,
"loss": 0.2547,
"step": 45
},
{
"epoch": 0.1796875,
"grad_norm": 1.7959964112861366,
"learning_rate": 1e-05,
"loss": 0.3009,
"step": 46
},
{
"epoch": 0.18359375,
"grad_norm": 1.3449858551505456,
"learning_rate": 1e-05,
"loss": 0.2094,
"step": 47
},
{
"epoch": 0.1875,
"grad_norm": 1.2087309569022056,
"learning_rate": 1e-05,
"loss": 0.1908,
"step": 48
},
{
"epoch": 0.1875,
"eval_dev_acc": 0.34375,
"eval_dev_token": 4538.84375,
"eval_runtime": 161.6976,
"eval_samples_per_second": 0.099,
"eval_steps_per_second": 0.006,
"step": 48
},
{
"epoch": 0.19140625,
"grad_norm": 1.1559146316352948,
"learning_rate": 1e-05,
"loss": 0.3036,
"step": 49
},
{
"epoch": 0.1953125,
"grad_norm": 1.131769529502962,
"learning_rate": 1e-05,
"loss": 0.2441,
"step": 50
},
{
"epoch": 0.19921875,
"grad_norm": 1.4116452844735226,
"learning_rate": 1e-05,
"loss": 0.2028,
"step": 51
},
{
"epoch": 0.203125,
"grad_norm": 0.7550364491986332,
"learning_rate": 1e-05,
"loss": 0.215,
"step": 52
},
{
"epoch": 0.20703125,
"grad_norm": 1.3915284765850489,
"learning_rate": 1e-05,
"loss": 0.2878,
"step": 53
},
{
"epoch": 0.2109375,
"grad_norm": 1.6351241901381652,
"learning_rate": 1e-05,
"loss": 0.2446,
"step": 54
},
{
"epoch": 0.21484375,
"grad_norm": 1.6083218458029132,
"learning_rate": 1e-05,
"loss": 0.2088,
"step": 55
},
{
"epoch": 0.21875,
"grad_norm": 0.7434150303822764,
"learning_rate": 1e-05,
"loss": 0.2262,
"step": 56
},
{
"epoch": 0.21875,
"eval_dev_acc": 0.30708661675453186,
"eval_dev_token": 5670.251953125,
"eval_runtime": 174.7692,
"eval_samples_per_second": 0.092,
"eval_steps_per_second": 0.006,
"step": 56
},
{
"epoch": 0.22265625,
"grad_norm": 1.0769799759099778,
"learning_rate": 1e-05,
"loss": 0.208,
"step": 57
},
{
"epoch": 0.2265625,
"grad_norm": 0.9298141621627772,
"learning_rate": 1e-05,
"loss": 0.1687,
"step": 58
},
{
"epoch": 0.23046875,
"grad_norm": 1.285492123129724,
"learning_rate": 1e-05,
"loss": 0.2427,
"step": 59
},
{
"epoch": 0.234375,
"grad_norm": 0.8346778861730894,
"learning_rate": 1e-05,
"loss": 0.219,
"step": 60
},
{
"epoch": 0.23828125,
"grad_norm": 0.9873196942775492,
"learning_rate": 1e-05,
"loss": 0.242,
"step": 61
},
{
"epoch": 0.2421875,
"grad_norm": 0.9596507860915271,
"learning_rate": 1e-05,
"loss": 0.2148,
"step": 62
},
{
"epoch": 0.24609375,
"grad_norm": 1.0988562593647762,
"learning_rate": 1e-05,
"loss": 0.2396,
"step": 63
},
{
"epoch": 0.25,
"grad_norm": 0.9707635131928222,
"learning_rate": 1e-05,
"loss": 0.238,
"step": 64
},
{
"epoch": 0.25,
"eval_dev_acc": 0.5390625,
"eval_dev_token": 4394.921875,
"eval_runtime": 161.3481,
"eval_samples_per_second": 0.099,
"eval_steps_per_second": 0.006,
"step": 64
},
{
"epoch": 0.25390625,
"grad_norm": 0.8083595053544823,
"learning_rate": 1e-05,
"loss": 0.293,
"step": 65
},
{
"epoch": 0.2578125,
"grad_norm": 0.6893947679382126,
"learning_rate": 1e-05,
"loss": 0.2866,
"step": 66
},
{
"epoch": 0.26171875,
"grad_norm": 1.0271679359276198,
"learning_rate": 1e-05,
"loss": 0.2276,
"step": 67
},
{
"epoch": 0.265625,
"grad_norm": 1.1776528602190077,
"learning_rate": 1e-05,
"loss": 0.1887,
"step": 68
},
{
"epoch": 0.26953125,
"grad_norm": 1.163717423684938,
"learning_rate": 1e-05,
"loss": 0.2147,
"step": 69
},
{
"epoch": 0.2734375,
"grad_norm": 0.8134427746893115,
"learning_rate": 1e-05,
"loss": 0.2342,
"step": 70
},
{
"epoch": 0.27734375,
"grad_norm": 1.4269332848478926,
"learning_rate": 1e-05,
"loss": 0.1919,
"step": 71
},
{
"epoch": 0.28125,
"grad_norm": 0.8200789264174901,
"learning_rate": 1e-05,
"loss": 0.2175,
"step": 72
},
{
"epoch": 0.28125,
"eval_dev_acc": 0.53125,
"eval_dev_token": 4859.7421875,
"eval_runtime": 166.6197,
"eval_samples_per_second": 0.096,
"eval_steps_per_second": 0.006,
"step": 72
},
{
"epoch": 0.28515625,
"grad_norm": 1.007316679088458,
"learning_rate": 1e-05,
"loss": 0.3108,
"step": 73
},
{
"epoch": 0.2890625,
"grad_norm": 0.6637709768510952,
"learning_rate": 1e-05,
"loss": 0.1794,
"step": 74
},
{
"epoch": 0.29296875,
"grad_norm": 1.0144512803754202,
"learning_rate": 1e-05,
"loss": 0.1905,
"step": 75
},
{
"epoch": 0.296875,
"grad_norm": 1.2499777112248354,
"learning_rate": 1e-05,
"loss": 0.2014,
"step": 76
},
{
"epoch": 0.30078125,
"grad_norm": 1.0642239482819718,
"learning_rate": 1e-05,
"loss": 0.1648,
"step": 77
},
{
"epoch": 0.3046875,
"grad_norm": 0.8739614674360524,
"learning_rate": 1e-05,
"loss": 0.1537,
"step": 78
},
{
"epoch": 0.30859375,
"grad_norm": 0.5320613340314281,
"learning_rate": 1e-05,
"loss": 0.2128,
"step": 79
},
{
"epoch": 0.3125,
"grad_norm": 1.2802208673828028,
"learning_rate": 1e-05,
"loss": 0.1939,
"step": 80
},
{
"epoch": 0.3125,
"eval_dev_acc": 0.4609375,
"eval_dev_token": 5065.421875,
"eval_runtime": 168.4523,
"eval_samples_per_second": 0.095,
"eval_steps_per_second": 0.006,
"step": 80
},
{
"epoch": 0.31640625,
"grad_norm": 1.1564057868614226,
"learning_rate": 1e-05,
"loss": 0.2215,
"step": 81
},
{
"epoch": 0.3203125,
"grad_norm": 0.7104999594850884,
"learning_rate": 1e-05,
"loss": 0.1224,
"step": 82
},
{
"epoch": 0.32421875,
"grad_norm": 0.6466657594813067,
"learning_rate": 1e-05,
"loss": 0.145,
"step": 83
},
{
"epoch": 0.328125,
"grad_norm": 1.3499118701284736,
"learning_rate": 1e-05,
"loss": 0.1963,
"step": 84
},
{
"epoch": 0.33203125,
"grad_norm": 0.6363338361760021,
"learning_rate": 1e-05,
"loss": 0.1781,
"step": 85
},
{
"epoch": 0.3359375,
"grad_norm": 0.8807906150832371,
"learning_rate": 1e-05,
"loss": 0.1426,
"step": 86
},
{
"epoch": 0.33984375,
"grad_norm": 0.7466707582875238,
"learning_rate": 1e-05,
"loss": 0.1629,
"step": 87
},
{
"epoch": 0.34375,
"grad_norm": 0.7773292125565866,
"learning_rate": 1e-05,
"loss": 0.181,
"step": 88
},
{
"epoch": 0.34375,
"eval_dev_acc": 0.4609375,
"eval_dev_token": 5092.8984375,
"eval_runtime": 168.9275,
"eval_samples_per_second": 0.095,
"eval_steps_per_second": 0.006,
"step": 88
},
{
"epoch": 0.34765625,
"grad_norm": 0.9798290139606278,
"learning_rate": 1e-05,
"loss": 0.1725,
"step": 89
},
{
"epoch": 0.3515625,
"grad_norm": 1.2761428002675261,
"learning_rate": 1e-05,
"loss": 0.175,
"step": 90
},
{
"epoch": 0.35546875,
"grad_norm": 0.5042091805859357,
"learning_rate": 1e-05,
"loss": 0.218,
"step": 91
},
{
"epoch": 0.359375,
"grad_norm": 1.017358230975041,
"learning_rate": 1e-05,
"loss": 0.2502,
"step": 92
},
{
"epoch": 0.36328125,
"grad_norm": 0.7366049175316091,
"learning_rate": 1e-05,
"loss": 0.1656,
"step": 93
},
{
"epoch": 0.3671875,
"grad_norm": 0.9422427666318486,
"learning_rate": 1e-05,
"loss": 0.1455,
"step": 94
},
{
"epoch": 0.37109375,
"grad_norm": 0.7689775552730859,
"learning_rate": 1e-05,
"loss": 0.1485,
"step": 95
},
{
"epoch": 0.375,
"grad_norm": 0.9090457524355386,
"learning_rate": 1e-05,
"loss": 0.1411,
"step": 96
},
{
"epoch": 0.375,
"eval_dev_acc": 0.453125,
"eval_dev_token": 4948.8359375,
"eval_runtime": 165.5377,
"eval_samples_per_second": 0.097,
"eval_steps_per_second": 0.006,
"step": 96
},
{
"epoch": 0.37890625,
"grad_norm": 0.7235724828873173,
"learning_rate": 1e-05,
"loss": 0.2193,
"step": 97
},
{
"epoch": 0.3828125,
"grad_norm": 0.7200445685294068,
"learning_rate": 1e-05,
"loss": 0.1985,
"step": 98
},
{
"epoch": 0.38671875,
"grad_norm": 0.6060156821220763,
"learning_rate": 1e-05,
"loss": 0.2096,
"step": 99
},
{
"epoch": 0.390625,
"grad_norm": 0.7114968462244617,
"learning_rate": 1e-05,
"loss": 0.1928,
"step": 100
},
{
"epoch": 0.39453125,
"grad_norm": 0.6397518359548336,
"learning_rate": 1e-05,
"loss": 0.2165,
"step": 101
},
{
"epoch": 0.3984375,
"grad_norm": 0.7027126137819094,
"learning_rate": 1e-05,
"loss": 0.2263,
"step": 102
},
{
"epoch": 0.40234375,
"grad_norm": 0.8648981933002193,
"learning_rate": 1e-05,
"loss": 0.2874,
"step": 103
},
{
"epoch": 0.40625,
"grad_norm": 0.9742992968412495,
"learning_rate": 1e-05,
"loss": 0.1755,
"step": 104
},
{
"epoch": 0.40625,
"eval_dev_acc": 0.3515625,
"eval_dev_token": 5303.1796875,
"eval_runtime": 173.9477,
"eval_samples_per_second": 0.092,
"eval_steps_per_second": 0.006,
"step": 104
},
{
"epoch": 0.41015625,
"grad_norm": 0.6358933759276069,
"learning_rate": 1e-05,
"loss": 0.1907,
"step": 105
},
{
"epoch": 0.4140625,
"grad_norm": 0.7859972506268991,
"learning_rate": 1e-05,
"loss": 0.1731,
"step": 106
},
{
"epoch": 0.41796875,
"grad_norm": 0.6429885607052577,
"learning_rate": 1e-05,
"loss": 0.187,
"step": 107
},
{
"epoch": 0.421875,
"grad_norm": 0.6314004528855494,
"learning_rate": 1e-05,
"loss": 0.2185,
"step": 108
},
{
"epoch": 0.42578125,
"grad_norm": 0.8243656111706104,
"learning_rate": 1e-05,
"loss": 0.1384,
"step": 109
},
{
"epoch": 0.4296875,
"grad_norm": 0.7310074535827911,
"learning_rate": 1e-05,
"loss": 0.1724,
"step": 110
},
{
"epoch": 0.43359375,
"grad_norm": 1.8710293554497974,
"learning_rate": 1e-05,
"loss": 0.273,
"step": 111
},
{
"epoch": 0.4375,
"grad_norm": 1.3308164398688347,
"learning_rate": 1e-05,
"loss": 0.2852,
"step": 112
},
{
"epoch": 0.4375,
"eval_dev_acc": 0.296875,
"eval_dev_token": 5770.9375,
"eval_runtime": 175.5918,
"eval_samples_per_second": 0.091,
"eval_steps_per_second": 0.006,
"step": 112
},
{
"epoch": 0.44140625,
"grad_norm": 0.4499041384963393,
"learning_rate": 1e-05,
"loss": 0.1845,
"step": 113
},
{
"epoch": 0.4453125,
"grad_norm": 0.5818915994231291,
"learning_rate": 1e-05,
"loss": 0.2709,
"step": 114
},
{
"epoch": 0.44921875,
"grad_norm": 0.6130904000526848,
"learning_rate": 1e-05,
"loss": 0.231,
"step": 115
},
{
"epoch": 0.453125,
"grad_norm": 0.7266034880537791,
"learning_rate": 1e-05,
"loss": 0.1555,
"step": 116
},
{
"epoch": 0.45703125,
"grad_norm": 0.425032745279421,
"learning_rate": 1e-05,
"loss": 0.1733,
"step": 117
},
{
"epoch": 0.4609375,
"grad_norm": 0.41408811254876093,
"learning_rate": 1e-05,
"loss": 0.1793,
"step": 118
},
{
"epoch": 0.46484375,
"grad_norm": 0.8433491024471641,
"learning_rate": 1e-05,
"loss": 0.2335,
"step": 119
},
{
"epoch": 0.46875,
"grad_norm": 0.5585183306922875,
"learning_rate": 1e-05,
"loss": 0.2515,
"step": 120
},
{
"epoch": 0.46875,
"eval_dev_acc": 0.4724409580230713,
"eval_dev_token": 4777.55126953125,
"eval_runtime": 165.1485,
"eval_samples_per_second": 0.097,
"eval_steps_per_second": 0.006,
"step": 120
},
{
"epoch": 0.47265625,
"grad_norm": 0.9520218462259554,
"learning_rate": 1e-05,
"loss": 0.2613,
"step": 121
},
{
"epoch": 0.4765625,
"grad_norm": 0.4858585527334522,
"learning_rate": 1e-05,
"loss": 0.2379,
"step": 122
},
{
"epoch": 0.48046875,
"grad_norm": 0.5772160567620949,
"learning_rate": 1e-05,
"loss": 0.241,
"step": 123
},
{
"epoch": 0.484375,
"grad_norm": 0.731954162407159,
"learning_rate": 1e-05,
"loss": 0.2482,
"step": 124
},
{
"epoch": 0.48828125,
"grad_norm": 0.49226621710163243,
"learning_rate": 1e-05,
"loss": 0.2333,
"step": 125
},
{
"epoch": 0.4921875,
"grad_norm": 0.43779404197089106,
"learning_rate": 1e-05,
"loss": 0.185,
"step": 126
},
{
"epoch": 0.49609375,
"grad_norm": 0.6856986141306837,
"learning_rate": 1e-05,
"loss": 0.1943,
"step": 127
},
{
"epoch": 0.5,
"grad_norm": 0.6558122415773976,
"learning_rate": 1e-05,
"loss": 0.2185,
"step": 128
},
{
"epoch": 0.5,
"eval_dev_acc": 0.4765625,
"eval_dev_token": 4368.859375,
"eval_runtime": 161.9718,
"eval_samples_per_second": 0.099,
"eval_steps_per_second": 0.006,
"step": 128
},
{
"epoch": 0.50390625,
"grad_norm": 0.4099906022533745,
"learning_rate": 1e-05,
"loss": 0.2113,
"step": 129
},
{
"epoch": 0.5078125,
"grad_norm": 0.49752415105495956,
"learning_rate": 1e-05,
"loss": 0.2217,
"step": 130
},
{
"epoch": 0.51171875,
"grad_norm": 0.8912790018467623,
"learning_rate": 1e-05,
"loss": 0.3422,
"step": 131
},
{
"epoch": 0.515625,
"grad_norm": 0.6764829647253893,
"learning_rate": 1e-05,
"loss": 0.2055,
"step": 132
},
{
"epoch": 0.51953125,
"grad_norm": 0.8399641090693946,
"learning_rate": 1e-05,
"loss": 0.2087,
"step": 133
},
{
"epoch": 0.5234375,
"grad_norm": 0.4594160953603203,
"learning_rate": 1e-05,
"loss": 0.2093,
"step": 134
},
{
"epoch": 0.52734375,
"grad_norm": 0.7432138703184232,
"learning_rate": 1e-05,
"loss": 0.1969,
"step": 135
},
{
"epoch": 0.53125,
"grad_norm": 0.4584467325236011,
"learning_rate": 1e-05,
"loss": 0.1806,
"step": 136
},
{
"epoch": 0.53125,
"eval_dev_acc": 0.4765625,
"eval_dev_token": 4603.53125,
"eval_runtime": 164.3452,
"eval_samples_per_second": 0.097,
"eval_steps_per_second": 0.006,
"step": 136
},
{
"epoch": 0.53515625,
"grad_norm": 0.6458588312529675,
"learning_rate": 1e-05,
"loss": 0.2087,
"step": 137
},
{
"epoch": 0.5390625,
"grad_norm": 0.7370624067340756,
"learning_rate": 1e-05,
"loss": 0.1854,
"step": 138
},
{
"epoch": 0.54296875,
"grad_norm": 0.7141604462138248,
"learning_rate": 1e-05,
"loss": 0.2535,
"step": 139
},
{
"epoch": 0.546875,
"grad_norm": 0.8212814690178184,
"learning_rate": 1e-05,
"loss": 0.1668,
"step": 140
},
{
"epoch": 0.55078125,
"grad_norm": 0.5799692948316157,
"learning_rate": 1e-05,
"loss": 0.2375,
"step": 141
},
{
"epoch": 0.5546875,
"grad_norm": 0.5333639624775814,
"learning_rate": 1e-05,
"loss": 0.1737,
"step": 142
},
{
"epoch": 0.55859375,
"grad_norm": 0.4076841439195106,
"learning_rate": 1e-05,
"loss": 0.1627,
"step": 143
},
{
"epoch": 0.5625,
"grad_norm": 0.4118175478201596,
"learning_rate": 1e-05,
"loss": 0.1576,
"step": 144
},
{
"epoch": 0.5625,
"eval_dev_acc": 0.5234375,
"eval_dev_token": 5125.0703125,
"eval_runtime": 168.804,
"eval_samples_per_second": 0.095,
"eval_steps_per_second": 0.006,
"step": 144
},
{
"epoch": 0.56640625,
"grad_norm": 0.5988381099011506,
"learning_rate": 1e-05,
"loss": 0.1656,
"step": 145
},
{
"epoch": 0.5703125,
"grad_norm": 0.9328153493065982,
"learning_rate": 1e-05,
"loss": 0.1788,
"step": 146
},
{
"epoch": 0.57421875,
"grad_norm": 0.8013592126955402,
"learning_rate": 1e-05,
"loss": 0.2009,
"step": 147
},
{
"epoch": 0.578125,
"grad_norm": 0.4868159061171701,
"learning_rate": 1e-05,
"loss": 0.217,
"step": 148
},
{
"epoch": 0.58203125,
"grad_norm": 0.6758953539585006,
"learning_rate": 1e-05,
"loss": 0.2344,
"step": 149
},
{
"epoch": 0.5859375,
"grad_norm": 0.8609458752061137,
"learning_rate": 1e-05,
"loss": 0.1939,
"step": 150
},
{
"epoch": 0.58984375,
"grad_norm": 0.45913847739444186,
"learning_rate": 1e-05,
"loss": 0.1691,
"step": 151
},
{
"epoch": 0.59375,
"grad_norm": 0.8064977044716175,
"learning_rate": 1e-05,
"loss": 0.1949,
"step": 152
},
{
"epoch": 0.59375,
"eval_dev_acc": 0.40625,
"eval_dev_token": 4508.484375,
"eval_runtime": 160.3398,
"eval_samples_per_second": 0.1,
"eval_steps_per_second": 0.006,
"step": 152
},
{
"epoch": 0.59765625,
"grad_norm": 0.9904042315049291,
"learning_rate": 1e-05,
"loss": 0.2253,
"step": 153
},
{
"epoch": 0.6015625,
"grad_norm": 0.5524318414569037,
"learning_rate": 1e-05,
"loss": 0.2535,
"step": 154
},
{
"epoch": 0.60546875,
"grad_norm": 0.418186463867415,
"learning_rate": 1e-05,
"loss": 0.1884,
"step": 155
},
{
"epoch": 0.609375,
"grad_norm": 0.6311027708045368,
"learning_rate": 1e-05,
"loss": 0.2408,
"step": 156
},
{
"epoch": 0.61328125,
"grad_norm": 0.4550696199781805,
"learning_rate": 1e-05,
"loss": 0.173,
"step": 157
},
{
"epoch": 0.6171875,
"grad_norm": 0.4596598696608727,
"learning_rate": 1e-05,
"loss": 0.1592,
"step": 158
},
{
"epoch": 0.62109375,
"grad_norm": 0.5573937890044522,
"learning_rate": 1e-05,
"loss": 0.1748,
"step": 159
},
{
"epoch": 0.625,
"grad_norm": 1.0862165315332113,
"learning_rate": 1e-05,
"loss": 0.2369,
"step": 160
},
{
"epoch": 0.625,
"eval_dev_acc": 0.4296875,
"eval_dev_token": 4869.8828125,
"eval_runtime": 167.2914,
"eval_samples_per_second": 0.096,
"eval_steps_per_second": 0.006,
"step": 160
},
{
"epoch": 0.62890625,
"grad_norm": 0.46051384064237827,
"learning_rate": 1e-05,
"loss": 0.2086,
"step": 161
},
{
"epoch": 0.6328125,
"grad_norm": 0.7125397532570018,
"learning_rate": 1e-05,
"loss": 0.2212,
"step": 162
},
{
"epoch": 0.63671875,
"grad_norm": 0.564820498711706,
"learning_rate": 1e-05,
"loss": 0.3019,
"step": 163
},
{
"epoch": 0.640625,
"grad_norm": 0.5218656690400247,
"learning_rate": 1e-05,
"loss": 0.1324,
"step": 164
},
{
"epoch": 0.64453125,
"grad_norm": 0.4994022980399308,
"learning_rate": 1e-05,
"loss": 0.1438,
"step": 165
},
{
"epoch": 0.6484375,
"grad_norm": 0.7016809849517179,
"learning_rate": 1e-05,
"loss": 0.2791,
"step": 166
},
{
"epoch": 0.65234375,
"grad_norm": 0.597463304680723,
"learning_rate": 1e-05,
"loss": 0.1749,
"step": 167
},
{
"epoch": 0.65625,
"grad_norm": 0.5536855781273838,
"learning_rate": 1e-05,
"loss": 0.2391,
"step": 168
},
{
"epoch": 0.65625,
"eval_dev_acc": 0.3203125,
"eval_dev_token": 5451.3671875,
"eval_runtime": 172.7574,
"eval_samples_per_second": 0.093,
"eval_steps_per_second": 0.006,
"step": 168
},
{
"epoch": 0.66015625,
"grad_norm": 0.9103508979108635,
"learning_rate": 1e-05,
"loss": 0.2613,
"step": 169
},
{
"epoch": 0.6640625,
"grad_norm": 0.4928845564740678,
"learning_rate": 1e-05,
"loss": 0.215,
"step": 170
},
{
"epoch": 0.66796875,
"grad_norm": 0.8690405638773996,
"learning_rate": 1e-05,
"loss": 0.2355,
"step": 171
},
{
"epoch": 0.671875,
"grad_norm": 0.5511255682147113,
"learning_rate": 1e-05,
"loss": 0.2406,
"step": 172
},
{
"epoch": 0.67578125,
"grad_norm": 0.44346107905460214,
"learning_rate": 1e-05,
"loss": 0.1867,
"step": 173
},
{
"epoch": 0.6796875,
"grad_norm": 0.4019557678019079,
"learning_rate": 1e-05,
"loss": 0.1488,
"step": 174
},
{
"epoch": 0.68359375,
"grad_norm": 0.4139658009208469,
"learning_rate": 1e-05,
"loss": 0.1666,
"step": 175
},
{
"epoch": 0.6875,
"grad_norm": 0.45363011716779816,
"learning_rate": 1e-05,
"loss": 0.2006,
"step": 176
},
{
"epoch": 0.6875,
"eval_dev_acc": 0.3385826647281647,
"eval_dev_token": 4971.81884765625,
"eval_runtime": 166.9967,
"eval_samples_per_second": 0.096,
"eval_steps_per_second": 0.006,
"step": 176
},
{
"epoch": 0.69140625,
"grad_norm": 0.46674698673244774,
"learning_rate": 1e-05,
"loss": 0.1788,
"step": 177
},
{
"epoch": 0.6953125,
"grad_norm": 0.5396579551057291,
"learning_rate": 1e-05,
"loss": 0.1857,
"step": 178
},
{
"epoch": 0.69921875,
"grad_norm": 0.42472472699800484,
"learning_rate": 1e-05,
"loss": 0.1707,
"step": 179
},
{
"epoch": 0.703125,
"grad_norm": 0.4208916108378261,
"learning_rate": 1e-05,
"loss": 0.1736,
"step": 180
},
{
"epoch": 0.70703125,
"grad_norm": 0.5161632347165661,
"learning_rate": 1e-05,
"loss": 0.2074,
"step": 181
},
{
"epoch": 0.7109375,
"grad_norm": 0.4851147968745633,
"learning_rate": 1e-05,
"loss": 0.2183,
"step": 182
},
{
"epoch": 0.71484375,
"grad_norm": 0.5286494967968609,
"learning_rate": 1e-05,
"loss": 0.1877,
"step": 183
},
{
"epoch": 0.71875,
"grad_norm": 0.5399316089624949,
"learning_rate": 1e-05,
"loss": 0.209,
"step": 184
},
{
"epoch": 0.71875,
"eval_dev_acc": 0.3984375,
"eval_dev_token": 4787.84375,
"eval_runtime": 166.2574,
"eval_samples_per_second": 0.096,
"eval_steps_per_second": 0.006,
"step": 184
},
{
"epoch": 0.72265625,
"grad_norm": 0.7188938790166789,
"learning_rate": 1e-05,
"loss": 0.2065,
"step": 185
},
{
"epoch": 0.7265625,
"grad_norm": 0.5843767003652576,
"learning_rate": 1e-05,
"loss": 0.2356,
"step": 186
},
{
"epoch": 0.73046875,
"grad_norm": 0.4904003204685076,
"learning_rate": 1e-05,
"loss": 0.201,
"step": 187
},
{
"epoch": 0.734375,
"grad_norm": 0.485266158116283,
"learning_rate": 1e-05,
"loss": 0.1869,
"step": 188
},
{
"epoch": 0.73828125,
"grad_norm": 0.5242977395658632,
"learning_rate": 1e-05,
"loss": 0.2122,
"step": 189
},
{
"epoch": 0.7421875,
"grad_norm": 0.5417537780138298,
"learning_rate": 1e-05,
"loss": 0.2799,
"step": 190
},
{
"epoch": 0.74609375,
"grad_norm": 0.48949419193338123,
"learning_rate": 1e-05,
"loss": 0.212,
"step": 191
},
{
"epoch": 0.75,
"grad_norm": 0.48118963817889204,
"learning_rate": 1e-05,
"loss": 0.2195,
"step": 192
},
{
"epoch": 0.75,
"eval_dev_acc": 0.453125,
"eval_dev_token": 5056.7421875,
"eval_runtime": 168.273,
"eval_samples_per_second": 0.095,
"eval_steps_per_second": 0.006,
"step": 192
},
{
"epoch": 0.75390625,
"grad_norm": 0.6844465372064547,
"learning_rate": 1e-05,
"loss": 0.1645,
"step": 193
},
{
"epoch": 0.7578125,
"grad_norm": 0.49653100043792153,
"learning_rate": 1e-05,
"loss": 0.2023,
"step": 194
},
{
"epoch": 0.76171875,
"grad_norm": 0.5539027026151374,
"learning_rate": 1e-05,
"loss": 0.2348,
"step": 195
},
{
"epoch": 0.765625,
"grad_norm": 0.5003270709383194,
"learning_rate": 1e-05,
"loss": 0.2545,
"step": 196
},
{
"epoch": 0.76953125,
"grad_norm": 0.5666703162116131,
"learning_rate": 1e-05,
"loss": 0.2739,
"step": 197
},
{
"epoch": 0.7734375,
"grad_norm": 0.5281121627729704,
"learning_rate": 1e-05,
"loss": 0.1927,
"step": 198
},
{
"epoch": 0.77734375,
"grad_norm": 0.4691586351966124,
"learning_rate": 1e-05,
"loss": 0.2101,
"step": 199
},
{
"epoch": 0.78125,
"grad_norm": 0.43348894899907703,
"learning_rate": 1e-05,
"loss": 0.1636,
"step": 200
},
{
"epoch": 0.78125,
"eval_dev_acc": 0.4296875,
"eval_dev_token": 5082.265625,
"eval_runtime": 169.7777,
"eval_samples_per_second": 0.094,
"eval_steps_per_second": 0.006,
"step": 200
},
{
"epoch": 0.78515625,
"grad_norm": 0.4995118305726593,
"learning_rate": 1e-05,
"loss": 0.2149,
"step": 201
},
{
"epoch": 0.7890625,
"grad_norm": 0.3958721084761467,
"learning_rate": 1e-05,
"loss": 0.1732,
"step": 202
},
{
"epoch": 0.79296875,
"grad_norm": 0.4883258744044862,
"learning_rate": 1e-05,
"loss": 0.219,
"step": 203
},
{
"epoch": 0.796875,
"grad_norm": 0.45472746506302575,
"learning_rate": 1e-05,
"loss": 0.2187,
"step": 204
},
{
"epoch": 0.80078125,
"grad_norm": 0.45006095039367805,
"learning_rate": 1e-05,
"loss": 0.1924,
"step": 205
},
{
"epoch": 0.8046875,
"grad_norm": 0.4127537232406072,
"learning_rate": 1e-05,
"loss": 0.1736,
"step": 206
},
{
"epoch": 0.80859375,
"grad_norm": 0.4669392415601201,
"learning_rate": 1e-05,
"loss": 0.1847,
"step": 207
},
{
"epoch": 0.8125,
"grad_norm": 0.41469363114093816,
"learning_rate": 1e-05,
"loss": 0.1556,
"step": 208
},
{
"epoch": 0.8125,
"eval_dev_acc": 0.4609375,
"eval_dev_token": 4918.28125,
"eval_runtime": 166.5675,
"eval_samples_per_second": 0.096,
"eval_steps_per_second": 0.006,
"step": 208
},
{
"epoch": 0.81640625,
"grad_norm": 0.4433576280938302,
"learning_rate": 1e-05,
"loss": 0.1934,
"step": 209
},
{
"epoch": 0.8203125,
"grad_norm": 0.4355305023653351,
"learning_rate": 1e-05,
"loss": 0.1742,
"step": 210
},
{
"epoch": 0.82421875,
"grad_norm": 0.44938618579632195,
"learning_rate": 1e-05,
"loss": 0.1902,
"step": 211
},
{
"epoch": 0.828125,
"grad_norm": 0.5351771463999816,
"learning_rate": 1e-05,
"loss": 0.2148,
"step": 212
},
{
"epoch": 0.83203125,
"grad_norm": 0.5839350362138708,
"learning_rate": 1e-05,
"loss": 0.275,
"step": 213
},
{
"epoch": 0.8359375,
"grad_norm": 0.6964110745693202,
"learning_rate": 1e-05,
"loss": 0.2179,
"step": 214
},
{
"epoch": 0.83984375,
"grad_norm": 0.4337830660702992,
"learning_rate": 1e-05,
"loss": 0.2152,
"step": 215
},
{
"epoch": 0.84375,
"grad_norm": 0.46223312750006246,
"learning_rate": 1e-05,
"loss": 0.2405,
"step": 216
},
{
"epoch": 0.84375,
"eval_dev_acc": 0.3828125,
"eval_dev_token": 5435.3046875,
"eval_runtime": 173.8173,
"eval_samples_per_second": 0.092,
"eval_steps_per_second": 0.006,
"step": 216
},
{
"epoch": 0.84765625,
"grad_norm": 0.5541820526606585,
"learning_rate": 1e-05,
"loss": 0.2751,
"step": 217
},
{
"epoch": 0.8515625,
"grad_norm": 0.4662570041545537,
"learning_rate": 1e-05,
"loss": 0.2142,
"step": 218
},
{
"epoch": 0.85546875,
"grad_norm": 0.7737037625157579,
"learning_rate": 1e-05,
"loss": 0.2397,
"step": 219
},
{
"epoch": 0.859375,
"grad_norm": 0.5572195616624243,
"learning_rate": 1e-05,
"loss": 0.2421,
"step": 220
},
{
"epoch": 0.86328125,
"grad_norm": 0.5088509372691609,
"learning_rate": 1e-05,
"loss": 0.1875,
"step": 221
},
{
"epoch": 0.8671875,
"grad_norm": 0.508699458613964,
"learning_rate": 1e-05,
"loss": 0.1927,
"step": 222
},
{
"epoch": 0.87109375,
"grad_norm": 0.5150091482241945,
"learning_rate": 1e-05,
"loss": 0.2536,
"step": 223
},
{
"epoch": 0.875,
"grad_norm": 0.5203627078659161,
"learning_rate": 1e-05,
"loss": 0.2571,
"step": 224
},
{
"epoch": 0.875,
"eval_dev_acc": 0.3515625,
"eval_dev_token": 5227.0859375,
"eval_runtime": 170.2355,
"eval_samples_per_second": 0.094,
"eval_steps_per_second": 0.006,
"step": 224
},
{
"epoch": 0.87890625,
"grad_norm": 0.5279392216696818,
"learning_rate": 1e-05,
"loss": 0.2278,
"step": 225
},
{
"epoch": 0.8828125,
"grad_norm": 0.45017131620724865,
"learning_rate": 1e-05,
"loss": 0.2132,
"step": 226
},
{
"epoch": 0.88671875,
"grad_norm": 0.48915211275869575,
"learning_rate": 1e-05,
"loss": 0.2627,
"step": 227
},
{
"epoch": 0.890625,
"grad_norm": 0.4606618945421734,
"learning_rate": 1e-05,
"loss": 0.1528,
"step": 228
},
{
"epoch": 0.89453125,
"grad_norm": 0.5072593200666395,
"learning_rate": 1e-05,
"loss": 0.2148,
"step": 229
},
{
"epoch": 0.8984375,
"grad_norm": 0.5513069869439534,
"learning_rate": 1e-05,
"loss": 0.2319,
"step": 230
},
{
"epoch": 0.90234375,
"grad_norm": 0.4917083878550277,
"learning_rate": 1e-05,
"loss": 0.1989,
"step": 231
},
{
"epoch": 0.90625,
"grad_norm": 0.4027028580105545,
"learning_rate": 1e-05,
"loss": 0.1398,
"step": 232
},
{
"epoch": 0.90625,
"eval_dev_acc": 0.3779527544975281,
"eval_dev_token": 5651.6455078125,
"eval_runtime": 175.5543,
"eval_samples_per_second": 0.091,
"eval_steps_per_second": 0.006,
"step": 232
},
{
"epoch": 0.91015625,
"grad_norm": 0.4098440727615931,
"learning_rate": 1e-05,
"loss": 0.1481,
"step": 233
},
{
"epoch": 0.9140625,
"grad_norm": 0.4379253949500134,
"learning_rate": 1e-05,
"loss": 0.172,
"step": 234
},
{
"epoch": 0.91796875,
"grad_norm": 0.6161974608496972,
"learning_rate": 1e-05,
"loss": 0.2234,
"step": 235
},
{
"epoch": 0.921875,
"grad_norm": 0.6431694552333217,
"learning_rate": 1e-05,
"loss": 0.2928,
"step": 236
},
{
"epoch": 0.92578125,
"grad_norm": 0.7524837454023333,
"learning_rate": 1e-05,
"loss": 0.3518,
"step": 237
},
{
"epoch": 0.9296875,
"grad_norm": 0.5137794157548315,
"learning_rate": 1e-05,
"loss": 0.2371,
"step": 238
},
{
"epoch": 0.93359375,
"grad_norm": 0.42726761741926383,
"learning_rate": 1e-05,
"loss": 0.1349,
"step": 239
},
{
"epoch": 0.9375,
"grad_norm": 0.50721507122848,
"learning_rate": 1e-05,
"loss": 0.147,
"step": 240
},
{
"epoch": 0.9375,
"eval_dev_acc": 0.4375,
"eval_dev_token": 5554.34375,
"eval_runtime": 173.4206,
"eval_samples_per_second": 0.092,
"eval_steps_per_second": 0.006,
"step": 240
},
{
"epoch": 0.94140625,
"grad_norm": 0.5085504060972834,
"learning_rate": 1e-05,
"loss": 0.2115,
"step": 241
},
{
"epoch": 0.9453125,
"grad_norm": 0.5245333395138617,
"learning_rate": 1e-05,
"loss": 0.2203,
"step": 242
},
{
"epoch": 0.94921875,
"grad_norm": 0.5149241747645703,
"learning_rate": 1e-05,
"loss": 0.1935,
"step": 243
},
{
"epoch": 0.953125,
"grad_norm": 0.45199967311107936,
"learning_rate": 1e-05,
"loss": 0.1875,
"step": 244
},
{
"epoch": 0.95703125,
"grad_norm": 0.6017279864923942,
"learning_rate": 1e-05,
"loss": 0.1964,
"step": 245
},
{
"epoch": 0.9609375,
"grad_norm": 0.541548647166723,
"learning_rate": 1e-05,
"loss": 0.2029,
"step": 246
},
{
"epoch": 0.96484375,
"grad_norm": 0.7095706252744872,
"learning_rate": 1e-05,
"loss": 0.1824,
"step": 247
},
{
"epoch": 0.96875,
"grad_norm": 0.6630534512223186,
"learning_rate": 1e-05,
"loss": 0.2346,
"step": 248
},
{
"epoch": 0.96875,
"eval_dev_acc": 0.5234375,
"eval_dev_token": 5464.203125,
"eval_runtime": 173.0858,
"eval_samples_per_second": 0.092,
"eval_steps_per_second": 0.006,
"step": 248
},
{
"epoch": 0.97265625,
"grad_norm": 0.7470938668923351,
"learning_rate": 1e-05,
"loss": 0.3028,
"step": 249
},
{
"epoch": 0.9765625,
"grad_norm": 0.534162369114681,
"learning_rate": 1e-05,
"loss": 0.243,
"step": 250
},
{
"epoch": 0.98046875,
"grad_norm": 0.5240149993617814,
"learning_rate": 1e-05,
"loss": 0.2475,
"step": 251
},
{
"epoch": 0.984375,
"grad_norm": 0.48058164633897993,
"learning_rate": 1e-05,
"loss": 0.2234,
"step": 252
},
{
"epoch": 0.98828125,
"grad_norm": 0.5427424821749397,
"learning_rate": 1e-05,
"loss": 0.2338,
"step": 253
},
{
"epoch": 0.9921875,
"grad_norm": 0.5309304323745797,
"learning_rate": 1e-05,
"loss": 0.2751,
"step": 254
},
{
"epoch": 0.99609375,
"grad_norm": 0.4961154954055658,
"learning_rate": 1e-05,
"loss": 0.2329,
"step": 255
},
{
"epoch": 1.0,
"grad_norm": 0.519835488758917,
"learning_rate": 1e-05,
"loss": 0.2182,
"step": 256
},
{
"epoch": 1.0,
"eval_dev_acc": 0.4453125,
"eval_dev_token": 5674.0546875,
"eval_runtime": 175.8662,
"eval_samples_per_second": 0.091,
"eval_steps_per_second": 0.006,
"step": 256
}
],
"logging_steps": 1.0,
"max_steps": 256,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 64,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 31380919492608.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}