|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 8, |
|
"global_step": 256, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00390625, |
|
"grad_norm": 3.380525042530954, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2859, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0078125, |
|
"grad_norm": 2.6901012326349156, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2117, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.01171875, |
|
"grad_norm": 3.191447237922227, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2602, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.015625, |
|
"grad_norm": 2.204083519446381, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1972, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.01953125, |
|
"grad_norm": 2.0481149317155687, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2338, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0234375, |
|
"grad_norm": 1.6269814174466988, |
|
"learning_rate": 1e-05, |
|
"loss": 0.214, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.02734375, |
|
"grad_norm": 1.6908703624878527, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2088, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.03125, |
|
"grad_norm": 1.2059719622160197, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1975, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.03125, |
|
"eval_dev_acc": 0.515625, |
|
"eval_dev_token": 4849.7578125, |
|
"eval_runtime": 168.4394, |
|
"eval_samples_per_second": 0.095, |
|
"eval_steps_per_second": 0.006, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.03515625, |
|
"grad_norm": 1.6837720712641369, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1873, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0390625, |
|
"grad_norm": 1.2504651087103098, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1959, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04296875, |
|
"grad_norm": 1.3187603751382884, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2135, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.046875, |
|
"grad_norm": 1.3545446581007174, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2428, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.05078125, |
|
"grad_norm": 1.6286051945906104, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1708, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0546875, |
|
"grad_norm": 1.6081830921647842, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1471, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.05859375, |
|
"grad_norm": 1.4305460955933824, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1837, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 1.3961670104174644, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1352, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"eval_dev_acc": 0.4296875, |
|
"eval_dev_token": 5067.265625, |
|
"eval_runtime": 167.2848, |
|
"eval_samples_per_second": 0.096, |
|
"eval_steps_per_second": 0.006, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.06640625, |
|
"grad_norm": 1.5507019702345457, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1657, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0703125, |
|
"grad_norm": 1.3395286968352729, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1824, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.07421875, |
|
"grad_norm": 2.201219146342779, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1391, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.078125, |
|
"grad_norm": 1.75559779570709, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1351, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08203125, |
|
"grad_norm": 2.0359121335172428, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1748, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.0859375, |
|
"grad_norm": 1.6822343317370052, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1582, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.08984375, |
|
"grad_norm": 1.9664935447837442, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1338, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.09375, |
|
"grad_norm": 1.1463903797363937, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1139, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.09375, |
|
"eval_dev_acc": 0.4296875, |
|
"eval_dev_token": 4994.296875, |
|
"eval_runtime": 168.4043, |
|
"eval_samples_per_second": 0.095, |
|
"eval_steps_per_second": 0.006, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.09765625, |
|
"grad_norm": 2.1728621095149627, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1471, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.1015625, |
|
"grad_norm": 1.6714738223766954, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1349, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.10546875, |
|
"grad_norm": 1.5574316583381629, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1356, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.109375, |
|
"grad_norm": 1.4728847084572547, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1509, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.11328125, |
|
"grad_norm": 1.4769394661942852, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1294, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.1171875, |
|
"grad_norm": 1.8550097520759188, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1208, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.12109375, |
|
"grad_norm": 1.75157088447911, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0993, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 1.6233472727407252, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1412, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"eval_dev_acc": 0.4609375, |
|
"eval_dev_token": 4228.15625, |
|
"eval_runtime": 159.0398, |
|
"eval_samples_per_second": 0.101, |
|
"eval_steps_per_second": 0.006, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.12890625, |
|
"grad_norm": 1.5246001678514782, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1268, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.1328125, |
|
"grad_norm": 1.020147996755851, |
|
"learning_rate": 1e-05, |
|
"loss": 0.166, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.13671875, |
|
"grad_norm": 0.9795032964583498, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1223, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.140625, |
|
"grad_norm": 1.0328587053324862, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0889, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.14453125, |
|
"grad_norm": 0.8587530858129762, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1618, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.1484375, |
|
"grad_norm": 1.0451234874371433, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1973, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.15234375, |
|
"grad_norm": 1.032741287831154, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1999, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"grad_norm": 1.0128010813738295, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1314, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"eval_dev_acc": 0.40625, |
|
"eval_dev_token": 5015.7421875, |
|
"eval_runtime": 167.9354, |
|
"eval_samples_per_second": 0.095, |
|
"eval_steps_per_second": 0.006, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.16015625, |
|
"grad_norm": 0.7085331860395175, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1424, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.1640625, |
|
"grad_norm": 0.8522197113830303, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1523, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.16796875, |
|
"grad_norm": 0.9700458234990689, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1655, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.171875, |
|
"grad_norm": 2.0713947251278855, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2946, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.17578125, |
|
"grad_norm": 1.6441862242379885, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2547, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1796875, |
|
"grad_norm": 1.7959964112861366, |
|
"learning_rate": 1e-05, |
|
"loss": 0.3009, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.18359375, |
|
"grad_norm": 1.3449858551505456, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2094, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 1.2087309569022056, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1908, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"eval_dev_acc": 0.34375, |
|
"eval_dev_token": 4538.84375, |
|
"eval_runtime": 161.6976, |
|
"eval_samples_per_second": 0.099, |
|
"eval_steps_per_second": 0.006, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.19140625, |
|
"grad_norm": 1.1559146316352948, |
|
"learning_rate": 1e-05, |
|
"loss": 0.3036, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.1953125, |
|
"grad_norm": 1.131769529502962, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2441, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.19921875, |
|
"grad_norm": 1.4116452844735226, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2028, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.203125, |
|
"grad_norm": 0.7550364491986332, |
|
"learning_rate": 1e-05, |
|
"loss": 0.215, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.20703125, |
|
"grad_norm": 1.3915284765850489, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2878, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.2109375, |
|
"grad_norm": 1.6351241901381652, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2446, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.21484375, |
|
"grad_norm": 1.6083218458029132, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2088, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.21875, |
|
"grad_norm": 0.7434150303822764, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2262, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.21875, |
|
"eval_dev_acc": 0.30708661675453186, |
|
"eval_dev_token": 5670.251953125, |
|
"eval_runtime": 174.7692, |
|
"eval_samples_per_second": 0.092, |
|
"eval_steps_per_second": 0.006, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.22265625, |
|
"grad_norm": 1.0769799759099778, |
|
"learning_rate": 1e-05, |
|
"loss": 0.208, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.2265625, |
|
"grad_norm": 0.9298141621627772, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1687, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.23046875, |
|
"grad_norm": 1.285492123129724, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2427, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.234375, |
|
"grad_norm": 0.8346778861730894, |
|
"learning_rate": 1e-05, |
|
"loss": 0.219, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.23828125, |
|
"grad_norm": 0.9873196942775492, |
|
"learning_rate": 1e-05, |
|
"loss": 0.242, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.2421875, |
|
"grad_norm": 0.9596507860915271, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2148, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.24609375, |
|
"grad_norm": 1.0988562593647762, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2396, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.9707635131928222, |
|
"learning_rate": 1e-05, |
|
"loss": 0.238, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_dev_acc": 0.5390625, |
|
"eval_dev_token": 4394.921875, |
|
"eval_runtime": 161.3481, |
|
"eval_samples_per_second": 0.099, |
|
"eval_steps_per_second": 0.006, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.25390625, |
|
"grad_norm": 0.8083595053544823, |
|
"learning_rate": 1e-05, |
|
"loss": 0.293, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.2578125, |
|
"grad_norm": 0.6893947679382126, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2866, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.26171875, |
|
"grad_norm": 1.0271679359276198, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2276, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.265625, |
|
"grad_norm": 1.1776528602190077, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1887, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.26953125, |
|
"grad_norm": 1.163717423684938, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2147, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.2734375, |
|
"grad_norm": 0.8134427746893115, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2342, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.27734375, |
|
"grad_norm": 1.4269332848478926, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1919, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.28125, |
|
"grad_norm": 0.8200789264174901, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2175, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.28125, |
|
"eval_dev_acc": 0.53125, |
|
"eval_dev_token": 4859.7421875, |
|
"eval_runtime": 166.6197, |
|
"eval_samples_per_second": 0.096, |
|
"eval_steps_per_second": 0.006, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.28515625, |
|
"grad_norm": 1.007316679088458, |
|
"learning_rate": 1e-05, |
|
"loss": 0.3108, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.2890625, |
|
"grad_norm": 0.6637709768510952, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1794, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.29296875, |
|
"grad_norm": 1.0144512803754202, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1905, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.296875, |
|
"grad_norm": 1.2499777112248354, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2014, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.30078125, |
|
"grad_norm": 1.0642239482819718, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1648, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.3046875, |
|
"grad_norm": 0.8739614674360524, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1537, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.30859375, |
|
"grad_norm": 0.5320613340314281, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2128, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 1.2802208673828028, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1939, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"eval_dev_acc": 0.4609375, |
|
"eval_dev_token": 5065.421875, |
|
"eval_runtime": 168.4523, |
|
"eval_samples_per_second": 0.095, |
|
"eval_steps_per_second": 0.006, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.31640625, |
|
"grad_norm": 1.1564057868614226, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2215, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.3203125, |
|
"grad_norm": 0.7104999594850884, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1224, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.32421875, |
|
"grad_norm": 0.6466657594813067, |
|
"learning_rate": 1e-05, |
|
"loss": 0.145, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.328125, |
|
"grad_norm": 1.3499118701284736, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1963, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.33203125, |
|
"grad_norm": 0.6363338361760021, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1781, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.3359375, |
|
"grad_norm": 0.8807906150832371, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1426, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.33984375, |
|
"grad_norm": 0.7466707582875238, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1629, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.34375, |
|
"grad_norm": 0.7773292125565866, |
|
"learning_rate": 1e-05, |
|
"loss": 0.181, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.34375, |
|
"eval_dev_acc": 0.4609375, |
|
"eval_dev_token": 5092.8984375, |
|
"eval_runtime": 168.9275, |
|
"eval_samples_per_second": 0.095, |
|
"eval_steps_per_second": 0.006, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.34765625, |
|
"grad_norm": 0.9798290139606278, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1725, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.3515625, |
|
"grad_norm": 1.2761428002675261, |
|
"learning_rate": 1e-05, |
|
"loss": 0.175, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.35546875, |
|
"grad_norm": 0.5042091805859357, |
|
"learning_rate": 1e-05, |
|
"loss": 0.218, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.359375, |
|
"grad_norm": 1.017358230975041, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2502, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.36328125, |
|
"grad_norm": 0.7366049175316091, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1656, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.3671875, |
|
"grad_norm": 0.9422427666318486, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1455, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.37109375, |
|
"grad_norm": 0.7689775552730859, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1485, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 0.9090457524355386, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1411, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"eval_dev_acc": 0.453125, |
|
"eval_dev_token": 4948.8359375, |
|
"eval_runtime": 165.5377, |
|
"eval_samples_per_second": 0.097, |
|
"eval_steps_per_second": 0.006, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.37890625, |
|
"grad_norm": 0.7235724828873173, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2193, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.3828125, |
|
"grad_norm": 0.7200445685294068, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1985, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.38671875, |
|
"grad_norm": 0.6060156821220763, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2096, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.390625, |
|
"grad_norm": 0.7114968462244617, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1928, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.39453125, |
|
"grad_norm": 0.6397518359548336, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2165, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.3984375, |
|
"grad_norm": 0.7027126137819094, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2263, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.40234375, |
|
"grad_norm": 0.8648981933002193, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2874, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"grad_norm": 0.9742992968412495, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1755, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.40625, |
|
"eval_dev_acc": 0.3515625, |
|
"eval_dev_token": 5303.1796875, |
|
"eval_runtime": 173.9477, |
|
"eval_samples_per_second": 0.092, |
|
"eval_steps_per_second": 0.006, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.41015625, |
|
"grad_norm": 0.6358933759276069, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1907, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.4140625, |
|
"grad_norm": 0.7859972506268991, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1731, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.41796875, |
|
"grad_norm": 0.6429885607052577, |
|
"learning_rate": 1e-05, |
|
"loss": 0.187, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.421875, |
|
"grad_norm": 0.6314004528855494, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2185, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.42578125, |
|
"grad_norm": 0.8243656111706104, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1384, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.4296875, |
|
"grad_norm": 0.7310074535827911, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1724, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.43359375, |
|
"grad_norm": 1.8710293554497974, |
|
"learning_rate": 1e-05, |
|
"loss": 0.273, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"grad_norm": 1.3308164398688347, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2852, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.4375, |
|
"eval_dev_acc": 0.296875, |
|
"eval_dev_token": 5770.9375, |
|
"eval_runtime": 175.5918, |
|
"eval_samples_per_second": 0.091, |
|
"eval_steps_per_second": 0.006, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.44140625, |
|
"grad_norm": 0.4499041384963393, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1845, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.4453125, |
|
"grad_norm": 0.5818915994231291, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2709, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.44921875, |
|
"grad_norm": 0.6130904000526848, |
|
"learning_rate": 1e-05, |
|
"loss": 0.231, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.453125, |
|
"grad_norm": 0.7266034880537791, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1555, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.45703125, |
|
"grad_norm": 0.425032745279421, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1733, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.4609375, |
|
"grad_norm": 0.41408811254876093, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1793, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.46484375, |
|
"grad_norm": 0.8433491024471641, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2335, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"grad_norm": 0.5585183306922875, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2515, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"eval_dev_acc": 0.4724409580230713, |
|
"eval_dev_token": 4777.55126953125, |
|
"eval_runtime": 165.1485, |
|
"eval_samples_per_second": 0.097, |
|
"eval_steps_per_second": 0.006, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.47265625, |
|
"grad_norm": 0.9520218462259554, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2613, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.4765625, |
|
"grad_norm": 0.4858585527334522, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2379, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.48046875, |
|
"grad_norm": 0.5772160567620949, |
|
"learning_rate": 1e-05, |
|
"loss": 0.241, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.484375, |
|
"grad_norm": 0.731954162407159, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2482, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.48828125, |
|
"grad_norm": 0.49226621710163243, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2333, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.4921875, |
|
"grad_norm": 0.43779404197089106, |
|
"learning_rate": 1e-05, |
|
"loss": 0.185, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.49609375, |
|
"grad_norm": 0.6856986141306837, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1943, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.6558122415773976, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2185, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_dev_acc": 0.4765625, |
|
"eval_dev_token": 4368.859375, |
|
"eval_runtime": 161.9718, |
|
"eval_samples_per_second": 0.099, |
|
"eval_steps_per_second": 0.006, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.50390625, |
|
"grad_norm": 0.4099906022533745, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2113, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.5078125, |
|
"grad_norm": 0.49752415105495956, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2217, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.51171875, |
|
"grad_norm": 0.8912790018467623, |
|
"learning_rate": 1e-05, |
|
"loss": 0.3422, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.515625, |
|
"grad_norm": 0.6764829647253893, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2055, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.51953125, |
|
"grad_norm": 0.8399641090693946, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2087, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.5234375, |
|
"grad_norm": 0.4594160953603203, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2093, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.52734375, |
|
"grad_norm": 0.7432138703184232, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1969, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"grad_norm": 0.4584467325236011, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1806, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.53125, |
|
"eval_dev_acc": 0.4765625, |
|
"eval_dev_token": 4603.53125, |
|
"eval_runtime": 164.3452, |
|
"eval_samples_per_second": 0.097, |
|
"eval_steps_per_second": 0.006, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.53515625, |
|
"grad_norm": 0.6458588312529675, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2087, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.5390625, |
|
"grad_norm": 0.7370624067340756, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1854, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.54296875, |
|
"grad_norm": 0.7141604462138248, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2535, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.546875, |
|
"grad_norm": 0.8212814690178184, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1668, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.55078125, |
|
"grad_norm": 0.5799692948316157, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2375, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.5546875, |
|
"grad_norm": 0.5333639624775814, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1737, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.55859375, |
|
"grad_norm": 0.4076841439195106, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1627, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"grad_norm": 0.4118175478201596, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1576, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.5625, |
|
"eval_dev_acc": 0.5234375, |
|
"eval_dev_token": 5125.0703125, |
|
"eval_runtime": 168.804, |
|
"eval_samples_per_second": 0.095, |
|
"eval_steps_per_second": 0.006, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.56640625, |
|
"grad_norm": 0.5988381099011506, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1656, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.5703125, |
|
"grad_norm": 0.9328153493065982, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1788, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.57421875, |
|
"grad_norm": 0.8013592126955402, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2009, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.578125, |
|
"grad_norm": 0.4868159061171701, |
|
"learning_rate": 1e-05, |
|
"loss": 0.217, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.58203125, |
|
"grad_norm": 0.6758953539585006, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2344, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.5859375, |
|
"grad_norm": 0.8609458752061137, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1939, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.58984375, |
|
"grad_norm": 0.45913847739444186, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1691, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"grad_norm": 0.8064977044716175, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1949, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.59375, |
|
"eval_dev_acc": 0.40625, |
|
"eval_dev_token": 4508.484375, |
|
"eval_runtime": 160.3398, |
|
"eval_samples_per_second": 0.1, |
|
"eval_steps_per_second": 0.006, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.59765625, |
|
"grad_norm": 0.9904042315049291, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2253, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.6015625, |
|
"grad_norm": 0.5524318414569037, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2535, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.60546875, |
|
"grad_norm": 0.418186463867415, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1884, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.609375, |
|
"grad_norm": 0.6311027708045368, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2408, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.61328125, |
|
"grad_norm": 0.4550696199781805, |
|
"learning_rate": 1e-05, |
|
"loss": 0.173, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.6171875, |
|
"grad_norm": 0.4596598696608727, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1592, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.62109375, |
|
"grad_norm": 0.5573937890044522, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1748, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 1.0862165315332113, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2369, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"eval_dev_acc": 0.4296875, |
|
"eval_dev_token": 4869.8828125, |
|
"eval_runtime": 167.2914, |
|
"eval_samples_per_second": 0.096, |
|
"eval_steps_per_second": 0.006, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.62890625, |
|
"grad_norm": 0.46051384064237827, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2086, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.6328125, |
|
"grad_norm": 0.7125397532570018, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2212, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.63671875, |
|
"grad_norm": 0.564820498711706, |
|
"learning_rate": 1e-05, |
|
"loss": 0.3019, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.640625, |
|
"grad_norm": 0.5218656690400247, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1324, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.64453125, |
|
"grad_norm": 0.4994022980399308, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1438, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.6484375, |
|
"grad_norm": 0.7016809849517179, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2791, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.65234375, |
|
"grad_norm": 0.597463304680723, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1749, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.65625, |
|
"grad_norm": 0.5536855781273838, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2391, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.65625, |
|
"eval_dev_acc": 0.3203125, |
|
"eval_dev_token": 5451.3671875, |
|
"eval_runtime": 172.7574, |
|
"eval_samples_per_second": 0.093, |
|
"eval_steps_per_second": 0.006, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.66015625, |
|
"grad_norm": 0.9103508979108635, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2613, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.6640625, |
|
"grad_norm": 0.4928845564740678, |
|
"learning_rate": 1e-05, |
|
"loss": 0.215, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.66796875, |
|
"grad_norm": 0.8690405638773996, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2355, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.671875, |
|
"grad_norm": 0.5511255682147113, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2406, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.67578125, |
|
"grad_norm": 0.44346107905460214, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1867, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.6796875, |
|
"grad_norm": 0.4019557678019079, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1488, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.68359375, |
|
"grad_norm": 0.4139658009208469, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1666, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"grad_norm": 0.45363011716779816, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2006, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.6875, |
|
"eval_dev_acc": 0.3385826647281647, |
|
"eval_dev_token": 4971.81884765625, |
|
"eval_runtime": 166.9967, |
|
"eval_samples_per_second": 0.096, |
|
"eval_steps_per_second": 0.006, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.69140625, |
|
"grad_norm": 0.46674698673244774, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1788, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.6953125, |
|
"grad_norm": 0.5396579551057291, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1857, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.69921875, |
|
"grad_norm": 0.42472472699800484, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1707, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.703125, |
|
"grad_norm": 0.4208916108378261, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1736, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.70703125, |
|
"grad_norm": 0.5161632347165661, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2074, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.7109375, |
|
"grad_norm": 0.4851147968745633, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2183, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.71484375, |
|
"grad_norm": 0.5286494967968609, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1877, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.71875, |
|
"grad_norm": 0.5399316089624949, |
|
"learning_rate": 1e-05, |
|
"loss": 0.209, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.71875, |
|
"eval_dev_acc": 0.3984375, |
|
"eval_dev_token": 4787.84375, |
|
"eval_runtime": 166.2574, |
|
"eval_samples_per_second": 0.096, |
|
"eval_steps_per_second": 0.006, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.72265625, |
|
"grad_norm": 0.7188938790166789, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2065, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.7265625, |
|
"grad_norm": 0.5843767003652576, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2356, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.73046875, |
|
"grad_norm": 0.4904003204685076, |
|
"learning_rate": 1e-05, |
|
"loss": 0.201, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.734375, |
|
"grad_norm": 0.485266158116283, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1869, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.73828125, |
|
"grad_norm": 0.5242977395658632, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2122, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.7421875, |
|
"grad_norm": 0.5417537780138298, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2799, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.74609375, |
|
"grad_norm": 0.48949419193338123, |
|
"learning_rate": 1e-05, |
|
"loss": 0.212, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.48118963817889204, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2195, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_dev_acc": 0.453125, |
|
"eval_dev_token": 5056.7421875, |
|
"eval_runtime": 168.273, |
|
"eval_samples_per_second": 0.095, |
|
"eval_steps_per_second": 0.006, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.75390625, |
|
"grad_norm": 0.6844465372064547, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1645, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.7578125, |
|
"grad_norm": 0.49653100043792153, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2023, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.76171875, |
|
"grad_norm": 0.5539027026151374, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2348, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.765625, |
|
"grad_norm": 0.5003270709383194, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2545, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.76953125, |
|
"grad_norm": 0.5666703162116131, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2739, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.7734375, |
|
"grad_norm": 0.5281121627729704, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1927, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.77734375, |
|
"grad_norm": 0.4691586351966124, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2101, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"grad_norm": 0.43348894899907703, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1636, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"eval_dev_acc": 0.4296875, |
|
"eval_dev_token": 5082.265625, |
|
"eval_runtime": 169.7777, |
|
"eval_samples_per_second": 0.094, |
|
"eval_steps_per_second": 0.006, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.78515625, |
|
"grad_norm": 0.4995118305726593, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2149, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.7890625, |
|
"grad_norm": 0.3958721084761467, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1732, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.79296875, |
|
"grad_norm": 0.4883258744044862, |
|
"learning_rate": 1e-05, |
|
"loss": 0.219, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.796875, |
|
"grad_norm": 0.45472746506302575, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2187, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.80078125, |
|
"grad_norm": 0.45006095039367805, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1924, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.8046875, |
|
"grad_norm": 0.4127537232406072, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1736, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.80859375, |
|
"grad_norm": 0.4669392415601201, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1847, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"grad_norm": 0.41469363114093816, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1556, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.8125, |
|
"eval_dev_acc": 0.4609375, |
|
"eval_dev_token": 4918.28125, |
|
"eval_runtime": 166.5675, |
|
"eval_samples_per_second": 0.096, |
|
"eval_steps_per_second": 0.006, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.81640625, |
|
"grad_norm": 0.4433576280938302, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1934, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.8203125, |
|
"grad_norm": 0.4355305023653351, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1742, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.82421875, |
|
"grad_norm": 0.44938618579632195, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1902, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.828125, |
|
"grad_norm": 0.5351771463999816, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2148, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.83203125, |
|
"grad_norm": 0.5839350362138708, |
|
"learning_rate": 1e-05, |
|
"loss": 0.275, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.8359375, |
|
"grad_norm": 0.6964110745693202, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2179, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.83984375, |
|
"grad_norm": 0.4337830660702992, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2152, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"grad_norm": 0.46223312750006246, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2405, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.84375, |
|
"eval_dev_acc": 0.3828125, |
|
"eval_dev_token": 5435.3046875, |
|
"eval_runtime": 173.8173, |
|
"eval_samples_per_second": 0.092, |
|
"eval_steps_per_second": 0.006, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.84765625, |
|
"grad_norm": 0.5541820526606585, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2751, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.8515625, |
|
"grad_norm": 0.4662570041545537, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2142, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.85546875, |
|
"grad_norm": 0.7737037625157579, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2397, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.859375, |
|
"grad_norm": 0.5572195616624243, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2421, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.86328125, |
|
"grad_norm": 0.5088509372691609, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1875, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.8671875, |
|
"grad_norm": 0.508699458613964, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1927, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.87109375, |
|
"grad_norm": 0.5150091482241945, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2536, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 0.5203627078659161, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2571, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"eval_dev_acc": 0.3515625, |
|
"eval_dev_token": 5227.0859375, |
|
"eval_runtime": 170.2355, |
|
"eval_samples_per_second": 0.094, |
|
"eval_steps_per_second": 0.006, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.87890625, |
|
"grad_norm": 0.5279392216696818, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2278, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.8828125, |
|
"grad_norm": 0.45017131620724865, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2132, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.88671875, |
|
"grad_norm": 0.48915211275869575, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2627, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.890625, |
|
"grad_norm": 0.4606618945421734, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1528, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.89453125, |
|
"grad_norm": 0.5072593200666395, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2148, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.8984375, |
|
"grad_norm": 0.5513069869439534, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2319, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.90234375, |
|
"grad_norm": 0.4917083878550277, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1989, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"grad_norm": 0.4027028580105545, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1398, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.90625, |
|
"eval_dev_acc": 0.3779527544975281, |
|
"eval_dev_token": 5651.6455078125, |
|
"eval_runtime": 175.5543, |
|
"eval_samples_per_second": 0.091, |
|
"eval_steps_per_second": 0.006, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.91015625, |
|
"grad_norm": 0.4098440727615931, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1481, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.9140625, |
|
"grad_norm": 0.4379253949500134, |
|
"learning_rate": 1e-05, |
|
"loss": 0.172, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.91796875, |
|
"grad_norm": 0.6161974608496972, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2234, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.921875, |
|
"grad_norm": 0.6431694552333217, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2928, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.92578125, |
|
"grad_norm": 0.7524837454023333, |
|
"learning_rate": 1e-05, |
|
"loss": 0.3518, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.9296875, |
|
"grad_norm": 0.5137794157548315, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2371, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.93359375, |
|
"grad_norm": 0.42726761741926383, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1349, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 0.50721507122848, |
|
"learning_rate": 1e-05, |
|
"loss": 0.147, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"eval_dev_acc": 0.4375, |
|
"eval_dev_token": 5554.34375, |
|
"eval_runtime": 173.4206, |
|
"eval_samples_per_second": 0.092, |
|
"eval_steps_per_second": 0.006, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.94140625, |
|
"grad_norm": 0.5085504060972834, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2115, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.9453125, |
|
"grad_norm": 0.5245333395138617, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2203, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.94921875, |
|
"grad_norm": 0.5149241747645703, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1935, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.953125, |
|
"grad_norm": 0.45199967311107936, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1875, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.95703125, |
|
"grad_norm": 0.6017279864923942, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1964, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.9609375, |
|
"grad_norm": 0.541548647166723, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2029, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.96484375, |
|
"grad_norm": 0.7095706252744872, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1824, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"grad_norm": 0.6630534512223186, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2346, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.96875, |
|
"eval_dev_acc": 0.5234375, |
|
"eval_dev_token": 5464.203125, |
|
"eval_runtime": 173.0858, |
|
"eval_samples_per_second": 0.092, |
|
"eval_steps_per_second": 0.006, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.97265625, |
|
"grad_norm": 0.7470938668923351, |
|
"learning_rate": 1e-05, |
|
"loss": 0.3028, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.9765625, |
|
"grad_norm": 0.534162369114681, |
|
"learning_rate": 1e-05, |
|
"loss": 0.243, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.98046875, |
|
"grad_norm": 0.5240149993617814, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2475, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.984375, |
|
"grad_norm": 0.48058164633897993, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2234, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.98828125, |
|
"grad_norm": 0.5427424821749397, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2338, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.9921875, |
|
"grad_norm": 0.5309304323745797, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2751, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.99609375, |
|
"grad_norm": 0.4961154954055658, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2329, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.519835488758917, |
|
"learning_rate": 1e-05, |
|
"loss": 0.2182, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_dev_acc": 0.4453125, |
|
"eval_dev_token": 5674.0546875, |
|
"eval_runtime": 175.8662, |
|
"eval_samples_per_second": 0.091, |
|
"eval_steps_per_second": 0.006, |
|
"step": 256 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 256, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 9223372036854775807, |
|
"save_steps": 64, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 31380919492608.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|