{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 8, "global_step": 256, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00390625, "grad_norm": 3.380525042530954, "learning_rate": 1e-05, "loss": 0.2859, "step": 1 }, { "epoch": 0.0078125, "grad_norm": 2.6901012326349156, "learning_rate": 1e-05, "loss": 0.2117, "step": 2 }, { "epoch": 0.01171875, "grad_norm": 3.191447237922227, "learning_rate": 1e-05, "loss": 0.2602, "step": 3 }, { "epoch": 0.015625, "grad_norm": 2.204083519446381, "learning_rate": 1e-05, "loss": 0.1972, "step": 4 }, { "epoch": 0.01953125, "grad_norm": 2.0481149317155687, "learning_rate": 1e-05, "loss": 0.2338, "step": 5 }, { "epoch": 0.0234375, "grad_norm": 1.6269814174466988, "learning_rate": 1e-05, "loss": 0.214, "step": 6 }, { "epoch": 0.02734375, "grad_norm": 1.6908703624878527, "learning_rate": 1e-05, "loss": 0.2088, "step": 7 }, { "epoch": 0.03125, "grad_norm": 1.2059719622160197, "learning_rate": 1e-05, "loss": 0.1975, "step": 8 }, { "epoch": 0.03125, "eval_dev_acc": 0.515625, "eval_dev_token": 4849.7578125, "eval_runtime": 168.4394, "eval_samples_per_second": 0.095, "eval_steps_per_second": 0.006, "step": 8 }, { "epoch": 0.03515625, "grad_norm": 1.6837720712641369, "learning_rate": 1e-05, "loss": 0.1873, "step": 9 }, { "epoch": 0.0390625, "grad_norm": 1.2504651087103098, "learning_rate": 1e-05, "loss": 0.1959, "step": 10 }, { "epoch": 0.04296875, "grad_norm": 1.3187603751382884, "learning_rate": 1e-05, "loss": 0.2135, "step": 11 }, { "epoch": 0.046875, "grad_norm": 1.3545446581007174, "learning_rate": 1e-05, "loss": 0.2428, "step": 12 }, { "epoch": 0.05078125, "grad_norm": 1.6286051945906104, "learning_rate": 1e-05, "loss": 0.1708, "step": 13 }, { "epoch": 0.0546875, "grad_norm": 1.6081830921647842, "learning_rate": 1e-05, "loss": 0.1471, "step": 14 }, { "epoch": 0.05859375, "grad_norm": 1.4305460955933824, "learning_rate": 1e-05, "loss": 0.1837, "step": 15 }, { "epoch": 0.0625, "grad_norm": 1.3961670104174644, "learning_rate": 1e-05, "loss": 0.1352, "step": 16 }, { "epoch": 0.0625, "eval_dev_acc": 0.4296875, "eval_dev_token": 5067.265625, "eval_runtime": 167.2848, "eval_samples_per_second": 0.096, "eval_steps_per_second": 0.006, "step": 16 }, { "epoch": 0.06640625, "grad_norm": 1.5507019702345457, "learning_rate": 1e-05, "loss": 0.1657, "step": 17 }, { "epoch": 0.0703125, "grad_norm": 1.3395286968352729, "learning_rate": 1e-05, "loss": 0.1824, "step": 18 }, { "epoch": 0.07421875, "grad_norm": 2.201219146342779, "learning_rate": 1e-05, "loss": 0.1391, "step": 19 }, { "epoch": 0.078125, "grad_norm": 1.75559779570709, "learning_rate": 1e-05, "loss": 0.1351, "step": 20 }, { "epoch": 0.08203125, "grad_norm": 2.0359121335172428, "learning_rate": 1e-05, "loss": 0.1748, "step": 21 }, { "epoch": 0.0859375, "grad_norm": 1.6822343317370052, "learning_rate": 1e-05, "loss": 0.1582, "step": 22 }, { "epoch": 0.08984375, "grad_norm": 1.9664935447837442, "learning_rate": 1e-05, "loss": 0.1338, "step": 23 }, { "epoch": 0.09375, "grad_norm": 1.1463903797363937, "learning_rate": 1e-05, "loss": 0.1139, "step": 24 }, { "epoch": 0.09375, "eval_dev_acc": 0.4296875, "eval_dev_token": 4994.296875, "eval_runtime": 168.4043, "eval_samples_per_second": 0.095, "eval_steps_per_second": 0.006, "step": 24 }, { "epoch": 0.09765625, "grad_norm": 2.1728621095149627, "learning_rate": 1e-05, "loss": 0.1471, "step": 25 }, { "epoch": 0.1015625, "grad_norm": 1.6714738223766954, "learning_rate": 1e-05, "loss": 0.1349, "step": 26 }, { "epoch": 0.10546875, "grad_norm": 1.5574316583381629, "learning_rate": 1e-05, "loss": 0.1356, "step": 27 }, { "epoch": 0.109375, "grad_norm": 1.4728847084572547, "learning_rate": 1e-05, "loss": 0.1509, "step": 28 }, { "epoch": 0.11328125, "grad_norm": 1.4769394661942852, "learning_rate": 1e-05, "loss": 0.1294, "step": 29 }, { "epoch": 0.1171875, "grad_norm": 1.8550097520759188, "learning_rate": 1e-05, "loss": 0.1208, "step": 30 }, { "epoch": 0.12109375, "grad_norm": 1.75157088447911, "learning_rate": 1e-05, "loss": 0.0993, "step": 31 }, { "epoch": 0.125, "grad_norm": 1.6233472727407252, "learning_rate": 1e-05, "loss": 0.1412, "step": 32 }, { "epoch": 0.125, "eval_dev_acc": 0.4609375, "eval_dev_token": 4228.15625, "eval_runtime": 159.0398, "eval_samples_per_second": 0.101, "eval_steps_per_second": 0.006, "step": 32 }, { "epoch": 0.12890625, "grad_norm": 1.5246001678514782, "learning_rate": 1e-05, "loss": 0.1268, "step": 33 }, { "epoch": 0.1328125, "grad_norm": 1.020147996755851, "learning_rate": 1e-05, "loss": 0.166, "step": 34 }, { "epoch": 0.13671875, "grad_norm": 0.9795032964583498, "learning_rate": 1e-05, "loss": 0.1223, "step": 35 }, { "epoch": 0.140625, "grad_norm": 1.0328587053324862, "learning_rate": 1e-05, "loss": 0.0889, "step": 36 }, { "epoch": 0.14453125, "grad_norm": 0.8587530858129762, "learning_rate": 1e-05, "loss": 0.1618, "step": 37 }, { "epoch": 0.1484375, "grad_norm": 1.0451234874371433, "learning_rate": 1e-05, "loss": 0.1973, "step": 38 }, { "epoch": 0.15234375, "grad_norm": 1.032741287831154, "learning_rate": 1e-05, "loss": 0.1999, "step": 39 }, { "epoch": 0.15625, "grad_norm": 1.0128010813738295, "learning_rate": 1e-05, "loss": 0.1314, "step": 40 }, { "epoch": 0.15625, "eval_dev_acc": 0.40625, "eval_dev_token": 5015.7421875, "eval_runtime": 167.9354, "eval_samples_per_second": 0.095, "eval_steps_per_second": 0.006, "step": 40 }, { "epoch": 0.16015625, "grad_norm": 0.7085331860395175, "learning_rate": 1e-05, "loss": 0.1424, "step": 41 }, { "epoch": 0.1640625, "grad_norm": 0.8522197113830303, "learning_rate": 1e-05, "loss": 0.1523, "step": 42 }, { "epoch": 0.16796875, "grad_norm": 0.9700458234990689, "learning_rate": 1e-05, "loss": 0.1655, "step": 43 }, { "epoch": 0.171875, "grad_norm": 2.0713947251278855, "learning_rate": 1e-05, "loss": 0.2946, "step": 44 }, { "epoch": 0.17578125, "grad_norm": 1.6441862242379885, "learning_rate": 1e-05, "loss": 0.2547, "step": 45 }, { "epoch": 0.1796875, "grad_norm": 1.7959964112861366, "learning_rate": 1e-05, "loss": 0.3009, "step": 46 }, { "epoch": 0.18359375, "grad_norm": 1.3449858551505456, "learning_rate": 1e-05, "loss": 0.2094, "step": 47 }, { "epoch": 0.1875, "grad_norm": 1.2087309569022056, "learning_rate": 1e-05, "loss": 0.1908, "step": 48 }, { "epoch": 0.1875, "eval_dev_acc": 0.34375, "eval_dev_token": 4538.84375, "eval_runtime": 161.6976, "eval_samples_per_second": 0.099, "eval_steps_per_second": 0.006, "step": 48 }, { "epoch": 0.19140625, "grad_norm": 1.1559146316352948, "learning_rate": 1e-05, "loss": 0.3036, "step": 49 }, { "epoch": 0.1953125, "grad_norm": 1.131769529502962, "learning_rate": 1e-05, "loss": 0.2441, "step": 50 }, { "epoch": 0.19921875, "grad_norm": 1.4116452844735226, "learning_rate": 1e-05, "loss": 0.2028, "step": 51 }, { "epoch": 0.203125, "grad_norm": 0.7550364491986332, "learning_rate": 1e-05, "loss": 0.215, "step": 52 }, { "epoch": 0.20703125, "grad_norm": 1.3915284765850489, "learning_rate": 1e-05, "loss": 0.2878, "step": 53 }, { "epoch": 0.2109375, "grad_norm": 1.6351241901381652, "learning_rate": 1e-05, "loss": 0.2446, "step": 54 }, { "epoch": 0.21484375, "grad_norm": 1.6083218458029132, "learning_rate": 1e-05, "loss": 0.2088, "step": 55 }, { "epoch": 0.21875, "grad_norm": 0.7434150303822764, "learning_rate": 1e-05, "loss": 0.2262, "step": 56 }, { "epoch": 0.21875, "eval_dev_acc": 0.30708661675453186, "eval_dev_token": 5670.251953125, "eval_runtime": 174.7692, "eval_samples_per_second": 0.092, "eval_steps_per_second": 0.006, "step": 56 }, { "epoch": 0.22265625, "grad_norm": 1.0769799759099778, "learning_rate": 1e-05, "loss": 0.208, "step": 57 }, { "epoch": 0.2265625, "grad_norm": 0.9298141621627772, "learning_rate": 1e-05, "loss": 0.1687, "step": 58 }, { "epoch": 0.23046875, "grad_norm": 1.285492123129724, "learning_rate": 1e-05, "loss": 0.2427, "step": 59 }, { "epoch": 0.234375, "grad_norm": 0.8346778861730894, "learning_rate": 1e-05, "loss": 0.219, "step": 60 }, { "epoch": 0.23828125, "grad_norm": 0.9873196942775492, "learning_rate": 1e-05, "loss": 0.242, "step": 61 }, { "epoch": 0.2421875, "grad_norm": 0.9596507860915271, "learning_rate": 1e-05, "loss": 0.2148, "step": 62 }, { "epoch": 0.24609375, "grad_norm": 1.0988562593647762, "learning_rate": 1e-05, "loss": 0.2396, "step": 63 }, { "epoch": 0.25, "grad_norm": 0.9707635131928222, "learning_rate": 1e-05, "loss": 0.238, "step": 64 }, { "epoch": 0.25, "eval_dev_acc": 0.5390625, "eval_dev_token": 4394.921875, "eval_runtime": 161.3481, "eval_samples_per_second": 0.099, "eval_steps_per_second": 0.006, "step": 64 }, { "epoch": 0.25390625, "grad_norm": 0.8083595053544823, "learning_rate": 1e-05, "loss": 0.293, "step": 65 }, { "epoch": 0.2578125, "grad_norm": 0.6893947679382126, "learning_rate": 1e-05, "loss": 0.2866, "step": 66 }, { "epoch": 0.26171875, "grad_norm": 1.0271679359276198, "learning_rate": 1e-05, "loss": 0.2276, "step": 67 }, { "epoch": 0.265625, "grad_norm": 1.1776528602190077, "learning_rate": 1e-05, "loss": 0.1887, "step": 68 }, { "epoch": 0.26953125, "grad_norm": 1.163717423684938, "learning_rate": 1e-05, "loss": 0.2147, "step": 69 }, { "epoch": 0.2734375, "grad_norm": 0.8134427746893115, "learning_rate": 1e-05, "loss": 0.2342, "step": 70 }, { "epoch": 0.27734375, "grad_norm": 1.4269332848478926, "learning_rate": 1e-05, "loss": 0.1919, "step": 71 }, { "epoch": 0.28125, "grad_norm": 0.8200789264174901, "learning_rate": 1e-05, "loss": 0.2175, "step": 72 }, { "epoch": 0.28125, "eval_dev_acc": 0.53125, "eval_dev_token": 4859.7421875, "eval_runtime": 166.6197, "eval_samples_per_second": 0.096, "eval_steps_per_second": 0.006, "step": 72 }, { "epoch": 0.28515625, "grad_norm": 1.007316679088458, "learning_rate": 1e-05, "loss": 0.3108, "step": 73 }, { "epoch": 0.2890625, "grad_norm": 0.6637709768510952, "learning_rate": 1e-05, "loss": 0.1794, "step": 74 }, { "epoch": 0.29296875, "grad_norm": 1.0144512803754202, "learning_rate": 1e-05, "loss": 0.1905, "step": 75 }, { "epoch": 0.296875, "grad_norm": 1.2499777112248354, "learning_rate": 1e-05, "loss": 0.2014, "step": 76 }, { "epoch": 0.30078125, "grad_norm": 1.0642239482819718, "learning_rate": 1e-05, "loss": 0.1648, "step": 77 }, { "epoch": 0.3046875, "grad_norm": 0.8739614674360524, "learning_rate": 1e-05, "loss": 0.1537, "step": 78 }, { "epoch": 0.30859375, "grad_norm": 0.5320613340314281, "learning_rate": 1e-05, "loss": 0.2128, "step": 79 }, { "epoch": 0.3125, "grad_norm": 1.2802208673828028, "learning_rate": 1e-05, "loss": 0.1939, "step": 80 }, { "epoch": 0.3125, "eval_dev_acc": 0.4609375, "eval_dev_token": 5065.421875, "eval_runtime": 168.4523, "eval_samples_per_second": 0.095, "eval_steps_per_second": 0.006, "step": 80 }, { "epoch": 0.31640625, "grad_norm": 1.1564057868614226, "learning_rate": 1e-05, "loss": 0.2215, "step": 81 }, { "epoch": 0.3203125, "grad_norm": 0.7104999594850884, "learning_rate": 1e-05, "loss": 0.1224, "step": 82 }, { "epoch": 0.32421875, "grad_norm": 0.6466657594813067, "learning_rate": 1e-05, "loss": 0.145, "step": 83 }, { "epoch": 0.328125, "grad_norm": 1.3499118701284736, "learning_rate": 1e-05, "loss": 0.1963, "step": 84 }, { "epoch": 0.33203125, "grad_norm": 0.6363338361760021, "learning_rate": 1e-05, "loss": 0.1781, "step": 85 }, { "epoch": 0.3359375, "grad_norm": 0.8807906150832371, "learning_rate": 1e-05, "loss": 0.1426, "step": 86 }, { "epoch": 0.33984375, "grad_norm": 0.7466707582875238, "learning_rate": 1e-05, "loss": 0.1629, "step": 87 }, { "epoch": 0.34375, "grad_norm": 0.7773292125565866, "learning_rate": 1e-05, "loss": 0.181, "step": 88 }, { "epoch": 0.34375, "eval_dev_acc": 0.4609375, "eval_dev_token": 5092.8984375, "eval_runtime": 168.9275, "eval_samples_per_second": 0.095, "eval_steps_per_second": 0.006, "step": 88 }, { "epoch": 0.34765625, "grad_norm": 0.9798290139606278, "learning_rate": 1e-05, "loss": 0.1725, "step": 89 }, { "epoch": 0.3515625, "grad_norm": 1.2761428002675261, "learning_rate": 1e-05, "loss": 0.175, "step": 90 }, { "epoch": 0.35546875, "grad_norm": 0.5042091805859357, "learning_rate": 1e-05, "loss": 0.218, "step": 91 }, { "epoch": 0.359375, "grad_norm": 1.017358230975041, "learning_rate": 1e-05, "loss": 0.2502, "step": 92 }, { "epoch": 0.36328125, "grad_norm": 0.7366049175316091, "learning_rate": 1e-05, "loss": 0.1656, "step": 93 }, { "epoch": 0.3671875, "grad_norm": 0.9422427666318486, "learning_rate": 1e-05, "loss": 0.1455, "step": 94 }, { "epoch": 0.37109375, "grad_norm": 0.7689775552730859, "learning_rate": 1e-05, "loss": 0.1485, "step": 95 }, { "epoch": 0.375, "grad_norm": 0.9090457524355386, "learning_rate": 1e-05, "loss": 0.1411, "step": 96 }, { "epoch": 0.375, "eval_dev_acc": 0.453125, "eval_dev_token": 4948.8359375, "eval_runtime": 165.5377, "eval_samples_per_second": 0.097, "eval_steps_per_second": 0.006, "step": 96 }, { "epoch": 0.37890625, "grad_norm": 0.7235724828873173, "learning_rate": 1e-05, "loss": 0.2193, "step": 97 }, { "epoch": 0.3828125, "grad_norm": 0.7200445685294068, "learning_rate": 1e-05, "loss": 0.1985, "step": 98 }, { "epoch": 0.38671875, "grad_norm": 0.6060156821220763, "learning_rate": 1e-05, "loss": 0.2096, "step": 99 }, { "epoch": 0.390625, "grad_norm": 0.7114968462244617, "learning_rate": 1e-05, "loss": 0.1928, "step": 100 }, { "epoch": 0.39453125, "grad_norm": 0.6397518359548336, "learning_rate": 1e-05, "loss": 0.2165, "step": 101 }, { "epoch": 0.3984375, "grad_norm": 0.7027126137819094, "learning_rate": 1e-05, "loss": 0.2263, "step": 102 }, { "epoch": 0.40234375, "grad_norm": 0.8648981933002193, "learning_rate": 1e-05, "loss": 0.2874, "step": 103 }, { "epoch": 0.40625, "grad_norm": 0.9742992968412495, "learning_rate": 1e-05, "loss": 0.1755, "step": 104 }, { "epoch": 0.40625, "eval_dev_acc": 0.3515625, "eval_dev_token": 5303.1796875, "eval_runtime": 173.9477, "eval_samples_per_second": 0.092, "eval_steps_per_second": 0.006, "step": 104 }, { "epoch": 0.41015625, "grad_norm": 0.6358933759276069, "learning_rate": 1e-05, "loss": 0.1907, "step": 105 }, { "epoch": 0.4140625, "grad_norm": 0.7859972506268991, "learning_rate": 1e-05, "loss": 0.1731, "step": 106 }, { "epoch": 0.41796875, "grad_norm": 0.6429885607052577, "learning_rate": 1e-05, "loss": 0.187, "step": 107 }, { "epoch": 0.421875, "grad_norm": 0.6314004528855494, "learning_rate": 1e-05, "loss": 0.2185, "step": 108 }, { "epoch": 0.42578125, "grad_norm": 0.8243656111706104, "learning_rate": 1e-05, "loss": 0.1384, "step": 109 }, { "epoch": 0.4296875, "grad_norm": 0.7310074535827911, "learning_rate": 1e-05, "loss": 0.1724, "step": 110 }, { "epoch": 0.43359375, "grad_norm": 1.8710293554497974, "learning_rate": 1e-05, "loss": 0.273, "step": 111 }, { "epoch": 0.4375, "grad_norm": 1.3308164398688347, "learning_rate": 1e-05, "loss": 0.2852, "step": 112 }, { "epoch": 0.4375, "eval_dev_acc": 0.296875, "eval_dev_token": 5770.9375, "eval_runtime": 175.5918, "eval_samples_per_second": 0.091, "eval_steps_per_second": 0.006, "step": 112 }, { "epoch": 0.44140625, "grad_norm": 0.4499041384963393, "learning_rate": 1e-05, "loss": 0.1845, "step": 113 }, { "epoch": 0.4453125, "grad_norm": 0.5818915994231291, "learning_rate": 1e-05, "loss": 0.2709, "step": 114 }, { "epoch": 0.44921875, "grad_norm": 0.6130904000526848, "learning_rate": 1e-05, "loss": 0.231, "step": 115 }, { "epoch": 0.453125, "grad_norm": 0.7266034880537791, "learning_rate": 1e-05, "loss": 0.1555, "step": 116 }, { "epoch": 0.45703125, "grad_norm": 0.425032745279421, "learning_rate": 1e-05, "loss": 0.1733, "step": 117 }, { "epoch": 0.4609375, "grad_norm": 0.41408811254876093, "learning_rate": 1e-05, "loss": 0.1793, "step": 118 }, { "epoch": 0.46484375, "grad_norm": 0.8433491024471641, "learning_rate": 1e-05, "loss": 0.2335, "step": 119 }, { "epoch": 0.46875, "grad_norm": 0.5585183306922875, "learning_rate": 1e-05, "loss": 0.2515, "step": 120 }, { "epoch": 0.46875, "eval_dev_acc": 0.4724409580230713, "eval_dev_token": 4777.55126953125, "eval_runtime": 165.1485, "eval_samples_per_second": 0.097, "eval_steps_per_second": 0.006, "step": 120 }, { "epoch": 0.47265625, "grad_norm": 0.9520218462259554, "learning_rate": 1e-05, "loss": 0.2613, "step": 121 }, { "epoch": 0.4765625, "grad_norm": 0.4858585527334522, "learning_rate": 1e-05, "loss": 0.2379, "step": 122 }, { "epoch": 0.48046875, "grad_norm": 0.5772160567620949, "learning_rate": 1e-05, "loss": 0.241, "step": 123 }, { "epoch": 0.484375, "grad_norm": 0.731954162407159, "learning_rate": 1e-05, "loss": 0.2482, "step": 124 }, { "epoch": 0.48828125, "grad_norm": 0.49226621710163243, "learning_rate": 1e-05, "loss": 0.2333, "step": 125 }, { "epoch": 0.4921875, "grad_norm": 0.43779404197089106, "learning_rate": 1e-05, "loss": 0.185, "step": 126 }, { "epoch": 0.49609375, "grad_norm": 0.6856986141306837, "learning_rate": 1e-05, "loss": 0.1943, "step": 127 }, { "epoch": 0.5, "grad_norm": 0.6558122415773976, "learning_rate": 1e-05, "loss": 0.2185, "step": 128 }, { "epoch": 0.5, "eval_dev_acc": 0.4765625, "eval_dev_token": 4368.859375, "eval_runtime": 161.9718, "eval_samples_per_second": 0.099, "eval_steps_per_second": 0.006, "step": 128 }, { "epoch": 0.50390625, "grad_norm": 0.4099906022533745, "learning_rate": 1e-05, "loss": 0.2113, "step": 129 }, { "epoch": 0.5078125, "grad_norm": 0.49752415105495956, "learning_rate": 1e-05, "loss": 0.2217, "step": 130 }, { "epoch": 0.51171875, "grad_norm": 0.8912790018467623, "learning_rate": 1e-05, "loss": 0.3422, "step": 131 }, { "epoch": 0.515625, "grad_norm": 0.6764829647253893, "learning_rate": 1e-05, "loss": 0.2055, "step": 132 }, { "epoch": 0.51953125, "grad_norm": 0.8399641090693946, "learning_rate": 1e-05, "loss": 0.2087, "step": 133 }, { "epoch": 0.5234375, "grad_norm": 0.4594160953603203, "learning_rate": 1e-05, "loss": 0.2093, "step": 134 }, { "epoch": 0.52734375, "grad_norm": 0.7432138703184232, "learning_rate": 1e-05, "loss": 0.1969, "step": 135 }, { "epoch": 0.53125, "grad_norm": 0.4584467325236011, "learning_rate": 1e-05, "loss": 0.1806, "step": 136 }, { "epoch": 0.53125, "eval_dev_acc": 0.4765625, "eval_dev_token": 4603.53125, "eval_runtime": 164.3452, "eval_samples_per_second": 0.097, "eval_steps_per_second": 0.006, "step": 136 }, { "epoch": 0.53515625, "grad_norm": 0.6458588312529675, "learning_rate": 1e-05, "loss": 0.2087, "step": 137 }, { "epoch": 0.5390625, "grad_norm": 0.7370624067340756, "learning_rate": 1e-05, "loss": 0.1854, "step": 138 }, { "epoch": 0.54296875, "grad_norm": 0.7141604462138248, "learning_rate": 1e-05, "loss": 0.2535, "step": 139 }, { "epoch": 0.546875, "grad_norm": 0.8212814690178184, "learning_rate": 1e-05, "loss": 0.1668, "step": 140 }, { "epoch": 0.55078125, "grad_norm": 0.5799692948316157, "learning_rate": 1e-05, "loss": 0.2375, "step": 141 }, { "epoch": 0.5546875, "grad_norm": 0.5333639624775814, "learning_rate": 1e-05, "loss": 0.1737, "step": 142 }, { "epoch": 0.55859375, "grad_norm": 0.4076841439195106, "learning_rate": 1e-05, "loss": 0.1627, "step": 143 }, { "epoch": 0.5625, "grad_norm": 0.4118175478201596, "learning_rate": 1e-05, "loss": 0.1576, "step": 144 }, { "epoch": 0.5625, "eval_dev_acc": 0.5234375, "eval_dev_token": 5125.0703125, "eval_runtime": 168.804, "eval_samples_per_second": 0.095, "eval_steps_per_second": 0.006, "step": 144 }, { "epoch": 0.56640625, "grad_norm": 0.5988381099011506, "learning_rate": 1e-05, "loss": 0.1656, "step": 145 }, { "epoch": 0.5703125, "grad_norm": 0.9328153493065982, "learning_rate": 1e-05, "loss": 0.1788, "step": 146 }, { "epoch": 0.57421875, "grad_norm": 0.8013592126955402, "learning_rate": 1e-05, "loss": 0.2009, "step": 147 }, { "epoch": 0.578125, "grad_norm": 0.4868159061171701, "learning_rate": 1e-05, "loss": 0.217, "step": 148 }, { "epoch": 0.58203125, "grad_norm": 0.6758953539585006, "learning_rate": 1e-05, "loss": 0.2344, "step": 149 }, { "epoch": 0.5859375, "grad_norm": 0.8609458752061137, "learning_rate": 1e-05, "loss": 0.1939, "step": 150 }, { "epoch": 0.58984375, "grad_norm": 0.45913847739444186, "learning_rate": 1e-05, "loss": 0.1691, "step": 151 }, { "epoch": 0.59375, "grad_norm": 0.8064977044716175, "learning_rate": 1e-05, "loss": 0.1949, "step": 152 }, { "epoch": 0.59375, "eval_dev_acc": 0.40625, "eval_dev_token": 4508.484375, "eval_runtime": 160.3398, "eval_samples_per_second": 0.1, "eval_steps_per_second": 0.006, "step": 152 }, { "epoch": 0.59765625, "grad_norm": 0.9904042315049291, "learning_rate": 1e-05, "loss": 0.2253, "step": 153 }, { "epoch": 0.6015625, "grad_norm": 0.5524318414569037, "learning_rate": 1e-05, "loss": 0.2535, "step": 154 }, { "epoch": 0.60546875, "grad_norm": 0.418186463867415, "learning_rate": 1e-05, "loss": 0.1884, "step": 155 }, { "epoch": 0.609375, "grad_norm": 0.6311027708045368, "learning_rate": 1e-05, "loss": 0.2408, "step": 156 }, { "epoch": 0.61328125, "grad_norm": 0.4550696199781805, "learning_rate": 1e-05, "loss": 0.173, "step": 157 }, { "epoch": 0.6171875, "grad_norm": 0.4596598696608727, "learning_rate": 1e-05, "loss": 0.1592, "step": 158 }, { "epoch": 0.62109375, "grad_norm": 0.5573937890044522, "learning_rate": 1e-05, "loss": 0.1748, "step": 159 }, { "epoch": 0.625, "grad_norm": 1.0862165315332113, "learning_rate": 1e-05, "loss": 0.2369, "step": 160 }, { "epoch": 0.625, "eval_dev_acc": 0.4296875, "eval_dev_token": 4869.8828125, "eval_runtime": 167.2914, "eval_samples_per_second": 0.096, "eval_steps_per_second": 0.006, "step": 160 }, { "epoch": 0.62890625, "grad_norm": 0.46051384064237827, "learning_rate": 1e-05, "loss": 0.2086, "step": 161 }, { "epoch": 0.6328125, "grad_norm": 0.7125397532570018, "learning_rate": 1e-05, "loss": 0.2212, "step": 162 }, { "epoch": 0.63671875, "grad_norm": 0.564820498711706, "learning_rate": 1e-05, "loss": 0.3019, "step": 163 }, { "epoch": 0.640625, "grad_norm": 0.5218656690400247, "learning_rate": 1e-05, "loss": 0.1324, "step": 164 }, { "epoch": 0.64453125, "grad_norm": 0.4994022980399308, "learning_rate": 1e-05, "loss": 0.1438, "step": 165 }, { "epoch": 0.6484375, "grad_norm": 0.7016809849517179, "learning_rate": 1e-05, "loss": 0.2791, "step": 166 }, { "epoch": 0.65234375, "grad_norm": 0.597463304680723, "learning_rate": 1e-05, "loss": 0.1749, "step": 167 }, { "epoch": 0.65625, "grad_norm": 0.5536855781273838, "learning_rate": 1e-05, "loss": 0.2391, "step": 168 }, { "epoch": 0.65625, "eval_dev_acc": 0.3203125, "eval_dev_token": 5451.3671875, "eval_runtime": 172.7574, "eval_samples_per_second": 0.093, "eval_steps_per_second": 0.006, "step": 168 }, { "epoch": 0.66015625, "grad_norm": 0.9103508979108635, "learning_rate": 1e-05, "loss": 0.2613, "step": 169 }, { "epoch": 0.6640625, "grad_norm": 0.4928845564740678, "learning_rate": 1e-05, "loss": 0.215, "step": 170 }, { "epoch": 0.66796875, "grad_norm": 0.8690405638773996, "learning_rate": 1e-05, "loss": 0.2355, "step": 171 }, { "epoch": 0.671875, "grad_norm": 0.5511255682147113, "learning_rate": 1e-05, "loss": 0.2406, "step": 172 }, { "epoch": 0.67578125, "grad_norm": 0.44346107905460214, "learning_rate": 1e-05, "loss": 0.1867, "step": 173 }, { "epoch": 0.6796875, "grad_norm": 0.4019557678019079, "learning_rate": 1e-05, "loss": 0.1488, "step": 174 }, { "epoch": 0.68359375, "grad_norm": 0.4139658009208469, "learning_rate": 1e-05, "loss": 0.1666, "step": 175 }, { "epoch": 0.6875, "grad_norm": 0.45363011716779816, "learning_rate": 1e-05, "loss": 0.2006, "step": 176 }, { "epoch": 0.6875, "eval_dev_acc": 0.3385826647281647, "eval_dev_token": 4971.81884765625, "eval_runtime": 166.9967, "eval_samples_per_second": 0.096, "eval_steps_per_second": 0.006, "step": 176 }, { "epoch": 0.69140625, "grad_norm": 0.46674698673244774, "learning_rate": 1e-05, "loss": 0.1788, "step": 177 }, { "epoch": 0.6953125, "grad_norm": 0.5396579551057291, "learning_rate": 1e-05, "loss": 0.1857, "step": 178 }, { "epoch": 0.69921875, "grad_norm": 0.42472472699800484, "learning_rate": 1e-05, "loss": 0.1707, "step": 179 }, { "epoch": 0.703125, "grad_norm": 0.4208916108378261, "learning_rate": 1e-05, "loss": 0.1736, "step": 180 }, { "epoch": 0.70703125, "grad_norm": 0.5161632347165661, "learning_rate": 1e-05, "loss": 0.2074, "step": 181 }, { "epoch": 0.7109375, "grad_norm": 0.4851147968745633, "learning_rate": 1e-05, "loss": 0.2183, "step": 182 }, { "epoch": 0.71484375, "grad_norm": 0.5286494967968609, "learning_rate": 1e-05, "loss": 0.1877, "step": 183 }, { "epoch": 0.71875, "grad_norm": 0.5399316089624949, "learning_rate": 1e-05, "loss": 0.209, "step": 184 }, { "epoch": 0.71875, "eval_dev_acc": 0.3984375, "eval_dev_token": 4787.84375, "eval_runtime": 166.2574, "eval_samples_per_second": 0.096, "eval_steps_per_second": 0.006, "step": 184 }, { "epoch": 0.72265625, "grad_norm": 0.7188938790166789, "learning_rate": 1e-05, "loss": 0.2065, "step": 185 }, { "epoch": 0.7265625, "grad_norm": 0.5843767003652576, "learning_rate": 1e-05, "loss": 0.2356, "step": 186 }, { "epoch": 0.73046875, "grad_norm": 0.4904003204685076, "learning_rate": 1e-05, "loss": 0.201, "step": 187 }, { "epoch": 0.734375, "grad_norm": 0.485266158116283, "learning_rate": 1e-05, "loss": 0.1869, "step": 188 }, { "epoch": 0.73828125, "grad_norm": 0.5242977395658632, "learning_rate": 1e-05, "loss": 0.2122, "step": 189 }, { "epoch": 0.7421875, "grad_norm": 0.5417537780138298, "learning_rate": 1e-05, "loss": 0.2799, "step": 190 }, { "epoch": 0.74609375, "grad_norm": 0.48949419193338123, "learning_rate": 1e-05, "loss": 0.212, "step": 191 }, { "epoch": 0.75, "grad_norm": 0.48118963817889204, "learning_rate": 1e-05, "loss": 0.2195, "step": 192 }, { "epoch": 0.75, "eval_dev_acc": 0.453125, "eval_dev_token": 5056.7421875, "eval_runtime": 168.273, "eval_samples_per_second": 0.095, "eval_steps_per_second": 0.006, "step": 192 }, { "epoch": 0.75390625, "grad_norm": 0.6844465372064547, "learning_rate": 1e-05, "loss": 0.1645, "step": 193 }, { "epoch": 0.7578125, "grad_norm": 0.49653100043792153, "learning_rate": 1e-05, "loss": 0.2023, "step": 194 }, { "epoch": 0.76171875, "grad_norm": 0.5539027026151374, "learning_rate": 1e-05, "loss": 0.2348, "step": 195 }, { "epoch": 0.765625, "grad_norm": 0.5003270709383194, "learning_rate": 1e-05, "loss": 0.2545, "step": 196 }, { "epoch": 0.76953125, "grad_norm": 0.5666703162116131, "learning_rate": 1e-05, "loss": 0.2739, "step": 197 }, { "epoch": 0.7734375, "grad_norm": 0.5281121627729704, "learning_rate": 1e-05, "loss": 0.1927, "step": 198 }, { "epoch": 0.77734375, "grad_norm": 0.4691586351966124, "learning_rate": 1e-05, "loss": 0.2101, "step": 199 }, { "epoch": 0.78125, "grad_norm": 0.43348894899907703, "learning_rate": 1e-05, "loss": 0.1636, "step": 200 }, { "epoch": 0.78125, "eval_dev_acc": 0.4296875, "eval_dev_token": 5082.265625, "eval_runtime": 169.7777, "eval_samples_per_second": 0.094, "eval_steps_per_second": 0.006, "step": 200 }, { "epoch": 0.78515625, "grad_norm": 0.4995118305726593, "learning_rate": 1e-05, "loss": 0.2149, "step": 201 }, { "epoch": 0.7890625, "grad_norm": 0.3958721084761467, "learning_rate": 1e-05, "loss": 0.1732, "step": 202 }, { "epoch": 0.79296875, "grad_norm": 0.4883258744044862, "learning_rate": 1e-05, "loss": 0.219, "step": 203 }, { "epoch": 0.796875, "grad_norm": 0.45472746506302575, "learning_rate": 1e-05, "loss": 0.2187, "step": 204 }, { "epoch": 0.80078125, "grad_norm": 0.45006095039367805, "learning_rate": 1e-05, "loss": 0.1924, "step": 205 }, { "epoch": 0.8046875, "grad_norm": 0.4127537232406072, "learning_rate": 1e-05, "loss": 0.1736, "step": 206 }, { "epoch": 0.80859375, "grad_norm": 0.4669392415601201, "learning_rate": 1e-05, "loss": 0.1847, "step": 207 }, { "epoch": 0.8125, "grad_norm": 0.41469363114093816, "learning_rate": 1e-05, "loss": 0.1556, "step": 208 }, { "epoch": 0.8125, "eval_dev_acc": 0.4609375, "eval_dev_token": 4918.28125, "eval_runtime": 166.5675, "eval_samples_per_second": 0.096, "eval_steps_per_second": 0.006, "step": 208 }, { "epoch": 0.81640625, "grad_norm": 0.4433576280938302, "learning_rate": 1e-05, "loss": 0.1934, "step": 209 }, { "epoch": 0.8203125, "grad_norm": 0.4355305023653351, "learning_rate": 1e-05, "loss": 0.1742, "step": 210 }, { "epoch": 0.82421875, "grad_norm": 0.44938618579632195, "learning_rate": 1e-05, "loss": 0.1902, "step": 211 }, { "epoch": 0.828125, "grad_norm": 0.5351771463999816, "learning_rate": 1e-05, "loss": 0.2148, "step": 212 }, { "epoch": 0.83203125, "grad_norm": 0.5839350362138708, "learning_rate": 1e-05, "loss": 0.275, "step": 213 }, { "epoch": 0.8359375, "grad_norm": 0.6964110745693202, "learning_rate": 1e-05, "loss": 0.2179, "step": 214 }, { "epoch": 0.83984375, "grad_norm": 0.4337830660702992, "learning_rate": 1e-05, "loss": 0.2152, "step": 215 }, { "epoch": 0.84375, "grad_norm": 0.46223312750006246, "learning_rate": 1e-05, "loss": 0.2405, "step": 216 }, { "epoch": 0.84375, "eval_dev_acc": 0.3828125, "eval_dev_token": 5435.3046875, "eval_runtime": 173.8173, "eval_samples_per_second": 0.092, "eval_steps_per_second": 0.006, "step": 216 }, { "epoch": 0.84765625, "grad_norm": 0.5541820526606585, "learning_rate": 1e-05, "loss": 0.2751, "step": 217 }, { "epoch": 0.8515625, "grad_norm": 0.4662570041545537, "learning_rate": 1e-05, "loss": 0.2142, "step": 218 }, { "epoch": 0.85546875, "grad_norm": 0.7737037625157579, "learning_rate": 1e-05, "loss": 0.2397, "step": 219 }, { "epoch": 0.859375, "grad_norm": 0.5572195616624243, "learning_rate": 1e-05, "loss": 0.2421, "step": 220 }, { "epoch": 0.86328125, "grad_norm": 0.5088509372691609, "learning_rate": 1e-05, "loss": 0.1875, "step": 221 }, { "epoch": 0.8671875, "grad_norm": 0.508699458613964, "learning_rate": 1e-05, "loss": 0.1927, "step": 222 }, { "epoch": 0.87109375, "grad_norm": 0.5150091482241945, "learning_rate": 1e-05, "loss": 0.2536, "step": 223 }, { "epoch": 0.875, "grad_norm": 0.5203627078659161, "learning_rate": 1e-05, "loss": 0.2571, "step": 224 }, { "epoch": 0.875, "eval_dev_acc": 0.3515625, "eval_dev_token": 5227.0859375, "eval_runtime": 170.2355, "eval_samples_per_second": 0.094, "eval_steps_per_second": 0.006, "step": 224 }, { "epoch": 0.87890625, "grad_norm": 0.5279392216696818, "learning_rate": 1e-05, "loss": 0.2278, "step": 225 }, { "epoch": 0.8828125, "grad_norm": 0.45017131620724865, "learning_rate": 1e-05, "loss": 0.2132, "step": 226 }, { "epoch": 0.88671875, "grad_norm": 0.48915211275869575, "learning_rate": 1e-05, "loss": 0.2627, "step": 227 }, { "epoch": 0.890625, "grad_norm": 0.4606618945421734, "learning_rate": 1e-05, "loss": 0.1528, "step": 228 }, { "epoch": 0.89453125, "grad_norm": 0.5072593200666395, "learning_rate": 1e-05, "loss": 0.2148, "step": 229 }, { "epoch": 0.8984375, "grad_norm": 0.5513069869439534, "learning_rate": 1e-05, "loss": 0.2319, "step": 230 }, { "epoch": 0.90234375, "grad_norm": 0.4917083878550277, "learning_rate": 1e-05, "loss": 0.1989, "step": 231 }, { "epoch": 0.90625, "grad_norm": 0.4027028580105545, "learning_rate": 1e-05, "loss": 0.1398, "step": 232 }, { "epoch": 0.90625, "eval_dev_acc": 0.3779527544975281, "eval_dev_token": 5651.6455078125, "eval_runtime": 175.5543, "eval_samples_per_second": 0.091, "eval_steps_per_second": 0.006, "step": 232 }, { "epoch": 0.91015625, "grad_norm": 0.4098440727615931, "learning_rate": 1e-05, "loss": 0.1481, "step": 233 }, { "epoch": 0.9140625, "grad_norm": 0.4379253949500134, "learning_rate": 1e-05, "loss": 0.172, "step": 234 }, { "epoch": 0.91796875, "grad_norm": 0.6161974608496972, "learning_rate": 1e-05, "loss": 0.2234, "step": 235 }, { "epoch": 0.921875, "grad_norm": 0.6431694552333217, "learning_rate": 1e-05, "loss": 0.2928, "step": 236 }, { "epoch": 0.92578125, "grad_norm": 0.7524837454023333, "learning_rate": 1e-05, "loss": 0.3518, "step": 237 }, { "epoch": 0.9296875, "grad_norm": 0.5137794157548315, "learning_rate": 1e-05, "loss": 0.2371, "step": 238 }, { "epoch": 0.93359375, "grad_norm": 0.42726761741926383, "learning_rate": 1e-05, "loss": 0.1349, "step": 239 }, { "epoch": 0.9375, "grad_norm": 0.50721507122848, "learning_rate": 1e-05, "loss": 0.147, "step": 240 }, { "epoch": 0.9375, "eval_dev_acc": 0.4375, "eval_dev_token": 5554.34375, "eval_runtime": 173.4206, "eval_samples_per_second": 0.092, "eval_steps_per_second": 0.006, "step": 240 }, { "epoch": 0.94140625, "grad_norm": 0.5085504060972834, "learning_rate": 1e-05, "loss": 0.2115, "step": 241 }, { "epoch": 0.9453125, "grad_norm": 0.5245333395138617, "learning_rate": 1e-05, "loss": 0.2203, "step": 242 }, { "epoch": 0.94921875, "grad_norm": 0.5149241747645703, "learning_rate": 1e-05, "loss": 0.1935, "step": 243 }, { "epoch": 0.953125, "grad_norm": 0.45199967311107936, "learning_rate": 1e-05, "loss": 0.1875, "step": 244 }, { "epoch": 0.95703125, "grad_norm": 0.6017279864923942, "learning_rate": 1e-05, "loss": 0.1964, "step": 245 }, { "epoch": 0.9609375, "grad_norm": 0.541548647166723, "learning_rate": 1e-05, "loss": 0.2029, "step": 246 }, { "epoch": 0.96484375, "grad_norm": 0.7095706252744872, "learning_rate": 1e-05, "loss": 0.1824, "step": 247 }, { "epoch": 0.96875, "grad_norm": 0.6630534512223186, "learning_rate": 1e-05, "loss": 0.2346, "step": 248 }, { "epoch": 0.96875, "eval_dev_acc": 0.5234375, "eval_dev_token": 5464.203125, "eval_runtime": 173.0858, "eval_samples_per_second": 0.092, "eval_steps_per_second": 0.006, "step": 248 }, { "epoch": 0.97265625, "grad_norm": 0.7470938668923351, "learning_rate": 1e-05, "loss": 0.3028, "step": 249 }, { "epoch": 0.9765625, "grad_norm": 0.534162369114681, "learning_rate": 1e-05, "loss": 0.243, "step": 250 }, { "epoch": 0.98046875, "grad_norm": 0.5240149993617814, "learning_rate": 1e-05, "loss": 0.2475, "step": 251 }, { "epoch": 0.984375, "grad_norm": 0.48058164633897993, "learning_rate": 1e-05, "loss": 0.2234, "step": 252 }, { "epoch": 0.98828125, "grad_norm": 0.5427424821749397, "learning_rate": 1e-05, "loss": 0.2338, "step": 253 }, { "epoch": 0.9921875, "grad_norm": 0.5309304323745797, "learning_rate": 1e-05, "loss": 0.2751, "step": 254 }, { "epoch": 0.99609375, "grad_norm": 0.4961154954055658, "learning_rate": 1e-05, "loss": 0.2329, "step": 255 }, { "epoch": 1.0, "grad_norm": 0.519835488758917, "learning_rate": 1e-05, "loss": 0.2182, "step": 256 }, { "epoch": 1.0, "eval_dev_acc": 0.4453125, "eval_dev_token": 5674.0546875, "eval_runtime": 175.8662, "eval_samples_per_second": 0.091, "eval_steps_per_second": 0.006, "step": 256 } ], "logging_steps": 1.0, "max_steps": 256, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 64, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 31380919492608.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }