{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.980612244897959, "eval_steps": 82, "global_step": 490, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004081632653061225, "grad_norm": 0.03618604317307472, "learning_rate": 2e-05, "loss": 1.1867, "step": 1 }, { "epoch": 0.004081632653061225, "eval_loss": 1.2217298746109009, "eval_runtime": 117.7701, "eval_samples_per_second": 20.735, "eval_steps_per_second": 10.368, "step": 1 }, { "epoch": 0.00816326530612245, "grad_norm": 0.03361086547374725, "learning_rate": 4e-05, "loss": 1.1923, "step": 2 }, { "epoch": 0.012244897959183673, "grad_norm": 0.034457892179489136, "learning_rate": 6e-05, "loss": 1.0744, "step": 3 }, { "epoch": 0.0163265306122449, "grad_norm": 0.03408154472708702, "learning_rate": 8e-05, "loss": 1.2796, "step": 4 }, { "epoch": 0.02040816326530612, "grad_norm": 0.03475559875369072, "learning_rate": 0.0001, "loss": 1.2709, "step": 5 }, { "epoch": 0.024489795918367346, "grad_norm": 0.03768599033355713, "learning_rate": 0.00012, "loss": 1.0895, "step": 6 }, { "epoch": 0.02857142857142857, "grad_norm": 0.03434902802109718, "learning_rate": 0.00014, "loss": 1.2194, "step": 7 }, { "epoch": 0.0326530612244898, "grad_norm": 0.029295403510332108, "learning_rate": 0.00016, "loss": 1.1522, "step": 8 }, { "epoch": 0.036734693877551024, "grad_norm": 0.027247965335845947, "learning_rate": 0.00018, "loss": 1.1636, "step": 9 }, { "epoch": 0.04081632653061224, "grad_norm": 0.027813177555799484, "learning_rate": 0.0002, "loss": 1.1848, "step": 10 }, { "epoch": 0.044897959183673466, "grad_norm": 0.027455640956759453, "learning_rate": 0.00019999906115681734, "loss": 1.2151, "step": 11 }, { "epoch": 0.04897959183673469, "grad_norm": 0.026825131848454475, "learning_rate": 0.0001999962446448979, "loss": 1.1751, "step": 12 }, { "epoch": 0.053061224489795916, "grad_norm": 0.026261834427714348, "learning_rate": 0.0001999915505171269, "loss": 1.0573, "step": 13 }, { "epoch": 0.05714285714285714, "grad_norm": 0.022048471495509148, "learning_rate": 0.0001999849788616454, "loss": 1.1997, "step": 14 }, { "epoch": 0.061224489795918366, "grad_norm": 0.01966056413948536, "learning_rate": 0.00019997652980184843, "loss": 1.1469, "step": 15 }, { "epoch": 0.0653061224489796, "grad_norm": 0.021213103085756302, "learning_rate": 0.00019996620349638285, "loss": 1.15, "step": 16 }, { "epoch": 0.06938775510204082, "grad_norm": 0.02238585613667965, "learning_rate": 0.00019995400013914427, "loss": 1.206, "step": 17 }, { "epoch": 0.07346938775510205, "grad_norm": 0.02359834685921669, "learning_rate": 0.0001999399199592735, "loss": 1.3413, "step": 18 }, { "epoch": 0.07755102040816327, "grad_norm": 0.022565221413969994, "learning_rate": 0.00019992396322115213, "loss": 1.1798, "step": 19 }, { "epoch": 0.08163265306122448, "grad_norm": 0.02135898545384407, "learning_rate": 0.0001999061302243977, "loss": 1.1602, "step": 20 }, { "epoch": 0.08571428571428572, "grad_norm": 0.02203250862658024, "learning_rate": 0.00019988642130385788, "loss": 1.0915, "step": 21 }, { "epoch": 0.08979591836734693, "grad_norm": 0.01716572232544422, "learning_rate": 0.00019986483682960445, "loss": 1.1858, "step": 22 }, { "epoch": 0.09387755102040816, "grad_norm": 0.01753074862062931, "learning_rate": 0.00019984137720692612, "loss": 1.1849, "step": 23 }, { "epoch": 0.09795918367346938, "grad_norm": 0.017932750284671783, "learning_rate": 0.00019981604287632102, "loss": 1.1579, "step": 24 }, { "epoch": 0.10204081632653061, "grad_norm": 0.017208363860845566, "learning_rate": 0.00019978883431348845, "loss": 1.1223, "step": 25 }, { "epoch": 0.10612244897959183, "grad_norm": 0.01637221872806549, "learning_rate": 0.00019975975202931982, "loss": 1.1809, "step": 26 }, { "epoch": 0.11020408163265306, "grad_norm": 0.016254756599664688, "learning_rate": 0.0001997287965698893, "loss": 1.1978, "step": 27 }, { "epoch": 0.11428571428571428, "grad_norm": 0.016743820160627365, "learning_rate": 0.00019969596851644327, "loss": 1.1273, "step": 28 }, { "epoch": 0.11836734693877551, "grad_norm": 0.020556606352329254, "learning_rate": 0.0001996612684853896, "loss": 1.1638, "step": 29 }, { "epoch": 0.12244897959183673, "grad_norm": 0.019542187452316284, "learning_rate": 0.00019962469712828614, "loss": 1.2558, "step": 30 }, { "epoch": 0.12653061224489795, "grad_norm": 0.01658390648663044, "learning_rate": 0.00019958625513182815, "loss": 1.2176, "step": 31 }, { "epoch": 0.1306122448979592, "grad_norm": 0.017420461401343346, "learning_rate": 0.00019954594321783584, "loss": 1.2671, "step": 32 }, { "epoch": 0.1346938775510204, "grad_norm": 0.01890096440911293, "learning_rate": 0.00019950376214324052, "loss": 1.1448, "step": 33 }, { "epoch": 0.13877551020408163, "grad_norm": 0.016807271167635918, "learning_rate": 0.00019945971270007043, "loss": 1.2085, "step": 34 }, { "epoch": 0.14285714285714285, "grad_norm": 0.015759294852614403, "learning_rate": 0.00019941379571543596, "loss": 1.2207, "step": 35 }, { "epoch": 0.1469387755102041, "grad_norm": 0.016418293118476868, "learning_rate": 0.00019936601205151414, "loss": 1.1031, "step": 36 }, { "epoch": 0.1510204081632653, "grad_norm": 0.017639920115470886, "learning_rate": 0.00019931636260553224, "loss": 1.1439, "step": 37 }, { "epoch": 0.15510204081632653, "grad_norm": 0.01651432178914547, "learning_rate": 0.00019926484830975113, "loss": 1.1111, "step": 38 }, { "epoch": 0.15918367346938775, "grad_norm": 0.016843752935528755, "learning_rate": 0.0001992114701314478, "loss": 1.2519, "step": 39 }, { "epoch": 0.16326530612244897, "grad_norm": 0.016024339944124222, "learning_rate": 0.00019915622907289694, "loss": 1.2122, "step": 40 }, { "epoch": 0.1673469387755102, "grad_norm": 0.017350338399410248, "learning_rate": 0.00019909912617135244, "loss": 1.2104, "step": 41 }, { "epoch": 0.17142857142857143, "grad_norm": 0.01698286086320877, "learning_rate": 0.00019904016249902763, "loss": 1.2258, "step": 42 }, { "epoch": 0.17551020408163265, "grad_norm": 0.016692234203219414, "learning_rate": 0.00019897933916307543, "loss": 1.1614, "step": 43 }, { "epoch": 0.17959183673469387, "grad_norm": 0.01875966228544712, "learning_rate": 0.00019891665730556725, "loss": 1.2314, "step": 44 }, { "epoch": 0.1836734693877551, "grad_norm": 0.016782566905021667, "learning_rate": 0.00019885211810347184, "loss": 1.2096, "step": 45 }, { "epoch": 0.18775510204081633, "grad_norm": 0.017428990453481674, "learning_rate": 0.00019878572276863294, "loss": 1.0537, "step": 46 }, { "epoch": 0.19183673469387755, "grad_norm": 0.016738982871174812, "learning_rate": 0.00019871747254774673, "loss": 1.2018, "step": 47 }, { "epoch": 0.19591836734693877, "grad_norm": 0.018019411712884903, "learning_rate": 0.0001986473687223383, "loss": 1.2504, "step": 48 }, { "epoch": 0.2, "grad_norm": 0.017473606392741203, "learning_rate": 0.0001985754126087376, "loss": 1.0481, "step": 49 }, { "epoch": 0.20408163265306123, "grad_norm": 0.017300540581345558, "learning_rate": 0.00019850160555805486, "loss": 1.2109, "step": 50 }, { "epoch": 0.20816326530612245, "grad_norm": 0.0182975921779871, "learning_rate": 0.00019842594895615488, "loss": 1.1598, "step": 51 }, { "epoch": 0.21224489795918366, "grad_norm": 0.020277326926589012, "learning_rate": 0.00019834844422363142, "loss": 1.1177, "step": 52 }, { "epoch": 0.2163265306122449, "grad_norm": 0.018494602292776108, "learning_rate": 0.00019826909281578026, "loss": 1.1037, "step": 53 }, { "epoch": 0.22040816326530613, "grad_norm": 0.017682479694485664, "learning_rate": 0.00019818789622257196, "loss": 1.1358, "step": 54 }, { "epoch": 0.22448979591836735, "grad_norm": 0.0186194758862257, "learning_rate": 0.00019810485596862392, "loss": 1.1403, "step": 55 }, { "epoch": 0.22857142857142856, "grad_norm": 0.018025796860456467, "learning_rate": 0.00019801997361317163, "loss": 1.1959, "step": 56 }, { "epoch": 0.23265306122448978, "grad_norm": 0.01819712296128273, "learning_rate": 0.0001979332507500395, "loss": 1.1181, "step": 57 }, { "epoch": 0.23673469387755103, "grad_norm": 0.018991166725754738, "learning_rate": 0.00019784468900761095, "loss": 1.1393, "step": 58 }, { "epoch": 0.24081632653061225, "grad_norm": 0.01914682239294052, "learning_rate": 0.0001977542900487977, "loss": 1.182, "step": 59 }, { "epoch": 0.24489795918367346, "grad_norm": 0.01992950588464737, "learning_rate": 0.00019766205557100868, "loss": 1.1629, "step": 60 }, { "epoch": 0.24897959183673468, "grad_norm": 0.024829212576150894, "learning_rate": 0.00019756798730611813, "loss": 1.0753, "step": 61 }, { "epoch": 0.2530612244897959, "grad_norm": 0.02457556687295437, "learning_rate": 0.00019747208702043296, "loss": 1.1718, "step": 62 }, { "epoch": 0.2571428571428571, "grad_norm": 0.019101744517683983, "learning_rate": 0.0001973743565146599, "loss": 1.1343, "step": 63 }, { "epoch": 0.2612244897959184, "grad_norm": 0.020323360338807106, "learning_rate": 0.00019727479762387116, "loss": 1.1689, "step": 64 }, { "epoch": 0.2653061224489796, "grad_norm": 0.018925843760371208, "learning_rate": 0.00019717341221747056, "loss": 1.2098, "step": 65 }, { "epoch": 0.2693877551020408, "grad_norm": 0.019229738041758537, "learning_rate": 0.00019707020219915806, "loss": 1.1542, "step": 66 }, { "epoch": 0.27346938775510204, "grad_norm": 0.019665885716676712, "learning_rate": 0.00019696516950689404, "loss": 1.1154, "step": 67 }, { "epoch": 0.27755102040816326, "grad_norm": 0.019163204357028008, "learning_rate": 0.0001968583161128631, "loss": 1.1726, "step": 68 }, { "epoch": 0.2816326530612245, "grad_norm": 0.019880875945091248, "learning_rate": 0.00019674964402343684, "loss": 1.1441, "step": 69 }, { "epoch": 0.2857142857142857, "grad_norm": 0.019096847623586655, "learning_rate": 0.00019663915527913625, "loss": 1.2021, "step": 70 }, { "epoch": 0.2897959183673469, "grad_norm": 0.019253911450505257, "learning_rate": 0.00019652685195459344, "loss": 1.1237, "step": 71 }, { "epoch": 0.2938775510204082, "grad_norm": 0.020909370854496956, "learning_rate": 0.00019641273615851257, "loss": 1.0575, "step": 72 }, { "epoch": 0.2979591836734694, "grad_norm": 0.019176874309778214, "learning_rate": 0.00019629681003363044, "loss": 1.0377, "step": 73 }, { "epoch": 0.3020408163265306, "grad_norm": 0.019969960674643517, "learning_rate": 0.00019617907575667602, "loss": 1.1471, "step": 74 }, { "epoch": 0.30612244897959184, "grad_norm": 0.019450997933745384, "learning_rate": 0.00019605953553832988, "loss": 1.2022, "step": 75 }, { "epoch": 0.31020408163265306, "grad_norm": 0.02082456275820732, "learning_rate": 0.00019593819162318232, "loss": 1.0932, "step": 76 }, { "epoch": 0.3142857142857143, "grad_norm": 0.020122263580560684, "learning_rate": 0.00019581504628969154, "loss": 1.0826, "step": 77 }, { "epoch": 0.3183673469387755, "grad_norm": 0.020104922354221344, "learning_rate": 0.00019569010185014062, "loss": 1.1404, "step": 78 }, { "epoch": 0.3224489795918367, "grad_norm": 0.021288864314556122, "learning_rate": 0.00019556336065059432, "loss": 1.0951, "step": 79 }, { "epoch": 0.32653061224489793, "grad_norm": 0.02072795107960701, "learning_rate": 0.00019543482507085482, "loss": 1.0649, "step": 80 }, { "epoch": 0.3306122448979592, "grad_norm": 0.020123451948165894, "learning_rate": 0.00019530449752441718, "loss": 1.1333, "step": 81 }, { "epoch": 0.3346938775510204, "grad_norm": 0.020374584943056107, "learning_rate": 0.00019517238045842404, "loss": 1.147, "step": 82 }, { "epoch": 0.3346938775510204, "eval_loss": 1.1397618055343628, "eval_runtime": 120.3925, "eval_samples_per_second": 20.284, "eval_steps_per_second": 10.142, "step": 82 }, { "epoch": 0.33877551020408164, "grad_norm": 0.0197721179574728, "learning_rate": 0.00019503847635361944, "loss": 1.0954, "step": 83 }, { "epoch": 0.34285714285714286, "grad_norm": 0.021181972697377205, "learning_rate": 0.00019490278772430256, "loss": 1.1145, "step": 84 }, { "epoch": 0.3469387755102041, "grad_norm": 0.021057790145277977, "learning_rate": 0.00019476531711828027, "loss": 1.2255, "step": 85 }, { "epoch": 0.3510204081632653, "grad_norm": 0.0207724217325449, "learning_rate": 0.00019462606711681936, "loss": 1.0366, "step": 86 }, { "epoch": 0.3551020408163265, "grad_norm": 0.021172018721699715, "learning_rate": 0.00019448504033459818, "loss": 1.0835, "step": 87 }, { "epoch": 0.35918367346938773, "grad_norm": 0.02109033428132534, "learning_rate": 0.00019434223941965738, "loss": 1.0902, "step": 88 }, { "epoch": 0.363265306122449, "grad_norm": 0.02078443393111229, "learning_rate": 0.00019419766705335026, "loss": 1.087, "step": 89 }, { "epoch": 0.3673469387755102, "grad_norm": 0.020614784210920334, "learning_rate": 0.0001940513259502924, "loss": 1.0834, "step": 90 }, { "epoch": 0.37142857142857144, "grad_norm": 0.020187893882393837, "learning_rate": 0.0001939032188583108, "loss": 1.1847, "step": 91 }, { "epoch": 0.37551020408163266, "grad_norm": 0.020622489973902702, "learning_rate": 0.0001937533485583921, "loss": 1.1767, "step": 92 }, { "epoch": 0.3795918367346939, "grad_norm": 0.02118833176791668, "learning_rate": 0.00019360171786463043, "loss": 1.1491, "step": 93 }, { "epoch": 0.3836734693877551, "grad_norm": 0.021402837708592415, "learning_rate": 0.00019344832962417475, "loss": 1.1547, "step": 94 }, { "epoch": 0.3877551020408163, "grad_norm": 0.02148307114839554, "learning_rate": 0.0001932931867171751, "loss": 1.2062, "step": 95 }, { "epoch": 0.39183673469387753, "grad_norm": 0.022203955799341202, "learning_rate": 0.00019313629205672868, "loss": 1.1271, "step": 96 }, { "epoch": 0.39591836734693875, "grad_norm": 0.02081882208585739, "learning_rate": 0.00019297764858882514, "loss": 1.0965, "step": 97 }, { "epoch": 0.4, "grad_norm": 0.021749386563897133, "learning_rate": 0.00019281725929229127, "loss": 1.11, "step": 98 }, { "epoch": 0.40408163265306124, "grad_norm": 0.02319493517279625, "learning_rate": 0.00019265512717873498, "loss": 1.0196, "step": 99 }, { "epoch": 0.40816326530612246, "grad_norm": 0.021901512518525124, "learning_rate": 0.0001924912552924889, "loss": 1.1552, "step": 100 }, { "epoch": 0.4122448979591837, "grad_norm": 0.02368365228176117, "learning_rate": 0.00019232564671055306, "loss": 1.2047, "step": 101 }, { "epoch": 0.4163265306122449, "grad_norm": 0.022438503801822662, "learning_rate": 0.00019215830454253724, "loss": 1.1429, "step": 102 }, { "epoch": 0.4204081632653061, "grad_norm": 0.022685807198286057, "learning_rate": 0.00019198923193060254, "loss": 1.0717, "step": 103 }, { "epoch": 0.42448979591836733, "grad_norm": 0.023902015760540962, "learning_rate": 0.00019181843204940232, "loss": 1.1139, "step": 104 }, { "epoch": 0.42857142857142855, "grad_norm": 0.022928839549422264, "learning_rate": 0.00019164590810602262, "loss": 1.1515, "step": 105 }, { "epoch": 0.4326530612244898, "grad_norm": 0.02210794948041439, "learning_rate": 0.00019147166333992205, "loss": 1.1762, "step": 106 }, { "epoch": 0.43673469387755104, "grad_norm": 0.023219434544444084, "learning_rate": 0.00019129570102287082, "loss": 1.1925, "step": 107 }, { "epoch": 0.44081632653061226, "grad_norm": 0.023004446178674698, "learning_rate": 0.00019111802445888936, "loss": 1.1449, "step": 108 }, { "epoch": 0.4448979591836735, "grad_norm": 0.023740626871585846, "learning_rate": 0.00019093863698418627, "loss": 1.2179, "step": 109 }, { "epoch": 0.4489795918367347, "grad_norm": 0.06276971846818924, "learning_rate": 0.00019075754196709572, "loss": 1.2497, "step": 110 }, { "epoch": 0.4530612244897959, "grad_norm": 0.023355931043624878, "learning_rate": 0.00019057474280801415, "loss": 1.1625, "step": 111 }, { "epoch": 0.45714285714285713, "grad_norm": 0.03020629473030567, "learning_rate": 0.00019039024293933645, "loss": 1.0821, "step": 112 }, { "epoch": 0.46122448979591835, "grad_norm": 0.022392934188246727, "learning_rate": 0.00019020404582539152, "loss": 1.1074, "step": 113 }, { "epoch": 0.46530612244897956, "grad_norm": 0.023577727377414703, "learning_rate": 0.00019001615496237712, "loss": 1.1128, "step": 114 }, { "epoch": 0.46938775510204084, "grad_norm": 0.023507297039031982, "learning_rate": 0.00018982657387829445, "loss": 1.1345, "step": 115 }, { "epoch": 0.47346938775510206, "grad_norm": 0.023036476224660873, "learning_rate": 0.0001896353061328816, "loss": 1.128, "step": 116 }, { "epoch": 0.4775510204081633, "grad_norm": 0.023640332743525505, "learning_rate": 0.00018944235531754698, "loss": 1.062, "step": 117 }, { "epoch": 0.4816326530612245, "grad_norm": 0.022972116246819496, "learning_rate": 0.00018924772505530174, "loss": 1.1024, "step": 118 }, { "epoch": 0.4857142857142857, "grad_norm": 0.023795459419488907, "learning_rate": 0.00018905141900069178, "loss": 1.1595, "step": 119 }, { "epoch": 0.4897959183673469, "grad_norm": 0.02379484474658966, "learning_rate": 0.00018885344083972914, "loss": 1.1492, "step": 120 }, { "epoch": 0.49387755102040815, "grad_norm": 0.023673737421631813, "learning_rate": 0.00018865379428982271, "loss": 1.1571, "step": 121 }, { "epoch": 0.49795918367346936, "grad_norm": 0.02365570329129696, "learning_rate": 0.00018845248309970854, "loss": 1.1216, "step": 122 }, { "epoch": 0.5020408163265306, "grad_norm": 0.025217821821570396, "learning_rate": 0.00018824951104937947, "loss": 1.1422, "step": 123 }, { "epoch": 0.5061224489795918, "grad_norm": 0.02457055076956749, "learning_rate": 0.00018804488195001392, "loss": 1.119, "step": 124 }, { "epoch": 0.5102040816326531, "grad_norm": 0.023103831335902214, "learning_rate": 0.00018783859964390464, "loss": 1.153, "step": 125 }, { "epoch": 0.5142857142857142, "grad_norm": 0.02386569045484066, "learning_rate": 0.00018763066800438636, "loss": 1.137, "step": 126 }, { "epoch": 0.5183673469387755, "grad_norm": 0.024395650252699852, "learning_rate": 0.00018742109093576313, "loss": 1.1592, "step": 127 }, { "epoch": 0.5224489795918368, "grad_norm": 0.024944225326180458, "learning_rate": 0.000187209872373235, "loss": 1.1395, "step": 128 }, { "epoch": 0.5265306122448979, "grad_norm": 0.023946771398186684, "learning_rate": 0.00018699701628282407, "loss": 1.1779, "step": 129 }, { "epoch": 0.5306122448979592, "grad_norm": 0.02391199767589569, "learning_rate": 0.00018678252666130013, "loss": 1.1299, "step": 130 }, { "epoch": 0.5346938775510204, "grad_norm": 0.0230008065700531, "learning_rate": 0.00018656640753610563, "loss": 1.1087, "step": 131 }, { "epoch": 0.5387755102040817, "grad_norm": 0.023263461887836456, "learning_rate": 0.0001863486629652799, "loss": 1.0835, "step": 132 }, { "epoch": 0.5428571428571428, "grad_norm": 0.024622686207294464, "learning_rate": 0.000186129297037383, "loss": 1.0911, "step": 133 }, { "epoch": 0.5469387755102041, "grad_norm": 0.023301225155591965, "learning_rate": 0.0001859083138714191, "loss": 1.1345, "step": 134 }, { "epoch": 0.5510204081632653, "grad_norm": 0.023047855123877525, "learning_rate": 0.00018568571761675893, "loss": 1.1353, "step": 135 }, { "epoch": 0.5551020408163265, "grad_norm": 0.025225356221199036, "learning_rate": 0.00018546151245306205, "loss": 1.149, "step": 136 }, { "epoch": 0.5591836734693878, "grad_norm": 0.024374982342123985, "learning_rate": 0.00018523570259019827, "loss": 1.0618, "step": 137 }, { "epoch": 0.563265306122449, "grad_norm": 0.024517694488167763, "learning_rate": 0.00018500829226816853, "loss": 1.148, "step": 138 }, { "epoch": 0.5673469387755102, "grad_norm": 0.022940896451473236, "learning_rate": 0.0001847792857570255, "loss": 1.0824, "step": 139 }, { "epoch": 0.5714285714285714, "grad_norm": 0.025163279846310616, "learning_rate": 0.0001845486873567932, "loss": 1.112, "step": 140 }, { "epoch": 0.5755102040816327, "grad_norm": 0.024664800614118576, "learning_rate": 0.00018431650139738633, "loss": 1.1398, "step": 141 }, { "epoch": 0.5795918367346938, "grad_norm": 0.02429143153131008, "learning_rate": 0.000184082732238529, "loss": 1.2003, "step": 142 }, { "epoch": 0.5836734693877551, "grad_norm": 0.026087280362844467, "learning_rate": 0.00018384738426967283, "loss": 1.1151, "step": 143 }, { "epoch": 0.5877551020408164, "grad_norm": 0.025691909715533257, "learning_rate": 0.00018361046190991455, "loss": 1.1526, "step": 144 }, { "epoch": 0.5918367346938775, "grad_norm": 0.024672340601682663, "learning_rate": 0.00018337196960791302, "loss": 1.1781, "step": 145 }, { "epoch": 0.5959183673469388, "grad_norm": 0.025797029957175255, "learning_rate": 0.00018313191184180568, "loss": 1.2053, "step": 146 }, { "epoch": 0.6, "grad_norm": 0.025542639195919037, "learning_rate": 0.0001828902931191244, "loss": 1.0819, "step": 147 }, { "epoch": 0.6040816326530613, "grad_norm": 0.025126414373517036, "learning_rate": 0.0001826471179767111, "loss": 1.1935, "step": 148 }, { "epoch": 0.6081632653061224, "grad_norm": 0.02403743751347065, "learning_rate": 0.0001824023909806322, "loss": 1.1151, "step": 149 }, { "epoch": 0.6122448979591837, "grad_norm": 0.025347478687763214, "learning_rate": 0.00018215611672609317, "loss": 1.0863, "step": 150 }, { "epoch": 0.6163265306122448, "grad_norm": 0.024234246462583542, "learning_rate": 0.00018190829983735207, "loss": 0.9933, "step": 151 }, { "epoch": 0.6204081632653061, "grad_norm": 0.02541242353618145, "learning_rate": 0.00018165894496763286, "loss": 1.016, "step": 152 }, { "epoch": 0.6244897959183674, "grad_norm": 0.025475289672613144, "learning_rate": 0.00018140805679903795, "loss": 1.1199, "step": 153 }, { "epoch": 0.6285714285714286, "grad_norm": 0.025047162547707558, "learning_rate": 0.00018115564004246023, "loss": 1.1866, "step": 154 }, { "epoch": 0.6326530612244898, "grad_norm": 0.024512339383363724, "learning_rate": 0.00018090169943749476, "loss": 1.0407, "step": 155 }, { "epoch": 0.636734693877551, "grad_norm": 0.027875879779458046, "learning_rate": 0.0001806462397523496, "loss": 1.1148, "step": 156 }, { "epoch": 0.6408163265306123, "grad_norm": 0.027647798880934715, "learning_rate": 0.00018038926578375653, "loss": 1.1748, "step": 157 }, { "epoch": 0.6448979591836734, "grad_norm": 0.0258516576141119, "learning_rate": 0.0001801307823568806, "loss": 1.156, "step": 158 }, { "epoch": 0.6489795918367347, "grad_norm": 0.02649206854403019, "learning_rate": 0.00017987079432522996, "loss": 1.0625, "step": 159 }, { "epoch": 0.6530612244897959, "grad_norm": 0.027242302894592285, "learning_rate": 0.00017960930657056438, "loss": 1.0624, "step": 160 }, { "epoch": 0.6571428571428571, "grad_norm": 0.026996418833732605, "learning_rate": 0.0001793463240028038, "loss": 1.0979, "step": 161 }, { "epoch": 0.6612244897959184, "grad_norm": 0.025911618024110794, "learning_rate": 0.00017908185155993605, "loss": 1.0767, "step": 162 }, { "epoch": 0.6653061224489796, "grad_norm": 0.02710540033876896, "learning_rate": 0.0001788158942079241, "loss": 0.9389, "step": 163 }, { "epoch": 0.6693877551020408, "grad_norm": 0.026086492463946342, "learning_rate": 0.00017854845694061292, "loss": 1.1475, "step": 164 }, { "epoch": 0.6693877551020408, "eval_loss": 1.1236058473587036, "eval_runtime": 120.3474, "eval_samples_per_second": 20.291, "eval_steps_per_second": 10.146, "step": 164 }, { "epoch": 0.673469387755102, "grad_norm": 0.02641923353075981, "learning_rate": 0.00017827954477963557, "loss": 1.1128, "step": 165 }, { "epoch": 0.6775510204081633, "grad_norm": 0.026196755468845367, "learning_rate": 0.00017800916277431908, "loss": 1.059, "step": 166 }, { "epoch": 0.6816326530612244, "grad_norm": 0.025523358955979347, "learning_rate": 0.00017773731600158947, "loss": 1.1618, "step": 167 }, { "epoch": 0.6857142857142857, "grad_norm": 0.026307355612516403, "learning_rate": 0.00017746400956587653, "loss": 1.0585, "step": 168 }, { "epoch": 0.689795918367347, "grad_norm": 0.026362139731645584, "learning_rate": 0.00017718924859901793, "loss": 1.0543, "step": 169 }, { "epoch": 0.6938775510204082, "grad_norm": 0.02530169114470482, "learning_rate": 0.0001769130382601629, "loss": 1.1009, "step": 170 }, { "epoch": 0.6979591836734694, "grad_norm": 0.026273014023900032, "learning_rate": 0.0001766353837356753, "loss": 1.1963, "step": 171 }, { "epoch": 0.7020408163265306, "grad_norm": 0.026570703834295273, "learning_rate": 0.00017635629023903627, "loss": 1.1098, "step": 172 }, { "epoch": 0.7061224489795919, "grad_norm": 0.02533114328980446, "learning_rate": 0.0001760757630107464, "loss": 1.2321, "step": 173 }, { "epoch": 0.710204081632653, "grad_norm": 0.02558620274066925, "learning_rate": 0.00017579380731822712, "loss": 1.1174, "step": 174 }, { "epoch": 0.7142857142857143, "grad_norm": 0.028141306713223457, "learning_rate": 0.00017551042845572208, "loss": 1.0609, "step": 175 }, { "epoch": 0.7183673469387755, "grad_norm": 0.028484290465712547, "learning_rate": 0.00017522563174419753, "loss": 1.0779, "step": 176 }, { "epoch": 0.7224489795918367, "grad_norm": 0.0262393057346344, "learning_rate": 0.00017493942253124248, "loss": 1.0889, "step": 177 }, { "epoch": 0.726530612244898, "grad_norm": 0.029188042506575584, "learning_rate": 0.00017465180619096832, "loss": 1.1816, "step": 178 }, { "epoch": 0.7306122448979592, "grad_norm": 0.02816024050116539, "learning_rate": 0.00017436278812390786, "loss": 1.155, "step": 179 }, { "epoch": 0.7346938775510204, "grad_norm": 0.026962874457240105, "learning_rate": 0.00017407237375691392, "loss": 1.0784, "step": 180 }, { "epoch": 0.7387755102040816, "grad_norm": 0.027746373787522316, "learning_rate": 0.00017378056854305747, "loss": 1.1896, "step": 181 }, { "epoch": 0.7428571428571429, "grad_norm": 0.02716743014752865, "learning_rate": 0.00017348737796152522, "loss": 1.0722, "step": 182 }, { "epoch": 0.746938775510204, "grad_norm": 0.02938067726790905, "learning_rate": 0.0001731928075175168, "loss": 1.0179, "step": 183 }, { "epoch": 0.7510204081632653, "grad_norm": 0.03580696880817413, "learning_rate": 0.00017289686274214118, "loss": 1.0951, "step": 184 }, { "epoch": 0.7551020408163265, "grad_norm": 0.02849019691348076, "learning_rate": 0.0001725995491923131, "loss": 1.1006, "step": 185 }, { "epoch": 0.7591836734693878, "grad_norm": 0.026822512969374657, "learning_rate": 0.00017230087245064858, "loss": 1.1535, "step": 186 }, { "epoch": 0.763265306122449, "grad_norm": 0.027751388028264046, "learning_rate": 0.00017200083812536, "loss": 1.0975, "step": 187 }, { "epoch": 0.7673469387755102, "grad_norm": 0.027201151475310326, "learning_rate": 0.00017169945185015106, "loss": 1.1022, "step": 188 }, { "epoch": 0.7714285714285715, "grad_norm": 0.026958104223012924, "learning_rate": 0.00017139671928411072, "loss": 1.14, "step": 189 }, { "epoch": 0.7755102040816326, "grad_norm": 0.026919234544038773, "learning_rate": 0.00017109264611160708, "loss": 1.1056, "step": 190 }, { "epoch": 0.7795918367346939, "grad_norm": 0.027088504284620285, "learning_rate": 0.00017078723804218066, "loss": 1.2057, "step": 191 }, { "epoch": 0.7836734693877551, "grad_norm": 0.027464529499411583, "learning_rate": 0.00017048050081043713, "loss": 1.1709, "step": 192 }, { "epoch": 0.7877551020408163, "grad_norm": 0.028090914711356163, "learning_rate": 0.0001701724401759397, "loss": 1.1489, "step": 193 }, { "epoch": 0.7918367346938775, "grad_norm": 0.026381971314549446, "learning_rate": 0.00016986306192310084, "loss": 1.0724, "step": 194 }, { "epoch": 0.7959183673469388, "grad_norm": 0.027733104303479195, "learning_rate": 0.00016955237186107387, "loss": 1.1281, "step": 195 }, { "epoch": 0.8, "grad_norm": 0.026476366445422173, "learning_rate": 0.0001692403758236437, "loss": 1.19, "step": 196 }, { "epoch": 0.8040816326530612, "grad_norm": 0.02931978367269039, "learning_rate": 0.0001689270796691174, "loss": 1.0904, "step": 197 }, { "epoch": 0.8081632653061225, "grad_norm": 0.02770647406578064, "learning_rate": 0.00016861248928021411, "loss": 1.1499, "step": 198 }, { "epoch": 0.8122448979591836, "grad_norm": 0.02689889818429947, "learning_rate": 0.00016829661056395474, "loss": 1.1525, "step": 199 }, { "epoch": 0.8163265306122449, "grad_norm": 0.02684464119374752, "learning_rate": 0.0001679794494515508, "loss": 1.0428, "step": 200 }, { "epoch": 0.8204081632653061, "grad_norm": 0.026796666905283928, "learning_rate": 0.0001676610118982933, "loss": 1.1829, "step": 201 }, { "epoch": 0.8244897959183674, "grad_norm": 0.028474919497966766, "learning_rate": 0.00016734130388344073, "loss": 1.0853, "step": 202 }, { "epoch": 0.8285714285714286, "grad_norm": 0.027959391474723816, "learning_rate": 0.00016702033141010694, "loss": 1.0554, "step": 203 }, { "epoch": 0.8326530612244898, "grad_norm": 0.027617380023002625, "learning_rate": 0.00016669810050514827, "loss": 1.2237, "step": 204 }, { "epoch": 0.8367346938775511, "grad_norm": 0.028105339035391808, "learning_rate": 0.00016637461721905045, "loss": 1.0778, "step": 205 }, { "epoch": 0.8408163265306122, "grad_norm": 0.028216930106282234, "learning_rate": 0.00016604988762581512, "loss": 1.1507, "step": 206 }, { "epoch": 0.8448979591836735, "grad_norm": 0.026569265872240067, "learning_rate": 0.00016572391782284547, "loss": 1.0134, "step": 207 }, { "epoch": 0.8489795918367347, "grad_norm": 0.027188677340745926, "learning_rate": 0.00016539671393083215, "loss": 1.0784, "step": 208 }, { "epoch": 0.8530612244897959, "grad_norm": 0.02793797291815281, "learning_rate": 0.00016506828209363796, "loss": 1.1477, "step": 209 }, { "epoch": 0.8571428571428571, "grad_norm": 0.027332162484526634, "learning_rate": 0.00016473862847818277, "loss": 1.1653, "step": 210 }, { "epoch": 0.8612244897959184, "grad_norm": 0.02860257215797901, "learning_rate": 0.00016440775927432753, "loss": 1.1599, "step": 211 }, { "epoch": 0.8653061224489796, "grad_norm": 0.028649616986513138, "learning_rate": 0.0001640756806947582, "loss": 1.1796, "step": 212 }, { "epoch": 0.8693877551020408, "grad_norm": 0.027630403637886047, "learning_rate": 0.000163742398974869, "loss": 1.1315, "step": 213 }, { "epoch": 0.8734693877551021, "grad_norm": 0.027829712256789207, "learning_rate": 0.00016340792037264527, "loss": 1.0573, "step": 214 }, { "epoch": 0.8775510204081632, "grad_norm": 0.027864307165145874, "learning_rate": 0.00016307225116854622, "loss": 1.083, "step": 215 }, { "epoch": 0.8816326530612245, "grad_norm": 0.028005510568618774, "learning_rate": 0.0001627353976653866, "loss": 1.238, "step": 216 }, { "epoch": 0.8857142857142857, "grad_norm": 0.02755396068096161, "learning_rate": 0.00016239736618821885, "loss": 1.1124, "step": 217 }, { "epoch": 0.889795918367347, "grad_norm": 0.026409147307276726, "learning_rate": 0.00016205816308421386, "loss": 1.1627, "step": 218 }, { "epoch": 0.8938775510204081, "grad_norm": 0.02800268866121769, "learning_rate": 0.00016171779472254206, "loss": 1.0921, "step": 219 }, { "epoch": 0.8979591836734694, "grad_norm": 0.02667434886097908, "learning_rate": 0.00016137626749425377, "loss": 1.207, "step": 220 }, { "epoch": 0.9020408163265307, "grad_norm": 0.027823835611343384, "learning_rate": 0.0001610335878121592, "loss": 1.0776, "step": 221 }, { "epoch": 0.9061224489795918, "grad_norm": 0.02820996195077896, "learning_rate": 0.000160689762110708, "loss": 1.0903, "step": 222 }, { "epoch": 0.9102040816326531, "grad_norm": 0.02725161425769329, "learning_rate": 0.00016034479684586854, "loss": 1.1587, "step": 223 }, { "epoch": 0.9142857142857143, "grad_norm": 0.027408665046095848, "learning_rate": 0.0001599986984950065, "loss": 1.1058, "step": 224 }, { "epoch": 0.9183673469387755, "grad_norm": 0.027829701080918312, "learning_rate": 0.00015965147355676343, "loss": 1.1037, "step": 225 }, { "epoch": 0.9224489795918367, "grad_norm": 0.027295254170894623, "learning_rate": 0.0001593031285509347, "loss": 1.0681, "step": 226 }, { "epoch": 0.926530612244898, "grad_norm": 0.028630707412958145, "learning_rate": 0.00015895367001834694, "loss": 1.1282, "step": 227 }, { "epoch": 0.9306122448979591, "grad_norm": 0.02818943001329899, "learning_rate": 0.0001586031045207354, "loss": 1.1101, "step": 228 }, { "epoch": 0.9346938775510204, "grad_norm": 0.028520913794636726, "learning_rate": 0.0001582514386406206, "loss": 1.1716, "step": 229 }, { "epoch": 0.9387755102040817, "grad_norm": 0.027190426364541054, "learning_rate": 0.0001578986789811849, "loss": 1.0615, "step": 230 }, { "epoch": 0.9428571428571428, "grad_norm": 0.028909413143992424, "learning_rate": 0.00015754483216614821, "loss": 1.0757, "step": 231 }, { "epoch": 0.9469387755102041, "grad_norm": 0.028231430798768997, "learning_rate": 0.000157189904839644, "loss": 1.14, "step": 232 }, { "epoch": 0.9510204081632653, "grad_norm": 0.02774875983595848, "learning_rate": 0.00015683390366609426, "loss": 1.0429, "step": 233 }, { "epoch": 0.9551020408163265, "grad_norm": 0.030419837683439255, "learning_rate": 0.00015647683533008455, "loss": 1.113, "step": 234 }, { "epoch": 0.9591836734693877, "grad_norm": 0.028292929753661156, "learning_rate": 0.00015611870653623825, "loss": 1.1127, "step": 235 }, { "epoch": 0.963265306122449, "grad_norm": 0.028285473585128784, "learning_rate": 0.00015575952400909092, "loss": 1.0493, "step": 236 }, { "epoch": 0.9673469387755103, "grad_norm": 0.028861958533525467, "learning_rate": 0.00015539929449296392, "loss": 1.0879, "step": 237 }, { "epoch": 0.9714285714285714, "grad_norm": 0.02901526540517807, "learning_rate": 0.00015503802475183773, "loss": 1.21, "step": 238 }, { "epoch": 0.9755102040816327, "grad_norm": 0.02809295803308487, "learning_rate": 0.00015467572156922503, "loss": 1.1949, "step": 239 }, { "epoch": 0.9795918367346939, "grad_norm": 0.03164355456829071, "learning_rate": 0.00015431239174804328, "loss": 1.068, "step": 240 }, { "epoch": 0.9836734693877551, "grad_norm": 0.029855625703930855, "learning_rate": 0.00015394804211048702, "loss": 1.1485, "step": 241 }, { "epoch": 0.9877551020408163, "grad_norm": 0.027486370876431465, "learning_rate": 0.00015358267949789966, "loss": 1.0449, "step": 242 }, { "epoch": 0.9918367346938776, "grad_norm": 0.02765297144651413, "learning_rate": 0.0001532163107706452, "loss": 1.0341, "step": 243 }, { "epoch": 0.9959183673469387, "grad_norm": 0.02840602956712246, "learning_rate": 0.0001528489428079793, "loss": 1.2001, "step": 244 }, { "epoch": 1.0, "grad_norm": 0.028239021077752113, "learning_rate": 0.00015248058250792008, "loss": 1.1528, "step": 245 }, { "epoch": 1.0040816326530613, "grad_norm": 0.02886350080370903, "learning_rate": 0.00015211123678711866, "loss": 1.1831, "step": 246 }, { "epoch": 1.0040816326530613, "eval_loss": 1.114267110824585, "eval_runtime": 120.4197, "eval_samples_per_second": 20.279, "eval_steps_per_second": 10.14, "step": 246 }, { "epoch": 1.0081632653061225, "grad_norm": 0.028275564312934875, "learning_rate": 0.00015174091258072925, "loss": 1.0882, "step": 247 }, { "epoch": 1.0122448979591836, "grad_norm": 0.030598660930991173, "learning_rate": 0.00015136961684227904, "loss": 1.1793, "step": 248 }, { "epoch": 1.0163265306122449, "grad_norm": 0.02968600206077099, "learning_rate": 0.00015099735654353747, "loss": 1.0578, "step": 249 }, { "epoch": 1.0010204081632652, "grad_norm": 0.03257656469941139, "learning_rate": 0.0001506241386743854, "loss": 1.0023, "step": 250 }, { "epoch": 1.0051020408163265, "grad_norm": 0.02804708480834961, "learning_rate": 0.0001502499702426839, "loss": 1.1929, "step": 251 }, { "epoch": 1.0091836734693878, "grad_norm": 0.02974173054099083, "learning_rate": 0.00014987485827414255, "loss": 1.1358, "step": 252 }, { "epoch": 1.013265306122449, "grad_norm": 0.0282144732773304, "learning_rate": 0.00014949880981218766, "loss": 0.9802, "step": 253 }, { "epoch": 1.0173469387755103, "grad_norm": 0.029623284935951233, "learning_rate": 0.00014912183191782995, "loss": 1.1808, "step": 254 }, { "epoch": 1.0214285714285714, "grad_norm": 0.02861897461116314, "learning_rate": 0.00014874393166953192, "loss": 1.2327, "step": 255 }, { "epoch": 1.0255102040816326, "grad_norm": 0.0278904028236866, "learning_rate": 0.000148365116163075, "loss": 1.1364, "step": 256 }, { "epoch": 1.029591836734694, "grad_norm": 0.02950625866651535, "learning_rate": 0.00014798539251142633, "loss": 1.1223, "step": 257 }, { "epoch": 1.0336734693877552, "grad_norm": 0.02931659109890461, "learning_rate": 0.00014760476784460514, "loss": 1.2109, "step": 258 }, { "epoch": 1.0377551020408162, "grad_norm": 0.028963668271899223, "learning_rate": 0.00014722324930954885, "loss": 1.0766, "step": 259 }, { "epoch": 1.0418367346938775, "grad_norm": 0.029344873502850533, "learning_rate": 0.00014684084406997903, "loss": 1.0301, "step": 260 }, { "epoch": 1.0459183673469388, "grad_norm": 0.028957512229681015, "learning_rate": 0.0001464575593062667, "loss": 1.1246, "step": 261 }, { "epoch": 1.05, "grad_norm": 0.02943386323750019, "learning_rate": 0.00014607340221529762, "loss": 1.1502, "step": 262 }, { "epoch": 1.0540816326530613, "grad_norm": 0.028920989483594894, "learning_rate": 0.0001456883800103371, "loss": 1.2012, "step": 263 }, { "epoch": 1.0581632653061224, "grad_norm": 0.029016492888331413, "learning_rate": 0.0001453024999208946, "loss": 1.1021, "step": 264 }, { "epoch": 1.0622448979591836, "grad_norm": 0.029628349468111992, "learning_rate": 0.00014491576919258792, "loss": 1.1601, "step": 265 }, { "epoch": 1.066326530612245, "grad_norm": 0.03196660056710243, "learning_rate": 0.00014452819508700723, "loss": 1.0135, "step": 266 }, { "epoch": 1.0704081632653062, "grad_norm": 0.029348861426115036, "learning_rate": 0.00014413978488157867, "loss": 1.0943, "step": 267 }, { "epoch": 1.0744897959183675, "grad_norm": 0.030313340947031975, "learning_rate": 0.0001437505458694277, "loss": 1.1161, "step": 268 }, { "epoch": 1.0785714285714285, "grad_norm": 0.030253972858190536, "learning_rate": 0.00014336048535924223, "loss": 1.151, "step": 269 }, { "epoch": 1.0826530612244898, "grad_norm": 0.03184593468904495, "learning_rate": 0.0001429696106751352, "loss": 1.1009, "step": 270 }, { "epoch": 1.086734693877551, "grad_norm": 0.03207677975296974, "learning_rate": 0.00014257792915650728, "loss": 1.104, "step": 271 }, { "epoch": 1.0908163265306123, "grad_norm": 0.02996198646724224, "learning_rate": 0.00014218544815790897, "loss": 1.0853, "step": 272 }, { "epoch": 1.0948979591836734, "grad_norm": 0.03296904265880585, "learning_rate": 0.00014179217504890242, "loss": 1.1149, "step": 273 }, { "epoch": 1.0989795918367347, "grad_norm": 0.03023442253470421, "learning_rate": 0.00014139811721392324, "loss": 1.1763, "step": 274 }, { "epoch": 1.103061224489796, "grad_norm": 0.03454556316137314, "learning_rate": 0.0001410032820521416, "loss": 1.2259, "step": 275 }, { "epoch": 1.1071428571428572, "grad_norm": 0.030207209289073944, "learning_rate": 0.00014060767697732354, "loss": 1.1265, "step": 276 }, { "epoch": 1.1112244897959185, "grad_norm": 0.030806340277194977, "learning_rate": 0.00014021130941769164, "loss": 1.0964, "step": 277 }, { "epoch": 1.1153061224489795, "grad_norm": 0.030314523726701736, "learning_rate": 0.00013981418681578546, "loss": 1.144, "step": 278 }, { "epoch": 1.1193877551020408, "grad_norm": 0.03102901391685009, "learning_rate": 0.00013941631662832199, "loss": 1.0801, "step": 279 }, { "epoch": 1.123469387755102, "grad_norm": 0.031429093331098557, "learning_rate": 0.00013901770632605547, "loss": 1.041, "step": 280 }, { "epoch": 1.1275510204081634, "grad_norm": 0.030079476535320282, "learning_rate": 0.0001386183633936372, "loss": 1.027, "step": 281 }, { "epoch": 1.1316326530612244, "grad_norm": 0.03151247650384903, "learning_rate": 0.00013821829532947497, "loss": 1.1443, "step": 282 }, { "epoch": 1.1357142857142857, "grad_norm": 0.031054867431521416, "learning_rate": 0.0001378175096455923, "loss": 1.0623, "step": 283 }, { "epoch": 1.139795918367347, "grad_norm": 0.03046008013188839, "learning_rate": 0.00013741601386748728, "loss": 0.9639, "step": 284 }, { "epoch": 1.1438775510204082, "grad_norm": 0.03275845944881439, "learning_rate": 0.00013701381553399145, "loss": 1.0596, "step": 285 }, { "epoch": 1.1479591836734695, "grad_norm": 0.030060578137636185, "learning_rate": 0.00013661092219712802, "loss": 1.0824, "step": 286 }, { "epoch": 1.1520408163265305, "grad_norm": 0.03152566775679588, "learning_rate": 0.00013620734142197032, "loss": 1.0503, "step": 287 }, { "epoch": 1.1561224489795918, "grad_norm": 0.03527853637933731, "learning_rate": 0.0001358030807864995, "loss": 1.1604, "step": 288 }, { "epoch": 1.160204081632653, "grad_norm": 0.0333213284611702, "learning_rate": 0.00013539814788146235, "loss": 1.1204, "step": 289 }, { "epoch": 1.1642857142857144, "grad_norm": 0.03294171392917633, "learning_rate": 0.00013499255031022885, "loss": 1.1409, "step": 290 }, { "epoch": 1.1683673469387754, "grad_norm": 0.03201167285442352, "learning_rate": 0.0001345862956886493, "loss": 1.1238, "step": 291 }, { "epoch": 1.1724489795918367, "grad_norm": 0.031082091853022575, "learning_rate": 0.00013417939164491136, "loss": 1.1676, "step": 292 }, { "epoch": 1.176530612244898, "grad_norm": 0.03172369301319122, "learning_rate": 0.00013377184581939673, "loss": 1.0529, "step": 293 }, { "epoch": 1.1806122448979592, "grad_norm": 0.029640652239322662, "learning_rate": 0.00013336366586453783, "loss": 1.1071, "step": 294 }, { "epoch": 1.1846938775510205, "grad_norm": 0.03119003400206566, "learning_rate": 0.00013295485944467405, "loss": 1.1205, "step": 295 }, { "epoch": 1.1887755102040816, "grad_norm": 0.03193094953894615, "learning_rate": 0.00013254543423590773, "loss": 1.0767, "step": 296 }, { "epoch": 1.1928571428571428, "grad_norm": 0.032400280237197876, "learning_rate": 0.00013213539792596027, "loss": 1.1358, "step": 297 }, { "epoch": 1.1969387755102041, "grad_norm": 0.0319623164832592, "learning_rate": 0.00013172475821402748, "loss": 1.0308, "step": 298 }, { "epoch": 1.2010204081632654, "grad_norm": 0.03384249284863472, "learning_rate": 0.0001313135228106353, "loss": 1.0548, "step": 299 }, { "epoch": 1.2051020408163264, "grad_norm": 0.030600961297750473, "learning_rate": 0.00013090169943749476, "loss": 0.983, "step": 300 }, { "epoch": 1.2091836734693877, "grad_norm": 0.03205844759941101, "learning_rate": 0.00013048929582735723, "loss": 1.041, "step": 301 }, { "epoch": 1.213265306122449, "grad_norm": 0.032236404716968536, "learning_rate": 0.00013007631972386912, "loss": 1.0697, "step": 302 }, { "epoch": 1.2173469387755103, "grad_norm": 0.03104401007294655, "learning_rate": 0.00012966277888142632, "loss": 1.059, "step": 303 }, { "epoch": 1.2214285714285715, "grad_norm": 0.03168416768312454, "learning_rate": 0.0001292486810650289, "loss": 1.1379, "step": 304 }, { "epoch": 1.2255102040816326, "grad_norm": 0.03132764995098114, "learning_rate": 0.0001288340340501351, "loss": 1.1364, "step": 305 }, { "epoch": 1.2295918367346939, "grad_norm": 0.03303636610507965, "learning_rate": 0.0001284188456225153, "loss": 1.0464, "step": 306 }, { "epoch": 1.2336734693877551, "grad_norm": 0.03127754479646683, "learning_rate": 0.00012800312357810615, "loss": 1.0639, "step": 307 }, { "epoch": 1.2377551020408164, "grad_norm": 0.03249693289399147, "learning_rate": 0.00012758687572286367, "loss": 1.0653, "step": 308 }, { "epoch": 1.2418367346938775, "grad_norm": 0.03227215260267258, "learning_rate": 0.00012717010987261715, "loss": 1.1142, "step": 309 }, { "epoch": 1.2459183673469387, "grad_norm": 0.03237595036625862, "learning_rate": 0.00012675283385292212, "loss": 1.1, "step": 310 }, { "epoch": 1.25, "grad_norm": 0.0319119468331337, "learning_rate": 0.00012633505549891345, "loss": 1.1782, "step": 311 }, { "epoch": 1.2540816326530613, "grad_norm": 0.033556390553712845, "learning_rate": 0.00012591678265515838, "loss": 1.181, "step": 312 }, { "epoch": 1.2581632653061225, "grad_norm": 0.03282499313354492, "learning_rate": 0.00012549802317550908, "loss": 1.0193, "step": 313 }, { "epoch": 1.2622448979591836, "grad_norm": 0.031234145164489746, "learning_rate": 0.0001250787849229552, "loss": 1.0906, "step": 314 }, { "epoch": 1.2663265306122449, "grad_norm": 0.032275062054395676, "learning_rate": 0.00012465907576947622, "loss": 1.0731, "step": 315 }, { "epoch": 1.2704081632653061, "grad_norm": 0.032516300678253174, "learning_rate": 0.00012423890359589368, "loss": 1.0989, "step": 316 }, { "epoch": 1.2744897959183674, "grad_norm": 0.030433477833867073, "learning_rate": 0.00012381827629172324, "loss": 1.0506, "step": 317 }, { "epoch": 1.2785714285714285, "grad_norm": 0.03216548636555672, "learning_rate": 0.00012339720175502642, "loss": 1.0797, "step": 318 }, { "epoch": 1.2826530612244897, "grad_norm": 0.03167072683572769, "learning_rate": 0.00012297568789226238, "loss": 1.1299, "step": 319 }, { "epoch": 1.286734693877551, "grad_norm": 0.031179511919617653, "learning_rate": 0.00012255374261813944, "loss": 1.2068, "step": 320 }, { "epoch": 1.2908163265306123, "grad_norm": 0.0317654013633728, "learning_rate": 0.00012213137385546648, "loss": 1.0542, "step": 321 }, { "epoch": 1.2948979591836736, "grad_norm": 0.03267383947968483, "learning_rate": 0.00012170858953500417, "loss": 1.1773, "step": 322 }, { "epoch": 1.2989795918367346, "grad_norm": 0.03213581070303917, "learning_rate": 0.0001212853975953161, "loss": 1.0293, "step": 323 }, { "epoch": 1.3030612244897959, "grad_norm": 0.033722877502441406, "learning_rate": 0.00012086180598261956, "loss": 1.1718, "step": 324 }, { "epoch": 1.3071428571428572, "grad_norm": 0.03295879811048508, "learning_rate": 0.0001204378226506365, "loss": 1.0517, "step": 325 }, { "epoch": 1.3112244897959184, "grad_norm": 0.0332418717443943, "learning_rate": 0.00012001345556044416, "loss": 1.0805, "step": 326 }, { "epoch": 1.3153061224489795, "grad_norm": 0.03321802616119385, "learning_rate": 0.00011958871268032554, "loss": 1.1807, "step": 327 }, { "epoch": 1.3193877551020408, "grad_norm": 0.03372679278254509, "learning_rate": 0.0001191636019856198, "loss": 1.1513, "step": 328 }, { "epoch": 1.3193877551020408, "eval_loss": 1.1087149381637573, "eval_runtime": 120.2761, "eval_samples_per_second": 20.303, "eval_steps_per_second": 10.152, "step": 328 }, { "epoch": 1.323469387755102, "grad_norm": 0.03233565017580986, "learning_rate": 0.00011873813145857249, "loss": 1.132, "step": 329 }, { "epoch": 1.3275510204081633, "grad_norm": 0.032153040170669556, "learning_rate": 0.00011831230908818563, "loss": 1.1377, "step": 330 }, { "epoch": 1.3316326530612246, "grad_norm": 0.032428547739982605, "learning_rate": 0.00011788614287006786, "loss": 1.09, "step": 331 }, { "epoch": 1.3357142857142856, "grad_norm": 0.0352560319006443, "learning_rate": 0.00011745964080628411, "loss": 1.0845, "step": 332 }, { "epoch": 1.339795918367347, "grad_norm": 0.03259318321943283, "learning_rate": 0.00011703281090520552, "loss": 1.1122, "step": 333 }, { "epoch": 1.3438775510204082, "grad_norm": 0.03463459014892578, "learning_rate": 0.00011660566118135894, "loss": 1.0425, "step": 334 }, { "epoch": 1.3479591836734695, "grad_norm": 0.03391732648015022, "learning_rate": 0.0001161781996552765, "loss": 1.0356, "step": 335 }, { "epoch": 1.3520408163265305, "grad_norm": 0.03276665136218071, "learning_rate": 0.00011575043435334494, "loss": 1.052, "step": 336 }, { "epoch": 1.3561224489795918, "grad_norm": 0.03379301726818085, "learning_rate": 0.00011532237330765507, "loss": 1.0791, "step": 337 }, { "epoch": 1.360204081632653, "grad_norm": 0.031217332929372787, "learning_rate": 0.00011489402455585076, "loss": 1.0145, "step": 338 }, { "epoch": 1.3642857142857143, "grad_norm": 0.03169461712241173, "learning_rate": 0.00011446539614097813, "loss": 1.1603, "step": 339 }, { "epoch": 1.3683673469387756, "grad_norm": 0.034710437059402466, "learning_rate": 0.00011403649611133444, "loss": 1.0246, "step": 340 }, { "epoch": 1.3724489795918369, "grad_norm": 0.03189219534397125, "learning_rate": 0.0001136073325203171, "loss": 1.0795, "step": 341 }, { "epoch": 1.376530612244898, "grad_norm": 0.03188316524028778, "learning_rate": 0.00011317791342627238, "loss": 1.0462, "step": 342 }, { "epoch": 1.3806122448979592, "grad_norm": 0.03283124417066574, "learning_rate": 0.000112748246892344, "loss": 1.1128, "step": 343 }, { "epoch": 1.3846938775510205, "grad_norm": 0.03310257941484451, "learning_rate": 0.0001123183409863219, "loss": 1.0864, "step": 344 }, { "epoch": 1.3887755102040815, "grad_norm": 0.035449717193841934, "learning_rate": 0.00011188820378049065, "loss": 1.1329, "step": 345 }, { "epoch": 1.3928571428571428, "grad_norm": 0.03374037146568298, "learning_rate": 0.00011145784335147793, "loss": 1.1011, "step": 346 }, { "epoch": 1.396938775510204, "grad_norm": 0.03215891122817993, "learning_rate": 0.00011102726778010288, "loss": 1.0289, "step": 347 }, { "epoch": 1.4010204081632653, "grad_norm": 0.03225525841116905, "learning_rate": 0.00011059648515122424, "loss": 1.0744, "step": 348 }, { "epoch": 1.4051020408163266, "grad_norm": 0.032239388674497604, "learning_rate": 0.00011016550355358872, "loss": 1.0904, "step": 349 }, { "epoch": 1.4091836734693879, "grad_norm": 0.03210952505469322, "learning_rate": 0.00010973433107967902, "loss": 1.1008, "step": 350 }, { "epoch": 1.413265306122449, "grad_norm": 0.0324125811457634, "learning_rate": 0.00010930297582556181, "loss": 1.0871, "step": 351 }, { "epoch": 1.4173469387755102, "grad_norm": 0.034373488277196884, "learning_rate": 0.00010887144589073602, "loss": 0.9905, "step": 352 }, { "epoch": 1.4214285714285715, "grad_norm": 0.03328999504446983, "learning_rate": 0.00010843974937798029, "loss": 1.0666, "step": 353 }, { "epoch": 1.4255102040816325, "grad_norm": 0.03199386224150658, "learning_rate": 0.00010800789439320128, "loss": 1.0453, "step": 354 }, { "epoch": 1.4295918367346938, "grad_norm": 0.03383859619498253, "learning_rate": 0.00010757588904528106, "loss": 0.956, "step": 355 }, { "epoch": 1.433673469387755, "grad_norm": 0.03234272822737694, "learning_rate": 0.00010714374144592516, "loss": 1.0641, "step": 356 }, { "epoch": 1.4377551020408164, "grad_norm": 0.031517963856458664, "learning_rate": 0.0001067114597095102, "loss": 1.109, "step": 357 }, { "epoch": 1.4418367346938776, "grad_norm": 0.03353012353181839, "learning_rate": 0.00010627905195293135, "loss": 1.0613, "step": 358 }, { "epoch": 1.445918367346939, "grad_norm": 0.034205447882413864, "learning_rate": 0.00010584652629545011, "loss": 1.1648, "step": 359 }, { "epoch": 1.45, "grad_norm": 0.031480543315410614, "learning_rate": 0.00010541389085854176, "loss": 1.1624, "step": 360 }, { "epoch": 1.4540816326530612, "grad_norm": 0.03325892984867096, "learning_rate": 0.00010498115376574294, "loss": 1.1139, "step": 361 }, { "epoch": 1.4581632653061225, "grad_norm": 0.033394064754247665, "learning_rate": 0.00010454832314249902, "loss": 1.0492, "step": 362 }, { "epoch": 1.4622448979591836, "grad_norm": 0.03387491777539253, "learning_rate": 0.0001041154071160116, "loss": 1.0856, "step": 363 }, { "epoch": 1.4663265306122448, "grad_norm": 0.03363283723592758, "learning_rate": 0.0001036824138150859, "loss": 1.0744, "step": 364 }, { "epoch": 1.470408163265306, "grad_norm": 0.03499883785843849, "learning_rate": 0.00010324935136997806, "loss": 1.0982, "step": 365 }, { "epoch": 1.4744897959183674, "grad_norm": 0.03243999928236008, "learning_rate": 0.00010281622791224257, "loss": 1.0581, "step": 366 }, { "epoch": 1.4785714285714286, "grad_norm": 0.03668928146362305, "learning_rate": 0.0001023830515745796, "loss": 1.1081, "step": 367 }, { "epoch": 1.48265306122449, "grad_norm": 0.03404795750975609, "learning_rate": 0.00010194983049068212, "loss": 1.0435, "step": 368 }, { "epoch": 1.486734693877551, "grad_norm": 0.03391297161579132, "learning_rate": 0.00010151657279508336, "loss": 1.0651, "step": 369 }, { "epoch": 1.4908163265306122, "grad_norm": 0.033831968903541565, "learning_rate": 0.000101083286623004, "loss": 1.1807, "step": 370 }, { "epoch": 1.4948979591836735, "grad_norm": 0.03301573544740677, "learning_rate": 0.00010064998011019944, "loss": 0.9622, "step": 371 }, { "epoch": 1.4989795918367346, "grad_norm": 0.03434665501117706, "learning_rate": 0.00010021666139280697, "loss": 1.0813, "step": 372 }, { "epoch": 1.5030612244897958, "grad_norm": 0.033603403717279434, "learning_rate": 9.978333860719306e-05, "loss": 1.1015, "step": 373 }, { "epoch": 1.5071428571428571, "grad_norm": 0.033279791474342346, "learning_rate": 9.935001988980061e-05, "loss": 1.0389, "step": 374 }, { "epoch": 1.5112244897959184, "grad_norm": 0.032859690487384796, "learning_rate": 9.891671337699602e-05, "loss": 1.0042, "step": 375 }, { "epoch": 1.5153061224489797, "grad_norm": 0.033149540424346924, "learning_rate": 9.848342720491665e-05, "loss": 1.1924, "step": 376 }, { "epoch": 1.519387755102041, "grad_norm": 0.03327937051653862, "learning_rate": 9.805016950931792e-05, "loss": 1.0259, "step": 377 }, { "epoch": 1.523469387755102, "grad_norm": 0.03412073478102684, "learning_rate": 9.76169484254204e-05, "loss": 1.1495, "step": 378 }, { "epoch": 1.5275510204081633, "grad_norm": 0.034567687660455704, "learning_rate": 9.718377208775744e-05, "loss": 1.0706, "step": 379 }, { "epoch": 1.5316326530612245, "grad_norm": 0.03373974934220314, "learning_rate": 9.675064863002196e-05, "loss": 1.0123, "step": 380 }, { "epoch": 1.5357142857142856, "grad_norm": 0.03521249070763588, "learning_rate": 9.631758618491415e-05, "loss": 1.0909, "step": 381 }, { "epoch": 1.5397959183673469, "grad_norm": 0.03452066332101822, "learning_rate": 9.588459288398841e-05, "loss": 1.0786, "step": 382 }, { "epoch": 1.5438775510204081, "grad_norm": 0.033778153359889984, "learning_rate": 9.545167685750099e-05, "loss": 1.1406, "step": 383 }, { "epoch": 1.5479591836734694, "grad_norm": 0.033918268978595734, "learning_rate": 9.50188462342571e-05, "loss": 0.9496, "step": 384 }, { "epoch": 1.5520408163265307, "grad_norm": 0.03323463723063469, "learning_rate": 9.458610914145826e-05, "loss": 1.1372, "step": 385 }, { "epoch": 1.556122448979592, "grad_norm": 0.03271915018558502, "learning_rate": 9.415347370454992e-05, "loss": 1.1697, "step": 386 }, { "epoch": 1.560204081632653, "grad_norm": 0.03397282212972641, "learning_rate": 9.372094804706867e-05, "loss": 1.0906, "step": 387 }, { "epoch": 1.5642857142857143, "grad_norm": 0.03263228386640549, "learning_rate": 9.328854029048984e-05, "loss": 1.1572, "step": 388 }, { "epoch": 1.5683673469387756, "grad_norm": 0.034290216863155365, "learning_rate": 9.285625855407484e-05, "loss": 0.9682, "step": 389 }, { "epoch": 1.5724489795918366, "grad_norm": 0.03301115706562996, "learning_rate": 9.242411095471897e-05, "loss": 1.2016, "step": 390 }, { "epoch": 1.5765306122448979, "grad_norm": 0.03409136086702347, "learning_rate": 9.199210560679876e-05, "loss": 1.0756, "step": 391 }, { "epoch": 1.5806122448979592, "grad_norm": 0.03300061076879501, "learning_rate": 9.15602506220197e-05, "loss": 1.0968, "step": 392 }, { "epoch": 1.5846938775510204, "grad_norm": 0.03417652100324631, "learning_rate": 9.112855410926403e-05, "loss": 1.0295, "step": 393 }, { "epoch": 1.5887755102040817, "grad_norm": 0.034996576607227325, "learning_rate": 9.069702417443821e-05, "loss": 1.1373, "step": 394 }, { "epoch": 1.592857142857143, "grad_norm": 0.03447016328573227, "learning_rate": 9.026566892032105e-05, "loss": 1.0964, "step": 395 }, { "epoch": 1.5969387755102042, "grad_norm": 0.037098806351423264, "learning_rate": 8.98344964464113e-05, "loss": 1.0036, "step": 396 }, { "epoch": 1.6010204081632653, "grad_norm": 0.033991578966379166, "learning_rate": 8.940351484877577e-05, "loss": 1.0671, "step": 397 }, { "epoch": 1.6051020408163266, "grad_norm": 0.03501284494996071, "learning_rate": 8.897273221989714e-05, "loss": 1.0848, "step": 398 }, { "epoch": 1.6091836734693876, "grad_norm": 0.03478754311800003, "learning_rate": 8.854215664852206e-05, "loss": 1.086, "step": 399 }, { "epoch": 1.613265306122449, "grad_norm": 0.03527893126010895, "learning_rate": 8.811179621950936e-05, "loss": 1.1172, "step": 400 }, { "epoch": 1.6173469387755102, "grad_norm": 0.03505920246243477, "learning_rate": 8.768165901367812e-05, "loss": 1.1382, "step": 401 }, { "epoch": 1.6214285714285714, "grad_norm": 0.03398558497428894, "learning_rate": 8.725175310765605e-05, "loss": 1.1622, "step": 402 }, { "epoch": 1.6255102040816327, "grad_norm": 0.03405265510082245, "learning_rate": 8.682208657372766e-05, "loss": 1.066, "step": 403 }, { "epoch": 1.629591836734694, "grad_norm": 0.033411748707294464, "learning_rate": 8.63926674796829e-05, "loss": 1.0183, "step": 404 }, { "epoch": 1.6336734693877553, "grad_norm": 0.03483176976442337, "learning_rate": 8.596350388866558e-05, "loss": 1.0862, "step": 405 }, { "epoch": 1.6377551020408163, "grad_norm": 0.0341670848429203, "learning_rate": 8.553460385902189e-05, "loss": 1.1641, "step": 406 }, { "epoch": 1.6418367346938776, "grad_norm": 0.03399474918842316, "learning_rate": 8.510597544414927e-05, "loss": 1.0483, "step": 407 }, { "epoch": 1.6459183673469386, "grad_norm": 0.03335323557257652, "learning_rate": 8.467762669234495e-05, "loss": 1.0856, "step": 408 }, { "epoch": 1.65, "grad_norm": 0.035050079226493835, "learning_rate": 8.424956564665508e-05, "loss": 1.0424, "step": 409 }, { "epoch": 1.6540816326530612, "grad_norm": 0.033788349479436874, "learning_rate": 8.382180034472353e-05, "loss": 1.0978, "step": 410 }, { "epoch": 1.6540816326530612, "eval_loss": 1.1044933795928955, "eval_runtime": 122.6879, "eval_samples_per_second": 19.904, "eval_steps_per_second": 9.952, "step": 410 }, { "epoch": 1.6581632653061225, "grad_norm": 0.0354621559381485, "learning_rate": 8.339433881864107e-05, "loss": 1.0069, "step": 411 }, { "epoch": 1.6622448979591837, "grad_norm": 0.03387383744120598, "learning_rate": 8.29671890947945e-05, "loss": 1.0669, "step": 412 }, { "epoch": 1.666326530612245, "grad_norm": 0.03535078838467598, "learning_rate": 8.25403591937159e-05, "loss": 1.0917, "step": 413 }, { "epoch": 1.6704081632653063, "grad_norm": 0.036951616406440735, "learning_rate": 8.211385712993218e-05, "loss": 1.0091, "step": 414 }, { "epoch": 1.6744897959183673, "grad_norm": 0.03283822536468506, "learning_rate": 8.168769091181438e-05, "loss": 1.134, "step": 415 }, { "epoch": 1.6785714285714286, "grad_norm": 0.03339226171374321, "learning_rate": 8.126186854142752e-05, "loss": 1.0092, "step": 416 }, { "epoch": 1.6826530612244897, "grad_norm": 0.03598244488239288, "learning_rate": 8.083639801438021e-05, "loss": 1.0363, "step": 417 }, { "epoch": 1.686734693877551, "grad_norm": 0.03570684790611267, "learning_rate": 8.041128731967444e-05, "loss": 1.1585, "step": 418 }, { "epoch": 1.6908163265306122, "grad_norm": 0.03431576117873192, "learning_rate": 7.998654443955586e-05, "loss": 1.1218, "step": 419 }, { "epoch": 1.6948979591836735, "grad_norm": 0.04130874201655388, "learning_rate": 7.956217734936353e-05, "loss": 1.0716, "step": 420 }, { "epoch": 1.6989795918367347, "grad_norm": 0.03377668932080269, "learning_rate": 7.913819401738049e-05, "loss": 1.1883, "step": 421 }, { "epoch": 1.703061224489796, "grad_norm": 0.034607548266649246, "learning_rate": 7.871460240468394e-05, "loss": 1.069, "step": 422 }, { "epoch": 1.7071428571428573, "grad_norm": 0.03309245780110359, "learning_rate": 7.829141046499581e-05, "loss": 1.0604, "step": 423 }, { "epoch": 1.7112244897959183, "grad_norm": 0.03462745249271393, "learning_rate": 7.786862614453355e-05, "loss": 1.163, "step": 424 }, { "epoch": 1.7153061224489796, "grad_norm": 0.03639388829469681, "learning_rate": 7.744625738186059e-05, "loss": 1.0962, "step": 425 }, { "epoch": 1.7193877551020407, "grad_norm": 0.03333800658583641, "learning_rate": 7.702431210773766e-05, "loss": 1.0276, "step": 426 }, { "epoch": 1.723469387755102, "grad_norm": 0.03520345687866211, "learning_rate": 7.660279824497359e-05, "loss": 1.1203, "step": 427 }, { "epoch": 1.7275510204081632, "grad_norm": 0.03311832249164581, "learning_rate": 7.61817237082768e-05, "loss": 1.0798, "step": 428 }, { "epoch": 1.7316326530612245, "grad_norm": 0.033078547567129135, "learning_rate": 7.576109640410633e-05, "loss": 1.0593, "step": 429 }, { "epoch": 1.7357142857142858, "grad_norm": 0.0346568301320076, "learning_rate": 7.534092423052381e-05, "loss": 1.0018, "step": 430 }, { "epoch": 1.739795918367347, "grad_norm": 0.033940769731998444, "learning_rate": 7.492121507704483e-05, "loss": 1.1602, "step": 431 }, { "epoch": 1.7438775510204083, "grad_norm": 0.03488701581954956, "learning_rate": 7.450197682449092e-05, "loss": 1.1494, "step": 432 }, { "epoch": 1.7479591836734694, "grad_norm": 0.03421643003821373, "learning_rate": 7.408321734484164e-05, "loss": 1.0298, "step": 433 }, { "epoch": 1.7520408163265306, "grad_norm": 0.036979790776968, "learning_rate": 7.366494450108659e-05, "loss": 1.0425, "step": 434 }, { "epoch": 1.7561224489795917, "grad_norm": 0.035175107419490814, "learning_rate": 7.324716614707793e-05, "loss": 1.0458, "step": 435 }, { "epoch": 1.760204081632653, "grad_norm": 0.035041388124227524, "learning_rate": 7.282989012738286e-05, "loss": 1.1375, "step": 436 }, { "epoch": 1.7642857142857142, "grad_norm": 0.034867141395807266, "learning_rate": 7.241312427713631e-05, "loss": 1.1873, "step": 437 }, { "epoch": 1.7683673469387755, "grad_norm": 0.034732844680547714, "learning_rate": 7.199687642189387e-05, "loss": 1.0746, "step": 438 }, { "epoch": 1.7724489795918368, "grad_norm": 0.033908359706401825, "learning_rate": 7.158115437748468e-05, "loss": 1.1305, "step": 439 }, { "epoch": 1.776530612244898, "grad_norm": 0.035412538796663284, "learning_rate": 7.116596594986494e-05, "loss": 1.0872, "step": 440 }, { "epoch": 1.7806122448979593, "grad_norm": 0.035827115178108215, "learning_rate": 7.075131893497111e-05, "loss": 1.1072, "step": 441 }, { "epoch": 1.7846938775510204, "grad_norm": 0.03458938002586365, "learning_rate": 7.033722111857373e-05, "loss": 1.0465, "step": 442 }, { "epoch": 1.7887755102040817, "grad_norm": 0.035993266850709915, "learning_rate": 6.992368027613092e-05, "loss": 1.1074, "step": 443 }, { "epoch": 1.7928571428571427, "grad_norm": 0.03477892279624939, "learning_rate": 6.951070417264277e-05, "loss": 1.0286, "step": 444 }, { "epoch": 1.796938775510204, "grad_norm": 0.03328714147210121, "learning_rate": 6.909830056250527e-05, "loss": 1.0954, "step": 445 }, { "epoch": 1.8010204081632653, "grad_norm": 0.03618752956390381, "learning_rate": 6.868647718936474e-05, "loss": 1.1123, "step": 446 }, { "epoch": 1.8051020408163265, "grad_norm": 0.034587301313877106, "learning_rate": 6.827524178597257e-05, "loss": 1.1773, "step": 447 }, { "epoch": 1.8091836734693878, "grad_norm": 0.03735874965786934, "learning_rate": 6.786460207403978e-05, "loss": 1.0689, "step": 448 }, { "epoch": 1.813265306122449, "grad_norm": 0.03305177018046379, "learning_rate": 6.745456576409227e-05, "loss": 1.1362, "step": 449 }, { "epoch": 1.8173469387755103, "grad_norm": 0.035907987505197525, "learning_rate": 6.704514055532597e-05, "loss": 1.1533, "step": 450 }, { "epoch": 1.8214285714285714, "grad_norm": 0.036774273961782455, "learning_rate": 6.663633413546215e-05, "loss": 1.1216, "step": 451 }, { "epoch": 1.8255102040816327, "grad_norm": 0.03590575233101845, "learning_rate": 6.622815418060329e-05, "loss": 0.993, "step": 452 }, { "epoch": 1.8295918367346937, "grad_norm": 0.03518223017454147, "learning_rate": 6.582060835508867e-05, "loss": 1.1572, "step": 453 }, { "epoch": 1.833673469387755, "grad_norm": 0.035274531692266464, "learning_rate": 6.541370431135072e-05, "loss": 1.0663, "step": 454 }, { "epoch": 1.8377551020408163, "grad_norm": 0.03429022058844566, "learning_rate": 6.500744968977116e-05, "loss": 1.2148, "step": 455 }, { "epoch": 1.8418367346938775, "grad_norm": 0.03577962517738342, "learning_rate": 6.460185211853766e-05, "loss": 1.1339, "step": 456 }, { "epoch": 1.8459183673469388, "grad_norm": 0.03668006509542465, "learning_rate": 6.419691921350056e-05, "loss": 1.1638, "step": 457 }, { "epoch": 1.85, "grad_norm": 0.034531764686107635, "learning_rate": 6.379265857802969e-05, "loss": 1.1482, "step": 458 }, { "epoch": 1.8540816326530614, "grad_norm": 0.036792222410440445, "learning_rate": 6.338907780287197e-05, "loss": 1.0448, "step": 459 }, { "epoch": 1.8581632653061224, "grad_norm": 0.03427064046263695, "learning_rate": 6.298618446600856e-05, "loss": 1.1263, "step": 460 }, { "epoch": 1.8622448979591837, "grad_norm": 0.03595108166337013, "learning_rate": 6.258398613251275e-05, "loss": 1.0291, "step": 461 }, { "epoch": 1.8663265306122447, "grad_norm": 0.03683997690677643, "learning_rate": 6.218249035440774e-05, "loss": 1.0407, "step": 462 }, { "epoch": 1.870408163265306, "grad_norm": 0.03469974175095558, "learning_rate": 6.178170467052504e-05, "loss": 1.0175, "step": 463 }, { "epoch": 1.8744897959183673, "grad_norm": 0.03577972948551178, "learning_rate": 6.138163660636284e-05, "loss": 1.0599, "step": 464 }, { "epoch": 1.8785714285714286, "grad_norm": 0.03648792952299118, "learning_rate": 6.0982293673944544e-05, "loss": 1.0879, "step": 465 }, { "epoch": 1.8826530612244898, "grad_norm": 0.03627919778227806, "learning_rate": 6.0583683371678045e-05, "loss": 1.065, "step": 466 }, { "epoch": 1.886734693877551, "grad_norm": 0.03576623648405075, "learning_rate": 6.0185813184214546e-05, "loss": 1.1024, "step": 467 }, { "epoch": 1.8908163265306124, "grad_norm": 0.03572770580649376, "learning_rate": 5.9788690582308404e-05, "loss": 1.1966, "step": 468 }, { "epoch": 1.8948979591836734, "grad_norm": 0.03741719573736191, "learning_rate": 5.9392323022676454e-05, "loss": 1.1496, "step": 469 }, { "epoch": 1.8989795918367347, "grad_norm": 0.03401537984609604, "learning_rate": 5.899671794785839e-05, "loss": 1.027, "step": 470 }, { "epoch": 1.9030612244897958, "grad_norm": 0.036519624292850494, "learning_rate": 5.8601882786076787e-05, "loss": 1.0297, "step": 471 }, { "epoch": 1.907142857142857, "grad_norm": 0.03723987936973572, "learning_rate": 5.82078249510976e-05, "loss": 1.1775, "step": 472 }, { "epoch": 1.9112244897959183, "grad_norm": 0.036263398826122284, "learning_rate": 5.781455184209106e-05, "loss": 1.1191, "step": 473 }, { "epoch": 1.9153061224489796, "grad_norm": 0.03478455916047096, "learning_rate": 5.7422070843492734e-05, "loss": 1.085, "step": 474 }, { "epoch": 1.9193877551020408, "grad_norm": 0.034283410757780075, "learning_rate": 5.703038932486484e-05, "loss": 1.1994, "step": 475 }, { "epoch": 1.9234693877551021, "grad_norm": 0.03725408762693405, "learning_rate": 5.663951464075781e-05, "loss": 1.1018, "step": 476 }, { "epoch": 1.9275510204081634, "grad_norm": 0.045808345079422, "learning_rate": 5.624945413057229e-05, "loss": 1.1621, "step": 477 }, { "epoch": 1.9316326530612244, "grad_norm": 0.035091597586870193, "learning_rate": 5.586021511842136e-05, "loss": 1.0313, "step": 478 }, { "epoch": 1.9357142857142857, "grad_norm": 0.03547152504324913, "learning_rate": 5.547180491299279e-05, "loss": 1.1557, "step": 479 }, { "epoch": 1.939795918367347, "grad_norm": 0.03589658439159393, "learning_rate": 5.5084230807412126e-05, "loss": 1.1496, "step": 480 }, { "epoch": 1.943877551020408, "grad_norm": 0.03566233068704605, "learning_rate": 5.4697500079105446e-05, "loss": 1.1116, "step": 481 }, { "epoch": 1.9479591836734693, "grad_norm": 0.03529804199934006, "learning_rate": 5.431161998966292e-05, "loss": 1.0662, "step": 482 }, { "epoch": 1.9520408163265306, "grad_norm": 0.03480341657996178, "learning_rate": 5.392659778470239e-05, "loss": 1.1851, "step": 483 }, { "epoch": 1.9561224489795919, "grad_norm": 0.03738265112042427, "learning_rate": 5.35424406937333e-05, "loss": 1.101, "step": 484 }, { "epoch": 1.9602040816326531, "grad_norm": 0.036112599074840546, "learning_rate": 5.3159155930021e-05, "loss": 1.0289, "step": 485 }, { "epoch": 1.9642857142857144, "grad_norm": 0.03390064463019371, "learning_rate": 5.277675069045116e-05, "loss": 0.9845, "step": 486 }, { "epoch": 1.9683673469387755, "grad_norm": 0.03706733137369156, "learning_rate": 5.239523215539491e-05, "loss": 1.0594, "step": 487 }, { "epoch": 1.9724489795918367, "grad_norm": 0.0366055890917778, "learning_rate": 5.201460748857369e-05, "loss": 0.9991, "step": 488 }, { "epoch": 1.976530612244898, "grad_norm": 0.03541799634695053, "learning_rate": 5.163488383692499e-05, "loss": 1.141, "step": 489 }, { "epoch": 1.980612244897959, "grad_norm": 0.035669054836034775, "learning_rate": 5.12560683304681e-05, "loss": 1.0591, "step": 490 } ], "logging_steps": 1, "max_steps": 735, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 245, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.908410477859308e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }