diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,70193 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8769177437454527, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 8.769177437454527e-05, + "grad_norm": 0.04833984375, + "learning_rate": 3e-06, + "loss": 1.1658, + "step": 1 + }, + { + "epoch": 0.00017538354874909053, + "grad_norm": 0.04296875, + "learning_rate": 6e-06, + "loss": 1.1284, + "step": 2 + }, + { + "epoch": 0.0002630753231236358, + "grad_norm": 0.04931640625, + "learning_rate": 9e-06, + "loss": 1.0655, + "step": 3 + }, + { + "epoch": 0.00035076709749818106, + "grad_norm": 0.04931640625, + "learning_rate": 1.2e-05, + "loss": 1.2313, + "step": 4 + }, + { + "epoch": 0.00043845887187272633, + "grad_norm": 0.048583984375, + "learning_rate": 1.5e-05, + "loss": 1.1557, + "step": 5 + }, + { + "epoch": 0.0005261506462472716, + "grad_norm": 0.05126953125, + "learning_rate": 1.8e-05, + "loss": 1.1887, + "step": 6 + }, + { + "epoch": 0.0006138424206218169, + "grad_norm": 0.052490234375, + "learning_rate": 2.1000000000000002e-05, + "loss": 1.1527, + "step": 7 + }, + { + "epoch": 0.0007015341949963621, + "grad_norm": 0.044921875, + "learning_rate": 2.4e-05, + "loss": 1.1127, + "step": 8 + }, + { + "epoch": 0.0007892259693709074, + "grad_norm": 0.051025390625, + "learning_rate": 2.7e-05, + "loss": 1.1494, + "step": 9 + }, + { + "epoch": 0.0008769177437454527, + "grad_norm": 0.047607421875, + "learning_rate": 3e-05, + "loss": 1.1626, + "step": 10 + }, + { + "epoch": 0.0009646095181199979, + "grad_norm": 0.04443359375, + "learning_rate": 3.2999999999999996e-05, + "loss": 1.1429, + "step": 11 + }, + { + "epoch": 0.0010523012924945432, + "grad_norm": 0.05322265625, + "learning_rate": 3.6e-05, + "loss": 1.1566, + "step": 12 + }, + { + "epoch": 0.0011399930668690886, + "grad_norm": 0.054931640625, + "learning_rate": 3.9e-05, + "loss": 1.1611, + "step": 13 + }, + { + "epoch": 0.0012276848412436337, + "grad_norm": 0.047607421875, + "learning_rate": 4.2000000000000004e-05, + "loss": 1.0719, + "step": 14 + }, + { + "epoch": 0.001315376615618179, + "grad_norm": 0.04443359375, + "learning_rate": 4.4999999999999996e-05, + "loss": 1.1424, + "step": 15 + }, + { + "epoch": 0.0014030683899927242, + "grad_norm": 0.04541015625, + "learning_rate": 4.8e-05, + "loss": 1.126, + "step": 16 + }, + { + "epoch": 0.0014907601643672696, + "grad_norm": 0.059326171875, + "learning_rate": 5.1000000000000006e-05, + "loss": 1.1402, + "step": 17 + }, + { + "epoch": 0.0015784519387418148, + "grad_norm": 0.048583984375, + "learning_rate": 5.4e-05, + "loss": 1.1307, + "step": 18 + }, + { + "epoch": 0.0016661437131163602, + "grad_norm": 0.045654296875, + "learning_rate": 5.7e-05, + "loss": 1.1494, + "step": 19 + }, + { + "epoch": 0.0017538354874909053, + "grad_norm": 0.046630859375, + "learning_rate": 6e-05, + "loss": 1.1678, + "step": 20 + }, + { + "epoch": 0.0018415272618654507, + "grad_norm": 0.0546875, + "learning_rate": 6.3e-05, + "loss": 1.1662, + "step": 21 + }, + { + "epoch": 0.0019292190362399958, + "grad_norm": 0.050537109375, + "learning_rate": 6.599999999999999e-05, + "loss": 1.1019, + "step": 22 + }, + { + "epoch": 0.002016910810614541, + "grad_norm": 0.045166015625, + "learning_rate": 6.9e-05, + "loss": 1.1203, + "step": 23 + }, + { + "epoch": 0.0021046025849890864, + "grad_norm": 0.0439453125, + "learning_rate": 7.2e-05, + "loss": 1.1157, + "step": 24 + }, + { + "epoch": 0.0021922943593636317, + "grad_norm": 0.04638671875, + "learning_rate": 7.500000000000001e-05, + "loss": 1.1259, + "step": 25 + }, + { + "epoch": 0.002279986133738177, + "grad_norm": 0.045654296875, + "learning_rate": 7.8e-05, + "loss": 1.1498, + "step": 26 + }, + { + "epoch": 0.0023676779081127225, + "grad_norm": 0.04541015625, + "learning_rate": 8.1e-05, + "loss": 1.1414, + "step": 27 + }, + { + "epoch": 0.0024553696824872674, + "grad_norm": 0.053955078125, + "learning_rate": 8.400000000000001e-05, + "loss": 1.1709, + "step": 28 + }, + { + "epoch": 0.002543061456861813, + "grad_norm": 0.048583984375, + "learning_rate": 8.7e-05, + "loss": 1.1324, + "step": 29 + }, + { + "epoch": 0.002630753231236358, + "grad_norm": 0.048583984375, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1482, + "step": 30 + }, + { + "epoch": 0.0027184450056109036, + "grad_norm": 0.0439453125, + "learning_rate": 9.3e-05, + "loss": 1.1014, + "step": 31 + }, + { + "epoch": 0.0028061367799854485, + "grad_norm": 0.0478515625, + "learning_rate": 9.6e-05, + "loss": 1.0998, + "step": 32 + }, + { + "epoch": 0.002893828554359994, + "grad_norm": 0.050537109375, + "learning_rate": 9.900000000000001e-05, + "loss": 1.1555, + "step": 33 + }, + { + "epoch": 0.0029815203287345392, + "grad_norm": 0.045654296875, + "learning_rate": 0.00010200000000000001, + "loss": 1.1934, + "step": 34 + }, + { + "epoch": 0.0030692121031090846, + "grad_norm": 0.04345703125, + "learning_rate": 0.00010500000000000002, + "loss": 1.1266, + "step": 35 + }, + { + "epoch": 0.0031569038774836296, + "grad_norm": 0.0439453125, + "learning_rate": 0.000108, + "loss": 1.1031, + "step": 36 + }, + { + "epoch": 0.003244595651858175, + "grad_norm": 0.046630859375, + "learning_rate": 0.000111, + "loss": 1.1517, + "step": 37 + }, + { + "epoch": 0.0033322874262327203, + "grad_norm": 0.045166015625, + "learning_rate": 0.000114, + "loss": 1.096, + "step": 38 + }, + { + "epoch": 0.0034199792006072657, + "grad_norm": 0.044677734375, + "learning_rate": 0.000117, + "loss": 1.1044, + "step": 39 + }, + { + "epoch": 0.0035076709749818106, + "grad_norm": 0.04150390625, + "learning_rate": 0.00012, + "loss": 1.0901, + "step": 40 + }, + { + "epoch": 0.003595362749356356, + "grad_norm": 0.044189453125, + "learning_rate": 0.000123, + "loss": 1.1287, + "step": 41 + }, + { + "epoch": 0.0036830545237309014, + "grad_norm": 0.055419921875, + "learning_rate": 0.000126, + "loss": 1.2277, + "step": 42 + }, + { + "epoch": 0.0037707462981054467, + "grad_norm": 0.051025390625, + "learning_rate": 0.000129, + "loss": 1.168, + "step": 43 + }, + { + "epoch": 0.0038584380724799917, + "grad_norm": 0.044677734375, + "learning_rate": 0.00013199999999999998, + "loss": 1.1054, + "step": 44 + }, + { + "epoch": 0.0039461298468545375, + "grad_norm": 0.0478515625, + "learning_rate": 0.000135, + "loss": 1.1698, + "step": 45 + }, + { + "epoch": 0.004033821621229082, + "grad_norm": 0.053466796875, + "learning_rate": 0.000138, + "loss": 1.1513, + "step": 46 + }, + { + "epoch": 0.004121513395603627, + "grad_norm": 0.047607421875, + "learning_rate": 0.000141, + "loss": 1.1666, + "step": 47 + }, + { + "epoch": 0.004209205169978173, + "grad_norm": 0.041259765625, + "learning_rate": 0.000144, + "loss": 1.1025, + "step": 48 + }, + { + "epoch": 0.004296896944352718, + "grad_norm": 0.048828125, + "learning_rate": 0.000147, + "loss": 1.1536, + "step": 49 + }, + { + "epoch": 0.0043845887187272635, + "grad_norm": 0.04541015625, + "learning_rate": 0.00015000000000000001, + "loss": 1.1672, + "step": 50 + }, + { + "epoch": 0.004472280493101809, + "grad_norm": 0.040771484375, + "learning_rate": 0.000153, + "loss": 1.0913, + "step": 51 + }, + { + "epoch": 0.004559972267476354, + "grad_norm": 0.041015625, + "learning_rate": 0.000156, + "loss": 1.1205, + "step": 52 + }, + { + "epoch": 0.0046476640418509, + "grad_norm": 0.04638671875, + "learning_rate": 0.000159, + "loss": 1.2006, + "step": 53 + }, + { + "epoch": 0.004735355816225445, + "grad_norm": 0.04736328125, + "learning_rate": 0.000162, + "loss": 1.1862, + "step": 54 + }, + { + "epoch": 0.0048230475905999895, + "grad_norm": 0.0458984375, + "learning_rate": 0.000165, + "loss": 1.2395, + "step": 55 + }, + { + "epoch": 0.004910739364974535, + "grad_norm": 0.04345703125, + "learning_rate": 0.00016800000000000002, + "loss": 1.138, + "step": 56 + }, + { + "epoch": 0.00499843113934908, + "grad_norm": 0.0576171875, + "learning_rate": 0.000171, + "loss": 1.1346, + "step": 57 + }, + { + "epoch": 0.005086122913723626, + "grad_norm": 0.04296875, + "learning_rate": 0.000174, + "loss": 1.0885, + "step": 58 + }, + { + "epoch": 0.005173814688098171, + "grad_norm": 0.044921875, + "learning_rate": 0.000177, + "loss": 1.1216, + "step": 59 + }, + { + "epoch": 0.005261506462472716, + "grad_norm": 0.04638671875, + "learning_rate": 0.00017999999999999998, + "loss": 1.1571, + "step": 60 + }, + { + "epoch": 0.005349198236847262, + "grad_norm": 0.047119140625, + "learning_rate": 0.000183, + "loss": 1.1255, + "step": 61 + }, + { + "epoch": 0.005436890011221807, + "grad_norm": 0.049560546875, + "learning_rate": 0.000186, + "loss": 1.1221, + "step": 62 + }, + { + "epoch": 0.005524581785596352, + "grad_norm": 0.052978515625, + "learning_rate": 0.000189, + "loss": 1.1555, + "step": 63 + }, + { + "epoch": 0.005612273559970897, + "grad_norm": 0.04541015625, + "learning_rate": 0.000192, + "loss": 1.2047, + "step": 64 + }, + { + "epoch": 0.005699965334345442, + "grad_norm": 0.045654296875, + "learning_rate": 0.00019500000000000002, + "loss": 1.1844, + "step": 65 + }, + { + "epoch": 0.005787657108719988, + "grad_norm": 0.045166015625, + "learning_rate": 0.00019800000000000002, + "loss": 1.123, + "step": 66 + }, + { + "epoch": 0.005875348883094533, + "grad_norm": 0.050048828125, + "learning_rate": 0.000201, + "loss": 1.1913, + "step": 67 + }, + { + "epoch": 0.0059630406574690785, + "grad_norm": 0.047607421875, + "learning_rate": 0.00020400000000000003, + "loss": 1.1745, + "step": 68 + }, + { + "epoch": 0.006050732431843624, + "grad_norm": 0.04296875, + "learning_rate": 0.00020700000000000002, + "loss": 1.094, + "step": 69 + }, + { + "epoch": 0.006138424206218169, + "grad_norm": 0.046875, + "learning_rate": 0.00021000000000000004, + "loss": 1.1598, + "step": 70 + }, + { + "epoch": 0.006226115980592715, + "grad_norm": 0.053955078125, + "learning_rate": 0.00021299999999999997, + "loss": 1.1675, + "step": 71 + }, + { + "epoch": 0.006313807754967259, + "grad_norm": 0.047607421875, + "learning_rate": 0.000216, + "loss": 1.1369, + "step": 72 + }, + { + "epoch": 0.0064014995293418045, + "grad_norm": 0.052978515625, + "learning_rate": 0.00021899999999999998, + "loss": 1.1755, + "step": 73 + }, + { + "epoch": 0.00648919130371635, + "grad_norm": 0.04638671875, + "learning_rate": 0.000222, + "loss": 1.2022, + "step": 74 + }, + { + "epoch": 0.006576883078090895, + "grad_norm": 0.049072265625, + "learning_rate": 0.000225, + "loss": 1.1722, + "step": 75 + }, + { + "epoch": 0.006664574852465441, + "grad_norm": 0.04345703125, + "learning_rate": 0.000228, + "loss": 1.0734, + "step": 76 + }, + { + "epoch": 0.006752266626839986, + "grad_norm": 0.04443359375, + "learning_rate": 0.000231, + "loss": 1.1007, + "step": 77 + }, + { + "epoch": 0.006839958401214531, + "grad_norm": 0.05810546875, + "learning_rate": 0.000234, + "loss": 1.0976, + "step": 78 + }, + { + "epoch": 0.006927650175589077, + "grad_norm": 0.04736328125, + "learning_rate": 0.00023700000000000001, + "loss": 1.1234, + "step": 79 + }, + { + "epoch": 0.007015341949963621, + "grad_norm": 0.045166015625, + "learning_rate": 0.00024, + "loss": 1.1001, + "step": 80 + }, + { + "epoch": 0.007103033724338167, + "grad_norm": 0.0439453125, + "learning_rate": 0.00024300000000000002, + "loss": 1.1492, + "step": 81 + }, + { + "epoch": 0.007190725498712712, + "grad_norm": 0.04443359375, + "learning_rate": 0.000246, + "loss": 1.1726, + "step": 82 + }, + { + "epoch": 0.007278417273087257, + "grad_norm": 0.0439453125, + "learning_rate": 0.00024900000000000004, + "loss": 1.1477, + "step": 83 + }, + { + "epoch": 0.007366109047461803, + "grad_norm": 0.05419921875, + "learning_rate": 0.000252, + "loss": 1.1242, + "step": 84 + }, + { + "epoch": 0.007453800821836348, + "grad_norm": 0.04443359375, + "learning_rate": 0.000255, + "loss": 1.1142, + "step": 85 + }, + { + "epoch": 0.0075414925962108935, + "grad_norm": 0.046875, + "learning_rate": 0.000258, + "loss": 1.1349, + "step": 86 + }, + { + "epoch": 0.007629184370585439, + "grad_norm": 0.05224609375, + "learning_rate": 0.000261, + "loss": 1.1336, + "step": 87 + }, + { + "epoch": 0.007716876144959983, + "grad_norm": 0.0517578125, + "learning_rate": 0.00026399999999999997, + "loss": 1.102, + "step": 88 + }, + { + "epoch": 0.007804567919334529, + "grad_norm": 0.048583984375, + "learning_rate": 0.000267, + "loss": 1.1895, + "step": 89 + }, + { + "epoch": 0.007892259693709075, + "grad_norm": 0.047119140625, + "learning_rate": 0.00027, + "loss": 1.1714, + "step": 90 + }, + { + "epoch": 0.00797995146808362, + "grad_norm": 0.04833984375, + "learning_rate": 0.000273, + "loss": 1.1803, + "step": 91 + }, + { + "epoch": 0.008067643242458164, + "grad_norm": 0.046142578125, + "learning_rate": 0.000276, + "loss": 1.1686, + "step": 92 + }, + { + "epoch": 0.00815533501683271, + "grad_norm": 0.0458984375, + "learning_rate": 0.000279, + "loss": 1.1308, + "step": 93 + }, + { + "epoch": 0.008243026791207255, + "grad_norm": 0.046875, + "learning_rate": 0.000282, + "loss": 1.1795, + "step": 94 + }, + { + "epoch": 0.008330718565581801, + "grad_norm": 0.05078125, + "learning_rate": 0.000285, + "loss": 1.1199, + "step": 95 + }, + { + "epoch": 0.008418410339956345, + "grad_norm": 0.0478515625, + "learning_rate": 0.000288, + "loss": 1.1489, + "step": 96 + }, + { + "epoch": 0.008506102114330892, + "grad_norm": 0.044677734375, + "learning_rate": 0.000291, + "loss": 1.086, + "step": 97 + }, + { + "epoch": 0.008593793888705436, + "grad_norm": 0.048095703125, + "learning_rate": 0.000294, + "loss": 1.1221, + "step": 98 + }, + { + "epoch": 0.008681485663079982, + "grad_norm": 0.04736328125, + "learning_rate": 0.000297, + "loss": 1.128, + "step": 99 + }, + { + "epoch": 0.008769177437454527, + "grad_norm": 0.05517578125, + "learning_rate": 0.00030000000000000003, + "loss": 1.177, + "step": 100 + }, + { + "epoch": 0.008856869211829071, + "grad_norm": 0.04296875, + "learning_rate": 0.00030300000000000005, + "loss": 1.0933, + "step": 101 + }, + { + "epoch": 0.008944560986203618, + "grad_norm": 0.04736328125, + "learning_rate": 0.000306, + "loss": 1.1342, + "step": 102 + }, + { + "epoch": 0.009032252760578162, + "grad_norm": 0.044921875, + "learning_rate": 0.000309, + "loss": 1.1317, + "step": 103 + }, + { + "epoch": 0.009119944534952708, + "grad_norm": 0.047607421875, + "learning_rate": 0.000312, + "loss": 1.1696, + "step": 104 + }, + { + "epoch": 0.009207636309327253, + "grad_norm": 0.0634765625, + "learning_rate": 0.000315, + "loss": 1.1273, + "step": 105 + }, + { + "epoch": 0.0092953280837018, + "grad_norm": 0.051025390625, + "learning_rate": 0.000318, + "loss": 1.1595, + "step": 106 + }, + { + "epoch": 0.009383019858076344, + "grad_norm": 0.0546875, + "learning_rate": 0.000321, + "loss": 1.1281, + "step": 107 + }, + { + "epoch": 0.00947071163245089, + "grad_norm": 0.050537109375, + "learning_rate": 0.000324, + "loss": 1.1297, + "step": 108 + }, + { + "epoch": 0.009558403406825434, + "grad_norm": 0.06494140625, + "learning_rate": 0.000327, + "loss": 1.1683, + "step": 109 + }, + { + "epoch": 0.009646095181199979, + "grad_norm": 0.046142578125, + "learning_rate": 0.00033, + "loss": 1.0732, + "step": 110 + }, + { + "epoch": 0.009733786955574525, + "grad_norm": 0.05126953125, + "learning_rate": 0.000333, + "loss": 1.1155, + "step": 111 + }, + { + "epoch": 0.00982147872994907, + "grad_norm": 0.051513671875, + "learning_rate": 0.00033600000000000004, + "loss": 1.1712, + "step": 112 + }, + { + "epoch": 0.009909170504323616, + "grad_norm": 0.0546875, + "learning_rate": 0.000339, + "loss": 1.1747, + "step": 113 + }, + { + "epoch": 0.00999686227869816, + "grad_norm": 0.04833984375, + "learning_rate": 0.000342, + "loss": 1.1377, + "step": 114 + }, + { + "epoch": 0.010084554053072707, + "grad_norm": 0.057373046875, + "learning_rate": 0.00034500000000000004, + "loss": 1.1282, + "step": 115 + }, + { + "epoch": 0.010172245827447251, + "grad_norm": 0.057373046875, + "learning_rate": 0.000348, + "loss": 1.1553, + "step": 116 + }, + { + "epoch": 0.010259937601821796, + "grad_norm": 0.04638671875, + "learning_rate": 0.000351, + "loss": 1.1211, + "step": 117 + }, + { + "epoch": 0.010347629376196342, + "grad_norm": 0.0625, + "learning_rate": 0.000354, + "loss": 1.1568, + "step": 118 + }, + { + "epoch": 0.010435321150570886, + "grad_norm": 0.060546875, + "learning_rate": 0.000357, + "loss": 1.1445, + "step": 119 + }, + { + "epoch": 0.010523012924945433, + "grad_norm": 0.046142578125, + "learning_rate": 0.00035999999999999997, + "loss": 1.1153, + "step": 120 + }, + { + "epoch": 0.010610704699319977, + "grad_norm": 0.047607421875, + "learning_rate": 0.000363, + "loss": 1.151, + "step": 121 + }, + { + "epoch": 0.010698396473694523, + "grad_norm": 0.0673828125, + "learning_rate": 0.000366, + "loss": 1.123, + "step": 122 + }, + { + "epoch": 0.010786088248069068, + "grad_norm": 0.04736328125, + "learning_rate": 0.000369, + "loss": 1.1733, + "step": 123 + }, + { + "epoch": 0.010873780022443614, + "grad_norm": 0.058837890625, + "learning_rate": 0.000372, + "loss": 1.1694, + "step": 124 + }, + { + "epoch": 0.010961471796818159, + "grad_norm": 0.0478515625, + "learning_rate": 0.000375, + "loss": 1.1659, + "step": 125 + }, + { + "epoch": 0.011049163571192703, + "grad_norm": 0.049072265625, + "learning_rate": 0.000378, + "loss": 1.1329, + "step": 126 + }, + { + "epoch": 0.01113685534556725, + "grad_norm": 0.050048828125, + "learning_rate": 0.000381, + "loss": 1.1064, + "step": 127 + }, + { + "epoch": 0.011224547119941794, + "grad_norm": 0.04833984375, + "learning_rate": 0.000384, + "loss": 1.1358, + "step": 128 + }, + { + "epoch": 0.01131223889431634, + "grad_norm": 0.04736328125, + "learning_rate": 0.00038700000000000003, + "loss": 1.1024, + "step": 129 + }, + { + "epoch": 0.011399930668690885, + "grad_norm": 0.0498046875, + "learning_rate": 0.00039000000000000005, + "loss": 1.2044, + "step": 130 + }, + { + "epoch": 0.011487622443065431, + "grad_norm": 0.06591796875, + "learning_rate": 0.000393, + "loss": 1.1628, + "step": 131 + }, + { + "epoch": 0.011575314217439975, + "grad_norm": 0.06396484375, + "learning_rate": 0.00039600000000000003, + "loss": 1.1833, + "step": 132 + }, + { + "epoch": 0.011663005991814522, + "grad_norm": 0.06201171875, + "learning_rate": 0.00039900000000000005, + "loss": 1.1599, + "step": 133 + }, + { + "epoch": 0.011750697766189066, + "grad_norm": 0.045654296875, + "learning_rate": 0.000402, + "loss": 1.122, + "step": 134 + }, + { + "epoch": 0.01183838954056361, + "grad_norm": 0.05078125, + "learning_rate": 0.00040500000000000003, + "loss": 1.1688, + "step": 135 + }, + { + "epoch": 0.011926081314938157, + "grad_norm": 0.056884765625, + "learning_rate": 0.00040800000000000005, + "loss": 1.1775, + "step": 136 + }, + { + "epoch": 0.012013773089312701, + "grad_norm": 0.055419921875, + "learning_rate": 0.000411, + "loss": 1.1348, + "step": 137 + }, + { + "epoch": 0.012101464863687248, + "grad_norm": 0.054443359375, + "learning_rate": 0.00041400000000000003, + "loss": 1.2099, + "step": 138 + }, + { + "epoch": 0.012189156638061792, + "grad_norm": 0.050048828125, + "learning_rate": 0.00041700000000000005, + "loss": 1.1488, + "step": 139 + }, + { + "epoch": 0.012276848412436338, + "grad_norm": 0.048828125, + "learning_rate": 0.00042000000000000007, + "loss": 1.176, + "step": 140 + }, + { + "epoch": 0.012364540186810883, + "grad_norm": 0.046630859375, + "learning_rate": 0.000423, + "loss": 1.1426, + "step": 141 + }, + { + "epoch": 0.01245223196118543, + "grad_norm": 0.0517578125, + "learning_rate": 0.00042599999999999995, + "loss": 1.1649, + "step": 142 + }, + { + "epoch": 0.012539923735559974, + "grad_norm": 0.056396484375, + "learning_rate": 0.00042899999999999997, + "loss": 1.1576, + "step": 143 + }, + { + "epoch": 0.012627615509934518, + "grad_norm": 0.04638671875, + "learning_rate": 0.000432, + "loss": 1.1381, + "step": 144 + }, + { + "epoch": 0.012715307284309064, + "grad_norm": 0.05078125, + "learning_rate": 0.000435, + "loss": 1.2093, + "step": 145 + }, + { + "epoch": 0.012802999058683609, + "grad_norm": 0.048828125, + "learning_rate": 0.00043799999999999997, + "loss": 1.1952, + "step": 146 + }, + { + "epoch": 0.012890690833058155, + "grad_norm": 0.055419921875, + "learning_rate": 0.000441, + "loss": 1.1391, + "step": 147 + }, + { + "epoch": 0.0129783826074327, + "grad_norm": 0.044677734375, + "learning_rate": 0.000444, + "loss": 1.113, + "step": 148 + }, + { + "epoch": 0.013066074381807246, + "grad_norm": 0.046630859375, + "learning_rate": 0.00044699999999999997, + "loss": 1.1297, + "step": 149 + }, + { + "epoch": 0.01315376615618179, + "grad_norm": 0.05224609375, + "learning_rate": 0.00045, + "loss": 1.1373, + "step": 150 + }, + { + "epoch": 0.013241457930556335, + "grad_norm": 0.05126953125, + "learning_rate": 0.000453, + "loss": 1.1788, + "step": 151 + }, + { + "epoch": 0.013329149704930881, + "grad_norm": 0.055908203125, + "learning_rate": 0.000456, + "loss": 1.1361, + "step": 152 + }, + { + "epoch": 0.013416841479305426, + "grad_norm": 0.049560546875, + "learning_rate": 0.000459, + "loss": 1.0928, + "step": 153 + }, + { + "epoch": 0.013504533253679972, + "grad_norm": 0.052001953125, + "learning_rate": 0.000462, + "loss": 1.1558, + "step": 154 + }, + { + "epoch": 0.013592225028054516, + "grad_norm": 0.045654296875, + "learning_rate": 0.000465, + "loss": 1.1215, + "step": 155 + }, + { + "epoch": 0.013679916802429063, + "grad_norm": 0.052734375, + "learning_rate": 0.000468, + "loss": 1.1364, + "step": 156 + }, + { + "epoch": 0.013767608576803607, + "grad_norm": 0.05029296875, + "learning_rate": 0.000471, + "loss": 1.2186, + "step": 157 + }, + { + "epoch": 0.013855300351178153, + "grad_norm": 0.051513671875, + "learning_rate": 0.00047400000000000003, + "loss": 1.132, + "step": 158 + }, + { + "epoch": 0.013942992125552698, + "grad_norm": 0.052978515625, + "learning_rate": 0.000477, + "loss": 1.1483, + "step": 159 + }, + { + "epoch": 0.014030683899927242, + "grad_norm": 0.06298828125, + "learning_rate": 0.00048, + "loss": 1.1765, + "step": 160 + }, + { + "epoch": 0.014118375674301789, + "grad_norm": 0.04833984375, + "learning_rate": 0.00048300000000000003, + "loss": 1.2102, + "step": 161 + }, + { + "epoch": 0.014206067448676333, + "grad_norm": 0.047119140625, + "learning_rate": 0.00048600000000000005, + "loss": 1.1264, + "step": 162 + }, + { + "epoch": 0.01429375922305088, + "grad_norm": 0.048583984375, + "learning_rate": 0.0004890000000000001, + "loss": 1.1673, + "step": 163 + }, + { + "epoch": 0.014381450997425424, + "grad_norm": 0.048095703125, + "learning_rate": 0.000492, + "loss": 1.1293, + "step": 164 + }, + { + "epoch": 0.01446914277179997, + "grad_norm": 0.0654296875, + "learning_rate": 0.000495, + "loss": 1.214, + "step": 165 + }, + { + "epoch": 0.014556834546174515, + "grad_norm": 0.04541015625, + "learning_rate": 0.0004980000000000001, + "loss": 1.1727, + "step": 166 + }, + { + "epoch": 0.014644526320549061, + "grad_norm": 0.047607421875, + "learning_rate": 0.000501, + "loss": 1.1803, + "step": 167 + }, + { + "epoch": 0.014732218094923605, + "grad_norm": 0.05126953125, + "learning_rate": 0.000504, + "loss": 1.1959, + "step": 168 + }, + { + "epoch": 0.01481990986929815, + "grad_norm": 0.0498046875, + "learning_rate": 0.0005070000000000001, + "loss": 1.1719, + "step": 169 + }, + { + "epoch": 0.014907601643672696, + "grad_norm": 0.05029296875, + "learning_rate": 0.00051, + "loss": 1.178, + "step": 170 + }, + { + "epoch": 0.01499529341804724, + "grad_norm": 0.0615234375, + "learning_rate": 0.000513, + "loss": 1.1316, + "step": 171 + }, + { + "epoch": 0.015082985192421787, + "grad_norm": 0.056640625, + "learning_rate": 0.000516, + "loss": 1.1646, + "step": 172 + }, + { + "epoch": 0.015170676966796331, + "grad_norm": 0.0498046875, + "learning_rate": 0.0005189999999999999, + "loss": 1.1202, + "step": 173 + }, + { + "epoch": 0.015258368741170878, + "grad_norm": 0.05322265625, + "learning_rate": 0.000522, + "loss": 1.1387, + "step": 174 + }, + { + "epoch": 0.015346060515545422, + "grad_norm": 0.05419921875, + "learning_rate": 0.000525, + "loss": 1.1942, + "step": 175 + }, + { + "epoch": 0.015433752289919967, + "grad_norm": 0.05517578125, + "learning_rate": 0.0005279999999999999, + "loss": 1.1539, + "step": 176 + }, + { + "epoch": 0.015521444064294513, + "grad_norm": 0.053466796875, + "learning_rate": 0.000531, + "loss": 1.1139, + "step": 177 + }, + { + "epoch": 0.015609135838669057, + "grad_norm": 0.049072265625, + "learning_rate": 0.000534, + "loss": 1.172, + "step": 178 + }, + { + "epoch": 0.015696827613043602, + "grad_norm": 0.046142578125, + "learning_rate": 0.000537, + "loss": 1.1199, + "step": 179 + }, + { + "epoch": 0.01578451938741815, + "grad_norm": 0.050048828125, + "learning_rate": 0.00054, + "loss": 1.1085, + "step": 180 + }, + { + "epoch": 0.015872211161792694, + "grad_norm": 0.04931640625, + "learning_rate": 0.000543, + "loss": 1.1067, + "step": 181 + }, + { + "epoch": 0.01595990293616724, + "grad_norm": 0.0712890625, + "learning_rate": 0.000546, + "loss": 1.2397, + "step": 182 + }, + { + "epoch": 0.016047594710541783, + "grad_norm": 0.04541015625, + "learning_rate": 0.000549, + "loss": 1.1286, + "step": 183 + }, + { + "epoch": 0.016135286484916328, + "grad_norm": 0.051513671875, + "learning_rate": 0.000552, + "loss": 1.1336, + "step": 184 + }, + { + "epoch": 0.016222978259290876, + "grad_norm": 0.052734375, + "learning_rate": 0.000555, + "loss": 1.1732, + "step": 185 + }, + { + "epoch": 0.01631067003366542, + "grad_norm": 0.05224609375, + "learning_rate": 0.000558, + "loss": 1.1995, + "step": 186 + }, + { + "epoch": 0.016398361808039965, + "grad_norm": 0.050537109375, + "learning_rate": 0.000561, + "loss": 1.1547, + "step": 187 + }, + { + "epoch": 0.01648605358241451, + "grad_norm": 0.050537109375, + "learning_rate": 0.000564, + "loss": 1.1546, + "step": 188 + }, + { + "epoch": 0.016573745356789057, + "grad_norm": 0.05224609375, + "learning_rate": 0.000567, + "loss": 1.134, + "step": 189 + }, + { + "epoch": 0.016661437131163602, + "grad_norm": 0.0625, + "learning_rate": 0.00057, + "loss": 1.1755, + "step": 190 + }, + { + "epoch": 0.016749128905538146, + "grad_norm": 0.06005859375, + "learning_rate": 0.000573, + "loss": 1.2015, + "step": 191 + }, + { + "epoch": 0.01683682067991269, + "grad_norm": 0.0556640625, + "learning_rate": 0.000576, + "loss": 1.2078, + "step": 192 + }, + { + "epoch": 0.016924512454287235, + "grad_norm": 0.049560546875, + "learning_rate": 0.000579, + "loss": 1.2049, + "step": 193 + }, + { + "epoch": 0.017012204228661783, + "grad_norm": 0.05029296875, + "learning_rate": 0.000582, + "loss": 1.1235, + "step": 194 + }, + { + "epoch": 0.017099896003036328, + "grad_norm": 0.054443359375, + "learning_rate": 0.000585, + "loss": 1.193, + "step": 195 + }, + { + "epoch": 0.017187587777410872, + "grad_norm": 0.052490234375, + "learning_rate": 0.000588, + "loss": 1.1205, + "step": 196 + }, + { + "epoch": 0.017275279551785417, + "grad_norm": 0.05419921875, + "learning_rate": 0.000591, + "loss": 1.1362, + "step": 197 + }, + { + "epoch": 0.017362971326159965, + "grad_norm": 0.051513671875, + "learning_rate": 0.000594, + "loss": 1.1614, + "step": 198 + }, + { + "epoch": 0.01745066310053451, + "grad_norm": 0.053955078125, + "learning_rate": 0.0005970000000000001, + "loss": 1.1948, + "step": 199 + }, + { + "epoch": 0.017538354874909054, + "grad_norm": 0.05517578125, + "learning_rate": 0.0006000000000000001, + "loss": 1.1425, + "step": 200 + }, + { + "epoch": 0.0176260466492836, + "grad_norm": 0.0625, + "learning_rate": 0.000603, + "loss": 1.1436, + "step": 201 + }, + { + "epoch": 0.017713738423658143, + "grad_norm": 0.0498046875, + "learning_rate": 0.0006060000000000001, + "loss": 1.1551, + "step": 202 + }, + { + "epoch": 0.01780143019803269, + "grad_norm": 0.052978515625, + "learning_rate": 0.0006090000000000001, + "loss": 1.1274, + "step": 203 + }, + { + "epoch": 0.017889121972407235, + "grad_norm": 0.052490234375, + "learning_rate": 0.000612, + "loss": 1.1853, + "step": 204 + }, + { + "epoch": 0.01797681374678178, + "grad_norm": 0.052001953125, + "learning_rate": 0.000615, + "loss": 1.1765, + "step": 205 + }, + { + "epoch": 0.018064505521156324, + "grad_norm": 0.06103515625, + "learning_rate": 0.000618, + "loss": 1.1268, + "step": 206 + }, + { + "epoch": 0.018152197295530872, + "grad_norm": 0.059326171875, + "learning_rate": 0.000621, + "loss": 1.1769, + "step": 207 + }, + { + "epoch": 0.018239889069905417, + "grad_norm": 0.047119140625, + "learning_rate": 0.000624, + "loss": 1.0777, + "step": 208 + }, + { + "epoch": 0.01832758084427996, + "grad_norm": 0.06005859375, + "learning_rate": 0.000627, + "loss": 1.0759, + "step": 209 + }, + { + "epoch": 0.018415272618654506, + "grad_norm": 0.049072265625, + "learning_rate": 0.00063, + "loss": 1.1398, + "step": 210 + }, + { + "epoch": 0.01850296439302905, + "grad_norm": 0.057373046875, + "learning_rate": 0.000633, + "loss": 1.1684, + "step": 211 + }, + { + "epoch": 0.0185906561674036, + "grad_norm": 0.048583984375, + "learning_rate": 0.000636, + "loss": 1.1609, + "step": 212 + }, + { + "epoch": 0.018678347941778143, + "grad_norm": 0.048828125, + "learning_rate": 0.000639, + "loss": 1.1409, + "step": 213 + }, + { + "epoch": 0.018766039716152687, + "grad_norm": 0.06201171875, + "learning_rate": 0.000642, + "loss": 1.1721, + "step": 214 + }, + { + "epoch": 0.018853731490527232, + "grad_norm": 0.04736328125, + "learning_rate": 0.000645, + "loss": 1.1046, + "step": 215 + }, + { + "epoch": 0.01894142326490178, + "grad_norm": 0.047607421875, + "learning_rate": 0.000648, + "loss": 1.1549, + "step": 216 + }, + { + "epoch": 0.019029115039276324, + "grad_norm": 0.0517578125, + "learning_rate": 0.000651, + "loss": 1.1698, + "step": 217 + }, + { + "epoch": 0.01911680681365087, + "grad_norm": 0.05126953125, + "learning_rate": 0.000654, + "loss": 1.1671, + "step": 218 + }, + { + "epoch": 0.019204498588025413, + "grad_norm": 0.051025390625, + "learning_rate": 0.000657, + "loss": 1.1809, + "step": 219 + }, + { + "epoch": 0.019292190362399958, + "grad_norm": 0.06787109375, + "learning_rate": 0.00066, + "loss": 1.1231, + "step": 220 + }, + { + "epoch": 0.019379882136774506, + "grad_norm": 0.05029296875, + "learning_rate": 0.0006630000000000001, + "loss": 1.2035, + "step": 221 + }, + { + "epoch": 0.01946757391114905, + "grad_norm": 0.0498046875, + "learning_rate": 0.000666, + "loss": 1.1891, + "step": 222 + }, + { + "epoch": 0.019555265685523595, + "grad_norm": 0.048828125, + "learning_rate": 0.000669, + "loss": 1.1181, + "step": 223 + }, + { + "epoch": 0.01964295745989814, + "grad_norm": 0.050048828125, + "learning_rate": 0.0006720000000000001, + "loss": 1.1088, + "step": 224 + }, + { + "epoch": 0.019730649234272687, + "grad_norm": 0.052001953125, + "learning_rate": 0.000675, + "loss": 1.1524, + "step": 225 + }, + { + "epoch": 0.019818341008647232, + "grad_norm": 0.051513671875, + "learning_rate": 0.000678, + "loss": 1.1337, + "step": 226 + }, + { + "epoch": 0.019906032783021776, + "grad_norm": 0.061767578125, + "learning_rate": 0.0006810000000000001, + "loss": 1.1906, + "step": 227 + }, + { + "epoch": 0.01999372455739632, + "grad_norm": 0.052734375, + "learning_rate": 0.000684, + "loss": 1.1515, + "step": 228 + }, + { + "epoch": 0.020081416331770865, + "grad_norm": 0.05810546875, + "learning_rate": 0.000687, + "loss": 1.1852, + "step": 229 + }, + { + "epoch": 0.020169108106145413, + "grad_norm": 0.055908203125, + "learning_rate": 0.0006900000000000001, + "loss": 1.161, + "step": 230 + }, + { + "epoch": 0.020256799880519958, + "grad_norm": 0.054931640625, + "learning_rate": 0.000693, + "loss": 1.1192, + "step": 231 + }, + { + "epoch": 0.020344491654894502, + "grad_norm": 0.05859375, + "learning_rate": 0.000696, + "loss": 1.1421, + "step": 232 + }, + { + "epoch": 0.020432183429269047, + "grad_norm": 0.048828125, + "learning_rate": 0.0006990000000000001, + "loss": 1.1496, + "step": 233 + }, + { + "epoch": 0.02051987520364359, + "grad_norm": 0.07080078125, + "learning_rate": 0.000702, + "loss": 1.1524, + "step": 234 + }, + { + "epoch": 0.02060756697801814, + "grad_norm": 0.0537109375, + "learning_rate": 0.000705, + "loss": 1.121, + "step": 235 + }, + { + "epoch": 0.020695258752392684, + "grad_norm": 0.05078125, + "learning_rate": 0.000708, + "loss": 1.1113, + "step": 236 + }, + { + "epoch": 0.02078295052676723, + "grad_norm": 0.06591796875, + "learning_rate": 0.0007109999999999999, + "loss": 1.1425, + "step": 237 + }, + { + "epoch": 0.020870642301141773, + "grad_norm": 0.08251953125, + "learning_rate": 0.000714, + "loss": 1.1002, + "step": 238 + }, + { + "epoch": 0.02095833407551632, + "grad_norm": 0.060791015625, + "learning_rate": 0.000717, + "loss": 1.143, + "step": 239 + }, + { + "epoch": 0.021046025849890865, + "grad_norm": 0.053955078125, + "learning_rate": 0.0007199999999999999, + "loss": 1.1428, + "step": 240 + }, + { + "epoch": 0.02113371762426541, + "grad_norm": 0.08056640625, + "learning_rate": 0.000723, + "loss": 1.1528, + "step": 241 + }, + { + "epoch": 0.021221409398639954, + "grad_norm": 0.0908203125, + "learning_rate": 0.000726, + "loss": 1.1262, + "step": 242 + }, + { + "epoch": 0.0213091011730145, + "grad_norm": 0.056640625, + "learning_rate": 0.000729, + "loss": 1.0917, + "step": 243 + }, + { + "epoch": 0.021396792947389047, + "grad_norm": 0.051513671875, + "learning_rate": 0.000732, + "loss": 1.1335, + "step": 244 + }, + { + "epoch": 0.02148448472176359, + "grad_norm": 0.054931640625, + "learning_rate": 0.000735, + "loss": 1.1724, + "step": 245 + }, + { + "epoch": 0.021572176496138136, + "grad_norm": 0.08203125, + "learning_rate": 0.000738, + "loss": 1.1043, + "step": 246 + }, + { + "epoch": 0.02165986827051268, + "grad_norm": 0.05224609375, + "learning_rate": 0.000741, + "loss": 1.1998, + "step": 247 + }, + { + "epoch": 0.02174756004488723, + "grad_norm": 0.056640625, + "learning_rate": 0.000744, + "loss": 1.1098, + "step": 248 + }, + { + "epoch": 0.021835251819261773, + "grad_norm": 0.05615234375, + "learning_rate": 0.000747, + "loss": 1.1391, + "step": 249 + }, + { + "epoch": 0.021922943593636317, + "grad_norm": 0.057861328125, + "learning_rate": 0.00075, + "loss": 1.1297, + "step": 250 + }, + { + "epoch": 0.022010635368010862, + "grad_norm": 0.05224609375, + "learning_rate": 0.000753, + "loss": 1.1278, + "step": 251 + }, + { + "epoch": 0.022098327142385406, + "grad_norm": 0.05810546875, + "learning_rate": 0.000756, + "loss": 1.1577, + "step": 252 + }, + { + "epoch": 0.022186018916759954, + "grad_norm": 0.056396484375, + "learning_rate": 0.000759, + "loss": 1.162, + "step": 253 + }, + { + "epoch": 0.0222737106911345, + "grad_norm": 0.05517578125, + "learning_rate": 0.000762, + "loss": 1.1489, + "step": 254 + }, + { + "epoch": 0.022361402465509043, + "grad_norm": 0.049072265625, + "learning_rate": 0.0007650000000000001, + "loss": 1.1197, + "step": 255 + }, + { + "epoch": 0.022449094239883588, + "grad_norm": 0.052001953125, + "learning_rate": 0.000768, + "loss": 1.1886, + "step": 256 + }, + { + "epoch": 0.022536786014258136, + "grad_norm": 0.0634765625, + "learning_rate": 0.000771, + "loss": 1.1247, + "step": 257 + }, + { + "epoch": 0.02262447778863268, + "grad_norm": 0.053466796875, + "learning_rate": 0.0007740000000000001, + "loss": 1.0887, + "step": 258 + }, + { + "epoch": 0.022712169563007225, + "grad_norm": 0.05126953125, + "learning_rate": 0.000777, + "loss": 1.1116, + "step": 259 + }, + { + "epoch": 0.02279986133738177, + "grad_norm": 0.07373046875, + "learning_rate": 0.0007800000000000001, + "loss": 1.1466, + "step": 260 + }, + { + "epoch": 0.022887553111756314, + "grad_norm": 0.050048828125, + "learning_rate": 0.0007830000000000001, + "loss": 1.1808, + "step": 261 + }, + { + "epoch": 0.022975244886130862, + "grad_norm": 0.055419921875, + "learning_rate": 0.000786, + "loss": 1.1295, + "step": 262 + }, + { + "epoch": 0.023062936660505406, + "grad_norm": 0.0517578125, + "learning_rate": 0.0007890000000000001, + "loss": 1.1737, + "step": 263 + }, + { + "epoch": 0.02315062843487995, + "grad_norm": 0.0595703125, + "learning_rate": 0.0007920000000000001, + "loss": 1.0943, + "step": 264 + }, + { + "epoch": 0.023238320209254495, + "grad_norm": 0.05810546875, + "learning_rate": 0.000795, + "loss": 1.1239, + "step": 265 + }, + { + "epoch": 0.023326011983629043, + "grad_norm": 0.053955078125, + "learning_rate": 0.0007980000000000001, + "loss": 1.1174, + "step": 266 + }, + { + "epoch": 0.023413703758003588, + "grad_norm": 0.06591796875, + "learning_rate": 0.0008010000000000001, + "loss": 1.1444, + "step": 267 + }, + { + "epoch": 0.023501395532378132, + "grad_norm": 0.0703125, + "learning_rate": 0.000804, + "loss": 1.1608, + "step": 268 + }, + { + "epoch": 0.023589087306752677, + "grad_norm": 0.05029296875, + "learning_rate": 0.0008070000000000001, + "loss": 1.1755, + "step": 269 + }, + { + "epoch": 0.02367677908112722, + "grad_norm": 0.057373046875, + "learning_rate": 0.0008100000000000001, + "loss": 1.1465, + "step": 270 + }, + { + "epoch": 0.02376447085550177, + "grad_norm": 0.0751953125, + "learning_rate": 0.000813, + "loss": 1.1984, + "step": 271 + }, + { + "epoch": 0.023852162629876314, + "grad_norm": 0.07177734375, + "learning_rate": 0.0008160000000000001, + "loss": 1.1619, + "step": 272 + }, + { + "epoch": 0.02393985440425086, + "grad_norm": 0.060302734375, + "learning_rate": 0.0008190000000000001, + "loss": 1.1327, + "step": 273 + }, + { + "epoch": 0.024027546178625403, + "grad_norm": 0.05810546875, + "learning_rate": 0.000822, + "loss": 1.1771, + "step": 274 + }, + { + "epoch": 0.02411523795299995, + "grad_norm": 0.06396484375, + "learning_rate": 0.0008250000000000001, + "loss": 1.1673, + "step": 275 + }, + { + "epoch": 0.024202929727374495, + "grad_norm": 0.1015625, + "learning_rate": 0.0008280000000000001, + "loss": 1.138, + "step": 276 + }, + { + "epoch": 0.02429062150174904, + "grad_norm": 0.051025390625, + "learning_rate": 0.0008310000000000001, + "loss": 1.1402, + "step": 277 + }, + { + "epoch": 0.024378313276123584, + "grad_norm": 0.0546875, + "learning_rate": 0.0008340000000000001, + "loss": 1.1657, + "step": 278 + }, + { + "epoch": 0.02446600505049813, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008370000000000001, + "loss": 1.1595, + "step": 279 + }, + { + "epoch": 0.024553696824872677, + "grad_norm": 0.06494140625, + "learning_rate": 0.0008400000000000001, + "loss": 1.1554, + "step": 280 + }, + { + "epoch": 0.02464138859924722, + "grad_norm": 0.057373046875, + "learning_rate": 0.0008430000000000001, + "loss": 1.1981, + "step": 281 + }, + { + "epoch": 0.024729080373621766, + "grad_norm": 0.058837890625, + "learning_rate": 0.000846, + "loss": 1.1148, + "step": 282 + }, + { + "epoch": 0.02481677214799631, + "grad_norm": 0.050048828125, + "learning_rate": 0.0008489999999999999, + "loss": 1.1277, + "step": 283 + }, + { + "epoch": 0.02490446392237086, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008519999999999999, + "loss": 1.1403, + "step": 284 + }, + { + "epoch": 0.024992155696745403, + "grad_norm": 0.0595703125, + "learning_rate": 0.000855, + "loss": 1.0865, + "step": 285 + }, + { + "epoch": 0.025079847471119947, + "grad_norm": 0.05517578125, + "learning_rate": 0.0008579999999999999, + "loss": 1.1986, + "step": 286 + }, + { + "epoch": 0.025167539245494492, + "grad_norm": 0.05859375, + "learning_rate": 0.000861, + "loss": 1.1272, + "step": 287 + }, + { + "epoch": 0.025255231019869036, + "grad_norm": 0.05908203125, + "learning_rate": 0.000864, + "loss": 1.1783, + "step": 288 + }, + { + "epoch": 0.025342922794243584, + "grad_norm": 0.051025390625, + "learning_rate": 0.0008669999999999999, + "loss": 1.11, + "step": 289 + }, + { + "epoch": 0.02543061456861813, + "grad_norm": 0.048095703125, + "learning_rate": 0.00087, + "loss": 1.1297, + "step": 290 + }, + { + "epoch": 0.025518306342992673, + "grad_norm": 0.06298828125, + "learning_rate": 0.000873, + "loss": 1.132, + "step": 291 + }, + { + "epoch": 0.025605998117367218, + "grad_norm": 0.05810546875, + "learning_rate": 0.0008759999999999999, + "loss": 1.1429, + "step": 292 + }, + { + "epoch": 0.025693689891741762, + "grad_norm": 0.052978515625, + "learning_rate": 0.000879, + "loss": 1.1246, + "step": 293 + }, + { + "epoch": 0.02578138166611631, + "grad_norm": 0.08203125, + "learning_rate": 0.000882, + "loss": 1.1745, + "step": 294 + }, + { + "epoch": 0.025869073440490855, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008849999999999999, + "loss": 1.1477, + "step": 295 + }, + { + "epoch": 0.0259567652148654, + "grad_norm": 0.10400390625, + "learning_rate": 0.000888, + "loss": 1.2113, + "step": 296 + }, + { + "epoch": 0.026044456989239944, + "grad_norm": 0.0908203125, + "learning_rate": 0.000891, + "loss": 1.1925, + "step": 297 + }, + { + "epoch": 0.026132148763614492, + "grad_norm": 0.05419921875, + "learning_rate": 0.0008939999999999999, + "loss": 1.1713, + "step": 298 + }, + { + "epoch": 0.026219840537989036, + "grad_norm": 0.054443359375, + "learning_rate": 0.000897, + "loss": 1.1088, + "step": 299 + }, + { + "epoch": 0.02630753231236358, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009, + "loss": 1.1081, + "step": 300 + }, + { + "epoch": 0.026395224086738125, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009029999999999999, + "loss": 1.1574, + "step": 301 + }, + { + "epoch": 0.02648291586111267, + "grad_norm": 0.05810546875, + "learning_rate": 0.000906, + "loss": 1.182, + "step": 302 + }, + { + "epoch": 0.026570607635487218, + "grad_norm": 0.057861328125, + "learning_rate": 0.000909, + "loss": 1.142, + "step": 303 + }, + { + "epoch": 0.026658299409861762, + "grad_norm": 0.06591796875, + "learning_rate": 0.000912, + "loss": 1.1583, + "step": 304 + }, + { + "epoch": 0.026745991184236307, + "grad_norm": 0.056640625, + "learning_rate": 0.000915, + "loss": 1.1874, + "step": 305 + }, + { + "epoch": 0.02683368295861085, + "grad_norm": 0.05322265625, + "learning_rate": 0.000918, + "loss": 1.131, + "step": 306 + }, + { + "epoch": 0.0269213747329854, + "grad_norm": 0.055908203125, + "learning_rate": 0.000921, + "loss": 1.1403, + "step": 307 + }, + { + "epoch": 0.027009066507359944, + "grad_norm": 0.068359375, + "learning_rate": 0.000924, + "loss": 1.1459, + "step": 308 + }, + { + "epoch": 0.02709675828173449, + "grad_norm": 0.05224609375, + "learning_rate": 0.000927, + "loss": 1.12, + "step": 309 + }, + { + "epoch": 0.027184450056109033, + "grad_norm": 0.0537109375, + "learning_rate": 0.00093, + "loss": 1.1903, + "step": 310 + }, + { + "epoch": 0.027272141830483577, + "grad_norm": 0.051025390625, + "learning_rate": 0.000933, + "loss": 1.1237, + "step": 311 + }, + { + "epoch": 0.027359833604858125, + "grad_norm": 0.05419921875, + "learning_rate": 0.000936, + "loss": 1.1044, + "step": 312 + }, + { + "epoch": 0.02744752537923267, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009390000000000001, + "loss": 1.1263, + "step": 313 + }, + { + "epoch": 0.027535217153607214, + "grad_norm": 0.049560546875, + "learning_rate": 0.000942, + "loss": 1.1294, + "step": 314 + }, + { + "epoch": 0.02762290892798176, + "grad_norm": 0.055908203125, + "learning_rate": 0.000945, + "loss": 1.1812, + "step": 315 + }, + { + "epoch": 0.027710600702356307, + "grad_norm": 0.0556640625, + "learning_rate": 0.0009480000000000001, + "loss": 1.1673, + "step": 316 + }, + { + "epoch": 0.02779829247673085, + "grad_norm": 0.0576171875, + "learning_rate": 0.000951, + "loss": 1.1607, + "step": 317 + }, + { + "epoch": 0.027885984251105396, + "grad_norm": 0.05322265625, + "learning_rate": 0.000954, + "loss": 1.1432, + "step": 318 + }, + { + "epoch": 0.02797367602547994, + "grad_norm": 0.054443359375, + "learning_rate": 0.0009570000000000001, + "loss": 1.15, + "step": 319 + }, + { + "epoch": 0.028061367799854485, + "grad_norm": 0.078125, + "learning_rate": 0.00096, + "loss": 1.1851, + "step": 320 + }, + { + "epoch": 0.028149059574229033, + "grad_norm": 0.051513671875, + "learning_rate": 0.000963, + "loss": 1.1291, + "step": 321 + }, + { + "epoch": 0.028236751348603577, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009660000000000001, + "loss": 1.1273, + "step": 322 + }, + { + "epoch": 0.028324443122978122, + "grad_norm": 0.054931640625, + "learning_rate": 0.000969, + "loss": 1.1779, + "step": 323 + }, + { + "epoch": 0.028412134897352666, + "grad_norm": 0.052001953125, + "learning_rate": 0.0009720000000000001, + "loss": 1.1774, + "step": 324 + }, + { + "epoch": 0.028499826671727214, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009750000000000001, + "loss": 1.1247, + "step": 325 + }, + { + "epoch": 0.02858751844610176, + "grad_norm": 0.05908203125, + "learning_rate": 0.0009780000000000001, + "loss": 1.1964, + "step": 326 + }, + { + "epoch": 0.028675210220476303, + "grad_norm": 0.054931640625, + "learning_rate": 0.000981, + "loss": 1.0994, + "step": 327 + }, + { + "epoch": 0.028762901994850848, + "grad_norm": 0.06591796875, + "learning_rate": 0.000984, + "loss": 1.1888, + "step": 328 + }, + { + "epoch": 0.028850593769225392, + "grad_norm": 0.0576171875, + "learning_rate": 0.000987, + "loss": 1.1212, + "step": 329 + }, + { + "epoch": 0.02893828554359994, + "grad_norm": 0.053955078125, + "learning_rate": 0.00099, + "loss": 1.1633, + "step": 330 + }, + { + "epoch": 0.029025977317974485, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009930000000000002, + "loss": 1.1294, + "step": 331 + }, + { + "epoch": 0.02911366909234903, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009960000000000001, + "loss": 1.191, + "step": 332 + }, + { + "epoch": 0.029201360866723574, + "grad_norm": 0.052490234375, + "learning_rate": 0.000999, + "loss": 1.1672, + "step": 333 + }, + { + "epoch": 0.029289052641098122, + "grad_norm": 0.061767578125, + "learning_rate": 0.001002, + "loss": 1.1475, + "step": 334 + }, + { + "epoch": 0.029376744415472666, + "grad_norm": 0.06591796875, + "learning_rate": 0.001005, + "loss": 1.1125, + "step": 335 + }, + { + "epoch": 0.02946443618984721, + "grad_norm": 0.05419921875, + "learning_rate": 0.001008, + "loss": 1.1321, + "step": 336 + }, + { + "epoch": 0.029552127964221755, + "grad_norm": 0.05810546875, + "learning_rate": 0.0010110000000000002, + "loss": 1.1477, + "step": 337 + }, + { + "epoch": 0.0296398197385963, + "grad_norm": 0.0556640625, + "learning_rate": 0.0010140000000000001, + "loss": 1.0962, + "step": 338 + }, + { + "epoch": 0.029727511512970848, + "grad_norm": 0.0576171875, + "learning_rate": 0.0010170000000000001, + "loss": 1.2346, + "step": 339 + }, + { + "epoch": 0.029815203287345392, + "grad_norm": 0.05615234375, + "learning_rate": 0.00102, + "loss": 1.15, + "step": 340 + }, + { + "epoch": 0.029902895061719937, + "grad_norm": 0.06787109375, + "learning_rate": 0.001023, + "loss": 1.1906, + "step": 341 + }, + { + "epoch": 0.02999058683609448, + "grad_norm": 0.06396484375, + "learning_rate": 0.001026, + "loss": 1.1226, + "step": 342 + }, + { + "epoch": 0.030078278610469026, + "grad_norm": 0.05126953125, + "learning_rate": 0.0010290000000000002, + "loss": 1.1437, + "step": 343 + }, + { + "epoch": 0.030165970384843574, + "grad_norm": 0.0693359375, + "learning_rate": 0.001032, + "loss": 1.1805, + "step": 344 + }, + { + "epoch": 0.03025366215921812, + "grad_norm": 0.07470703125, + "learning_rate": 0.001035, + "loss": 1.1995, + "step": 345 + }, + { + "epoch": 0.030341353933592663, + "grad_norm": 0.068359375, + "learning_rate": 0.0010379999999999999, + "loss": 1.1826, + "step": 346 + }, + { + "epoch": 0.030429045707967207, + "grad_norm": 0.057373046875, + "learning_rate": 0.001041, + "loss": 1.2097, + "step": 347 + }, + { + "epoch": 0.030516737482341755, + "grad_norm": 0.1259765625, + "learning_rate": 0.001044, + "loss": 1.1366, + "step": 348 + }, + { + "epoch": 0.0306044292567163, + "grad_norm": 0.0908203125, + "learning_rate": 0.001047, + "loss": 1.0497, + "step": 349 + }, + { + "epoch": 0.030692121031090844, + "grad_norm": 0.0810546875, + "learning_rate": 0.00105, + "loss": 1.1373, + "step": 350 + }, + { + "epoch": 0.03077981280546539, + "grad_norm": 0.072265625, + "learning_rate": 0.001053, + "loss": 1.154, + "step": 351 + }, + { + "epoch": 0.030867504579839933, + "grad_norm": 0.072265625, + "learning_rate": 0.0010559999999999999, + "loss": 1.0712, + "step": 352 + }, + { + "epoch": 0.03095519635421448, + "grad_norm": 0.0537109375, + "learning_rate": 0.001059, + "loss": 1.1138, + "step": 353 + }, + { + "epoch": 0.031042888128589026, + "grad_norm": 0.058349609375, + "learning_rate": 0.001062, + "loss": 1.1591, + "step": 354 + }, + { + "epoch": 0.03113057990296357, + "grad_norm": 0.07666015625, + "learning_rate": 0.001065, + "loss": 1.1796, + "step": 355 + }, + { + "epoch": 0.031218271677338115, + "grad_norm": 0.06640625, + "learning_rate": 0.001068, + "loss": 1.1627, + "step": 356 + }, + { + "epoch": 0.03130596345171266, + "grad_norm": 0.064453125, + "learning_rate": 0.001071, + "loss": 1.1789, + "step": 357 + }, + { + "epoch": 0.031393655226087204, + "grad_norm": 0.05810546875, + "learning_rate": 0.001074, + "loss": 1.1437, + "step": 358 + }, + { + "epoch": 0.031481347000461755, + "grad_norm": 0.05859375, + "learning_rate": 0.001077, + "loss": 1.135, + "step": 359 + }, + { + "epoch": 0.0315690387748363, + "grad_norm": 0.07275390625, + "learning_rate": 0.00108, + "loss": 1.1739, + "step": 360 + }, + { + "epoch": 0.031656730549210844, + "grad_norm": 0.064453125, + "learning_rate": 0.001083, + "loss": 1.1799, + "step": 361 + }, + { + "epoch": 0.03174442232358539, + "grad_norm": 0.052734375, + "learning_rate": 0.001086, + "loss": 1.1486, + "step": 362 + }, + { + "epoch": 0.03183211409795993, + "grad_norm": 0.07568359375, + "learning_rate": 0.001089, + "loss": 1.1124, + "step": 363 + }, + { + "epoch": 0.03191980587233448, + "grad_norm": 0.072265625, + "learning_rate": 0.001092, + "loss": 1.1203, + "step": 364 + }, + { + "epoch": 0.03200749764670902, + "grad_norm": 0.057373046875, + "learning_rate": 0.001095, + "loss": 1.1634, + "step": 365 + }, + { + "epoch": 0.03209518942108357, + "grad_norm": 0.06787109375, + "learning_rate": 0.001098, + "loss": 1.213, + "step": 366 + }, + { + "epoch": 0.03218288119545811, + "grad_norm": 0.0673828125, + "learning_rate": 0.001101, + "loss": 1.1538, + "step": 367 + }, + { + "epoch": 0.032270572969832656, + "grad_norm": 0.053466796875, + "learning_rate": 0.001104, + "loss": 1.1712, + "step": 368 + }, + { + "epoch": 0.03235826474420721, + "grad_norm": 0.07666015625, + "learning_rate": 0.001107, + "loss": 1.203, + "step": 369 + }, + { + "epoch": 0.03244595651858175, + "grad_norm": 0.052001953125, + "learning_rate": 0.00111, + "loss": 1.1138, + "step": 370 + }, + { + "epoch": 0.032533648292956296, + "grad_norm": 0.055908203125, + "learning_rate": 0.001113, + "loss": 1.2147, + "step": 371 + }, + { + "epoch": 0.03262134006733084, + "grad_norm": 0.06640625, + "learning_rate": 0.001116, + "loss": 1.1656, + "step": 372 + }, + { + "epoch": 0.032709031841705385, + "grad_norm": 0.0556640625, + "learning_rate": 0.001119, + "loss": 1.1468, + "step": 373 + }, + { + "epoch": 0.03279672361607993, + "grad_norm": 0.07373046875, + "learning_rate": 0.001122, + "loss": 1.0946, + "step": 374 + }, + { + "epoch": 0.032884415390454474, + "grad_norm": 0.064453125, + "learning_rate": 0.0011250000000000001, + "loss": 1.138, + "step": 375 + }, + { + "epoch": 0.03297210716482902, + "grad_norm": 0.056884765625, + "learning_rate": 0.001128, + "loss": 1.0943, + "step": 376 + }, + { + "epoch": 0.033059798939203563, + "grad_norm": 0.053955078125, + "learning_rate": 0.001131, + "loss": 1.175, + "step": 377 + }, + { + "epoch": 0.033147490713578115, + "grad_norm": 0.064453125, + "learning_rate": 0.001134, + "loss": 1.1694, + "step": 378 + }, + { + "epoch": 0.03323518248795266, + "grad_norm": 0.07666015625, + "learning_rate": 0.001137, + "loss": 1.1661, + "step": 379 + }, + { + "epoch": 0.033322874262327204, + "grad_norm": 0.056884765625, + "learning_rate": 0.00114, + "loss": 1.156, + "step": 380 + }, + { + "epoch": 0.03341056603670175, + "grad_norm": 0.078125, + "learning_rate": 0.0011430000000000001, + "loss": 1.1612, + "step": 381 + }, + { + "epoch": 0.03349825781107629, + "grad_norm": 0.07568359375, + "learning_rate": 0.001146, + "loss": 1.214, + "step": 382 + }, + { + "epoch": 0.03358594958545084, + "grad_norm": 0.0537109375, + "learning_rate": 0.001149, + "loss": 1.1871, + "step": 383 + }, + { + "epoch": 0.03367364135982538, + "grad_norm": 0.055419921875, + "learning_rate": 0.001152, + "loss": 1.1854, + "step": 384 + }, + { + "epoch": 0.033761333134199926, + "grad_norm": 0.1015625, + "learning_rate": 0.001155, + "loss": 1.2159, + "step": 385 + }, + { + "epoch": 0.03384902490857447, + "grad_norm": 0.07080078125, + "learning_rate": 0.001158, + "loss": 1.1848, + "step": 386 + }, + { + "epoch": 0.03393671668294902, + "grad_norm": 0.06201171875, + "learning_rate": 0.0011610000000000001, + "loss": 1.1314, + "step": 387 + }, + { + "epoch": 0.03402440845732357, + "grad_norm": 0.06591796875, + "learning_rate": 0.001164, + "loss": 1.0823, + "step": 388 + }, + { + "epoch": 0.03411210023169811, + "grad_norm": 0.12451171875, + "learning_rate": 0.001167, + "loss": 1.192, + "step": 389 + }, + { + "epoch": 0.034199792006072656, + "grad_norm": 0.05517578125, + "learning_rate": 0.00117, + "loss": 1.1671, + "step": 390 + }, + { + "epoch": 0.0342874837804472, + "grad_norm": 0.0703125, + "learning_rate": 0.001173, + "loss": 1.199, + "step": 391 + }, + { + "epoch": 0.034375175554821745, + "grad_norm": 0.087890625, + "learning_rate": 0.001176, + "loss": 1.1531, + "step": 392 + }, + { + "epoch": 0.03446286732919629, + "grad_norm": 0.0654296875, + "learning_rate": 0.0011790000000000001, + "loss": 1.1779, + "step": 393 + }, + { + "epoch": 0.034550559103570834, + "grad_norm": 0.0615234375, + "learning_rate": 0.001182, + "loss": 1.2149, + "step": 394 + }, + { + "epoch": 0.03463825087794538, + "grad_norm": 0.0859375, + "learning_rate": 0.001185, + "loss": 1.1088, + "step": 395 + }, + { + "epoch": 0.03472594265231993, + "grad_norm": 0.0810546875, + "learning_rate": 0.001188, + "loss": 1.2079, + "step": 396 + }, + { + "epoch": 0.034813634426694474, + "grad_norm": 0.06884765625, + "learning_rate": 0.001191, + "loss": 1.1626, + "step": 397 + }, + { + "epoch": 0.03490132620106902, + "grad_norm": 0.06787109375, + "learning_rate": 0.0011940000000000002, + "loss": 1.1697, + "step": 398 + }, + { + "epoch": 0.03498901797544356, + "grad_norm": 0.06787109375, + "learning_rate": 0.0011970000000000001, + "loss": 1.0707, + "step": 399 + }, + { + "epoch": 0.03507670974981811, + "grad_norm": 0.0625, + "learning_rate": 0.0012000000000000001, + "loss": 1.1594, + "step": 400 + }, + { + "epoch": 0.03516440152419265, + "grad_norm": 0.05712890625, + "learning_rate": 0.001203, + "loss": 1.1704, + "step": 401 + }, + { + "epoch": 0.0352520932985672, + "grad_norm": 0.05517578125, + "learning_rate": 0.001206, + "loss": 1.2377, + "step": 402 + }, + { + "epoch": 0.03533978507294174, + "grad_norm": 0.0654296875, + "learning_rate": 0.001209, + "loss": 1.1689, + "step": 403 + }, + { + "epoch": 0.035427476847316286, + "grad_norm": 0.09423828125, + "learning_rate": 0.0012120000000000002, + "loss": 1.1144, + "step": 404 + }, + { + "epoch": 0.03551516862169084, + "grad_norm": 0.06396484375, + "learning_rate": 0.0012150000000000002, + "loss": 1.1204, + "step": 405 + }, + { + "epoch": 0.03560286039606538, + "grad_norm": 0.052490234375, + "learning_rate": 0.0012180000000000001, + "loss": 1.0814, + "step": 406 + }, + { + "epoch": 0.035690552170439926, + "grad_norm": 0.06884765625, + "learning_rate": 0.0012209999999999999, + "loss": 1.1226, + "step": 407 + }, + { + "epoch": 0.03577824394481447, + "grad_norm": 0.0615234375, + "learning_rate": 0.001224, + "loss": 1.1736, + "step": 408 + }, + { + "epoch": 0.035865935719189015, + "grad_norm": 0.05908203125, + "learning_rate": 0.001227, + "loss": 1.1062, + "step": 409 + }, + { + "epoch": 0.03595362749356356, + "grad_norm": 0.08349609375, + "learning_rate": 0.00123, + "loss": 1.1278, + "step": 410 + }, + { + "epoch": 0.036041319267938104, + "grad_norm": 0.061279296875, + "learning_rate": 0.001233, + "loss": 1.1471, + "step": 411 + }, + { + "epoch": 0.03612901104231265, + "grad_norm": 0.0615234375, + "learning_rate": 0.001236, + "loss": 1.1663, + "step": 412 + }, + { + "epoch": 0.036216702816687193, + "grad_norm": 0.0673828125, + "learning_rate": 0.0012389999999999999, + "loss": 1.1849, + "step": 413 + }, + { + "epoch": 0.036304394591061745, + "grad_norm": 0.061279296875, + "learning_rate": 0.001242, + "loss": 1.1569, + "step": 414 + }, + { + "epoch": 0.03639208636543629, + "grad_norm": 0.055419921875, + "learning_rate": 0.001245, + "loss": 1.1768, + "step": 415 + }, + { + "epoch": 0.036479778139810834, + "grad_norm": 0.08154296875, + "learning_rate": 0.001248, + "loss": 1.1954, + "step": 416 + }, + { + "epoch": 0.03656746991418538, + "grad_norm": 0.05859375, + "learning_rate": 0.001251, + "loss": 1.1599, + "step": 417 + }, + { + "epoch": 0.03665516168855992, + "grad_norm": 0.07177734375, + "learning_rate": 0.001254, + "loss": 1.1768, + "step": 418 + }, + { + "epoch": 0.03674285346293447, + "grad_norm": 0.09521484375, + "learning_rate": 0.0012569999999999999, + "loss": 1.1885, + "step": 419 + }, + { + "epoch": 0.03683054523730901, + "grad_norm": 0.056884765625, + "learning_rate": 0.00126, + "loss": 1.1622, + "step": 420 + }, + { + "epoch": 0.036918237011683556, + "grad_norm": 0.06982421875, + "learning_rate": 0.001263, + "loss": 1.1106, + "step": 421 + }, + { + "epoch": 0.0370059287860581, + "grad_norm": 0.111328125, + "learning_rate": 0.001266, + "loss": 1.1625, + "step": 422 + }, + { + "epoch": 0.03709362056043265, + "grad_norm": 0.05908203125, + "learning_rate": 0.001269, + "loss": 1.196, + "step": 423 + }, + { + "epoch": 0.0371813123348072, + "grad_norm": 0.08935546875, + "learning_rate": 0.001272, + "loss": 1.1093, + "step": 424 + }, + { + "epoch": 0.03726900410918174, + "grad_norm": 0.099609375, + "learning_rate": 0.001275, + "loss": 1.1551, + "step": 425 + }, + { + "epoch": 0.037356695883556286, + "grad_norm": 0.061767578125, + "learning_rate": 0.001278, + "loss": 1.1507, + "step": 426 + }, + { + "epoch": 0.03744438765793083, + "grad_norm": 0.0986328125, + "learning_rate": 0.001281, + "loss": 1.1926, + "step": 427 + }, + { + "epoch": 0.037532079432305375, + "grad_norm": 0.1005859375, + "learning_rate": 0.001284, + "loss": 1.1639, + "step": 428 + }, + { + "epoch": 0.03761977120667992, + "grad_norm": 0.06689453125, + "learning_rate": 0.001287, + "loss": 1.0905, + "step": 429 + }, + { + "epoch": 0.037707462981054464, + "grad_norm": 0.08349609375, + "learning_rate": 0.00129, + "loss": 1.1512, + "step": 430 + }, + { + "epoch": 0.03779515475542901, + "grad_norm": 0.078125, + "learning_rate": 0.001293, + "loss": 1.2661, + "step": 431 + }, + { + "epoch": 0.03788284652980356, + "grad_norm": 0.056640625, + "learning_rate": 0.001296, + "loss": 1.1454, + "step": 432 + }, + { + "epoch": 0.037970538304178104, + "grad_norm": 0.0625, + "learning_rate": 0.001299, + "loss": 1.1774, + "step": 433 + }, + { + "epoch": 0.03805823007855265, + "grad_norm": 0.07177734375, + "learning_rate": 0.001302, + "loss": 1.1833, + "step": 434 + }, + { + "epoch": 0.03814592185292719, + "grad_norm": 0.058349609375, + "learning_rate": 0.001305, + "loss": 1.1726, + "step": 435 + }, + { + "epoch": 0.03823361362730174, + "grad_norm": 0.0703125, + "learning_rate": 0.001308, + "loss": 1.195, + "step": 436 + }, + { + "epoch": 0.03832130540167628, + "grad_norm": 0.07275390625, + "learning_rate": 0.001311, + "loss": 1.1641, + "step": 437 + }, + { + "epoch": 0.03840899717605083, + "grad_norm": 0.059326171875, + "learning_rate": 0.001314, + "loss": 1.1287, + "step": 438 + }, + { + "epoch": 0.03849668895042537, + "grad_norm": 0.059326171875, + "learning_rate": 0.001317, + "loss": 1.1811, + "step": 439 + }, + { + "epoch": 0.038584380724799916, + "grad_norm": 0.08154296875, + "learning_rate": 0.00132, + "loss": 1.176, + "step": 440 + }, + { + "epoch": 0.03867207249917447, + "grad_norm": 0.057373046875, + "learning_rate": 0.001323, + "loss": 1.1574, + "step": 441 + }, + { + "epoch": 0.03875976427354901, + "grad_norm": 0.059814453125, + "learning_rate": 0.0013260000000000001, + "loss": 1.1575, + "step": 442 + }, + { + "epoch": 0.038847456047923556, + "grad_norm": 0.06884765625, + "learning_rate": 0.001329, + "loss": 1.2033, + "step": 443 + }, + { + "epoch": 0.0389351478222981, + "grad_norm": 0.0751953125, + "learning_rate": 0.001332, + "loss": 1.0864, + "step": 444 + }, + { + "epoch": 0.039022839596672645, + "grad_norm": 0.06298828125, + "learning_rate": 0.001335, + "loss": 1.1684, + "step": 445 + }, + { + "epoch": 0.03911053137104719, + "grad_norm": 0.09375, + "learning_rate": 0.001338, + "loss": 1.1843, + "step": 446 + }, + { + "epoch": 0.039198223145421734, + "grad_norm": 0.1064453125, + "learning_rate": 0.001341, + "loss": 1.1668, + "step": 447 + }, + { + "epoch": 0.03928591491979628, + "grad_norm": 0.08056640625, + "learning_rate": 0.0013440000000000001, + "loss": 1.1332, + "step": 448 + }, + { + "epoch": 0.039373606694170823, + "grad_norm": 0.1025390625, + "learning_rate": 0.001347, + "loss": 1.1679, + "step": 449 + }, + { + "epoch": 0.039461298468545375, + "grad_norm": 0.0517578125, + "learning_rate": 0.00135, + "loss": 1.114, + "step": 450 + }, + { + "epoch": 0.03954899024291992, + "grad_norm": 0.0615234375, + "learning_rate": 0.001353, + "loss": 1.1945, + "step": 451 + }, + { + "epoch": 0.039636682017294464, + "grad_norm": 0.07080078125, + "learning_rate": 0.001356, + "loss": 1.1376, + "step": 452 + }, + { + "epoch": 0.03972437379166901, + "grad_norm": 0.05615234375, + "learning_rate": 0.001359, + "loss": 1.2279, + "step": 453 + }, + { + "epoch": 0.03981206556604355, + "grad_norm": 0.061767578125, + "learning_rate": 0.0013620000000000001, + "loss": 1.1463, + "step": 454 + }, + { + "epoch": 0.0398997573404181, + "grad_norm": 0.061279296875, + "learning_rate": 0.0013650000000000001, + "loss": 1.1059, + "step": 455 + }, + { + "epoch": 0.03998744911479264, + "grad_norm": 0.05859375, + "learning_rate": 0.001368, + "loss": 1.202, + "step": 456 + }, + { + "epoch": 0.040075140889167186, + "grad_norm": 0.07373046875, + "learning_rate": 0.001371, + "loss": 1.1904, + "step": 457 + }, + { + "epoch": 0.04016283266354173, + "grad_norm": 0.08740234375, + "learning_rate": 0.001374, + "loss": 1.1002, + "step": 458 + }, + { + "epoch": 0.04025052443791628, + "grad_norm": 0.0830078125, + "learning_rate": 0.0013770000000000002, + "loss": 1.175, + "step": 459 + }, + { + "epoch": 0.04033821621229083, + "grad_norm": 0.06591796875, + "learning_rate": 0.0013800000000000002, + "loss": 1.1055, + "step": 460 + }, + { + "epoch": 0.04042590798666537, + "grad_norm": 0.0732421875, + "learning_rate": 0.0013830000000000001, + "loss": 1.1636, + "step": 461 + }, + { + "epoch": 0.040513599761039916, + "grad_norm": 0.06396484375, + "learning_rate": 0.001386, + "loss": 1.2022, + "step": 462 + }, + { + "epoch": 0.04060129153541446, + "grad_norm": 0.0830078125, + "learning_rate": 0.001389, + "loss": 1.1555, + "step": 463 + }, + { + "epoch": 0.040688983309789005, + "grad_norm": 0.0869140625, + "learning_rate": 0.001392, + "loss": 1.127, + "step": 464 + }, + { + "epoch": 0.04077667508416355, + "grad_norm": 0.05908203125, + "learning_rate": 0.0013950000000000002, + "loss": 1.1509, + "step": 465 + }, + { + "epoch": 0.040864366858538094, + "grad_norm": 0.1396484375, + "learning_rate": 0.0013980000000000002, + "loss": 1.1856, + "step": 466 + }, + { + "epoch": 0.04095205863291264, + "grad_norm": 0.1044921875, + "learning_rate": 0.0014010000000000001, + "loss": 1.112, + "step": 467 + }, + { + "epoch": 0.04103975040728718, + "grad_norm": 0.0791015625, + "learning_rate": 0.001404, + "loss": 1.1798, + "step": 468 + }, + { + "epoch": 0.041127442181661734, + "grad_norm": 0.10302734375, + "learning_rate": 0.001407, + "loss": 1.157, + "step": 469 + }, + { + "epoch": 0.04121513395603628, + "grad_norm": 0.06787109375, + "learning_rate": 0.00141, + "loss": 1.1253, + "step": 470 + }, + { + "epoch": 0.04130282573041082, + "grad_norm": 0.06640625, + "learning_rate": 0.001413, + "loss": 1.2278, + "step": 471 + }, + { + "epoch": 0.04139051750478537, + "grad_norm": 0.07421875, + "learning_rate": 0.001416, + "loss": 1.1901, + "step": 472 + }, + { + "epoch": 0.04147820927915991, + "grad_norm": 0.058349609375, + "learning_rate": 0.001419, + "loss": 1.1247, + "step": 473 + }, + { + "epoch": 0.04156590105353446, + "grad_norm": 0.06884765625, + "learning_rate": 0.0014219999999999999, + "loss": 1.1976, + "step": 474 + }, + { + "epoch": 0.041653592827909, + "grad_norm": 0.07373046875, + "learning_rate": 0.001425, + "loss": 1.1147, + "step": 475 + }, + { + "epoch": 0.041741284602283546, + "grad_norm": 0.056396484375, + "learning_rate": 0.001428, + "loss": 1.1605, + "step": 476 + }, + { + "epoch": 0.04182897637665809, + "grad_norm": 0.06298828125, + "learning_rate": 0.001431, + "loss": 1.1467, + "step": 477 + }, + { + "epoch": 0.04191666815103264, + "grad_norm": 0.06982421875, + "learning_rate": 0.001434, + "loss": 1.1731, + "step": 478 + }, + { + "epoch": 0.042004359925407186, + "grad_norm": 0.0625, + "learning_rate": 0.001437, + "loss": 1.1652, + "step": 479 + }, + { + "epoch": 0.04209205169978173, + "grad_norm": 0.053466796875, + "learning_rate": 0.0014399999999999999, + "loss": 1.126, + "step": 480 + }, + { + "epoch": 0.042179743474156275, + "grad_norm": 0.05908203125, + "learning_rate": 0.001443, + "loss": 1.1246, + "step": 481 + }, + { + "epoch": 0.04226743524853082, + "grad_norm": 0.05615234375, + "learning_rate": 0.001446, + "loss": 1.1049, + "step": 482 + }, + { + "epoch": 0.042355127022905364, + "grad_norm": 0.0537109375, + "learning_rate": 0.001449, + "loss": 1.1786, + "step": 483 + }, + { + "epoch": 0.04244281879727991, + "grad_norm": 0.055419921875, + "learning_rate": 0.001452, + "loss": 1.1095, + "step": 484 + }, + { + "epoch": 0.042530510571654453, + "grad_norm": 0.05322265625, + "learning_rate": 0.001455, + "loss": 1.1608, + "step": 485 + }, + { + "epoch": 0.042618202346029, + "grad_norm": 0.08203125, + "learning_rate": 0.001458, + "loss": 1.2512, + "step": 486 + }, + { + "epoch": 0.04270589412040355, + "grad_norm": 0.05810546875, + "learning_rate": 0.001461, + "loss": 1.1778, + "step": 487 + }, + { + "epoch": 0.042793585894778094, + "grad_norm": 0.05322265625, + "learning_rate": 0.001464, + "loss": 1.1793, + "step": 488 + }, + { + "epoch": 0.04288127766915264, + "grad_norm": 0.0673828125, + "learning_rate": 0.001467, + "loss": 1.1419, + "step": 489 + }, + { + "epoch": 0.04296896944352718, + "grad_norm": 0.0654296875, + "learning_rate": 0.00147, + "loss": 1.1623, + "step": 490 + }, + { + "epoch": 0.04305666121790173, + "grad_norm": 0.064453125, + "learning_rate": 0.001473, + "loss": 1.162, + "step": 491 + }, + { + "epoch": 0.04314435299227627, + "grad_norm": 0.080078125, + "learning_rate": 0.001476, + "loss": 1.1999, + "step": 492 + }, + { + "epoch": 0.043232044766650816, + "grad_norm": 0.064453125, + "learning_rate": 0.001479, + "loss": 1.1679, + "step": 493 + }, + { + "epoch": 0.04331973654102536, + "grad_norm": 0.0634765625, + "learning_rate": 0.001482, + "loss": 1.1739, + "step": 494 + }, + { + "epoch": 0.043407428315399905, + "grad_norm": 0.05517578125, + "learning_rate": 0.001485, + "loss": 1.1403, + "step": 495 + }, + { + "epoch": 0.04349512008977446, + "grad_norm": 0.06494140625, + "learning_rate": 0.001488, + "loss": 1.1612, + "step": 496 + }, + { + "epoch": 0.043582811864149, + "grad_norm": 0.07373046875, + "learning_rate": 0.001491, + "loss": 1.1721, + "step": 497 + }, + { + "epoch": 0.043670503638523546, + "grad_norm": 0.07373046875, + "learning_rate": 0.001494, + "loss": 1.1623, + "step": 498 + }, + { + "epoch": 0.04375819541289809, + "grad_norm": 0.0693359375, + "learning_rate": 0.001497, + "loss": 1.1506, + "step": 499 + }, + { + "epoch": 0.043845887187272635, + "grad_norm": 0.2021484375, + "learning_rate": 0.0015, + "loss": 1.1622, + "step": 500 + }, + { + "epoch": 0.043845887187272635, + "eval_loss": 1.1753246784210205, + "eval_runtime": 427.0911, + "eval_samples_per_second": 33.827, + "eval_steps_per_second": 8.457, + "step": 500 + }, + { + "epoch": 0.04393357896164718, + "grad_norm": 0.06884765625, + "learning_rate": 0.001503, + "loss": 1.2038, + "step": 501 + }, + { + "epoch": 0.044021270736021724, + "grad_norm": 0.11083984375, + "learning_rate": 0.001506, + "loss": 1.1894, + "step": 502 + }, + { + "epoch": 0.04410896251039627, + "grad_norm": 0.10205078125, + "learning_rate": 0.0015090000000000001, + "loss": 1.1718, + "step": 503 + }, + { + "epoch": 0.04419665428477081, + "grad_norm": 0.083984375, + "learning_rate": 0.001512, + "loss": 1.1739, + "step": 504 + }, + { + "epoch": 0.044284346059145364, + "grad_norm": 0.10546875, + "learning_rate": 0.001515, + "loss": 1.2066, + "step": 505 + }, + { + "epoch": 0.04437203783351991, + "grad_norm": 0.11767578125, + "learning_rate": 0.001518, + "loss": 1.2004, + "step": 506 + }, + { + "epoch": 0.04445972960789445, + "grad_norm": 0.05615234375, + "learning_rate": 0.001521, + "loss": 1.1137, + "step": 507 + }, + { + "epoch": 0.044547421382269, + "grad_norm": 0.068359375, + "learning_rate": 0.001524, + "loss": 1.1641, + "step": 508 + }, + { + "epoch": 0.04463511315664354, + "grad_norm": 0.07177734375, + "learning_rate": 0.0015270000000000001, + "loss": 1.1574, + "step": 509 + }, + { + "epoch": 0.04472280493101809, + "grad_norm": 0.10205078125, + "learning_rate": 0.0015300000000000001, + "loss": 1.1937, + "step": 510 + }, + { + "epoch": 0.04481049670539263, + "grad_norm": 0.09130859375, + "learning_rate": 0.001533, + "loss": 1.1683, + "step": 511 + }, + { + "epoch": 0.044898188479767176, + "grad_norm": 0.0869140625, + "learning_rate": 0.001536, + "loss": 1.1322, + "step": 512 + }, + { + "epoch": 0.04498588025414172, + "grad_norm": 0.06689453125, + "learning_rate": 0.001539, + "loss": 1.2337, + "step": 513 + }, + { + "epoch": 0.04507357202851627, + "grad_norm": 0.06494140625, + "learning_rate": 0.001542, + "loss": 1.166, + "step": 514 + }, + { + "epoch": 0.045161263802890816, + "grad_norm": 0.06884765625, + "learning_rate": 0.0015450000000000001, + "loss": 1.172, + "step": 515 + }, + { + "epoch": 0.04524895557726536, + "grad_norm": 0.06982421875, + "learning_rate": 0.0015480000000000001, + "loss": 1.1241, + "step": 516 + }, + { + "epoch": 0.045336647351639905, + "grad_norm": 0.064453125, + "learning_rate": 0.001551, + "loss": 1.1513, + "step": 517 + }, + { + "epoch": 0.04542433912601445, + "grad_norm": 0.06591796875, + "learning_rate": 0.001554, + "loss": 1.1605, + "step": 518 + }, + { + "epoch": 0.045512030900388994, + "grad_norm": 0.06689453125, + "learning_rate": 0.001557, + "loss": 1.1583, + "step": 519 + }, + { + "epoch": 0.04559972267476354, + "grad_norm": 0.05908203125, + "learning_rate": 0.0015600000000000002, + "loss": 1.2352, + "step": 520 + }, + { + "epoch": 0.045687414449138083, + "grad_norm": 0.06982421875, + "learning_rate": 0.0015630000000000002, + "loss": 1.1565, + "step": 521 + }, + { + "epoch": 0.04577510622351263, + "grad_norm": 0.07177734375, + "learning_rate": 0.0015660000000000001, + "loss": 1.1127, + "step": 522 + }, + { + "epoch": 0.04586279799788718, + "grad_norm": 0.06103515625, + "learning_rate": 0.001569, + "loss": 1.1731, + "step": 523 + }, + { + "epoch": 0.045950489772261724, + "grad_norm": 0.062255859375, + "learning_rate": 0.001572, + "loss": 1.1597, + "step": 524 + }, + { + "epoch": 0.04603818154663627, + "grad_norm": 0.091796875, + "learning_rate": 0.001575, + "loss": 1.1712, + "step": 525 + }, + { + "epoch": 0.04612587332101081, + "grad_norm": 0.06103515625, + "learning_rate": 0.0015780000000000002, + "loss": 1.1518, + "step": 526 + }, + { + "epoch": 0.04621356509538536, + "grad_norm": 0.0576171875, + "learning_rate": 0.0015810000000000002, + "loss": 1.187, + "step": 527 + }, + { + "epoch": 0.0463012568697599, + "grad_norm": 0.059326171875, + "learning_rate": 0.0015840000000000001, + "loss": 1.2328, + "step": 528 + }, + { + "epoch": 0.046388948644134446, + "grad_norm": 0.057861328125, + "learning_rate": 0.001587, + "loss": 1.1717, + "step": 529 + }, + { + "epoch": 0.04647664041850899, + "grad_norm": 0.07763671875, + "learning_rate": 0.00159, + "loss": 1.195, + "step": 530 + }, + { + "epoch": 0.046564332192883535, + "grad_norm": 0.06982421875, + "learning_rate": 0.001593, + "loss": 1.1765, + "step": 531 + }, + { + "epoch": 0.04665202396725809, + "grad_norm": 0.072265625, + "learning_rate": 0.0015960000000000002, + "loss": 1.1699, + "step": 532 + }, + { + "epoch": 0.04673971574163263, + "grad_norm": 0.06591796875, + "learning_rate": 0.0015990000000000002, + "loss": 1.2119, + "step": 533 + }, + { + "epoch": 0.046827407516007176, + "grad_norm": 0.0654296875, + "learning_rate": 0.0016020000000000001, + "loss": 1.1685, + "step": 534 + }, + { + "epoch": 0.04691509929038172, + "grad_norm": 0.06689453125, + "learning_rate": 0.001605, + "loss": 1.1741, + "step": 535 + }, + { + "epoch": 0.047002791064756265, + "grad_norm": 0.0791015625, + "learning_rate": 0.001608, + "loss": 1.1958, + "step": 536 + }, + { + "epoch": 0.04709048283913081, + "grad_norm": 0.0634765625, + "learning_rate": 0.0016110000000000002, + "loss": 1.1116, + "step": 537 + }, + { + "epoch": 0.047178174613505354, + "grad_norm": 0.064453125, + "learning_rate": 0.0016140000000000002, + "loss": 1.1486, + "step": 538 + }, + { + "epoch": 0.0472658663878799, + "grad_norm": 0.07177734375, + "learning_rate": 0.0016170000000000002, + "loss": 1.2106, + "step": 539 + }, + { + "epoch": 0.04735355816225444, + "grad_norm": 0.06787109375, + "learning_rate": 0.0016200000000000001, + "loss": 1.084, + "step": 540 + }, + { + "epoch": 0.047441249936628994, + "grad_norm": 0.08740234375, + "learning_rate": 0.001623, + "loss": 1.1961, + "step": 541 + }, + { + "epoch": 0.04752894171100354, + "grad_norm": 0.064453125, + "learning_rate": 0.001626, + "loss": 1.1434, + "step": 542 + }, + { + "epoch": 0.04761663348537808, + "grad_norm": 0.07470703125, + "learning_rate": 0.0016290000000000002, + "loss": 1.104, + "step": 543 + }, + { + "epoch": 0.04770432525975263, + "grad_norm": 0.07763671875, + "learning_rate": 0.0016320000000000002, + "loss": 1.1166, + "step": 544 + }, + { + "epoch": 0.04779201703412717, + "grad_norm": 0.056884765625, + "learning_rate": 0.0016350000000000002, + "loss": 1.124, + "step": 545 + }, + { + "epoch": 0.04787970880850172, + "grad_norm": 0.080078125, + "learning_rate": 0.0016380000000000001, + "loss": 1.1609, + "step": 546 + }, + { + "epoch": 0.04796740058287626, + "grad_norm": 0.08984375, + "learning_rate": 0.001641, + "loss": 1.2102, + "step": 547 + }, + { + "epoch": 0.048055092357250806, + "grad_norm": 0.06982421875, + "learning_rate": 0.001644, + "loss": 1.1698, + "step": 548 + }, + { + "epoch": 0.04814278413162535, + "grad_norm": 0.09521484375, + "learning_rate": 0.0016470000000000002, + "loss": 1.1735, + "step": 549 + }, + { + "epoch": 0.0482304759059999, + "grad_norm": 0.07421875, + "learning_rate": 0.0016500000000000002, + "loss": 1.2511, + "step": 550 + }, + { + "epoch": 0.048318167680374446, + "grad_norm": 0.07666015625, + "learning_rate": 0.0016530000000000002, + "loss": 1.2098, + "step": 551 + }, + { + "epoch": 0.04840585945474899, + "grad_norm": 0.09521484375, + "learning_rate": 0.0016560000000000001, + "loss": 1.1615, + "step": 552 + }, + { + "epoch": 0.048493551229123535, + "grad_norm": 0.07666015625, + "learning_rate": 0.001659, + "loss": 1.1997, + "step": 553 + }, + { + "epoch": 0.04858124300349808, + "grad_norm": 0.05517578125, + "learning_rate": 0.0016620000000000003, + "loss": 1.1446, + "step": 554 + }, + { + "epoch": 0.048668934777872624, + "grad_norm": 0.06201171875, + "learning_rate": 0.0016650000000000002, + "loss": 1.2135, + "step": 555 + }, + { + "epoch": 0.04875662655224717, + "grad_norm": 0.08154296875, + "learning_rate": 0.0016680000000000002, + "loss": 1.156, + "step": 556 + }, + { + "epoch": 0.04884431832662171, + "grad_norm": 0.06298828125, + "learning_rate": 0.0016710000000000002, + "loss": 1.208, + "step": 557 + }, + { + "epoch": 0.04893201010099626, + "grad_norm": 0.08935546875, + "learning_rate": 0.0016740000000000001, + "loss": 1.1824, + "step": 558 + }, + { + "epoch": 0.04901970187537081, + "grad_norm": 0.072265625, + "learning_rate": 0.001677, + "loss": 1.124, + "step": 559 + }, + { + "epoch": 0.049107393649745354, + "grad_norm": 0.059814453125, + "learning_rate": 0.0016800000000000003, + "loss": 1.2047, + "step": 560 + }, + { + "epoch": 0.0491950854241199, + "grad_norm": 0.057861328125, + "learning_rate": 0.0016830000000000003, + "loss": 1.174, + "step": 561 + }, + { + "epoch": 0.04928277719849444, + "grad_norm": 0.05712890625, + "learning_rate": 0.0016860000000000002, + "loss": 1.1866, + "step": 562 + }, + { + "epoch": 0.04937046897286899, + "grad_norm": 0.08642578125, + "learning_rate": 0.001689, + "loss": 1.1495, + "step": 563 + }, + { + "epoch": 0.04945816074724353, + "grad_norm": 0.060546875, + "learning_rate": 0.001692, + "loss": 1.1851, + "step": 564 + }, + { + "epoch": 0.049545852521618076, + "grad_norm": 0.09326171875, + "learning_rate": 0.001695, + "loss": 1.1824, + "step": 565 + }, + { + "epoch": 0.04963354429599262, + "grad_norm": 0.0625, + "learning_rate": 0.0016979999999999999, + "loss": 1.1339, + "step": 566 + }, + { + "epoch": 0.049721236070367165, + "grad_norm": 0.09130859375, + "learning_rate": 0.0017009999999999998, + "loss": 1.1914, + "step": 567 + }, + { + "epoch": 0.04980892784474172, + "grad_norm": 0.06689453125, + "learning_rate": 0.0017039999999999998, + "loss": 1.1541, + "step": 568 + }, + { + "epoch": 0.04989661961911626, + "grad_norm": 0.072265625, + "learning_rate": 0.001707, + "loss": 1.1928, + "step": 569 + }, + { + "epoch": 0.049984311393490806, + "grad_norm": 0.083984375, + "learning_rate": 0.00171, + "loss": 1.1804, + "step": 570 + }, + { + "epoch": 0.05007200316786535, + "grad_norm": 0.08203125, + "learning_rate": 0.001713, + "loss": 1.2177, + "step": 571 + }, + { + "epoch": 0.050159694942239895, + "grad_norm": 0.06884765625, + "learning_rate": 0.0017159999999999999, + "loss": 1.1609, + "step": 572 + }, + { + "epoch": 0.05024738671661444, + "grad_norm": 0.061279296875, + "learning_rate": 0.0017189999999999998, + "loss": 1.1638, + "step": 573 + }, + { + "epoch": 0.050335078490988984, + "grad_norm": 0.059814453125, + "learning_rate": 0.001722, + "loss": 1.1652, + "step": 574 + }, + { + "epoch": 0.05042277026536353, + "grad_norm": 0.056640625, + "learning_rate": 0.001725, + "loss": 1.139, + "step": 575 + }, + { + "epoch": 0.05051046203973807, + "grad_norm": 0.060791015625, + "learning_rate": 0.001728, + "loss": 1.1755, + "step": 576 + }, + { + "epoch": 0.05059815381411262, + "grad_norm": 0.058837890625, + "learning_rate": 0.001731, + "loss": 1.1723, + "step": 577 + }, + { + "epoch": 0.05068584558848717, + "grad_norm": 0.06689453125, + "learning_rate": 0.0017339999999999999, + "loss": 1.2931, + "step": 578 + }, + { + "epoch": 0.05077353736286171, + "grad_norm": 0.1025390625, + "learning_rate": 0.0017369999999999998, + "loss": 1.2084, + "step": 579 + }, + { + "epoch": 0.05086122913723626, + "grad_norm": 0.08056640625, + "learning_rate": 0.00174, + "loss": 1.1393, + "step": 580 + }, + { + "epoch": 0.0509489209116108, + "grad_norm": 0.09814453125, + "learning_rate": 0.001743, + "loss": 1.1763, + "step": 581 + }, + { + "epoch": 0.05103661268598535, + "grad_norm": 0.060546875, + "learning_rate": 0.001746, + "loss": 1.1301, + "step": 582 + }, + { + "epoch": 0.05112430446035989, + "grad_norm": 0.10009765625, + "learning_rate": 0.001749, + "loss": 1.2042, + "step": 583 + }, + { + "epoch": 0.051211996234734436, + "grad_norm": 0.07958984375, + "learning_rate": 0.0017519999999999999, + "loss": 1.1959, + "step": 584 + }, + { + "epoch": 0.05129968800910898, + "grad_norm": 0.08203125, + "learning_rate": 0.0017549999999999998, + "loss": 1.1316, + "step": 585 + }, + { + "epoch": 0.051387379783483525, + "grad_norm": 0.072265625, + "learning_rate": 0.001758, + "loss": 1.2323, + "step": 586 + }, + { + "epoch": 0.051475071557858076, + "grad_norm": 0.059326171875, + "learning_rate": 0.001761, + "loss": 1.1328, + "step": 587 + }, + { + "epoch": 0.05156276333223262, + "grad_norm": 0.06982421875, + "learning_rate": 0.001764, + "loss": 1.1951, + "step": 588 + }, + { + "epoch": 0.051650455106607165, + "grad_norm": 0.0625, + "learning_rate": 0.001767, + "loss": 1.1176, + "step": 589 + }, + { + "epoch": 0.05173814688098171, + "grad_norm": 0.07373046875, + "learning_rate": 0.0017699999999999999, + "loss": 1.1789, + "step": 590 + }, + { + "epoch": 0.051825838655356254, + "grad_norm": 0.06298828125, + "learning_rate": 0.001773, + "loss": 1.161, + "step": 591 + }, + { + "epoch": 0.0519135304297308, + "grad_norm": 0.061279296875, + "learning_rate": 0.001776, + "loss": 1.194, + "step": 592 + }, + { + "epoch": 0.05200122220410534, + "grad_norm": 0.072265625, + "learning_rate": 0.001779, + "loss": 1.2048, + "step": 593 + }, + { + "epoch": 0.05208891397847989, + "grad_norm": 0.072265625, + "learning_rate": 0.001782, + "loss": 1.2059, + "step": 594 + }, + { + "epoch": 0.05217660575285443, + "grad_norm": 0.0712890625, + "learning_rate": 0.001785, + "loss": 1.1874, + "step": 595 + }, + { + "epoch": 0.052264297527228984, + "grad_norm": 0.08984375, + "learning_rate": 0.0017879999999999999, + "loss": 1.221, + "step": 596 + }, + { + "epoch": 0.05235198930160353, + "grad_norm": 0.064453125, + "learning_rate": 0.001791, + "loss": 1.165, + "step": 597 + }, + { + "epoch": 0.05243968107597807, + "grad_norm": 0.08935546875, + "learning_rate": 0.001794, + "loss": 1.2116, + "step": 598 + }, + { + "epoch": 0.05252737285035262, + "grad_norm": 0.07275390625, + "learning_rate": 0.001797, + "loss": 1.1841, + "step": 599 + }, + { + "epoch": 0.05261506462472716, + "grad_norm": 0.09130859375, + "learning_rate": 0.0018, + "loss": 1.1804, + "step": 600 + }, + { + "epoch": 0.052702756399101706, + "grad_norm": 0.060546875, + "learning_rate": 0.001803, + "loss": 1.1746, + "step": 601 + }, + { + "epoch": 0.05279044817347625, + "grad_norm": 0.068359375, + "learning_rate": 0.0018059999999999999, + "loss": 1.2058, + "step": 602 + }, + { + "epoch": 0.052878139947850795, + "grad_norm": 0.060546875, + "learning_rate": 0.001809, + "loss": 1.1783, + "step": 603 + }, + { + "epoch": 0.05296583172222534, + "grad_norm": 0.1318359375, + "learning_rate": 0.001812, + "loss": 1.1468, + "step": 604 + }, + { + "epoch": 0.05305352349659989, + "grad_norm": 0.062255859375, + "learning_rate": 0.001815, + "loss": 1.1409, + "step": 605 + }, + { + "epoch": 0.053141215270974436, + "grad_norm": 0.091796875, + "learning_rate": 0.001818, + "loss": 1.1699, + "step": 606 + }, + { + "epoch": 0.05322890704534898, + "grad_norm": 0.0771484375, + "learning_rate": 0.001821, + "loss": 1.1966, + "step": 607 + }, + { + "epoch": 0.053316598819723525, + "grad_norm": 0.055419921875, + "learning_rate": 0.001824, + "loss": 1.1734, + "step": 608 + }, + { + "epoch": 0.05340429059409807, + "grad_norm": 0.07666015625, + "learning_rate": 0.001827, + "loss": 1.183, + "step": 609 + }, + { + "epoch": 0.053491982368472614, + "grad_norm": 0.07421875, + "learning_rate": 0.00183, + "loss": 1.1807, + "step": 610 + }, + { + "epoch": 0.05357967414284716, + "grad_norm": 0.064453125, + "learning_rate": 0.001833, + "loss": 1.164, + "step": 611 + }, + { + "epoch": 0.0536673659172217, + "grad_norm": 0.0634765625, + "learning_rate": 0.001836, + "loss": 1.1253, + "step": 612 + }, + { + "epoch": 0.05375505769159625, + "grad_norm": 0.07861328125, + "learning_rate": 0.001839, + "loss": 1.1593, + "step": 613 + }, + { + "epoch": 0.0538427494659708, + "grad_norm": 0.0595703125, + "learning_rate": 0.001842, + "loss": 1.2045, + "step": 614 + }, + { + "epoch": 0.05393044124034534, + "grad_norm": 0.05810546875, + "learning_rate": 0.001845, + "loss": 1.1429, + "step": 615 + }, + { + "epoch": 0.05401813301471989, + "grad_norm": 0.064453125, + "learning_rate": 0.001848, + "loss": 1.1433, + "step": 616 + }, + { + "epoch": 0.05410582478909443, + "grad_norm": 0.054931640625, + "learning_rate": 0.001851, + "loss": 1.1286, + "step": 617 + }, + { + "epoch": 0.05419351656346898, + "grad_norm": 0.06396484375, + "learning_rate": 0.001854, + "loss": 1.1921, + "step": 618 + }, + { + "epoch": 0.05428120833784352, + "grad_norm": 0.07666015625, + "learning_rate": 0.001857, + "loss": 1.1742, + "step": 619 + }, + { + "epoch": 0.054368900112218066, + "grad_norm": 0.09814453125, + "learning_rate": 0.00186, + "loss": 1.2216, + "step": 620 + }, + { + "epoch": 0.05445659188659261, + "grad_norm": 0.059814453125, + "learning_rate": 0.001863, + "loss": 1.1694, + "step": 621 + }, + { + "epoch": 0.054544283660967155, + "grad_norm": 0.06787109375, + "learning_rate": 0.001866, + "loss": 1.1512, + "step": 622 + }, + { + "epoch": 0.054631975435341706, + "grad_norm": 0.06298828125, + "learning_rate": 0.001869, + "loss": 1.1755, + "step": 623 + }, + { + "epoch": 0.05471966720971625, + "grad_norm": 0.05615234375, + "learning_rate": 0.001872, + "loss": 1.1805, + "step": 624 + }, + { + "epoch": 0.054807358984090795, + "grad_norm": 0.06396484375, + "learning_rate": 0.001875, + "loss": 1.2201, + "step": 625 + }, + { + "epoch": 0.05489505075846534, + "grad_norm": 0.07568359375, + "learning_rate": 0.0018780000000000001, + "loss": 1.255, + "step": 626 + }, + { + "epoch": 0.054982742532839884, + "grad_norm": 0.099609375, + "learning_rate": 0.001881, + "loss": 1.1907, + "step": 627 + }, + { + "epoch": 0.05507043430721443, + "grad_norm": 0.06298828125, + "learning_rate": 0.001884, + "loss": 1.1669, + "step": 628 + }, + { + "epoch": 0.05515812608158897, + "grad_norm": 0.08935546875, + "learning_rate": 0.001887, + "loss": 1.1855, + "step": 629 + }, + { + "epoch": 0.05524581785596352, + "grad_norm": 0.07177734375, + "learning_rate": 0.00189, + "loss": 1.1983, + "step": 630 + }, + { + "epoch": 0.05533350963033806, + "grad_norm": 0.087890625, + "learning_rate": 0.0018930000000000002, + "loss": 1.2005, + "step": 631 + }, + { + "epoch": 0.055421201404712614, + "grad_norm": 0.06689453125, + "learning_rate": 0.0018960000000000001, + "loss": 1.1762, + "step": 632 + }, + { + "epoch": 0.05550889317908716, + "grad_norm": 0.10009765625, + "learning_rate": 0.001899, + "loss": 1.2092, + "step": 633 + }, + { + "epoch": 0.0555965849534617, + "grad_norm": 0.0859375, + "learning_rate": 0.001902, + "loss": 1.1696, + "step": 634 + }, + { + "epoch": 0.05568427672783625, + "grad_norm": 0.103515625, + "learning_rate": 0.001905, + "loss": 1.218, + "step": 635 + }, + { + "epoch": 0.05577196850221079, + "grad_norm": 0.138671875, + "learning_rate": 0.001908, + "loss": 1.2202, + "step": 636 + }, + { + "epoch": 0.055859660276585336, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019110000000000002, + "loss": 1.1392, + "step": 637 + }, + { + "epoch": 0.05594735205095988, + "grad_norm": 0.1083984375, + "learning_rate": 0.0019140000000000001, + "loss": 1.1393, + "step": 638 + }, + { + "epoch": 0.056035043825334425, + "grad_norm": 0.072265625, + "learning_rate": 0.001917, + "loss": 1.1697, + "step": 639 + }, + { + "epoch": 0.05612273559970897, + "grad_norm": 0.08447265625, + "learning_rate": 0.00192, + "loss": 1.1154, + "step": 640 + }, + { + "epoch": 0.05621042737408352, + "grad_norm": 0.087890625, + "learning_rate": 0.001923, + "loss": 1.1602, + "step": 641 + }, + { + "epoch": 0.056298119148458066, + "grad_norm": 0.07080078125, + "learning_rate": 0.001926, + "loss": 1.163, + "step": 642 + }, + { + "epoch": 0.05638581092283261, + "grad_norm": 0.10400390625, + "learning_rate": 0.0019290000000000002, + "loss": 1.1692, + "step": 643 + }, + { + "epoch": 0.056473502697207155, + "grad_norm": 0.07080078125, + "learning_rate": 0.0019320000000000001, + "loss": 1.1855, + "step": 644 + }, + { + "epoch": 0.0565611944715817, + "grad_norm": 0.06982421875, + "learning_rate": 0.001935, + "loss": 1.1763, + "step": 645 + }, + { + "epoch": 0.056648886245956244, + "grad_norm": 0.115234375, + "learning_rate": 0.001938, + "loss": 1.1464, + "step": 646 + }, + { + "epoch": 0.05673657802033079, + "grad_norm": 0.060791015625, + "learning_rate": 0.001941, + "loss": 1.2453, + "step": 647 + }, + { + "epoch": 0.05682426979470533, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019440000000000002, + "loss": 1.1012, + "step": 648 + }, + { + "epoch": 0.05691196156907988, + "grad_norm": 0.08251953125, + "learning_rate": 0.0019470000000000002, + "loss": 1.1887, + "step": 649 + }, + { + "epoch": 0.05699965334345443, + "grad_norm": 0.0693359375, + "learning_rate": 0.0019500000000000001, + "loss": 1.1739, + "step": 650 + }, + { + "epoch": 0.05708734511782897, + "grad_norm": 0.0751953125, + "learning_rate": 0.001953, + "loss": 1.1621, + "step": 651 + }, + { + "epoch": 0.05717503689220352, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019560000000000003, + "loss": 1.2449, + "step": 652 + }, + { + "epoch": 0.05726272866657806, + "grad_norm": 0.06103515625, + "learning_rate": 0.0019590000000000002, + "loss": 1.2406, + "step": 653 + }, + { + "epoch": 0.05735042044095261, + "grad_norm": 0.0634765625, + "learning_rate": 0.001962, + "loss": 1.2487, + "step": 654 + }, + { + "epoch": 0.05743811221532715, + "grad_norm": 0.06201171875, + "learning_rate": 0.001965, + "loss": 1.1728, + "step": 655 + }, + { + "epoch": 0.057525803989701696, + "grad_norm": 0.09912109375, + "learning_rate": 0.001968, + "loss": 1.2062, + "step": 656 + }, + { + "epoch": 0.05761349576407624, + "grad_norm": 0.0703125, + "learning_rate": 0.001971, + "loss": 1.1946, + "step": 657 + }, + { + "epoch": 0.057701187538450785, + "grad_norm": 0.08349609375, + "learning_rate": 0.001974, + "loss": 1.1645, + "step": 658 + }, + { + "epoch": 0.057788879312825336, + "grad_norm": 0.08251953125, + "learning_rate": 0.001977, + "loss": 1.1662, + "step": 659 + }, + { + "epoch": 0.05787657108719988, + "grad_norm": 0.087890625, + "learning_rate": 0.00198, + "loss": 1.1606, + "step": 660 + }, + { + "epoch": 0.057964262861574425, + "grad_norm": 0.064453125, + "learning_rate": 0.001983, + "loss": 1.1823, + "step": 661 + }, + { + "epoch": 0.05805195463594897, + "grad_norm": 0.068359375, + "learning_rate": 0.0019860000000000004, + "loss": 1.2349, + "step": 662 + }, + { + "epoch": 0.058139646410323514, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019890000000000003, + "loss": 1.1962, + "step": 663 + }, + { + "epoch": 0.05822733818469806, + "grad_norm": 0.07080078125, + "learning_rate": 0.0019920000000000003, + "loss": 1.2109, + "step": 664 + }, + { + "epoch": 0.0583150299590726, + "grad_norm": 0.0732421875, + "learning_rate": 0.0019950000000000002, + "loss": 1.2259, + "step": 665 + }, + { + "epoch": 0.05840272173344715, + "grad_norm": 0.0791015625, + "learning_rate": 0.001998, + "loss": 1.2124, + "step": 666 + }, + { + "epoch": 0.05849041350782169, + "grad_norm": 0.08349609375, + "learning_rate": 0.002001, + "loss": 1.1655, + "step": 667 + }, + { + "epoch": 0.058578105282196244, + "grad_norm": 0.078125, + "learning_rate": 0.002004, + "loss": 1.204, + "step": 668 + }, + { + "epoch": 0.05866579705657079, + "grad_norm": 0.0703125, + "learning_rate": 0.002007, + "loss": 1.1704, + "step": 669 + }, + { + "epoch": 0.05875348883094533, + "grad_norm": 0.08447265625, + "learning_rate": 0.00201, + "loss": 1.1911, + "step": 670 + }, + { + "epoch": 0.05884118060531988, + "grad_norm": 0.08984375, + "learning_rate": 0.002013, + "loss": 1.1369, + "step": 671 + }, + { + "epoch": 0.05892887237969442, + "grad_norm": 0.053955078125, + "learning_rate": 0.002016, + "loss": 1.1515, + "step": 672 + }, + { + "epoch": 0.059016564154068966, + "grad_norm": 0.08642578125, + "learning_rate": 0.002019, + "loss": 1.1569, + "step": 673 + }, + { + "epoch": 0.05910425592844351, + "grad_norm": 0.06298828125, + "learning_rate": 0.0020220000000000004, + "loss": 1.178, + "step": 674 + }, + { + "epoch": 0.059191947702818055, + "grad_norm": 0.06884765625, + "learning_rate": 0.0020250000000000003, + "loss": 1.1742, + "step": 675 + }, + { + "epoch": 0.0592796394771926, + "grad_norm": 0.0859375, + "learning_rate": 0.0020280000000000003, + "loss": 1.2247, + "step": 676 + }, + { + "epoch": 0.059367331251567144, + "grad_norm": 0.06298828125, + "learning_rate": 0.0020310000000000003, + "loss": 1.2672, + "step": 677 + }, + { + "epoch": 0.059455023025941696, + "grad_norm": 0.1025390625, + "learning_rate": 0.0020340000000000002, + "loss": 1.1972, + "step": 678 + }, + { + "epoch": 0.05954271480031624, + "grad_norm": 0.061279296875, + "learning_rate": 0.002037, + "loss": 1.1777, + "step": 679 + }, + { + "epoch": 0.059630406574690785, + "grad_norm": 0.0888671875, + "learning_rate": 0.00204, + "loss": 1.1733, + "step": 680 + }, + { + "epoch": 0.05971809834906533, + "grad_norm": 0.06494140625, + "learning_rate": 0.002043, + "loss": 1.1494, + "step": 681 + }, + { + "epoch": 0.059805790123439874, + "grad_norm": 0.06591796875, + "learning_rate": 0.002046, + "loss": 1.1838, + "step": 682 + }, + { + "epoch": 0.05989348189781442, + "grad_norm": 0.060302734375, + "learning_rate": 0.002049, + "loss": 1.2084, + "step": 683 + }, + { + "epoch": 0.05998117367218896, + "grad_norm": 0.064453125, + "learning_rate": 0.002052, + "loss": 1.1681, + "step": 684 + }, + { + "epoch": 0.06006886544656351, + "grad_norm": 0.08642578125, + "learning_rate": 0.0020550000000000004, + "loss": 1.2024, + "step": 685 + }, + { + "epoch": 0.06015655722093805, + "grad_norm": 0.06689453125, + "learning_rate": 0.0020580000000000004, + "loss": 1.1547, + "step": 686 + }, + { + "epoch": 0.0602442489953126, + "grad_norm": 0.0654296875, + "learning_rate": 0.0020610000000000003, + "loss": 1.226, + "step": 687 + }, + { + "epoch": 0.06033194076968715, + "grad_norm": 0.07421875, + "learning_rate": 0.002064, + "loss": 1.1922, + "step": 688 + }, + { + "epoch": 0.06041963254406169, + "grad_norm": 0.05810546875, + "learning_rate": 0.002067, + "loss": 1.1949, + "step": 689 + }, + { + "epoch": 0.06050732431843624, + "grad_norm": 0.0595703125, + "learning_rate": 0.00207, + "loss": 1.2049, + "step": 690 + }, + { + "epoch": 0.06059501609281078, + "grad_norm": 0.061279296875, + "learning_rate": 0.0020729999999999998, + "loss": 1.1883, + "step": 691 + }, + { + "epoch": 0.060682707867185326, + "grad_norm": 0.08740234375, + "learning_rate": 0.0020759999999999997, + "loss": 1.2099, + "step": 692 + }, + { + "epoch": 0.06077039964155987, + "grad_norm": 0.08251953125, + "learning_rate": 0.0020789999999999997, + "loss": 1.1957, + "step": 693 + }, + { + "epoch": 0.060858091415934415, + "grad_norm": 0.06494140625, + "learning_rate": 0.002082, + "loss": 1.2148, + "step": 694 + }, + { + "epoch": 0.06094578319030896, + "grad_norm": 0.05859375, + "learning_rate": 0.002085, + "loss": 1.1443, + "step": 695 + }, + { + "epoch": 0.06103347496468351, + "grad_norm": 0.058349609375, + "learning_rate": 0.002088, + "loss": 1.1784, + "step": 696 + }, + { + "epoch": 0.061121166739058055, + "grad_norm": 0.07275390625, + "learning_rate": 0.002091, + "loss": 1.1608, + "step": 697 + }, + { + "epoch": 0.0612088585134326, + "grad_norm": 0.08349609375, + "learning_rate": 0.002094, + "loss": 1.1311, + "step": 698 + }, + { + "epoch": 0.061296550287807144, + "grad_norm": 0.06884765625, + "learning_rate": 0.002097, + "loss": 1.2302, + "step": 699 + }, + { + "epoch": 0.06138424206218169, + "grad_norm": 0.0771484375, + "learning_rate": 0.0021, + "loss": 1.2455, + "step": 700 + }, + { + "epoch": 0.06147193383655623, + "grad_norm": 0.08154296875, + "learning_rate": 0.002103, + "loss": 1.2083, + "step": 701 + }, + { + "epoch": 0.06155962561093078, + "grad_norm": 0.06689453125, + "learning_rate": 0.002106, + "loss": 1.1682, + "step": 702 + }, + { + "epoch": 0.06164731738530532, + "grad_norm": 0.0712890625, + "learning_rate": 0.0021089999999999998, + "loss": 1.1818, + "step": 703 + }, + { + "epoch": 0.06173500915967987, + "grad_norm": 0.09423828125, + "learning_rate": 0.0021119999999999997, + "loss": 1.2536, + "step": 704 + }, + { + "epoch": 0.06182270093405442, + "grad_norm": 0.06640625, + "learning_rate": 0.002115, + "loss": 1.154, + "step": 705 + }, + { + "epoch": 0.06191039270842896, + "grad_norm": 0.140625, + "learning_rate": 0.002118, + "loss": 1.1649, + "step": 706 + }, + { + "epoch": 0.06199808448280351, + "grad_norm": 0.07666015625, + "learning_rate": 0.002121, + "loss": 1.2484, + "step": 707 + }, + { + "epoch": 0.06208577625717805, + "grad_norm": 0.11083984375, + "learning_rate": 0.002124, + "loss": 1.2103, + "step": 708 + }, + { + "epoch": 0.062173468031552596, + "grad_norm": 0.0732421875, + "learning_rate": 0.002127, + "loss": 1.1832, + "step": 709 + }, + { + "epoch": 0.06226115980592714, + "grad_norm": 0.06494140625, + "learning_rate": 0.00213, + "loss": 1.1885, + "step": 710 + }, + { + "epoch": 0.062348851580301685, + "grad_norm": 0.06396484375, + "learning_rate": 0.002133, + "loss": 1.2414, + "step": 711 + }, + { + "epoch": 0.06243654335467623, + "grad_norm": 0.09912109375, + "learning_rate": 0.002136, + "loss": 1.2285, + "step": 712 + }, + { + "epoch": 0.06252423512905078, + "grad_norm": 0.062255859375, + "learning_rate": 0.002139, + "loss": 1.2218, + "step": 713 + }, + { + "epoch": 0.06261192690342532, + "grad_norm": 0.0908203125, + "learning_rate": 0.002142, + "loss": 1.201, + "step": 714 + }, + { + "epoch": 0.06269961867779987, + "grad_norm": 0.1083984375, + "learning_rate": 0.0021449999999999998, + "loss": 1.1846, + "step": 715 + }, + { + "epoch": 0.06278731045217441, + "grad_norm": 0.07763671875, + "learning_rate": 0.002148, + "loss": 1.1882, + "step": 716 + }, + { + "epoch": 0.06287500222654896, + "grad_norm": 0.107421875, + "learning_rate": 0.002151, + "loss": 1.2032, + "step": 717 + }, + { + "epoch": 0.06296269400092351, + "grad_norm": 0.058349609375, + "learning_rate": 0.002154, + "loss": 1.2151, + "step": 718 + }, + { + "epoch": 0.06305038577529805, + "grad_norm": 0.07861328125, + "learning_rate": 0.002157, + "loss": 1.1866, + "step": 719 + }, + { + "epoch": 0.0631380775496726, + "grad_norm": 0.0751953125, + "learning_rate": 0.00216, + "loss": 1.1822, + "step": 720 + }, + { + "epoch": 0.06322576932404714, + "grad_norm": 0.06396484375, + "learning_rate": 0.002163, + "loss": 1.2161, + "step": 721 + }, + { + "epoch": 0.06331346109842169, + "grad_norm": 0.05908203125, + "learning_rate": 0.002166, + "loss": 1.1923, + "step": 722 + }, + { + "epoch": 0.06340115287279623, + "grad_norm": 0.07763671875, + "learning_rate": 0.002169, + "loss": 1.1727, + "step": 723 + }, + { + "epoch": 0.06348884464717078, + "grad_norm": 0.064453125, + "learning_rate": 0.002172, + "loss": 1.1592, + "step": 724 + }, + { + "epoch": 0.06357653642154532, + "grad_norm": 0.08642578125, + "learning_rate": 0.002175, + "loss": 1.1971, + "step": 725 + }, + { + "epoch": 0.06366422819591987, + "grad_norm": 0.068359375, + "learning_rate": 0.002178, + "loss": 1.2687, + "step": 726 + }, + { + "epoch": 0.0637519199702944, + "grad_norm": 0.1318359375, + "learning_rate": 0.0021809999999999998, + "loss": 1.2438, + "step": 727 + }, + { + "epoch": 0.06383961174466896, + "grad_norm": 0.061767578125, + "learning_rate": 0.002184, + "loss": 1.1864, + "step": 728 + }, + { + "epoch": 0.06392730351904351, + "grad_norm": 0.09765625, + "learning_rate": 0.002187, + "loss": 1.1569, + "step": 729 + }, + { + "epoch": 0.06401499529341804, + "grad_norm": 0.0625, + "learning_rate": 0.00219, + "loss": 1.203, + "step": 730 + }, + { + "epoch": 0.0641026870677926, + "grad_norm": 0.0810546875, + "learning_rate": 0.002193, + "loss": 1.1815, + "step": 731 + }, + { + "epoch": 0.06419037884216713, + "grad_norm": 0.0693359375, + "learning_rate": 0.002196, + "loss": 1.208, + "step": 732 + }, + { + "epoch": 0.06427807061654169, + "grad_norm": 0.07373046875, + "learning_rate": 0.002199, + "loss": 1.1815, + "step": 733 + }, + { + "epoch": 0.06436576239091622, + "grad_norm": 0.08154296875, + "learning_rate": 0.002202, + "loss": 1.2005, + "step": 734 + }, + { + "epoch": 0.06445345416529077, + "grad_norm": 0.059814453125, + "learning_rate": 0.002205, + "loss": 1.1672, + "step": 735 + }, + { + "epoch": 0.06454114593966531, + "grad_norm": 0.08203125, + "learning_rate": 0.002208, + "loss": 1.2257, + "step": 736 + }, + { + "epoch": 0.06462883771403986, + "grad_norm": 0.06640625, + "learning_rate": 0.002211, + "loss": 1.1748, + "step": 737 + }, + { + "epoch": 0.06471652948841441, + "grad_norm": 0.061279296875, + "learning_rate": 0.002214, + "loss": 1.1862, + "step": 738 + }, + { + "epoch": 0.06480422126278895, + "grad_norm": 0.06640625, + "learning_rate": 0.0022170000000000002, + "loss": 1.1603, + "step": 739 + }, + { + "epoch": 0.0648919130371635, + "grad_norm": 0.0712890625, + "learning_rate": 0.00222, + "loss": 1.2469, + "step": 740 + }, + { + "epoch": 0.06497960481153804, + "grad_norm": 0.10888671875, + "learning_rate": 0.002223, + "loss": 1.2134, + "step": 741 + }, + { + "epoch": 0.06506729658591259, + "grad_norm": 0.107421875, + "learning_rate": 0.002226, + "loss": 1.1568, + "step": 742 + }, + { + "epoch": 0.06515498836028713, + "grad_norm": 0.07568359375, + "learning_rate": 0.002229, + "loss": 1.1611, + "step": 743 + }, + { + "epoch": 0.06524268013466168, + "grad_norm": 0.1044921875, + "learning_rate": 0.002232, + "loss": 1.2252, + "step": 744 + }, + { + "epoch": 0.06533037190903622, + "grad_norm": 0.07177734375, + "learning_rate": 0.002235, + "loss": 1.2526, + "step": 745 + }, + { + "epoch": 0.06541806368341077, + "grad_norm": 0.068359375, + "learning_rate": 0.002238, + "loss": 1.2256, + "step": 746 + }, + { + "epoch": 0.06550575545778532, + "grad_norm": 0.0654296875, + "learning_rate": 0.002241, + "loss": 1.1456, + "step": 747 + }, + { + "epoch": 0.06559344723215986, + "grad_norm": 0.06640625, + "learning_rate": 0.002244, + "loss": 1.1424, + "step": 748 + }, + { + "epoch": 0.06568113900653441, + "grad_norm": 0.07373046875, + "learning_rate": 0.002247, + "loss": 1.1713, + "step": 749 + }, + { + "epoch": 0.06576883078090895, + "grad_norm": 0.08740234375, + "learning_rate": 0.0022500000000000003, + "loss": 1.1616, + "step": 750 + }, + { + "epoch": 0.0658565225552835, + "grad_norm": 0.07861328125, + "learning_rate": 0.0022530000000000002, + "loss": 1.2177, + "step": 751 + }, + { + "epoch": 0.06594421432965804, + "grad_norm": 0.0869140625, + "learning_rate": 0.002256, + "loss": 1.23, + "step": 752 + }, + { + "epoch": 0.06603190610403259, + "grad_norm": 0.10205078125, + "learning_rate": 0.002259, + "loss": 1.1637, + "step": 753 + }, + { + "epoch": 0.06611959787840713, + "grad_norm": 0.061279296875, + "learning_rate": 0.002262, + "loss": 1.1783, + "step": 754 + }, + { + "epoch": 0.06620728965278168, + "grad_norm": 0.126953125, + "learning_rate": 0.002265, + "loss": 1.158, + "step": 755 + }, + { + "epoch": 0.06629498142715623, + "grad_norm": 0.0673828125, + "learning_rate": 0.002268, + "loss": 1.1594, + "step": 756 + }, + { + "epoch": 0.06638267320153077, + "grad_norm": 0.11279296875, + "learning_rate": 0.002271, + "loss": 1.2228, + "step": 757 + }, + { + "epoch": 0.06647036497590532, + "grad_norm": 0.08154296875, + "learning_rate": 0.002274, + "loss": 1.1687, + "step": 758 + }, + { + "epoch": 0.06655805675027986, + "grad_norm": 0.0908203125, + "learning_rate": 0.002277, + "loss": 1.2647, + "step": 759 + }, + { + "epoch": 0.06664574852465441, + "grad_norm": 0.080078125, + "learning_rate": 0.00228, + "loss": 1.1637, + "step": 760 + }, + { + "epoch": 0.06673344029902895, + "grad_norm": 0.1064453125, + "learning_rate": 0.002283, + "loss": 1.1811, + "step": 761 + }, + { + "epoch": 0.0668211320734035, + "grad_norm": 0.10595703125, + "learning_rate": 0.0022860000000000003, + "loss": 1.1874, + "step": 762 + }, + { + "epoch": 0.06690882384777803, + "grad_norm": 0.150390625, + "learning_rate": 0.0022890000000000002, + "loss": 1.1863, + "step": 763 + }, + { + "epoch": 0.06699651562215259, + "grad_norm": 0.1474609375, + "learning_rate": 0.002292, + "loss": 1.195, + "step": 764 + }, + { + "epoch": 0.06708420739652714, + "grad_norm": 0.1279296875, + "learning_rate": 0.002295, + "loss": 1.1976, + "step": 765 + }, + { + "epoch": 0.06717189917090167, + "grad_norm": 0.14453125, + "learning_rate": 0.002298, + "loss": 1.179, + "step": 766 + }, + { + "epoch": 0.06725959094527623, + "grad_norm": 0.0712890625, + "learning_rate": 0.002301, + "loss": 1.2312, + "step": 767 + }, + { + "epoch": 0.06734728271965076, + "grad_norm": 0.0830078125, + "learning_rate": 0.002304, + "loss": 1.2145, + "step": 768 + }, + { + "epoch": 0.06743497449402532, + "grad_norm": 0.1015625, + "learning_rate": 0.002307, + "loss": 1.1487, + "step": 769 + }, + { + "epoch": 0.06752266626839985, + "grad_norm": 0.06689453125, + "learning_rate": 0.00231, + "loss": 1.1608, + "step": 770 + }, + { + "epoch": 0.0676103580427744, + "grad_norm": 0.09716796875, + "learning_rate": 0.002313, + "loss": 1.1619, + "step": 771 + }, + { + "epoch": 0.06769804981714894, + "grad_norm": 0.060546875, + "learning_rate": 0.002316, + "loss": 1.1695, + "step": 772 + }, + { + "epoch": 0.0677857415915235, + "grad_norm": 0.06494140625, + "learning_rate": 0.0023190000000000003, + "loss": 1.2122, + "step": 773 + }, + { + "epoch": 0.06787343336589804, + "grad_norm": 0.06640625, + "learning_rate": 0.0023220000000000003, + "loss": 1.2475, + "step": 774 + }, + { + "epoch": 0.06796112514027258, + "grad_norm": 0.062255859375, + "learning_rate": 0.0023250000000000002, + "loss": 1.1552, + "step": 775 + }, + { + "epoch": 0.06804881691464713, + "grad_norm": 0.07568359375, + "learning_rate": 0.002328, + "loss": 1.1782, + "step": 776 + }, + { + "epoch": 0.06813650868902167, + "grad_norm": 0.06201171875, + "learning_rate": 0.002331, + "loss": 1.2141, + "step": 777 + }, + { + "epoch": 0.06822420046339622, + "grad_norm": 0.064453125, + "learning_rate": 0.002334, + "loss": 1.225, + "step": 778 + }, + { + "epoch": 0.06831189223777076, + "grad_norm": 0.06396484375, + "learning_rate": 0.002337, + "loss": 1.1926, + "step": 779 + }, + { + "epoch": 0.06839958401214531, + "grad_norm": 0.072265625, + "learning_rate": 0.00234, + "loss": 1.1816, + "step": 780 + }, + { + "epoch": 0.06848727578651985, + "grad_norm": 0.06103515625, + "learning_rate": 0.002343, + "loss": 1.2141, + "step": 781 + }, + { + "epoch": 0.0685749675608944, + "grad_norm": 0.06689453125, + "learning_rate": 0.002346, + "loss": 1.2328, + "step": 782 + }, + { + "epoch": 0.06866265933526895, + "grad_norm": 0.0654296875, + "learning_rate": 0.002349, + "loss": 1.189, + "step": 783 + }, + { + "epoch": 0.06875035110964349, + "grad_norm": 0.06396484375, + "learning_rate": 0.002352, + "loss": 1.1854, + "step": 784 + }, + { + "epoch": 0.06883804288401804, + "grad_norm": 0.08740234375, + "learning_rate": 0.0023550000000000003, + "loss": 1.219, + "step": 785 + }, + { + "epoch": 0.06892573465839258, + "grad_norm": 0.057373046875, + "learning_rate": 0.0023580000000000003, + "loss": 1.186, + "step": 786 + }, + { + "epoch": 0.06901342643276713, + "grad_norm": 0.11376953125, + "learning_rate": 0.0023610000000000003, + "loss": 1.2529, + "step": 787 + }, + { + "epoch": 0.06910111820714167, + "grad_norm": 0.07421875, + "learning_rate": 0.002364, + "loss": 1.1786, + "step": 788 + }, + { + "epoch": 0.06918880998151622, + "grad_norm": 0.064453125, + "learning_rate": 0.002367, + "loss": 1.2026, + "step": 789 + }, + { + "epoch": 0.06927650175589076, + "grad_norm": 0.07080078125, + "learning_rate": 0.00237, + "loss": 1.2605, + "step": 790 + }, + { + "epoch": 0.06936419353026531, + "grad_norm": 0.0703125, + "learning_rate": 0.002373, + "loss": 1.2177, + "step": 791 + }, + { + "epoch": 0.06945188530463986, + "grad_norm": 0.064453125, + "learning_rate": 0.002376, + "loss": 1.2597, + "step": 792 + }, + { + "epoch": 0.0695395770790144, + "grad_norm": 0.08056640625, + "learning_rate": 0.002379, + "loss": 1.2239, + "step": 793 + }, + { + "epoch": 0.06962726885338895, + "grad_norm": 0.06396484375, + "learning_rate": 0.002382, + "loss": 1.1098, + "step": 794 + }, + { + "epoch": 0.06971496062776349, + "grad_norm": 0.1064453125, + "learning_rate": 0.002385, + "loss": 1.2398, + "step": 795 + }, + { + "epoch": 0.06980265240213804, + "grad_norm": 0.06689453125, + "learning_rate": 0.0023880000000000004, + "loss": 1.2229, + "step": 796 + }, + { + "epoch": 0.06989034417651258, + "grad_norm": 0.1220703125, + "learning_rate": 0.0023910000000000003, + "loss": 1.1818, + "step": 797 + }, + { + "epoch": 0.06997803595088713, + "grad_norm": 0.09375, + "learning_rate": 0.0023940000000000003, + "loss": 1.1801, + "step": 798 + }, + { + "epoch": 0.07006572772526166, + "grad_norm": 0.072265625, + "learning_rate": 0.0023970000000000003, + "loss": 1.2839, + "step": 799 + }, + { + "epoch": 0.07015341949963622, + "grad_norm": 0.10546875, + "learning_rate": 0.0024000000000000002, + "loss": 1.17, + "step": 800 + }, + { + "epoch": 0.07024111127401077, + "grad_norm": 0.07275390625, + "learning_rate": 0.002403, + "loss": 1.1887, + "step": 801 + }, + { + "epoch": 0.0703288030483853, + "grad_norm": 0.08544921875, + "learning_rate": 0.002406, + "loss": 1.2965, + "step": 802 + }, + { + "epoch": 0.07041649482275986, + "grad_norm": 0.057861328125, + "learning_rate": 0.002409, + "loss": 1.1965, + "step": 803 + }, + { + "epoch": 0.0705041865971344, + "grad_norm": 0.08837890625, + "learning_rate": 0.002412, + "loss": 1.169, + "step": 804 + }, + { + "epoch": 0.07059187837150895, + "grad_norm": 0.06787109375, + "learning_rate": 0.002415, + "loss": 1.2176, + "step": 805 + }, + { + "epoch": 0.07067957014588348, + "grad_norm": 0.1552734375, + "learning_rate": 0.002418, + "loss": 1.1354, + "step": 806 + }, + { + "epoch": 0.07076726192025803, + "grad_norm": 0.0947265625, + "learning_rate": 0.0024210000000000004, + "loss": 1.1584, + "step": 807 + }, + { + "epoch": 0.07085495369463257, + "grad_norm": 0.1357421875, + "learning_rate": 0.0024240000000000004, + "loss": 1.1809, + "step": 808 + }, + { + "epoch": 0.07094264546900712, + "grad_norm": 0.07421875, + "learning_rate": 0.0024270000000000003, + "loss": 1.1324, + "step": 809 + }, + { + "epoch": 0.07103033724338167, + "grad_norm": 0.1298828125, + "learning_rate": 0.0024300000000000003, + "loss": 1.2154, + "step": 810 + }, + { + "epoch": 0.07111802901775621, + "grad_norm": 0.06494140625, + "learning_rate": 0.0024330000000000003, + "loss": 1.1983, + "step": 811 + }, + { + "epoch": 0.07120572079213076, + "grad_norm": 0.10302734375, + "learning_rate": 0.0024360000000000002, + "loss": 1.2254, + "step": 812 + }, + { + "epoch": 0.0712934125665053, + "grad_norm": 0.06591796875, + "learning_rate": 0.0024389999999999998, + "loss": 1.165, + "step": 813 + }, + { + "epoch": 0.07138110434087985, + "grad_norm": 0.07373046875, + "learning_rate": 0.0024419999999999997, + "loss": 1.2106, + "step": 814 + }, + { + "epoch": 0.07146879611525439, + "grad_norm": 0.1064453125, + "learning_rate": 0.0024449999999999997, + "loss": 1.1528, + "step": 815 + }, + { + "epoch": 0.07155648788962894, + "grad_norm": 0.09765625, + "learning_rate": 0.002448, + "loss": 1.1848, + "step": 816 + }, + { + "epoch": 0.07164417966400348, + "grad_norm": 0.1142578125, + "learning_rate": 0.002451, + "loss": 1.1658, + "step": 817 + }, + { + "epoch": 0.07173187143837803, + "grad_norm": 0.12890625, + "learning_rate": 0.002454, + "loss": 1.2963, + "step": 818 + }, + { + "epoch": 0.07181956321275258, + "grad_norm": 0.09423828125, + "learning_rate": 0.002457, + "loss": 1.1949, + "step": 819 + }, + { + "epoch": 0.07190725498712712, + "grad_norm": 0.1337890625, + "learning_rate": 0.00246, + "loss": 1.2026, + "step": 820 + }, + { + "epoch": 0.07199494676150167, + "grad_norm": 0.05908203125, + "learning_rate": 0.002463, + "loss": 1.187, + "step": 821 + }, + { + "epoch": 0.07208263853587621, + "grad_norm": 0.0859375, + "learning_rate": 0.002466, + "loss": 1.2453, + "step": 822 + }, + { + "epoch": 0.07217033031025076, + "grad_norm": 0.072265625, + "learning_rate": 0.002469, + "loss": 1.2092, + "step": 823 + }, + { + "epoch": 0.0722580220846253, + "grad_norm": 0.0771484375, + "learning_rate": 0.002472, + "loss": 1.2321, + "step": 824 + }, + { + "epoch": 0.07234571385899985, + "grad_norm": 0.059814453125, + "learning_rate": 0.0024749999999999998, + "loss": 1.1999, + "step": 825 + }, + { + "epoch": 0.07243340563337439, + "grad_norm": 0.08544921875, + "learning_rate": 0.0024779999999999997, + "loss": 1.2179, + "step": 826 + }, + { + "epoch": 0.07252109740774894, + "grad_norm": 0.0712890625, + "learning_rate": 0.002481, + "loss": 1.2094, + "step": 827 + }, + { + "epoch": 0.07260878918212349, + "grad_norm": 0.0859375, + "learning_rate": 0.002484, + "loss": 1.2573, + "step": 828 + }, + { + "epoch": 0.07269648095649803, + "grad_norm": 0.1005859375, + "learning_rate": 0.002487, + "loss": 1.2118, + "step": 829 + }, + { + "epoch": 0.07278417273087258, + "grad_norm": 0.08203125, + "learning_rate": 0.00249, + "loss": 1.1918, + "step": 830 + }, + { + "epoch": 0.07287186450524712, + "grad_norm": 0.099609375, + "learning_rate": 0.002493, + "loss": 1.1716, + "step": 831 + }, + { + "epoch": 0.07295955627962167, + "grad_norm": 0.0791015625, + "learning_rate": 0.002496, + "loss": 1.2032, + "step": 832 + }, + { + "epoch": 0.0730472480539962, + "grad_norm": 0.08447265625, + "learning_rate": 0.002499, + "loss": 1.2014, + "step": 833 + }, + { + "epoch": 0.07313493982837076, + "grad_norm": 0.0712890625, + "learning_rate": 0.002502, + "loss": 1.2087, + "step": 834 + }, + { + "epoch": 0.0732226316027453, + "grad_norm": 0.060791015625, + "learning_rate": 0.002505, + "loss": 1.1957, + "step": 835 + }, + { + "epoch": 0.07331032337711985, + "grad_norm": 0.087890625, + "learning_rate": 0.002508, + "loss": 1.217, + "step": 836 + }, + { + "epoch": 0.0733980151514944, + "grad_norm": 0.06640625, + "learning_rate": 0.0025109999999999998, + "loss": 1.2022, + "step": 837 + }, + { + "epoch": 0.07348570692586893, + "grad_norm": 0.09716796875, + "learning_rate": 0.0025139999999999997, + "loss": 1.2414, + "step": 838 + }, + { + "epoch": 0.07357339870024349, + "grad_norm": 0.0771484375, + "learning_rate": 0.002517, + "loss": 1.2123, + "step": 839 + }, + { + "epoch": 0.07366109047461802, + "grad_norm": 0.0771484375, + "learning_rate": 0.00252, + "loss": 1.1463, + "step": 840 + }, + { + "epoch": 0.07374878224899258, + "grad_norm": 0.07177734375, + "learning_rate": 0.002523, + "loss": 1.24, + "step": 841 + }, + { + "epoch": 0.07383647402336711, + "grad_norm": 0.08056640625, + "learning_rate": 0.002526, + "loss": 1.2643, + "step": 842 + }, + { + "epoch": 0.07392416579774166, + "grad_norm": 0.06298828125, + "learning_rate": 0.002529, + "loss": 1.1958, + "step": 843 + }, + { + "epoch": 0.0740118575721162, + "grad_norm": 0.0732421875, + "learning_rate": 0.002532, + "loss": 1.1996, + "step": 844 + }, + { + "epoch": 0.07409954934649075, + "grad_norm": 0.107421875, + "learning_rate": 0.002535, + "loss": 1.1694, + "step": 845 + }, + { + "epoch": 0.0741872411208653, + "grad_norm": 0.09765625, + "learning_rate": 0.002538, + "loss": 1.2552, + "step": 846 + }, + { + "epoch": 0.07427493289523984, + "grad_norm": 0.1103515625, + "learning_rate": 0.002541, + "loss": 1.2317, + "step": 847 + }, + { + "epoch": 0.0743626246696144, + "grad_norm": 0.1103515625, + "learning_rate": 0.002544, + "loss": 1.2529, + "step": 848 + }, + { + "epoch": 0.07445031644398893, + "grad_norm": 0.0859375, + "learning_rate": 0.002547, + "loss": 1.1682, + "step": 849 + }, + { + "epoch": 0.07453800821836348, + "grad_norm": 0.06787109375, + "learning_rate": 0.00255, + "loss": 1.2095, + "step": 850 + }, + { + "epoch": 0.07462569999273802, + "grad_norm": 0.0732421875, + "learning_rate": 0.002553, + "loss": 1.3158, + "step": 851 + }, + { + "epoch": 0.07471339176711257, + "grad_norm": 0.064453125, + "learning_rate": 0.002556, + "loss": 1.2173, + "step": 852 + }, + { + "epoch": 0.07480108354148711, + "grad_norm": 0.1162109375, + "learning_rate": 0.002559, + "loss": 1.2856, + "step": 853 + }, + { + "epoch": 0.07488877531586166, + "grad_norm": 0.07421875, + "learning_rate": 0.002562, + "loss": 1.2333, + "step": 854 + }, + { + "epoch": 0.07497646709023621, + "grad_norm": 0.12451171875, + "learning_rate": 0.002565, + "loss": 1.2129, + "step": 855 + }, + { + "epoch": 0.07506415886461075, + "grad_norm": 0.078125, + "learning_rate": 0.002568, + "loss": 1.2414, + "step": 856 + }, + { + "epoch": 0.0751518506389853, + "grad_norm": 0.087890625, + "learning_rate": 0.002571, + "loss": 1.2281, + "step": 857 + }, + { + "epoch": 0.07523954241335984, + "grad_norm": 0.08251953125, + "learning_rate": 0.002574, + "loss": 1.2306, + "step": 858 + }, + { + "epoch": 0.07532723418773439, + "grad_norm": 0.06640625, + "learning_rate": 0.002577, + "loss": 1.2104, + "step": 859 + }, + { + "epoch": 0.07541492596210893, + "grad_norm": 0.12353515625, + "learning_rate": 0.00258, + "loss": 1.2253, + "step": 860 + }, + { + "epoch": 0.07550261773648348, + "grad_norm": 0.10205078125, + "learning_rate": 0.0025830000000000002, + "loss": 1.2411, + "step": 861 + }, + { + "epoch": 0.07559030951085802, + "grad_norm": 0.1533203125, + "learning_rate": 0.002586, + "loss": 1.1738, + "step": 862 + }, + { + "epoch": 0.07567800128523257, + "grad_norm": 0.11279296875, + "learning_rate": 0.002589, + "loss": 1.2206, + "step": 863 + }, + { + "epoch": 0.07576569305960712, + "grad_norm": 0.1396484375, + "learning_rate": 0.002592, + "loss": 1.2041, + "step": 864 + }, + { + "epoch": 0.07585338483398166, + "grad_norm": 0.07958984375, + "learning_rate": 0.002595, + "loss": 1.1726, + "step": 865 + }, + { + "epoch": 0.07594107660835621, + "grad_norm": 0.11279296875, + "learning_rate": 0.002598, + "loss": 1.2984, + "step": 866 + }, + { + "epoch": 0.07602876838273075, + "grad_norm": 0.10400390625, + "learning_rate": 0.002601, + "loss": 1.1888, + "step": 867 + }, + { + "epoch": 0.0761164601571053, + "grad_norm": 0.203125, + "learning_rate": 0.002604, + "loss": 1.1803, + "step": 868 + }, + { + "epoch": 0.07620415193147984, + "grad_norm": 0.06640625, + "learning_rate": 0.002607, + "loss": 1.1997, + "step": 869 + }, + { + "epoch": 0.07629184370585439, + "grad_norm": 0.19140625, + "learning_rate": 0.00261, + "loss": 1.2026, + "step": 870 + }, + { + "epoch": 0.07637953548022892, + "grad_norm": 0.08154296875, + "learning_rate": 0.002613, + "loss": 1.2052, + "step": 871 + }, + { + "epoch": 0.07646722725460348, + "grad_norm": 0.1767578125, + "learning_rate": 0.002616, + "loss": 1.1758, + "step": 872 + }, + { + "epoch": 0.07655491902897803, + "grad_norm": 0.138671875, + "learning_rate": 0.0026190000000000002, + "loss": 1.2126, + "step": 873 + }, + { + "epoch": 0.07664261080335256, + "grad_norm": 0.126953125, + "learning_rate": 0.002622, + "loss": 1.1752, + "step": 874 + }, + { + "epoch": 0.07673030257772712, + "grad_norm": 0.1591796875, + "learning_rate": 0.002625, + "loss": 1.1959, + "step": 875 + }, + { + "epoch": 0.07681799435210165, + "grad_norm": 0.1162109375, + "learning_rate": 0.002628, + "loss": 1.1591, + "step": 876 + }, + { + "epoch": 0.0769056861264762, + "grad_norm": 0.1240234375, + "learning_rate": 0.002631, + "loss": 1.2444, + "step": 877 + }, + { + "epoch": 0.07699337790085074, + "grad_norm": 0.07861328125, + "learning_rate": 0.002634, + "loss": 1.2269, + "step": 878 + }, + { + "epoch": 0.0770810696752253, + "grad_norm": 0.1484375, + "learning_rate": 0.002637, + "loss": 1.199, + "step": 879 + }, + { + "epoch": 0.07716876144959983, + "grad_norm": 0.07373046875, + "learning_rate": 0.00264, + "loss": 1.2457, + "step": 880 + }, + { + "epoch": 0.07725645322397438, + "grad_norm": 0.1416015625, + "learning_rate": 0.002643, + "loss": 1.1842, + "step": 881 + }, + { + "epoch": 0.07734414499834893, + "grad_norm": 0.08935546875, + "learning_rate": 0.002646, + "loss": 1.1871, + "step": 882 + }, + { + "epoch": 0.07743183677272347, + "grad_norm": 0.12109375, + "learning_rate": 0.002649, + "loss": 1.248, + "step": 883 + }, + { + "epoch": 0.07751952854709802, + "grad_norm": 0.0693359375, + "learning_rate": 0.0026520000000000003, + "loss": 1.2379, + "step": 884 + }, + { + "epoch": 0.07760722032147256, + "grad_norm": 0.08642578125, + "learning_rate": 0.0026550000000000002, + "loss": 1.1706, + "step": 885 + }, + { + "epoch": 0.07769491209584711, + "grad_norm": 0.083984375, + "learning_rate": 0.002658, + "loss": 1.2141, + "step": 886 + }, + { + "epoch": 0.07778260387022165, + "grad_norm": 0.0693359375, + "learning_rate": 0.002661, + "loss": 1.1919, + "step": 887 + }, + { + "epoch": 0.0778702956445962, + "grad_norm": 0.1533203125, + "learning_rate": 0.002664, + "loss": 1.2467, + "step": 888 + }, + { + "epoch": 0.07795798741897074, + "grad_norm": 0.06591796875, + "learning_rate": 0.002667, + "loss": 1.1961, + "step": 889 + }, + { + "epoch": 0.07804567919334529, + "grad_norm": 0.146484375, + "learning_rate": 0.00267, + "loss": 1.2178, + "step": 890 + }, + { + "epoch": 0.07813337096771984, + "grad_norm": 0.07861328125, + "learning_rate": 0.002673, + "loss": 1.1719, + "step": 891 + }, + { + "epoch": 0.07822106274209438, + "grad_norm": 0.12890625, + "learning_rate": 0.002676, + "loss": 1.2323, + "step": 892 + }, + { + "epoch": 0.07830875451646893, + "grad_norm": 0.06787109375, + "learning_rate": 0.002679, + "loss": 1.2321, + "step": 893 + }, + { + "epoch": 0.07839644629084347, + "grad_norm": 0.08349609375, + "learning_rate": 0.002682, + "loss": 1.211, + "step": 894 + }, + { + "epoch": 0.07848413806521802, + "grad_norm": 0.0654296875, + "learning_rate": 0.0026850000000000003, + "loss": 1.2103, + "step": 895 + }, + { + "epoch": 0.07857182983959256, + "grad_norm": 0.0732421875, + "learning_rate": 0.0026880000000000003, + "loss": 1.1379, + "step": 896 + }, + { + "epoch": 0.07865952161396711, + "grad_norm": 0.08935546875, + "learning_rate": 0.0026910000000000002, + "loss": 1.2037, + "step": 897 + }, + { + "epoch": 0.07874721338834165, + "grad_norm": 0.06982421875, + "learning_rate": 0.002694, + "loss": 1.2589, + "step": 898 + }, + { + "epoch": 0.0788349051627162, + "grad_norm": 0.10546875, + "learning_rate": 0.002697, + "loss": 1.1949, + "step": 899 + }, + { + "epoch": 0.07892259693709075, + "grad_norm": 0.06494140625, + "learning_rate": 0.0027, + "loss": 1.182, + "step": 900 + }, + { + "epoch": 0.07901028871146529, + "grad_norm": 0.09765625, + "learning_rate": 0.002703, + "loss": 1.1464, + "step": 901 + }, + { + "epoch": 0.07909798048583984, + "grad_norm": 0.0751953125, + "learning_rate": 0.002706, + "loss": 1.2589, + "step": 902 + }, + { + "epoch": 0.07918567226021438, + "grad_norm": 0.08642578125, + "learning_rate": 0.002709, + "loss": 1.148, + "step": 903 + }, + { + "epoch": 0.07927336403458893, + "grad_norm": 0.087890625, + "learning_rate": 0.002712, + "loss": 1.1859, + "step": 904 + }, + { + "epoch": 0.07936105580896347, + "grad_norm": 0.072265625, + "learning_rate": 0.002715, + "loss": 1.1813, + "step": 905 + }, + { + "epoch": 0.07944874758333802, + "grad_norm": 0.10986328125, + "learning_rate": 0.002718, + "loss": 1.2407, + "step": 906 + }, + { + "epoch": 0.07953643935771255, + "grad_norm": 0.0927734375, + "learning_rate": 0.0027210000000000003, + "loss": 1.2042, + "step": 907 + }, + { + "epoch": 0.0796241311320871, + "grad_norm": 0.0810546875, + "learning_rate": 0.0027240000000000003, + "loss": 1.279, + "step": 908 + }, + { + "epoch": 0.07971182290646166, + "grad_norm": 0.08740234375, + "learning_rate": 0.0027270000000000003, + "loss": 1.2023, + "step": 909 + }, + { + "epoch": 0.0797995146808362, + "grad_norm": 0.0693359375, + "learning_rate": 0.0027300000000000002, + "loss": 1.2134, + "step": 910 + }, + { + "epoch": 0.07988720645521075, + "grad_norm": 0.13671875, + "learning_rate": 0.002733, + "loss": 1.2442, + "step": 911 + }, + { + "epoch": 0.07997489822958528, + "grad_norm": 0.1494140625, + "learning_rate": 0.002736, + "loss": 1.2028, + "step": 912 + }, + { + "epoch": 0.08006259000395984, + "grad_norm": 0.06884765625, + "learning_rate": 0.002739, + "loss": 1.2103, + "step": 913 + }, + { + "epoch": 0.08015028177833437, + "grad_norm": 0.0869140625, + "learning_rate": 0.002742, + "loss": 1.1703, + "step": 914 + }, + { + "epoch": 0.08023797355270892, + "grad_norm": 0.07568359375, + "learning_rate": 0.002745, + "loss": 1.2154, + "step": 915 + }, + { + "epoch": 0.08032566532708346, + "grad_norm": 0.06396484375, + "learning_rate": 0.002748, + "loss": 1.2085, + "step": 916 + }, + { + "epoch": 0.08041335710145801, + "grad_norm": 0.08984375, + "learning_rate": 0.002751, + "loss": 1.2022, + "step": 917 + }, + { + "epoch": 0.08050104887583256, + "grad_norm": 0.115234375, + "learning_rate": 0.0027540000000000004, + "loss": 1.2548, + "step": 918 + }, + { + "epoch": 0.0805887406502071, + "grad_norm": 0.107421875, + "learning_rate": 0.0027570000000000003, + "loss": 1.1738, + "step": 919 + }, + { + "epoch": 0.08067643242458165, + "grad_norm": 0.09765625, + "learning_rate": 0.0027600000000000003, + "loss": 1.1934, + "step": 920 + }, + { + "epoch": 0.08076412419895619, + "grad_norm": 0.0771484375, + "learning_rate": 0.0027630000000000003, + "loss": 1.2174, + "step": 921 + }, + { + "epoch": 0.08085181597333074, + "grad_norm": 0.0673828125, + "learning_rate": 0.0027660000000000002, + "loss": 1.2276, + "step": 922 + }, + { + "epoch": 0.08093950774770528, + "grad_norm": 0.06298828125, + "learning_rate": 0.002769, + "loss": 1.124, + "step": 923 + }, + { + "epoch": 0.08102719952207983, + "grad_norm": 0.0634765625, + "learning_rate": 0.002772, + "loss": 1.2187, + "step": 924 + }, + { + "epoch": 0.08111489129645437, + "grad_norm": 0.07275390625, + "learning_rate": 0.002775, + "loss": 1.2156, + "step": 925 + }, + { + "epoch": 0.08120258307082892, + "grad_norm": 0.078125, + "learning_rate": 0.002778, + "loss": 1.2305, + "step": 926 + }, + { + "epoch": 0.08129027484520347, + "grad_norm": 0.146484375, + "learning_rate": 0.002781, + "loss": 1.236, + "step": 927 + }, + { + "epoch": 0.08137796661957801, + "grad_norm": 0.07763671875, + "learning_rate": 0.002784, + "loss": 1.2164, + "step": 928 + }, + { + "epoch": 0.08146565839395256, + "grad_norm": 0.12158203125, + "learning_rate": 0.0027870000000000004, + "loss": 1.232, + "step": 929 + }, + { + "epoch": 0.0815533501683271, + "grad_norm": 0.091796875, + "learning_rate": 0.0027900000000000004, + "loss": 1.1919, + "step": 930 + }, + { + "epoch": 0.08164104194270165, + "grad_norm": 0.07177734375, + "learning_rate": 0.0027930000000000003, + "loss": 1.2217, + "step": 931 + }, + { + "epoch": 0.08172873371707619, + "grad_norm": 0.083984375, + "learning_rate": 0.0027960000000000003, + "loss": 1.24, + "step": 932 + }, + { + "epoch": 0.08181642549145074, + "grad_norm": 0.06884765625, + "learning_rate": 0.0027990000000000003, + "loss": 1.1977, + "step": 933 + }, + { + "epoch": 0.08190411726582528, + "grad_norm": 0.0986328125, + "learning_rate": 0.0028020000000000002, + "loss": 1.3008, + "step": 934 + }, + { + "epoch": 0.08199180904019983, + "grad_norm": 0.078125, + "learning_rate": 0.002805, + "loss": 1.1938, + "step": 935 + }, + { + "epoch": 0.08207950081457437, + "grad_norm": 0.07568359375, + "learning_rate": 0.002808, + "loss": 1.2427, + "step": 936 + }, + { + "epoch": 0.08216719258894892, + "grad_norm": 0.0751953125, + "learning_rate": 0.002811, + "loss": 1.2226, + "step": 937 + }, + { + "epoch": 0.08225488436332347, + "grad_norm": 0.06640625, + "learning_rate": 0.002814, + "loss": 1.2063, + "step": 938 + }, + { + "epoch": 0.082342576137698, + "grad_norm": 0.0927734375, + "learning_rate": 0.002817, + "loss": 1.1925, + "step": 939 + }, + { + "epoch": 0.08243026791207256, + "grad_norm": 0.061279296875, + "learning_rate": 0.00282, + "loss": 1.1635, + "step": 940 + }, + { + "epoch": 0.0825179596864471, + "grad_norm": 0.0693359375, + "learning_rate": 0.002823, + "loss": 1.1588, + "step": 941 + }, + { + "epoch": 0.08260565146082165, + "grad_norm": 0.06640625, + "learning_rate": 0.002826, + "loss": 1.2534, + "step": 942 + }, + { + "epoch": 0.08269334323519618, + "grad_norm": 0.0888671875, + "learning_rate": 0.002829, + "loss": 1.253, + "step": 943 + }, + { + "epoch": 0.08278103500957074, + "grad_norm": 0.06201171875, + "learning_rate": 0.002832, + "loss": 1.1999, + "step": 944 + }, + { + "epoch": 0.08286872678394527, + "grad_norm": 0.1572265625, + "learning_rate": 0.002835, + "loss": 1.2377, + "step": 945 + }, + { + "epoch": 0.08295641855831982, + "grad_norm": 0.0634765625, + "learning_rate": 0.002838, + "loss": 1.1636, + "step": 946 + }, + { + "epoch": 0.08304411033269438, + "grad_norm": 0.138671875, + "learning_rate": 0.0028409999999999998, + "loss": 1.1766, + "step": 947 + }, + { + "epoch": 0.08313180210706891, + "grad_norm": 0.06884765625, + "learning_rate": 0.0028439999999999997, + "loss": 1.2019, + "step": 948 + }, + { + "epoch": 0.08321949388144347, + "grad_norm": 0.1552734375, + "learning_rate": 0.002847, + "loss": 1.1763, + "step": 949 + }, + { + "epoch": 0.083307185655818, + "grad_norm": 0.1279296875, + "learning_rate": 0.00285, + "loss": 1.2541, + "step": 950 + }, + { + "epoch": 0.08339487743019255, + "grad_norm": 0.0849609375, + "learning_rate": 0.002853, + "loss": 1.2463, + "step": 951 + }, + { + "epoch": 0.08348256920456709, + "grad_norm": 0.1630859375, + "learning_rate": 0.002856, + "loss": 1.2397, + "step": 952 + }, + { + "epoch": 0.08357026097894164, + "grad_norm": 0.080078125, + "learning_rate": 0.002859, + "loss": 1.2318, + "step": 953 + }, + { + "epoch": 0.08365795275331618, + "grad_norm": 0.142578125, + "learning_rate": 0.002862, + "loss": 1.2518, + "step": 954 + }, + { + "epoch": 0.08374564452769073, + "grad_norm": 0.068359375, + "learning_rate": 0.002865, + "loss": 1.2705, + "step": 955 + }, + { + "epoch": 0.08383333630206528, + "grad_norm": 0.1455078125, + "learning_rate": 0.002868, + "loss": 1.2331, + "step": 956 + }, + { + "epoch": 0.08392102807643982, + "grad_norm": 0.078125, + "learning_rate": 0.002871, + "loss": 1.2133, + "step": 957 + }, + { + "epoch": 0.08400871985081437, + "grad_norm": 0.10205078125, + "learning_rate": 0.002874, + "loss": 1.1885, + "step": 958 + }, + { + "epoch": 0.08409641162518891, + "grad_norm": 0.08349609375, + "learning_rate": 0.002877, + "loss": 1.1859, + "step": 959 + }, + { + "epoch": 0.08418410339956346, + "grad_norm": 0.076171875, + "learning_rate": 0.0028799999999999997, + "loss": 1.1881, + "step": 960 + }, + { + "epoch": 0.084271795173938, + "grad_norm": 0.1044921875, + "learning_rate": 0.002883, + "loss": 1.2317, + "step": 961 + }, + { + "epoch": 0.08435948694831255, + "grad_norm": 0.06103515625, + "learning_rate": 0.002886, + "loss": 1.212, + "step": 962 + }, + { + "epoch": 0.08444717872268709, + "grad_norm": 0.09912109375, + "learning_rate": 0.002889, + "loss": 1.2385, + "step": 963 + }, + { + "epoch": 0.08453487049706164, + "grad_norm": 0.06640625, + "learning_rate": 0.002892, + "loss": 1.1706, + "step": 964 + }, + { + "epoch": 0.08462256227143619, + "grad_norm": 0.08447265625, + "learning_rate": 0.002895, + "loss": 1.2452, + "step": 965 + }, + { + "epoch": 0.08471025404581073, + "grad_norm": 0.07275390625, + "learning_rate": 0.002898, + "loss": 1.2055, + "step": 966 + }, + { + "epoch": 0.08479794582018528, + "grad_norm": 0.078125, + "learning_rate": 0.002901, + "loss": 1.2252, + "step": 967 + }, + { + "epoch": 0.08488563759455982, + "grad_norm": 0.076171875, + "learning_rate": 0.002904, + "loss": 1.198, + "step": 968 + }, + { + "epoch": 0.08497332936893437, + "grad_norm": 0.12255859375, + "learning_rate": 0.002907, + "loss": 1.2776, + "step": 969 + }, + { + "epoch": 0.08506102114330891, + "grad_norm": 0.07763671875, + "learning_rate": 0.00291, + "loss": 1.2509, + "step": 970 + }, + { + "epoch": 0.08514871291768346, + "grad_norm": 0.138671875, + "learning_rate": 0.002913, + "loss": 1.1464, + "step": 971 + }, + { + "epoch": 0.085236404692058, + "grad_norm": 0.0732421875, + "learning_rate": 0.002916, + "loss": 1.2446, + "step": 972 + }, + { + "epoch": 0.08532409646643255, + "grad_norm": 0.2099609375, + "learning_rate": 0.002919, + "loss": 1.2405, + "step": 973 + }, + { + "epoch": 0.0854117882408071, + "grad_norm": 0.107421875, + "learning_rate": 0.002922, + "loss": 1.2451, + "step": 974 + }, + { + "epoch": 0.08549948001518164, + "grad_norm": 0.16796875, + "learning_rate": 0.002925, + "loss": 1.2476, + "step": 975 + }, + { + "epoch": 0.08558717178955619, + "grad_norm": 0.080078125, + "learning_rate": 0.002928, + "loss": 1.2005, + "step": 976 + }, + { + "epoch": 0.08567486356393073, + "grad_norm": 0.17578125, + "learning_rate": 0.002931, + "loss": 1.2115, + "step": 977 + }, + { + "epoch": 0.08576255533830528, + "grad_norm": 0.072265625, + "learning_rate": 0.002934, + "loss": 1.1749, + "step": 978 + }, + { + "epoch": 0.08585024711267981, + "grad_norm": 0.08544921875, + "learning_rate": 0.002937, + "loss": 1.2287, + "step": 979 + }, + { + "epoch": 0.08593793888705437, + "grad_norm": 0.09033203125, + "learning_rate": 0.00294, + "loss": 1.1517, + "step": 980 + }, + { + "epoch": 0.0860256306614289, + "grad_norm": 0.07763671875, + "learning_rate": 0.002943, + "loss": 1.244, + "step": 981 + }, + { + "epoch": 0.08611332243580345, + "grad_norm": 0.0732421875, + "learning_rate": 0.002946, + "loss": 1.1973, + "step": 982 + }, + { + "epoch": 0.086201014210178, + "grad_norm": 0.09521484375, + "learning_rate": 0.0029490000000000002, + "loss": 1.2874, + "step": 983 + }, + { + "epoch": 0.08628870598455254, + "grad_norm": 0.08349609375, + "learning_rate": 0.002952, + "loss": 1.2582, + "step": 984 + }, + { + "epoch": 0.0863763977589271, + "grad_norm": 0.0771484375, + "learning_rate": 0.002955, + "loss": 1.2256, + "step": 985 + }, + { + "epoch": 0.08646408953330163, + "grad_norm": 0.09765625, + "learning_rate": 0.002958, + "loss": 1.219, + "step": 986 + }, + { + "epoch": 0.08655178130767618, + "grad_norm": 0.0849609375, + "learning_rate": 0.002961, + "loss": 1.2202, + "step": 987 + }, + { + "epoch": 0.08663947308205072, + "grad_norm": 0.08837890625, + "learning_rate": 0.002964, + "loss": 1.2885, + "step": 988 + }, + { + "epoch": 0.08672716485642527, + "grad_norm": 0.0703125, + "learning_rate": 0.002967, + "loss": 1.2178, + "step": 989 + }, + { + "epoch": 0.08681485663079981, + "grad_norm": 0.0927734375, + "learning_rate": 0.00297, + "loss": 1.2682, + "step": 990 + }, + { + "epoch": 0.08690254840517436, + "grad_norm": 0.099609375, + "learning_rate": 0.002973, + "loss": 1.2171, + "step": 991 + }, + { + "epoch": 0.08699024017954891, + "grad_norm": 0.08740234375, + "learning_rate": 0.002976, + "loss": 1.2959, + "step": 992 + }, + { + "epoch": 0.08707793195392345, + "grad_norm": 0.10400390625, + "learning_rate": 0.002979, + "loss": 1.2449, + "step": 993 + }, + { + "epoch": 0.087165623728298, + "grad_norm": 0.10546875, + "learning_rate": 0.002982, + "loss": 1.2013, + "step": 994 + }, + { + "epoch": 0.08725331550267254, + "grad_norm": 0.091796875, + "learning_rate": 0.0029850000000000002, + "loss": 1.1975, + "step": 995 + }, + { + "epoch": 0.08734100727704709, + "grad_norm": 0.1396484375, + "learning_rate": 0.002988, + "loss": 1.2065, + "step": 996 + }, + { + "epoch": 0.08742869905142163, + "grad_norm": 0.11328125, + "learning_rate": 0.002991, + "loss": 1.2246, + "step": 997 + }, + { + "epoch": 0.08751639082579618, + "grad_norm": 0.10009765625, + "learning_rate": 0.002994, + "loss": 1.3003, + "step": 998 + }, + { + "epoch": 0.08760408260017072, + "grad_norm": 0.1298828125, + "learning_rate": 0.002997, + "loss": 1.1732, + "step": 999 + }, + { + "epoch": 0.08769177437454527, + "grad_norm": 0.078125, + "learning_rate": 0.003, + "loss": 1.2027, + "step": 1000 + }, + { + "epoch": 0.08769177437454527, + "eval_loss": 1.2352585792541504, + "eval_runtime": 429.1616, + "eval_samples_per_second": 33.663, + "eval_steps_per_second": 8.416, + "step": 1000 + }, + { + "epoch": 0.08777946614891982, + "grad_norm": 0.169921875, + "learning_rate": 0.0029999999384417424, + "loss": 1.2718, + "step": 1001 + }, + { + "epoch": 0.08786715792329436, + "grad_norm": 0.080078125, + "learning_rate": 0.0029999997537669756, + "loss": 1.2869, + "step": 1002 + }, + { + "epoch": 0.08795484969766891, + "grad_norm": 0.224609375, + "learning_rate": 0.002999999445975716, + "loss": 1.2037, + "step": 1003 + }, + { + "epoch": 0.08804254147204345, + "grad_norm": 0.07568359375, + "learning_rate": 0.0029999990150679926, + "loss": 1.2742, + "step": 1004 + }, + { + "epoch": 0.088130233246418, + "grad_norm": 0.1748046875, + "learning_rate": 0.002999998461043843, + "loss": 1.2448, + "step": 1005 + }, + { + "epoch": 0.08821792502079254, + "grad_norm": 0.08251953125, + "learning_rate": 0.0029999977839033198, + "loss": 1.1709, + "step": 1006 + }, + { + "epoch": 0.08830561679516709, + "grad_norm": 0.162109375, + "learning_rate": 0.0029999969836464833, + "loss": 1.222, + "step": 1007 + }, + { + "epoch": 0.08839330856954163, + "grad_norm": 0.08984375, + "learning_rate": 0.002999996060273407, + "loss": 1.2483, + "step": 1008 + }, + { + "epoch": 0.08848100034391618, + "grad_norm": 0.142578125, + "learning_rate": 0.0029999950137841744, + "loss": 1.1605, + "step": 1009 + }, + { + "epoch": 0.08856869211829073, + "grad_norm": 0.0732421875, + "learning_rate": 0.002999993844178882, + "loss": 1.2067, + "step": 1010 + }, + { + "epoch": 0.08865638389266527, + "grad_norm": 0.1630859375, + "learning_rate": 0.002999992551457636, + "loss": 1.2093, + "step": 1011 + }, + { + "epoch": 0.08874407566703982, + "grad_norm": 0.07080078125, + "learning_rate": 0.002999991135620554, + "loss": 1.2135, + "step": 1012 + }, + { + "epoch": 0.08883176744141436, + "grad_norm": 0.134765625, + "learning_rate": 0.0029999895966677658, + "loss": 1.1967, + "step": 1013 + }, + { + "epoch": 0.0889194592157889, + "grad_norm": 0.05810546875, + "learning_rate": 0.0029999879345994113, + "loss": 1.2596, + "step": 1014 + }, + { + "epoch": 0.08900715099016344, + "grad_norm": 0.07861328125, + "learning_rate": 0.002999986149415642, + "loss": 1.205, + "step": 1015 + }, + { + "epoch": 0.089094842764538, + "grad_norm": 0.09912109375, + "learning_rate": 0.002999984241116621, + "loss": 1.2303, + "step": 1016 + }, + { + "epoch": 0.08918253453891253, + "grad_norm": 0.11474609375, + "learning_rate": 0.0029999822097025223, + "loss": 1.2616, + "step": 1017 + }, + { + "epoch": 0.08927022631328708, + "grad_norm": 0.0732421875, + "learning_rate": 0.0029999800551735304, + "loss": 1.2431, + "step": 1018 + }, + { + "epoch": 0.08935791808766164, + "grad_norm": 0.076171875, + "learning_rate": 0.002999977777529843, + "loss": 1.199, + "step": 1019 + }, + { + "epoch": 0.08944560986203617, + "grad_norm": 0.1123046875, + "learning_rate": 0.002999975376771667, + "loss": 1.2637, + "step": 1020 + }, + { + "epoch": 0.08953330163641073, + "grad_norm": 0.091796875, + "learning_rate": 0.0029999728528992214, + "loss": 1.2013, + "step": 1021 + }, + { + "epoch": 0.08962099341078526, + "grad_norm": 0.1435546875, + "learning_rate": 0.002999970205912737, + "loss": 1.2653, + "step": 1022 + }, + { + "epoch": 0.08970868518515981, + "grad_norm": 0.1669921875, + "learning_rate": 0.002999967435812455, + "loss": 1.27, + "step": 1023 + }, + { + "epoch": 0.08979637695953435, + "grad_norm": 0.08984375, + "learning_rate": 0.0029999645425986265, + "loss": 1.2497, + "step": 1024 + }, + { + "epoch": 0.0898840687339089, + "grad_norm": 0.1318359375, + "learning_rate": 0.0029999615262715183, + "loss": 1.2822, + "step": 1025 + }, + { + "epoch": 0.08997176050828344, + "grad_norm": 0.08251953125, + "learning_rate": 0.0029999583868314025, + "loss": 1.2212, + "step": 1026 + }, + { + "epoch": 0.09005945228265799, + "grad_norm": 0.0673828125, + "learning_rate": 0.0029999551242785674, + "loss": 1.2082, + "step": 1027 + }, + { + "epoch": 0.09014714405703254, + "grad_norm": 0.09423828125, + "learning_rate": 0.0029999517386133097, + "loss": 1.1986, + "step": 1028 + }, + { + "epoch": 0.09023483583140708, + "grad_norm": 0.10205078125, + "learning_rate": 0.0029999482298359386, + "loss": 1.2419, + "step": 1029 + }, + { + "epoch": 0.09032252760578163, + "grad_norm": 0.0927734375, + "learning_rate": 0.002999944597946773, + "loss": 1.1901, + "step": 1030 + }, + { + "epoch": 0.09041021938015617, + "grad_norm": 0.10009765625, + "learning_rate": 0.0029999408429461456, + "loss": 1.2331, + "step": 1031 + }, + { + "epoch": 0.09049791115453072, + "grad_norm": 0.09033203125, + "learning_rate": 0.0029999369648343976, + "loss": 1.2162, + "step": 1032 + }, + { + "epoch": 0.09058560292890526, + "grad_norm": 0.07958984375, + "learning_rate": 0.0029999329636118837, + "loss": 1.1974, + "step": 1033 + }, + { + "epoch": 0.09067329470327981, + "grad_norm": 0.0751953125, + "learning_rate": 0.0029999288392789686, + "loss": 1.2497, + "step": 1034 + }, + { + "epoch": 0.09076098647765435, + "grad_norm": 0.06640625, + "learning_rate": 0.002999924591836028, + "loss": 1.2433, + "step": 1035 + }, + { + "epoch": 0.0908486782520289, + "grad_norm": 0.0849609375, + "learning_rate": 0.002999920221283449, + "loss": 1.2738, + "step": 1036 + }, + { + "epoch": 0.09093637002640345, + "grad_norm": 0.09375, + "learning_rate": 0.002999915727621631, + "loss": 1.2344, + "step": 1037 + }, + { + "epoch": 0.09102406180077799, + "grad_norm": 0.0576171875, + "learning_rate": 0.0029999111108509834, + "loss": 1.213, + "step": 1038 + }, + { + "epoch": 0.09111175357515254, + "grad_norm": 0.0859375, + "learning_rate": 0.0029999063709719278, + "loss": 1.2075, + "step": 1039 + }, + { + "epoch": 0.09119944534952708, + "grad_norm": 0.068359375, + "learning_rate": 0.002999901507984895, + "loss": 1.2245, + "step": 1040 + }, + { + "epoch": 0.09128713712390163, + "grad_norm": 0.1318359375, + "learning_rate": 0.0029998965218903297, + "loss": 1.2, + "step": 1041 + }, + { + "epoch": 0.09137482889827617, + "grad_norm": 0.08642578125, + "learning_rate": 0.0029998914126886864, + "loss": 1.2469, + "step": 1042 + }, + { + "epoch": 0.09146252067265072, + "grad_norm": 0.1015625, + "learning_rate": 0.002999886180380431, + "loss": 1.2332, + "step": 1043 + }, + { + "epoch": 0.09155021244702526, + "grad_norm": 0.08544921875, + "learning_rate": 0.0029998808249660407, + "loss": 1.2035, + "step": 1044 + }, + { + "epoch": 0.09163790422139981, + "grad_norm": 0.09912109375, + "learning_rate": 0.0029998753464460038, + "loss": 1.2663, + "step": 1045 + }, + { + "epoch": 0.09172559599577436, + "grad_norm": 0.134765625, + "learning_rate": 0.0029998697448208205, + "loss": 1.2392, + "step": 1046 + }, + { + "epoch": 0.0918132877701489, + "grad_norm": 0.07177734375, + "learning_rate": 0.0029998640200910006, + "loss": 1.2818, + "step": 1047 + }, + { + "epoch": 0.09190097954452345, + "grad_norm": 0.080078125, + "learning_rate": 0.002999858172257067, + "loss": 1.261, + "step": 1048 + }, + { + "epoch": 0.09198867131889799, + "grad_norm": 0.06640625, + "learning_rate": 0.0029998522013195525, + "loss": 1.2804, + "step": 1049 + }, + { + "epoch": 0.09207636309327254, + "grad_norm": 0.125, + "learning_rate": 0.0029998461072790017, + "loss": 1.2455, + "step": 1050 + }, + { + "epoch": 0.09216405486764707, + "grad_norm": 0.07568359375, + "learning_rate": 0.002999839890135971, + "loss": 1.22, + "step": 1051 + }, + { + "epoch": 0.09225174664202163, + "grad_norm": 0.1396484375, + "learning_rate": 0.0029998335498910263, + "loss": 1.2181, + "step": 1052 + }, + { + "epoch": 0.09233943841639616, + "grad_norm": 0.09814453125, + "learning_rate": 0.002999827086544747, + "loss": 1.2936, + "step": 1053 + }, + { + "epoch": 0.09242713019077071, + "grad_norm": 0.10009765625, + "learning_rate": 0.0029998205000977213, + "loss": 1.215, + "step": 1054 + }, + { + "epoch": 0.09251482196514527, + "grad_norm": 0.09033203125, + "learning_rate": 0.0029998137905505513, + "loss": 1.2093, + "step": 1055 + }, + { + "epoch": 0.0926025137395198, + "grad_norm": 0.08544921875, + "learning_rate": 0.0029998069579038476, + "loss": 1.2312, + "step": 1056 + }, + { + "epoch": 0.09269020551389436, + "grad_norm": 0.09375, + "learning_rate": 0.0029998000021582345, + "loss": 1.1841, + "step": 1057 + }, + { + "epoch": 0.09277789728826889, + "grad_norm": 0.0751953125, + "learning_rate": 0.002999792923314345, + "loss": 1.2149, + "step": 1058 + }, + { + "epoch": 0.09286558906264344, + "grad_norm": 0.10205078125, + "learning_rate": 0.0029997857213728257, + "loss": 1.2036, + "step": 1059 + }, + { + "epoch": 0.09295328083701798, + "grad_norm": 0.0712890625, + "learning_rate": 0.0029997783963343332, + "loss": 1.2765, + "step": 1060 + }, + { + "epoch": 0.09304097261139253, + "grad_norm": 0.0732421875, + "learning_rate": 0.002999770948199535, + "loss": 1.2624, + "step": 1061 + }, + { + "epoch": 0.09312866438576707, + "grad_norm": 0.072265625, + "learning_rate": 0.002999763376969111, + "loss": 1.2227, + "step": 1062 + }, + { + "epoch": 0.09321635616014162, + "grad_norm": 0.0673828125, + "learning_rate": 0.002999755682643751, + "loss": 1.2304, + "step": 1063 + }, + { + "epoch": 0.09330404793451617, + "grad_norm": 0.10693359375, + "learning_rate": 0.0029997478652241576, + "loss": 1.2698, + "step": 1064 + }, + { + "epoch": 0.09339173970889071, + "grad_norm": 0.10693359375, + "learning_rate": 0.002999739924711043, + "loss": 1.2416, + "step": 1065 + }, + { + "epoch": 0.09347943148326526, + "grad_norm": 0.08154296875, + "learning_rate": 0.002999731861105132, + "loss": 1.2739, + "step": 1066 + }, + { + "epoch": 0.0935671232576398, + "grad_norm": 0.1513671875, + "learning_rate": 0.002999723674407159, + "loss": 1.241, + "step": 1067 + }, + { + "epoch": 0.09365481503201435, + "grad_norm": 0.10302734375, + "learning_rate": 0.0029997153646178715, + "loss": 1.2133, + "step": 1068 + }, + { + "epoch": 0.09374250680638889, + "grad_norm": 0.064453125, + "learning_rate": 0.0029997069317380274, + "loss": 1.2393, + "step": 1069 + }, + { + "epoch": 0.09383019858076344, + "grad_norm": 0.10791015625, + "learning_rate": 0.002999698375768395, + "loss": 1.2339, + "step": 1070 + }, + { + "epoch": 0.09391789035513798, + "grad_norm": 0.0732421875, + "learning_rate": 0.002999689696709755, + "loss": 1.2435, + "step": 1071 + }, + { + "epoch": 0.09400558212951253, + "grad_norm": 0.1455078125, + "learning_rate": 0.0029996808945628984, + "loss": 1.2748, + "step": 1072 + }, + { + "epoch": 0.09409327390388708, + "grad_norm": 0.064453125, + "learning_rate": 0.002999671969328629, + "loss": 1.2199, + "step": 1073 + }, + { + "epoch": 0.09418096567826162, + "grad_norm": 0.1259765625, + "learning_rate": 0.00299966292100776, + "loss": 1.2602, + "step": 1074 + }, + { + "epoch": 0.09426865745263617, + "grad_norm": 0.06640625, + "learning_rate": 0.0029996537496011166, + "loss": 1.2514, + "step": 1075 + }, + { + "epoch": 0.09435634922701071, + "grad_norm": 0.154296875, + "learning_rate": 0.0029996444551095352, + "loss": 1.2206, + "step": 1076 + }, + { + "epoch": 0.09444404100138526, + "grad_norm": 0.06591796875, + "learning_rate": 0.002999635037533864, + "loss": 1.2731, + "step": 1077 + }, + { + "epoch": 0.0945317327757598, + "grad_norm": 0.109375, + "learning_rate": 0.0029996254968749614, + "loss": 1.2356, + "step": 1078 + }, + { + "epoch": 0.09461942455013435, + "grad_norm": 0.06591796875, + "learning_rate": 0.002999615833133697, + "loss": 1.2592, + "step": 1079 + }, + { + "epoch": 0.09470711632450889, + "grad_norm": 0.1318359375, + "learning_rate": 0.0029996060463109535, + "loss": 1.1929, + "step": 1080 + }, + { + "epoch": 0.09479480809888344, + "grad_norm": 0.0810546875, + "learning_rate": 0.002999596136407622, + "loss": 1.2117, + "step": 1081 + }, + { + "epoch": 0.09488249987325799, + "grad_norm": 0.12353515625, + "learning_rate": 0.002999586103424607, + "loss": 1.173, + "step": 1082 + }, + { + "epoch": 0.09497019164763253, + "grad_norm": 0.0849609375, + "learning_rate": 0.0029995759473628227, + "loss": 1.2415, + "step": 1083 + }, + { + "epoch": 0.09505788342200708, + "grad_norm": 0.1474609375, + "learning_rate": 0.0029995656682231964, + "loss": 1.233, + "step": 1084 + }, + { + "epoch": 0.09514557519638162, + "grad_norm": 0.08837890625, + "learning_rate": 0.0029995552660066656, + "loss": 1.2176, + "step": 1085 + }, + { + "epoch": 0.09523326697075617, + "grad_norm": 0.119140625, + "learning_rate": 0.0029995447407141777, + "loss": 1.2514, + "step": 1086 + }, + { + "epoch": 0.0953209587451307, + "grad_norm": 0.07861328125, + "learning_rate": 0.0029995340923466935, + "loss": 1.1719, + "step": 1087 + }, + { + "epoch": 0.09540865051950526, + "grad_norm": 0.142578125, + "learning_rate": 0.0029995233209051835, + "loss": 1.2674, + "step": 1088 + }, + { + "epoch": 0.0954963422938798, + "grad_norm": 0.08447265625, + "learning_rate": 0.002999512426390631, + "loss": 1.1974, + "step": 1089 + }, + { + "epoch": 0.09558403406825434, + "grad_norm": 0.171875, + "learning_rate": 0.0029995014088040287, + "loss": 1.2171, + "step": 1090 + }, + { + "epoch": 0.0956717258426289, + "grad_norm": 0.1328125, + "learning_rate": 0.0029994902681463815, + "loss": 1.2789, + "step": 1091 + }, + { + "epoch": 0.09575941761700343, + "grad_norm": 0.12353515625, + "learning_rate": 0.0029994790044187056, + "loss": 1.1824, + "step": 1092 + }, + { + "epoch": 0.09584710939137799, + "grad_norm": 0.10009765625, + "learning_rate": 0.002999467617622028, + "loss": 1.1969, + "step": 1093 + }, + { + "epoch": 0.09593480116575252, + "grad_norm": 0.0908203125, + "learning_rate": 0.002999456107757388, + "loss": 1.2936, + "step": 1094 + }, + { + "epoch": 0.09602249294012707, + "grad_norm": 0.08056640625, + "learning_rate": 0.0029994444748258344, + "loss": 1.2661, + "step": 1095 + }, + { + "epoch": 0.09611018471450161, + "grad_norm": 0.08935546875, + "learning_rate": 0.0029994327188284276, + "loss": 1.2637, + "step": 1096 + }, + { + "epoch": 0.09619787648887616, + "grad_norm": 0.07080078125, + "learning_rate": 0.002999420839766241, + "loss": 1.2525, + "step": 1097 + }, + { + "epoch": 0.0962855682632507, + "grad_norm": 0.1298828125, + "learning_rate": 0.002999408837640357, + "loss": 1.2274, + "step": 1098 + }, + { + "epoch": 0.09637326003762525, + "grad_norm": 0.0791015625, + "learning_rate": 0.0029993967124518706, + "loss": 1.1619, + "step": 1099 + }, + { + "epoch": 0.0964609518119998, + "grad_norm": 0.08837890625, + "learning_rate": 0.0029993844642018873, + "loss": 1.1803, + "step": 1100 + }, + { + "epoch": 0.09654864358637434, + "grad_norm": 0.0810546875, + "learning_rate": 0.0029993720928915245, + "loss": 1.2556, + "step": 1101 + }, + { + "epoch": 0.09663633536074889, + "grad_norm": 0.09033203125, + "learning_rate": 0.0029993595985219105, + "loss": 1.2952, + "step": 1102 + }, + { + "epoch": 0.09672402713512343, + "grad_norm": 0.10205078125, + "learning_rate": 0.0029993469810941837, + "loss": 1.2566, + "step": 1103 + }, + { + "epoch": 0.09681171890949798, + "grad_norm": 0.10546875, + "learning_rate": 0.0029993342406094965, + "loss": 1.1917, + "step": 1104 + }, + { + "epoch": 0.09689941068387252, + "grad_norm": 0.1357421875, + "learning_rate": 0.002999321377069009, + "loss": 1.2433, + "step": 1105 + }, + { + "epoch": 0.09698710245824707, + "grad_norm": 0.0791015625, + "learning_rate": 0.0029993083904738954, + "loss": 1.2338, + "step": 1106 + }, + { + "epoch": 0.09707479423262161, + "grad_norm": 0.18359375, + "learning_rate": 0.00299929528082534, + "loss": 1.2022, + "step": 1107 + }, + { + "epoch": 0.09716248600699616, + "grad_norm": 0.0869140625, + "learning_rate": 0.0029992820481245385, + "loss": 1.2294, + "step": 1108 + }, + { + "epoch": 0.09725017778137071, + "grad_norm": 0.1318359375, + "learning_rate": 0.0029992686923726963, + "loss": 1.2095, + "step": 1109 + }, + { + "epoch": 0.09733786955574525, + "grad_norm": 0.09912109375, + "learning_rate": 0.0029992552135710334, + "loss": 1.1755, + "step": 1110 + }, + { + "epoch": 0.0974255613301198, + "grad_norm": 0.09521484375, + "learning_rate": 0.0029992416117207775, + "loss": 1.2073, + "step": 1111 + }, + { + "epoch": 0.09751325310449434, + "grad_norm": 0.0908203125, + "learning_rate": 0.00299922788682317, + "loss": 1.2102, + "step": 1112 + }, + { + "epoch": 0.09760094487886889, + "grad_norm": 0.09326171875, + "learning_rate": 0.0029992140388794626, + "loss": 1.2712, + "step": 1113 + }, + { + "epoch": 0.09768863665324343, + "grad_norm": 0.0625, + "learning_rate": 0.0029992000678909173, + "loss": 1.2604, + "step": 1114 + }, + { + "epoch": 0.09777632842761798, + "grad_norm": 0.12353515625, + "learning_rate": 0.002999185973858809, + "loss": 1.2931, + "step": 1115 + }, + { + "epoch": 0.09786402020199252, + "grad_norm": 0.0693359375, + "learning_rate": 0.0029991717567844226, + "loss": 1.2079, + "step": 1116 + }, + { + "epoch": 0.09795171197636707, + "grad_norm": 0.083984375, + "learning_rate": 0.002999157416669055, + "loss": 1.2036, + "step": 1117 + }, + { + "epoch": 0.09803940375074162, + "grad_norm": 0.0732421875, + "learning_rate": 0.002999142953514014, + "loss": 1.2258, + "step": 1118 + }, + { + "epoch": 0.09812709552511616, + "grad_norm": 0.09912109375, + "learning_rate": 0.002999128367320618, + "loss": 1.3019, + "step": 1119 + }, + { + "epoch": 0.09821478729949071, + "grad_norm": 0.0654296875, + "learning_rate": 0.002999113658090198, + "loss": 1.23, + "step": 1120 + }, + { + "epoch": 0.09830247907386525, + "grad_norm": 0.07470703125, + "learning_rate": 0.002999098825824095, + "loss": 1.2882, + "step": 1121 + }, + { + "epoch": 0.0983901708482398, + "grad_norm": 0.07080078125, + "learning_rate": 0.0029990838705236614, + "loss": 1.2154, + "step": 1122 + }, + { + "epoch": 0.09847786262261433, + "grad_norm": 0.091796875, + "learning_rate": 0.002999068792190262, + "loss": 1.2567, + "step": 1123 + }, + { + "epoch": 0.09856555439698889, + "grad_norm": 0.0751953125, + "learning_rate": 0.0029990535908252713, + "loss": 1.2004, + "step": 1124 + }, + { + "epoch": 0.09865324617136342, + "grad_norm": 0.11767578125, + "learning_rate": 0.0029990382664300754, + "loss": 1.1912, + "step": 1125 + }, + { + "epoch": 0.09874093794573797, + "grad_norm": 0.08056640625, + "learning_rate": 0.0029990228190060722, + "loss": 1.2991, + "step": 1126 + }, + { + "epoch": 0.09882862972011253, + "grad_norm": 0.1328125, + "learning_rate": 0.0029990072485546705, + "loss": 1.2323, + "step": 1127 + }, + { + "epoch": 0.09891632149448706, + "grad_norm": 0.080078125, + "learning_rate": 0.00299899155507729, + "loss": 1.342, + "step": 1128 + }, + { + "epoch": 0.09900401326886162, + "grad_norm": 0.09912109375, + "learning_rate": 0.002998975738575362, + "loss": 1.2406, + "step": 1129 + }, + { + "epoch": 0.09909170504323615, + "grad_norm": 0.09716796875, + "learning_rate": 0.002998959799050329, + "loss": 1.2276, + "step": 1130 + }, + { + "epoch": 0.0991793968176107, + "grad_norm": 0.0654296875, + "learning_rate": 0.0029989437365036454, + "loss": 1.2036, + "step": 1131 + }, + { + "epoch": 0.09926708859198524, + "grad_norm": 0.08544921875, + "learning_rate": 0.0029989275509367747, + "loss": 1.2688, + "step": 1132 + }, + { + "epoch": 0.0993547803663598, + "grad_norm": 0.060302734375, + "learning_rate": 0.0029989112423511933, + "loss": 1.2139, + "step": 1133 + }, + { + "epoch": 0.09944247214073433, + "grad_norm": 0.08349609375, + "learning_rate": 0.0029988948107483896, + "loss": 1.2585, + "step": 1134 + }, + { + "epoch": 0.09953016391510888, + "grad_norm": 0.0830078125, + "learning_rate": 0.0029988782561298608, + "loss": 1.2379, + "step": 1135 + }, + { + "epoch": 0.09961785568948343, + "grad_norm": 0.0791015625, + "learning_rate": 0.002998861578497117, + "loss": 1.2192, + "step": 1136 + }, + { + "epoch": 0.09970554746385797, + "grad_norm": 0.09765625, + "learning_rate": 0.00299884477785168, + "loss": 1.1981, + "step": 1137 + }, + { + "epoch": 0.09979323923823252, + "grad_norm": 0.0849609375, + "learning_rate": 0.0029988278541950805, + "loss": 1.1949, + "step": 1138 + }, + { + "epoch": 0.09988093101260706, + "grad_norm": 0.31640625, + "learning_rate": 0.002998810807528863, + "loss": 1.2511, + "step": 1139 + }, + { + "epoch": 0.09996862278698161, + "grad_norm": 0.068359375, + "learning_rate": 0.0029987936378545813, + "loss": 1.2232, + "step": 1140 + }, + { + "epoch": 0.10005631456135615, + "grad_norm": 0.087890625, + "learning_rate": 0.0029987763451738026, + "loss": 1.2326, + "step": 1141 + }, + { + "epoch": 0.1001440063357307, + "grad_norm": 0.07275390625, + "learning_rate": 0.0029987589294881026, + "loss": 1.2861, + "step": 1142 + }, + { + "epoch": 0.10023169811010524, + "grad_norm": 0.1064453125, + "learning_rate": 0.0029987413907990703, + "loss": 1.209, + "step": 1143 + }, + { + "epoch": 0.10031938988447979, + "grad_norm": 0.0703125, + "learning_rate": 0.0029987237291083046, + "loss": 1.2089, + "step": 1144 + }, + { + "epoch": 0.10040708165885433, + "grad_norm": 0.1162109375, + "learning_rate": 0.002998705944417417, + "loss": 1.241, + "step": 1145 + }, + { + "epoch": 0.10049477343322888, + "grad_norm": 0.0703125, + "learning_rate": 0.002998688036728028, + "loss": 1.2162, + "step": 1146 + }, + { + "epoch": 0.10058246520760343, + "grad_norm": 0.0732421875, + "learning_rate": 0.0029986700060417723, + "loss": 1.1938, + "step": 1147 + }, + { + "epoch": 0.10067015698197797, + "grad_norm": 0.08203125, + "learning_rate": 0.0029986518523602936, + "loss": 1.246, + "step": 1148 + }, + { + "epoch": 0.10075784875635252, + "grad_norm": 0.09716796875, + "learning_rate": 0.0029986335756852474, + "loss": 1.231, + "step": 1149 + }, + { + "epoch": 0.10084554053072706, + "grad_norm": 0.0791015625, + "learning_rate": 0.002998615176018301, + "loss": 1.1912, + "step": 1150 + }, + { + "epoch": 0.10093323230510161, + "grad_norm": 0.08544921875, + "learning_rate": 0.0029985966533611313, + "loss": 1.2493, + "step": 1151 + }, + { + "epoch": 0.10102092407947615, + "grad_norm": 0.0751953125, + "learning_rate": 0.0029985780077154286, + "loss": 1.1881, + "step": 1152 + }, + { + "epoch": 0.1011086158538507, + "grad_norm": 0.11328125, + "learning_rate": 0.002998559239082893, + "loss": 1.2643, + "step": 1153 + }, + { + "epoch": 0.10119630762822523, + "grad_norm": 0.0859375, + "learning_rate": 0.002998540347465236, + "loss": 1.2051, + "step": 1154 + }, + { + "epoch": 0.10128399940259979, + "grad_norm": 0.09716796875, + "learning_rate": 0.0029985213328641803, + "loss": 1.237, + "step": 1155 + }, + { + "epoch": 0.10137169117697434, + "grad_norm": 0.06884765625, + "learning_rate": 0.0029985021952814604, + "loss": 1.1926, + "step": 1156 + }, + { + "epoch": 0.10145938295134888, + "grad_norm": 0.1689453125, + "learning_rate": 0.0029984829347188212, + "loss": 1.2545, + "step": 1157 + }, + { + "epoch": 0.10154707472572343, + "grad_norm": 0.08447265625, + "learning_rate": 0.00299846355117802, + "loss": 1.2598, + "step": 1158 + }, + { + "epoch": 0.10163476650009796, + "grad_norm": 0.11083984375, + "learning_rate": 0.0029984440446608235, + "loss": 1.2418, + "step": 1159 + }, + { + "epoch": 0.10172245827447252, + "grad_norm": 0.0859375, + "learning_rate": 0.0029984244151690116, + "loss": 1.2688, + "step": 1160 + }, + { + "epoch": 0.10181015004884705, + "grad_norm": 0.1611328125, + "learning_rate": 0.0029984046627043737, + "loss": 1.2354, + "step": 1161 + }, + { + "epoch": 0.1018978418232216, + "grad_norm": 0.10693359375, + "learning_rate": 0.0029983847872687114, + "loss": 1.2647, + "step": 1162 + }, + { + "epoch": 0.10198553359759614, + "grad_norm": 0.20703125, + "learning_rate": 0.002998364788863837, + "loss": 1.2424, + "step": 1163 + }, + { + "epoch": 0.1020732253719707, + "grad_norm": 0.06787109375, + "learning_rate": 0.002998344667491575, + "loss": 1.2311, + "step": 1164 + }, + { + "epoch": 0.10216091714634525, + "grad_norm": 0.1884765625, + "learning_rate": 0.00299832442315376, + "loss": 1.2327, + "step": 1165 + }, + { + "epoch": 0.10224860892071978, + "grad_norm": 0.107421875, + "learning_rate": 0.0029983040558522384, + "loss": 1.3207, + "step": 1166 + }, + { + "epoch": 0.10233630069509433, + "grad_norm": 0.09765625, + "learning_rate": 0.0029982835655888674, + "loss": 1.254, + "step": 1167 + }, + { + "epoch": 0.10242399246946887, + "grad_norm": 0.0830078125, + "learning_rate": 0.002998262952365516, + "loss": 1.1578, + "step": 1168 + }, + { + "epoch": 0.10251168424384342, + "grad_norm": 0.08984375, + "learning_rate": 0.0029982422161840636, + "loss": 1.2216, + "step": 1169 + }, + { + "epoch": 0.10259937601821796, + "grad_norm": 0.08447265625, + "learning_rate": 0.0029982213570464017, + "loss": 1.2352, + "step": 1170 + }, + { + "epoch": 0.10268706779259251, + "grad_norm": 0.11962890625, + "learning_rate": 0.0029982003749544324, + "loss": 1.2474, + "step": 1171 + }, + { + "epoch": 0.10277475956696705, + "grad_norm": 0.1396484375, + "learning_rate": 0.0029981792699100692, + "loss": 1.2753, + "step": 1172 + }, + { + "epoch": 0.1028624513413416, + "grad_norm": 0.07275390625, + "learning_rate": 0.0029981580419152372, + "loss": 1.1818, + "step": 1173 + }, + { + "epoch": 0.10295014311571615, + "grad_norm": 0.08154296875, + "learning_rate": 0.0029981366909718715, + "loss": 1.1834, + "step": 1174 + }, + { + "epoch": 0.10303783489009069, + "grad_norm": 0.08349609375, + "learning_rate": 0.0029981152170819206, + "loss": 1.2568, + "step": 1175 + }, + { + "epoch": 0.10312552666446524, + "grad_norm": 0.06591796875, + "learning_rate": 0.0029980936202473416, + "loss": 1.2909, + "step": 1176 + }, + { + "epoch": 0.10321321843883978, + "grad_norm": 0.0712890625, + "learning_rate": 0.002998071900470104, + "loss": 1.2636, + "step": 1177 + }, + { + "epoch": 0.10330091021321433, + "grad_norm": 0.10693359375, + "learning_rate": 0.00299805005775219, + "loss": 1.1922, + "step": 1178 + }, + { + "epoch": 0.10338860198758887, + "grad_norm": 0.08544921875, + "learning_rate": 0.00299802809209559, + "loss": 1.207, + "step": 1179 + }, + { + "epoch": 0.10347629376196342, + "grad_norm": 0.08544921875, + "learning_rate": 0.002998006003502308, + "loss": 1.3056, + "step": 1180 + }, + { + "epoch": 0.10356398553633796, + "grad_norm": 0.08642578125, + "learning_rate": 0.0029979837919743586, + "loss": 1.2899, + "step": 1181 + }, + { + "epoch": 0.10365167731071251, + "grad_norm": 0.0703125, + "learning_rate": 0.0029979614575137673, + "loss": 1.258, + "step": 1182 + }, + { + "epoch": 0.10373936908508706, + "grad_norm": 0.0771484375, + "learning_rate": 0.00299793900012257, + "loss": 1.2957, + "step": 1183 + }, + { + "epoch": 0.1038270608594616, + "grad_norm": 0.11328125, + "learning_rate": 0.002997916419802817, + "loss": 1.2868, + "step": 1184 + }, + { + "epoch": 0.10391475263383615, + "grad_norm": 0.06640625, + "learning_rate": 0.0029978937165565656, + "loss": 1.2358, + "step": 1185 + }, + { + "epoch": 0.10400244440821069, + "grad_norm": 0.11328125, + "learning_rate": 0.002997870890385886, + "loss": 1.2856, + "step": 1186 + }, + { + "epoch": 0.10409013618258524, + "grad_norm": 0.07080078125, + "learning_rate": 0.002997847941292861, + "loss": 1.2968, + "step": 1187 + }, + { + "epoch": 0.10417782795695978, + "grad_norm": 0.0810546875, + "learning_rate": 0.0029978248692795837, + "loss": 1.205, + "step": 1188 + }, + { + "epoch": 0.10426551973133433, + "grad_norm": 0.10595703125, + "learning_rate": 0.0029978016743481576, + "loss": 1.1977, + "step": 1189 + }, + { + "epoch": 0.10435321150570886, + "grad_norm": 0.080078125, + "learning_rate": 0.0029977783565006983, + "loss": 1.1755, + "step": 1190 + }, + { + "epoch": 0.10444090328008342, + "grad_norm": 0.076171875, + "learning_rate": 0.0029977549157393316, + "loss": 1.1375, + "step": 1191 + }, + { + "epoch": 0.10452859505445797, + "grad_norm": 0.09033203125, + "learning_rate": 0.0029977313520661956, + "loss": 1.2817, + "step": 1192 + }, + { + "epoch": 0.1046162868288325, + "grad_norm": 0.10205078125, + "learning_rate": 0.0029977076654834406, + "loss": 1.2136, + "step": 1193 + }, + { + "epoch": 0.10470397860320706, + "grad_norm": 0.06689453125, + "learning_rate": 0.0029976838559932248, + "loss": 1.2893, + "step": 1194 + }, + { + "epoch": 0.1047916703775816, + "grad_norm": 0.11279296875, + "learning_rate": 0.0029976599235977206, + "loss": 1.2309, + "step": 1195 + }, + { + "epoch": 0.10487936215195615, + "grad_norm": 0.0712890625, + "learning_rate": 0.0029976358682991104, + "loss": 1.1881, + "step": 1196 + }, + { + "epoch": 0.10496705392633068, + "grad_norm": 0.09033203125, + "learning_rate": 0.002997611690099587, + "loss": 1.2545, + "step": 1197 + }, + { + "epoch": 0.10505474570070523, + "grad_norm": 0.064453125, + "learning_rate": 0.0029975873890013575, + "loss": 1.2107, + "step": 1198 + }, + { + "epoch": 0.10514243747507977, + "grad_norm": 0.1484375, + "learning_rate": 0.0029975629650066367, + "loss": 1.2703, + "step": 1199 + }, + { + "epoch": 0.10523012924945432, + "grad_norm": 0.138671875, + "learning_rate": 0.0029975384181176513, + "loss": 1.2156, + "step": 1200 + }, + { + "epoch": 0.10531782102382888, + "grad_norm": 0.12353515625, + "learning_rate": 0.0029975137483366416, + "loss": 1.2019, + "step": 1201 + }, + { + "epoch": 0.10540551279820341, + "grad_norm": 0.16796875, + "learning_rate": 0.0029974889556658568, + "loss": 1.1964, + "step": 1202 + }, + { + "epoch": 0.10549320457257796, + "grad_norm": 0.07177734375, + "learning_rate": 0.0029974640401075575, + "loss": 1.2435, + "step": 1203 + }, + { + "epoch": 0.1055808963469525, + "grad_norm": 0.09619140625, + "learning_rate": 0.002997439001664016, + "loss": 1.2428, + "step": 1204 + }, + { + "epoch": 0.10566858812132705, + "grad_norm": 0.0849609375, + "learning_rate": 0.002997413840337516, + "loss": 1.2146, + "step": 1205 + }, + { + "epoch": 0.10575627989570159, + "grad_norm": 0.07470703125, + "learning_rate": 0.0029973885561303524, + "loss": 1.1744, + "step": 1206 + }, + { + "epoch": 0.10584397167007614, + "grad_norm": 0.0693359375, + "learning_rate": 0.0029973631490448306, + "loss": 1.2676, + "step": 1207 + }, + { + "epoch": 0.10593166344445068, + "grad_norm": 0.08203125, + "learning_rate": 0.0029973376190832674, + "loss": 1.2675, + "step": 1208 + }, + { + "epoch": 0.10601935521882523, + "grad_norm": 0.08349609375, + "learning_rate": 0.002997311966247992, + "loss": 1.2295, + "step": 1209 + }, + { + "epoch": 0.10610704699319978, + "grad_norm": 0.1025390625, + "learning_rate": 0.0029972861905413436, + "loss": 1.2739, + "step": 1210 + }, + { + "epoch": 0.10619473876757432, + "grad_norm": 0.07958984375, + "learning_rate": 0.002997260291965672, + "loss": 1.1552, + "step": 1211 + }, + { + "epoch": 0.10628243054194887, + "grad_norm": 0.06787109375, + "learning_rate": 0.0029972342705233404, + "loss": 1.1411, + "step": 1212 + }, + { + "epoch": 0.10637012231632341, + "grad_norm": 0.08837890625, + "learning_rate": 0.0029972081262167197, + "loss": 1.2573, + "step": 1213 + }, + { + "epoch": 0.10645781409069796, + "grad_norm": 0.06494140625, + "learning_rate": 0.0029971818590481974, + "loss": 1.2577, + "step": 1214 + }, + { + "epoch": 0.1065455058650725, + "grad_norm": 0.07958984375, + "learning_rate": 0.0029971554690201665, + "loss": 1.2148, + "step": 1215 + }, + { + "epoch": 0.10663319763944705, + "grad_norm": 0.1005859375, + "learning_rate": 0.0029971289561350344, + "loss": 1.2585, + "step": 1216 + }, + { + "epoch": 0.10672088941382159, + "grad_norm": 0.06591796875, + "learning_rate": 0.0029971023203952192, + "loss": 1.2893, + "step": 1217 + }, + { + "epoch": 0.10680858118819614, + "grad_norm": 0.1416015625, + "learning_rate": 0.00299707556180315, + "loss": 1.2542, + "step": 1218 + }, + { + "epoch": 0.10689627296257069, + "grad_norm": 0.138671875, + "learning_rate": 0.0029970486803612673, + "loss": 1.2508, + "step": 1219 + }, + { + "epoch": 0.10698396473694523, + "grad_norm": 0.07568359375, + "learning_rate": 0.002997021676072022, + "loss": 1.2215, + "step": 1220 + }, + { + "epoch": 0.10707165651131978, + "grad_norm": 0.171875, + "learning_rate": 0.002996994548937877, + "loss": 1.2594, + "step": 1221 + }, + { + "epoch": 0.10715934828569432, + "grad_norm": 0.10498046875, + "learning_rate": 0.002996967298961307, + "loss": 1.2044, + "step": 1222 + }, + { + "epoch": 0.10724704006006887, + "grad_norm": 0.13671875, + "learning_rate": 0.0029969399261447955, + "loss": 1.1786, + "step": 1223 + }, + { + "epoch": 0.1073347318344434, + "grad_norm": 0.08203125, + "learning_rate": 0.002996912430490841, + "loss": 1.2013, + "step": 1224 + }, + { + "epoch": 0.10742242360881796, + "grad_norm": 0.1826171875, + "learning_rate": 0.002996884812001949, + "loss": 1.2407, + "step": 1225 + }, + { + "epoch": 0.1075101153831925, + "grad_norm": 0.07666015625, + "learning_rate": 0.002996857070680639, + "loss": 1.2203, + "step": 1226 + }, + { + "epoch": 0.10759780715756705, + "grad_norm": 0.171875, + "learning_rate": 0.002996829206529442, + "loss": 1.2675, + "step": 1227 + }, + { + "epoch": 0.1076854989319416, + "grad_norm": 0.10498046875, + "learning_rate": 0.0029968012195508978, + "loss": 1.2556, + "step": 1228 + }, + { + "epoch": 0.10777319070631614, + "grad_norm": 0.11474609375, + "learning_rate": 0.0029967731097475586, + "loss": 1.2905, + "step": 1229 + }, + { + "epoch": 0.10786088248069069, + "grad_norm": 0.06982421875, + "learning_rate": 0.002996744877121989, + "loss": 1.2587, + "step": 1230 + }, + { + "epoch": 0.10794857425506522, + "grad_norm": 0.10791015625, + "learning_rate": 0.0029967165216767634, + "loss": 1.2594, + "step": 1231 + }, + { + "epoch": 0.10803626602943978, + "grad_norm": 0.080078125, + "learning_rate": 0.0029966880434144673, + "loss": 1.1906, + "step": 1232 + }, + { + "epoch": 0.10812395780381431, + "grad_norm": 0.12109375, + "learning_rate": 0.0029966594423376973, + "loss": 1.213, + "step": 1233 + }, + { + "epoch": 0.10821164957818886, + "grad_norm": 0.06494140625, + "learning_rate": 0.002996630718449064, + "loss": 1.2473, + "step": 1234 + }, + { + "epoch": 0.1082993413525634, + "grad_norm": 0.10400390625, + "learning_rate": 0.0029966018717511845, + "loss": 1.2035, + "step": 1235 + }, + { + "epoch": 0.10838703312693795, + "grad_norm": 0.07275390625, + "learning_rate": 0.002996572902246691, + "loss": 1.2043, + "step": 1236 + }, + { + "epoch": 0.1084747249013125, + "grad_norm": 0.06787109375, + "learning_rate": 0.0029965438099382245, + "loss": 1.2142, + "step": 1237 + }, + { + "epoch": 0.10856241667568704, + "grad_norm": 0.07666015625, + "learning_rate": 0.0029965145948284387, + "loss": 1.2318, + "step": 1238 + }, + { + "epoch": 0.1086501084500616, + "grad_norm": 0.06494140625, + "learning_rate": 0.0029964852569199984, + "loss": 1.1786, + "step": 1239 + }, + { + "epoch": 0.10873780022443613, + "grad_norm": 0.11181640625, + "learning_rate": 0.002996455796215578, + "loss": 1.2101, + "step": 1240 + }, + { + "epoch": 0.10882549199881068, + "grad_norm": 0.061767578125, + "learning_rate": 0.0029964262127178654, + "loss": 1.2245, + "step": 1241 + }, + { + "epoch": 0.10891318377318522, + "grad_norm": 0.130859375, + "learning_rate": 0.0029963965064295573, + "loss": 1.1896, + "step": 1242 + }, + { + "epoch": 0.10900087554755977, + "grad_norm": 0.055908203125, + "learning_rate": 0.002996366677353364, + "loss": 1.2298, + "step": 1243 + }, + { + "epoch": 0.10908856732193431, + "grad_norm": 0.07666015625, + "learning_rate": 0.0029963367254920055, + "loss": 1.2506, + "step": 1244 + }, + { + "epoch": 0.10917625909630886, + "grad_norm": 0.09765625, + "learning_rate": 0.002996306650848213, + "loss": 1.2451, + "step": 1245 + }, + { + "epoch": 0.10926395087068341, + "grad_norm": 0.11083984375, + "learning_rate": 0.0029962764534247287, + "loss": 1.2024, + "step": 1246 + }, + { + "epoch": 0.10935164264505795, + "grad_norm": 0.0654296875, + "learning_rate": 0.0029962461332243085, + "loss": 1.2491, + "step": 1247 + }, + { + "epoch": 0.1094393344194325, + "grad_norm": 0.09521484375, + "learning_rate": 0.0029962156902497154, + "loss": 1.1692, + "step": 1248 + }, + { + "epoch": 0.10952702619380704, + "grad_norm": 0.0830078125, + "learning_rate": 0.002996185124503727, + "loss": 1.1808, + "step": 1249 + }, + { + "epoch": 0.10961471796818159, + "grad_norm": 0.12353515625, + "learning_rate": 0.0029961544359891302, + "loss": 1.2391, + "step": 1250 + }, + { + "epoch": 0.10970240974255613, + "grad_norm": 0.10546875, + "learning_rate": 0.0029961236247087234, + "loss": 1.2164, + "step": 1251 + }, + { + "epoch": 0.10979010151693068, + "grad_norm": 0.1328125, + "learning_rate": 0.0029960926906653185, + "loss": 1.2398, + "step": 1252 + }, + { + "epoch": 0.10987779329130522, + "grad_norm": 0.09716796875, + "learning_rate": 0.0029960616338617335, + "loss": 1.227, + "step": 1253 + }, + { + "epoch": 0.10996548506567977, + "grad_norm": 0.09228515625, + "learning_rate": 0.0029960304543008034, + "loss": 1.1366, + "step": 1254 + }, + { + "epoch": 0.11005317684005432, + "grad_norm": 0.1142578125, + "learning_rate": 0.00299599915198537, + "loss": 1.2744, + "step": 1255 + }, + { + "epoch": 0.11014086861442886, + "grad_norm": 0.11328125, + "learning_rate": 0.002995967726918289, + "loss": 1.276, + "step": 1256 + }, + { + "epoch": 0.11022856038880341, + "grad_norm": 0.0625, + "learning_rate": 0.002995936179102426, + "loss": 1.2089, + "step": 1257 + }, + { + "epoch": 0.11031625216317795, + "grad_norm": 0.0654296875, + "learning_rate": 0.0029959045085406573, + "loss": 1.289, + "step": 1258 + }, + { + "epoch": 0.1104039439375525, + "grad_norm": 0.1259765625, + "learning_rate": 0.002995872715235873, + "loss": 1.1645, + "step": 1259 + }, + { + "epoch": 0.11049163571192704, + "grad_norm": 0.0703125, + "learning_rate": 0.0029958407991909704, + "loss": 1.2306, + "step": 1260 + }, + { + "epoch": 0.11057932748630159, + "grad_norm": 0.09912109375, + "learning_rate": 0.0029958087604088617, + "loss": 1.2353, + "step": 1261 + }, + { + "epoch": 0.11066701926067612, + "grad_norm": 0.09912109375, + "learning_rate": 0.002995776598892468, + "loss": 1.2188, + "step": 1262 + }, + { + "epoch": 0.11075471103505068, + "grad_norm": 0.060546875, + "learning_rate": 0.0029957443146447224, + "loss": 1.2458, + "step": 1263 + }, + { + "epoch": 0.11084240280942523, + "grad_norm": 0.07763671875, + "learning_rate": 0.0029957119076685695, + "loss": 1.2121, + "step": 1264 + }, + { + "epoch": 0.11093009458379977, + "grad_norm": 0.08251953125, + "learning_rate": 0.002995679377966965, + "loss": 1.2579, + "step": 1265 + }, + { + "epoch": 0.11101778635817432, + "grad_norm": 0.10791015625, + "learning_rate": 0.0029956467255428743, + "loss": 1.2074, + "step": 1266 + }, + { + "epoch": 0.11110547813254885, + "grad_norm": 0.11181640625, + "learning_rate": 0.0029956139503992765, + "loss": 1.2233, + "step": 1267 + }, + { + "epoch": 0.1111931699069234, + "grad_norm": 0.1103515625, + "learning_rate": 0.0029955810525391603, + "loss": 1.1884, + "step": 1268 + }, + { + "epoch": 0.11128086168129794, + "grad_norm": 0.1005859375, + "learning_rate": 0.002995548031965526, + "loss": 1.2408, + "step": 1269 + }, + { + "epoch": 0.1113685534556725, + "grad_norm": 0.130859375, + "learning_rate": 0.0029955148886813836, + "loss": 1.2718, + "step": 1270 + }, + { + "epoch": 0.11145624523004703, + "grad_norm": 0.0888671875, + "learning_rate": 0.0029954816226897573, + "loss": 1.2472, + "step": 1271 + }, + { + "epoch": 0.11154393700442158, + "grad_norm": 0.1357421875, + "learning_rate": 0.0029954482339936803, + "loss": 1.2382, + "step": 1272 + }, + { + "epoch": 0.11163162877879614, + "grad_norm": 0.07861328125, + "learning_rate": 0.002995414722596198, + "loss": 1.2168, + "step": 1273 + }, + { + "epoch": 0.11171932055317067, + "grad_norm": 0.12451171875, + "learning_rate": 0.0029953810885003655, + "loss": 1.2388, + "step": 1274 + }, + { + "epoch": 0.11180701232754522, + "grad_norm": 0.0712890625, + "learning_rate": 0.0029953473317092514, + "loss": 1.2328, + "step": 1275 + }, + { + "epoch": 0.11189470410191976, + "grad_norm": 0.08203125, + "learning_rate": 0.0029953134522259337, + "loss": 1.2185, + "step": 1276 + }, + { + "epoch": 0.11198239587629431, + "grad_norm": 0.06982421875, + "learning_rate": 0.0029952794500535014, + "loss": 1.2673, + "step": 1277 + }, + { + "epoch": 0.11207008765066885, + "grad_norm": 0.09716796875, + "learning_rate": 0.0029952453251950563, + "loss": 1.1878, + "step": 1278 + }, + { + "epoch": 0.1121577794250434, + "grad_norm": 0.08447265625, + "learning_rate": 0.0029952110776537105, + "loss": 1.1887, + "step": 1279 + }, + { + "epoch": 0.11224547119941794, + "grad_norm": 0.1220703125, + "learning_rate": 0.002995176707432587, + "loss": 1.2541, + "step": 1280 + }, + { + "epoch": 0.11233316297379249, + "grad_norm": 0.09326171875, + "learning_rate": 0.0029951422145348206, + "loss": 1.1988, + "step": 1281 + }, + { + "epoch": 0.11242085474816704, + "grad_norm": 0.0986328125, + "learning_rate": 0.0029951075989635566, + "loss": 1.1977, + "step": 1282 + }, + { + "epoch": 0.11250854652254158, + "grad_norm": 0.068359375, + "learning_rate": 0.0029950728607219513, + "loss": 1.2858, + "step": 1283 + }, + { + "epoch": 0.11259623829691613, + "grad_norm": 0.09814453125, + "learning_rate": 0.0029950379998131744, + "loss": 1.2136, + "step": 1284 + }, + { + "epoch": 0.11268393007129067, + "grad_norm": 0.0615234375, + "learning_rate": 0.0029950030162404035, + "loss": 1.1722, + "step": 1285 + }, + { + "epoch": 0.11277162184566522, + "grad_norm": 0.06787109375, + "learning_rate": 0.0029949679100068297, + "loss": 1.2412, + "step": 1286 + }, + { + "epoch": 0.11285931362003976, + "grad_norm": 0.0712890625, + "learning_rate": 0.002994932681115655, + "loss": 1.2478, + "step": 1287 + }, + { + "epoch": 0.11294700539441431, + "grad_norm": 0.0615234375, + "learning_rate": 0.0029948973295700907, + "loss": 1.2344, + "step": 1288 + }, + { + "epoch": 0.11303469716878885, + "grad_norm": 0.06689453125, + "learning_rate": 0.002994861855373362, + "loss": 1.3181, + "step": 1289 + }, + { + "epoch": 0.1131223889431634, + "grad_norm": 0.0703125, + "learning_rate": 0.002994826258528705, + "loss": 1.1635, + "step": 1290 + }, + { + "epoch": 0.11321008071753795, + "grad_norm": 0.0615234375, + "learning_rate": 0.0029947905390393637, + "loss": 1.1966, + "step": 1291 + }, + { + "epoch": 0.11329777249191249, + "grad_norm": 0.06982421875, + "learning_rate": 0.002994754696908597, + "loss": 1.2595, + "step": 1292 + }, + { + "epoch": 0.11338546426628704, + "grad_norm": 0.06396484375, + "learning_rate": 0.0029947187321396735, + "loss": 1.2539, + "step": 1293 + }, + { + "epoch": 0.11347315604066158, + "grad_norm": 0.0693359375, + "learning_rate": 0.002994682644735873, + "loss": 1.2618, + "step": 1294 + }, + { + "epoch": 0.11356084781503613, + "grad_norm": 0.08251953125, + "learning_rate": 0.0029946464347004867, + "loss": 1.2937, + "step": 1295 + }, + { + "epoch": 0.11364853958941067, + "grad_norm": 0.08203125, + "learning_rate": 0.002994610102036817, + "loss": 1.2365, + "step": 1296 + }, + { + "epoch": 0.11373623136378522, + "grad_norm": 0.083984375, + "learning_rate": 0.0029945736467481767, + "loss": 1.2025, + "step": 1297 + }, + { + "epoch": 0.11382392313815975, + "grad_norm": 0.0703125, + "learning_rate": 0.002994537068837891, + "loss": 1.2138, + "step": 1298 + }, + { + "epoch": 0.1139116149125343, + "grad_norm": 0.09130859375, + "learning_rate": 0.002994500368309295, + "loss": 1.2215, + "step": 1299 + }, + { + "epoch": 0.11399930668690886, + "grad_norm": 0.072265625, + "learning_rate": 0.0029944635451657365, + "loss": 1.2056, + "step": 1300 + }, + { + "epoch": 0.1140869984612834, + "grad_norm": 0.07568359375, + "learning_rate": 0.002994426599410574, + "loss": 1.2978, + "step": 1301 + }, + { + "epoch": 0.11417469023565795, + "grad_norm": 0.08740234375, + "learning_rate": 0.0029943895310471755, + "loss": 1.2251, + "step": 1302 + }, + { + "epoch": 0.11426238201003248, + "grad_norm": 0.0888671875, + "learning_rate": 0.0029943523400789225, + "loss": 1.2908, + "step": 1303 + }, + { + "epoch": 0.11435007378440704, + "grad_norm": 0.0615234375, + "learning_rate": 0.0029943150265092067, + "loss": 1.2399, + "step": 1304 + }, + { + "epoch": 0.11443776555878157, + "grad_norm": 0.0859375, + "learning_rate": 0.0029942775903414307, + "loss": 1.2414, + "step": 1305 + }, + { + "epoch": 0.11452545733315612, + "grad_norm": 0.0634765625, + "learning_rate": 0.0029942400315790085, + "loss": 1.298, + "step": 1306 + }, + { + "epoch": 0.11461314910753066, + "grad_norm": 0.08056640625, + "learning_rate": 0.0029942023502253657, + "loss": 1.2238, + "step": 1307 + }, + { + "epoch": 0.11470084088190521, + "grad_norm": 0.07421875, + "learning_rate": 0.0029941645462839388, + "loss": 1.2058, + "step": 1308 + }, + { + "epoch": 0.11478853265627977, + "grad_norm": 0.07373046875, + "learning_rate": 0.0029941266197581755, + "loss": 1.2297, + "step": 1309 + }, + { + "epoch": 0.1148762244306543, + "grad_norm": 0.09130859375, + "learning_rate": 0.0029940885706515336, + "loss": 1.261, + "step": 1310 + }, + { + "epoch": 0.11496391620502885, + "grad_norm": 0.07958984375, + "learning_rate": 0.002994050398967484, + "loss": 1.2694, + "step": 1311 + }, + { + "epoch": 0.11505160797940339, + "grad_norm": 0.142578125, + "learning_rate": 0.002994012104709508, + "loss": 1.2114, + "step": 1312 + }, + { + "epoch": 0.11513929975377794, + "grad_norm": 0.0751953125, + "learning_rate": 0.002993973687881097, + "loss": 1.3171, + "step": 1313 + }, + { + "epoch": 0.11522699152815248, + "grad_norm": 0.19140625, + "learning_rate": 0.0029939351484857555, + "loss": 1.179, + "step": 1314 + }, + { + "epoch": 0.11531468330252703, + "grad_norm": 0.138671875, + "learning_rate": 0.0029938964865269977, + "loss": 1.1591, + "step": 1315 + }, + { + "epoch": 0.11540237507690157, + "grad_norm": 0.08984375, + "learning_rate": 0.0029938577020083503, + "loss": 1.2616, + "step": 1316 + }, + { + "epoch": 0.11549006685127612, + "grad_norm": 0.1494140625, + "learning_rate": 0.0029938187949333484, + "loss": 1.2064, + "step": 1317 + }, + { + "epoch": 0.11557775862565067, + "grad_norm": 0.06689453125, + "learning_rate": 0.0029937797653055423, + "loss": 1.2495, + "step": 1318 + }, + { + "epoch": 0.11566545040002521, + "grad_norm": 0.166015625, + "learning_rate": 0.00299374061312849, + "loss": 1.1988, + "step": 1319 + }, + { + "epoch": 0.11575314217439976, + "grad_norm": 0.059814453125, + "learning_rate": 0.002993701338405763, + "loss": 1.2312, + "step": 1320 + }, + { + "epoch": 0.1158408339487743, + "grad_norm": 0.2431640625, + "learning_rate": 0.002993661941140943, + "loss": 1.2405, + "step": 1321 + }, + { + "epoch": 0.11592852572314885, + "grad_norm": 0.064453125, + "learning_rate": 0.002993622421337622, + "loss": 1.3058, + "step": 1322 + }, + { + "epoch": 0.11601621749752339, + "grad_norm": 0.1884765625, + "learning_rate": 0.0029935827789994048, + "loss": 1.2511, + "step": 1323 + }, + { + "epoch": 0.11610390927189794, + "grad_norm": 0.06787109375, + "learning_rate": 0.002993543014129907, + "loss": 1.21, + "step": 1324 + }, + { + "epoch": 0.11619160104627248, + "grad_norm": 0.087890625, + "learning_rate": 0.0029935031267327543, + "loss": 1.275, + "step": 1325 + }, + { + "epoch": 0.11627929282064703, + "grad_norm": 0.06689453125, + "learning_rate": 0.0029934631168115847, + "loss": 1.1929, + "step": 1326 + }, + { + "epoch": 0.11636698459502158, + "grad_norm": 0.09326171875, + "learning_rate": 0.002993422984370047, + "loss": 1.2059, + "step": 1327 + }, + { + "epoch": 0.11645467636939612, + "grad_norm": 0.06640625, + "learning_rate": 0.002993382729411801, + "loss": 1.2447, + "step": 1328 + }, + { + "epoch": 0.11654236814377067, + "grad_norm": 0.1259765625, + "learning_rate": 0.0029933423519405184, + "loss": 1.2291, + "step": 1329 + }, + { + "epoch": 0.1166300599181452, + "grad_norm": 0.07373046875, + "learning_rate": 0.002993301851959881, + "loss": 1.1888, + "step": 1330 + }, + { + "epoch": 0.11671775169251976, + "grad_norm": 0.1005859375, + "learning_rate": 0.002993261229473582, + "loss": 1.223, + "step": 1331 + }, + { + "epoch": 0.1168054434668943, + "grad_norm": 0.0791015625, + "learning_rate": 0.0029932204844853273, + "loss": 1.2749, + "step": 1332 + }, + { + "epoch": 0.11689313524126885, + "grad_norm": 0.07958984375, + "learning_rate": 0.0029931796169988313, + "loss": 1.2093, + "step": 1333 + }, + { + "epoch": 0.11698082701564338, + "grad_norm": 0.095703125, + "learning_rate": 0.002993138627017822, + "loss": 1.2348, + "step": 1334 + }, + { + "epoch": 0.11706851879001794, + "grad_norm": 0.0625, + "learning_rate": 0.002993097514546037, + "loss": 1.2177, + "step": 1335 + }, + { + "epoch": 0.11715621056439249, + "grad_norm": 0.06689453125, + "learning_rate": 0.0029930562795872258, + "loss": 1.1629, + "step": 1336 + }, + { + "epoch": 0.11724390233876703, + "grad_norm": 0.1171875, + "learning_rate": 0.002993014922145149, + "loss": 1.2504, + "step": 1337 + }, + { + "epoch": 0.11733159411314158, + "grad_norm": 0.0791015625, + "learning_rate": 0.002992973442223578, + "loss": 1.2193, + "step": 1338 + }, + { + "epoch": 0.11741928588751611, + "grad_norm": 0.1533203125, + "learning_rate": 0.0029929318398262965, + "loss": 1.2525, + "step": 1339 + }, + { + "epoch": 0.11750697766189067, + "grad_norm": 0.0673828125, + "learning_rate": 0.002992890114957098, + "loss": 1.2871, + "step": 1340 + }, + { + "epoch": 0.1175946694362652, + "grad_norm": 0.11572265625, + "learning_rate": 0.0029928482676197872, + "loss": 1.1741, + "step": 1341 + }, + { + "epoch": 0.11768236121063975, + "grad_norm": 0.2109375, + "learning_rate": 0.002992806297818181, + "loss": 1.276, + "step": 1342 + }, + { + "epoch": 0.11777005298501429, + "grad_norm": 0.2119140625, + "learning_rate": 0.002992764205556107, + "loss": 1.2048, + "step": 1343 + }, + { + "epoch": 0.11785774475938884, + "grad_norm": 0.15234375, + "learning_rate": 0.0029927219908374037, + "loss": 1.2242, + "step": 1344 + }, + { + "epoch": 0.1179454365337634, + "grad_norm": 0.13671875, + "learning_rate": 0.002992679653665921, + "loss": 1.2332, + "step": 1345 + }, + { + "epoch": 0.11803312830813793, + "grad_norm": 0.1376953125, + "learning_rate": 0.0029926371940455204, + "loss": 1.2667, + "step": 1346 + }, + { + "epoch": 0.11812082008251248, + "grad_norm": 0.1611328125, + "learning_rate": 0.0029925946119800737, + "loss": 1.3023, + "step": 1347 + }, + { + "epoch": 0.11820851185688702, + "grad_norm": 0.095703125, + "learning_rate": 0.0029925519074734635, + "loss": 1.2397, + "step": 1348 + }, + { + "epoch": 0.11829620363126157, + "grad_norm": 0.13671875, + "learning_rate": 0.002992509080529586, + "loss": 1.2721, + "step": 1349 + }, + { + "epoch": 0.11838389540563611, + "grad_norm": 0.1220703125, + "learning_rate": 0.002992466131152345, + "loss": 1.3193, + "step": 1350 + }, + { + "epoch": 0.11847158718001066, + "grad_norm": 0.06982421875, + "learning_rate": 0.0029924230593456596, + "loss": 1.1955, + "step": 1351 + }, + { + "epoch": 0.1185592789543852, + "grad_norm": 0.1689453125, + "learning_rate": 0.002992379865113456, + "loss": 1.2117, + "step": 1352 + }, + { + "epoch": 0.11864697072875975, + "grad_norm": 0.07763671875, + "learning_rate": 0.0029923365484596743, + "loss": 1.2117, + "step": 1353 + }, + { + "epoch": 0.11873466250313429, + "grad_norm": 0.1123046875, + "learning_rate": 0.0029922931093882645, + "loss": 1.2643, + "step": 1354 + }, + { + "epoch": 0.11882235427750884, + "grad_norm": 0.1669921875, + "learning_rate": 0.002992249547903188, + "loss": 1.2068, + "step": 1355 + }, + { + "epoch": 0.11891004605188339, + "grad_norm": 0.07080078125, + "learning_rate": 0.002992205864008418, + "loss": 1.2446, + "step": 1356 + }, + { + "epoch": 0.11899773782625793, + "grad_norm": 0.150390625, + "learning_rate": 0.002992162057707938, + "loss": 1.2532, + "step": 1357 + }, + { + "epoch": 0.11908542960063248, + "grad_norm": 0.0849609375, + "learning_rate": 0.0029921181290057425, + "loss": 1.2755, + "step": 1358 + }, + { + "epoch": 0.11917312137500702, + "grad_norm": 0.0859375, + "learning_rate": 0.0029920740779058393, + "loss": 1.2407, + "step": 1359 + }, + { + "epoch": 0.11926081314938157, + "grad_norm": 0.10205078125, + "learning_rate": 0.0029920299044122445, + "loss": 1.2351, + "step": 1360 + }, + { + "epoch": 0.11934850492375611, + "grad_norm": 0.06689453125, + "learning_rate": 0.0029919856085289863, + "loss": 1.2408, + "step": 1361 + }, + { + "epoch": 0.11943619669813066, + "grad_norm": 0.1337890625, + "learning_rate": 0.0029919411902601055, + "loss": 1.2901, + "step": 1362 + }, + { + "epoch": 0.1195238884725052, + "grad_norm": 0.0849609375, + "learning_rate": 0.002991896649609652, + "loss": 1.2612, + "step": 1363 + }, + { + "epoch": 0.11961158024687975, + "grad_norm": 0.11376953125, + "learning_rate": 0.0029918519865816885, + "loss": 1.263, + "step": 1364 + }, + { + "epoch": 0.1196992720212543, + "grad_norm": 0.08837890625, + "learning_rate": 0.0029918072011802872, + "loss": 1.1916, + "step": 1365 + }, + { + "epoch": 0.11978696379562884, + "grad_norm": 0.09130859375, + "learning_rate": 0.002991762293409534, + "loss": 1.2163, + "step": 1366 + }, + { + "epoch": 0.11987465557000339, + "grad_norm": 0.09033203125, + "learning_rate": 0.002991717263273523, + "loss": 1.1645, + "step": 1367 + }, + { + "epoch": 0.11996234734437793, + "grad_norm": 0.09375, + "learning_rate": 0.0029916721107763606, + "loss": 1.1664, + "step": 1368 + }, + { + "epoch": 0.12005003911875248, + "grad_norm": 0.14453125, + "learning_rate": 0.0029916268359221655, + "loss": 1.295, + "step": 1369 + }, + { + "epoch": 0.12013773089312701, + "grad_norm": 0.06982421875, + "learning_rate": 0.002991581438715066, + "loss": 1.1667, + "step": 1370 + }, + { + "epoch": 0.12022542266750157, + "grad_norm": 0.107421875, + "learning_rate": 0.0029915359191592036, + "loss": 1.1757, + "step": 1371 + }, + { + "epoch": 0.1203131144418761, + "grad_norm": 0.1689453125, + "learning_rate": 0.002991490277258728, + "loss": 1.208, + "step": 1372 + }, + { + "epoch": 0.12040080621625066, + "grad_norm": 0.1455078125, + "learning_rate": 0.0029914445130178016, + "loss": 1.2035, + "step": 1373 + }, + { + "epoch": 0.1204884979906252, + "grad_norm": 0.171875, + "learning_rate": 0.002991398626440599, + "loss": 1.225, + "step": 1374 + }, + { + "epoch": 0.12057618976499974, + "grad_norm": 0.09228515625, + "learning_rate": 0.0029913526175313052, + "loss": 1.2107, + "step": 1375 + }, + { + "epoch": 0.1206638815393743, + "grad_norm": 0.140625, + "learning_rate": 0.0029913064862941144, + "loss": 1.2726, + "step": 1376 + }, + { + "epoch": 0.12075157331374883, + "grad_norm": 0.0703125, + "learning_rate": 0.002991260232733235, + "loss": 1.214, + "step": 1377 + }, + { + "epoch": 0.12083926508812338, + "grad_norm": 0.07568359375, + "learning_rate": 0.0029912138568528846, + "loss": 1.2616, + "step": 1378 + }, + { + "epoch": 0.12092695686249792, + "grad_norm": 0.06591796875, + "learning_rate": 0.002991167358657293, + "loss": 1.2894, + "step": 1379 + }, + { + "epoch": 0.12101464863687247, + "grad_norm": 0.06103515625, + "learning_rate": 0.0029911207381507, + "loss": 1.1904, + "step": 1380 + }, + { + "epoch": 0.12110234041124701, + "grad_norm": 0.06689453125, + "learning_rate": 0.0029910739953373584, + "loss": 1.1905, + "step": 1381 + }, + { + "epoch": 0.12119003218562156, + "grad_norm": 0.07958984375, + "learning_rate": 0.0029910271302215304, + "loss": 1.2156, + "step": 1382 + }, + { + "epoch": 0.12127772395999611, + "grad_norm": 0.06982421875, + "learning_rate": 0.0029909801428074896, + "loss": 1.2372, + "step": 1383 + }, + { + "epoch": 0.12136541573437065, + "grad_norm": 0.08154296875, + "learning_rate": 0.0029909330330995213, + "loss": 1.2494, + "step": 1384 + }, + { + "epoch": 0.1214531075087452, + "grad_norm": 0.0751953125, + "learning_rate": 0.0029908858011019226, + "loss": 1.2056, + "step": 1385 + }, + { + "epoch": 0.12154079928311974, + "grad_norm": 0.1181640625, + "learning_rate": 0.002990838446819, + "loss": 1.266, + "step": 1386 + }, + { + "epoch": 0.12162849105749429, + "grad_norm": 0.1357421875, + "learning_rate": 0.002990790970255072, + "loss": 1.2105, + "step": 1387 + }, + { + "epoch": 0.12171618283186883, + "grad_norm": 0.06396484375, + "learning_rate": 0.0029907433714144696, + "loss": 1.1864, + "step": 1388 + }, + { + "epoch": 0.12180387460624338, + "grad_norm": 0.138671875, + "learning_rate": 0.0029906956503015325, + "loss": 1.2204, + "step": 1389 + }, + { + "epoch": 0.12189156638061792, + "grad_norm": 0.0712890625, + "learning_rate": 0.002990647806920613, + "loss": 1.2312, + "step": 1390 + }, + { + "epoch": 0.12197925815499247, + "grad_norm": 0.1318359375, + "learning_rate": 0.0029905998412760744, + "loss": 1.2329, + "step": 1391 + }, + { + "epoch": 0.12206694992936702, + "grad_norm": 0.07080078125, + "learning_rate": 0.002990551753372291, + "loss": 1.1962, + "step": 1392 + }, + { + "epoch": 0.12215464170374156, + "grad_norm": 0.11865234375, + "learning_rate": 0.0029905035432136484, + "loss": 1.1929, + "step": 1393 + }, + { + "epoch": 0.12224233347811611, + "grad_norm": 0.07470703125, + "learning_rate": 0.0029904552108045426, + "loss": 1.2269, + "step": 1394 + }, + { + "epoch": 0.12233002525249065, + "grad_norm": 0.11572265625, + "learning_rate": 0.0029904067561493824, + "loss": 1.2734, + "step": 1395 + }, + { + "epoch": 0.1224177170268652, + "grad_norm": 0.09619140625, + "learning_rate": 0.0029903581792525866, + "loss": 1.1868, + "step": 1396 + }, + { + "epoch": 0.12250540880123974, + "grad_norm": 0.08837890625, + "learning_rate": 0.0029903094801185847, + "loss": 1.2306, + "step": 1397 + }, + { + "epoch": 0.12259310057561429, + "grad_norm": 0.08203125, + "learning_rate": 0.002990260658751818, + "loss": 1.2046, + "step": 1398 + }, + { + "epoch": 0.12268079234998883, + "grad_norm": 0.09619140625, + "learning_rate": 0.0029902117151567394, + "loss": 1.2347, + "step": 1399 + }, + { + "epoch": 0.12276848412436338, + "grad_norm": 0.0869140625, + "learning_rate": 0.002990162649337812, + "loss": 1.2616, + "step": 1400 + }, + { + "epoch": 0.12285617589873793, + "grad_norm": 0.07470703125, + "learning_rate": 0.00299011346129951, + "loss": 1.2644, + "step": 1401 + }, + { + "epoch": 0.12294386767311247, + "grad_norm": 0.095703125, + "learning_rate": 0.00299006415104632, + "loss": 1.2854, + "step": 1402 + }, + { + "epoch": 0.12303155944748702, + "grad_norm": 0.0771484375, + "learning_rate": 0.00299001471858274, + "loss": 1.2626, + "step": 1403 + }, + { + "epoch": 0.12311925122186156, + "grad_norm": 0.1357421875, + "learning_rate": 0.002989965163913276, + "loss": 1.186, + "step": 1404 + }, + { + "epoch": 0.12320694299623611, + "grad_norm": 0.07373046875, + "learning_rate": 0.002989915487042448, + "loss": 1.1977, + "step": 1405 + }, + { + "epoch": 0.12329463477061064, + "grad_norm": 0.12255859375, + "learning_rate": 0.002989865687974787, + "loss": 1.2557, + "step": 1406 + }, + { + "epoch": 0.1233823265449852, + "grad_norm": 0.06982421875, + "learning_rate": 0.0029898157667148334, + "loss": 1.2345, + "step": 1407 + }, + { + "epoch": 0.12347001831935973, + "grad_norm": 0.125, + "learning_rate": 0.0029897657232671408, + "loss": 1.2252, + "step": 1408 + }, + { + "epoch": 0.12355771009373429, + "grad_norm": 0.0966796875, + "learning_rate": 0.002989715557636273, + "loss": 1.2605, + "step": 1409 + }, + { + "epoch": 0.12364540186810884, + "grad_norm": 0.07958984375, + "learning_rate": 0.0029896652698268057, + "loss": 1.2567, + "step": 1410 + }, + { + "epoch": 0.12373309364248337, + "grad_norm": 0.080078125, + "learning_rate": 0.0029896148598433227, + "loss": 1.2003, + "step": 1411 + }, + { + "epoch": 0.12382078541685793, + "grad_norm": 0.08203125, + "learning_rate": 0.0029895643276904235, + "loss": 1.2087, + "step": 1412 + }, + { + "epoch": 0.12390847719123246, + "grad_norm": 0.09814453125, + "learning_rate": 0.0029895136733727157, + "loss": 1.1963, + "step": 1413 + }, + { + "epoch": 0.12399616896560701, + "grad_norm": 0.10107421875, + "learning_rate": 0.0029894628968948184, + "loss": 1.2341, + "step": 1414 + }, + { + "epoch": 0.12408386073998155, + "grad_norm": 0.12158203125, + "learning_rate": 0.002989411998261363, + "loss": 1.2918, + "step": 1415 + }, + { + "epoch": 0.1241715525143561, + "grad_norm": 0.08935546875, + "learning_rate": 0.0029893609774769908, + "loss": 1.2334, + "step": 1416 + }, + { + "epoch": 0.12425924428873064, + "grad_norm": 0.142578125, + "learning_rate": 0.0029893098345463547, + "loss": 1.2939, + "step": 1417 + }, + { + "epoch": 0.12434693606310519, + "grad_norm": 0.061279296875, + "learning_rate": 0.0029892585694741196, + "loss": 1.2374, + "step": 1418 + }, + { + "epoch": 0.12443462783747974, + "grad_norm": 0.08544921875, + "learning_rate": 0.00298920718226496, + "loss": 1.2522, + "step": 1419 + }, + { + "epoch": 0.12452231961185428, + "grad_norm": 0.076171875, + "learning_rate": 0.0029891556729235626, + "loss": 1.2733, + "step": 1420 + }, + { + "epoch": 0.12461001138622883, + "grad_norm": 0.0693359375, + "learning_rate": 0.002989104041454625, + "loss": 1.2333, + "step": 1421 + }, + { + "epoch": 0.12469770316060337, + "grad_norm": 0.1376953125, + "learning_rate": 0.002989052287862856, + "loss": 1.2635, + "step": 1422 + }, + { + "epoch": 0.12478539493497792, + "grad_norm": 0.11572265625, + "learning_rate": 0.002989000412152975, + "loss": 1.2516, + "step": 1423 + }, + { + "epoch": 0.12487308670935246, + "grad_norm": 0.07958984375, + "learning_rate": 0.0029889484143297124, + "loss": 1.2372, + "step": 1424 + }, + { + "epoch": 0.12496077848372701, + "grad_norm": 0.09521484375, + "learning_rate": 0.0029888962943978113, + "loss": 1.2064, + "step": 1425 + }, + { + "epoch": 0.12504847025810156, + "grad_norm": 0.064453125, + "learning_rate": 0.002988844052362024, + "loss": 1.2496, + "step": 1426 + }, + { + "epoch": 0.1251361620324761, + "grad_norm": 0.11767578125, + "learning_rate": 0.002988791688227116, + "loss": 1.2487, + "step": 1427 + }, + { + "epoch": 0.12522385380685064, + "grad_norm": 0.064453125, + "learning_rate": 0.002988739201997862, + "loss": 1.2427, + "step": 1428 + }, + { + "epoch": 0.1253115455812252, + "grad_norm": 0.11669921875, + "learning_rate": 0.002988686593679048, + "loss": 1.152, + "step": 1429 + }, + { + "epoch": 0.12539923735559974, + "grad_norm": 0.0751953125, + "learning_rate": 0.0029886338632754733, + "loss": 1.2219, + "step": 1430 + }, + { + "epoch": 0.12548692912997428, + "grad_norm": 0.0634765625, + "learning_rate": 0.0029885810107919456, + "loss": 1.2429, + "step": 1431 + }, + { + "epoch": 0.12557462090434882, + "grad_norm": 0.0791015625, + "learning_rate": 0.0029885280362332853, + "loss": 1.2552, + "step": 1432 + }, + { + "epoch": 0.12566231267872338, + "grad_norm": 0.05810546875, + "learning_rate": 0.0029884749396043237, + "loss": 1.2547, + "step": 1433 + }, + { + "epoch": 0.12575000445309792, + "grad_norm": 0.08740234375, + "learning_rate": 0.0029884217209099023, + "loss": 1.1672, + "step": 1434 + }, + { + "epoch": 0.12583769622747246, + "grad_norm": 0.0791015625, + "learning_rate": 0.0029883683801548754, + "loss": 1.2381, + "step": 1435 + }, + { + "epoch": 0.12592538800184702, + "grad_norm": 0.08544921875, + "learning_rate": 0.0029883149173441075, + "loss": 1.2191, + "step": 1436 + }, + { + "epoch": 0.12601307977622156, + "grad_norm": 0.1123046875, + "learning_rate": 0.0029882613324824737, + "loss": 1.2202, + "step": 1437 + }, + { + "epoch": 0.1261007715505961, + "grad_norm": 0.0712890625, + "learning_rate": 0.002988207625574861, + "loss": 1.2144, + "step": 1438 + }, + { + "epoch": 0.12618846332497063, + "grad_norm": 0.06884765625, + "learning_rate": 0.0029881537966261673, + "loss": 1.2211, + "step": 1439 + }, + { + "epoch": 0.1262761550993452, + "grad_norm": 0.06689453125, + "learning_rate": 0.0029880998456413023, + "loss": 1.2397, + "step": 1440 + }, + { + "epoch": 0.12636384687371974, + "grad_norm": 0.061279296875, + "learning_rate": 0.002988045772625185, + "loss": 1.1851, + "step": 1441 + }, + { + "epoch": 0.12645153864809427, + "grad_norm": 0.08642578125, + "learning_rate": 0.0029879915775827483, + "loss": 1.1996, + "step": 1442 + }, + { + "epoch": 0.1265392304224688, + "grad_norm": 0.0654296875, + "learning_rate": 0.002987937260518933, + "loss": 1.2325, + "step": 1443 + }, + { + "epoch": 0.12662692219684338, + "grad_norm": 0.11669921875, + "learning_rate": 0.0029878828214386934, + "loss": 1.2232, + "step": 1444 + }, + { + "epoch": 0.12671461397121792, + "grad_norm": 0.09521484375, + "learning_rate": 0.0029878282603469945, + "loss": 1.2375, + "step": 1445 + }, + { + "epoch": 0.12680230574559245, + "grad_norm": 0.06982421875, + "learning_rate": 0.0029877735772488117, + "loss": 1.2673, + "step": 1446 + }, + { + "epoch": 0.12688999751996702, + "grad_norm": 0.07421875, + "learning_rate": 0.0029877187721491323, + "loss": 1.2063, + "step": 1447 + }, + { + "epoch": 0.12697768929434156, + "grad_norm": 0.07373046875, + "learning_rate": 0.002987663845052954, + "loss": 1.213, + "step": 1448 + }, + { + "epoch": 0.1270653810687161, + "grad_norm": 0.0849609375, + "learning_rate": 0.002987608795965286, + "loss": 1.247, + "step": 1449 + }, + { + "epoch": 0.12715307284309063, + "grad_norm": 0.08203125, + "learning_rate": 0.002987553624891149, + "loss": 1.2318, + "step": 1450 + }, + { + "epoch": 0.1272407646174652, + "grad_norm": 0.0888671875, + "learning_rate": 0.0029874983318355745, + "loss": 1.2509, + "step": 1451 + }, + { + "epoch": 0.12732845639183973, + "grad_norm": 0.1201171875, + "learning_rate": 0.0029874429168036047, + "loss": 1.2573, + "step": 1452 + }, + { + "epoch": 0.12741614816621427, + "grad_norm": 0.1318359375, + "learning_rate": 0.0029873873798002934, + "loss": 1.1857, + "step": 1453 + }, + { + "epoch": 0.1275038399405888, + "grad_norm": 0.091796875, + "learning_rate": 0.0029873317208307056, + "loss": 1.2676, + "step": 1454 + }, + { + "epoch": 0.12759153171496337, + "grad_norm": 0.08935546875, + "learning_rate": 0.002987275939899917, + "loss": 1.2704, + "step": 1455 + }, + { + "epoch": 0.1276792234893379, + "grad_norm": 0.0908203125, + "learning_rate": 0.002987220037013015, + "loss": 1.1953, + "step": 1456 + }, + { + "epoch": 0.12776691526371245, + "grad_norm": 0.0830078125, + "learning_rate": 0.002987164012175098, + "loss": 1.2961, + "step": 1457 + }, + { + "epoch": 0.12785460703808701, + "grad_norm": 0.07958984375, + "learning_rate": 0.0029871078653912744, + "loss": 1.2668, + "step": 1458 + }, + { + "epoch": 0.12794229881246155, + "grad_norm": 0.1552734375, + "learning_rate": 0.0029870515966666654, + "loss": 1.2222, + "step": 1459 + }, + { + "epoch": 0.1280299905868361, + "grad_norm": 0.09033203125, + "learning_rate": 0.002986995206006402, + "loss": 1.2353, + "step": 1460 + }, + { + "epoch": 0.12811768236121063, + "grad_norm": 0.11181640625, + "learning_rate": 0.0029869386934156276, + "loss": 1.2168, + "step": 1461 + }, + { + "epoch": 0.1282053741355852, + "grad_norm": 0.06689453125, + "learning_rate": 0.0029868820588994957, + "loss": 1.1639, + "step": 1462 + }, + { + "epoch": 0.12829306590995973, + "grad_norm": 0.20703125, + "learning_rate": 0.0029868253024631715, + "loss": 1.2447, + "step": 1463 + }, + { + "epoch": 0.12838075768433427, + "grad_norm": 0.11083984375, + "learning_rate": 0.00298676842411183, + "loss": 1.2335, + "step": 1464 + }, + { + "epoch": 0.12846844945870883, + "grad_norm": 0.12255859375, + "learning_rate": 0.00298671142385066, + "loss": 1.222, + "step": 1465 + }, + { + "epoch": 0.12855614123308337, + "grad_norm": 0.06982421875, + "learning_rate": 0.0029866543016848577, + "loss": 1.2759, + "step": 1466 + }, + { + "epoch": 0.1286438330074579, + "grad_norm": 0.0888671875, + "learning_rate": 0.002986597057619634, + "loss": 1.1909, + "step": 1467 + }, + { + "epoch": 0.12873152478183245, + "grad_norm": 0.08935546875, + "learning_rate": 0.0029865396916602094, + "loss": 1.2919, + "step": 1468 + }, + { + "epoch": 0.128819216556207, + "grad_norm": 0.08056640625, + "learning_rate": 0.002986482203811815, + "loss": 1.2541, + "step": 1469 + }, + { + "epoch": 0.12890690833058155, + "grad_norm": 0.125, + "learning_rate": 0.002986424594079694, + "loss": 1.2356, + "step": 1470 + }, + { + "epoch": 0.1289946001049561, + "grad_norm": 0.0869140625, + "learning_rate": 0.0029863668624690995, + "loss": 1.2285, + "step": 1471 + }, + { + "epoch": 0.12908229187933062, + "grad_norm": 0.12890625, + "learning_rate": 0.0029863090089852972, + "loss": 1.2105, + "step": 1472 + }, + { + "epoch": 0.1291699836537052, + "grad_norm": 0.1474609375, + "learning_rate": 0.0029862510336335635, + "loss": 1.2278, + "step": 1473 + }, + { + "epoch": 0.12925767542807973, + "grad_norm": 0.134765625, + "learning_rate": 0.002986192936419184, + "loss": 1.2569, + "step": 1474 + }, + { + "epoch": 0.12934536720245426, + "grad_norm": 0.09375, + "learning_rate": 0.0029861347173474584, + "loss": 1.2992, + "step": 1475 + }, + { + "epoch": 0.12943305897682883, + "grad_norm": 0.1181640625, + "learning_rate": 0.0029860763764236963, + "loss": 1.2915, + "step": 1476 + }, + { + "epoch": 0.12952075075120337, + "grad_norm": 0.06494140625, + "learning_rate": 0.0029860179136532174, + "loss": 1.2971, + "step": 1477 + }, + { + "epoch": 0.1296084425255779, + "grad_norm": 0.1142578125, + "learning_rate": 0.0029859593290413535, + "loss": 1.2245, + "step": 1478 + }, + { + "epoch": 0.12969613429995244, + "grad_norm": 0.05712890625, + "learning_rate": 0.002985900622593448, + "loss": 1.1996, + "step": 1479 + }, + { + "epoch": 0.129783826074327, + "grad_norm": 0.11474609375, + "learning_rate": 0.0029858417943148534, + "loss": 1.2459, + "step": 1480 + }, + { + "epoch": 0.12987151784870155, + "grad_norm": 0.06640625, + "learning_rate": 0.0029857828442109366, + "loss": 1.1602, + "step": 1481 + }, + { + "epoch": 0.12995920962307608, + "grad_norm": 0.15625, + "learning_rate": 0.0029857237722870724, + "loss": 1.2804, + "step": 1482 + }, + { + "epoch": 0.13004690139745065, + "grad_norm": 0.0693359375, + "learning_rate": 0.002985664578548648, + "loss": 1.2021, + "step": 1483 + }, + { + "epoch": 0.13013459317182519, + "grad_norm": 0.2890625, + "learning_rate": 0.0029856052630010625, + "loss": 1.2046, + "step": 1484 + }, + { + "epoch": 0.13022228494619972, + "grad_norm": 0.0771484375, + "learning_rate": 0.002985545825649725, + "loss": 1.2553, + "step": 1485 + }, + { + "epoch": 0.13030997672057426, + "grad_norm": 0.08544921875, + "learning_rate": 0.0029854862665000554, + "loss": 1.2347, + "step": 1486 + }, + { + "epoch": 0.13039766849494883, + "grad_norm": 0.06689453125, + "learning_rate": 0.002985426585557486, + "loss": 1.1759, + "step": 1487 + }, + { + "epoch": 0.13048536026932336, + "grad_norm": 0.07470703125, + "learning_rate": 0.0029853667828274593, + "loss": 1.2221, + "step": 1488 + }, + { + "epoch": 0.1305730520436979, + "grad_norm": 0.08935546875, + "learning_rate": 0.0029853068583154292, + "loss": 1.2229, + "step": 1489 + }, + { + "epoch": 0.13066074381807244, + "grad_norm": 0.06689453125, + "learning_rate": 0.0029852468120268607, + "loss": 1.2474, + "step": 1490 + }, + { + "epoch": 0.130748435592447, + "grad_norm": 0.1171875, + "learning_rate": 0.00298518664396723, + "loss": 1.276, + "step": 1491 + }, + { + "epoch": 0.13083612736682154, + "grad_norm": 0.076171875, + "learning_rate": 0.0029851263541420246, + "loss": 1.259, + "step": 1492 + }, + { + "epoch": 0.13092381914119608, + "grad_norm": 0.09130859375, + "learning_rate": 0.0029850659425567413, + "loss": 1.2486, + "step": 1493 + }, + { + "epoch": 0.13101151091557064, + "grad_norm": 0.0927734375, + "learning_rate": 0.0029850054092168915, + "loss": 1.238, + "step": 1494 + }, + { + "epoch": 0.13109920268994518, + "grad_norm": 0.0849609375, + "learning_rate": 0.002984944754127994, + "loss": 1.233, + "step": 1495 + }, + { + "epoch": 0.13118689446431972, + "grad_norm": 0.091796875, + "learning_rate": 0.0029848839772955815, + "loss": 1.1471, + "step": 1496 + }, + { + "epoch": 0.13127458623869426, + "grad_norm": 0.0673828125, + "learning_rate": 0.002984823078725196, + "loss": 1.2504, + "step": 1497 + }, + { + "epoch": 0.13136227801306882, + "grad_norm": 0.09033203125, + "learning_rate": 0.0029847620584223917, + "loss": 1.1724, + "step": 1498 + }, + { + "epoch": 0.13144996978744336, + "grad_norm": 0.08447265625, + "learning_rate": 0.0029847009163927333, + "loss": 1.2235, + "step": 1499 + }, + { + "epoch": 0.1315376615618179, + "grad_norm": 0.07421875, + "learning_rate": 0.0029846396526417964, + "loss": 1.207, + "step": 1500 + }, + { + "epoch": 0.1315376615618179, + "eval_loss": 1.2418609857559204, + "eval_runtime": 429.7895, + "eval_samples_per_second": 33.614, + "eval_steps_per_second": 8.404, + "step": 1500 + }, + { + "epoch": 0.13162535333619246, + "grad_norm": 0.0791015625, + "learning_rate": 0.002984578267175169, + "loss": 1.21, + "step": 1501 + }, + { + "epoch": 0.131713045110567, + "grad_norm": 0.0693359375, + "learning_rate": 0.0029845167599984487, + "loss": 1.2382, + "step": 1502 + }, + { + "epoch": 0.13180073688494154, + "grad_norm": 0.07861328125, + "learning_rate": 0.002984455131117245, + "loss": 1.2137, + "step": 1503 + }, + { + "epoch": 0.13188842865931608, + "grad_norm": 0.06884765625, + "learning_rate": 0.0029843933805371783, + "loss": 1.2029, + "step": 1504 + }, + { + "epoch": 0.13197612043369064, + "grad_norm": 0.07470703125, + "learning_rate": 0.00298433150826388, + "loss": 1.2327, + "step": 1505 + }, + { + "epoch": 0.13206381220806518, + "grad_norm": 0.1357421875, + "learning_rate": 0.0029842695143029925, + "loss": 1.2136, + "step": 1506 + }, + { + "epoch": 0.13215150398243972, + "grad_norm": 0.07763671875, + "learning_rate": 0.0029842073986601696, + "loss": 1.2261, + "step": 1507 + }, + { + "epoch": 0.13223919575681425, + "grad_norm": 0.11279296875, + "learning_rate": 0.0029841451613410765, + "loss": 1.296, + "step": 1508 + }, + { + "epoch": 0.13232688753118882, + "grad_norm": 0.07763671875, + "learning_rate": 0.0029840828023513888, + "loss": 1.2243, + "step": 1509 + }, + { + "epoch": 0.13241457930556336, + "grad_norm": 0.146484375, + "learning_rate": 0.0029840203216967933, + "loss": 1.1826, + "step": 1510 + }, + { + "epoch": 0.1325022710799379, + "grad_norm": 0.07666015625, + "learning_rate": 0.002983957719382988, + "loss": 1.2358, + "step": 1511 + }, + { + "epoch": 0.13258996285431246, + "grad_norm": 0.1611328125, + "learning_rate": 0.0029838949954156826, + "loss": 1.255, + "step": 1512 + }, + { + "epoch": 0.132677654628687, + "grad_norm": 0.0986328125, + "learning_rate": 0.002983832149800597, + "loss": 1.2429, + "step": 1513 + }, + { + "epoch": 0.13276534640306153, + "grad_norm": 0.154296875, + "learning_rate": 0.0029837691825434624, + "loss": 1.3166, + "step": 1514 + }, + { + "epoch": 0.13285303817743607, + "grad_norm": 0.0966796875, + "learning_rate": 0.0029837060936500214, + "loss": 1.238, + "step": 1515 + }, + { + "epoch": 0.13294072995181064, + "grad_norm": 0.1513671875, + "learning_rate": 0.002983642883126028, + "loss": 1.246, + "step": 1516 + }, + { + "epoch": 0.13302842172618518, + "grad_norm": 0.1025390625, + "learning_rate": 0.002983579550977246, + "loss": 1.2287, + "step": 1517 + }, + { + "epoch": 0.1331161135005597, + "grad_norm": 0.08154296875, + "learning_rate": 0.002983516097209452, + "loss": 1.2517, + "step": 1518 + }, + { + "epoch": 0.13320380527493428, + "grad_norm": 0.0771484375, + "learning_rate": 0.002983452521828432, + "loss": 1.2551, + "step": 1519 + }, + { + "epoch": 0.13329149704930882, + "grad_norm": 0.072265625, + "learning_rate": 0.0029833888248399845, + "loss": 1.2386, + "step": 1520 + }, + { + "epoch": 0.13337918882368335, + "grad_norm": 0.08349609375, + "learning_rate": 0.0029833250062499188, + "loss": 1.2619, + "step": 1521 + }, + { + "epoch": 0.1334668805980579, + "grad_norm": 0.072265625, + "learning_rate": 0.0029832610660640536, + "loss": 1.3181, + "step": 1522 + }, + { + "epoch": 0.13355457237243246, + "grad_norm": 0.10888671875, + "learning_rate": 0.0029831970042882216, + "loss": 1.1977, + "step": 1523 + }, + { + "epoch": 0.133642264146807, + "grad_norm": 0.07568359375, + "learning_rate": 0.0029831328209282645, + "loss": 1.2195, + "step": 1524 + }, + { + "epoch": 0.13372995592118153, + "grad_norm": 0.142578125, + "learning_rate": 0.0029830685159900347, + "loss": 1.2581, + "step": 1525 + }, + { + "epoch": 0.13381764769555607, + "grad_norm": 0.10595703125, + "learning_rate": 0.0029830040894793983, + "loss": 1.2582, + "step": 1526 + }, + { + "epoch": 0.13390533946993063, + "grad_norm": 0.2041015625, + "learning_rate": 0.0029829395414022303, + "loss": 1.2592, + "step": 1527 + }, + { + "epoch": 0.13399303124430517, + "grad_norm": 0.06787109375, + "learning_rate": 0.0029828748717644167, + "loss": 1.2546, + "step": 1528 + }, + { + "epoch": 0.1340807230186797, + "grad_norm": 0.12890625, + "learning_rate": 0.0029828100805718554, + "loss": 1.2101, + "step": 1529 + }, + { + "epoch": 0.13416841479305427, + "grad_norm": 0.0791015625, + "learning_rate": 0.0029827451678304555, + "loss": 1.2072, + "step": 1530 + }, + { + "epoch": 0.1342561065674288, + "grad_norm": 0.111328125, + "learning_rate": 0.002982680133546137, + "loss": 1.27, + "step": 1531 + }, + { + "epoch": 0.13434379834180335, + "grad_norm": 0.061279296875, + "learning_rate": 0.0029826149777248305, + "loss": 1.1899, + "step": 1532 + }, + { + "epoch": 0.1344314901161779, + "grad_norm": 0.10986328125, + "learning_rate": 0.002982549700372478, + "loss": 1.2853, + "step": 1533 + }, + { + "epoch": 0.13451918189055245, + "grad_norm": 0.05712890625, + "learning_rate": 0.0029824843014950326, + "loss": 1.1747, + "step": 1534 + }, + { + "epoch": 0.134606873664927, + "grad_norm": 0.0869140625, + "learning_rate": 0.002982418781098459, + "loss": 1.2496, + "step": 1535 + }, + { + "epoch": 0.13469456543930153, + "grad_norm": 0.06884765625, + "learning_rate": 0.0029823531391887322, + "loss": 1.2428, + "step": 1536 + }, + { + "epoch": 0.1347822572136761, + "grad_norm": 0.09423828125, + "learning_rate": 0.0029822873757718387, + "loss": 1.2669, + "step": 1537 + }, + { + "epoch": 0.13486994898805063, + "grad_norm": 0.103515625, + "learning_rate": 0.0029822214908537758, + "loss": 1.2088, + "step": 1538 + }, + { + "epoch": 0.13495764076242517, + "grad_norm": 0.1474609375, + "learning_rate": 0.0029821554844405517, + "loss": 1.2522, + "step": 1539 + }, + { + "epoch": 0.1350453325367997, + "grad_norm": 0.06982421875, + "learning_rate": 0.002982089356538187, + "loss": 1.2662, + "step": 1540 + }, + { + "epoch": 0.13513302431117427, + "grad_norm": 0.11767578125, + "learning_rate": 0.002982023107152711, + "loss": 1.2763, + "step": 1541 + }, + { + "epoch": 0.1352207160855488, + "grad_norm": 0.06787109375, + "learning_rate": 0.0029819567362901664, + "loss": 1.2297, + "step": 1542 + }, + { + "epoch": 0.13530840785992335, + "grad_norm": 0.11328125, + "learning_rate": 0.002981890243956606, + "loss": 1.2362, + "step": 1543 + }, + { + "epoch": 0.13539609963429788, + "grad_norm": 0.06201171875, + "learning_rate": 0.002981823630158094, + "loss": 1.244, + "step": 1544 + }, + { + "epoch": 0.13548379140867245, + "grad_norm": 0.08056640625, + "learning_rate": 0.0029817568949007047, + "loss": 1.2439, + "step": 1545 + }, + { + "epoch": 0.135571483183047, + "grad_norm": 0.06396484375, + "learning_rate": 0.002981690038190524, + "loss": 1.229, + "step": 1546 + }, + { + "epoch": 0.13565917495742152, + "grad_norm": 0.10009765625, + "learning_rate": 0.0029816230600336504, + "loss": 1.2143, + "step": 1547 + }, + { + "epoch": 0.1357468667317961, + "grad_norm": 0.06005859375, + "learning_rate": 0.0029815559604361905, + "loss": 1.2115, + "step": 1548 + }, + { + "epoch": 0.13583455850617063, + "grad_norm": 0.10400390625, + "learning_rate": 0.002981488739404265, + "loss": 1.2156, + "step": 1549 + }, + { + "epoch": 0.13592225028054516, + "grad_norm": 0.08740234375, + "learning_rate": 0.0029814213969440034, + "loss": 1.2304, + "step": 1550 + }, + { + "epoch": 0.1360099420549197, + "grad_norm": 0.0712890625, + "learning_rate": 0.0029813539330615477, + "loss": 1.264, + "step": 1551 + }, + { + "epoch": 0.13609763382929427, + "grad_norm": 0.09423828125, + "learning_rate": 0.0029812863477630502, + "loss": 1.2226, + "step": 1552 + }, + { + "epoch": 0.1361853256036688, + "grad_norm": 0.0654296875, + "learning_rate": 0.002981218641054674, + "loss": 1.1539, + "step": 1553 + }, + { + "epoch": 0.13627301737804334, + "grad_norm": 0.07568359375, + "learning_rate": 0.002981150812942595, + "loss": 1.1669, + "step": 1554 + }, + { + "epoch": 0.1363607091524179, + "grad_norm": 0.1181640625, + "learning_rate": 0.0029810828634329973, + "loss": 1.2288, + "step": 1555 + }, + { + "epoch": 0.13644840092679245, + "grad_norm": 0.06591796875, + "learning_rate": 0.002981014792532079, + "loss": 1.2686, + "step": 1556 + }, + { + "epoch": 0.13653609270116698, + "grad_norm": 0.07763671875, + "learning_rate": 0.0029809466002460477, + "loss": 1.2389, + "step": 1557 + }, + { + "epoch": 0.13662378447554152, + "grad_norm": 0.060546875, + "learning_rate": 0.0029808782865811223, + "loss": 1.2592, + "step": 1558 + }, + { + "epoch": 0.1367114762499161, + "grad_norm": 0.0927734375, + "learning_rate": 0.002980809851543533, + "loss": 1.3417, + "step": 1559 + }, + { + "epoch": 0.13679916802429062, + "grad_norm": 0.0615234375, + "learning_rate": 0.0029807412951395203, + "loss": 1.2297, + "step": 1560 + }, + { + "epoch": 0.13688685979866516, + "grad_norm": 0.1484375, + "learning_rate": 0.0029806726173753372, + "loss": 1.2832, + "step": 1561 + }, + { + "epoch": 0.1369745515730397, + "grad_norm": 0.061767578125, + "learning_rate": 0.0029806038182572463, + "loss": 1.2379, + "step": 1562 + }, + { + "epoch": 0.13706224334741426, + "grad_norm": 0.15234375, + "learning_rate": 0.0029805348977915216, + "loss": 1.258, + "step": 1563 + }, + { + "epoch": 0.1371499351217888, + "grad_norm": 0.10791015625, + "learning_rate": 0.00298046585598445, + "loss": 1.2212, + "step": 1564 + }, + { + "epoch": 0.13723762689616334, + "grad_norm": 0.07177734375, + "learning_rate": 0.0029803966928423262, + "loss": 1.2305, + "step": 1565 + }, + { + "epoch": 0.1373253186705379, + "grad_norm": 0.09228515625, + "learning_rate": 0.0029803274083714582, + "loss": 1.2141, + "step": 1566 + }, + { + "epoch": 0.13741301044491244, + "grad_norm": 0.08544921875, + "learning_rate": 0.0029802580025781655, + "loss": 1.2364, + "step": 1567 + }, + { + "epoch": 0.13750070221928698, + "grad_norm": 0.06787109375, + "learning_rate": 0.0029801884754687767, + "loss": 1.2939, + "step": 1568 + }, + { + "epoch": 0.13758839399366152, + "grad_norm": 0.10986328125, + "learning_rate": 0.0029801188270496327, + "loss": 1.2379, + "step": 1569 + }, + { + "epoch": 0.13767608576803608, + "grad_norm": 0.06787109375, + "learning_rate": 0.002980049057327086, + "loss": 1.1832, + "step": 1570 + }, + { + "epoch": 0.13776377754241062, + "grad_norm": 0.06884765625, + "learning_rate": 0.002979979166307498, + "loss": 1.2445, + "step": 1571 + }, + { + "epoch": 0.13785146931678516, + "grad_norm": 0.0810546875, + "learning_rate": 0.002979909153997243, + "loss": 1.233, + "step": 1572 + }, + { + "epoch": 0.13793916109115972, + "grad_norm": 0.0625, + "learning_rate": 0.002979839020402707, + "loss": 1.1687, + "step": 1573 + }, + { + "epoch": 0.13802685286553426, + "grad_norm": 0.0654296875, + "learning_rate": 0.0029797687655302853, + "loss": 1.2213, + "step": 1574 + }, + { + "epoch": 0.1381145446399088, + "grad_norm": 0.08349609375, + "learning_rate": 0.002979698389386385, + "loss": 1.3031, + "step": 1575 + }, + { + "epoch": 0.13820223641428334, + "grad_norm": 0.1259765625, + "learning_rate": 0.002979627891977424, + "loss": 1.3023, + "step": 1576 + }, + { + "epoch": 0.1382899281886579, + "grad_norm": 0.0693359375, + "learning_rate": 0.0029795572733098317, + "loss": 1.1969, + "step": 1577 + }, + { + "epoch": 0.13837761996303244, + "grad_norm": 0.08349609375, + "learning_rate": 0.002979486533390048, + "loss": 1.2537, + "step": 1578 + }, + { + "epoch": 0.13846531173740698, + "grad_norm": 0.06591796875, + "learning_rate": 0.002979415672224525, + "loss": 1.2111, + "step": 1579 + }, + { + "epoch": 0.13855300351178151, + "grad_norm": 0.06982421875, + "learning_rate": 0.0029793446898197244, + "loss": 1.2083, + "step": 1580 + }, + { + "epoch": 0.13864069528615608, + "grad_norm": 0.08154296875, + "learning_rate": 0.00297927358618212, + "loss": 1.2314, + "step": 1581 + }, + { + "epoch": 0.13872838706053062, + "grad_norm": 0.07763671875, + "learning_rate": 0.0029792023613181957, + "loss": 1.1932, + "step": 1582 + }, + { + "epoch": 0.13881607883490515, + "grad_norm": 0.0625, + "learning_rate": 0.0029791310152344473, + "loss": 1.1545, + "step": 1583 + }, + { + "epoch": 0.13890377060927972, + "grad_norm": 0.08837890625, + "learning_rate": 0.0029790595479373813, + "loss": 1.2353, + "step": 1584 + }, + { + "epoch": 0.13899146238365426, + "grad_norm": 0.1416015625, + "learning_rate": 0.0029789879594335164, + "loss": 1.2443, + "step": 1585 + }, + { + "epoch": 0.1390791541580288, + "grad_norm": 0.07275390625, + "learning_rate": 0.0029789162497293794, + "loss": 1.2265, + "step": 1586 + }, + { + "epoch": 0.13916684593240333, + "grad_norm": 0.09619140625, + "learning_rate": 0.002978844418831511, + "loss": 1.2414, + "step": 1587 + }, + { + "epoch": 0.1392545377067779, + "grad_norm": 0.08544921875, + "learning_rate": 0.0029787724667464624, + "loss": 1.2488, + "step": 1588 + }, + { + "epoch": 0.13934222948115244, + "grad_norm": 0.060791015625, + "learning_rate": 0.002978700393480795, + "loss": 1.1473, + "step": 1589 + }, + { + "epoch": 0.13942992125552697, + "grad_norm": 0.09619140625, + "learning_rate": 0.0029786281990410823, + "loss": 1.3154, + "step": 1590 + }, + { + "epoch": 0.13951761302990154, + "grad_norm": 0.08837890625, + "learning_rate": 0.0029785558834339067, + "loss": 1.2534, + "step": 1591 + }, + { + "epoch": 0.13960530480427608, + "grad_norm": 0.06884765625, + "learning_rate": 0.002978483446665865, + "loss": 1.201, + "step": 1592 + }, + { + "epoch": 0.1396929965786506, + "grad_norm": 0.06494140625, + "learning_rate": 0.0029784108887435617, + "loss": 1.2566, + "step": 1593 + }, + { + "epoch": 0.13978068835302515, + "grad_norm": 0.07861328125, + "learning_rate": 0.0029783382096736145, + "loss": 1.2412, + "step": 1594 + }, + { + "epoch": 0.13986838012739972, + "grad_norm": 0.13671875, + "learning_rate": 0.0029782654094626525, + "loss": 1.2333, + "step": 1595 + }, + { + "epoch": 0.13995607190177425, + "grad_norm": 0.0830078125, + "learning_rate": 0.0029781924881173137, + "loss": 1.237, + "step": 1596 + }, + { + "epoch": 0.1400437636761488, + "grad_norm": 0.12060546875, + "learning_rate": 0.0029781194456442485, + "loss": 1.2805, + "step": 1597 + }, + { + "epoch": 0.14013145545052333, + "grad_norm": 0.134765625, + "learning_rate": 0.0029780462820501188, + "loss": 1.2017, + "step": 1598 + }, + { + "epoch": 0.1402191472248979, + "grad_norm": 0.1259765625, + "learning_rate": 0.0029779729973415958, + "loss": 1.2132, + "step": 1599 + }, + { + "epoch": 0.14030683899927243, + "grad_norm": 0.1591796875, + "learning_rate": 0.0029778995915253643, + "loss": 1.2458, + "step": 1600 + }, + { + "epoch": 0.14039453077364697, + "grad_norm": 0.10986328125, + "learning_rate": 0.002977826064608118, + "loss": 1.1536, + "step": 1601 + }, + { + "epoch": 0.14048222254802153, + "grad_norm": 0.1328125, + "learning_rate": 0.002977752416596562, + "loss": 1.1708, + "step": 1602 + }, + { + "epoch": 0.14056991432239607, + "grad_norm": 0.10400390625, + "learning_rate": 0.002977678647497413, + "loss": 1.2385, + "step": 1603 + }, + { + "epoch": 0.1406576060967706, + "grad_norm": 0.1171875, + "learning_rate": 0.0029776047573173993, + "loss": 1.2509, + "step": 1604 + }, + { + "epoch": 0.14074529787114515, + "grad_norm": 0.0791015625, + "learning_rate": 0.0029775307460632584, + "loss": 1.2119, + "step": 1605 + }, + { + "epoch": 0.1408329896455197, + "grad_norm": 0.11376953125, + "learning_rate": 0.002977456613741741, + "loss": 1.2478, + "step": 1606 + }, + { + "epoch": 0.14092068141989425, + "grad_norm": 0.068359375, + "learning_rate": 0.002977382360359607, + "loss": 1.2545, + "step": 1607 + }, + { + "epoch": 0.1410083731942688, + "grad_norm": 0.111328125, + "learning_rate": 0.002977307985923628, + "loss": 1.2298, + "step": 1608 + }, + { + "epoch": 0.14109606496864335, + "grad_norm": 0.0859375, + "learning_rate": 0.0029772334904405876, + "loss": 1.1832, + "step": 1609 + }, + { + "epoch": 0.1411837567430179, + "grad_norm": 0.10009765625, + "learning_rate": 0.002977158873917279, + "loss": 1.2469, + "step": 1610 + }, + { + "epoch": 0.14127144851739243, + "grad_norm": 0.10595703125, + "learning_rate": 0.0029770841363605067, + "loss": 1.1923, + "step": 1611 + }, + { + "epoch": 0.14135914029176697, + "grad_norm": 0.099609375, + "learning_rate": 0.0029770092777770874, + "loss": 1.2275, + "step": 1612 + }, + { + "epoch": 0.14144683206614153, + "grad_norm": 0.061279296875, + "learning_rate": 0.002976934298173848, + "loss": 1.2375, + "step": 1613 + }, + { + "epoch": 0.14153452384051607, + "grad_norm": 0.06201171875, + "learning_rate": 0.0029768591975576253, + "loss": 1.2115, + "step": 1614 + }, + { + "epoch": 0.1416222156148906, + "grad_norm": 0.07568359375, + "learning_rate": 0.0029767839759352694, + "loss": 1.2172, + "step": 1615 + }, + { + "epoch": 0.14170990738926514, + "grad_norm": 0.06787109375, + "learning_rate": 0.0029767086333136405, + "loss": 1.2543, + "step": 1616 + }, + { + "epoch": 0.1417975991636397, + "grad_norm": 0.0810546875, + "learning_rate": 0.0029766331696996083, + "loss": 1.2789, + "step": 1617 + }, + { + "epoch": 0.14188529093801425, + "grad_norm": 0.061279296875, + "learning_rate": 0.002976557585100056, + "loss": 1.277, + "step": 1618 + }, + { + "epoch": 0.14197298271238878, + "grad_norm": 0.09716796875, + "learning_rate": 0.0029764818795218763, + "loss": 1.2974, + "step": 1619 + }, + { + "epoch": 0.14206067448676335, + "grad_norm": 0.08203125, + "learning_rate": 0.002976406052971974, + "loss": 1.263, + "step": 1620 + }, + { + "epoch": 0.1421483662611379, + "grad_norm": 0.072265625, + "learning_rate": 0.0029763301054572633, + "loss": 1.2491, + "step": 1621 + }, + { + "epoch": 0.14223605803551242, + "grad_norm": 0.09521484375, + "learning_rate": 0.0029762540369846708, + "loss": 1.1965, + "step": 1622 + }, + { + "epoch": 0.14232374980988696, + "grad_norm": 0.061767578125, + "learning_rate": 0.002976177847561134, + "loss": 1.1656, + "step": 1623 + }, + { + "epoch": 0.14241144158426153, + "grad_norm": 0.09912109375, + "learning_rate": 0.0029761015371936013, + "loss": 1.2432, + "step": 1624 + }, + { + "epoch": 0.14249913335863607, + "grad_norm": 0.0625, + "learning_rate": 0.0029760251058890312, + "loss": 1.2994, + "step": 1625 + }, + { + "epoch": 0.1425868251330106, + "grad_norm": 0.1103515625, + "learning_rate": 0.002975948553654395, + "loss": 1.3091, + "step": 1626 + }, + { + "epoch": 0.14267451690738517, + "grad_norm": 0.0908203125, + "learning_rate": 0.002975871880496673, + "loss": 1.2049, + "step": 1627 + }, + { + "epoch": 0.1427622086817597, + "grad_norm": 0.11572265625, + "learning_rate": 0.002975795086422859, + "loss": 1.2023, + "step": 1628 + }, + { + "epoch": 0.14284990045613424, + "grad_norm": 0.11083984375, + "learning_rate": 0.002975718171439955, + "loss": 1.2296, + "step": 1629 + }, + { + "epoch": 0.14293759223050878, + "grad_norm": 0.08642578125, + "learning_rate": 0.002975641135554977, + "loss": 1.2299, + "step": 1630 + }, + { + "epoch": 0.14302528400488335, + "grad_norm": 0.1044921875, + "learning_rate": 0.0029755639787749488, + "loss": 1.2367, + "step": 1631 + }, + { + "epoch": 0.14311297577925788, + "grad_norm": 0.060791015625, + "learning_rate": 0.0029754867011069076, + "loss": 1.2048, + "step": 1632 + }, + { + "epoch": 0.14320066755363242, + "grad_norm": 0.09912109375, + "learning_rate": 0.0029754093025579015, + "loss": 1.2234, + "step": 1633 + }, + { + "epoch": 0.14328835932800696, + "grad_norm": 0.0625, + "learning_rate": 0.002975331783134988, + "loss": 1.2113, + "step": 1634 + }, + { + "epoch": 0.14337605110238152, + "grad_norm": 0.1171875, + "learning_rate": 0.0029752541428452375, + "loss": 1.1891, + "step": 1635 + }, + { + "epoch": 0.14346374287675606, + "grad_norm": 0.09130859375, + "learning_rate": 0.0029751763816957305, + "loss": 1.2037, + "step": 1636 + }, + { + "epoch": 0.1435514346511306, + "grad_norm": 0.1455078125, + "learning_rate": 0.002975098499693558, + "loss": 1.2415, + "step": 1637 + }, + { + "epoch": 0.14363912642550516, + "grad_norm": 0.0927734375, + "learning_rate": 0.0029750204968458233, + "loss": 1.2437, + "step": 1638 + }, + { + "epoch": 0.1437268181998797, + "grad_norm": 0.1640625, + "learning_rate": 0.0029749423731596394, + "loss": 1.1978, + "step": 1639 + }, + { + "epoch": 0.14381450997425424, + "grad_norm": 0.11767578125, + "learning_rate": 0.002974864128642132, + "loss": 1.2061, + "step": 1640 + }, + { + "epoch": 0.14390220174862878, + "grad_norm": 0.064453125, + "learning_rate": 0.002974785763300436, + "loss": 1.2559, + "step": 1641 + }, + { + "epoch": 0.14398989352300334, + "grad_norm": 0.08154296875, + "learning_rate": 0.002974707277141698, + "loss": 1.2282, + "step": 1642 + }, + { + "epoch": 0.14407758529737788, + "grad_norm": 0.1142578125, + "learning_rate": 0.0029746286701730763, + "loss": 1.2323, + "step": 1643 + }, + { + "epoch": 0.14416527707175242, + "grad_norm": 0.107421875, + "learning_rate": 0.0029745499424017395, + "loss": 1.2645, + "step": 1644 + }, + { + "epoch": 0.14425296884612698, + "grad_norm": 0.0830078125, + "learning_rate": 0.002974471093834867, + "loss": 1.2442, + "step": 1645 + }, + { + "epoch": 0.14434066062050152, + "grad_norm": 0.1162109375, + "learning_rate": 0.00297439212447965, + "loss": 1.2318, + "step": 1646 + }, + { + "epoch": 0.14442835239487606, + "grad_norm": 0.07080078125, + "learning_rate": 0.00297431303434329, + "loss": 1.2473, + "step": 1647 + }, + { + "epoch": 0.1445160441692506, + "grad_norm": 0.055908203125, + "learning_rate": 0.002974233823433, + "loss": 1.2531, + "step": 1648 + }, + { + "epoch": 0.14460373594362516, + "grad_norm": 0.06640625, + "learning_rate": 0.002974154491756004, + "loss": 1.2138, + "step": 1649 + }, + { + "epoch": 0.1446914277179997, + "grad_norm": 0.064453125, + "learning_rate": 0.002974075039319536, + "loss": 1.185, + "step": 1650 + }, + { + "epoch": 0.14477911949237424, + "grad_norm": 0.07763671875, + "learning_rate": 0.0029739954661308437, + "loss": 1.1755, + "step": 1651 + }, + { + "epoch": 0.14486681126674877, + "grad_norm": 0.0654296875, + "learning_rate": 0.002973915772197182, + "loss": 1.2024, + "step": 1652 + }, + { + "epoch": 0.14495450304112334, + "grad_norm": 0.06591796875, + "learning_rate": 0.0029738359575258192, + "loss": 1.2599, + "step": 1653 + }, + { + "epoch": 0.14504219481549788, + "grad_norm": 0.06787109375, + "learning_rate": 0.002973756022124035, + "loss": 1.2355, + "step": 1654 + }, + { + "epoch": 0.14512988658987241, + "grad_norm": 0.059814453125, + "learning_rate": 0.0029736759659991186, + "loss": 1.2386, + "step": 1655 + }, + { + "epoch": 0.14521757836424698, + "grad_norm": 0.10693359375, + "learning_rate": 0.0029735957891583713, + "loss": 1.1994, + "step": 1656 + }, + { + "epoch": 0.14530527013862152, + "grad_norm": 0.080078125, + "learning_rate": 0.0029735154916091053, + "loss": 1.263, + "step": 1657 + }, + { + "epoch": 0.14539296191299605, + "grad_norm": 0.068359375, + "learning_rate": 0.002973435073358643, + "loss": 1.2615, + "step": 1658 + }, + { + "epoch": 0.1454806536873706, + "grad_norm": 0.07470703125, + "learning_rate": 0.0029733545344143176, + "loss": 1.2311, + "step": 1659 + }, + { + "epoch": 0.14556834546174516, + "grad_norm": 0.08154296875, + "learning_rate": 0.002973273874783475, + "loss": 1.2165, + "step": 1660 + }, + { + "epoch": 0.1456560372361197, + "grad_norm": 0.0771484375, + "learning_rate": 0.0029731930944734723, + "loss": 1.2473, + "step": 1661 + }, + { + "epoch": 0.14574372901049423, + "grad_norm": 0.0810546875, + "learning_rate": 0.0029731121934916736, + "loss": 1.2503, + "step": 1662 + }, + { + "epoch": 0.14583142078486877, + "grad_norm": 0.0673828125, + "learning_rate": 0.0029730311718454594, + "loss": 1.329, + "step": 1663 + }, + { + "epoch": 0.14591911255924334, + "grad_norm": 0.08984375, + "learning_rate": 0.002972950029542218, + "loss": 1.2774, + "step": 1664 + }, + { + "epoch": 0.14600680433361787, + "grad_norm": 0.1279296875, + "learning_rate": 0.002972868766589348, + "loss": 1.2674, + "step": 1665 + }, + { + "epoch": 0.1460944961079924, + "grad_norm": 0.07958984375, + "learning_rate": 0.0029727873829942623, + "loss": 1.2709, + "step": 1666 + }, + { + "epoch": 0.14618218788236698, + "grad_norm": 0.0693359375, + "learning_rate": 0.002972705878764382, + "loss": 1.2515, + "step": 1667 + }, + { + "epoch": 0.1462698796567415, + "grad_norm": 0.06982421875, + "learning_rate": 0.0029726242539071398, + "loss": 1.1644, + "step": 1668 + }, + { + "epoch": 0.14635757143111605, + "grad_norm": 0.076171875, + "learning_rate": 0.0029725425084299803, + "loss": 1.1872, + "step": 1669 + }, + { + "epoch": 0.1464452632054906, + "grad_norm": 0.11572265625, + "learning_rate": 0.0029724606423403577, + "loss": 1.2981, + "step": 1670 + }, + { + "epoch": 0.14653295497986515, + "grad_norm": 0.06298828125, + "learning_rate": 0.0029723786556457386, + "loss": 1.2188, + "step": 1671 + }, + { + "epoch": 0.1466206467542397, + "grad_norm": 0.1259765625, + "learning_rate": 0.0029722965483535996, + "loss": 1.1862, + "step": 1672 + }, + { + "epoch": 0.14670833852861423, + "grad_norm": 0.064453125, + "learning_rate": 0.0029722143204714293, + "loss": 1.253, + "step": 1673 + }, + { + "epoch": 0.1467960303029888, + "grad_norm": 0.14453125, + "learning_rate": 0.0029721319720067262, + "loss": 1.2437, + "step": 1674 + }, + { + "epoch": 0.14688372207736333, + "grad_norm": 0.1376953125, + "learning_rate": 0.002972049502967, + "loss": 1.2417, + "step": 1675 + }, + { + "epoch": 0.14697141385173787, + "grad_norm": 0.1376953125, + "learning_rate": 0.0029719669133597723, + "loss": 1.3076, + "step": 1676 + }, + { + "epoch": 0.1470591056261124, + "grad_norm": 0.228515625, + "learning_rate": 0.002971884203192575, + "loss": 1.255, + "step": 1677 + }, + { + "epoch": 0.14714679740048697, + "grad_norm": 0.08349609375, + "learning_rate": 0.00297180137247295, + "loss": 1.2467, + "step": 1678 + }, + { + "epoch": 0.1472344891748615, + "grad_norm": 0.23046875, + "learning_rate": 0.002971718421208453, + "loss": 1.2239, + "step": 1679 + }, + { + "epoch": 0.14732218094923605, + "grad_norm": 0.0869140625, + "learning_rate": 0.002971635349406647, + "loss": 1.2074, + "step": 1680 + }, + { + "epoch": 0.14740987272361059, + "grad_norm": 0.134765625, + "learning_rate": 0.0029715521570751096, + "loss": 1.2576, + "step": 1681 + }, + { + "epoch": 0.14749756449798515, + "grad_norm": 0.07568359375, + "learning_rate": 0.002971468844221427, + "loss": 1.2036, + "step": 1682 + }, + { + "epoch": 0.1475852562723597, + "grad_norm": 0.09619140625, + "learning_rate": 0.0029713854108531965, + "loss": 1.2238, + "step": 1683 + }, + { + "epoch": 0.14767294804673423, + "grad_norm": 0.0830078125, + "learning_rate": 0.0029713018569780284, + "loss": 1.2164, + "step": 1684 + }, + { + "epoch": 0.1477606398211088, + "grad_norm": 0.10400390625, + "learning_rate": 0.0029712181826035415, + "loss": 1.2165, + "step": 1685 + }, + { + "epoch": 0.14784833159548333, + "grad_norm": 0.068359375, + "learning_rate": 0.0029711343877373672, + "loss": 1.2262, + "step": 1686 + }, + { + "epoch": 0.14793602336985787, + "grad_norm": 0.107421875, + "learning_rate": 0.0029710504723871474, + "loss": 1.201, + "step": 1687 + }, + { + "epoch": 0.1480237151442324, + "grad_norm": 0.08251953125, + "learning_rate": 0.0029709664365605345, + "loss": 1.2723, + "step": 1688 + }, + { + "epoch": 0.14811140691860697, + "grad_norm": 0.1591796875, + "learning_rate": 0.0029708822802651928, + "loss": 1.2362, + "step": 1689 + }, + { + "epoch": 0.1481990986929815, + "grad_norm": 0.060791015625, + "learning_rate": 0.0029707980035087967, + "loss": 1.184, + "step": 1690 + }, + { + "epoch": 0.14828679046735604, + "grad_norm": 0.224609375, + "learning_rate": 0.002970713606299033, + "loss": 1.1951, + "step": 1691 + }, + { + "epoch": 0.1483744822417306, + "grad_norm": 0.08544921875, + "learning_rate": 0.002970629088643597, + "loss": 1.2467, + "step": 1692 + }, + { + "epoch": 0.14846217401610515, + "grad_norm": 0.1748046875, + "learning_rate": 0.0029705444505501977, + "loss": 1.2589, + "step": 1693 + }, + { + "epoch": 0.14854986579047968, + "grad_norm": 0.060791015625, + "learning_rate": 0.0029704596920265536, + "loss": 1.2283, + "step": 1694 + }, + { + "epoch": 0.14863755756485422, + "grad_norm": 0.06396484375, + "learning_rate": 0.0029703748130803943, + "loss": 1.2138, + "step": 1695 + }, + { + "epoch": 0.1487252493392288, + "grad_norm": 0.07080078125, + "learning_rate": 0.0029702898137194604, + "loss": 1.221, + "step": 1696 + }, + { + "epoch": 0.14881294111360333, + "grad_norm": 0.07275390625, + "learning_rate": 0.0029702046939515036, + "loss": 1.235, + "step": 1697 + }, + { + "epoch": 0.14890063288797786, + "grad_norm": 0.09033203125, + "learning_rate": 0.002970119453784287, + "loss": 1.3198, + "step": 1698 + }, + { + "epoch": 0.1489883246623524, + "grad_norm": 0.07373046875, + "learning_rate": 0.0029700340932255842, + "loss": 1.2292, + "step": 1699 + }, + { + "epoch": 0.14907601643672697, + "grad_norm": 0.0771484375, + "learning_rate": 0.0029699486122831795, + "loss": 1.2851, + "step": 1700 + }, + { + "epoch": 0.1491637082111015, + "grad_norm": 0.07958984375, + "learning_rate": 0.002969863010964869, + "loss": 1.2093, + "step": 1701 + }, + { + "epoch": 0.14925139998547604, + "grad_norm": 0.0654296875, + "learning_rate": 0.002969777289278459, + "loss": 1.2072, + "step": 1702 + }, + { + "epoch": 0.1493390917598506, + "grad_norm": 0.0703125, + "learning_rate": 0.0029696914472317672, + "loss": 1.2596, + "step": 1703 + }, + { + "epoch": 0.14942678353422514, + "grad_norm": 0.08203125, + "learning_rate": 0.0029696054848326226, + "loss": 1.2439, + "step": 1704 + }, + { + "epoch": 0.14951447530859968, + "grad_norm": 0.07177734375, + "learning_rate": 0.002969519402088864, + "loss": 1.2596, + "step": 1705 + }, + { + "epoch": 0.14960216708297422, + "grad_norm": 0.059326171875, + "learning_rate": 0.002969433199008342, + "loss": 1.2164, + "step": 1706 + }, + { + "epoch": 0.14968985885734878, + "grad_norm": 0.05859375, + "learning_rate": 0.0029693468755989188, + "loss": 1.2637, + "step": 1707 + }, + { + "epoch": 0.14977755063172332, + "grad_norm": 0.06787109375, + "learning_rate": 0.002969260431868466, + "loss": 1.1995, + "step": 1708 + }, + { + "epoch": 0.14986524240609786, + "grad_norm": 0.08154296875, + "learning_rate": 0.002969173867824868, + "loss": 1.2292, + "step": 1709 + }, + { + "epoch": 0.14995293418047242, + "grad_norm": 0.07470703125, + "learning_rate": 0.002969087183476018, + "loss": 1.2604, + "step": 1710 + }, + { + "epoch": 0.15004062595484696, + "grad_norm": 0.072265625, + "learning_rate": 0.0029690003788298224, + "loss": 1.2612, + "step": 1711 + }, + { + "epoch": 0.1501283177292215, + "grad_norm": 0.08642578125, + "learning_rate": 0.002968913453894197, + "loss": 1.1911, + "step": 1712 + }, + { + "epoch": 0.15021600950359604, + "grad_norm": 0.0947265625, + "learning_rate": 0.00296882640867707, + "loss": 1.1865, + "step": 1713 + }, + { + "epoch": 0.1503037012779706, + "grad_norm": 0.06884765625, + "learning_rate": 0.0029687392431863788, + "loss": 1.2322, + "step": 1714 + }, + { + "epoch": 0.15039139305234514, + "grad_norm": 0.1513671875, + "learning_rate": 0.0029686519574300724, + "loss": 1.3256, + "step": 1715 + }, + { + "epoch": 0.15047908482671968, + "grad_norm": 0.06591796875, + "learning_rate": 0.0029685645514161123, + "loss": 1.1571, + "step": 1716 + }, + { + "epoch": 0.15056677660109422, + "grad_norm": 0.1201171875, + "learning_rate": 0.0029684770251524684, + "loss": 1.1894, + "step": 1717 + }, + { + "epoch": 0.15065446837546878, + "grad_norm": 0.0751953125, + "learning_rate": 0.002968389378647124, + "loss": 1.2602, + "step": 1718 + }, + { + "epoch": 0.15074216014984332, + "grad_norm": 0.1005859375, + "learning_rate": 0.0029683016119080715, + "loss": 1.2948, + "step": 1719 + }, + { + "epoch": 0.15082985192421786, + "grad_norm": 0.12158203125, + "learning_rate": 0.002968213724943315, + "loss": 1.2603, + "step": 1720 + }, + { + "epoch": 0.15091754369859242, + "grad_norm": 0.0673828125, + "learning_rate": 0.0029681257177608697, + "loss": 1.2357, + "step": 1721 + }, + { + "epoch": 0.15100523547296696, + "grad_norm": 0.1015625, + "learning_rate": 0.0029680375903687614, + "loss": 1.2241, + "step": 1722 + }, + { + "epoch": 0.1510929272473415, + "grad_norm": 0.0849609375, + "learning_rate": 0.0029679493427750277, + "loss": 1.291, + "step": 1723 + }, + { + "epoch": 0.15118061902171603, + "grad_norm": 0.08984375, + "learning_rate": 0.002967860974987716, + "loss": 1.2624, + "step": 1724 + }, + { + "epoch": 0.1512683107960906, + "grad_norm": 0.07666015625, + "learning_rate": 0.002967772487014886, + "loss": 1.2348, + "step": 1725 + }, + { + "epoch": 0.15135600257046514, + "grad_norm": 0.07958984375, + "learning_rate": 0.0029676838788646066, + "loss": 1.2996, + "step": 1726 + }, + { + "epoch": 0.15144369434483967, + "grad_norm": 0.06640625, + "learning_rate": 0.0029675951505449593, + "loss": 1.2744, + "step": 1727 + }, + { + "epoch": 0.15153138611921424, + "grad_norm": 0.0908203125, + "learning_rate": 0.002967506302064035, + "loss": 1.2476, + "step": 1728 + }, + { + "epoch": 0.15161907789358878, + "grad_norm": 0.1103515625, + "learning_rate": 0.0029674173334299377, + "loss": 1.1993, + "step": 1729 + }, + { + "epoch": 0.15170676966796331, + "grad_norm": 0.0595703125, + "learning_rate": 0.00296732824465078, + "loss": 1.3051, + "step": 1730 + }, + { + "epoch": 0.15179446144233785, + "grad_norm": 0.08251953125, + "learning_rate": 0.0029672390357346873, + "loss": 1.2253, + "step": 1731 + }, + { + "epoch": 0.15188215321671242, + "grad_norm": 0.06494140625, + "learning_rate": 0.002967149706689795, + "loss": 1.2658, + "step": 1732 + }, + { + "epoch": 0.15196984499108696, + "grad_norm": 0.11767578125, + "learning_rate": 0.00296706025752425, + "loss": 1.184, + "step": 1733 + }, + { + "epoch": 0.1520575367654615, + "grad_norm": 0.060791015625, + "learning_rate": 0.002966970688246209, + "loss": 1.221, + "step": 1734 + }, + { + "epoch": 0.15214522853983603, + "grad_norm": 0.0810546875, + "learning_rate": 0.0029668809988638405, + "loss": 1.2489, + "step": 1735 + }, + { + "epoch": 0.1522329203142106, + "grad_norm": 0.064453125, + "learning_rate": 0.002966791189385325, + "loss": 1.2001, + "step": 1736 + }, + { + "epoch": 0.15232061208858513, + "grad_norm": 0.09375, + "learning_rate": 0.0029667012598188526, + "loss": 1.2649, + "step": 1737 + }, + { + "epoch": 0.15240830386295967, + "grad_norm": 0.09130859375, + "learning_rate": 0.0029666112101726237, + "loss": 1.1744, + "step": 1738 + }, + { + "epoch": 0.15249599563733424, + "grad_norm": 0.1318359375, + "learning_rate": 0.002966521040454852, + "loss": 1.2393, + "step": 1739 + }, + { + "epoch": 0.15258368741170877, + "grad_norm": 0.1376953125, + "learning_rate": 0.0029664307506737596, + "loss": 1.2344, + "step": 1740 + }, + { + "epoch": 0.1526713791860833, + "grad_norm": 0.068359375, + "learning_rate": 0.0029663403408375813, + "loss": 1.1983, + "step": 1741 + }, + { + "epoch": 0.15275907096045785, + "grad_norm": 0.09765625, + "learning_rate": 0.002966249810954561, + "loss": 1.2052, + "step": 1742 + }, + { + "epoch": 0.15284676273483241, + "grad_norm": 0.06884765625, + "learning_rate": 0.002966159161032957, + "loss": 1.2302, + "step": 1743 + }, + { + "epoch": 0.15293445450920695, + "grad_norm": 0.123046875, + "learning_rate": 0.002966068391081035, + "loss": 1.1733, + "step": 1744 + }, + { + "epoch": 0.1530221462835815, + "grad_norm": 0.0654296875, + "learning_rate": 0.002965977501107073, + "loss": 1.1805, + "step": 1745 + }, + { + "epoch": 0.15310983805795605, + "grad_norm": 0.08544921875, + "learning_rate": 0.0029658864911193596, + "loss": 1.2527, + "step": 1746 + }, + { + "epoch": 0.1531975298323306, + "grad_norm": 0.061767578125, + "learning_rate": 0.0029657953611261956, + "loss": 1.2283, + "step": 1747 + }, + { + "epoch": 0.15328522160670513, + "grad_norm": 0.09375, + "learning_rate": 0.0029657041111358917, + "loss": 1.2021, + "step": 1748 + }, + { + "epoch": 0.15337291338107967, + "grad_norm": 0.0625, + "learning_rate": 0.0029656127411567686, + "loss": 1.2102, + "step": 1749 + }, + { + "epoch": 0.15346060515545423, + "grad_norm": 0.08447265625, + "learning_rate": 0.0029655212511971603, + "loss": 1.2241, + "step": 1750 + }, + { + "epoch": 0.15354829692982877, + "grad_norm": 0.09765625, + "learning_rate": 0.0029654296412654096, + "loss": 1.2334, + "step": 1751 + }, + { + "epoch": 0.1536359887042033, + "grad_norm": 0.0791015625, + "learning_rate": 0.0029653379113698717, + "loss": 1.2512, + "step": 1752 + }, + { + "epoch": 0.15372368047857785, + "grad_norm": 0.052490234375, + "learning_rate": 0.0029652460615189114, + "loss": 1.1526, + "step": 1753 + }, + { + "epoch": 0.1538113722529524, + "grad_norm": 0.059326171875, + "learning_rate": 0.002965154091720906, + "loss": 1.2399, + "step": 1754 + }, + { + "epoch": 0.15389906402732695, + "grad_norm": 0.07861328125, + "learning_rate": 0.002965062001984242, + "loss": 1.2404, + "step": 1755 + }, + { + "epoch": 0.15398675580170149, + "grad_norm": 0.06884765625, + "learning_rate": 0.0029649697923173185, + "loss": 1.2426, + "step": 1756 + }, + { + "epoch": 0.15407444757607605, + "grad_norm": 0.076171875, + "learning_rate": 0.0029648774627285446, + "loss": 1.2045, + "step": 1757 + }, + { + "epoch": 0.1541621393504506, + "grad_norm": 0.107421875, + "learning_rate": 0.0029647850132263403, + "loss": 1.3156, + "step": 1758 + }, + { + "epoch": 0.15424983112482513, + "grad_norm": 0.142578125, + "learning_rate": 0.002964692443819137, + "loss": 1.2339, + "step": 1759 + }, + { + "epoch": 0.15433752289919966, + "grad_norm": 0.08740234375, + "learning_rate": 0.002964599754515377, + "loss": 1.2263, + "step": 1760 + }, + { + "epoch": 0.15442521467357423, + "grad_norm": 0.1484375, + "learning_rate": 0.002964506945323512, + "loss": 1.2452, + "step": 1761 + }, + { + "epoch": 0.15451290644794877, + "grad_norm": 0.08203125, + "learning_rate": 0.002964414016252008, + "loss": 1.2067, + "step": 1762 + }, + { + "epoch": 0.1546005982223233, + "grad_norm": 0.142578125, + "learning_rate": 0.0029643209673093384, + "loss": 1.2505, + "step": 1763 + }, + { + "epoch": 0.15468828999669787, + "grad_norm": 0.1572265625, + "learning_rate": 0.0029642277985039893, + "loss": 1.3219, + "step": 1764 + }, + { + "epoch": 0.1547759817710724, + "grad_norm": 0.09375, + "learning_rate": 0.0029641345098444585, + "loss": 1.2168, + "step": 1765 + }, + { + "epoch": 0.15486367354544694, + "grad_norm": 0.2060546875, + "learning_rate": 0.002964041101339252, + "loss": 1.231, + "step": 1766 + }, + { + "epoch": 0.15495136531982148, + "grad_norm": 0.07763671875, + "learning_rate": 0.0029639475729968893, + "loss": 1.19, + "step": 1767 + }, + { + "epoch": 0.15503905709419605, + "grad_norm": 0.2265625, + "learning_rate": 0.0029638539248259, + "loss": 1.2543, + "step": 1768 + }, + { + "epoch": 0.15512674886857059, + "grad_norm": 0.06396484375, + "learning_rate": 0.002963760156834825, + "loss": 1.2207, + "step": 1769 + }, + { + "epoch": 0.15521444064294512, + "grad_norm": 0.154296875, + "learning_rate": 0.0029636662690322143, + "loss": 1.2478, + "step": 1770 + }, + { + "epoch": 0.15530213241731966, + "grad_norm": 0.07421875, + "learning_rate": 0.0029635722614266315, + "loss": 1.2565, + "step": 1771 + }, + { + "epoch": 0.15538982419169423, + "grad_norm": 0.1572265625, + "learning_rate": 0.002963478134026649, + "loss": 1.2674, + "step": 1772 + }, + { + "epoch": 0.15547751596606876, + "grad_norm": 0.0849609375, + "learning_rate": 0.002963383886840852, + "loss": 1.2259, + "step": 1773 + }, + { + "epoch": 0.1555652077404433, + "grad_norm": 0.1396484375, + "learning_rate": 0.002963289519877835, + "loss": 1.2103, + "step": 1774 + }, + { + "epoch": 0.15565289951481787, + "grad_norm": 0.10986328125, + "learning_rate": 0.0029631950331462037, + "loss": 1.2993, + "step": 1775 + }, + { + "epoch": 0.1557405912891924, + "grad_norm": 0.11328125, + "learning_rate": 0.0029631004266545756, + "loss": 1.2323, + "step": 1776 + }, + { + "epoch": 0.15582828306356694, + "grad_norm": 0.1318359375, + "learning_rate": 0.002963005700411578, + "loss": 1.2281, + "step": 1777 + }, + { + "epoch": 0.15591597483794148, + "grad_norm": 0.0966796875, + "learning_rate": 0.00296291085442585, + "loss": 1.2804, + "step": 1778 + }, + { + "epoch": 0.15600366661231604, + "grad_norm": 0.1220703125, + "learning_rate": 0.0029628158887060416, + "loss": 1.1703, + "step": 1779 + }, + { + "epoch": 0.15609135838669058, + "grad_norm": 0.0703125, + "learning_rate": 0.002962720803260813, + "loss": 1.1887, + "step": 1780 + }, + { + "epoch": 0.15617905016106512, + "grad_norm": 0.0576171875, + "learning_rate": 0.0029626255980988365, + "loss": 1.2142, + "step": 1781 + }, + { + "epoch": 0.15626674193543968, + "grad_norm": 0.0732421875, + "learning_rate": 0.0029625302732287934, + "loss": 1.1763, + "step": 1782 + }, + { + "epoch": 0.15635443370981422, + "grad_norm": 0.0625, + "learning_rate": 0.0029624348286593776, + "loss": 1.2937, + "step": 1783 + }, + { + "epoch": 0.15644212548418876, + "grad_norm": 0.0732421875, + "learning_rate": 0.0029623392643992937, + "loss": 1.2644, + "step": 1784 + }, + { + "epoch": 0.1565298172585633, + "grad_norm": 0.0986328125, + "learning_rate": 0.0029622435804572563, + "loss": 1.2322, + "step": 1785 + }, + { + "epoch": 0.15661750903293786, + "grad_norm": 0.072265625, + "learning_rate": 0.0029621477768419927, + "loss": 1.2369, + "step": 1786 + }, + { + "epoch": 0.1567052008073124, + "grad_norm": 0.0791015625, + "learning_rate": 0.002962051853562238, + "loss": 1.1507, + "step": 1787 + }, + { + "epoch": 0.15679289258168694, + "grad_norm": 0.1064453125, + "learning_rate": 0.0029619558106267424, + "loss": 1.2517, + "step": 1788 + }, + { + "epoch": 0.15688058435606148, + "grad_norm": 0.1201171875, + "learning_rate": 0.0029618596480442635, + "loss": 1.2031, + "step": 1789 + }, + { + "epoch": 0.15696827613043604, + "grad_norm": 0.13671875, + "learning_rate": 0.0029617633658235707, + "loss": 1.3067, + "step": 1790 + }, + { + "epoch": 0.15705596790481058, + "grad_norm": 0.1767578125, + "learning_rate": 0.0029616669639734457, + "loss": 1.2568, + "step": 1791 + }, + { + "epoch": 0.15714365967918512, + "grad_norm": 0.10009765625, + "learning_rate": 0.002961570442502679, + "loss": 1.2221, + "step": 1792 + }, + { + "epoch": 0.15723135145355968, + "grad_norm": 0.08251953125, + "learning_rate": 0.0029614738014200745, + "loss": 1.3026, + "step": 1793 + }, + { + "epoch": 0.15731904322793422, + "grad_norm": 0.1103515625, + "learning_rate": 0.0029613770407344447, + "loss": 1.1919, + "step": 1794 + }, + { + "epoch": 0.15740673500230876, + "grad_norm": 0.0634765625, + "learning_rate": 0.002961280160454614, + "loss": 1.222, + "step": 1795 + }, + { + "epoch": 0.1574944267766833, + "grad_norm": 0.0908203125, + "learning_rate": 0.0029611831605894172, + "loss": 1.1845, + "step": 1796 + }, + { + "epoch": 0.15758211855105786, + "grad_norm": 0.064453125, + "learning_rate": 0.0029610860411477015, + "loss": 1.3043, + "step": 1797 + }, + { + "epoch": 0.1576698103254324, + "grad_norm": 0.062255859375, + "learning_rate": 0.0029609888021383235, + "loss": 1.2524, + "step": 1798 + }, + { + "epoch": 0.15775750209980693, + "grad_norm": 0.059326171875, + "learning_rate": 0.002960891443570151, + "loss": 1.2282, + "step": 1799 + }, + { + "epoch": 0.1578451938741815, + "grad_norm": 0.0634765625, + "learning_rate": 0.0029607939654520627, + "loss": 1.2411, + "step": 1800 + }, + { + "epoch": 0.15793288564855604, + "grad_norm": 0.07275390625, + "learning_rate": 0.0029606963677929485, + "loss": 1.23, + "step": 1801 + }, + { + "epoch": 0.15802057742293057, + "grad_norm": 0.06396484375, + "learning_rate": 0.0029605986506017093, + "loss": 1.2296, + "step": 1802 + }, + { + "epoch": 0.1581082691973051, + "grad_norm": 0.11572265625, + "learning_rate": 0.0029605008138872562, + "loss": 1.151, + "step": 1803 + }, + { + "epoch": 0.15819596097167968, + "grad_norm": 0.06298828125, + "learning_rate": 0.0029604028576585124, + "loss": 1.1825, + "step": 1804 + }, + { + "epoch": 0.15828365274605422, + "grad_norm": 0.1513671875, + "learning_rate": 0.00296030478192441, + "loss": 1.251, + "step": 1805 + }, + { + "epoch": 0.15837134452042875, + "grad_norm": 0.0908203125, + "learning_rate": 0.002960206586693895, + "loss": 1.1839, + "step": 1806 + }, + { + "epoch": 0.1584590362948033, + "grad_norm": 0.08984375, + "learning_rate": 0.002960108271975921, + "loss": 1.2336, + "step": 1807 + }, + { + "epoch": 0.15854672806917786, + "grad_norm": 0.06884765625, + "learning_rate": 0.0029600098377794543, + "loss": 1.2594, + "step": 1808 + }, + { + "epoch": 0.1586344198435524, + "grad_norm": 0.07177734375, + "learning_rate": 0.0029599112841134723, + "loss": 1.2354, + "step": 1809 + }, + { + "epoch": 0.15872211161792693, + "grad_norm": 0.07373046875, + "learning_rate": 0.0029598126109869633, + "loss": 1.2487, + "step": 1810 + }, + { + "epoch": 0.1588098033923015, + "grad_norm": 0.08251953125, + "learning_rate": 0.002959713818408925, + "loss": 1.2244, + "step": 1811 + }, + { + "epoch": 0.15889749516667603, + "grad_norm": 0.1298828125, + "learning_rate": 0.0029596149063883677, + "loss": 1.1701, + "step": 1812 + }, + { + "epoch": 0.15898518694105057, + "grad_norm": 0.119140625, + "learning_rate": 0.0029595158749343114, + "loss": 1.2335, + "step": 1813 + }, + { + "epoch": 0.1590728787154251, + "grad_norm": 0.1005859375, + "learning_rate": 0.0029594167240557883, + "loss": 1.2393, + "step": 1814 + }, + { + "epoch": 0.15916057048979967, + "grad_norm": 0.1025390625, + "learning_rate": 0.0029593174537618392, + "loss": 1.244, + "step": 1815 + }, + { + "epoch": 0.1592482622641742, + "grad_norm": 0.07080078125, + "learning_rate": 0.0029592180640615195, + "loss": 1.2361, + "step": 1816 + }, + { + "epoch": 0.15933595403854875, + "grad_norm": 0.11181640625, + "learning_rate": 0.0029591185549638914, + "loss": 1.2492, + "step": 1817 + }, + { + "epoch": 0.15942364581292331, + "grad_norm": 0.087890625, + "learning_rate": 0.002959018926478031, + "loss": 1.2262, + "step": 1818 + }, + { + "epoch": 0.15951133758729785, + "grad_norm": 0.07275390625, + "learning_rate": 0.002958919178613023, + "loss": 1.2143, + "step": 1819 + }, + { + "epoch": 0.1595990293616724, + "grad_norm": 0.059326171875, + "learning_rate": 0.002958819311377965, + "loss": 1.2711, + "step": 1820 + }, + { + "epoch": 0.15968672113604693, + "grad_norm": 0.099609375, + "learning_rate": 0.002958719324781965, + "loss": 1.2125, + "step": 1821 + }, + { + "epoch": 0.1597744129104215, + "grad_norm": 0.08935546875, + "learning_rate": 0.002958619218834141, + "loss": 1.1969, + "step": 1822 + }, + { + "epoch": 0.15986210468479603, + "grad_norm": 0.0732421875, + "learning_rate": 0.002958518993543622, + "loss": 1.2653, + "step": 1823 + }, + { + "epoch": 0.15994979645917057, + "grad_norm": 0.06640625, + "learning_rate": 0.002958418648919549, + "loss": 1.293, + "step": 1824 + }, + { + "epoch": 0.1600374882335451, + "grad_norm": 0.07080078125, + "learning_rate": 0.0029583181849710733, + "loss": 1.2418, + "step": 1825 + }, + { + "epoch": 0.16012518000791967, + "grad_norm": 0.0654296875, + "learning_rate": 0.0029582176017073558, + "loss": 1.2498, + "step": 1826 + }, + { + "epoch": 0.1602128717822942, + "grad_norm": 0.07275390625, + "learning_rate": 0.00295811689913757, + "loss": 1.1711, + "step": 1827 + }, + { + "epoch": 0.16030056355666875, + "grad_norm": 0.068359375, + "learning_rate": 0.002958016077270901, + "loss": 1.2305, + "step": 1828 + }, + { + "epoch": 0.1603882553310433, + "grad_norm": 0.0595703125, + "learning_rate": 0.0029579151361165414, + "loss": 1.262, + "step": 1829 + }, + { + "epoch": 0.16047594710541785, + "grad_norm": 0.07958984375, + "learning_rate": 0.0029578140756836985, + "loss": 1.147, + "step": 1830 + }, + { + "epoch": 0.1605636388797924, + "grad_norm": 0.064453125, + "learning_rate": 0.0029577128959815875, + "loss": 1.2276, + "step": 1831 + }, + { + "epoch": 0.16065133065416692, + "grad_norm": 0.07763671875, + "learning_rate": 0.0029576115970194362, + "loss": 1.2673, + "step": 1832 + }, + { + "epoch": 0.1607390224285415, + "grad_norm": 0.0654296875, + "learning_rate": 0.0029575101788064826, + "loss": 1.2303, + "step": 1833 + }, + { + "epoch": 0.16082671420291603, + "grad_norm": 0.06396484375, + "learning_rate": 0.0029574086413519766, + "loss": 1.2291, + "step": 1834 + }, + { + "epoch": 0.16091440597729056, + "grad_norm": 0.06689453125, + "learning_rate": 0.0029573069846651773, + "loss": 1.2494, + "step": 1835 + }, + { + "epoch": 0.16100209775166513, + "grad_norm": 0.0615234375, + "learning_rate": 0.002957205208755356, + "loss": 1.1361, + "step": 1836 + }, + { + "epoch": 0.16108978952603967, + "grad_norm": 0.0673828125, + "learning_rate": 0.0029571033136317937, + "loss": 1.2498, + "step": 1837 + }, + { + "epoch": 0.1611774813004142, + "grad_norm": 0.09130859375, + "learning_rate": 0.002957001299303784, + "loss": 1.1754, + "step": 1838 + }, + { + "epoch": 0.16126517307478874, + "grad_norm": 0.1064453125, + "learning_rate": 0.0029568991657806295, + "loss": 1.2336, + "step": 1839 + }, + { + "epoch": 0.1613528648491633, + "grad_norm": 0.111328125, + "learning_rate": 0.002956796913071645, + "loss": 1.21, + "step": 1840 + }, + { + "epoch": 0.16144055662353785, + "grad_norm": 0.060302734375, + "learning_rate": 0.002956694541186155, + "loss": 1.1894, + "step": 1841 + }, + { + "epoch": 0.16152824839791238, + "grad_norm": 0.10888671875, + "learning_rate": 0.002956592050133497, + "loss": 1.214, + "step": 1842 + }, + { + "epoch": 0.16161594017228692, + "grad_norm": 0.0693359375, + "learning_rate": 0.0029564894399230165, + "loss": 1.2265, + "step": 1843 + }, + { + "epoch": 0.16170363194666149, + "grad_norm": 0.07421875, + "learning_rate": 0.0029563867105640716, + "loss": 1.2602, + "step": 1844 + }, + { + "epoch": 0.16179132372103602, + "grad_norm": 0.08544921875, + "learning_rate": 0.002956283862066031, + "loss": 1.2213, + "step": 1845 + }, + { + "epoch": 0.16187901549541056, + "grad_norm": 0.09423828125, + "learning_rate": 0.0029561808944382744, + "loss": 1.1755, + "step": 1846 + }, + { + "epoch": 0.16196670726978513, + "grad_norm": 0.076171875, + "learning_rate": 0.0029560778076901926, + "loss": 1.2572, + "step": 1847 + }, + { + "epoch": 0.16205439904415966, + "grad_norm": 0.06640625, + "learning_rate": 0.0029559746018311857, + "loss": 1.2332, + "step": 1848 + }, + { + "epoch": 0.1621420908185342, + "grad_norm": 0.06689453125, + "learning_rate": 0.002955871276870667, + "loss": 1.2821, + "step": 1849 + }, + { + "epoch": 0.16222978259290874, + "grad_norm": 0.06982421875, + "learning_rate": 0.0029557678328180586, + "loss": 1.2363, + "step": 1850 + }, + { + "epoch": 0.1623174743672833, + "grad_norm": 0.057373046875, + "learning_rate": 0.002955664269682795, + "loss": 1.2524, + "step": 1851 + }, + { + "epoch": 0.16240516614165784, + "grad_norm": 0.0810546875, + "learning_rate": 0.0029555605874743204, + "loss": 1.1926, + "step": 1852 + }, + { + "epoch": 0.16249285791603238, + "grad_norm": 0.0673828125, + "learning_rate": 0.0029554567862020904, + "loss": 1.2426, + "step": 1853 + }, + { + "epoch": 0.16258054969040694, + "grad_norm": 0.1396484375, + "learning_rate": 0.002955352865875572, + "loss": 1.2712, + "step": 1854 + }, + { + "epoch": 0.16266824146478148, + "grad_norm": 0.08349609375, + "learning_rate": 0.002955248826504241, + "loss": 1.3041, + "step": 1855 + }, + { + "epoch": 0.16275593323915602, + "grad_norm": 0.146484375, + "learning_rate": 0.002955144668097588, + "loss": 1.1817, + "step": 1856 + }, + { + "epoch": 0.16284362501353056, + "grad_norm": 0.1064453125, + "learning_rate": 0.0029550403906651087, + "loss": 1.1873, + "step": 1857 + }, + { + "epoch": 0.16293131678790512, + "grad_norm": 0.12109375, + "learning_rate": 0.0029549359942163157, + "loss": 1.2236, + "step": 1858 + }, + { + "epoch": 0.16301900856227966, + "grad_norm": 0.12890625, + "learning_rate": 0.0029548314787607288, + "loss": 1.2345, + "step": 1859 + }, + { + "epoch": 0.1631067003366542, + "grad_norm": 0.0888671875, + "learning_rate": 0.002954726844307879, + "loss": 1.2695, + "step": 1860 + }, + { + "epoch": 0.16319439211102874, + "grad_norm": 0.1337890625, + "learning_rate": 0.0029546220908673094, + "loss": 1.2535, + "step": 1861 + }, + { + "epoch": 0.1632820838854033, + "grad_norm": 0.060546875, + "learning_rate": 0.0029545172184485733, + "loss": 1.1989, + "step": 1862 + }, + { + "epoch": 0.16336977565977784, + "grad_norm": 0.07470703125, + "learning_rate": 0.002954412227061234, + "loss": 1.1743, + "step": 1863 + }, + { + "epoch": 0.16345746743415238, + "grad_norm": 0.08447265625, + "learning_rate": 0.0029543071167148667, + "loss": 1.1973, + "step": 1864 + }, + { + "epoch": 0.16354515920852694, + "grad_norm": 0.0654296875, + "learning_rate": 0.002954201887419058, + "loss": 1.1789, + "step": 1865 + }, + { + "epoch": 0.16363285098290148, + "grad_norm": 0.078125, + "learning_rate": 0.002954096539183404, + "loss": 1.2212, + "step": 1866 + }, + { + "epoch": 0.16372054275727602, + "grad_norm": 0.0673828125, + "learning_rate": 0.0029539910720175113, + "loss": 1.2066, + "step": 1867 + }, + { + "epoch": 0.16380823453165055, + "grad_norm": 0.06787109375, + "learning_rate": 0.0029538854859309997, + "loss": 1.2717, + "step": 1868 + }, + { + "epoch": 0.16389592630602512, + "grad_norm": 0.06494140625, + "learning_rate": 0.0029537797809334977, + "loss": 1.1794, + "step": 1869 + }, + { + "epoch": 0.16398361808039966, + "grad_norm": 0.07177734375, + "learning_rate": 0.002953673957034645, + "loss": 1.2161, + "step": 1870 + }, + { + "epoch": 0.1640713098547742, + "grad_norm": 0.06298828125, + "learning_rate": 0.0029535680142440935, + "loss": 1.229, + "step": 1871 + }, + { + "epoch": 0.16415900162914873, + "grad_norm": 0.0771484375, + "learning_rate": 0.0029534619525715036, + "loss": 1.2013, + "step": 1872 + }, + { + "epoch": 0.1642466934035233, + "grad_norm": 0.0888671875, + "learning_rate": 0.0029533557720265494, + "loss": 1.2405, + "step": 1873 + }, + { + "epoch": 0.16433438517789783, + "grad_norm": 0.09228515625, + "learning_rate": 0.0029532494726189126, + "loss": 1.2009, + "step": 1874 + }, + { + "epoch": 0.16442207695227237, + "grad_norm": 0.0693359375, + "learning_rate": 0.002953143054358288, + "loss": 1.1971, + "step": 1875 + }, + { + "epoch": 0.16450976872664694, + "grad_norm": 0.05810546875, + "learning_rate": 0.0029530365172543817, + "loss": 1.1584, + "step": 1876 + }, + { + "epoch": 0.16459746050102148, + "grad_norm": 0.057861328125, + "learning_rate": 0.002952929861316908, + "loss": 1.2321, + "step": 1877 + }, + { + "epoch": 0.164685152275396, + "grad_norm": 0.091796875, + "learning_rate": 0.002952823086555595, + "loss": 1.235, + "step": 1878 + }, + { + "epoch": 0.16477284404977055, + "grad_norm": 0.057861328125, + "learning_rate": 0.0029527161929801798, + "loss": 1.2132, + "step": 1879 + }, + { + "epoch": 0.16486053582414512, + "grad_norm": 0.12255859375, + "learning_rate": 0.00295260918060041, + "loss": 1.2056, + "step": 1880 + }, + { + "epoch": 0.16494822759851965, + "grad_norm": 0.068359375, + "learning_rate": 0.0029525020494260462, + "loss": 1.2095, + "step": 1881 + }, + { + "epoch": 0.1650359193728942, + "grad_norm": 0.1005859375, + "learning_rate": 0.0029523947994668583, + "loss": 1.2664, + "step": 1882 + }, + { + "epoch": 0.16512361114726876, + "grad_norm": 0.07666015625, + "learning_rate": 0.0029522874307326263, + "loss": 1.2446, + "step": 1883 + }, + { + "epoch": 0.1652113029216433, + "grad_norm": 0.19140625, + "learning_rate": 0.002952179943233142, + "loss": 1.2069, + "step": 1884 + }, + { + "epoch": 0.16529899469601783, + "grad_norm": 0.05908203125, + "learning_rate": 0.002952072336978209, + "loss": 1.2048, + "step": 1885 + }, + { + "epoch": 0.16538668647039237, + "grad_norm": 0.19921875, + "learning_rate": 0.00295196461197764, + "loss": 1.2081, + "step": 1886 + }, + { + "epoch": 0.16547437824476693, + "grad_norm": 0.0634765625, + "learning_rate": 0.0029518567682412593, + "loss": 1.1748, + "step": 1887 + }, + { + "epoch": 0.16556207001914147, + "grad_norm": 0.1806640625, + "learning_rate": 0.0029517488057789025, + "loss": 1.1854, + "step": 1888 + }, + { + "epoch": 0.165649761793516, + "grad_norm": 0.064453125, + "learning_rate": 0.002951640724600414, + "loss": 1.2574, + "step": 1889 + }, + { + "epoch": 0.16573745356789055, + "grad_norm": 0.1279296875, + "learning_rate": 0.0029515325247156526, + "loss": 1.261, + "step": 1890 + }, + { + "epoch": 0.1658251453422651, + "grad_norm": 0.08544921875, + "learning_rate": 0.0029514242061344847, + "loss": 1.245, + "step": 1891 + }, + { + "epoch": 0.16591283711663965, + "grad_norm": 0.08642578125, + "learning_rate": 0.0029513157688667883, + "loss": 1.2528, + "step": 1892 + }, + { + "epoch": 0.1660005288910142, + "grad_norm": 0.0693359375, + "learning_rate": 0.002951207212922454, + "loss": 1.1879, + "step": 1893 + }, + { + "epoch": 0.16608822066538875, + "grad_norm": 0.06689453125, + "learning_rate": 0.00295109853831138, + "loss": 1.2397, + "step": 1894 + }, + { + "epoch": 0.1661759124397633, + "grad_norm": 0.0810546875, + "learning_rate": 0.0029509897450434785, + "loss": 1.2009, + "step": 1895 + }, + { + "epoch": 0.16626360421413783, + "grad_norm": 0.1044921875, + "learning_rate": 0.002950880833128671, + "loss": 1.2706, + "step": 1896 + }, + { + "epoch": 0.16635129598851237, + "grad_norm": 0.08642578125, + "learning_rate": 0.0029507718025768895, + "loss": 1.2773, + "step": 1897 + }, + { + "epoch": 0.16643898776288693, + "grad_norm": 0.10693359375, + "learning_rate": 0.0029506626533980776, + "loss": 1.2068, + "step": 1898 + }, + { + "epoch": 0.16652667953726147, + "grad_norm": 0.0810546875, + "learning_rate": 0.002950553385602189, + "loss": 1.2264, + "step": 1899 + }, + { + "epoch": 0.166614371311636, + "grad_norm": 0.07470703125, + "learning_rate": 0.002950443999199189, + "loss": 1.2045, + "step": 1900 + }, + { + "epoch": 0.16670206308601057, + "grad_norm": 0.08203125, + "learning_rate": 0.002950334494199054, + "loss": 1.2409, + "step": 1901 + }, + { + "epoch": 0.1667897548603851, + "grad_norm": 0.07470703125, + "learning_rate": 0.0029502248706117693, + "loss": 1.2565, + "step": 1902 + }, + { + "epoch": 0.16687744663475965, + "grad_norm": 0.138671875, + "learning_rate": 0.002950115128447333, + "loss": 1.3078, + "step": 1903 + }, + { + "epoch": 0.16696513840913418, + "grad_norm": 0.06298828125, + "learning_rate": 0.0029500052677157535, + "loss": 1.1882, + "step": 1904 + }, + { + "epoch": 0.16705283018350875, + "grad_norm": 0.1611328125, + "learning_rate": 0.0029498952884270493, + "loss": 1.2154, + "step": 1905 + }, + { + "epoch": 0.1671405219578833, + "grad_norm": 0.07275390625, + "learning_rate": 0.00294978519059125, + "loss": 1.2375, + "step": 1906 + }, + { + "epoch": 0.16722821373225782, + "grad_norm": 0.083984375, + "learning_rate": 0.0029496749742183976, + "loss": 1.2393, + "step": 1907 + }, + { + "epoch": 0.16731590550663236, + "grad_norm": 0.07470703125, + "learning_rate": 0.002949564639318542, + "loss": 1.2693, + "step": 1908 + }, + { + "epoch": 0.16740359728100693, + "grad_norm": 0.06982421875, + "learning_rate": 0.0029494541859017465, + "loss": 1.2, + "step": 1909 + }, + { + "epoch": 0.16749128905538146, + "grad_norm": 0.09716796875, + "learning_rate": 0.0029493436139780838, + "loss": 1.2399, + "step": 1910 + }, + { + "epoch": 0.167578980829756, + "grad_norm": 0.05712890625, + "learning_rate": 0.0029492329235576375, + "loss": 1.2153, + "step": 1911 + }, + { + "epoch": 0.16766667260413057, + "grad_norm": 0.09716796875, + "learning_rate": 0.0029491221146505024, + "loss": 1.176, + "step": 1912 + }, + { + "epoch": 0.1677543643785051, + "grad_norm": 0.058349609375, + "learning_rate": 0.0029490111872667846, + "loss": 1.1792, + "step": 1913 + }, + { + "epoch": 0.16784205615287964, + "grad_norm": 0.083984375, + "learning_rate": 0.0029489001414165997, + "loss": 1.2286, + "step": 1914 + }, + { + "epoch": 0.16792974792725418, + "grad_norm": 0.0625, + "learning_rate": 0.0029487889771100746, + "loss": 1.1379, + "step": 1915 + }, + { + "epoch": 0.16801743970162875, + "grad_norm": 0.0673828125, + "learning_rate": 0.002948677694357348, + "loss": 1.1805, + "step": 1916 + }, + { + "epoch": 0.16810513147600328, + "grad_norm": 0.062255859375, + "learning_rate": 0.002948566293168568, + "loss": 1.1697, + "step": 1917 + }, + { + "epoch": 0.16819282325037782, + "grad_norm": 0.06298828125, + "learning_rate": 0.0029484547735538946, + "loss": 1.2461, + "step": 1918 + }, + { + "epoch": 0.1682805150247524, + "grad_norm": 0.06298828125, + "learning_rate": 0.0029483431355234973, + "loss": 1.229, + "step": 1919 + }, + { + "epoch": 0.16836820679912692, + "grad_norm": 0.07373046875, + "learning_rate": 0.002948231379087558, + "loss": 1.2439, + "step": 1920 + }, + { + "epoch": 0.16845589857350146, + "grad_norm": 0.0625, + "learning_rate": 0.0029481195042562686, + "loss": 1.2012, + "step": 1921 + }, + { + "epoch": 0.168543590347876, + "grad_norm": 0.06494140625, + "learning_rate": 0.002948007511039831, + "loss": 1.2381, + "step": 1922 + }, + { + "epoch": 0.16863128212225056, + "grad_norm": 0.06689453125, + "learning_rate": 0.002947895399448459, + "loss": 1.2534, + "step": 1923 + }, + { + "epoch": 0.1687189738966251, + "grad_norm": 0.06005859375, + "learning_rate": 0.0029477831694923774, + "loss": 1.2072, + "step": 1924 + }, + { + "epoch": 0.16880666567099964, + "grad_norm": 0.1572265625, + "learning_rate": 0.0029476708211818212, + "loss": 1.2774, + "step": 1925 + }, + { + "epoch": 0.16889435744537418, + "grad_norm": 0.09228515625, + "learning_rate": 0.002947558354527036, + "loss": 1.2066, + "step": 1926 + }, + { + "epoch": 0.16898204921974874, + "grad_norm": 0.14453125, + "learning_rate": 0.002947445769538278, + "loss": 1.2122, + "step": 1927 + }, + { + "epoch": 0.16906974099412328, + "grad_norm": 0.10400390625, + "learning_rate": 0.0029473330662258155, + "loss": 1.2672, + "step": 1928 + }, + { + "epoch": 0.16915743276849782, + "grad_norm": 0.0810546875, + "learning_rate": 0.002947220244599926, + "loss": 1.195, + "step": 1929 + }, + { + "epoch": 0.16924512454287238, + "grad_norm": 0.08154296875, + "learning_rate": 0.0029471073046709, + "loss": 1.1915, + "step": 1930 + }, + { + "epoch": 0.16933281631724692, + "grad_norm": 0.07080078125, + "learning_rate": 0.0029469942464490353, + "loss": 1.2258, + "step": 1931 + }, + { + "epoch": 0.16942050809162146, + "grad_norm": 0.07080078125, + "learning_rate": 0.002946881069944644, + "loss": 1.2318, + "step": 1932 + }, + { + "epoch": 0.169508199865996, + "grad_norm": 0.09765625, + "learning_rate": 0.002946767775168047, + "loss": 1.2427, + "step": 1933 + }, + { + "epoch": 0.16959589164037056, + "grad_norm": 0.0625, + "learning_rate": 0.002946654362129576, + "loss": 1.1934, + "step": 1934 + }, + { + "epoch": 0.1696835834147451, + "grad_norm": 0.1123046875, + "learning_rate": 0.0029465408308395748, + "loss": 1.2515, + "step": 1935 + }, + { + "epoch": 0.16977127518911964, + "grad_norm": 0.0810546875, + "learning_rate": 0.002946427181308397, + "loss": 1.2293, + "step": 1936 + }, + { + "epoch": 0.1698589669634942, + "grad_norm": 0.1005859375, + "learning_rate": 0.0029463134135464066, + "loss": 1.1644, + "step": 1937 + }, + { + "epoch": 0.16994665873786874, + "grad_norm": 0.10205078125, + "learning_rate": 0.0029461995275639795, + "loss": 1.2246, + "step": 1938 + }, + { + "epoch": 0.17003435051224328, + "grad_norm": 0.0888671875, + "learning_rate": 0.0029460855233715013, + "loss": 1.2079, + "step": 1939 + }, + { + "epoch": 0.17012204228661781, + "grad_norm": 0.07421875, + "learning_rate": 0.0029459714009793694, + "loss": 1.2012, + "step": 1940 + }, + { + "epoch": 0.17020973406099238, + "grad_norm": 0.072265625, + "learning_rate": 0.0029458571603979914, + "loss": 1.2467, + "step": 1941 + }, + { + "epoch": 0.17029742583536692, + "grad_norm": 0.078125, + "learning_rate": 0.0029457428016377856, + "loss": 1.2008, + "step": 1942 + }, + { + "epoch": 0.17038511760974145, + "grad_norm": 0.0810546875, + "learning_rate": 0.002945628324709181, + "loss": 1.2972, + "step": 1943 + }, + { + "epoch": 0.170472809384116, + "grad_norm": 0.0771484375, + "learning_rate": 0.0029455137296226183, + "loss": 1.2295, + "step": 1944 + }, + { + "epoch": 0.17056050115849056, + "grad_norm": 0.061279296875, + "learning_rate": 0.0029453990163885472, + "loss": 1.2901, + "step": 1945 + }, + { + "epoch": 0.1706481929328651, + "grad_norm": 0.05712890625, + "learning_rate": 0.00294528418501743, + "loss": 1.2214, + "step": 1946 + }, + { + "epoch": 0.17073588470723963, + "grad_norm": 0.07177734375, + "learning_rate": 0.0029451692355197396, + "loss": 1.2061, + "step": 1947 + }, + { + "epoch": 0.1708235764816142, + "grad_norm": 0.056640625, + "learning_rate": 0.0029450541679059573, + "loss": 1.2308, + "step": 1948 + }, + { + "epoch": 0.17091126825598874, + "grad_norm": 0.0908203125, + "learning_rate": 0.0029449389821865793, + "loss": 1.1401, + "step": 1949 + }, + { + "epoch": 0.17099896003036327, + "grad_norm": 0.08349609375, + "learning_rate": 0.0029448236783721077, + "loss": 1.2164, + "step": 1950 + }, + { + "epoch": 0.1710866518047378, + "grad_norm": 0.072265625, + "learning_rate": 0.00294470825647306, + "loss": 1.3284, + "step": 1951 + }, + { + "epoch": 0.17117434357911238, + "grad_norm": 0.07421875, + "learning_rate": 0.0029445927164999616, + "loss": 1.2917, + "step": 1952 + }, + { + "epoch": 0.1712620353534869, + "grad_norm": 0.057861328125, + "learning_rate": 0.0029444770584633488, + "loss": 1.2137, + "step": 1953 + }, + { + "epoch": 0.17134972712786145, + "grad_norm": 0.068359375, + "learning_rate": 0.0029443612823737706, + "loss": 1.1714, + "step": 1954 + }, + { + "epoch": 0.17143741890223602, + "grad_norm": 0.06689453125, + "learning_rate": 0.0029442453882417845, + "loss": 1.2233, + "step": 1955 + }, + { + "epoch": 0.17152511067661055, + "grad_norm": 0.0703125, + "learning_rate": 0.00294412937607796, + "loss": 1.2183, + "step": 1956 + }, + { + "epoch": 0.1716128024509851, + "grad_norm": 0.1015625, + "learning_rate": 0.002944013245892877, + "loss": 1.2503, + "step": 1957 + }, + { + "epoch": 0.17170049422535963, + "grad_norm": 0.07470703125, + "learning_rate": 0.0029438969976971273, + "loss": 1.2305, + "step": 1958 + }, + { + "epoch": 0.1717881859997342, + "grad_norm": 0.12109375, + "learning_rate": 0.002943780631501311, + "loss": 1.1944, + "step": 1959 + }, + { + "epoch": 0.17187587777410873, + "grad_norm": 0.1064453125, + "learning_rate": 0.002943664147316041, + "loss": 1.247, + "step": 1960 + }, + { + "epoch": 0.17196356954848327, + "grad_norm": 0.083984375, + "learning_rate": 0.0029435475451519404, + "loss": 1.2013, + "step": 1961 + }, + { + "epoch": 0.1720512613228578, + "grad_norm": 0.05908203125, + "learning_rate": 0.002943430825019643, + "loss": 1.3264, + "step": 1962 + }, + { + "epoch": 0.17213895309723237, + "grad_norm": 0.09130859375, + "learning_rate": 0.002943313986929793, + "loss": 1.2173, + "step": 1963 + }, + { + "epoch": 0.1722266448716069, + "grad_norm": 0.0810546875, + "learning_rate": 0.0029431970308930466, + "loss": 1.258, + "step": 1964 + }, + { + "epoch": 0.17231433664598145, + "grad_norm": 0.06103515625, + "learning_rate": 0.0029430799569200687, + "loss": 1.2104, + "step": 1965 + }, + { + "epoch": 0.172402028420356, + "grad_norm": 0.072265625, + "learning_rate": 0.0029429627650215374, + "loss": 1.172, + "step": 1966 + }, + { + "epoch": 0.17248972019473055, + "grad_norm": 0.06103515625, + "learning_rate": 0.0029428454552081393, + "loss": 1.2442, + "step": 1967 + }, + { + "epoch": 0.1725774119691051, + "grad_norm": 0.1279296875, + "learning_rate": 0.002942728027490574, + "loss": 1.1975, + "step": 1968 + }, + { + "epoch": 0.17266510374347963, + "grad_norm": 0.0810546875, + "learning_rate": 0.002942610481879549, + "loss": 1.2699, + "step": 1969 + }, + { + "epoch": 0.1727527955178542, + "grad_norm": 0.10107421875, + "learning_rate": 0.0029424928183857846, + "loss": 1.2762, + "step": 1970 + }, + { + "epoch": 0.17284048729222873, + "grad_norm": 0.06298828125, + "learning_rate": 0.002942375037020012, + "loss": 1.2617, + "step": 1971 + }, + { + "epoch": 0.17292817906660327, + "grad_norm": 0.0751953125, + "learning_rate": 0.0029422571377929726, + "loss": 1.2496, + "step": 1972 + }, + { + "epoch": 0.17301587084097783, + "grad_norm": 0.0634765625, + "learning_rate": 0.0029421391207154184, + "loss": 1.1984, + "step": 1973 + }, + { + "epoch": 0.17310356261535237, + "grad_norm": 0.060302734375, + "learning_rate": 0.0029420209857981116, + "loss": 1.2159, + "step": 1974 + }, + { + "epoch": 0.1731912543897269, + "grad_norm": 0.061767578125, + "learning_rate": 0.0029419027330518255, + "loss": 1.1537, + "step": 1975 + }, + { + "epoch": 0.17327894616410144, + "grad_norm": 0.099609375, + "learning_rate": 0.002941784362487346, + "loss": 1.2383, + "step": 1976 + }, + { + "epoch": 0.173366637938476, + "grad_norm": 0.0712890625, + "learning_rate": 0.0029416658741154675, + "loss": 1.1781, + "step": 1977 + }, + { + "epoch": 0.17345432971285055, + "grad_norm": 0.0625, + "learning_rate": 0.0029415472679469954, + "loss": 1.2203, + "step": 1978 + }, + { + "epoch": 0.17354202148722508, + "grad_norm": 0.0625, + "learning_rate": 0.0029414285439927465, + "loss": 1.2184, + "step": 1979 + }, + { + "epoch": 0.17362971326159962, + "grad_norm": 0.05908203125, + "learning_rate": 0.0029413097022635486, + "loss": 1.2343, + "step": 1980 + }, + { + "epoch": 0.1737174050359742, + "grad_norm": 0.056884765625, + "learning_rate": 0.002941190742770239, + "loss": 1.2458, + "step": 1981 + }, + { + "epoch": 0.17380509681034872, + "grad_norm": 0.06640625, + "learning_rate": 0.002941071665523667, + "loss": 1.1951, + "step": 1982 + }, + { + "epoch": 0.17389278858472326, + "grad_norm": 0.0771484375, + "learning_rate": 0.0029409524705346917, + "loss": 1.2263, + "step": 1983 + }, + { + "epoch": 0.17398048035909783, + "grad_norm": 0.130859375, + "learning_rate": 0.002940833157814184, + "loss": 1.3174, + "step": 1984 + }, + { + "epoch": 0.17406817213347237, + "grad_norm": 0.08203125, + "learning_rate": 0.0029407137273730244, + "loss": 1.2567, + "step": 1985 + }, + { + "epoch": 0.1741558639078469, + "grad_norm": 0.162109375, + "learning_rate": 0.002940594179222105, + "loss": 1.3004, + "step": 1986 + }, + { + "epoch": 0.17424355568222144, + "grad_norm": 0.057861328125, + "learning_rate": 0.002940474513372328, + "loss": 1.1722, + "step": 1987 + }, + { + "epoch": 0.174331247456596, + "grad_norm": 0.1298828125, + "learning_rate": 0.0029403547298346064, + "loss": 1.264, + "step": 1988 + }, + { + "epoch": 0.17441893923097054, + "grad_norm": 0.10009765625, + "learning_rate": 0.0029402348286198653, + "loss": 1.204, + "step": 1989 + }, + { + "epoch": 0.17450663100534508, + "grad_norm": 0.06591796875, + "learning_rate": 0.002940114809739038, + "loss": 1.1928, + "step": 1990 + }, + { + "epoch": 0.17459432277971965, + "grad_norm": 0.1689453125, + "learning_rate": 0.0029399946732030706, + "loss": 1.2589, + "step": 1991 + }, + { + "epoch": 0.17468201455409418, + "grad_norm": 0.09375, + "learning_rate": 0.0029398744190229187, + "loss": 1.1797, + "step": 1992 + }, + { + "epoch": 0.17476970632846872, + "grad_norm": 0.09033203125, + "learning_rate": 0.00293975404720955, + "loss": 1.247, + "step": 1993 + }, + { + "epoch": 0.17485739810284326, + "grad_norm": 0.0654296875, + "learning_rate": 0.0029396335577739413, + "loss": 1.1877, + "step": 1994 + }, + { + "epoch": 0.17494508987721782, + "grad_norm": 0.11181640625, + "learning_rate": 0.0029395129507270817, + "loss": 1.2879, + "step": 1995 + }, + { + "epoch": 0.17503278165159236, + "grad_norm": 0.07861328125, + "learning_rate": 0.00293939222607997, + "loss": 1.284, + "step": 1996 + }, + { + "epoch": 0.1751204734259669, + "grad_norm": 0.08837890625, + "learning_rate": 0.002939271383843615, + "loss": 1.1872, + "step": 1997 + }, + { + "epoch": 0.17520816520034144, + "grad_norm": 0.09619140625, + "learning_rate": 0.002939150424029039, + "loss": 1.2073, + "step": 1998 + }, + { + "epoch": 0.175295856974716, + "grad_norm": 0.07666015625, + "learning_rate": 0.002939029346647272, + "loss": 1.242, + "step": 1999 + }, + { + "epoch": 0.17538354874909054, + "grad_norm": 0.1103515625, + "learning_rate": 0.0029389081517093556, + "loss": 1.2536, + "step": 2000 + }, + { + "epoch": 0.17538354874909054, + "eval_loss": 1.2374000549316406, + "eval_runtime": 428.5743, + "eval_samples_per_second": 33.709, + "eval_steps_per_second": 8.428, + "step": 2000 + }, + { + "epoch": 0.17547124052346508, + "grad_norm": 0.07763671875, + "learning_rate": 0.002938786839226344, + "loss": 1.1888, + "step": 2001 + }, + { + "epoch": 0.17555893229783964, + "grad_norm": 0.0947265625, + "learning_rate": 0.002938665409209299, + "loss": 1.2685, + "step": 2002 + }, + { + "epoch": 0.17564662407221418, + "grad_norm": 0.07763671875, + "learning_rate": 0.0029385438616692954, + "loss": 1.2854, + "step": 2003 + }, + { + "epoch": 0.17573431584658872, + "grad_norm": 0.083984375, + "learning_rate": 0.002938422196617418, + "loss": 1.2861, + "step": 2004 + }, + { + "epoch": 0.17582200762096326, + "grad_norm": 0.10888671875, + "learning_rate": 0.0029383004140647625, + "loss": 1.2657, + "step": 2005 + }, + { + "epoch": 0.17590969939533782, + "grad_norm": 0.1396484375, + "learning_rate": 0.002938178514022435, + "loss": 1.2393, + "step": 2006 + }, + { + "epoch": 0.17599739116971236, + "grad_norm": 0.0703125, + "learning_rate": 0.002938056496501552, + "loss": 1.2239, + "step": 2007 + }, + { + "epoch": 0.1760850829440869, + "grad_norm": 0.1005859375, + "learning_rate": 0.002937934361513242, + "loss": 1.2971, + "step": 2008 + }, + { + "epoch": 0.17617277471846146, + "grad_norm": 0.0732421875, + "learning_rate": 0.002937812109068643, + "loss": 1.2387, + "step": 2009 + }, + { + "epoch": 0.176260466492836, + "grad_norm": 0.064453125, + "learning_rate": 0.002937689739178904, + "loss": 1.2409, + "step": 2010 + }, + { + "epoch": 0.17634815826721054, + "grad_norm": 0.0830078125, + "learning_rate": 0.002937567251855185, + "loss": 1.2703, + "step": 2011 + }, + { + "epoch": 0.17643585004158507, + "grad_norm": 0.10986328125, + "learning_rate": 0.0029374446471086566, + "loss": 1.1958, + "step": 2012 + }, + { + "epoch": 0.17652354181595964, + "grad_norm": 0.130859375, + "learning_rate": 0.0029373219249504996, + "loss": 1.23, + "step": 2013 + }, + { + "epoch": 0.17661123359033418, + "grad_norm": 0.1181640625, + "learning_rate": 0.002937199085391907, + "loss": 1.2415, + "step": 2014 + }, + { + "epoch": 0.17669892536470871, + "grad_norm": 0.1455078125, + "learning_rate": 0.00293707612844408, + "loss": 1.2129, + "step": 2015 + }, + { + "epoch": 0.17678661713908325, + "grad_norm": 0.1083984375, + "learning_rate": 0.0029369530541182324, + "loss": 1.1714, + "step": 2016 + }, + { + "epoch": 0.17687430891345782, + "grad_norm": 0.138671875, + "learning_rate": 0.0029368298624255895, + "loss": 1.2369, + "step": 2017 + }, + { + "epoch": 0.17696200068783235, + "grad_norm": 0.080078125, + "learning_rate": 0.002936706553377384, + "loss": 1.1888, + "step": 2018 + }, + { + "epoch": 0.1770496924622069, + "grad_norm": 0.12060546875, + "learning_rate": 0.002936583126984863, + "loss": 1.2226, + "step": 2019 + }, + { + "epoch": 0.17713738423658146, + "grad_norm": 0.06787109375, + "learning_rate": 0.002936459583259282, + "loss": 1.264, + "step": 2020 + }, + { + "epoch": 0.177225076010956, + "grad_norm": 0.11279296875, + "learning_rate": 0.002936335922211908, + "loss": 1.2089, + "step": 2021 + }, + { + "epoch": 0.17731276778533053, + "grad_norm": 0.068359375, + "learning_rate": 0.0029362121438540187, + "loss": 1.2172, + "step": 2022 + }, + { + "epoch": 0.17740045955970507, + "grad_norm": 0.1435546875, + "learning_rate": 0.002936088248196902, + "loss": 1.2532, + "step": 2023 + }, + { + "epoch": 0.17748815133407964, + "grad_norm": 0.0673828125, + "learning_rate": 0.002935964235251857, + "loss": 1.2628, + "step": 2024 + }, + { + "epoch": 0.17757584310845417, + "grad_norm": 0.0908203125, + "learning_rate": 0.002935840105030194, + "loss": 1.2052, + "step": 2025 + }, + { + "epoch": 0.1776635348828287, + "grad_norm": 0.06591796875, + "learning_rate": 0.0029357158575432324, + "loss": 1.26, + "step": 2026 + }, + { + "epoch": 0.17775122665720328, + "grad_norm": 0.07080078125, + "learning_rate": 0.0029355914928023037, + "loss": 1.2588, + "step": 2027 + }, + { + "epoch": 0.1778389184315778, + "grad_norm": 0.0595703125, + "learning_rate": 0.0029354670108187497, + "loss": 1.2044, + "step": 2028 + }, + { + "epoch": 0.17792661020595235, + "grad_norm": 0.06298828125, + "learning_rate": 0.0029353424116039225, + "loss": 1.1899, + "step": 2029 + }, + { + "epoch": 0.1780143019803269, + "grad_norm": 0.0615234375, + "learning_rate": 0.0029352176951691856, + "loss": 1.235, + "step": 2030 + }, + { + "epoch": 0.17810199375470145, + "grad_norm": 0.0771484375, + "learning_rate": 0.0029350928615259125, + "loss": 1.2379, + "step": 2031 + }, + { + "epoch": 0.178189685529076, + "grad_norm": 0.0859375, + "learning_rate": 0.002934967910685488, + "loss": 1.3001, + "step": 2032 + }, + { + "epoch": 0.17827737730345053, + "grad_norm": 0.08935546875, + "learning_rate": 0.002934842842659307, + "loss": 1.2333, + "step": 2033 + }, + { + "epoch": 0.17836506907782507, + "grad_norm": 0.0703125, + "learning_rate": 0.002934717657458776, + "loss": 1.2045, + "step": 2034 + }, + { + "epoch": 0.17845276085219963, + "grad_norm": 0.07177734375, + "learning_rate": 0.0029345923550953106, + "loss": 1.1979, + "step": 2035 + }, + { + "epoch": 0.17854045262657417, + "grad_norm": 0.095703125, + "learning_rate": 0.0029344669355803386, + "loss": 1.2464, + "step": 2036 + }, + { + "epoch": 0.1786281444009487, + "grad_norm": 0.057861328125, + "learning_rate": 0.0029343413989252986, + "loss": 1.1283, + "step": 2037 + }, + { + "epoch": 0.17871583617532327, + "grad_norm": 0.06689453125, + "learning_rate": 0.0029342157451416376, + "loss": 1.3007, + "step": 2038 + }, + { + "epoch": 0.1788035279496978, + "grad_norm": 0.0771484375, + "learning_rate": 0.002934089974240817, + "loss": 1.2194, + "step": 2039 + }, + { + "epoch": 0.17889121972407235, + "grad_norm": 0.09326171875, + "learning_rate": 0.0029339640862343047, + "loss": 1.2209, + "step": 2040 + }, + { + "epoch": 0.17897891149844689, + "grad_norm": 0.06591796875, + "learning_rate": 0.0029338380811335823, + "loss": 1.1927, + "step": 2041 + }, + { + "epoch": 0.17906660327282145, + "grad_norm": 0.0859375, + "learning_rate": 0.0029337119589501414, + "loss": 1.2917, + "step": 2042 + }, + { + "epoch": 0.179154295047196, + "grad_norm": 0.125, + "learning_rate": 0.0029335857196954836, + "loss": 1.3238, + "step": 2043 + }, + { + "epoch": 0.17924198682157053, + "grad_norm": 0.0654296875, + "learning_rate": 0.002933459363381122, + "loss": 1.2641, + "step": 2044 + }, + { + "epoch": 0.1793296785959451, + "grad_norm": 0.0908203125, + "learning_rate": 0.0029333328900185795, + "loss": 1.2249, + "step": 2045 + }, + { + "epoch": 0.17941737037031963, + "grad_norm": 0.06396484375, + "learning_rate": 0.00293320629961939, + "loss": 1.2576, + "step": 2046 + }, + { + "epoch": 0.17950506214469417, + "grad_norm": 0.07421875, + "learning_rate": 0.002933079592195099, + "loss": 1.2239, + "step": 2047 + }, + { + "epoch": 0.1795927539190687, + "grad_norm": 0.07080078125, + "learning_rate": 0.0029329527677572614, + "loss": 1.2142, + "step": 2048 + }, + { + "epoch": 0.17968044569344327, + "grad_norm": 0.08154296875, + "learning_rate": 0.0029328258263174433, + "loss": 1.2847, + "step": 2049 + }, + { + "epoch": 0.1797681374678178, + "grad_norm": 0.0654296875, + "learning_rate": 0.002932698767887221, + "loss": 1.338, + "step": 2050 + }, + { + "epoch": 0.17985582924219234, + "grad_norm": 0.06689453125, + "learning_rate": 0.0029325715924781825, + "loss": 1.2185, + "step": 2051 + }, + { + "epoch": 0.17994352101656688, + "grad_norm": 0.07958984375, + "learning_rate": 0.002932444300101926, + "loss": 1.1705, + "step": 2052 + }, + { + "epoch": 0.18003121279094145, + "grad_norm": 0.08642578125, + "learning_rate": 0.0029323168907700596, + "loss": 1.2895, + "step": 2053 + }, + { + "epoch": 0.18011890456531598, + "grad_norm": 0.062255859375, + "learning_rate": 0.0029321893644942035, + "loss": 1.2526, + "step": 2054 + }, + { + "epoch": 0.18020659633969052, + "grad_norm": 0.0712890625, + "learning_rate": 0.0029320617212859872, + "loss": 1.1969, + "step": 2055 + }, + { + "epoch": 0.1802942881140651, + "grad_norm": 0.0732421875, + "learning_rate": 0.002931933961157051, + "loss": 1.1678, + "step": 2056 + }, + { + "epoch": 0.18038197988843963, + "grad_norm": 0.06640625, + "learning_rate": 0.002931806084119048, + "loss": 1.2011, + "step": 2057 + }, + { + "epoch": 0.18046967166281416, + "grad_norm": 0.11279296875, + "learning_rate": 0.0029316780901836375, + "loss": 1.1828, + "step": 2058 + }, + { + "epoch": 0.1805573634371887, + "grad_norm": 0.10498046875, + "learning_rate": 0.002931549979362495, + "loss": 1.2415, + "step": 2059 + }, + { + "epoch": 0.18064505521156327, + "grad_norm": 0.12890625, + "learning_rate": 0.002931421751667302, + "loss": 1.2183, + "step": 2060 + }, + { + "epoch": 0.1807327469859378, + "grad_norm": 0.1318359375, + "learning_rate": 0.002931293407109754, + "loss": 1.2753, + "step": 2061 + }, + { + "epoch": 0.18082043876031234, + "grad_norm": 0.1376953125, + "learning_rate": 0.0029311649457015544, + "loss": 1.3004, + "step": 2062 + }, + { + "epoch": 0.1809081305346869, + "grad_norm": 0.09033203125, + "learning_rate": 0.0029310363674544195, + "loss": 1.1922, + "step": 2063 + }, + { + "epoch": 0.18099582230906144, + "grad_norm": 0.0791015625, + "learning_rate": 0.0029309076723800743, + "loss": 1.2012, + "step": 2064 + }, + { + "epoch": 0.18108351408343598, + "grad_norm": 0.10986328125, + "learning_rate": 0.0029307788604902565, + "loss": 1.2232, + "step": 2065 + }, + { + "epoch": 0.18117120585781052, + "grad_norm": 0.0673828125, + "learning_rate": 0.002930649931796712, + "loss": 1.3227, + "step": 2066 + }, + { + "epoch": 0.18125889763218508, + "grad_norm": 0.1259765625, + "learning_rate": 0.0029305208863112005, + "loss": 1.1751, + "step": 2067 + }, + { + "epoch": 0.18134658940655962, + "grad_norm": 0.07763671875, + "learning_rate": 0.00293039172404549, + "loss": 1.1815, + "step": 2068 + }, + { + "epoch": 0.18143428118093416, + "grad_norm": 0.083984375, + "learning_rate": 0.0029302624450113592, + "loss": 1.2264, + "step": 2069 + }, + { + "epoch": 0.1815219729553087, + "grad_norm": 0.0703125, + "learning_rate": 0.002930133049220598, + "loss": 1.1518, + "step": 2070 + }, + { + "epoch": 0.18160966472968326, + "grad_norm": 0.08447265625, + "learning_rate": 0.002930003536685008, + "loss": 1.217, + "step": 2071 + }, + { + "epoch": 0.1816973565040578, + "grad_norm": 0.06982421875, + "learning_rate": 0.0029298739074164, + "loss": 1.1827, + "step": 2072 + }, + { + "epoch": 0.18178504827843234, + "grad_norm": 0.06494140625, + "learning_rate": 0.002929744161426595, + "loss": 1.2134, + "step": 2073 + }, + { + "epoch": 0.1818727400528069, + "grad_norm": 0.0615234375, + "learning_rate": 0.002929614298727426, + "loss": 1.215, + "step": 2074 + }, + { + "epoch": 0.18196043182718144, + "grad_norm": 0.0634765625, + "learning_rate": 0.0029294843193307368, + "loss": 1.2318, + "step": 2075 + }, + { + "epoch": 0.18204812360155598, + "grad_norm": 0.06494140625, + "learning_rate": 0.0029293542232483806, + "loss": 1.2634, + "step": 2076 + }, + { + "epoch": 0.18213581537593052, + "grad_norm": 0.06884765625, + "learning_rate": 0.0029292240104922215, + "loss": 1.2311, + "step": 2077 + }, + { + "epoch": 0.18222350715030508, + "grad_norm": 0.12109375, + "learning_rate": 0.0029290936810741353, + "loss": 1.2753, + "step": 2078 + }, + { + "epoch": 0.18231119892467962, + "grad_norm": 0.0634765625, + "learning_rate": 0.0029289632350060074, + "loss": 1.2298, + "step": 2079 + }, + { + "epoch": 0.18239889069905416, + "grad_norm": 0.08251953125, + "learning_rate": 0.002928832672299734, + "loss": 1.2143, + "step": 2080 + }, + { + "epoch": 0.1824865824734287, + "grad_norm": 0.06884765625, + "learning_rate": 0.002928701992967222, + "loss": 1.2469, + "step": 2081 + }, + { + "epoch": 0.18257427424780326, + "grad_norm": 0.076171875, + "learning_rate": 0.0029285711970203888, + "loss": 1.2208, + "step": 2082 + }, + { + "epoch": 0.1826619660221778, + "grad_norm": 0.06396484375, + "learning_rate": 0.0029284402844711637, + "loss": 1.2153, + "step": 2083 + }, + { + "epoch": 0.18274965779655233, + "grad_norm": 0.0673828125, + "learning_rate": 0.0029283092553314847, + "loss": 1.2403, + "step": 2084 + }, + { + "epoch": 0.1828373495709269, + "grad_norm": 0.059326171875, + "learning_rate": 0.0029281781096133015, + "loss": 1.1838, + "step": 2085 + }, + { + "epoch": 0.18292504134530144, + "grad_norm": 0.0830078125, + "learning_rate": 0.0029280468473285746, + "loss": 1.25, + "step": 2086 + }, + { + "epoch": 0.18301273311967597, + "grad_norm": 0.06494140625, + "learning_rate": 0.002927915468489274, + "loss": 1.2227, + "step": 2087 + }, + { + "epoch": 0.1831004248940505, + "grad_norm": 0.0888671875, + "learning_rate": 0.002927783973107382, + "loss": 1.276, + "step": 2088 + }, + { + "epoch": 0.18318811666842508, + "grad_norm": 0.06982421875, + "learning_rate": 0.0029276523611948906, + "loss": 1.2114, + "step": 2089 + }, + { + "epoch": 0.18327580844279961, + "grad_norm": 0.07275390625, + "learning_rate": 0.0029275206327638017, + "loss": 1.2702, + "step": 2090 + }, + { + "epoch": 0.18336350021717415, + "grad_norm": 0.06494140625, + "learning_rate": 0.0029273887878261285, + "loss": 1.185, + "step": 2091 + }, + { + "epoch": 0.18345119199154872, + "grad_norm": 0.1015625, + "learning_rate": 0.002927256826393896, + "loss": 1.1702, + "step": 2092 + }, + { + "epoch": 0.18353888376592326, + "grad_norm": 0.0712890625, + "learning_rate": 0.0029271247484791377, + "loss": 1.2359, + "step": 2093 + }, + { + "epoch": 0.1836265755402978, + "grad_norm": 0.09716796875, + "learning_rate": 0.0029269925540939, + "loss": 1.2313, + "step": 2094 + }, + { + "epoch": 0.18371426731467233, + "grad_norm": 0.059326171875, + "learning_rate": 0.002926860243250237, + "loss": 1.2114, + "step": 2095 + }, + { + "epoch": 0.1838019590890469, + "grad_norm": 0.1259765625, + "learning_rate": 0.002926727815960216, + "loss": 1.2709, + "step": 2096 + }, + { + "epoch": 0.18388965086342143, + "grad_norm": 0.057861328125, + "learning_rate": 0.0029265952722359154, + "loss": 1.2259, + "step": 2097 + }, + { + "epoch": 0.18397734263779597, + "grad_norm": 0.1337890625, + "learning_rate": 0.0029264626120894203, + "loss": 1.2063, + "step": 2098 + }, + { + "epoch": 0.1840650344121705, + "grad_norm": 0.08349609375, + "learning_rate": 0.0029263298355328305, + "loss": 1.1828, + "step": 2099 + }, + { + "epoch": 0.18415272618654507, + "grad_norm": 0.07421875, + "learning_rate": 0.0029261969425782544, + "loss": 1.212, + "step": 2100 + }, + { + "epoch": 0.1842404179609196, + "grad_norm": 0.07666015625, + "learning_rate": 0.0029260639332378117, + "loss": 1.1854, + "step": 2101 + }, + { + "epoch": 0.18432810973529415, + "grad_norm": 0.06298828125, + "learning_rate": 0.002925930807523632, + "loss": 1.2632, + "step": 2102 + }, + { + "epoch": 0.18441580150966871, + "grad_norm": 0.08740234375, + "learning_rate": 0.002925797565447857, + "loss": 1.231, + "step": 2103 + }, + { + "epoch": 0.18450349328404325, + "grad_norm": 0.1357421875, + "learning_rate": 0.0029256642070226374, + "loss": 1.3015, + "step": 2104 + }, + { + "epoch": 0.1845911850584178, + "grad_norm": 0.06494140625, + "learning_rate": 0.002925530732260135, + "loss": 1.1686, + "step": 2105 + }, + { + "epoch": 0.18467887683279233, + "grad_norm": 0.13671875, + "learning_rate": 0.002925397141172523, + "loss": 1.2009, + "step": 2106 + }, + { + "epoch": 0.1847665686071669, + "grad_norm": 0.1123046875, + "learning_rate": 0.0029252634337719836, + "loss": 1.2491, + "step": 2107 + }, + { + "epoch": 0.18485426038154143, + "grad_norm": 0.2216796875, + "learning_rate": 0.0029251296100707117, + "loss": 1.252, + "step": 2108 + }, + { + "epoch": 0.18494195215591597, + "grad_norm": 0.0869140625, + "learning_rate": 0.0029249956700809106, + "loss": 1.2084, + "step": 2109 + }, + { + "epoch": 0.18502964393029053, + "grad_norm": 0.12158203125, + "learning_rate": 0.002924861613814796, + "loss": 1.2012, + "step": 2110 + }, + { + "epoch": 0.18511733570466507, + "grad_norm": 0.11279296875, + "learning_rate": 0.0029247274412845933, + "loss": 1.2048, + "step": 2111 + }, + { + "epoch": 0.1852050274790396, + "grad_norm": 0.06591796875, + "learning_rate": 0.0029245931525025384, + "loss": 1.3062, + "step": 2112 + }, + { + "epoch": 0.18529271925341415, + "grad_norm": 0.09619140625, + "learning_rate": 0.002924458747480879, + "loss": 1.2684, + "step": 2113 + }, + { + "epoch": 0.1853804110277887, + "grad_norm": 0.060302734375, + "learning_rate": 0.0029243242262318708, + "loss": 1.1959, + "step": 2114 + }, + { + "epoch": 0.18546810280216325, + "grad_norm": 0.08544921875, + "learning_rate": 0.0029241895887677834, + "loss": 1.2658, + "step": 2115 + }, + { + "epoch": 0.18555579457653779, + "grad_norm": 0.0810546875, + "learning_rate": 0.002924054835100895, + "loss": 1.241, + "step": 2116 + }, + { + "epoch": 0.18564348635091232, + "grad_norm": 0.09326171875, + "learning_rate": 0.0029239199652434937, + "loss": 1.1998, + "step": 2117 + }, + { + "epoch": 0.1857311781252869, + "grad_norm": 0.1103515625, + "learning_rate": 0.0029237849792078815, + "loss": 1.2349, + "step": 2118 + }, + { + "epoch": 0.18581886989966143, + "grad_norm": 0.11376953125, + "learning_rate": 0.0029236498770063664, + "loss": 1.275, + "step": 2119 + }, + { + "epoch": 0.18590656167403596, + "grad_norm": 0.11328125, + "learning_rate": 0.002923514658651271, + "loss": 1.2772, + "step": 2120 + }, + { + "epoch": 0.18599425344841053, + "grad_norm": 0.134765625, + "learning_rate": 0.0029233793241549262, + "loss": 1.2346, + "step": 2121 + }, + { + "epoch": 0.18608194522278507, + "grad_norm": 0.10205078125, + "learning_rate": 0.0029232438735296742, + "loss": 1.2353, + "step": 2122 + }, + { + "epoch": 0.1861696369971596, + "grad_norm": 0.1630859375, + "learning_rate": 0.002923108306787868, + "loss": 1.2354, + "step": 2123 + }, + { + "epoch": 0.18625732877153414, + "grad_norm": 0.068359375, + "learning_rate": 0.00292297262394187, + "loss": 1.1676, + "step": 2124 + }, + { + "epoch": 0.1863450205459087, + "grad_norm": 0.158203125, + "learning_rate": 0.002922836825004056, + "loss": 1.1817, + "step": 2125 + }, + { + "epoch": 0.18643271232028324, + "grad_norm": 0.06591796875, + "learning_rate": 0.002922700909986808, + "loss": 1.2602, + "step": 2126 + }, + { + "epoch": 0.18652040409465778, + "grad_norm": 0.173828125, + "learning_rate": 0.0029225648789025236, + "loss": 1.274, + "step": 2127 + }, + { + "epoch": 0.18660809586903235, + "grad_norm": 0.068359375, + "learning_rate": 0.0029224287317636067, + "loss": 1.2017, + "step": 2128 + }, + { + "epoch": 0.18669578764340689, + "grad_norm": 0.1806640625, + "learning_rate": 0.002922292468582475, + "loss": 1.2335, + "step": 2129 + }, + { + "epoch": 0.18678347941778142, + "grad_norm": 0.0751953125, + "learning_rate": 0.0029221560893715535, + "loss": 1.2363, + "step": 2130 + }, + { + "epoch": 0.18687117119215596, + "grad_norm": 0.1611328125, + "learning_rate": 0.002922019594143281, + "loss": 1.2476, + "step": 2131 + }, + { + "epoch": 0.18695886296653053, + "grad_norm": 0.06298828125, + "learning_rate": 0.0029218829829101055, + "loss": 1.2307, + "step": 2132 + }, + { + "epoch": 0.18704655474090506, + "grad_norm": 0.12890625, + "learning_rate": 0.002921746255684485, + "loss": 1.2079, + "step": 2133 + }, + { + "epoch": 0.1871342465152796, + "grad_norm": 0.076171875, + "learning_rate": 0.0029216094124788895, + "loss": 1.1987, + "step": 2134 + }, + { + "epoch": 0.18722193828965414, + "grad_norm": 0.056884765625, + "learning_rate": 0.0029214724533057983, + "loss": 1.205, + "step": 2135 + }, + { + "epoch": 0.1873096300640287, + "grad_norm": 0.072265625, + "learning_rate": 0.002921335378177701, + "loss": 1.2035, + "step": 2136 + }, + { + "epoch": 0.18739732183840324, + "grad_norm": 0.06201171875, + "learning_rate": 0.002921198187107099, + "loss": 1.1853, + "step": 2137 + }, + { + "epoch": 0.18748501361277778, + "grad_norm": 0.09228515625, + "learning_rate": 0.0029210608801065047, + "loss": 1.2574, + "step": 2138 + }, + { + "epoch": 0.18757270538715234, + "grad_norm": 0.062255859375, + "learning_rate": 0.002920923457188439, + "loss": 1.2154, + "step": 2139 + }, + { + "epoch": 0.18766039716152688, + "grad_norm": 0.0595703125, + "learning_rate": 0.002920785918365435, + "loss": 1.259, + "step": 2140 + }, + { + "epoch": 0.18774808893590142, + "grad_norm": 0.056884765625, + "learning_rate": 0.002920648263650036, + "loss": 1.1876, + "step": 2141 + }, + { + "epoch": 0.18783578071027596, + "grad_norm": 0.08203125, + "learning_rate": 0.0029205104930547948, + "loss": 1.2673, + "step": 2142 + }, + { + "epoch": 0.18792347248465052, + "grad_norm": 0.062255859375, + "learning_rate": 0.002920372606592277, + "loss": 1.246, + "step": 2143 + }, + { + "epoch": 0.18801116425902506, + "grad_norm": 0.1044921875, + "learning_rate": 0.002920234604275057, + "loss": 1.2107, + "step": 2144 + }, + { + "epoch": 0.1880988560333996, + "grad_norm": 0.06396484375, + "learning_rate": 0.00292009648611572, + "loss": 1.255, + "step": 2145 + }, + { + "epoch": 0.18818654780777416, + "grad_norm": 0.1728515625, + "learning_rate": 0.0029199582521268618, + "loss": 1.2306, + "step": 2146 + }, + { + "epoch": 0.1882742395821487, + "grad_norm": 0.1181640625, + "learning_rate": 0.00291981990232109, + "loss": 1.2099, + "step": 2147 + }, + { + "epoch": 0.18836193135652324, + "grad_norm": 0.1474609375, + "learning_rate": 0.002919681436711021, + "loss": 1.2574, + "step": 2148 + }, + { + "epoch": 0.18844962313089778, + "grad_norm": 0.126953125, + "learning_rate": 0.0029195428553092824, + "loss": 1.2209, + "step": 2149 + }, + { + "epoch": 0.18853731490527234, + "grad_norm": 0.1142578125, + "learning_rate": 0.002919404158128513, + "loss": 1.2061, + "step": 2150 + }, + { + "epoch": 0.18862500667964688, + "grad_norm": 0.07568359375, + "learning_rate": 0.0029192653451813604, + "loss": 1.2044, + "step": 2151 + }, + { + "epoch": 0.18871269845402142, + "grad_norm": 0.1728515625, + "learning_rate": 0.002919126416480486, + "loss": 1.2074, + "step": 2152 + }, + { + "epoch": 0.18880039022839595, + "grad_norm": 0.07763671875, + "learning_rate": 0.002918987372038558, + "loss": 1.2965, + "step": 2153 + }, + { + "epoch": 0.18888808200277052, + "grad_norm": 0.158203125, + "learning_rate": 0.002918848211868258, + "loss": 1.2164, + "step": 2154 + }, + { + "epoch": 0.18897577377714506, + "grad_norm": 0.06494140625, + "learning_rate": 0.0029187089359822764, + "loss": 1.2771, + "step": 2155 + }, + { + "epoch": 0.1890634655515196, + "grad_norm": 0.1279296875, + "learning_rate": 0.002918569544393315, + "loss": 1.2297, + "step": 2156 + }, + { + "epoch": 0.18915115732589416, + "grad_norm": 0.064453125, + "learning_rate": 0.002918430037114086, + "loss": 1.2593, + "step": 2157 + }, + { + "epoch": 0.1892388491002687, + "grad_norm": 0.05908203125, + "learning_rate": 0.002918290414157312, + "loss": 1.2255, + "step": 2158 + }, + { + "epoch": 0.18932654087464323, + "grad_norm": 0.08251953125, + "learning_rate": 0.0029181506755357263, + "loss": 1.2266, + "step": 2159 + }, + { + "epoch": 0.18941423264901777, + "grad_norm": 0.05859375, + "learning_rate": 0.0029180108212620726, + "loss": 1.2049, + "step": 2160 + }, + { + "epoch": 0.18950192442339234, + "grad_norm": 0.064453125, + "learning_rate": 0.0029178708513491053, + "loss": 1.2316, + "step": 2161 + }, + { + "epoch": 0.18958961619776687, + "grad_norm": 0.07177734375, + "learning_rate": 0.0029177307658095897, + "loss": 1.2655, + "step": 2162 + }, + { + "epoch": 0.1896773079721414, + "grad_norm": 0.061767578125, + "learning_rate": 0.0029175905646563007, + "loss": 1.2376, + "step": 2163 + }, + { + "epoch": 0.18976499974651598, + "grad_norm": 0.06591796875, + "learning_rate": 0.0029174502479020247, + "loss": 1.2186, + "step": 2164 + }, + { + "epoch": 0.18985269152089052, + "grad_norm": 0.07080078125, + "learning_rate": 0.002917309815559558, + "loss": 1.2951, + "step": 2165 + }, + { + "epoch": 0.18994038329526505, + "grad_norm": 0.0693359375, + "learning_rate": 0.0029171692676417072, + "loss": 1.2102, + "step": 2166 + }, + { + "epoch": 0.1900280750696396, + "grad_norm": 0.061767578125, + "learning_rate": 0.0029170286041612908, + "loss": 1.2002, + "step": 2167 + }, + { + "epoch": 0.19011576684401416, + "grad_norm": 0.0703125, + "learning_rate": 0.002916887825131136, + "loss": 1.2508, + "step": 2168 + }, + { + "epoch": 0.1902034586183887, + "grad_norm": 0.07470703125, + "learning_rate": 0.0029167469305640824, + "loss": 1.2636, + "step": 2169 + }, + { + "epoch": 0.19029115039276323, + "grad_norm": 0.06103515625, + "learning_rate": 0.0029166059204729787, + "loss": 1.2211, + "step": 2170 + }, + { + "epoch": 0.19037884216713777, + "grad_norm": 0.0556640625, + "learning_rate": 0.002916464794870685, + "loss": 1.2183, + "step": 2171 + }, + { + "epoch": 0.19046653394151233, + "grad_norm": 0.08056640625, + "learning_rate": 0.002916323553770071, + "loss": 1.291, + "step": 2172 + }, + { + "epoch": 0.19055422571588687, + "grad_norm": 0.057861328125, + "learning_rate": 0.0029161821971840185, + "loss": 1.2438, + "step": 2173 + }, + { + "epoch": 0.1906419174902614, + "grad_norm": 0.08447265625, + "learning_rate": 0.0029160407251254176, + "loss": 1.218, + "step": 2174 + }, + { + "epoch": 0.19072960926463597, + "grad_norm": 0.07568359375, + "learning_rate": 0.0029158991376071713, + "loss": 1.1763, + "step": 2175 + }, + { + "epoch": 0.1908173010390105, + "grad_norm": 0.0791015625, + "learning_rate": 0.0029157574346421918, + "loss": 1.2258, + "step": 2176 + }, + { + "epoch": 0.19090499281338505, + "grad_norm": 0.054443359375, + "learning_rate": 0.0029156156162434012, + "loss": 1.2147, + "step": 2177 + }, + { + "epoch": 0.1909926845877596, + "grad_norm": 0.078125, + "learning_rate": 0.0029154736824237334, + "loss": 1.2553, + "step": 2178 + }, + { + "epoch": 0.19108037636213415, + "grad_norm": 0.059814453125, + "learning_rate": 0.002915331633196133, + "loss": 1.1916, + "step": 2179 + }, + { + "epoch": 0.1911680681365087, + "grad_norm": 0.091796875, + "learning_rate": 0.002915189468573554, + "loss": 1.1776, + "step": 2180 + }, + { + "epoch": 0.19125575991088323, + "grad_norm": 0.0576171875, + "learning_rate": 0.0029150471885689613, + "loss": 1.208, + "step": 2181 + }, + { + "epoch": 0.1913434516852578, + "grad_norm": 0.09619140625, + "learning_rate": 0.002914904793195331, + "loss": 1.2131, + "step": 2182 + }, + { + "epoch": 0.19143114345963233, + "grad_norm": 0.10498046875, + "learning_rate": 0.0029147622824656485, + "loss": 1.2452, + "step": 2183 + }, + { + "epoch": 0.19151883523400687, + "grad_norm": 0.0712890625, + "learning_rate": 0.0029146196563929116, + "loss": 1.2204, + "step": 2184 + }, + { + "epoch": 0.1916065270083814, + "grad_norm": 0.083984375, + "learning_rate": 0.002914476914990126, + "loss": 1.2134, + "step": 2185 + }, + { + "epoch": 0.19169421878275597, + "grad_norm": 0.08056640625, + "learning_rate": 0.0029143340582703098, + "loss": 1.2047, + "step": 2186 + }, + { + "epoch": 0.1917819105571305, + "grad_norm": 0.06787109375, + "learning_rate": 0.0029141910862464917, + "loss": 1.2427, + "step": 2187 + }, + { + "epoch": 0.19186960233150505, + "grad_norm": 0.107421875, + "learning_rate": 0.0029140479989317103, + "loss": 1.2311, + "step": 2188 + }, + { + "epoch": 0.19195729410587958, + "grad_norm": 0.091796875, + "learning_rate": 0.002913904796339014, + "loss": 1.2115, + "step": 2189 + }, + { + "epoch": 0.19204498588025415, + "grad_norm": 0.0654296875, + "learning_rate": 0.002913761478481463, + "loss": 1.2186, + "step": 2190 + }, + { + "epoch": 0.1921326776546287, + "grad_norm": 0.072265625, + "learning_rate": 0.002913618045372128, + "loss": 1.278, + "step": 2191 + }, + { + "epoch": 0.19222036942900322, + "grad_norm": 0.07666015625, + "learning_rate": 0.0029134744970240895, + "loss": 1.2644, + "step": 2192 + }, + { + "epoch": 0.1923080612033778, + "grad_norm": 0.07177734375, + "learning_rate": 0.002913330833450438, + "loss": 1.2333, + "step": 2193 + }, + { + "epoch": 0.19239575297775233, + "grad_norm": 0.0703125, + "learning_rate": 0.002913187054664276, + "loss": 1.2697, + "step": 2194 + }, + { + "epoch": 0.19248344475212686, + "grad_norm": 0.0537109375, + "learning_rate": 0.0029130431606787153, + "loss": 1.2019, + "step": 2195 + }, + { + "epoch": 0.1925711365265014, + "grad_norm": 0.07763671875, + "learning_rate": 0.0029128991515068787, + "loss": 1.2012, + "step": 2196 + }, + { + "epoch": 0.19265882830087597, + "grad_norm": 0.06005859375, + "learning_rate": 0.0029127550271619003, + "loss": 1.2528, + "step": 2197 + }, + { + "epoch": 0.1927465200752505, + "grad_norm": 0.1513671875, + "learning_rate": 0.0029126107876569223, + "loss": 1.2931, + "step": 2198 + }, + { + "epoch": 0.19283421184962504, + "grad_norm": 0.11572265625, + "learning_rate": 0.002912466433005101, + "loss": 1.2057, + "step": 2199 + }, + { + "epoch": 0.1929219036239996, + "grad_norm": 0.06884765625, + "learning_rate": 0.0029123219632195985, + "loss": 1.2672, + "step": 2200 + }, + { + "epoch": 0.19300959539837415, + "grad_norm": 0.1396484375, + "learning_rate": 0.002912177378313593, + "loss": 1.207, + "step": 2201 + }, + { + "epoch": 0.19309728717274868, + "grad_norm": 0.07275390625, + "learning_rate": 0.002912032678300268, + "loss": 1.2218, + "step": 2202 + }, + { + "epoch": 0.19318497894712322, + "grad_norm": 0.1455078125, + "learning_rate": 0.0029118878631928216, + "loss": 1.2283, + "step": 2203 + }, + { + "epoch": 0.19327267072149779, + "grad_norm": 0.1484375, + "learning_rate": 0.002911742933004459, + "loss": 1.277, + "step": 2204 + }, + { + "epoch": 0.19336036249587232, + "grad_norm": 0.1484375, + "learning_rate": 0.0029115978877483984, + "loss": 1.18, + "step": 2205 + }, + { + "epoch": 0.19344805427024686, + "grad_norm": 0.123046875, + "learning_rate": 0.0029114527274378666, + "loss": 1.1876, + "step": 2206 + }, + { + "epoch": 0.1935357460446214, + "grad_norm": 0.07373046875, + "learning_rate": 0.0029113074520861035, + "loss": 1.2212, + "step": 2207 + }, + { + "epoch": 0.19362343781899596, + "grad_norm": 0.12353515625, + "learning_rate": 0.002911162061706356, + "loss": 1.1823, + "step": 2208 + }, + { + "epoch": 0.1937111295933705, + "grad_norm": 0.060791015625, + "learning_rate": 0.0029110165563118844, + "loss": 1.2737, + "step": 2209 + }, + { + "epoch": 0.19379882136774504, + "grad_norm": 0.0830078125, + "learning_rate": 0.002910870935915958, + "loss": 1.2434, + "step": 2210 + }, + { + "epoch": 0.1938865131421196, + "grad_norm": 0.08984375, + "learning_rate": 0.0029107252005318575, + "loss": 1.2678, + "step": 2211 + }, + { + "epoch": 0.19397420491649414, + "grad_norm": 0.0888671875, + "learning_rate": 0.002910579350172873, + "loss": 1.2172, + "step": 2212 + }, + { + "epoch": 0.19406189669086868, + "grad_norm": 0.0703125, + "learning_rate": 0.002910433384852306, + "loss": 1.2179, + "step": 2213 + }, + { + "epoch": 0.19414958846524322, + "grad_norm": 0.0732421875, + "learning_rate": 0.0029102873045834684, + "loss": 1.2059, + "step": 2214 + }, + { + "epoch": 0.19423728023961778, + "grad_norm": 0.0576171875, + "learning_rate": 0.0029101411093796813, + "loss": 1.1747, + "step": 2215 + }, + { + "epoch": 0.19432497201399232, + "grad_norm": 0.08984375, + "learning_rate": 0.0029099947992542786, + "loss": 1.2096, + "step": 2216 + }, + { + "epoch": 0.19441266378836686, + "grad_norm": 0.06640625, + "learning_rate": 0.002909848374220603, + "loss": 1.1815, + "step": 2217 + }, + { + "epoch": 0.19450035556274142, + "grad_norm": 0.10400390625, + "learning_rate": 0.0029097018342920077, + "loss": 1.2166, + "step": 2218 + }, + { + "epoch": 0.19458804733711596, + "grad_norm": 0.054443359375, + "learning_rate": 0.0029095551794818568, + "loss": 1.2344, + "step": 2219 + }, + { + "epoch": 0.1946757391114905, + "grad_norm": 0.0673828125, + "learning_rate": 0.002909408409803525, + "loss": 1.1877, + "step": 2220 + }, + { + "epoch": 0.19476343088586504, + "grad_norm": 0.06494140625, + "learning_rate": 0.0029092615252703977, + "loss": 1.2411, + "step": 2221 + }, + { + "epoch": 0.1948511226602396, + "grad_norm": 0.060546875, + "learning_rate": 0.0029091145258958702, + "loss": 1.2854, + "step": 2222 + }, + { + "epoch": 0.19493881443461414, + "grad_norm": 0.06298828125, + "learning_rate": 0.0029089674116933484, + "loss": 1.2076, + "step": 2223 + }, + { + "epoch": 0.19502650620898868, + "grad_norm": 0.0615234375, + "learning_rate": 0.002908820182676248, + "loss": 1.217, + "step": 2224 + }, + { + "epoch": 0.1951141979833632, + "grad_norm": 0.0654296875, + "learning_rate": 0.002908672838857997, + "loss": 1.3098, + "step": 2225 + }, + { + "epoch": 0.19520188975773778, + "grad_norm": 0.07470703125, + "learning_rate": 0.0029085253802520325, + "loss": 1.2397, + "step": 2226 + }, + { + "epoch": 0.19528958153211232, + "grad_norm": 0.0732421875, + "learning_rate": 0.0029083778068718018, + "loss": 1.2012, + "step": 2227 + }, + { + "epoch": 0.19537727330648685, + "grad_norm": 0.11865234375, + "learning_rate": 0.002908230118730764, + "loss": 1.3807, + "step": 2228 + }, + { + "epoch": 0.19546496508086142, + "grad_norm": 0.126953125, + "learning_rate": 0.002908082315842387, + "loss": 1.2038, + "step": 2229 + }, + { + "epoch": 0.19555265685523596, + "grad_norm": 0.11279296875, + "learning_rate": 0.0029079343982201513, + "loss": 1.24, + "step": 2230 + }, + { + "epoch": 0.1956403486296105, + "grad_norm": 0.1357421875, + "learning_rate": 0.0029077863658775447, + "loss": 1.2039, + "step": 2231 + }, + { + "epoch": 0.19572804040398503, + "grad_norm": 0.119140625, + "learning_rate": 0.0029076382188280695, + "loss": 1.1737, + "step": 2232 + }, + { + "epoch": 0.1958157321783596, + "grad_norm": 0.10791015625, + "learning_rate": 0.002907489957085235, + "loss": 1.2468, + "step": 2233 + }, + { + "epoch": 0.19590342395273413, + "grad_norm": 0.10302734375, + "learning_rate": 0.0029073415806625622, + "loss": 1.2271, + "step": 2234 + }, + { + "epoch": 0.19599111572710867, + "grad_norm": 0.072265625, + "learning_rate": 0.0029071930895735835, + "loss": 1.2028, + "step": 2235 + }, + { + "epoch": 0.19607880750148324, + "grad_norm": 0.06640625, + "learning_rate": 0.0029070444838318406, + "loss": 1.2072, + "step": 2236 + }, + { + "epoch": 0.19616649927585778, + "grad_norm": 0.078125, + "learning_rate": 0.0029068957634508855, + "loss": 1.1776, + "step": 2237 + }, + { + "epoch": 0.1962541910502323, + "grad_norm": 0.09375, + "learning_rate": 0.002906746928444281, + "loss": 1.2094, + "step": 2238 + }, + { + "epoch": 0.19634188282460685, + "grad_norm": 0.06201171875, + "learning_rate": 0.0029065979788256016, + "loss": 1.2517, + "step": 2239 + }, + { + "epoch": 0.19642957459898142, + "grad_norm": 0.09716796875, + "learning_rate": 0.0029064489146084305, + "loss": 1.2747, + "step": 2240 + }, + { + "epoch": 0.19651726637335595, + "grad_norm": 0.057861328125, + "learning_rate": 0.0029062997358063614, + "loss": 1.1872, + "step": 2241 + }, + { + "epoch": 0.1966049581477305, + "grad_norm": 0.07763671875, + "learning_rate": 0.0029061504424329997, + "loss": 1.2261, + "step": 2242 + }, + { + "epoch": 0.19669264992210503, + "grad_norm": 0.08447265625, + "learning_rate": 0.0029060010345019604, + "loss": 1.2154, + "step": 2243 + }, + { + "epoch": 0.1967803416964796, + "grad_norm": 0.08056640625, + "learning_rate": 0.0029058515120268693, + "loss": 1.1852, + "step": 2244 + }, + { + "epoch": 0.19686803347085413, + "grad_norm": 0.0849609375, + "learning_rate": 0.0029057018750213623, + "loss": 1.2149, + "step": 2245 + }, + { + "epoch": 0.19695572524522867, + "grad_norm": 0.07373046875, + "learning_rate": 0.0029055521234990856, + "loss": 1.2663, + "step": 2246 + }, + { + "epoch": 0.19704341701960323, + "grad_norm": 0.0888671875, + "learning_rate": 0.0029054022574736965, + "loss": 1.229, + "step": 2247 + }, + { + "epoch": 0.19713110879397777, + "grad_norm": 0.10791015625, + "learning_rate": 0.0029052522769588618, + "loss": 1.2652, + "step": 2248 + }, + { + "epoch": 0.1972188005683523, + "grad_norm": 0.0712890625, + "learning_rate": 0.002905102181968261, + "loss": 1.1608, + "step": 2249 + }, + { + "epoch": 0.19730649234272685, + "grad_norm": 0.0732421875, + "learning_rate": 0.0029049519725155805, + "loss": 1.2427, + "step": 2250 + }, + { + "epoch": 0.1973941841171014, + "grad_norm": 0.072265625, + "learning_rate": 0.00290480164861452, + "loss": 1.2146, + "step": 2251 + }, + { + "epoch": 0.19748187589147595, + "grad_norm": 0.1005859375, + "learning_rate": 0.0029046512102787877, + "loss": 1.2384, + "step": 2252 + }, + { + "epoch": 0.1975695676658505, + "grad_norm": 0.158203125, + "learning_rate": 0.002904500657522105, + "loss": 1.2716, + "step": 2253 + }, + { + "epoch": 0.19765725944022505, + "grad_norm": 0.0703125, + "learning_rate": 0.0029043499903582, + "loss": 1.2459, + "step": 2254 + }, + { + "epoch": 0.1977449512145996, + "grad_norm": 0.0791015625, + "learning_rate": 0.002904199208800814, + "loss": 1.1976, + "step": 2255 + }, + { + "epoch": 0.19783264298897413, + "grad_norm": 0.0712890625, + "learning_rate": 0.0029040483128636983, + "loss": 1.2153, + "step": 2256 + }, + { + "epoch": 0.19792033476334867, + "grad_norm": 0.078125, + "learning_rate": 0.0029038973025606136, + "loss": 1.243, + "step": 2257 + }, + { + "epoch": 0.19800802653772323, + "grad_norm": 0.062255859375, + "learning_rate": 0.0029037461779053315, + "loss": 1.1208, + "step": 2258 + }, + { + "epoch": 0.19809571831209777, + "grad_norm": 0.10009765625, + "learning_rate": 0.002903594938911635, + "loss": 1.2481, + "step": 2259 + }, + { + "epoch": 0.1981834100864723, + "grad_norm": 0.072265625, + "learning_rate": 0.0029034435855933164, + "loss": 1.2173, + "step": 2260 + }, + { + "epoch": 0.19827110186084684, + "grad_norm": 0.087890625, + "learning_rate": 0.0029032921179641775, + "loss": 1.227, + "step": 2261 + }, + { + "epoch": 0.1983587936352214, + "grad_norm": 0.060546875, + "learning_rate": 0.002903140536038034, + "loss": 1.2387, + "step": 2262 + }, + { + "epoch": 0.19844648540959595, + "grad_norm": 0.0634765625, + "learning_rate": 0.002902988839828708, + "loss": 1.2604, + "step": 2263 + }, + { + "epoch": 0.19853417718397048, + "grad_norm": 0.0751953125, + "learning_rate": 0.002902837029350034, + "loss": 1.2199, + "step": 2264 + }, + { + "epoch": 0.19862186895834505, + "grad_norm": 0.08544921875, + "learning_rate": 0.0029026851046158575, + "loss": 1.2029, + "step": 2265 + }, + { + "epoch": 0.1987095607327196, + "grad_norm": 0.083984375, + "learning_rate": 0.0029025330656400334, + "loss": 1.2838, + "step": 2266 + }, + { + "epoch": 0.19879725250709412, + "grad_norm": 0.08642578125, + "learning_rate": 0.002902380912436427, + "loss": 1.2007, + "step": 2267 + }, + { + "epoch": 0.19888494428146866, + "grad_norm": 0.07666015625, + "learning_rate": 0.0029022286450189143, + "loss": 1.2441, + "step": 2268 + }, + { + "epoch": 0.19897263605584323, + "grad_norm": 0.166015625, + "learning_rate": 0.002902076263401382, + "loss": 1.274, + "step": 2269 + }, + { + "epoch": 0.19906032783021776, + "grad_norm": 0.06787109375, + "learning_rate": 0.002901923767597726, + "loss": 1.2176, + "step": 2270 + }, + { + "epoch": 0.1991480196045923, + "grad_norm": 0.1083984375, + "learning_rate": 0.0029017711576218553, + "loss": 1.2198, + "step": 2271 + }, + { + "epoch": 0.19923571137896687, + "grad_norm": 0.1396484375, + "learning_rate": 0.0029016184334876858, + "loss": 1.2703, + "step": 2272 + }, + { + "epoch": 0.1993234031533414, + "grad_norm": 0.08544921875, + "learning_rate": 0.0029014655952091465, + "loss": 1.2411, + "step": 2273 + }, + { + "epoch": 0.19941109492771594, + "grad_norm": 0.15625, + "learning_rate": 0.0029013126428001756, + "loss": 1.2385, + "step": 2274 + }, + { + "epoch": 0.19949878670209048, + "grad_norm": 0.0810546875, + "learning_rate": 0.002901159576274721, + "loss": 1.2118, + "step": 2275 + }, + { + "epoch": 0.19958647847646505, + "grad_norm": 0.09619140625, + "learning_rate": 0.002901006395646744, + "loss": 1.226, + "step": 2276 + }, + { + "epoch": 0.19967417025083958, + "grad_norm": 0.06787109375, + "learning_rate": 0.0029008531009302127, + "loss": 1.2068, + "step": 2277 + }, + { + "epoch": 0.19976186202521412, + "grad_norm": 0.0810546875, + "learning_rate": 0.0029006996921391074, + "loss": 1.1829, + "step": 2278 + }, + { + "epoch": 0.19984955379958866, + "grad_norm": 0.059326171875, + "learning_rate": 0.0029005461692874195, + "loss": 1.1688, + "step": 2279 + }, + { + "epoch": 0.19993724557396322, + "grad_norm": 0.0888671875, + "learning_rate": 0.0029003925323891487, + "loss": 1.249, + "step": 2280 + }, + { + "epoch": 0.20002493734833776, + "grad_norm": 0.06982421875, + "learning_rate": 0.002900238781458307, + "loss": 1.2226, + "step": 2281 + }, + { + "epoch": 0.2001126291227123, + "grad_norm": 0.064453125, + "learning_rate": 0.002900084916508916, + "loss": 1.2285, + "step": 2282 + }, + { + "epoch": 0.20020032089708686, + "grad_norm": 0.07470703125, + "learning_rate": 0.002899930937555008, + "loss": 1.1471, + "step": 2283 + }, + { + "epoch": 0.2002880126714614, + "grad_norm": 0.099609375, + "learning_rate": 0.0028997768446106247, + "loss": 1.2034, + "step": 2284 + }, + { + "epoch": 0.20037570444583594, + "grad_norm": 0.09326171875, + "learning_rate": 0.002899622637689819, + "loss": 1.2634, + "step": 2285 + }, + { + "epoch": 0.20046339622021048, + "grad_norm": 0.126953125, + "learning_rate": 0.0028994683168066557, + "loss": 1.2306, + "step": 2286 + }, + { + "epoch": 0.20055108799458504, + "grad_norm": 0.10302734375, + "learning_rate": 0.0028993138819752066, + "loss": 1.2843, + "step": 2287 + }, + { + "epoch": 0.20063877976895958, + "grad_norm": 0.18359375, + "learning_rate": 0.002899159333209557, + "loss": 1.2464, + "step": 2288 + }, + { + "epoch": 0.20072647154333412, + "grad_norm": 0.0693359375, + "learning_rate": 0.0028990046705238003, + "loss": 1.2667, + "step": 2289 + }, + { + "epoch": 0.20081416331770865, + "grad_norm": 0.1279296875, + "learning_rate": 0.0028988498939320428, + "loss": 1.2393, + "step": 2290 + }, + { + "epoch": 0.20090185509208322, + "grad_norm": 0.0830078125, + "learning_rate": 0.002898695003448398, + "loss": 1.2104, + "step": 2291 + }, + { + "epoch": 0.20098954686645776, + "grad_norm": 0.1220703125, + "learning_rate": 0.0028985399990869927, + "loss": 1.2468, + "step": 2292 + }, + { + "epoch": 0.2010772386408323, + "grad_norm": 0.072265625, + "learning_rate": 0.002898384880861963, + "loss": 1.1991, + "step": 2293 + }, + { + "epoch": 0.20116493041520686, + "grad_norm": 0.07470703125, + "learning_rate": 0.0028982296487874543, + "loss": 1.2474, + "step": 2294 + }, + { + "epoch": 0.2012526221895814, + "grad_norm": 0.07666015625, + "learning_rate": 0.0028980743028776245, + "loss": 1.2428, + "step": 2295 + }, + { + "epoch": 0.20134031396395594, + "grad_norm": 0.1044921875, + "learning_rate": 0.0028979188431466395, + "loss": 1.2533, + "step": 2296 + }, + { + "epoch": 0.20142800573833047, + "grad_norm": 0.08935546875, + "learning_rate": 0.002897763269608678, + "loss": 1.2392, + "step": 2297 + }, + { + "epoch": 0.20151569751270504, + "grad_norm": 0.14453125, + "learning_rate": 0.0028976075822779272, + "loss": 1.2092, + "step": 2298 + }, + { + "epoch": 0.20160338928707958, + "grad_norm": 0.06591796875, + "learning_rate": 0.0028974517811685852, + "loss": 1.2441, + "step": 2299 + }, + { + "epoch": 0.2016910810614541, + "grad_norm": 0.06494140625, + "learning_rate": 0.002897295866294862, + "loss": 1.2282, + "step": 2300 + }, + { + "epoch": 0.20177877283582868, + "grad_norm": 0.1318359375, + "learning_rate": 0.0028971398376709746, + "loss": 1.2885, + "step": 2301 + }, + { + "epoch": 0.20186646461020322, + "grad_norm": 0.0908203125, + "learning_rate": 0.002896983695311154, + "loss": 1.1914, + "step": 2302 + }, + { + "epoch": 0.20195415638457775, + "grad_norm": 0.07958984375, + "learning_rate": 0.002896827439229639, + "loss": 1.1823, + "step": 2303 + }, + { + "epoch": 0.2020418481589523, + "grad_norm": 0.06494140625, + "learning_rate": 0.002896671069440681, + "loss": 1.1945, + "step": 2304 + }, + { + "epoch": 0.20212953993332686, + "grad_norm": 0.1474609375, + "learning_rate": 0.0028965145859585393, + "loss": 1.1994, + "step": 2305 + }, + { + "epoch": 0.2022172317077014, + "grad_norm": 0.11083984375, + "learning_rate": 0.0028963579887974853, + "loss": 1.2515, + "step": 2306 + }, + { + "epoch": 0.20230492348207593, + "grad_norm": 0.103515625, + "learning_rate": 0.0028962012779718002, + "loss": 1.1885, + "step": 2307 + }, + { + "epoch": 0.20239261525645047, + "grad_norm": 0.076171875, + "learning_rate": 0.0028960444534957756, + "loss": 1.1365, + "step": 2308 + }, + { + "epoch": 0.20248030703082504, + "grad_norm": 0.06494140625, + "learning_rate": 0.0028958875153837133, + "loss": 1.2862, + "step": 2309 + }, + { + "epoch": 0.20256799880519957, + "grad_norm": 0.07177734375, + "learning_rate": 0.002895730463649926, + "loss": 1.2135, + "step": 2310 + }, + { + "epoch": 0.2026556905795741, + "grad_norm": 0.068359375, + "learning_rate": 0.002895573298308736, + "loss": 1.1977, + "step": 2311 + }, + { + "epoch": 0.20274338235394868, + "grad_norm": 0.08251953125, + "learning_rate": 0.0028954160193744773, + "loss": 1.2087, + "step": 2312 + }, + { + "epoch": 0.2028310741283232, + "grad_norm": 0.232421875, + "learning_rate": 0.002895258626861492, + "loss": 1.2667, + "step": 2313 + }, + { + "epoch": 0.20291876590269775, + "grad_norm": 0.07470703125, + "learning_rate": 0.0028951011207841346, + "loss": 1.1602, + "step": 2314 + }, + { + "epoch": 0.2030064576770723, + "grad_norm": 0.0732421875, + "learning_rate": 0.00289494350115677, + "loss": 1.2413, + "step": 2315 + }, + { + "epoch": 0.20309414945144685, + "grad_norm": 0.0673828125, + "learning_rate": 0.002894785767993771, + "loss": 1.2095, + "step": 2316 + }, + { + "epoch": 0.2031818412258214, + "grad_norm": 0.0859375, + "learning_rate": 0.0028946279213095233, + "loss": 1.2188, + "step": 2317 + }, + { + "epoch": 0.20326953300019593, + "grad_norm": 0.06982421875, + "learning_rate": 0.0028944699611184225, + "loss": 1.2537, + "step": 2318 + }, + { + "epoch": 0.2033572247745705, + "grad_norm": 0.1015625, + "learning_rate": 0.002894311887434874, + "loss": 1.2358, + "step": 2319 + }, + { + "epoch": 0.20344491654894503, + "grad_norm": 0.10693359375, + "learning_rate": 0.0028941537002732933, + "loss": 1.2175, + "step": 2320 + }, + { + "epoch": 0.20353260832331957, + "grad_norm": 0.0634765625, + "learning_rate": 0.002893995399648107, + "loss": 1.2213, + "step": 2321 + }, + { + "epoch": 0.2036203000976941, + "grad_norm": 0.11376953125, + "learning_rate": 0.0028938369855737514, + "loss": 1.2282, + "step": 2322 + }, + { + "epoch": 0.20370799187206867, + "grad_norm": 0.07275390625, + "learning_rate": 0.002893678458064674, + "loss": 1.2173, + "step": 2323 + }, + { + "epoch": 0.2037956836464432, + "grad_norm": 0.115234375, + "learning_rate": 0.002893519817135332, + "loss": 1.2252, + "step": 2324 + }, + { + "epoch": 0.20388337542081775, + "grad_norm": 0.076171875, + "learning_rate": 0.0028933610628001922, + "loss": 1.2915, + "step": 2325 + }, + { + "epoch": 0.20397106719519228, + "grad_norm": 0.08349609375, + "learning_rate": 0.0028932021950737333, + "loss": 1.1885, + "step": 2326 + }, + { + "epoch": 0.20405875896956685, + "grad_norm": 0.0634765625, + "learning_rate": 0.0028930432139704435, + "loss": 1.2358, + "step": 2327 + }, + { + "epoch": 0.2041464507439414, + "grad_norm": 0.08740234375, + "learning_rate": 0.0028928841195048215, + "loss": 1.1917, + "step": 2328 + }, + { + "epoch": 0.20423414251831593, + "grad_norm": 0.061279296875, + "learning_rate": 0.0028927249116913766, + "loss": 1.187, + "step": 2329 + }, + { + "epoch": 0.2043218342926905, + "grad_norm": 0.0654296875, + "learning_rate": 0.0028925655905446277, + "loss": 1.1759, + "step": 2330 + }, + { + "epoch": 0.20440952606706503, + "grad_norm": 0.07861328125, + "learning_rate": 0.002892406156079104, + "loss": 1.2177, + "step": 2331 + }, + { + "epoch": 0.20449721784143957, + "grad_norm": 0.08447265625, + "learning_rate": 0.002892246608309347, + "loss": 1.1686, + "step": 2332 + }, + { + "epoch": 0.2045849096158141, + "grad_norm": 0.09033203125, + "learning_rate": 0.0028920869472499056, + "loss": 1.2278, + "step": 2333 + }, + { + "epoch": 0.20467260139018867, + "grad_norm": 0.10009765625, + "learning_rate": 0.0028919271729153416, + "loss": 1.2325, + "step": 2334 + }, + { + "epoch": 0.2047602931645632, + "grad_norm": 0.06689453125, + "learning_rate": 0.002891767285320225, + "loss": 1.1896, + "step": 2335 + }, + { + "epoch": 0.20484798493893774, + "grad_norm": 0.06005859375, + "learning_rate": 0.0028916072844791378, + "loss": 1.3283, + "step": 2336 + }, + { + "epoch": 0.2049356767133123, + "grad_norm": 0.06640625, + "learning_rate": 0.002891447170406671, + "loss": 1.2072, + "step": 2337 + }, + { + "epoch": 0.20502336848768685, + "grad_norm": 0.06201171875, + "learning_rate": 0.0028912869431174278, + "loss": 1.2582, + "step": 2338 + }, + { + "epoch": 0.20511106026206138, + "grad_norm": 0.0625, + "learning_rate": 0.002891126602626019, + "loss": 1.2377, + "step": 2339 + }, + { + "epoch": 0.20519875203643592, + "grad_norm": 0.078125, + "learning_rate": 0.0028909661489470682, + "loss": 1.2506, + "step": 2340 + }, + { + "epoch": 0.2052864438108105, + "grad_norm": 0.06640625, + "learning_rate": 0.002890805582095209, + "loss": 1.1812, + "step": 2341 + }, + { + "epoch": 0.20537413558518502, + "grad_norm": 0.1005859375, + "learning_rate": 0.002890644902085083, + "loss": 1.2333, + "step": 2342 + }, + { + "epoch": 0.20546182735955956, + "grad_norm": 0.052734375, + "learning_rate": 0.0028904841089313444, + "loss": 1.2214, + "step": 2343 + }, + { + "epoch": 0.2055495191339341, + "grad_norm": 0.10107421875, + "learning_rate": 0.0028903232026486576, + "loss": 1.2293, + "step": 2344 + }, + { + "epoch": 0.20563721090830867, + "grad_norm": 0.0791015625, + "learning_rate": 0.0028901621832516967, + "loss": 1.2232, + "step": 2345 + }, + { + "epoch": 0.2057249026826832, + "grad_norm": 0.056396484375, + "learning_rate": 0.0028900010507551463, + "loss": 1.2796, + "step": 2346 + }, + { + "epoch": 0.20581259445705774, + "grad_norm": 0.06201171875, + "learning_rate": 0.002889839805173701, + "loss": 1.2772, + "step": 2347 + }, + { + "epoch": 0.2059002862314323, + "grad_norm": 0.08447265625, + "learning_rate": 0.002889678446522066, + "loss": 1.17, + "step": 2348 + }, + { + "epoch": 0.20598797800580684, + "grad_norm": 0.080078125, + "learning_rate": 0.002889516974814957, + "loss": 1.1554, + "step": 2349 + }, + { + "epoch": 0.20607566978018138, + "grad_norm": 0.05859375, + "learning_rate": 0.0028893553900670994, + "loss": 1.2177, + "step": 2350 + }, + { + "epoch": 0.20616336155455592, + "grad_norm": 0.0751953125, + "learning_rate": 0.00288919369229323, + "loss": 1.2588, + "step": 2351 + }, + { + "epoch": 0.20625105332893048, + "grad_norm": 0.0693359375, + "learning_rate": 0.0028890318815080946, + "loss": 1.2717, + "step": 2352 + }, + { + "epoch": 0.20633874510330502, + "grad_norm": 0.11767578125, + "learning_rate": 0.0028888699577264503, + "loss": 1.2739, + "step": 2353 + }, + { + "epoch": 0.20642643687767956, + "grad_norm": 0.06298828125, + "learning_rate": 0.002888707920963064, + "loss": 1.2117, + "step": 2354 + }, + { + "epoch": 0.20651412865205412, + "grad_norm": 0.0849609375, + "learning_rate": 0.0028885457712327133, + "loss": 1.2334, + "step": 2355 + }, + { + "epoch": 0.20660182042642866, + "grad_norm": 0.099609375, + "learning_rate": 0.002888383508550185, + "loss": 1.2427, + "step": 2356 + }, + { + "epoch": 0.2066895122008032, + "grad_norm": 0.0556640625, + "learning_rate": 0.0028882211329302773, + "loss": 1.2333, + "step": 2357 + }, + { + "epoch": 0.20677720397517774, + "grad_norm": 0.072265625, + "learning_rate": 0.0028880586443878, + "loss": 1.2249, + "step": 2358 + }, + { + "epoch": 0.2068648957495523, + "grad_norm": 0.076171875, + "learning_rate": 0.002887896042937569, + "loss": 1.2365, + "step": 2359 + }, + { + "epoch": 0.20695258752392684, + "grad_norm": 0.07275390625, + "learning_rate": 0.0028877333285944155, + "loss": 1.2562, + "step": 2360 + }, + { + "epoch": 0.20704027929830138, + "grad_norm": 0.076171875, + "learning_rate": 0.0028875705013731764, + "loss": 1.1434, + "step": 2361 + }, + { + "epoch": 0.20712797107267591, + "grad_norm": 0.06103515625, + "learning_rate": 0.0028874075612887037, + "loss": 1.2028, + "step": 2362 + }, + { + "epoch": 0.20721566284705048, + "grad_norm": 0.0751953125, + "learning_rate": 0.002887244508355855, + "loss": 1.158, + "step": 2363 + }, + { + "epoch": 0.20730335462142502, + "grad_norm": 0.058349609375, + "learning_rate": 0.0028870813425895016, + "loss": 1.1906, + "step": 2364 + }, + { + "epoch": 0.20739104639579956, + "grad_norm": 0.07421875, + "learning_rate": 0.002886918064004523, + "loss": 1.2418, + "step": 2365 + }, + { + "epoch": 0.20747873817017412, + "grad_norm": 0.08154296875, + "learning_rate": 0.0028867546726158095, + "loss": 1.2573, + "step": 2366 + }, + { + "epoch": 0.20756642994454866, + "grad_norm": 0.09033203125, + "learning_rate": 0.0028865911684382633, + "loss": 1.208, + "step": 2367 + }, + { + "epoch": 0.2076541217189232, + "grad_norm": 0.09375, + "learning_rate": 0.0028864275514867946, + "loss": 1.228, + "step": 2368 + }, + { + "epoch": 0.20774181349329773, + "grad_norm": 0.142578125, + "learning_rate": 0.0028862638217763245, + "loss": 1.2732, + "step": 2369 + }, + { + "epoch": 0.2078295052676723, + "grad_norm": 0.0625, + "learning_rate": 0.0028860999793217856, + "loss": 1.1994, + "step": 2370 + }, + { + "epoch": 0.20791719704204684, + "grad_norm": 0.11279296875, + "learning_rate": 0.00288593602413812, + "loss": 1.2048, + "step": 2371 + }, + { + "epoch": 0.20800488881642137, + "grad_norm": 0.0654296875, + "learning_rate": 0.0028857719562402793, + "loss": 1.3161, + "step": 2372 + }, + { + "epoch": 0.20809258059079594, + "grad_norm": 0.09814453125, + "learning_rate": 0.002885607775643226, + "loss": 1.2285, + "step": 2373 + }, + { + "epoch": 0.20818027236517048, + "grad_norm": 0.0859375, + "learning_rate": 0.0028854434823619335, + "loss": 1.2331, + "step": 2374 + }, + { + "epoch": 0.20826796413954501, + "grad_norm": 0.08056640625, + "learning_rate": 0.0028852790764113847, + "loss": 1.2045, + "step": 2375 + }, + { + "epoch": 0.20835565591391955, + "grad_norm": 0.09228515625, + "learning_rate": 0.002885114557806573, + "loss": 1.1751, + "step": 2376 + }, + { + "epoch": 0.20844334768829412, + "grad_norm": 0.0869140625, + "learning_rate": 0.002884949926562502, + "loss": 1.2258, + "step": 2377 + }, + { + "epoch": 0.20853103946266865, + "grad_norm": 0.08935546875, + "learning_rate": 0.0028847851826941853, + "loss": 1.1964, + "step": 2378 + }, + { + "epoch": 0.2086187312370432, + "grad_norm": 0.146484375, + "learning_rate": 0.0028846203262166484, + "loss": 1.2436, + "step": 2379 + }, + { + "epoch": 0.20870642301141773, + "grad_norm": 0.07958984375, + "learning_rate": 0.0028844553571449245, + "loss": 1.2333, + "step": 2380 + }, + { + "epoch": 0.2087941147857923, + "grad_norm": 0.146484375, + "learning_rate": 0.002884290275494059, + "loss": 1.2614, + "step": 2381 + }, + { + "epoch": 0.20888180656016683, + "grad_norm": 0.06396484375, + "learning_rate": 0.002884125081279106, + "loss": 1.1986, + "step": 2382 + }, + { + "epoch": 0.20896949833454137, + "grad_norm": 0.0869140625, + "learning_rate": 0.002883959774515132, + "loss": 1.2024, + "step": 2383 + }, + { + "epoch": 0.20905719010891594, + "grad_norm": 0.08056640625, + "learning_rate": 0.0028837943552172114, + "loss": 1.2517, + "step": 2384 + }, + { + "epoch": 0.20914488188329047, + "grad_norm": 0.10595703125, + "learning_rate": 0.002883628823400431, + "loss": 1.1813, + "step": 2385 + }, + { + "epoch": 0.209232573657665, + "grad_norm": 0.072265625, + "learning_rate": 0.002883463179079887, + "loss": 1.2123, + "step": 2386 + }, + { + "epoch": 0.20932026543203955, + "grad_norm": 0.07373046875, + "learning_rate": 0.0028832974222706845, + "loss": 1.1983, + "step": 2387 + }, + { + "epoch": 0.2094079572064141, + "grad_norm": 0.07861328125, + "learning_rate": 0.002883131552987941, + "loss": 1.2064, + "step": 2388 + }, + { + "epoch": 0.20949564898078865, + "grad_norm": 0.11279296875, + "learning_rate": 0.0028829655712467837, + "loss": 1.206, + "step": 2389 + }, + { + "epoch": 0.2095833407551632, + "grad_norm": 0.10009765625, + "learning_rate": 0.002882799477062349, + "loss": 1.208, + "step": 2390 + }, + { + "epoch": 0.20967103252953775, + "grad_norm": 0.10400390625, + "learning_rate": 0.002882633270449784, + "loss": 1.2076, + "step": 2391 + }, + { + "epoch": 0.2097587243039123, + "grad_norm": 0.07275390625, + "learning_rate": 0.002882466951424247, + "loss": 1.2018, + "step": 2392 + }, + { + "epoch": 0.20984641607828683, + "grad_norm": 0.10205078125, + "learning_rate": 0.0028823005200009056, + "loss": 1.1516, + "step": 2393 + }, + { + "epoch": 0.20993410785266137, + "grad_norm": 0.0947265625, + "learning_rate": 0.002882133976194938, + "loss": 1.1705, + "step": 2394 + }, + { + "epoch": 0.21002179962703593, + "grad_norm": 0.11279296875, + "learning_rate": 0.0028819673200215327, + "loss": 1.2489, + "step": 2395 + }, + { + "epoch": 0.21010949140141047, + "grad_norm": 0.08740234375, + "learning_rate": 0.002881800551495888, + "loss": 1.1655, + "step": 2396 + }, + { + "epoch": 0.210197183175785, + "grad_norm": 0.07861328125, + "learning_rate": 0.002881633670633213, + "loss": 1.1966, + "step": 2397 + }, + { + "epoch": 0.21028487495015954, + "grad_norm": 0.10693359375, + "learning_rate": 0.002881466677448726, + "loss": 1.2546, + "step": 2398 + }, + { + "epoch": 0.2103725667245341, + "grad_norm": 0.0625, + "learning_rate": 0.0028812995719576574, + "loss": 1.231, + "step": 2399 + }, + { + "epoch": 0.21046025849890865, + "grad_norm": 0.12158203125, + "learning_rate": 0.0028811323541752466, + "loss": 1.3084, + "step": 2400 + }, + { + "epoch": 0.21054795027328319, + "grad_norm": 0.06689453125, + "learning_rate": 0.0028809650241167434, + "loss": 1.2047, + "step": 2401 + }, + { + "epoch": 0.21063564204765775, + "grad_norm": 0.12060546875, + "learning_rate": 0.0028807975817974076, + "loss": 1.2099, + "step": 2402 + }, + { + "epoch": 0.2107233338220323, + "grad_norm": 0.06982421875, + "learning_rate": 0.002880630027232509, + "loss": 1.221, + "step": 2403 + }, + { + "epoch": 0.21081102559640683, + "grad_norm": 0.15625, + "learning_rate": 0.0028804623604373296, + "loss": 1.2288, + "step": 2404 + }, + { + "epoch": 0.21089871737078136, + "grad_norm": 0.07080078125, + "learning_rate": 0.0028802945814271593, + "loss": 1.23, + "step": 2405 + }, + { + "epoch": 0.21098640914515593, + "grad_norm": 0.181640625, + "learning_rate": 0.002880126690217299, + "loss": 1.2156, + "step": 2406 + }, + { + "epoch": 0.21107410091953047, + "grad_norm": 0.06982421875, + "learning_rate": 0.00287995868682306, + "loss": 1.1559, + "step": 2407 + }, + { + "epoch": 0.211161792693905, + "grad_norm": 0.1259765625, + "learning_rate": 0.0028797905712597636, + "loss": 1.1611, + "step": 2408 + }, + { + "epoch": 0.21124948446827957, + "grad_norm": 0.2197265625, + "learning_rate": 0.0028796223435427425, + "loss": 1.2363, + "step": 2409 + }, + { + "epoch": 0.2113371762426541, + "grad_norm": 0.1640625, + "learning_rate": 0.002879454003687337, + "loss": 1.2284, + "step": 2410 + }, + { + "epoch": 0.21142486801702864, + "grad_norm": 0.1728515625, + "learning_rate": 0.0028792855517089013, + "loss": 1.1585, + "step": 2411 + }, + { + "epoch": 0.21151255979140318, + "grad_norm": 0.07080078125, + "learning_rate": 0.0028791169876227958, + "loss": 1.2513, + "step": 2412 + }, + { + "epoch": 0.21160025156577775, + "grad_norm": 0.130859375, + "learning_rate": 0.0028789483114443942, + "loss": 1.1754, + "step": 2413 + }, + { + "epoch": 0.21168794334015228, + "grad_norm": 0.0927734375, + "learning_rate": 0.002878779523189079, + "loss": 1.2413, + "step": 2414 + }, + { + "epoch": 0.21177563511452682, + "grad_norm": 0.1298828125, + "learning_rate": 0.0028786106228722436, + "loss": 1.2058, + "step": 2415 + }, + { + "epoch": 0.21186332688890136, + "grad_norm": 0.103515625, + "learning_rate": 0.002878441610509291, + "loss": 1.2768, + "step": 2416 + }, + { + "epoch": 0.21195101866327593, + "grad_norm": 0.12353515625, + "learning_rate": 0.002878272486115635, + "loss": 1.2236, + "step": 2417 + }, + { + "epoch": 0.21203871043765046, + "grad_norm": 0.1572265625, + "learning_rate": 0.002878103249706699, + "loss": 1.1947, + "step": 2418 + }, + { + "epoch": 0.212126402212025, + "grad_norm": 0.0986328125, + "learning_rate": 0.0028779339012979166, + "loss": 1.238, + "step": 2419 + }, + { + "epoch": 0.21221409398639957, + "grad_norm": 0.12158203125, + "learning_rate": 0.002877764440904732, + "loss": 1.1973, + "step": 2420 + }, + { + "epoch": 0.2123017857607741, + "grad_norm": 0.06396484375, + "learning_rate": 0.0028775948685426, + "loss": 1.2345, + "step": 2421 + }, + { + "epoch": 0.21238947753514864, + "grad_norm": 0.07861328125, + "learning_rate": 0.0028774251842269855, + "loss": 1.2017, + "step": 2422 + }, + { + "epoch": 0.21247716930952318, + "grad_norm": 0.0771484375, + "learning_rate": 0.002877255387973362, + "loss": 1.2343, + "step": 2423 + }, + { + "epoch": 0.21256486108389774, + "grad_norm": 0.060791015625, + "learning_rate": 0.0028770854797972164, + "loss": 1.2073, + "step": 2424 + }, + { + "epoch": 0.21265255285827228, + "grad_norm": 0.09765625, + "learning_rate": 0.002876915459714042, + "loss": 1.1852, + "step": 2425 + }, + { + "epoch": 0.21274024463264682, + "grad_norm": 0.062255859375, + "learning_rate": 0.0028767453277393448, + "loss": 1.2246, + "step": 2426 + }, + { + "epoch": 0.21282793640702138, + "grad_norm": 0.0625, + "learning_rate": 0.0028765750838886414, + "loss": 1.1585, + "step": 2427 + }, + { + "epoch": 0.21291562818139592, + "grad_norm": 0.0859375, + "learning_rate": 0.0028764047281774558, + "loss": 1.1536, + "step": 2428 + }, + { + "epoch": 0.21300331995577046, + "grad_norm": 0.09716796875, + "learning_rate": 0.0028762342606213254, + "loss": 1.2755, + "step": 2429 + }, + { + "epoch": 0.213091011730145, + "grad_norm": 0.06298828125, + "learning_rate": 0.0028760636812357955, + "loss": 1.2515, + "step": 2430 + }, + { + "epoch": 0.21317870350451956, + "grad_norm": 0.1044921875, + "learning_rate": 0.0028758929900364236, + "loss": 1.2335, + "step": 2431 + }, + { + "epoch": 0.2132663952788941, + "grad_norm": 0.07763671875, + "learning_rate": 0.002875722187038775, + "loss": 1.2292, + "step": 2432 + }, + { + "epoch": 0.21335408705326864, + "grad_norm": 0.142578125, + "learning_rate": 0.0028755512722584277, + "loss": 1.2493, + "step": 2433 + }, + { + "epoch": 0.21344177882764317, + "grad_norm": 0.07861328125, + "learning_rate": 0.002875380245710968, + "loss": 1.2574, + "step": 2434 + }, + { + "epoch": 0.21352947060201774, + "grad_norm": 0.08154296875, + "learning_rate": 0.002875209107411993, + "loss": 1.3211, + "step": 2435 + }, + { + "epoch": 0.21361716237639228, + "grad_norm": 0.0830078125, + "learning_rate": 0.0028750378573771103, + "loss": 1.1894, + "step": 2436 + }, + { + "epoch": 0.21370485415076682, + "grad_norm": 0.06201171875, + "learning_rate": 0.0028748664956219374, + "loss": 1.1815, + "step": 2437 + }, + { + "epoch": 0.21379254592514138, + "grad_norm": 0.062255859375, + "learning_rate": 0.002874695022162102, + "loss": 1.2064, + "step": 2438 + }, + { + "epoch": 0.21388023769951592, + "grad_norm": 0.05908203125, + "learning_rate": 0.0028745234370132424, + "loss": 1.2104, + "step": 2439 + }, + { + "epoch": 0.21396792947389046, + "grad_norm": 0.07470703125, + "learning_rate": 0.0028743517401910064, + "loss": 1.2221, + "step": 2440 + }, + { + "epoch": 0.214055621248265, + "grad_norm": 0.056396484375, + "learning_rate": 0.0028741799317110518, + "loss": 1.2286, + "step": 2441 + }, + { + "epoch": 0.21414331302263956, + "grad_norm": 0.07861328125, + "learning_rate": 0.002874008011589048, + "loss": 1.2943, + "step": 2442 + }, + { + "epoch": 0.2142310047970141, + "grad_norm": 0.06640625, + "learning_rate": 0.0028738359798406734, + "loss": 1.2198, + "step": 2443 + }, + { + "epoch": 0.21431869657138863, + "grad_norm": 0.06884765625, + "learning_rate": 0.0028736638364816164, + "loss": 1.2397, + "step": 2444 + }, + { + "epoch": 0.2144063883457632, + "grad_norm": 0.078125, + "learning_rate": 0.0028734915815275767, + "loss": 1.2015, + "step": 2445 + }, + { + "epoch": 0.21449408012013774, + "grad_norm": 0.0703125, + "learning_rate": 0.0028733192149942633, + "loss": 1.245, + "step": 2446 + }, + { + "epoch": 0.21458177189451227, + "grad_norm": 0.07568359375, + "learning_rate": 0.0028731467368973956, + "loss": 1.2822, + "step": 2447 + }, + { + "epoch": 0.2146694636688868, + "grad_norm": 0.06884765625, + "learning_rate": 0.0028729741472527026, + "loss": 1.243, + "step": 2448 + }, + { + "epoch": 0.21475715544326138, + "grad_norm": 0.07275390625, + "learning_rate": 0.0028728014460759244, + "loss": 1.2323, + "step": 2449 + }, + { + "epoch": 0.21484484721763591, + "grad_norm": 0.06640625, + "learning_rate": 0.0028726286333828114, + "loss": 1.2198, + "step": 2450 + }, + { + "epoch": 0.21493253899201045, + "grad_norm": 0.07763671875, + "learning_rate": 0.002872455709189123, + "loss": 1.2101, + "step": 2451 + }, + { + "epoch": 0.215020230766385, + "grad_norm": 0.064453125, + "learning_rate": 0.00287228267351063, + "loss": 1.2838, + "step": 2452 + }, + { + "epoch": 0.21510792254075956, + "grad_norm": 0.1328125, + "learning_rate": 0.002872109526363112, + "loss": 1.2129, + "step": 2453 + }, + { + "epoch": 0.2151956143151341, + "grad_norm": 0.08740234375, + "learning_rate": 0.0028719362677623603, + "loss": 1.1734, + "step": 2454 + }, + { + "epoch": 0.21528330608950863, + "grad_norm": 0.162109375, + "learning_rate": 0.0028717628977241753, + "loss": 1.244, + "step": 2455 + }, + { + "epoch": 0.2153709978638832, + "grad_norm": 0.060791015625, + "learning_rate": 0.0028715894162643675, + "loss": 1.2126, + "step": 2456 + }, + { + "epoch": 0.21545868963825773, + "grad_norm": 0.1796875, + "learning_rate": 0.0028714158233987593, + "loss": 1.2491, + "step": 2457 + }, + { + "epoch": 0.21554638141263227, + "grad_norm": 0.07861328125, + "learning_rate": 0.002871242119143181, + "loss": 1.1801, + "step": 2458 + }, + { + "epoch": 0.2156340731870068, + "grad_norm": 0.1953125, + "learning_rate": 0.002871068303513474, + "loss": 1.2637, + "step": 2459 + }, + { + "epoch": 0.21572176496138137, + "grad_norm": 0.091796875, + "learning_rate": 0.0028708943765254893, + "loss": 1.2972, + "step": 2460 + }, + { + "epoch": 0.2158094567357559, + "grad_norm": 0.2109375, + "learning_rate": 0.0028707203381950893, + "loss": 1.2826, + "step": 2461 + }, + { + "epoch": 0.21589714851013045, + "grad_norm": 0.0908203125, + "learning_rate": 0.0028705461885381463, + "loss": 1.2108, + "step": 2462 + }, + { + "epoch": 0.21598484028450501, + "grad_norm": 0.177734375, + "learning_rate": 0.0028703719275705412, + "loss": 1.1843, + "step": 2463 + }, + { + "epoch": 0.21607253205887955, + "grad_norm": 0.126953125, + "learning_rate": 0.002870197555308167, + "loss": 1.2419, + "step": 2464 + }, + { + "epoch": 0.2161602238332541, + "grad_norm": 0.1689453125, + "learning_rate": 0.0028700230717669255, + "loss": 1.2047, + "step": 2465 + }, + { + "epoch": 0.21624791560762863, + "grad_norm": 0.1298828125, + "learning_rate": 0.0028698484769627297, + "loss": 1.2018, + "step": 2466 + }, + { + "epoch": 0.2163356073820032, + "grad_norm": 0.11669921875, + "learning_rate": 0.0028696737709115014, + "loss": 1.2248, + "step": 2467 + }, + { + "epoch": 0.21642329915637773, + "grad_norm": 0.095703125, + "learning_rate": 0.002869498953629174, + "loss": 1.2404, + "step": 2468 + }, + { + "epoch": 0.21651099093075227, + "grad_norm": 0.07080078125, + "learning_rate": 0.00286932402513169, + "loss": 1.2839, + "step": 2469 + }, + { + "epoch": 0.2165986827051268, + "grad_norm": 0.08740234375, + "learning_rate": 0.002869148985435003, + "loss": 1.1919, + "step": 2470 + }, + { + "epoch": 0.21668637447950137, + "grad_norm": 0.06640625, + "learning_rate": 0.002868973834555075, + "loss": 1.2441, + "step": 2471 + }, + { + "epoch": 0.2167740662538759, + "grad_norm": 0.059814453125, + "learning_rate": 0.0028687985725078806, + "loss": 1.2368, + "step": 2472 + }, + { + "epoch": 0.21686175802825045, + "grad_norm": 0.0634765625, + "learning_rate": 0.0028686231993094026, + "loss": 1.1671, + "step": 2473 + }, + { + "epoch": 0.216949449802625, + "grad_norm": 0.0625, + "learning_rate": 0.0028684477149756344, + "loss": 1.2183, + "step": 2474 + }, + { + "epoch": 0.21703714157699955, + "grad_norm": 0.06298828125, + "learning_rate": 0.0028682721195225805, + "loss": 1.2049, + "step": 2475 + }, + { + "epoch": 0.21712483335137409, + "grad_norm": 0.091796875, + "learning_rate": 0.0028680964129662537, + "loss": 1.191, + "step": 2476 + }, + { + "epoch": 0.21721252512574862, + "grad_norm": 0.06787109375, + "learning_rate": 0.002867920595322679, + "loss": 1.1428, + "step": 2477 + }, + { + "epoch": 0.2173002169001232, + "grad_norm": 0.142578125, + "learning_rate": 0.00286774466660789, + "loss": 1.1859, + "step": 2478 + }, + { + "epoch": 0.21738790867449773, + "grad_norm": 0.0625, + "learning_rate": 0.0028675686268379305, + "loss": 1.1801, + "step": 2479 + }, + { + "epoch": 0.21747560044887226, + "grad_norm": 0.130859375, + "learning_rate": 0.002867392476028856, + "loss": 1.2237, + "step": 2480 + }, + { + "epoch": 0.21756329222324683, + "grad_norm": 0.0966796875, + "learning_rate": 0.00286721621419673, + "loss": 1.1939, + "step": 2481 + }, + { + "epoch": 0.21765098399762137, + "grad_norm": 0.08837890625, + "learning_rate": 0.0028670398413576277, + "loss": 1.2146, + "step": 2482 + }, + { + "epoch": 0.2177386757719959, + "grad_norm": 0.13671875, + "learning_rate": 0.0028668633575276332, + "loss": 1.2449, + "step": 2483 + }, + { + "epoch": 0.21782636754637044, + "grad_norm": 0.057861328125, + "learning_rate": 0.002866686762722842, + "loss": 1.2002, + "step": 2484 + }, + { + "epoch": 0.217914059320745, + "grad_norm": 0.1044921875, + "learning_rate": 0.0028665100569593596, + "loss": 1.2648, + "step": 2485 + }, + { + "epoch": 0.21800175109511954, + "grad_norm": 0.060791015625, + "learning_rate": 0.0028663332402533, + "loss": 1.2024, + "step": 2486 + }, + { + "epoch": 0.21808944286949408, + "grad_norm": 0.06884765625, + "learning_rate": 0.0028661563126207885, + "loss": 1.2726, + "step": 2487 + }, + { + "epoch": 0.21817713464386862, + "grad_norm": 0.06640625, + "learning_rate": 0.0028659792740779607, + "loss": 1.2363, + "step": 2488 + }, + { + "epoch": 0.21826482641824319, + "grad_norm": 0.08154296875, + "learning_rate": 0.0028658021246409627, + "loss": 1.2317, + "step": 2489 + }, + { + "epoch": 0.21835251819261772, + "grad_norm": 0.11279296875, + "learning_rate": 0.002865624864325949, + "loss": 1.2804, + "step": 2490 + }, + { + "epoch": 0.21844020996699226, + "grad_norm": 0.10595703125, + "learning_rate": 0.0028654474931490864, + "loss": 1.19, + "step": 2491 + }, + { + "epoch": 0.21852790174136683, + "grad_norm": 0.10400390625, + "learning_rate": 0.0028652700111265494, + "loss": 1.2135, + "step": 2492 + }, + { + "epoch": 0.21861559351574136, + "grad_norm": 0.05859375, + "learning_rate": 0.002865092418274525, + "loss": 1.2374, + "step": 2493 + }, + { + "epoch": 0.2187032852901159, + "grad_norm": 0.0830078125, + "learning_rate": 0.0028649147146092087, + "loss": 1.2551, + "step": 2494 + }, + { + "epoch": 0.21879097706449044, + "grad_norm": 0.062255859375, + "learning_rate": 0.0028647369001468066, + "loss": 1.2747, + "step": 2495 + }, + { + "epoch": 0.218878668838865, + "grad_norm": 0.1123046875, + "learning_rate": 0.002864558974903535, + "loss": 1.2038, + "step": 2496 + }, + { + "epoch": 0.21896636061323954, + "grad_norm": 0.0625, + "learning_rate": 0.0028643809388956202, + "loss": 1.2603, + "step": 2497 + }, + { + "epoch": 0.21905405238761408, + "grad_norm": 0.07275390625, + "learning_rate": 0.0028642027921392996, + "loss": 1.225, + "step": 2498 + }, + { + "epoch": 0.21914174416198862, + "grad_norm": 0.07763671875, + "learning_rate": 0.0028640245346508177, + "loss": 1.2936, + "step": 2499 + }, + { + "epoch": 0.21922943593636318, + "grad_norm": 0.09228515625, + "learning_rate": 0.0028638461664464328, + "loss": 1.2442, + "step": 2500 + }, + { + "epoch": 0.21922943593636318, + "eval_loss": 1.2314904928207397, + "eval_runtime": 428.341, + "eval_samples_per_second": 33.728, + "eval_steps_per_second": 8.433, + "step": 2500 + }, + { + "epoch": 0.21931712771073772, + "grad_norm": 0.0693359375, + "learning_rate": 0.002863667687542411, + "loss": 1.2263, + "step": 2501 + }, + { + "epoch": 0.21940481948511226, + "grad_norm": 0.0634765625, + "learning_rate": 0.0028634890979550285, + "loss": 1.2238, + "step": 2502 + }, + { + "epoch": 0.21949251125948682, + "grad_norm": 0.09326171875, + "learning_rate": 0.002863310397700574, + "loss": 1.1454, + "step": 2503 + }, + { + "epoch": 0.21958020303386136, + "grad_norm": 0.70703125, + "learning_rate": 0.0028631315867953424, + "loss": 1.1863, + "step": 2504 + }, + { + "epoch": 0.2196678948082359, + "grad_norm": 0.08203125, + "learning_rate": 0.0028629526652556424, + "loss": 1.2242, + "step": 2505 + }, + { + "epoch": 0.21975558658261043, + "grad_norm": 0.07080078125, + "learning_rate": 0.0028627736330977902, + "loss": 1.2066, + "step": 2506 + }, + { + "epoch": 0.219843278356985, + "grad_norm": 0.1357421875, + "learning_rate": 0.0028625944903381134, + "loss": 1.1913, + "step": 2507 + }, + { + "epoch": 0.21993097013135954, + "grad_norm": 0.087890625, + "learning_rate": 0.0028624152369929492, + "loss": 1.2735, + "step": 2508 + }, + { + "epoch": 0.22001866190573408, + "grad_norm": 0.099609375, + "learning_rate": 0.0028622358730786457, + "loss": 1.1731, + "step": 2509 + }, + { + "epoch": 0.22010635368010864, + "grad_norm": 0.08740234375, + "learning_rate": 0.00286205639861156, + "loss": 1.1977, + "step": 2510 + }, + { + "epoch": 0.22019404545448318, + "grad_norm": 0.080078125, + "learning_rate": 0.0028618768136080596, + "loss": 1.2318, + "step": 2511 + }, + { + "epoch": 0.22028173722885772, + "grad_norm": 0.1005859375, + "learning_rate": 0.0028616971180845216, + "loss": 1.2453, + "step": 2512 + }, + { + "epoch": 0.22036942900323225, + "grad_norm": 0.07763671875, + "learning_rate": 0.002861517312057335, + "loss": 1.2301, + "step": 2513 + }, + { + "epoch": 0.22045712077760682, + "grad_norm": 0.07568359375, + "learning_rate": 0.0028613373955428964, + "loss": 1.238, + "step": 2514 + }, + { + "epoch": 0.22054481255198136, + "grad_norm": 0.078125, + "learning_rate": 0.002861157368557615, + "loss": 1.2046, + "step": 2515 + }, + { + "epoch": 0.2206325043263559, + "grad_norm": 0.0615234375, + "learning_rate": 0.0028609772311179085, + "loss": 1.2274, + "step": 2516 + }, + { + "epoch": 0.22072019610073043, + "grad_norm": 0.06005859375, + "learning_rate": 0.002860796983240204, + "loss": 1.1724, + "step": 2517 + }, + { + "epoch": 0.220807887875105, + "grad_norm": 0.0693359375, + "learning_rate": 0.0028606166249409404, + "loss": 1.2101, + "step": 2518 + }, + { + "epoch": 0.22089557964947953, + "grad_norm": 0.06298828125, + "learning_rate": 0.002860436156236566, + "loss": 1.1699, + "step": 2519 + }, + { + "epoch": 0.22098327142385407, + "grad_norm": 0.0966796875, + "learning_rate": 0.0028602555771435386, + "loss": 1.226, + "step": 2520 + }, + { + "epoch": 0.22107096319822864, + "grad_norm": 0.06298828125, + "learning_rate": 0.002860074887678327, + "loss": 1.2821, + "step": 2521 + }, + { + "epoch": 0.22115865497260317, + "grad_norm": 0.06298828125, + "learning_rate": 0.0028598940878574097, + "loss": 1.1798, + "step": 2522 + }, + { + "epoch": 0.2212463467469777, + "grad_norm": 0.060546875, + "learning_rate": 0.0028597131776972743, + "loss": 1.2217, + "step": 2523 + }, + { + "epoch": 0.22133403852135225, + "grad_norm": 0.09130859375, + "learning_rate": 0.0028595321572144204, + "loss": 1.2743, + "step": 2524 + }, + { + "epoch": 0.22142173029572682, + "grad_norm": 0.0986328125, + "learning_rate": 0.002859351026425356, + "loss": 1.1935, + "step": 2525 + }, + { + "epoch": 0.22150942207010135, + "grad_norm": 0.0625, + "learning_rate": 0.0028591697853466, + "loss": 1.2026, + "step": 2526 + }, + { + "epoch": 0.2215971138444759, + "grad_norm": 0.078125, + "learning_rate": 0.002858988433994681, + "loss": 1.2264, + "step": 2527 + }, + { + "epoch": 0.22168480561885046, + "grad_norm": 0.064453125, + "learning_rate": 0.002858806972386138, + "loss": 1.1948, + "step": 2528 + }, + { + "epoch": 0.221772497393225, + "grad_norm": 0.09716796875, + "learning_rate": 0.0028586254005375194, + "loss": 1.2117, + "step": 2529 + }, + { + "epoch": 0.22186018916759953, + "grad_norm": 0.059814453125, + "learning_rate": 0.0028584437184653844, + "loss": 1.247, + "step": 2530 + }, + { + "epoch": 0.22194788094197407, + "grad_norm": 0.060546875, + "learning_rate": 0.0028582619261863017, + "loss": 1.1657, + "step": 2531 + }, + { + "epoch": 0.22203557271634863, + "grad_norm": 0.0634765625, + "learning_rate": 0.0028580800237168504, + "loss": 1.2557, + "step": 2532 + }, + { + "epoch": 0.22212326449072317, + "grad_norm": 0.060791015625, + "learning_rate": 0.0028578980110736197, + "loss": 1.1681, + "step": 2533 + }, + { + "epoch": 0.2222109562650977, + "grad_norm": 0.07666015625, + "learning_rate": 0.0028577158882732087, + "loss": 1.2274, + "step": 2534 + }, + { + "epoch": 0.22229864803947225, + "grad_norm": 0.07177734375, + "learning_rate": 0.0028575336553322266, + "loss": 1.2279, + "step": 2535 + }, + { + "epoch": 0.2223863398138468, + "grad_norm": 0.060546875, + "learning_rate": 0.002857351312267292, + "loss": 1.2491, + "step": 2536 + }, + { + "epoch": 0.22247403158822135, + "grad_norm": 0.0849609375, + "learning_rate": 0.0028571688590950345, + "loss": 1.2769, + "step": 2537 + }, + { + "epoch": 0.2225617233625959, + "grad_norm": 0.06884765625, + "learning_rate": 0.0028569862958320934, + "loss": 1.1647, + "step": 2538 + }, + { + "epoch": 0.22264941513697045, + "grad_norm": 0.058837890625, + "learning_rate": 0.0028568036224951177, + "loss": 1.2185, + "step": 2539 + }, + { + "epoch": 0.222737106911345, + "grad_norm": 0.06884765625, + "learning_rate": 0.0028566208391007674, + "loss": 1.2816, + "step": 2540 + }, + { + "epoch": 0.22282479868571953, + "grad_norm": 0.08203125, + "learning_rate": 0.002856437945665711, + "loss": 1.2133, + "step": 2541 + }, + { + "epoch": 0.22291249046009406, + "grad_norm": 0.07958984375, + "learning_rate": 0.002856254942206629, + "loss": 1.24, + "step": 2542 + }, + { + "epoch": 0.22300018223446863, + "grad_norm": 0.1337890625, + "learning_rate": 0.0028560718287402097, + "loss": 1.1865, + "step": 2543 + }, + { + "epoch": 0.22308787400884317, + "grad_norm": 0.076171875, + "learning_rate": 0.002855888605283153, + "loss": 1.2732, + "step": 2544 + }, + { + "epoch": 0.2231755657832177, + "grad_norm": 0.1474609375, + "learning_rate": 0.002855705271852169, + "loss": 1.179, + "step": 2545 + }, + { + "epoch": 0.22326325755759227, + "grad_norm": 0.0693359375, + "learning_rate": 0.002855521828463976, + "loss": 1.211, + "step": 2546 + }, + { + "epoch": 0.2233509493319668, + "grad_norm": 0.10205078125, + "learning_rate": 0.002855338275135305, + "loss": 1.1939, + "step": 2547 + }, + { + "epoch": 0.22343864110634135, + "grad_norm": 0.0712890625, + "learning_rate": 0.002855154611882894, + "loss": 1.2526, + "step": 2548 + }, + { + "epoch": 0.22352633288071588, + "grad_norm": 0.126953125, + "learning_rate": 0.0028549708387234944, + "loss": 1.267, + "step": 2549 + }, + { + "epoch": 0.22361402465509045, + "grad_norm": 0.11083984375, + "learning_rate": 0.0028547869556738645, + "loss": 1.2507, + "step": 2550 + }, + { + "epoch": 0.223701716429465, + "grad_norm": 0.08935546875, + "learning_rate": 0.0028546029627507744, + "loss": 1.256, + "step": 2551 + }, + { + "epoch": 0.22378940820383952, + "grad_norm": 0.1044921875, + "learning_rate": 0.0028544188599710038, + "loss": 1.1852, + "step": 2552 + }, + { + "epoch": 0.22387709997821406, + "grad_norm": 0.10546875, + "learning_rate": 0.0028542346473513424, + "loss": 1.2165, + "step": 2553 + }, + { + "epoch": 0.22396479175258863, + "grad_norm": 0.09716796875, + "learning_rate": 0.00285405032490859, + "loss": 1.154, + "step": 2554 + }, + { + "epoch": 0.22405248352696316, + "grad_norm": 0.10693359375, + "learning_rate": 0.0028538658926595558, + "loss": 1.1535, + "step": 2555 + }, + { + "epoch": 0.2241401753013377, + "grad_norm": 0.1015625, + "learning_rate": 0.0028536813506210602, + "loss": 1.2141, + "step": 2556 + }, + { + "epoch": 0.22422786707571227, + "grad_norm": 0.0732421875, + "learning_rate": 0.0028534966988099327, + "loss": 1.2095, + "step": 2557 + }, + { + "epoch": 0.2243155588500868, + "grad_norm": 0.0908203125, + "learning_rate": 0.002853311937243013, + "loss": 1.2091, + "step": 2558 + }, + { + "epoch": 0.22440325062446134, + "grad_norm": 0.08740234375, + "learning_rate": 0.002853127065937151, + "loss": 1.2521, + "step": 2559 + }, + { + "epoch": 0.22449094239883588, + "grad_norm": 0.0927734375, + "learning_rate": 0.0028529420849092066, + "loss": 1.2062, + "step": 2560 + }, + { + "epoch": 0.22457863417321045, + "grad_norm": 0.08349609375, + "learning_rate": 0.0028527569941760493, + "loss": 1.17, + "step": 2561 + }, + { + "epoch": 0.22466632594758498, + "grad_norm": 0.0927734375, + "learning_rate": 0.002852571793754559, + "loss": 1.204, + "step": 2562 + }, + { + "epoch": 0.22475401772195952, + "grad_norm": 0.09375, + "learning_rate": 0.0028523864836616257, + "loss": 1.207, + "step": 2563 + }, + { + "epoch": 0.22484170949633409, + "grad_norm": 0.12353515625, + "learning_rate": 0.002852201063914149, + "loss": 1.1967, + "step": 2564 + }, + { + "epoch": 0.22492940127070862, + "grad_norm": 0.08447265625, + "learning_rate": 0.002852015534529039, + "loss": 1.1718, + "step": 2565 + }, + { + "epoch": 0.22501709304508316, + "grad_norm": 0.1181640625, + "learning_rate": 0.0028518298955232148, + "loss": 1.2645, + "step": 2566 + }, + { + "epoch": 0.2251047848194577, + "grad_norm": 0.123046875, + "learning_rate": 0.0028516441469136067, + "loss": 1.242, + "step": 2567 + }, + { + "epoch": 0.22519247659383226, + "grad_norm": 0.1787109375, + "learning_rate": 0.002851458288717154, + "loss": 1.2146, + "step": 2568 + }, + { + "epoch": 0.2252801683682068, + "grad_norm": 0.07568359375, + "learning_rate": 0.0028512723209508078, + "loss": 1.2159, + "step": 2569 + }, + { + "epoch": 0.22536786014258134, + "grad_norm": 0.1376953125, + "learning_rate": 0.002851086243631527, + "loss": 1.2566, + "step": 2570 + }, + { + "epoch": 0.22545555191695588, + "grad_norm": 0.06591796875, + "learning_rate": 0.0028509000567762807, + "loss": 1.2297, + "step": 2571 + }, + { + "epoch": 0.22554324369133044, + "grad_norm": 0.08935546875, + "learning_rate": 0.00285071376040205, + "loss": 1.2149, + "step": 2572 + }, + { + "epoch": 0.22563093546570498, + "grad_norm": 0.06884765625, + "learning_rate": 0.002850527354525823, + "loss": 1.1809, + "step": 2573 + }, + { + "epoch": 0.22571862724007952, + "grad_norm": 0.09423828125, + "learning_rate": 0.002850340839164601, + "loss": 1.1465, + "step": 2574 + }, + { + "epoch": 0.22580631901445408, + "grad_norm": 0.062255859375, + "learning_rate": 0.002850154214335393, + "loss": 1.1898, + "step": 2575 + }, + { + "epoch": 0.22589401078882862, + "grad_norm": 0.0634765625, + "learning_rate": 0.002849967480055219, + "loss": 1.1575, + "step": 2576 + }, + { + "epoch": 0.22598170256320316, + "grad_norm": 0.09814453125, + "learning_rate": 0.0028497806363411084, + "loss": 1.2273, + "step": 2577 + }, + { + "epoch": 0.2260693943375777, + "grad_norm": 0.06201171875, + "learning_rate": 0.002849593683210101, + "loss": 1.215, + "step": 2578 + }, + { + "epoch": 0.22615708611195226, + "grad_norm": 0.06103515625, + "learning_rate": 0.0028494066206792464, + "loss": 1.249, + "step": 2579 + }, + { + "epoch": 0.2262447778863268, + "grad_norm": 0.08984375, + "learning_rate": 0.0028492194487656046, + "loss": 1.2699, + "step": 2580 + }, + { + "epoch": 0.22633246966070134, + "grad_norm": 0.054931640625, + "learning_rate": 0.0028490321674862445, + "loss": 1.2158, + "step": 2581 + }, + { + "epoch": 0.2264201614350759, + "grad_norm": 0.1103515625, + "learning_rate": 0.0028488447768582463, + "loss": 1.2098, + "step": 2582 + }, + { + "epoch": 0.22650785320945044, + "grad_norm": 0.06787109375, + "learning_rate": 0.0028486572768986985, + "loss": 1.2614, + "step": 2583 + }, + { + "epoch": 0.22659554498382498, + "grad_norm": 0.150390625, + "learning_rate": 0.0028484696676247022, + "loss": 1.2308, + "step": 2584 + }, + { + "epoch": 0.2266832367581995, + "grad_norm": 0.0634765625, + "learning_rate": 0.0028482819490533657, + "loss": 1.2501, + "step": 2585 + }, + { + "epoch": 0.22677092853257408, + "grad_norm": 0.12060546875, + "learning_rate": 0.0028480941212018084, + "loss": 1.2196, + "step": 2586 + }, + { + "epoch": 0.22685862030694862, + "grad_norm": 0.09130859375, + "learning_rate": 0.0028479061840871608, + "loss": 1.2298, + "step": 2587 + }, + { + "epoch": 0.22694631208132315, + "grad_norm": 0.0849609375, + "learning_rate": 0.0028477181377265614, + "loss": 1.2795, + "step": 2588 + }, + { + "epoch": 0.2270340038556977, + "grad_norm": 0.1337890625, + "learning_rate": 0.00284752998213716, + "loss": 1.2464, + "step": 2589 + }, + { + "epoch": 0.22712169563007226, + "grad_norm": 0.1328125, + "learning_rate": 0.002847341717336115, + "loss": 1.2755, + "step": 2590 + }, + { + "epoch": 0.2272093874044468, + "grad_norm": 0.1533203125, + "learning_rate": 0.0028471533433405967, + "loss": 1.1613, + "step": 2591 + }, + { + "epoch": 0.22729707917882133, + "grad_norm": 0.1884765625, + "learning_rate": 0.002846964860167784, + "loss": 1.2249, + "step": 2592 + }, + { + "epoch": 0.2273847709531959, + "grad_norm": 0.08544921875, + "learning_rate": 0.0028467762678348655, + "loss": 1.1909, + "step": 2593 + }, + { + "epoch": 0.22747246272757043, + "grad_norm": 0.0712890625, + "learning_rate": 0.002846587566359041, + "loss": 1.1888, + "step": 2594 + }, + { + "epoch": 0.22756015450194497, + "grad_norm": 0.193359375, + "learning_rate": 0.0028463987557575195, + "loss": 1.2059, + "step": 2595 + }, + { + "epoch": 0.2276478462763195, + "grad_norm": 0.07080078125, + "learning_rate": 0.0028462098360475204, + "loss": 1.2286, + "step": 2596 + }, + { + "epoch": 0.22773553805069408, + "grad_norm": 0.119140625, + "learning_rate": 0.0028460208072462715, + "loss": 1.2356, + "step": 2597 + }, + { + "epoch": 0.2278232298250686, + "grad_norm": 0.107421875, + "learning_rate": 0.002845831669371013, + "loss": 1.1843, + "step": 2598 + }, + { + "epoch": 0.22791092159944315, + "grad_norm": 0.07421875, + "learning_rate": 0.0028456424224389933, + "loss": 1.3018, + "step": 2599 + }, + { + "epoch": 0.22799861337381772, + "grad_norm": 0.140625, + "learning_rate": 0.0028454530664674712, + "loss": 1.213, + "step": 2600 + }, + { + "epoch": 0.22808630514819225, + "grad_norm": 0.06689453125, + "learning_rate": 0.0028452636014737152, + "loss": 1.2077, + "step": 2601 + }, + { + "epoch": 0.2281739969225668, + "grad_norm": 0.109375, + "learning_rate": 0.002845074027475005, + "loss": 1.1707, + "step": 2602 + }, + { + "epoch": 0.22826168869694133, + "grad_norm": 0.05908203125, + "learning_rate": 0.0028448843444886275, + "loss": 1.2423, + "step": 2603 + }, + { + "epoch": 0.2283493804713159, + "grad_norm": 0.11376953125, + "learning_rate": 0.002844694552531883, + "loss": 1.1616, + "step": 2604 + }, + { + "epoch": 0.22843707224569043, + "grad_norm": 0.07080078125, + "learning_rate": 0.0028445046516220795, + "loss": 1.2025, + "step": 2605 + }, + { + "epoch": 0.22852476402006497, + "grad_norm": 0.1201171875, + "learning_rate": 0.0028443146417765353, + "loss": 1.2347, + "step": 2606 + }, + { + "epoch": 0.2286124557944395, + "grad_norm": 0.059326171875, + "learning_rate": 0.0028441245230125785, + "loss": 1.245, + "step": 2607 + }, + { + "epoch": 0.22870014756881407, + "grad_norm": 0.11279296875, + "learning_rate": 0.002843934295347548, + "loss": 1.2798, + "step": 2608 + }, + { + "epoch": 0.2287878393431886, + "grad_norm": 0.095703125, + "learning_rate": 0.002843743958798792, + "loss": 1.2723, + "step": 2609 + }, + { + "epoch": 0.22887553111756315, + "grad_norm": 0.1494140625, + "learning_rate": 0.0028435535133836684, + "loss": 1.2685, + "step": 2610 + }, + { + "epoch": 0.2289632228919377, + "grad_norm": 0.07568359375, + "learning_rate": 0.0028433629591195454, + "loss": 1.1875, + "step": 2611 + }, + { + "epoch": 0.22905091466631225, + "grad_norm": 0.1181640625, + "learning_rate": 0.0028431722960238015, + "loss": 1.2832, + "step": 2612 + }, + { + "epoch": 0.2291386064406868, + "grad_norm": 0.07861328125, + "learning_rate": 0.002842981524113825, + "loss": 1.1799, + "step": 2613 + }, + { + "epoch": 0.22922629821506132, + "grad_norm": 0.1318359375, + "learning_rate": 0.002842790643407012, + "loss": 1.2775, + "step": 2614 + }, + { + "epoch": 0.2293139899894359, + "grad_norm": 0.07763671875, + "learning_rate": 0.0028425996539207716, + "loss": 1.2276, + "step": 2615 + }, + { + "epoch": 0.22940168176381043, + "grad_norm": 0.10009765625, + "learning_rate": 0.0028424085556725217, + "loss": 1.2903, + "step": 2616 + }, + { + "epoch": 0.22948937353818497, + "grad_norm": 0.0888671875, + "learning_rate": 0.00284221734867969, + "loss": 1.1447, + "step": 2617 + }, + { + "epoch": 0.22957706531255953, + "grad_norm": 0.0654296875, + "learning_rate": 0.0028420260329597136, + "loss": 1.2324, + "step": 2618 + }, + { + "epoch": 0.22966475708693407, + "grad_norm": 0.087890625, + "learning_rate": 0.0028418346085300407, + "loss": 1.2355, + "step": 2619 + }, + { + "epoch": 0.2297524488613086, + "grad_norm": 0.08154296875, + "learning_rate": 0.0028416430754081273, + "loss": 1.2618, + "step": 2620 + }, + { + "epoch": 0.22984014063568314, + "grad_norm": 0.1318359375, + "learning_rate": 0.002841451433611442, + "loss": 1.2654, + "step": 2621 + }, + { + "epoch": 0.2299278324100577, + "grad_norm": 0.05859375, + "learning_rate": 0.0028412596831574616, + "loss": 1.2563, + "step": 2622 + }, + { + "epoch": 0.23001552418443225, + "grad_norm": 0.099609375, + "learning_rate": 0.0028410678240636737, + "loss": 1.2498, + "step": 2623 + }, + { + "epoch": 0.23010321595880678, + "grad_norm": 0.087890625, + "learning_rate": 0.0028408758563475745, + "loss": 1.2915, + "step": 2624 + }, + { + "epoch": 0.23019090773318132, + "grad_norm": 0.07080078125, + "learning_rate": 0.002840683780026672, + "loss": 1.1496, + "step": 2625 + }, + { + "epoch": 0.2302785995075559, + "grad_norm": 0.0771484375, + "learning_rate": 0.0028404915951184824, + "loss": 1.1934, + "step": 2626 + }, + { + "epoch": 0.23036629128193042, + "grad_norm": 0.06640625, + "learning_rate": 0.0028402993016405317, + "loss": 1.2236, + "step": 2627 + }, + { + "epoch": 0.23045398305630496, + "grad_norm": 0.060791015625, + "learning_rate": 0.0028401068996103582, + "loss": 1.261, + "step": 2628 + }, + { + "epoch": 0.23054167483067953, + "grad_norm": 0.06298828125, + "learning_rate": 0.0028399143890455074, + "loss": 1.2673, + "step": 2629 + }, + { + "epoch": 0.23062936660505406, + "grad_norm": 0.064453125, + "learning_rate": 0.0028397217699635364, + "loss": 1.1613, + "step": 2630 + }, + { + "epoch": 0.2307170583794286, + "grad_norm": 0.07470703125, + "learning_rate": 0.002839529042382011, + "loss": 1.2489, + "step": 2631 + }, + { + "epoch": 0.23080475015380314, + "grad_norm": 0.09912109375, + "learning_rate": 0.002839336206318508, + "loss": 1.1916, + "step": 2632 + }, + { + "epoch": 0.2308924419281777, + "grad_norm": 0.08740234375, + "learning_rate": 0.0028391432617906123, + "loss": 1.2008, + "step": 2633 + }, + { + "epoch": 0.23098013370255224, + "grad_norm": 0.07421875, + "learning_rate": 0.002838950208815922, + "loss": 1.2168, + "step": 2634 + }, + { + "epoch": 0.23106782547692678, + "grad_norm": 0.0791015625, + "learning_rate": 0.0028387570474120412, + "loss": 1.2238, + "step": 2635 + }, + { + "epoch": 0.23115551725130135, + "grad_norm": 0.06005859375, + "learning_rate": 0.0028385637775965866, + "loss": 1.2041, + "step": 2636 + }, + { + "epoch": 0.23124320902567588, + "grad_norm": 0.0537109375, + "learning_rate": 0.002838370399387184, + "loss": 1.2188, + "step": 2637 + }, + { + "epoch": 0.23133090080005042, + "grad_norm": 0.054931640625, + "learning_rate": 0.002838176912801468, + "loss": 1.1179, + "step": 2638 + }, + { + "epoch": 0.23141859257442496, + "grad_norm": 0.095703125, + "learning_rate": 0.002837983317857086, + "loss": 1.257, + "step": 2639 + }, + { + "epoch": 0.23150628434879952, + "grad_norm": 0.0791015625, + "learning_rate": 0.0028377896145716916, + "loss": 1.2597, + "step": 2640 + }, + { + "epoch": 0.23159397612317406, + "grad_norm": 0.057861328125, + "learning_rate": 0.0028375958029629505, + "loss": 1.2147, + "step": 2641 + }, + { + "epoch": 0.2316816678975486, + "grad_norm": 0.0556640625, + "learning_rate": 0.0028374018830485377, + "loss": 1.1762, + "step": 2642 + }, + { + "epoch": 0.23176935967192314, + "grad_norm": 0.06982421875, + "learning_rate": 0.0028372078548461388, + "loss": 1.2334, + "step": 2643 + }, + { + "epoch": 0.2318570514462977, + "grad_norm": 0.0654296875, + "learning_rate": 0.002837013718373448, + "loss": 1.1568, + "step": 2644 + }, + { + "epoch": 0.23194474322067224, + "grad_norm": 0.08740234375, + "learning_rate": 0.0028368194736481706, + "loss": 1.2559, + "step": 2645 + }, + { + "epoch": 0.23203243499504678, + "grad_norm": 0.060302734375, + "learning_rate": 0.002836625120688021, + "loss": 1.2163, + "step": 2646 + }, + { + "epoch": 0.23212012676942134, + "grad_norm": 0.11474609375, + "learning_rate": 0.002836430659510724, + "loss": 1.26, + "step": 2647 + }, + { + "epoch": 0.23220781854379588, + "grad_norm": 0.0654296875, + "learning_rate": 0.002836236090134013, + "loss": 1.1791, + "step": 2648 + }, + { + "epoch": 0.23229551031817042, + "grad_norm": 0.1416015625, + "learning_rate": 0.002836041412575633, + "loss": 1.2183, + "step": 2649 + }, + { + "epoch": 0.23238320209254495, + "grad_norm": 0.061279296875, + "learning_rate": 0.0028358466268533383, + "loss": 1.2223, + "step": 2650 + }, + { + "epoch": 0.23247089386691952, + "grad_norm": 0.1318359375, + "learning_rate": 0.0028356517329848918, + "loss": 1.2644, + "step": 2651 + }, + { + "epoch": 0.23255858564129406, + "grad_norm": 0.06396484375, + "learning_rate": 0.0028354567309880683, + "loss": 1.2642, + "step": 2652 + }, + { + "epoch": 0.2326462774156686, + "grad_norm": 0.07861328125, + "learning_rate": 0.002835261620880651, + "loss": 1.1787, + "step": 2653 + }, + { + "epoch": 0.23273396919004316, + "grad_norm": 0.0732421875, + "learning_rate": 0.002835066402680434, + "loss": 1.254, + "step": 2654 + }, + { + "epoch": 0.2328216609644177, + "grad_norm": 0.1005859375, + "learning_rate": 0.00283487107640522, + "loss": 1.2245, + "step": 2655 + }, + { + "epoch": 0.23290935273879224, + "grad_norm": 0.0712890625, + "learning_rate": 0.002834675642072823, + "loss": 1.1839, + "step": 2656 + }, + { + "epoch": 0.23299704451316677, + "grad_norm": 0.08544921875, + "learning_rate": 0.0028344800997010654, + "loss": 1.2421, + "step": 2657 + }, + { + "epoch": 0.23308473628754134, + "grad_norm": 0.0791015625, + "learning_rate": 0.00283428444930778, + "loss": 1.2176, + "step": 2658 + }, + { + "epoch": 0.23317242806191588, + "grad_norm": 0.056640625, + "learning_rate": 0.0028340886909108106, + "loss": 1.2124, + "step": 2659 + }, + { + "epoch": 0.2332601198362904, + "grad_norm": 0.1708984375, + "learning_rate": 0.002833892824528009, + "loss": 1.2466, + "step": 2660 + }, + { + "epoch": 0.23334781161066495, + "grad_norm": 0.109375, + "learning_rate": 0.002833696850177238, + "loss": 1.2166, + "step": 2661 + }, + { + "epoch": 0.23343550338503952, + "grad_norm": 0.111328125, + "learning_rate": 0.00283350076787637, + "loss": 1.203, + "step": 2662 + }, + { + "epoch": 0.23352319515941405, + "grad_norm": 0.06640625, + "learning_rate": 0.0028333045776432874, + "loss": 1.254, + "step": 2663 + }, + { + "epoch": 0.2336108869337886, + "grad_norm": 0.10009765625, + "learning_rate": 0.002833108279495882, + "loss": 1.2022, + "step": 2664 + }, + { + "epoch": 0.23369857870816316, + "grad_norm": 0.060791015625, + "learning_rate": 0.0028329118734520555, + "loss": 1.2903, + "step": 2665 + }, + { + "epoch": 0.2337862704825377, + "grad_norm": 0.076171875, + "learning_rate": 0.0028327153595297198, + "loss": 1.269, + "step": 2666 + }, + { + "epoch": 0.23387396225691223, + "grad_norm": 0.0654296875, + "learning_rate": 0.002832518737746797, + "loss": 1.129, + "step": 2667 + }, + { + "epoch": 0.23396165403128677, + "grad_norm": 0.07177734375, + "learning_rate": 0.0028323220081212168, + "loss": 1.2234, + "step": 2668 + }, + { + "epoch": 0.23404934580566134, + "grad_norm": 0.10498046875, + "learning_rate": 0.0028321251706709223, + "loss": 1.2084, + "step": 2669 + }, + { + "epoch": 0.23413703758003587, + "grad_norm": 0.10986328125, + "learning_rate": 0.0028319282254138643, + "loss": 1.1985, + "step": 2670 + }, + { + "epoch": 0.2342247293544104, + "grad_norm": 0.07275390625, + "learning_rate": 0.0028317311723680023, + "loss": 1.169, + "step": 2671 + }, + { + "epoch": 0.23431242112878498, + "grad_norm": 0.14453125, + "learning_rate": 0.0028315340115513087, + "loss": 1.2108, + "step": 2672 + }, + { + "epoch": 0.2344001129031595, + "grad_norm": 0.0732421875, + "learning_rate": 0.002831336742981763, + "loss": 1.2387, + "step": 2673 + }, + { + "epoch": 0.23448780467753405, + "grad_norm": 0.1376953125, + "learning_rate": 0.0028311393666773563, + "loss": 1.1739, + "step": 2674 + }, + { + "epoch": 0.2345754964519086, + "grad_norm": 0.0849609375, + "learning_rate": 0.0028309418826560884, + "loss": 1.238, + "step": 2675 + }, + { + "epoch": 0.23466318822628315, + "grad_norm": 0.07373046875, + "learning_rate": 0.0028307442909359694, + "loss": 1.2088, + "step": 2676 + }, + { + "epoch": 0.2347508800006577, + "grad_norm": 0.10400390625, + "learning_rate": 0.002830546591535019, + "loss": 1.2301, + "step": 2677 + }, + { + "epoch": 0.23483857177503223, + "grad_norm": 0.083984375, + "learning_rate": 0.002830348784471267, + "loss": 1.2532, + "step": 2678 + }, + { + "epoch": 0.23492626354940677, + "grad_norm": 0.07958984375, + "learning_rate": 0.0028301508697627527, + "loss": 1.2274, + "step": 2679 + }, + { + "epoch": 0.23501395532378133, + "grad_norm": 0.058349609375, + "learning_rate": 0.0028299528474275263, + "loss": 1.2825, + "step": 2680 + }, + { + "epoch": 0.23510164709815587, + "grad_norm": 0.0869140625, + "learning_rate": 0.0028297547174836457, + "loss": 1.1979, + "step": 2681 + }, + { + "epoch": 0.2351893388725304, + "grad_norm": 0.0732421875, + "learning_rate": 0.0028295564799491807, + "loss": 1.2234, + "step": 2682 + }, + { + "epoch": 0.23527703064690497, + "grad_norm": 0.0751953125, + "learning_rate": 0.0028293581348422096, + "loss": 1.1451, + "step": 2683 + }, + { + "epoch": 0.2353647224212795, + "grad_norm": 0.0693359375, + "learning_rate": 0.002829159682180821, + "loss": 1.1801, + "step": 2684 + }, + { + "epoch": 0.23545241419565405, + "grad_norm": 0.091796875, + "learning_rate": 0.0028289611219831133, + "loss": 1.1826, + "step": 2685 + }, + { + "epoch": 0.23554010597002858, + "grad_norm": 0.07080078125, + "learning_rate": 0.002828762454267195, + "loss": 1.2583, + "step": 2686 + }, + { + "epoch": 0.23562779774440315, + "grad_norm": 0.103515625, + "learning_rate": 0.002828563679051184, + "loss": 1.2453, + "step": 2687 + }, + { + "epoch": 0.2357154895187777, + "grad_norm": 0.05859375, + "learning_rate": 0.002828364796353207, + "loss": 1.2039, + "step": 2688 + }, + { + "epoch": 0.23580318129315223, + "grad_norm": 0.1240234375, + "learning_rate": 0.0028281658061914032, + "loss": 1.1532, + "step": 2689 + }, + { + "epoch": 0.2358908730675268, + "grad_norm": 0.099609375, + "learning_rate": 0.0028279667085839197, + "loss": 1.2449, + "step": 2690 + }, + { + "epoch": 0.23597856484190133, + "grad_norm": 0.12109375, + "learning_rate": 0.0028277675035489128, + "loss": 1.1931, + "step": 2691 + }, + { + "epoch": 0.23606625661627587, + "grad_norm": 0.061279296875, + "learning_rate": 0.0028275681911045496, + "loss": 1.1981, + "step": 2692 + }, + { + "epoch": 0.2361539483906504, + "grad_norm": 0.1513671875, + "learning_rate": 0.002827368771269008, + "loss": 1.2245, + "step": 2693 + }, + { + "epoch": 0.23624164016502497, + "grad_norm": 0.0947265625, + "learning_rate": 0.0028271692440604733, + "loss": 1.2685, + "step": 2694 + }, + { + "epoch": 0.2363293319393995, + "grad_norm": 0.08642578125, + "learning_rate": 0.0028269696094971423, + "loss": 1.1999, + "step": 2695 + }, + { + "epoch": 0.23641702371377404, + "grad_norm": 0.1298828125, + "learning_rate": 0.0028267698675972217, + "loss": 1.2199, + "step": 2696 + }, + { + "epoch": 0.23650471548814858, + "grad_norm": 0.07666015625, + "learning_rate": 0.002826570018378927, + "loss": 1.1412, + "step": 2697 + }, + { + "epoch": 0.23659240726252315, + "grad_norm": 0.1689453125, + "learning_rate": 0.0028263700618604832, + "loss": 1.2175, + "step": 2698 + }, + { + "epoch": 0.23668009903689768, + "grad_norm": 0.053466796875, + "learning_rate": 0.0028261699980601275, + "loss": 1.2005, + "step": 2699 + }, + { + "epoch": 0.23676779081127222, + "grad_norm": 0.0693359375, + "learning_rate": 0.0028259698269961033, + "loss": 1.1881, + "step": 2700 + }, + { + "epoch": 0.2368554825856468, + "grad_norm": 0.1259765625, + "learning_rate": 0.0028257695486866674, + "loss": 1.2221, + "step": 2701 + }, + { + "epoch": 0.23694317436002132, + "grad_norm": 0.12109375, + "learning_rate": 0.0028255691631500837, + "loss": 1.229, + "step": 2702 + }, + { + "epoch": 0.23703086613439586, + "grad_norm": 0.11669921875, + "learning_rate": 0.0028253686704046268, + "loss": 1.2126, + "step": 2703 + }, + { + "epoch": 0.2371185579087704, + "grad_norm": 0.1123046875, + "learning_rate": 0.0028251680704685814, + "loss": 1.2506, + "step": 2704 + }, + { + "epoch": 0.23720624968314497, + "grad_norm": 0.0966796875, + "learning_rate": 0.002824967363360242, + "loss": 1.1674, + "step": 2705 + }, + { + "epoch": 0.2372939414575195, + "grad_norm": 0.060302734375, + "learning_rate": 0.002824766549097912, + "loss": 1.2024, + "step": 2706 + }, + { + "epoch": 0.23738163323189404, + "grad_norm": 0.12255859375, + "learning_rate": 0.0028245656276999053, + "loss": 1.1874, + "step": 2707 + }, + { + "epoch": 0.23746932500626858, + "grad_norm": 0.07568359375, + "learning_rate": 0.0028243645991845455, + "loss": 1.2237, + "step": 2708 + }, + { + "epoch": 0.23755701678064314, + "grad_norm": 0.061279296875, + "learning_rate": 0.002824163463570166, + "loss": 1.216, + "step": 2709 + }, + { + "epoch": 0.23764470855501768, + "grad_norm": 0.11669921875, + "learning_rate": 0.0028239622208751096, + "loss": 1.2144, + "step": 2710 + }, + { + "epoch": 0.23773240032939222, + "grad_norm": 0.08203125, + "learning_rate": 0.002823760871117729, + "loss": 1.3082, + "step": 2711 + }, + { + "epoch": 0.23782009210376678, + "grad_norm": 0.09375, + "learning_rate": 0.0028235594143163874, + "loss": 1.2421, + "step": 2712 + }, + { + "epoch": 0.23790778387814132, + "grad_norm": 0.057861328125, + "learning_rate": 0.0028233578504894565, + "loss": 1.2216, + "step": 2713 + }, + { + "epoch": 0.23799547565251586, + "grad_norm": 0.060546875, + "learning_rate": 0.0028231561796553187, + "loss": 1.2552, + "step": 2714 + }, + { + "epoch": 0.2380831674268904, + "grad_norm": 0.06884765625, + "learning_rate": 0.0028229544018323655, + "loss": 1.2286, + "step": 2715 + }, + { + "epoch": 0.23817085920126496, + "grad_norm": 0.0732421875, + "learning_rate": 0.0028227525170389993, + "loss": 1.2183, + "step": 2716 + }, + { + "epoch": 0.2382585509756395, + "grad_norm": 0.0595703125, + "learning_rate": 0.0028225505252936307, + "loss": 1.2415, + "step": 2717 + }, + { + "epoch": 0.23834624275001404, + "grad_norm": 0.061767578125, + "learning_rate": 0.0028223484266146816, + "loss": 1.1825, + "step": 2718 + }, + { + "epoch": 0.2384339345243886, + "grad_norm": 0.0986328125, + "learning_rate": 0.002822146221020582, + "loss": 1.2649, + "step": 2719 + }, + { + "epoch": 0.23852162629876314, + "grad_norm": 0.1455078125, + "learning_rate": 0.0028219439085297726, + "loss": 1.2604, + "step": 2720 + }, + { + "epoch": 0.23860931807313768, + "grad_norm": 0.06787109375, + "learning_rate": 0.0028217414891607046, + "loss": 1.2391, + "step": 2721 + }, + { + "epoch": 0.23869700984751221, + "grad_norm": 0.126953125, + "learning_rate": 0.0028215389629318377, + "loss": 1.2664, + "step": 2722 + }, + { + "epoch": 0.23878470162188678, + "grad_norm": 0.0859375, + "learning_rate": 0.0028213363298616413, + "loss": 1.2401, + "step": 2723 + }, + { + "epoch": 0.23887239339626132, + "grad_norm": 0.0908203125, + "learning_rate": 0.0028211335899685957, + "loss": 1.1829, + "step": 2724 + }, + { + "epoch": 0.23896008517063586, + "grad_norm": 0.09228515625, + "learning_rate": 0.00282093074327119, + "loss": 1.2116, + "step": 2725 + }, + { + "epoch": 0.2390477769450104, + "grad_norm": 0.0947265625, + "learning_rate": 0.002820727789787923, + "loss": 1.3203, + "step": 2726 + }, + { + "epoch": 0.23913546871938496, + "grad_norm": 0.11181640625, + "learning_rate": 0.002820524729537304, + "loss": 1.2356, + "step": 2727 + }, + { + "epoch": 0.2392231604937595, + "grad_norm": 0.1240234375, + "learning_rate": 0.0028203215625378507, + "loss": 1.2577, + "step": 2728 + }, + { + "epoch": 0.23931085226813403, + "grad_norm": 0.0693359375, + "learning_rate": 0.0028201182888080933, + "loss": 1.1915, + "step": 2729 + }, + { + "epoch": 0.2393985440425086, + "grad_norm": 0.1328125, + "learning_rate": 0.0028199149083665675, + "loss": 1.1676, + "step": 2730 + }, + { + "epoch": 0.23948623581688314, + "grad_norm": 0.062255859375, + "learning_rate": 0.0028197114212318223, + "loss": 1.2593, + "step": 2731 + }, + { + "epoch": 0.23957392759125767, + "grad_norm": 0.10302734375, + "learning_rate": 0.002819507827422416, + "loss": 1.1877, + "step": 2732 + }, + { + "epoch": 0.2396616193656322, + "grad_norm": 0.06689453125, + "learning_rate": 0.0028193041269569136, + "loss": 1.2384, + "step": 2733 + }, + { + "epoch": 0.23974931114000678, + "grad_norm": 0.1376953125, + "learning_rate": 0.0028191003198538944, + "loss": 1.2381, + "step": 2734 + }, + { + "epoch": 0.23983700291438131, + "grad_norm": 0.0947265625, + "learning_rate": 0.002818896406131944, + "loss": 1.2749, + "step": 2735 + }, + { + "epoch": 0.23992469468875585, + "grad_norm": 0.078125, + "learning_rate": 0.0028186923858096583, + "loss": 1.2021, + "step": 2736 + }, + { + "epoch": 0.24001238646313042, + "grad_norm": 0.09375, + "learning_rate": 0.002818488258905644, + "loss": 1.2125, + "step": 2737 + }, + { + "epoch": 0.24010007823750495, + "grad_norm": 0.0947265625, + "learning_rate": 0.0028182840254385175, + "loss": 1.2032, + "step": 2738 + }, + { + "epoch": 0.2401877700118795, + "grad_norm": 0.0732421875, + "learning_rate": 0.0028180796854269034, + "loss": 1.2294, + "step": 2739 + }, + { + "epoch": 0.24027546178625403, + "grad_norm": 0.1064453125, + "learning_rate": 0.0028178752388894374, + "loss": 1.1691, + "step": 2740 + }, + { + "epoch": 0.2403631535606286, + "grad_norm": 0.0751953125, + "learning_rate": 0.0028176706858447646, + "loss": 1.2234, + "step": 2741 + }, + { + "epoch": 0.24045084533500313, + "grad_norm": 0.115234375, + "learning_rate": 0.002817466026311539, + "loss": 1.1813, + "step": 2742 + }, + { + "epoch": 0.24053853710937767, + "grad_norm": 0.08251953125, + "learning_rate": 0.002817261260308426, + "loss": 1.1748, + "step": 2743 + }, + { + "epoch": 0.2406262288837522, + "grad_norm": 0.0634765625, + "learning_rate": 0.0028170563878541, + "loss": 1.2314, + "step": 2744 + }, + { + "epoch": 0.24071392065812677, + "grad_norm": 0.080078125, + "learning_rate": 0.0028168514089672433, + "loss": 1.2772, + "step": 2745 + }, + { + "epoch": 0.2408016124325013, + "grad_norm": 0.04931640625, + "learning_rate": 0.0028166463236665505, + "loss": 1.2319, + "step": 2746 + }, + { + "epoch": 0.24088930420687585, + "grad_norm": 0.064453125, + "learning_rate": 0.0028164411319707245, + "loss": 1.2018, + "step": 2747 + }, + { + "epoch": 0.2409769959812504, + "grad_norm": 0.064453125, + "learning_rate": 0.002816235833898479, + "loss": 1.3023, + "step": 2748 + }, + { + "epoch": 0.24106468775562495, + "grad_norm": 0.054931640625, + "learning_rate": 0.002816030429468535, + "loss": 1.2087, + "step": 2749 + }, + { + "epoch": 0.2411523795299995, + "grad_norm": 0.07177734375, + "learning_rate": 0.0028158249186996274, + "loss": 1.1666, + "step": 2750 + }, + { + "epoch": 0.24124007130437403, + "grad_norm": 0.08349609375, + "learning_rate": 0.002815619301610496, + "loss": 1.2572, + "step": 2751 + }, + { + "epoch": 0.2413277630787486, + "grad_norm": 0.0791015625, + "learning_rate": 0.002815413578219893, + "loss": 1.2339, + "step": 2752 + }, + { + "epoch": 0.24141545485312313, + "grad_norm": 0.0693359375, + "learning_rate": 0.0028152077485465807, + "loss": 1.1894, + "step": 2753 + }, + { + "epoch": 0.24150314662749767, + "grad_norm": 0.07373046875, + "learning_rate": 0.0028150018126093294, + "loss": 1.233, + "step": 2754 + }, + { + "epoch": 0.24159083840187223, + "grad_norm": 0.07421875, + "learning_rate": 0.0028147957704269208, + "loss": 1.2057, + "step": 2755 + }, + { + "epoch": 0.24167853017624677, + "grad_norm": 0.1162109375, + "learning_rate": 0.0028145896220181437, + "loss": 1.2393, + "step": 2756 + }, + { + "epoch": 0.2417662219506213, + "grad_norm": 0.07470703125, + "learning_rate": 0.0028143833674018006, + "loss": 1.2517, + "step": 2757 + }, + { + "epoch": 0.24185391372499584, + "grad_norm": 0.11181640625, + "learning_rate": 0.0028141770065966995, + "loss": 1.2469, + "step": 2758 + }, + { + "epoch": 0.2419416054993704, + "grad_norm": 0.0771484375, + "learning_rate": 0.002813970539621661, + "loss": 1.2275, + "step": 2759 + }, + { + "epoch": 0.24202929727374495, + "grad_norm": 0.07080078125, + "learning_rate": 0.002813763966495514, + "loss": 1.1829, + "step": 2760 + }, + { + "epoch": 0.24211698904811949, + "grad_norm": 0.07958984375, + "learning_rate": 0.0028135572872370973, + "loss": 1.1754, + "step": 2761 + }, + { + "epoch": 0.24220468082249402, + "grad_norm": 0.064453125, + "learning_rate": 0.00281335050186526, + "loss": 1.2077, + "step": 2762 + }, + { + "epoch": 0.2422923725968686, + "grad_norm": 0.064453125, + "learning_rate": 0.00281314361039886, + "loss": 1.2106, + "step": 2763 + }, + { + "epoch": 0.24238006437124313, + "grad_norm": 0.10498046875, + "learning_rate": 0.0028129366128567656, + "loss": 1.2277, + "step": 2764 + }, + { + "epoch": 0.24246775614561766, + "grad_norm": 0.06396484375, + "learning_rate": 0.002812729509257854, + "loss": 1.2201, + "step": 2765 + }, + { + "epoch": 0.24255544791999223, + "grad_norm": 0.09814453125, + "learning_rate": 0.002812522299621013, + "loss": 1.198, + "step": 2766 + }, + { + "epoch": 0.24264313969436677, + "grad_norm": 0.0703125, + "learning_rate": 0.0028123149839651388, + "loss": 1.2044, + "step": 2767 + }, + { + "epoch": 0.2427308314687413, + "grad_norm": 0.0810546875, + "learning_rate": 0.002812107562309139, + "loss": 1.2222, + "step": 2768 + }, + { + "epoch": 0.24281852324311584, + "grad_norm": 0.0732421875, + "learning_rate": 0.002811900034671929, + "loss": 1.2447, + "step": 2769 + }, + { + "epoch": 0.2429062150174904, + "grad_norm": 0.111328125, + "learning_rate": 0.0028116924010724354, + "loss": 1.1934, + "step": 2770 + }, + { + "epoch": 0.24299390679186494, + "grad_norm": 0.1015625, + "learning_rate": 0.0028114846615295945, + "loss": 1.2241, + "step": 2771 + }, + { + "epoch": 0.24308159856623948, + "grad_norm": 0.13671875, + "learning_rate": 0.0028112768160623498, + "loss": 1.1974, + "step": 2772 + }, + { + "epoch": 0.24316929034061405, + "grad_norm": 0.08349609375, + "learning_rate": 0.002811068864689658, + "loss": 1.2624, + "step": 2773 + }, + { + "epoch": 0.24325698211498858, + "grad_norm": 0.1572265625, + "learning_rate": 0.0028108608074304824, + "loss": 1.2345, + "step": 2774 + }, + { + "epoch": 0.24334467388936312, + "grad_norm": 0.09814453125, + "learning_rate": 0.0028106526443037985, + "loss": 1.2242, + "step": 2775 + }, + { + "epoch": 0.24343236566373766, + "grad_norm": 0.1201171875, + "learning_rate": 0.002810444375328589, + "loss": 1.204, + "step": 2776 + }, + { + "epoch": 0.24352005743811223, + "grad_norm": 0.1103515625, + "learning_rate": 0.0028102360005238486, + "loss": 1.1605, + "step": 2777 + }, + { + "epoch": 0.24360774921248676, + "grad_norm": 0.09033203125, + "learning_rate": 0.0028100275199085803, + "loss": 1.2478, + "step": 2778 + }, + { + "epoch": 0.2436954409868613, + "grad_norm": 0.07958984375, + "learning_rate": 0.002809818933501796, + "loss": 1.1483, + "step": 2779 + }, + { + "epoch": 0.24378313276123584, + "grad_norm": 0.0703125, + "learning_rate": 0.00280961024132252, + "loss": 1.2006, + "step": 2780 + }, + { + "epoch": 0.2438708245356104, + "grad_norm": 0.06689453125, + "learning_rate": 0.002809401443389783, + "loss": 1.1655, + "step": 2781 + }, + { + "epoch": 0.24395851630998494, + "grad_norm": 0.06298828125, + "learning_rate": 0.0028091925397226276, + "loss": 1.2363, + "step": 2782 + }, + { + "epoch": 0.24404620808435948, + "grad_norm": 0.061279296875, + "learning_rate": 0.0028089835303401044, + "loss": 1.2547, + "step": 2783 + }, + { + "epoch": 0.24413389985873404, + "grad_norm": 0.068359375, + "learning_rate": 0.0028087744152612754, + "loss": 1.1715, + "step": 2784 + }, + { + "epoch": 0.24422159163310858, + "grad_norm": 0.232421875, + "learning_rate": 0.0028085651945052113, + "loss": 1.2515, + "step": 2785 + }, + { + "epoch": 0.24430928340748312, + "grad_norm": 0.09619140625, + "learning_rate": 0.002808355868090992, + "loss": 1.2418, + "step": 2786 + }, + { + "epoch": 0.24439697518185766, + "grad_norm": 0.06787109375, + "learning_rate": 0.002808146436037708, + "loss": 1.1804, + "step": 2787 + }, + { + "epoch": 0.24448466695623222, + "grad_norm": 0.07421875, + "learning_rate": 0.002807936898364459, + "loss": 1.2145, + "step": 2788 + }, + { + "epoch": 0.24457235873060676, + "grad_norm": 0.0712890625, + "learning_rate": 0.0028077272550903537, + "loss": 1.2147, + "step": 2789 + }, + { + "epoch": 0.2446600505049813, + "grad_norm": 0.06201171875, + "learning_rate": 0.0028075175062345116, + "loss": 1.2056, + "step": 2790 + }, + { + "epoch": 0.24474774227935586, + "grad_norm": 0.0615234375, + "learning_rate": 0.002807307651816061, + "loss": 1.1901, + "step": 2791 + }, + { + "epoch": 0.2448354340537304, + "grad_norm": 0.1103515625, + "learning_rate": 0.00280709769185414, + "loss": 1.2624, + "step": 2792 + }, + { + "epoch": 0.24492312582810494, + "grad_norm": 0.05419921875, + "learning_rate": 0.002806887626367897, + "loss": 1.2017, + "step": 2793 + }, + { + "epoch": 0.24501081760247947, + "grad_norm": 0.123046875, + "learning_rate": 0.0028066774553764887, + "loss": 1.1858, + "step": 2794 + }, + { + "epoch": 0.24509850937685404, + "grad_norm": 0.057373046875, + "learning_rate": 0.002806467178899082, + "loss": 1.1742, + "step": 2795 + }, + { + "epoch": 0.24518620115122858, + "grad_norm": 0.07421875, + "learning_rate": 0.0028062567969548546, + "loss": 1.2245, + "step": 2796 + }, + { + "epoch": 0.24527389292560312, + "grad_norm": 0.062255859375, + "learning_rate": 0.002806046309562992, + "loss": 1.2189, + "step": 2797 + }, + { + "epoch": 0.24536158469997765, + "grad_norm": 0.055908203125, + "learning_rate": 0.00280583571674269, + "loss": 1.1877, + "step": 2798 + }, + { + "epoch": 0.24544927647435222, + "grad_norm": 0.07373046875, + "learning_rate": 0.0028056250185131545, + "loss": 1.2477, + "step": 2799 + }, + { + "epoch": 0.24553696824872676, + "grad_norm": 0.057861328125, + "learning_rate": 0.0028054142148936007, + "loss": 1.2512, + "step": 2800 + }, + { + "epoch": 0.2456246600231013, + "grad_norm": 0.0986328125, + "learning_rate": 0.002805203305903253, + "loss": 1.1946, + "step": 2801 + }, + { + "epoch": 0.24571235179747586, + "grad_norm": 0.05615234375, + "learning_rate": 0.0028049922915613463, + "loss": 1.1907, + "step": 2802 + }, + { + "epoch": 0.2458000435718504, + "grad_norm": 0.10498046875, + "learning_rate": 0.0028047811718871236, + "loss": 1.2198, + "step": 2803 + }, + { + "epoch": 0.24588773534622493, + "grad_norm": 0.058349609375, + "learning_rate": 0.0028045699468998396, + "loss": 1.2383, + "step": 2804 + }, + { + "epoch": 0.24597542712059947, + "grad_norm": 0.1650390625, + "learning_rate": 0.0028043586166187568, + "loss": 1.2598, + "step": 2805 + }, + { + "epoch": 0.24606311889497404, + "grad_norm": 0.0771484375, + "learning_rate": 0.002804147181063148, + "loss": 1.1921, + "step": 2806 + }, + { + "epoch": 0.24615081066934857, + "grad_norm": 0.08447265625, + "learning_rate": 0.002803935640252296, + "loss": 1.1618, + "step": 2807 + }, + { + "epoch": 0.2462385024437231, + "grad_norm": 0.0595703125, + "learning_rate": 0.0028037239942054924, + "loss": 1.1913, + "step": 2808 + }, + { + "epoch": 0.24632619421809768, + "grad_norm": 0.091796875, + "learning_rate": 0.0028035122429420386, + "loss": 1.216, + "step": 2809 + }, + { + "epoch": 0.24641388599247221, + "grad_norm": 0.06103515625, + "learning_rate": 0.002803300386481246, + "loss": 1.2054, + "step": 2810 + }, + { + "epoch": 0.24650157776684675, + "grad_norm": 0.05810546875, + "learning_rate": 0.002803088424842436, + "loss": 1.2625, + "step": 2811 + }, + { + "epoch": 0.2465892695412213, + "grad_norm": 0.062255859375, + "learning_rate": 0.0028028763580449376, + "loss": 1.197, + "step": 2812 + }, + { + "epoch": 0.24667696131559586, + "grad_norm": 0.06298828125, + "learning_rate": 0.002802664186108092, + "loss": 1.222, + "step": 2813 + }, + { + "epoch": 0.2467646530899704, + "grad_norm": 0.064453125, + "learning_rate": 0.002802451909051248, + "loss": 1.2714, + "step": 2814 + }, + { + "epoch": 0.24685234486434493, + "grad_norm": 0.06591796875, + "learning_rate": 0.002802239526893765, + "loss": 1.2253, + "step": 2815 + }, + { + "epoch": 0.24694003663871947, + "grad_norm": 0.07080078125, + "learning_rate": 0.002802027039655012, + "loss": 1.1983, + "step": 2816 + }, + { + "epoch": 0.24702772841309403, + "grad_norm": 0.056396484375, + "learning_rate": 0.002801814447354366, + "loss": 1.1837, + "step": 2817 + }, + { + "epoch": 0.24711542018746857, + "grad_norm": 0.07958984375, + "learning_rate": 0.0028016017500112162, + "loss": 1.2055, + "step": 2818 + }, + { + "epoch": 0.2472031119618431, + "grad_norm": 0.07861328125, + "learning_rate": 0.0028013889476449596, + "loss": 1.2456, + "step": 2819 + }, + { + "epoch": 0.24729080373621767, + "grad_norm": 0.06787109375, + "learning_rate": 0.0028011760402750037, + "loss": 1.2163, + "step": 2820 + }, + { + "epoch": 0.2473784955105922, + "grad_norm": 0.09033203125, + "learning_rate": 0.0028009630279207643, + "loss": 1.2467, + "step": 2821 + }, + { + "epoch": 0.24746618728496675, + "grad_norm": 0.115234375, + "learning_rate": 0.002800749910601668, + "loss": 1.2159, + "step": 2822 + }, + { + "epoch": 0.2475538790593413, + "grad_norm": 0.0732421875, + "learning_rate": 0.00280053668833715, + "loss": 1.2399, + "step": 2823 + }, + { + "epoch": 0.24764157083371585, + "grad_norm": 0.09521484375, + "learning_rate": 0.0028003233611466572, + "loss": 1.2219, + "step": 2824 + }, + { + "epoch": 0.2477292626080904, + "grad_norm": 0.0703125, + "learning_rate": 0.0028001099290496426, + "loss": 1.2125, + "step": 2825 + }, + { + "epoch": 0.24781695438246493, + "grad_norm": 0.06884765625, + "learning_rate": 0.0027998963920655715, + "loss": 1.2351, + "step": 2826 + }, + { + "epoch": 0.2479046461568395, + "grad_norm": 0.09375, + "learning_rate": 0.0027996827502139183, + "loss": 1.2392, + "step": 2827 + }, + { + "epoch": 0.24799233793121403, + "grad_norm": 0.125, + "learning_rate": 0.002799469003514166, + "loss": 1.2123, + "step": 2828 + }, + { + "epoch": 0.24808002970558857, + "grad_norm": 0.1044921875, + "learning_rate": 0.0027992551519858077, + "loss": 1.2462, + "step": 2829 + }, + { + "epoch": 0.2481677214799631, + "grad_norm": 0.06640625, + "learning_rate": 0.0027990411956483464, + "loss": 1.233, + "step": 2830 + }, + { + "epoch": 0.24825541325433767, + "grad_norm": 0.193359375, + "learning_rate": 0.0027988271345212945, + "loss": 1.2389, + "step": 2831 + }, + { + "epoch": 0.2483431050287122, + "grad_norm": 0.0751953125, + "learning_rate": 0.0027986129686241737, + "loss": 1.2273, + "step": 2832 + }, + { + "epoch": 0.24843079680308675, + "grad_norm": 0.126953125, + "learning_rate": 0.0027983986979765152, + "loss": 1.2232, + "step": 2833 + }, + { + "epoch": 0.24851848857746128, + "grad_norm": 0.05712890625, + "learning_rate": 0.00279818432259786, + "loss": 1.2149, + "step": 2834 + }, + { + "epoch": 0.24860618035183585, + "grad_norm": 0.06689453125, + "learning_rate": 0.0027979698425077584, + "loss": 1.2788, + "step": 2835 + }, + { + "epoch": 0.24869387212621039, + "grad_norm": 0.0673828125, + "learning_rate": 0.0027977552577257707, + "loss": 1.2552, + "step": 2836 + }, + { + "epoch": 0.24878156390058492, + "grad_norm": 0.0888671875, + "learning_rate": 0.0027975405682714666, + "loss": 1.2173, + "step": 2837 + }, + { + "epoch": 0.2488692556749595, + "grad_norm": 0.059326171875, + "learning_rate": 0.0027973257741644247, + "loss": 1.1674, + "step": 2838 + }, + { + "epoch": 0.24895694744933403, + "grad_norm": 0.06494140625, + "learning_rate": 0.002797110875424234, + "loss": 1.2148, + "step": 2839 + }, + { + "epoch": 0.24904463922370856, + "grad_norm": 0.0615234375, + "learning_rate": 0.0027968958720704937, + "loss": 1.2162, + "step": 2840 + }, + { + "epoch": 0.2491323309980831, + "grad_norm": 0.06689453125, + "learning_rate": 0.0027966807641228095, + "loss": 1.2317, + "step": 2841 + }, + { + "epoch": 0.24922002277245767, + "grad_norm": 0.07275390625, + "learning_rate": 0.0027964655516008, + "loss": 1.2457, + "step": 2842 + }, + { + "epoch": 0.2493077145468322, + "grad_norm": 0.055908203125, + "learning_rate": 0.0027962502345240917, + "loss": 1.1574, + "step": 2843 + }, + { + "epoch": 0.24939540632120674, + "grad_norm": 0.058837890625, + "learning_rate": 0.002796034812912321, + "loss": 1.2483, + "step": 2844 + }, + { + "epoch": 0.2494830980955813, + "grad_norm": 0.08837890625, + "learning_rate": 0.002795819286785134, + "loss": 1.1864, + "step": 2845 + }, + { + "epoch": 0.24957078986995584, + "grad_norm": 0.0791015625, + "learning_rate": 0.0027956036561621865, + "loss": 1.2003, + "step": 2846 + }, + { + "epoch": 0.24965848164433038, + "grad_norm": 0.078125, + "learning_rate": 0.0027953879210631423, + "loss": 1.3245, + "step": 2847 + }, + { + "epoch": 0.24974617341870492, + "grad_norm": 0.09423828125, + "learning_rate": 0.0027951720815076763, + "loss": 1.1422, + "step": 2848 + }, + { + "epoch": 0.24983386519307949, + "grad_norm": 0.06005859375, + "learning_rate": 0.002794956137515473, + "loss": 1.2049, + "step": 2849 + }, + { + "epoch": 0.24992155696745402, + "grad_norm": 0.07470703125, + "learning_rate": 0.002794740089106226, + "loss": 1.2318, + "step": 2850 + }, + { + "epoch": 0.25000924874182856, + "grad_norm": 0.05908203125, + "learning_rate": 0.0027945239362996374, + "loss": 1.1833, + "step": 2851 + }, + { + "epoch": 0.2500969405162031, + "grad_norm": 0.0703125, + "learning_rate": 0.0027943076791154204, + "loss": 1.2134, + "step": 2852 + }, + { + "epoch": 0.25018463229057764, + "grad_norm": 0.107421875, + "learning_rate": 0.0027940913175732974, + "loss": 1.2269, + "step": 2853 + }, + { + "epoch": 0.2502723240649522, + "grad_norm": 0.08154296875, + "learning_rate": 0.0027938748516929996, + "loss": 1.2345, + "step": 2854 + }, + { + "epoch": 0.25036001583932677, + "grad_norm": 0.07568359375, + "learning_rate": 0.0027936582814942685, + "loss": 1.2196, + "step": 2855 + }, + { + "epoch": 0.2504477076137013, + "grad_norm": 0.064453125, + "learning_rate": 0.002793441606996854, + "loss": 1.1914, + "step": 2856 + }, + { + "epoch": 0.25053539938807584, + "grad_norm": 0.0791015625, + "learning_rate": 0.002793224828220517, + "loss": 1.1923, + "step": 2857 + }, + { + "epoch": 0.2506230911624504, + "grad_norm": 0.07861328125, + "learning_rate": 0.0027930079451850267, + "loss": 1.2086, + "step": 2858 + }, + { + "epoch": 0.2507107829368249, + "grad_norm": 0.09521484375, + "learning_rate": 0.0027927909579101625, + "loss": 1.2562, + "step": 2859 + }, + { + "epoch": 0.2507984747111995, + "grad_norm": 0.1474609375, + "learning_rate": 0.002792573866415713, + "loss": 1.2113, + "step": 2860 + }, + { + "epoch": 0.250886166485574, + "grad_norm": 0.06103515625, + "learning_rate": 0.002792356670721477, + "loss": 1.1959, + "step": 2861 + }, + { + "epoch": 0.25097385825994856, + "grad_norm": 0.09326171875, + "learning_rate": 0.002792139370847261, + "loss": 1.2234, + "step": 2862 + }, + { + "epoch": 0.2510615500343231, + "grad_norm": 0.0595703125, + "learning_rate": 0.002791921966812883, + "loss": 1.1612, + "step": 2863 + }, + { + "epoch": 0.25114924180869763, + "grad_norm": 0.09521484375, + "learning_rate": 0.0027917044586381694, + "loss": 1.1847, + "step": 2864 + }, + { + "epoch": 0.2512369335830722, + "grad_norm": 0.060546875, + "learning_rate": 0.0027914868463429564, + "loss": 1.2312, + "step": 2865 + }, + { + "epoch": 0.25132462535744676, + "grad_norm": 0.0771484375, + "learning_rate": 0.00279126912994709, + "loss": 1.1881, + "step": 2866 + }, + { + "epoch": 0.2514123171318213, + "grad_norm": 0.060302734375, + "learning_rate": 0.0027910513094704247, + "loss": 1.2322, + "step": 2867 + }, + { + "epoch": 0.25150000890619584, + "grad_norm": 0.09716796875, + "learning_rate": 0.002790833384932826, + "loss": 1.2592, + "step": 2868 + }, + { + "epoch": 0.2515877006805704, + "grad_norm": 0.060546875, + "learning_rate": 0.0027906153563541673, + "loss": 1.1932, + "step": 2869 + }, + { + "epoch": 0.2516753924549449, + "grad_norm": 0.0751953125, + "learning_rate": 0.002790397223754333, + "loss": 1.2049, + "step": 2870 + }, + { + "epoch": 0.2517630842293195, + "grad_norm": 0.064453125, + "learning_rate": 0.0027901789871532154, + "loss": 1.2293, + "step": 2871 + }, + { + "epoch": 0.25185077600369404, + "grad_norm": 0.09619140625, + "learning_rate": 0.0027899606465707177, + "loss": 1.2083, + "step": 2872 + }, + { + "epoch": 0.25193846777806855, + "grad_norm": 0.07470703125, + "learning_rate": 0.0027897422020267512, + "loss": 1.2075, + "step": 2873 + }, + { + "epoch": 0.2520261595524431, + "grad_norm": 0.07470703125, + "learning_rate": 0.002789523653541239, + "loss": 1.2167, + "step": 2874 + }, + { + "epoch": 0.25211385132681763, + "grad_norm": 0.07177734375, + "learning_rate": 0.002789305001134111, + "loss": 1.2596, + "step": 2875 + }, + { + "epoch": 0.2522015431011922, + "grad_norm": 0.115234375, + "learning_rate": 0.0027890862448253085, + "loss": 1.2052, + "step": 2876 + }, + { + "epoch": 0.25228923487556676, + "grad_norm": 0.1015625, + "learning_rate": 0.00278886738463478, + "loss": 1.1974, + "step": 2877 + }, + { + "epoch": 0.25237692664994127, + "grad_norm": 0.08642578125, + "learning_rate": 0.0027886484205824867, + "loss": 1.193, + "step": 2878 + }, + { + "epoch": 0.25246461842431583, + "grad_norm": 0.07666015625, + "learning_rate": 0.002788429352688397, + "loss": 1.2423, + "step": 2879 + }, + { + "epoch": 0.2525523101986904, + "grad_norm": 0.08642578125, + "learning_rate": 0.0027882101809724885, + "loss": 1.2184, + "step": 2880 + }, + { + "epoch": 0.2526400019730649, + "grad_norm": 0.09619140625, + "learning_rate": 0.00278799090545475, + "loss": 1.2665, + "step": 2881 + }, + { + "epoch": 0.2527276937474395, + "grad_norm": 0.0673828125, + "learning_rate": 0.002787771526155179, + "loss": 1.1902, + "step": 2882 + }, + { + "epoch": 0.25281538552181404, + "grad_norm": 0.072265625, + "learning_rate": 0.0027875520430937816, + "loss": 1.1662, + "step": 2883 + }, + { + "epoch": 0.25290307729618855, + "grad_norm": 0.05859375, + "learning_rate": 0.0027873324562905743, + "loss": 1.1653, + "step": 2884 + }, + { + "epoch": 0.2529907690705631, + "grad_norm": 0.07958984375, + "learning_rate": 0.0027871127657655833, + "loss": 1.1631, + "step": 2885 + }, + { + "epoch": 0.2530784608449376, + "grad_norm": 0.059326171875, + "learning_rate": 0.0027868929715388433, + "loss": 1.2427, + "step": 2886 + }, + { + "epoch": 0.2531661526193122, + "grad_norm": 0.08154296875, + "learning_rate": 0.0027866730736303994, + "loss": 1.2334, + "step": 2887 + }, + { + "epoch": 0.25325384439368676, + "grad_norm": 0.0869140625, + "learning_rate": 0.0027864530720603056, + "loss": 1.2079, + "step": 2888 + }, + { + "epoch": 0.25334153616806127, + "grad_norm": 0.083984375, + "learning_rate": 0.0027862329668486246, + "loss": 1.2163, + "step": 2889 + }, + { + "epoch": 0.25342922794243583, + "grad_norm": 0.083984375, + "learning_rate": 0.0027860127580154313, + "loss": 1.2109, + "step": 2890 + }, + { + "epoch": 0.2535169197168104, + "grad_norm": 0.06591796875, + "learning_rate": 0.0027857924455808062, + "loss": 1.1996, + "step": 2891 + }, + { + "epoch": 0.2536046114911849, + "grad_norm": 0.080078125, + "learning_rate": 0.0027855720295648424, + "loss": 1.1655, + "step": 2892 + }, + { + "epoch": 0.25369230326555947, + "grad_norm": 0.06787109375, + "learning_rate": 0.002785351509987641, + "loss": 1.293, + "step": 2893 + }, + { + "epoch": 0.25377999503993404, + "grad_norm": 0.0888671875, + "learning_rate": 0.0027851308868693126, + "loss": 1.3099, + "step": 2894 + }, + { + "epoch": 0.25386768681430855, + "grad_norm": 0.060546875, + "learning_rate": 0.002784910160229978, + "loss": 1.1904, + "step": 2895 + }, + { + "epoch": 0.2539553785886831, + "grad_norm": 0.0888671875, + "learning_rate": 0.002784689330089766, + "loss": 1.2164, + "step": 2896 + }, + { + "epoch": 0.2540430703630576, + "grad_norm": 0.06005859375, + "learning_rate": 0.002784468396468817, + "loss": 1.2586, + "step": 2897 + }, + { + "epoch": 0.2541307621374322, + "grad_norm": 0.061767578125, + "learning_rate": 0.0027842473593872783, + "loss": 1.2428, + "step": 2898 + }, + { + "epoch": 0.25421845391180675, + "grad_norm": 0.11572265625, + "learning_rate": 0.002784026218865309, + "loss": 1.1785, + "step": 2899 + }, + { + "epoch": 0.25430614568618126, + "grad_norm": 0.08447265625, + "learning_rate": 0.0027838049749230754, + "loss": 1.2431, + "step": 2900 + }, + { + "epoch": 0.2543938374605558, + "grad_norm": 0.09912109375, + "learning_rate": 0.0027835836275807552, + "loss": 1.1473, + "step": 2901 + }, + { + "epoch": 0.2544815292349304, + "grad_norm": 0.087890625, + "learning_rate": 0.0027833621768585345, + "loss": 1.201, + "step": 2902 + }, + { + "epoch": 0.2545692210093049, + "grad_norm": 0.1259765625, + "learning_rate": 0.002783140622776609, + "loss": 1.2095, + "step": 2903 + }, + { + "epoch": 0.25465691278367947, + "grad_norm": 0.1162109375, + "learning_rate": 0.0027829189653551844, + "loss": 1.1969, + "step": 2904 + }, + { + "epoch": 0.25474460455805403, + "grad_norm": 0.1455078125, + "learning_rate": 0.002782697204614474, + "loss": 1.2771, + "step": 2905 + }, + { + "epoch": 0.25483229633242854, + "grad_norm": 0.15625, + "learning_rate": 0.0027824753405747025, + "loss": 1.2157, + "step": 2906 + }, + { + "epoch": 0.2549199881068031, + "grad_norm": 0.146484375, + "learning_rate": 0.002782253373256104, + "loss": 1.2889, + "step": 2907 + }, + { + "epoch": 0.2550076798811776, + "grad_norm": 0.134765625, + "learning_rate": 0.002782031302678921, + "loss": 1.2213, + "step": 2908 + }, + { + "epoch": 0.2550953716555522, + "grad_norm": 0.12255859375, + "learning_rate": 0.002781809128863405, + "loss": 1.198, + "step": 2909 + }, + { + "epoch": 0.25518306342992675, + "grad_norm": 0.1123046875, + "learning_rate": 0.002781586851829818, + "loss": 1.2364, + "step": 2910 + }, + { + "epoch": 0.25527075520430126, + "grad_norm": 0.05810546875, + "learning_rate": 0.002781364471598432, + "loss": 1.2125, + "step": 2911 + }, + { + "epoch": 0.2553584469786758, + "grad_norm": 0.0859375, + "learning_rate": 0.0027811419881895263, + "loss": 1.2548, + "step": 2912 + }, + { + "epoch": 0.2554461387530504, + "grad_norm": 0.05810546875, + "learning_rate": 0.002780919401623391, + "loss": 1.2392, + "step": 2913 + }, + { + "epoch": 0.2555338305274249, + "grad_norm": 0.07275390625, + "learning_rate": 0.0027806967119203265, + "loss": 1.2197, + "step": 2914 + }, + { + "epoch": 0.25562152230179946, + "grad_norm": 0.0771484375, + "learning_rate": 0.0027804739191006405, + "loss": 1.2204, + "step": 2915 + }, + { + "epoch": 0.25570921407617403, + "grad_norm": 0.06591796875, + "learning_rate": 0.002780251023184651, + "loss": 1.186, + "step": 2916 + }, + { + "epoch": 0.25579690585054854, + "grad_norm": 0.0869140625, + "learning_rate": 0.0027800280241926864, + "loss": 1.1727, + "step": 2917 + }, + { + "epoch": 0.2558845976249231, + "grad_norm": 0.0888671875, + "learning_rate": 0.002779804922145083, + "loss": 1.22, + "step": 2918 + }, + { + "epoch": 0.25597228939929767, + "grad_norm": 0.06640625, + "learning_rate": 0.0027795817170621867, + "loss": 1.2085, + "step": 2919 + }, + { + "epoch": 0.2560599811736722, + "grad_norm": 0.07421875, + "learning_rate": 0.0027793584089643546, + "loss": 1.2524, + "step": 2920 + }, + { + "epoch": 0.25614767294804675, + "grad_norm": 0.05419921875, + "learning_rate": 0.0027791349978719504, + "loss": 1.2057, + "step": 2921 + }, + { + "epoch": 0.25623536472242125, + "grad_norm": 0.1328125, + "learning_rate": 0.0027789114838053497, + "loss": 1.2192, + "step": 2922 + }, + { + "epoch": 0.2563230564967958, + "grad_norm": 0.09814453125, + "learning_rate": 0.0027786878667849357, + "loss": 1.2144, + "step": 2923 + }, + { + "epoch": 0.2564107482711704, + "grad_norm": 0.12060546875, + "learning_rate": 0.0027784641468311024, + "loss": 1.2547, + "step": 2924 + }, + { + "epoch": 0.2564984400455449, + "grad_norm": 0.061279296875, + "learning_rate": 0.0027782403239642517, + "loss": 1.2265, + "step": 2925 + }, + { + "epoch": 0.25658613181991946, + "grad_norm": 0.1123046875, + "learning_rate": 0.002778016398204796, + "loss": 1.2449, + "step": 2926 + }, + { + "epoch": 0.256673823594294, + "grad_norm": 0.0673828125, + "learning_rate": 0.0027777923695731566, + "loss": 1.2019, + "step": 2927 + }, + { + "epoch": 0.25676151536866854, + "grad_norm": 0.06689453125, + "learning_rate": 0.0027775682380897647, + "loss": 1.2748, + "step": 2928 + }, + { + "epoch": 0.2568492071430431, + "grad_norm": 0.07275390625, + "learning_rate": 0.00277734400377506, + "loss": 1.1948, + "step": 2929 + }, + { + "epoch": 0.25693689891741767, + "grad_norm": 0.087890625, + "learning_rate": 0.0027771196666494928, + "loss": 1.2228, + "step": 2930 + }, + { + "epoch": 0.2570245906917922, + "grad_norm": 0.06201171875, + "learning_rate": 0.0027768952267335214, + "loss": 1.1704, + "step": 2931 + }, + { + "epoch": 0.25711228246616674, + "grad_norm": 0.158203125, + "learning_rate": 0.002776670684047614, + "loss": 1.2227, + "step": 2932 + }, + { + "epoch": 0.25719997424054125, + "grad_norm": 0.0849609375, + "learning_rate": 0.0027764460386122494, + "loss": 1.1527, + "step": 2933 + }, + { + "epoch": 0.2572876660149158, + "grad_norm": 0.162109375, + "learning_rate": 0.0027762212904479137, + "loss": 1.2376, + "step": 2934 + }, + { + "epoch": 0.2573753577892904, + "grad_norm": 0.06884765625, + "learning_rate": 0.002775996439575103, + "loss": 1.205, + "step": 2935 + }, + { + "epoch": 0.2574630495636649, + "grad_norm": 0.1376953125, + "learning_rate": 0.0027757714860143245, + "loss": 1.2358, + "step": 2936 + }, + { + "epoch": 0.25755074133803946, + "grad_norm": 0.07421875, + "learning_rate": 0.002775546429786093, + "loss": 1.2129, + "step": 2937 + }, + { + "epoch": 0.257638433112414, + "grad_norm": 0.1123046875, + "learning_rate": 0.0027753212709109316, + "loss": 1.2641, + "step": 2938 + }, + { + "epoch": 0.25772612488678853, + "grad_norm": 0.0966796875, + "learning_rate": 0.0027750960094093764, + "loss": 1.154, + "step": 2939 + }, + { + "epoch": 0.2578138166611631, + "grad_norm": 0.11376953125, + "learning_rate": 0.002774870645301969, + "loss": 1.1743, + "step": 2940 + }, + { + "epoch": 0.25790150843553766, + "grad_norm": 0.06494140625, + "learning_rate": 0.0027746451786092624, + "loss": 1.3064, + "step": 2941 + }, + { + "epoch": 0.2579892002099122, + "grad_norm": 0.14453125, + "learning_rate": 0.0027744196093518188, + "loss": 1.2278, + "step": 2942 + }, + { + "epoch": 0.25807689198428674, + "grad_norm": 0.08642578125, + "learning_rate": 0.0027741939375502097, + "loss": 1.2385, + "step": 2943 + }, + { + "epoch": 0.25816458375866125, + "grad_norm": 0.1689453125, + "learning_rate": 0.0027739681632250155, + "loss": 1.2469, + "step": 2944 + }, + { + "epoch": 0.2582522755330358, + "grad_norm": 0.09814453125, + "learning_rate": 0.002773742286396827, + "loss": 1.1947, + "step": 2945 + }, + { + "epoch": 0.2583399673074104, + "grad_norm": 0.10791015625, + "learning_rate": 0.002773516307086242, + "loss": 1.2256, + "step": 2946 + }, + { + "epoch": 0.2584276590817849, + "grad_norm": 0.0751953125, + "learning_rate": 0.0027732902253138707, + "loss": 1.2535, + "step": 2947 + }, + { + "epoch": 0.25851535085615945, + "grad_norm": 0.060546875, + "learning_rate": 0.00277306404110033, + "loss": 1.1799, + "step": 2948 + }, + { + "epoch": 0.258603042630534, + "grad_norm": 0.068359375, + "learning_rate": 0.0027728377544662484, + "loss": 1.2235, + "step": 2949 + }, + { + "epoch": 0.25869073440490853, + "grad_norm": 0.054443359375, + "learning_rate": 0.0027726113654322616, + "loss": 1.1489, + "step": 2950 + }, + { + "epoch": 0.2587784261792831, + "grad_norm": 0.07080078125, + "learning_rate": 0.002772384874019017, + "loss": 1.2553, + "step": 2951 + }, + { + "epoch": 0.25886611795365766, + "grad_norm": 0.05810546875, + "learning_rate": 0.0027721582802471695, + "loss": 1.2271, + "step": 2952 + }, + { + "epoch": 0.25895380972803217, + "grad_norm": 0.05859375, + "learning_rate": 0.0027719315841373826, + "loss": 1.148, + "step": 2953 + }, + { + "epoch": 0.25904150150240673, + "grad_norm": 0.057373046875, + "learning_rate": 0.0027717047857103326, + "loss": 1.2451, + "step": 2954 + }, + { + "epoch": 0.2591291932767813, + "grad_norm": 0.055908203125, + "learning_rate": 0.0027714778849867016, + "loss": 1.1276, + "step": 2955 + }, + { + "epoch": 0.2592168850511558, + "grad_norm": 0.0888671875, + "learning_rate": 0.002771250881987182, + "loss": 1.2717, + "step": 2956 + }, + { + "epoch": 0.2593045768255304, + "grad_norm": 0.0712890625, + "learning_rate": 0.002771023776732477, + "loss": 1.1861, + "step": 2957 + }, + { + "epoch": 0.2593922685999049, + "grad_norm": 0.083984375, + "learning_rate": 0.002770796569243297, + "loss": 1.2405, + "step": 2958 + }, + { + "epoch": 0.25947996037427945, + "grad_norm": 0.06201171875, + "learning_rate": 0.0027705692595403632, + "loss": 1.1906, + "step": 2959 + }, + { + "epoch": 0.259567652148654, + "grad_norm": 0.060791015625, + "learning_rate": 0.002770341847644406, + "loss": 1.245, + "step": 2960 + }, + { + "epoch": 0.2596553439230285, + "grad_norm": 0.060791015625, + "learning_rate": 0.0027701143335761643, + "loss": 1.2197, + "step": 2961 + }, + { + "epoch": 0.2597430356974031, + "grad_norm": 0.055908203125, + "learning_rate": 0.002769886717356387, + "loss": 1.1793, + "step": 2962 + }, + { + "epoch": 0.25983072747177766, + "grad_norm": 0.07666015625, + "learning_rate": 0.002769658999005832, + "loss": 1.1586, + "step": 2963 + }, + { + "epoch": 0.25991841924615217, + "grad_norm": 0.09130859375, + "learning_rate": 0.0027694311785452673, + "loss": 1.2168, + "step": 2964 + }, + { + "epoch": 0.26000611102052673, + "grad_norm": 0.055908203125, + "learning_rate": 0.002769203255995468, + "loss": 1.2114, + "step": 2965 + }, + { + "epoch": 0.2600938027949013, + "grad_norm": 0.07568359375, + "learning_rate": 0.002768975231377221, + "loss": 1.319, + "step": 2966 + }, + { + "epoch": 0.2601814945692758, + "grad_norm": 0.06298828125, + "learning_rate": 0.002768747104711322, + "loss": 1.1747, + "step": 2967 + }, + { + "epoch": 0.26026918634365037, + "grad_norm": 0.06591796875, + "learning_rate": 0.0027685188760185747, + "loss": 1.2456, + "step": 2968 + }, + { + "epoch": 0.2603568781180249, + "grad_norm": 0.08251953125, + "learning_rate": 0.0027682905453197937, + "loss": 1.2478, + "step": 2969 + }, + { + "epoch": 0.26044456989239945, + "grad_norm": 0.064453125, + "learning_rate": 0.0027680621126358017, + "loss": 1.2655, + "step": 2970 + }, + { + "epoch": 0.260532261666774, + "grad_norm": 0.059326171875, + "learning_rate": 0.002767833577987431, + "loss": 1.2369, + "step": 2971 + }, + { + "epoch": 0.2606199534411485, + "grad_norm": 0.11865234375, + "learning_rate": 0.0027676049413955244, + "loss": 1.2384, + "step": 2972 + }, + { + "epoch": 0.2607076452155231, + "grad_norm": 0.06298828125, + "learning_rate": 0.002767376202880931, + "loss": 1.184, + "step": 2973 + }, + { + "epoch": 0.26079533698989765, + "grad_norm": 0.1005859375, + "learning_rate": 0.002767147362464514, + "loss": 1.2164, + "step": 2974 + }, + { + "epoch": 0.26088302876427216, + "grad_norm": 0.057861328125, + "learning_rate": 0.0027669184201671395, + "loss": 1.2302, + "step": 2975 + }, + { + "epoch": 0.2609707205386467, + "grad_norm": 0.08203125, + "learning_rate": 0.0027666893760096897, + "loss": 1.2051, + "step": 2976 + }, + { + "epoch": 0.2610584123130213, + "grad_norm": 0.0673828125, + "learning_rate": 0.0027664602300130517, + "loss": 1.2319, + "step": 2977 + }, + { + "epoch": 0.2611461040873958, + "grad_norm": 0.07275390625, + "learning_rate": 0.0027662309821981225, + "loss": 1.2215, + "step": 2978 + }, + { + "epoch": 0.26123379586177037, + "grad_norm": 0.05615234375, + "learning_rate": 0.002766001632585809, + "loss": 1.2031, + "step": 2979 + }, + { + "epoch": 0.2613214876361449, + "grad_norm": 0.06640625, + "learning_rate": 0.0027657721811970275, + "loss": 1.1895, + "step": 2980 + }, + { + "epoch": 0.26140917941051944, + "grad_norm": 0.05712890625, + "learning_rate": 0.0027655426280527038, + "loss": 1.1953, + "step": 2981 + }, + { + "epoch": 0.261496871184894, + "grad_norm": 0.07470703125, + "learning_rate": 0.0027653129731737723, + "loss": 1.2263, + "step": 2982 + }, + { + "epoch": 0.2615845629592685, + "grad_norm": 0.09765625, + "learning_rate": 0.0027650832165811764, + "loss": 1.211, + "step": 2983 + }, + { + "epoch": 0.2616722547336431, + "grad_norm": 0.05810546875, + "learning_rate": 0.0027648533582958702, + "loss": 1.2051, + "step": 2984 + }, + { + "epoch": 0.26175994650801765, + "grad_norm": 0.064453125, + "learning_rate": 0.002764623398338815, + "loss": 1.2248, + "step": 2985 + }, + { + "epoch": 0.26184763828239216, + "grad_norm": 0.076171875, + "learning_rate": 0.0027643933367309838, + "loss": 1.159, + "step": 2986 + }, + { + "epoch": 0.2619353300567667, + "grad_norm": 0.06982421875, + "learning_rate": 0.0027641631734933564, + "loss": 1.292, + "step": 2987 + }, + { + "epoch": 0.2620230218311413, + "grad_norm": 0.05712890625, + "learning_rate": 0.0027639329086469244, + "loss": 1.2101, + "step": 2988 + }, + { + "epoch": 0.2621107136055158, + "grad_norm": 0.059326171875, + "learning_rate": 0.002763702542212687, + "loss": 1.2107, + "step": 2989 + }, + { + "epoch": 0.26219840537989036, + "grad_norm": 0.0615234375, + "learning_rate": 0.002763472074211652, + "loss": 1.2438, + "step": 2990 + }, + { + "epoch": 0.26228609715426493, + "grad_norm": 0.0654296875, + "learning_rate": 0.0027632415046648386, + "loss": 1.2295, + "step": 2991 + }, + { + "epoch": 0.26237378892863944, + "grad_norm": 0.0888671875, + "learning_rate": 0.0027630108335932745, + "loss": 1.1904, + "step": 2992 + }, + { + "epoch": 0.262461480703014, + "grad_norm": 0.11181640625, + "learning_rate": 0.0027627800610179943, + "loss": 1.2132, + "step": 2993 + }, + { + "epoch": 0.2625491724773885, + "grad_norm": 0.06884765625, + "learning_rate": 0.002762549186960046, + "loss": 1.2255, + "step": 2994 + }, + { + "epoch": 0.2626368642517631, + "grad_norm": 0.09521484375, + "learning_rate": 0.0027623182114404838, + "loss": 1.1881, + "step": 2995 + }, + { + "epoch": 0.26272455602613765, + "grad_norm": 0.06982421875, + "learning_rate": 0.0027620871344803716, + "loss": 1.211, + "step": 2996 + }, + { + "epoch": 0.26281224780051216, + "grad_norm": 0.119140625, + "learning_rate": 0.0027618559561007837, + "loss": 1.2015, + "step": 2997 + }, + { + "epoch": 0.2628999395748867, + "grad_norm": 0.07763671875, + "learning_rate": 0.0027616246763228036, + "loss": 1.2013, + "step": 2998 + }, + { + "epoch": 0.2629876313492613, + "grad_norm": 0.140625, + "learning_rate": 0.0027613932951675224, + "loss": 1.2283, + "step": 2999 + }, + { + "epoch": 0.2630753231236358, + "grad_norm": 0.09912109375, + "learning_rate": 0.0027611618126560417, + "loss": 1.2163, + "step": 3000 + }, + { + "epoch": 0.2630753231236358, + "eval_loss": 1.2262382507324219, + "eval_runtime": 428.7851, + "eval_samples_per_second": 33.693, + "eval_steps_per_second": 8.424, + "step": 3000 + }, + { + "epoch": 0.26316301489801036, + "grad_norm": 0.15234375, + "learning_rate": 0.002760930228809472, + "loss": 1.2914, + "step": 3001 + }, + { + "epoch": 0.2632507066723849, + "grad_norm": 0.11181640625, + "learning_rate": 0.002760698543648933, + "loss": 1.199, + "step": 3002 + }, + { + "epoch": 0.26333839844675944, + "grad_norm": 0.1025390625, + "learning_rate": 0.002760466757195554, + "loss": 1.2503, + "step": 3003 + }, + { + "epoch": 0.263426090221134, + "grad_norm": 0.0634765625, + "learning_rate": 0.0027602348694704738, + "loss": 1.1814, + "step": 3004 + }, + { + "epoch": 0.2635137819955085, + "grad_norm": 0.09912109375, + "learning_rate": 0.0027600028804948395, + "loss": 1.2841, + "step": 3005 + }, + { + "epoch": 0.2636014737698831, + "grad_norm": 0.076171875, + "learning_rate": 0.0027597707902898084, + "loss": 1.2287, + "step": 3006 + }, + { + "epoch": 0.26368916554425764, + "grad_norm": 0.07568359375, + "learning_rate": 0.002759538598876545, + "loss": 1.197, + "step": 3007 + }, + { + "epoch": 0.26377685731863215, + "grad_norm": 0.057861328125, + "learning_rate": 0.0027593063062762266, + "loss": 1.23, + "step": 3008 + }, + { + "epoch": 0.2638645490930067, + "grad_norm": 0.1162109375, + "learning_rate": 0.002759073912510036, + "loss": 1.278, + "step": 3009 + }, + { + "epoch": 0.2639522408673813, + "grad_norm": 0.0556640625, + "learning_rate": 0.002758841417599168, + "loss": 1.2244, + "step": 3010 + }, + { + "epoch": 0.2640399326417558, + "grad_norm": 0.08935546875, + "learning_rate": 0.0027586088215648254, + "loss": 1.1954, + "step": 3011 + }, + { + "epoch": 0.26412762441613036, + "grad_norm": 0.06005859375, + "learning_rate": 0.00275837612442822, + "loss": 1.2664, + "step": 3012 + }, + { + "epoch": 0.2642153161905049, + "grad_norm": 0.0732421875, + "learning_rate": 0.002758143326210573, + "loss": 1.1921, + "step": 3013 + }, + { + "epoch": 0.26430300796487943, + "grad_norm": 0.06396484375, + "learning_rate": 0.0027579104269331157, + "loss": 1.231, + "step": 3014 + }, + { + "epoch": 0.264390699739254, + "grad_norm": 0.0693359375, + "learning_rate": 0.0027576774266170876, + "loss": 1.2155, + "step": 3015 + }, + { + "epoch": 0.2644783915136285, + "grad_norm": 0.080078125, + "learning_rate": 0.0027574443252837374, + "loss": 1.1872, + "step": 3016 + }, + { + "epoch": 0.2645660832880031, + "grad_norm": 0.064453125, + "learning_rate": 0.0027572111229543232, + "loss": 1.1784, + "step": 3017 + }, + { + "epoch": 0.26465377506237764, + "grad_norm": 0.07568359375, + "learning_rate": 0.0027569778196501137, + "loss": 1.1992, + "step": 3018 + }, + { + "epoch": 0.26474146683675215, + "grad_norm": 0.068359375, + "learning_rate": 0.0027567444153923843, + "loss": 1.2336, + "step": 3019 + }, + { + "epoch": 0.2648291586111267, + "grad_norm": 0.0654296875, + "learning_rate": 0.0027565109102024206, + "loss": 1.2378, + "step": 3020 + }, + { + "epoch": 0.2649168503855013, + "grad_norm": 0.0751953125, + "learning_rate": 0.0027562773041015193, + "loss": 1.2282, + "step": 3021 + }, + { + "epoch": 0.2650045421598758, + "grad_norm": 0.07080078125, + "learning_rate": 0.002756043597110983, + "loss": 1.2049, + "step": 3022 + }, + { + "epoch": 0.26509223393425035, + "grad_norm": 0.087890625, + "learning_rate": 0.0027558097892521265, + "loss": 1.2567, + "step": 3023 + }, + { + "epoch": 0.2651799257086249, + "grad_norm": 0.0791015625, + "learning_rate": 0.002755575880546272, + "loss": 1.2378, + "step": 3024 + }, + { + "epoch": 0.26526761748299943, + "grad_norm": 0.076171875, + "learning_rate": 0.0027553418710147505, + "loss": 1.2389, + "step": 3025 + }, + { + "epoch": 0.265355309257374, + "grad_norm": 0.07861328125, + "learning_rate": 0.002755107760678904, + "loss": 1.2203, + "step": 3026 + }, + { + "epoch": 0.26544300103174856, + "grad_norm": 0.064453125, + "learning_rate": 0.002754873549560083, + "loss": 1.1923, + "step": 3027 + }, + { + "epoch": 0.26553069280612307, + "grad_norm": 0.06396484375, + "learning_rate": 0.002754639237679646, + "loss": 1.2175, + "step": 3028 + }, + { + "epoch": 0.26561838458049764, + "grad_norm": 0.06982421875, + "learning_rate": 0.0027544048250589623, + "loss": 1.2011, + "step": 3029 + }, + { + "epoch": 0.26570607635487214, + "grad_norm": 0.064453125, + "learning_rate": 0.002754170311719409, + "loss": 1.2183, + "step": 3030 + }, + { + "epoch": 0.2657937681292467, + "grad_norm": 0.0712890625, + "learning_rate": 0.0027539356976823744, + "loss": 1.1718, + "step": 3031 + }, + { + "epoch": 0.2658814599036213, + "grad_norm": 0.0556640625, + "learning_rate": 0.0027537009829692537, + "loss": 1.1575, + "step": 3032 + }, + { + "epoch": 0.2659691516779958, + "grad_norm": 0.091796875, + "learning_rate": 0.0027534661676014527, + "loss": 1.2039, + "step": 3033 + }, + { + "epoch": 0.26605684345237035, + "grad_norm": 0.068359375, + "learning_rate": 0.002753231251600386, + "loss": 1.2451, + "step": 3034 + }, + { + "epoch": 0.2661445352267449, + "grad_norm": 0.0732421875, + "learning_rate": 0.0027529962349874767, + "loss": 1.2158, + "step": 3035 + }, + { + "epoch": 0.2662322270011194, + "grad_norm": 0.06787109375, + "learning_rate": 0.002752761117784158, + "loss": 1.2443, + "step": 3036 + }, + { + "epoch": 0.266319918775494, + "grad_norm": 0.07958984375, + "learning_rate": 0.0027525259000118723, + "loss": 1.2247, + "step": 3037 + }, + { + "epoch": 0.26640761054986856, + "grad_norm": 0.06494140625, + "learning_rate": 0.002752290581692071, + "loss": 1.1676, + "step": 3038 + }, + { + "epoch": 0.26649530232424307, + "grad_norm": 0.150390625, + "learning_rate": 0.002752055162846214, + "loss": 1.1866, + "step": 3039 + }, + { + "epoch": 0.26658299409861763, + "grad_norm": 0.11376953125, + "learning_rate": 0.002751819643495771, + "loss": 1.2442, + "step": 3040 + }, + { + "epoch": 0.26667068587299214, + "grad_norm": 0.09912109375, + "learning_rate": 0.002751584023662221, + "loss": 1.1324, + "step": 3041 + }, + { + "epoch": 0.2667583776473667, + "grad_norm": 0.087890625, + "learning_rate": 0.0027513483033670513, + "loss": 1.2184, + "step": 3042 + }, + { + "epoch": 0.26684606942174127, + "grad_norm": 0.0791015625, + "learning_rate": 0.00275111248263176, + "loss": 1.2118, + "step": 3043 + }, + { + "epoch": 0.2669337611961158, + "grad_norm": 0.0595703125, + "learning_rate": 0.0027508765614778527, + "loss": 1.1679, + "step": 3044 + }, + { + "epoch": 0.26702145297049035, + "grad_norm": 0.0771484375, + "learning_rate": 0.0027506405399268445, + "loss": 1.1669, + "step": 3045 + }, + { + "epoch": 0.2671091447448649, + "grad_norm": 0.061279296875, + "learning_rate": 0.002750404418000261, + "loss": 1.2071, + "step": 3046 + }, + { + "epoch": 0.2671968365192394, + "grad_norm": 0.0703125, + "learning_rate": 0.002750168195719635, + "loss": 1.3092, + "step": 3047 + }, + { + "epoch": 0.267284528293614, + "grad_norm": 0.0693359375, + "learning_rate": 0.0027499318731065096, + "loss": 1.195, + "step": 3048 + }, + { + "epoch": 0.26737222006798855, + "grad_norm": 0.08056640625, + "learning_rate": 0.002749695450182437, + "loss": 1.2021, + "step": 3049 + }, + { + "epoch": 0.26745991184236306, + "grad_norm": 0.0859375, + "learning_rate": 0.0027494589269689775, + "loss": 1.1933, + "step": 3050 + }, + { + "epoch": 0.26754760361673763, + "grad_norm": 0.107421875, + "learning_rate": 0.0027492223034877024, + "loss": 1.2187, + "step": 3051 + }, + { + "epoch": 0.26763529539111214, + "grad_norm": 0.061767578125, + "learning_rate": 0.0027489855797601915, + "loss": 1.1839, + "step": 3052 + }, + { + "epoch": 0.2677229871654867, + "grad_norm": 0.10888671875, + "learning_rate": 0.002748748755808032, + "loss": 1.1965, + "step": 3053 + }, + { + "epoch": 0.26781067893986127, + "grad_norm": 0.056884765625, + "learning_rate": 0.0027485118316528225, + "loss": 1.2002, + "step": 3054 + }, + { + "epoch": 0.2678983707142358, + "grad_norm": 0.059814453125, + "learning_rate": 0.0027482748073161703, + "loss": 1.2611, + "step": 3055 + }, + { + "epoch": 0.26798606248861034, + "grad_norm": 0.0634765625, + "learning_rate": 0.00274803768281969, + "loss": 1.246, + "step": 3056 + }, + { + "epoch": 0.2680737542629849, + "grad_norm": 0.056640625, + "learning_rate": 0.002747800458185008, + "loss": 1.2235, + "step": 3057 + }, + { + "epoch": 0.2681614460373594, + "grad_norm": 0.06298828125, + "learning_rate": 0.0027475631334337586, + "loss": 1.1927, + "step": 3058 + }, + { + "epoch": 0.268249137811734, + "grad_norm": 0.06787109375, + "learning_rate": 0.0027473257085875837, + "loss": 1.2451, + "step": 3059 + }, + { + "epoch": 0.26833682958610855, + "grad_norm": 0.0810546875, + "learning_rate": 0.0027470881836681375, + "loss": 1.195, + "step": 3060 + }, + { + "epoch": 0.26842452136048306, + "grad_norm": 0.056396484375, + "learning_rate": 0.0027468505586970816, + "loss": 1.2327, + "step": 3061 + }, + { + "epoch": 0.2685122131348576, + "grad_norm": 0.1005859375, + "learning_rate": 0.002746612833696086, + "loss": 1.2268, + "step": 3062 + }, + { + "epoch": 0.2685999049092322, + "grad_norm": 0.078125, + "learning_rate": 0.00274637500868683, + "loss": 1.1877, + "step": 3063 + }, + { + "epoch": 0.2686875966836067, + "grad_norm": 0.1640625, + "learning_rate": 0.0027461370836910047, + "loss": 1.2453, + "step": 3064 + }, + { + "epoch": 0.26877528845798127, + "grad_norm": 0.064453125, + "learning_rate": 0.002745899058730306, + "loss": 1.1945, + "step": 3065 + }, + { + "epoch": 0.2688629802323558, + "grad_norm": 0.10107421875, + "learning_rate": 0.002745660933826443, + "loss": 1.2393, + "step": 3066 + }, + { + "epoch": 0.26895067200673034, + "grad_norm": 0.09326171875, + "learning_rate": 0.0027454227090011308, + "loss": 1.2457, + "step": 3067 + }, + { + "epoch": 0.2690383637811049, + "grad_norm": 0.115234375, + "learning_rate": 0.0027451843842760957, + "loss": 1.1826, + "step": 3068 + }, + { + "epoch": 0.2691260555554794, + "grad_norm": 0.076171875, + "learning_rate": 0.0027449459596730714, + "loss": 1.2722, + "step": 3069 + }, + { + "epoch": 0.269213747329854, + "grad_norm": 0.12158203125, + "learning_rate": 0.002744707435213803, + "loss": 1.2205, + "step": 3070 + }, + { + "epoch": 0.26930143910422855, + "grad_norm": 0.08251953125, + "learning_rate": 0.002744468810920042, + "loss": 1.2162, + "step": 3071 + }, + { + "epoch": 0.26938913087860306, + "grad_norm": 0.119140625, + "learning_rate": 0.0027442300868135506, + "loss": 1.2269, + "step": 3072 + }, + { + "epoch": 0.2694768226529776, + "grad_norm": 0.0703125, + "learning_rate": 0.0027439912629161, + "loss": 1.2111, + "step": 3073 + }, + { + "epoch": 0.2695645144273522, + "grad_norm": 0.0888671875, + "learning_rate": 0.002743752339249471, + "loss": 1.2563, + "step": 3074 + }, + { + "epoch": 0.2696522062017267, + "grad_norm": 0.06640625, + "learning_rate": 0.0027435133158354515, + "loss": 1.218, + "step": 3075 + }, + { + "epoch": 0.26973989797610126, + "grad_norm": 0.0810546875, + "learning_rate": 0.0027432741926958406, + "loss": 1.2171, + "step": 3076 + }, + { + "epoch": 0.26982758975047577, + "grad_norm": 0.07470703125, + "learning_rate": 0.0027430349698524463, + "loss": 1.1564, + "step": 3077 + }, + { + "epoch": 0.26991528152485034, + "grad_norm": 0.0830078125, + "learning_rate": 0.002742795647327083, + "loss": 1.277, + "step": 3078 + }, + { + "epoch": 0.2700029732992249, + "grad_norm": 0.08642578125, + "learning_rate": 0.0027425562251415787, + "loss": 1.2364, + "step": 3079 + }, + { + "epoch": 0.2700906650735994, + "grad_norm": 0.07080078125, + "learning_rate": 0.002742316703317767, + "loss": 1.2028, + "step": 3080 + }, + { + "epoch": 0.270178356847974, + "grad_norm": 0.0751953125, + "learning_rate": 0.0027420770818774916, + "loss": 1.2521, + "step": 3081 + }, + { + "epoch": 0.27026604862234854, + "grad_norm": 0.1025390625, + "learning_rate": 0.002741837360842606, + "loss": 1.269, + "step": 3082 + }, + { + "epoch": 0.27035374039672305, + "grad_norm": 0.068359375, + "learning_rate": 0.002741597540234971, + "loss": 1.2486, + "step": 3083 + }, + { + "epoch": 0.2704414321710976, + "grad_norm": 0.06787109375, + "learning_rate": 0.0027413576200764583, + "loss": 1.1683, + "step": 3084 + }, + { + "epoch": 0.2705291239454722, + "grad_norm": 0.08740234375, + "learning_rate": 0.0027411176003889486, + "loss": 1.2407, + "step": 3085 + }, + { + "epoch": 0.2706168157198467, + "grad_norm": 0.062255859375, + "learning_rate": 0.0027408774811943297, + "loss": 1.1782, + "step": 3086 + }, + { + "epoch": 0.27070450749422126, + "grad_norm": 0.061279296875, + "learning_rate": 0.0027406372625145012, + "loss": 1.2137, + "step": 3087 + }, + { + "epoch": 0.27079219926859577, + "grad_norm": 0.0673828125, + "learning_rate": 0.0027403969443713693, + "loss": 1.1782, + "step": 3088 + }, + { + "epoch": 0.27087989104297033, + "grad_norm": 0.072265625, + "learning_rate": 0.0027401565267868512, + "loss": 1.2019, + "step": 3089 + }, + { + "epoch": 0.2709675828173449, + "grad_norm": 0.05078125, + "learning_rate": 0.002739916009782872, + "loss": 1.2204, + "step": 3090 + }, + { + "epoch": 0.2710552745917194, + "grad_norm": 0.08837890625, + "learning_rate": 0.0027396753933813666, + "loss": 1.194, + "step": 3091 + }, + { + "epoch": 0.271142966366094, + "grad_norm": 0.1171875, + "learning_rate": 0.002739434677604278, + "loss": 1.259, + "step": 3092 + }, + { + "epoch": 0.27123065814046854, + "grad_norm": 0.060302734375, + "learning_rate": 0.002739193862473559, + "loss": 1.1818, + "step": 3093 + }, + { + "epoch": 0.27131834991484305, + "grad_norm": 0.064453125, + "learning_rate": 0.002738952948011172, + "loss": 1.2055, + "step": 3094 + }, + { + "epoch": 0.2714060416892176, + "grad_norm": 0.0546875, + "learning_rate": 0.0027387119342390867, + "loss": 1.1827, + "step": 3095 + }, + { + "epoch": 0.2714937334635922, + "grad_norm": 0.09228515625, + "learning_rate": 0.002738470821179284, + "loss": 1.1592, + "step": 3096 + }, + { + "epoch": 0.2715814252379667, + "grad_norm": 0.09033203125, + "learning_rate": 0.002738229608853752, + "loss": 1.2316, + "step": 3097 + }, + { + "epoch": 0.27166911701234125, + "grad_norm": 0.0654296875, + "learning_rate": 0.0027379882972844894, + "loss": 1.2042, + "step": 3098 + }, + { + "epoch": 0.2717568087867158, + "grad_norm": 0.07421875, + "learning_rate": 0.002737746886493502, + "loss": 1.168, + "step": 3099 + }, + { + "epoch": 0.27184450056109033, + "grad_norm": 0.059326171875, + "learning_rate": 0.0027375053765028067, + "loss": 1.2092, + "step": 3100 + }, + { + "epoch": 0.2719321923354649, + "grad_norm": 0.07373046875, + "learning_rate": 0.0027372637673344284, + "loss": 1.2274, + "step": 3101 + }, + { + "epoch": 0.2720198841098394, + "grad_norm": 0.0712890625, + "learning_rate": 0.0027370220590104016, + "loss": 1.2464, + "step": 3102 + }, + { + "epoch": 0.27210757588421397, + "grad_norm": 0.07763671875, + "learning_rate": 0.0027367802515527694, + "loss": 1.1675, + "step": 3103 + }, + { + "epoch": 0.27219526765858854, + "grad_norm": 0.061767578125, + "learning_rate": 0.002736538344983583, + "loss": 1.2383, + "step": 3104 + }, + { + "epoch": 0.27228295943296305, + "grad_norm": 0.07763671875, + "learning_rate": 0.0027362963393249046, + "loss": 1.2322, + "step": 3105 + }, + { + "epoch": 0.2723706512073376, + "grad_norm": 0.07177734375, + "learning_rate": 0.002736054234598805, + "loss": 1.1892, + "step": 3106 + }, + { + "epoch": 0.2724583429817122, + "grad_norm": 0.08935546875, + "learning_rate": 0.0027358120308273626, + "loss": 1.2003, + "step": 3107 + }, + { + "epoch": 0.2725460347560867, + "grad_norm": 0.0732421875, + "learning_rate": 0.002735569728032665, + "loss": 1.2341, + "step": 3108 + }, + { + "epoch": 0.27263372653046125, + "grad_norm": 0.11181640625, + "learning_rate": 0.0027353273262368115, + "loss": 1.2852, + "step": 3109 + }, + { + "epoch": 0.2727214183048358, + "grad_norm": 0.0546875, + "learning_rate": 0.0027350848254619076, + "loss": 1.1885, + "step": 3110 + }, + { + "epoch": 0.2728091100792103, + "grad_norm": 0.057373046875, + "learning_rate": 0.0027348422257300687, + "loss": 1.1697, + "step": 3111 + }, + { + "epoch": 0.2728968018535849, + "grad_norm": 0.06884765625, + "learning_rate": 0.0027345995270634193, + "loss": 1.1881, + "step": 3112 + }, + { + "epoch": 0.2729844936279594, + "grad_norm": 0.053955078125, + "learning_rate": 0.0027343567294840928, + "loss": 1.2431, + "step": 3113 + }, + { + "epoch": 0.27307218540233397, + "grad_norm": 0.09130859375, + "learning_rate": 0.0027341138330142316, + "loss": 1.2331, + "step": 3114 + }, + { + "epoch": 0.27315987717670853, + "grad_norm": 0.055908203125, + "learning_rate": 0.0027338708376759883, + "loss": 1.1984, + "step": 3115 + }, + { + "epoch": 0.27324756895108304, + "grad_norm": 0.0634765625, + "learning_rate": 0.002733627743491522, + "loss": 1.1997, + "step": 3116 + }, + { + "epoch": 0.2733352607254576, + "grad_norm": 0.0615234375, + "learning_rate": 0.0027333845504830027, + "loss": 1.1682, + "step": 3117 + }, + { + "epoch": 0.2734229524998322, + "grad_norm": 0.0751953125, + "learning_rate": 0.002733141258672609, + "loss": 1.2168, + "step": 3118 + }, + { + "epoch": 0.2735106442742067, + "grad_norm": 0.060546875, + "learning_rate": 0.0027328978680825292, + "loss": 1.1736, + "step": 3119 + }, + { + "epoch": 0.27359833604858125, + "grad_norm": 0.06982421875, + "learning_rate": 0.0027326543787349594, + "loss": 1.2487, + "step": 3120 + }, + { + "epoch": 0.2736860278229558, + "grad_norm": 0.06787109375, + "learning_rate": 0.0027324107906521043, + "loss": 1.1617, + "step": 3121 + }, + { + "epoch": 0.2737737195973303, + "grad_norm": 0.0849609375, + "learning_rate": 0.00273216710385618, + "loss": 1.2255, + "step": 3122 + }, + { + "epoch": 0.2738614113717049, + "grad_norm": 0.05615234375, + "learning_rate": 0.002731923318369409, + "loss": 1.2132, + "step": 3123 + }, + { + "epoch": 0.2739491031460794, + "grad_norm": 0.09716796875, + "learning_rate": 0.0027316794342140244, + "loss": 1.2828, + "step": 3124 + }, + { + "epoch": 0.27403679492045396, + "grad_norm": 0.054443359375, + "learning_rate": 0.0027314354514122686, + "loss": 1.2747, + "step": 3125 + }, + { + "epoch": 0.27412448669482853, + "grad_norm": 0.0830078125, + "learning_rate": 0.0027311913699863905, + "loss": 1.1524, + "step": 3126 + }, + { + "epoch": 0.27421217846920304, + "grad_norm": 0.0634765625, + "learning_rate": 0.002730947189958651, + "loss": 1.2235, + "step": 3127 + }, + { + "epoch": 0.2742998702435776, + "grad_norm": 0.1083984375, + "learning_rate": 0.002730702911351317, + "loss": 1.2542, + "step": 3128 + }, + { + "epoch": 0.27438756201795217, + "grad_norm": 0.06689453125, + "learning_rate": 0.0027304585341866687, + "loss": 1.212, + "step": 3129 + }, + { + "epoch": 0.2744752537923267, + "grad_norm": 0.09375, + "learning_rate": 0.0027302140584869903, + "loss": 1.1597, + "step": 3130 + }, + { + "epoch": 0.27456294556670124, + "grad_norm": 0.07666015625, + "learning_rate": 0.0027299694842745793, + "loss": 1.2361, + "step": 3131 + }, + { + "epoch": 0.2746506373410758, + "grad_norm": 0.0859375, + "learning_rate": 0.002729724811571739, + "loss": 1.1911, + "step": 3132 + }, + { + "epoch": 0.2747383291154503, + "grad_norm": 0.07080078125, + "learning_rate": 0.002729480040400783, + "loss": 1.2191, + "step": 3133 + }, + { + "epoch": 0.2748260208898249, + "grad_norm": 0.095703125, + "learning_rate": 0.002729235170784033, + "loss": 1.1737, + "step": 3134 + }, + { + "epoch": 0.2749137126641994, + "grad_norm": 0.06494140625, + "learning_rate": 0.0027289902027438227, + "loss": 1.1687, + "step": 3135 + }, + { + "epoch": 0.27500140443857396, + "grad_norm": 0.083984375, + "learning_rate": 0.0027287451363024917, + "loss": 1.1117, + "step": 3136 + }, + { + "epoch": 0.2750890962129485, + "grad_norm": 0.142578125, + "learning_rate": 0.002728499971482388, + "loss": 1.2436, + "step": 3137 + }, + { + "epoch": 0.27517678798732303, + "grad_norm": 0.09033203125, + "learning_rate": 0.0027282547083058713, + "loss": 1.2373, + "step": 3138 + }, + { + "epoch": 0.2752644797616976, + "grad_norm": 0.12451171875, + "learning_rate": 0.0027280093467953086, + "loss": 1.2195, + "step": 3139 + }, + { + "epoch": 0.27535217153607217, + "grad_norm": 0.06884765625, + "learning_rate": 0.0027277638869730764, + "loss": 1.1732, + "step": 3140 + }, + { + "epoch": 0.2754398633104467, + "grad_norm": 0.10205078125, + "learning_rate": 0.0027275183288615603, + "loss": 1.203, + "step": 3141 + }, + { + "epoch": 0.27552755508482124, + "grad_norm": 0.05517578125, + "learning_rate": 0.002727272672483154, + "loss": 1.1519, + "step": 3142 + }, + { + "epoch": 0.2756152468591958, + "grad_norm": 0.10302734375, + "learning_rate": 0.0027270269178602605, + "loss": 1.157, + "step": 3143 + }, + { + "epoch": 0.2757029386335703, + "grad_norm": 0.06884765625, + "learning_rate": 0.002726781065015293, + "loss": 1.1774, + "step": 3144 + }, + { + "epoch": 0.2757906304079449, + "grad_norm": 0.15234375, + "learning_rate": 0.002726535113970672, + "loss": 1.2341, + "step": 3145 + }, + { + "epoch": 0.27587832218231945, + "grad_norm": 0.07080078125, + "learning_rate": 0.0027262890647488276, + "loss": 1.2086, + "step": 3146 + }, + { + "epoch": 0.27596601395669396, + "grad_norm": 0.189453125, + "learning_rate": 0.002726042917372199, + "loss": 1.1979, + "step": 3147 + }, + { + "epoch": 0.2760537057310685, + "grad_norm": 0.0693359375, + "learning_rate": 0.0027257966718632343, + "loss": 1.1944, + "step": 3148 + }, + { + "epoch": 0.27614139750544303, + "grad_norm": 0.138671875, + "learning_rate": 0.0027255503282443898, + "loss": 1.2485, + "step": 3149 + }, + { + "epoch": 0.2762290892798176, + "grad_norm": 0.087890625, + "learning_rate": 0.0027253038865381316, + "loss": 1.1847, + "step": 3150 + }, + { + "epoch": 0.27631678105419216, + "grad_norm": 0.1083984375, + "learning_rate": 0.0027250573467669354, + "loss": 1.2059, + "step": 3151 + }, + { + "epoch": 0.27640447282856667, + "grad_norm": 0.10888671875, + "learning_rate": 0.0027248107089532844, + "loss": 1.2377, + "step": 3152 + }, + { + "epoch": 0.27649216460294124, + "grad_norm": 0.08447265625, + "learning_rate": 0.002724563973119671, + "loss": 1.2669, + "step": 3153 + }, + { + "epoch": 0.2765798563773158, + "grad_norm": 0.07763671875, + "learning_rate": 0.0027243171392885976, + "loss": 1.2524, + "step": 3154 + }, + { + "epoch": 0.2766675481516903, + "grad_norm": 0.064453125, + "learning_rate": 0.002724070207482575, + "loss": 1.1837, + "step": 3155 + }, + { + "epoch": 0.2767552399260649, + "grad_norm": 0.09033203125, + "learning_rate": 0.002723823177724121, + "loss": 1.1579, + "step": 3156 + }, + { + "epoch": 0.27684293170043944, + "grad_norm": 0.08154296875, + "learning_rate": 0.0027235760500357656, + "loss": 1.2003, + "step": 3157 + }, + { + "epoch": 0.27693062347481395, + "grad_norm": 0.10498046875, + "learning_rate": 0.002723328824440046, + "loss": 1.1946, + "step": 3158 + }, + { + "epoch": 0.2770183152491885, + "grad_norm": 0.08056640625, + "learning_rate": 0.0027230815009595083, + "loss": 1.1877, + "step": 3159 + }, + { + "epoch": 0.27710600702356303, + "grad_norm": 0.103515625, + "learning_rate": 0.002722834079616708, + "loss": 1.1973, + "step": 3160 + }, + { + "epoch": 0.2771936987979376, + "grad_norm": 0.07177734375, + "learning_rate": 0.0027225865604342094, + "loss": 1.162, + "step": 3161 + }, + { + "epoch": 0.27728139057231216, + "grad_norm": 0.06787109375, + "learning_rate": 0.0027223389434345847, + "loss": 1.2216, + "step": 3162 + }, + { + "epoch": 0.27736908234668667, + "grad_norm": 0.126953125, + "learning_rate": 0.002722091228640417, + "loss": 1.206, + "step": 3163 + }, + { + "epoch": 0.27745677412106123, + "grad_norm": 0.0859375, + "learning_rate": 0.0027218434160742972, + "loss": 1.2357, + "step": 3164 + }, + { + "epoch": 0.2775444658954358, + "grad_norm": 0.107421875, + "learning_rate": 0.0027215955057588244, + "loss": 1.1718, + "step": 3165 + }, + { + "epoch": 0.2776321576698103, + "grad_norm": 0.056640625, + "learning_rate": 0.0027213474977166078, + "loss": 1.2912, + "step": 3166 + }, + { + "epoch": 0.2777198494441849, + "grad_norm": 0.07666015625, + "learning_rate": 0.0027210993919702647, + "loss": 1.2118, + "step": 3167 + }, + { + "epoch": 0.27780754121855944, + "grad_norm": 0.10400390625, + "learning_rate": 0.002720851188542423, + "loss": 1.2573, + "step": 3168 + }, + { + "epoch": 0.27789523299293395, + "grad_norm": 0.057861328125, + "learning_rate": 0.0027206028874557165, + "loss": 1.1541, + "step": 3169 + }, + { + "epoch": 0.2779829247673085, + "grad_norm": 0.2236328125, + "learning_rate": 0.0027203544887327907, + "loss": 1.294, + "step": 3170 + }, + { + "epoch": 0.278070616541683, + "grad_norm": 0.09228515625, + "learning_rate": 0.0027201059923962984, + "loss": 1.2322, + "step": 3171 + }, + { + "epoch": 0.2781583083160576, + "grad_norm": 0.1748046875, + "learning_rate": 0.002719857398468902, + "loss": 1.2167, + "step": 3172 + }, + { + "epoch": 0.27824600009043216, + "grad_norm": 0.064453125, + "learning_rate": 0.0027196087069732734, + "loss": 1.2141, + "step": 3173 + }, + { + "epoch": 0.27833369186480666, + "grad_norm": 0.171875, + "learning_rate": 0.002719359917932091, + "loss": 1.2145, + "step": 3174 + }, + { + "epoch": 0.27842138363918123, + "grad_norm": 0.06298828125, + "learning_rate": 0.002719111031368045, + "loss": 1.2392, + "step": 3175 + }, + { + "epoch": 0.2785090754135558, + "grad_norm": 0.146484375, + "learning_rate": 0.002718862047303833, + "loss": 1.2095, + "step": 3176 + }, + { + "epoch": 0.2785967671879303, + "grad_norm": 0.06494140625, + "learning_rate": 0.0027186129657621617, + "loss": 1.1472, + "step": 3177 + }, + { + "epoch": 0.27868445896230487, + "grad_norm": 0.1318359375, + "learning_rate": 0.002718363786765746, + "loss": 1.1702, + "step": 3178 + }, + { + "epoch": 0.27877215073667944, + "grad_norm": 0.0859375, + "learning_rate": 0.0027181145103373112, + "loss": 1.2109, + "step": 3179 + }, + { + "epoch": 0.27885984251105395, + "grad_norm": 0.0810546875, + "learning_rate": 0.0027178651364995906, + "loss": 1.2122, + "step": 3180 + }, + { + "epoch": 0.2789475342854285, + "grad_norm": 0.1396484375, + "learning_rate": 0.0027176156652753258, + "loss": 1.1962, + "step": 3181 + }, + { + "epoch": 0.2790352260598031, + "grad_norm": 0.05908203125, + "learning_rate": 0.0027173660966872686, + "loss": 1.1764, + "step": 3182 + }, + { + "epoch": 0.2791229178341776, + "grad_norm": 0.10546875, + "learning_rate": 0.002717116430758178, + "loss": 1.2399, + "step": 3183 + }, + { + "epoch": 0.27921060960855215, + "grad_norm": 0.06103515625, + "learning_rate": 0.0027168666675108253, + "loss": 1.2251, + "step": 3184 + }, + { + "epoch": 0.27929830138292666, + "grad_norm": 0.06884765625, + "learning_rate": 0.0027166168069679857, + "loss": 1.1583, + "step": 3185 + }, + { + "epoch": 0.2793859931573012, + "grad_norm": 0.07666015625, + "learning_rate": 0.0027163668491524463, + "loss": 1.2055, + "step": 3186 + }, + { + "epoch": 0.2794736849316758, + "grad_norm": 0.0693359375, + "learning_rate": 0.002716116794087004, + "loss": 1.2419, + "step": 3187 + }, + { + "epoch": 0.2795613767060503, + "grad_norm": 0.05712890625, + "learning_rate": 0.0027158666417944617, + "loss": 1.2447, + "step": 3188 + }, + { + "epoch": 0.27964906848042487, + "grad_norm": 0.06005859375, + "learning_rate": 0.002715616392297633, + "loss": 1.2102, + "step": 3189 + }, + { + "epoch": 0.27973676025479943, + "grad_norm": 0.056396484375, + "learning_rate": 0.0027153660456193406, + "loss": 1.2258, + "step": 3190 + }, + { + "epoch": 0.27982445202917394, + "grad_norm": 0.06884765625, + "learning_rate": 0.002715115601782415, + "loss": 1.1866, + "step": 3191 + }, + { + "epoch": 0.2799121438035485, + "grad_norm": 0.06982421875, + "learning_rate": 0.0027148650608096957, + "loss": 1.2743, + "step": 3192 + }, + { + "epoch": 0.2799998355779231, + "grad_norm": 0.11181640625, + "learning_rate": 0.002714614422724032, + "loss": 1.2109, + "step": 3193 + }, + { + "epoch": 0.2800875273522976, + "grad_norm": 0.06640625, + "learning_rate": 0.0027143636875482818, + "loss": 1.1708, + "step": 3194 + }, + { + "epoch": 0.28017521912667215, + "grad_norm": 0.08154296875, + "learning_rate": 0.00271411285530531, + "loss": 1.1406, + "step": 3195 + }, + { + "epoch": 0.28026291090104666, + "grad_norm": 0.1005859375, + "learning_rate": 0.0027138619260179933, + "loss": 1.2615, + "step": 3196 + }, + { + "epoch": 0.2803506026754212, + "grad_norm": 0.09326171875, + "learning_rate": 0.002713610899709216, + "loss": 1.1542, + "step": 3197 + }, + { + "epoch": 0.2804382944497958, + "grad_norm": 0.0703125, + "learning_rate": 0.0027133597764018693, + "loss": 1.2493, + "step": 3198 + }, + { + "epoch": 0.2805259862241703, + "grad_norm": 0.09130859375, + "learning_rate": 0.002713108556118856, + "loss": 1.1763, + "step": 3199 + }, + { + "epoch": 0.28061367799854486, + "grad_norm": 0.0576171875, + "learning_rate": 0.002712857238883087, + "loss": 1.1743, + "step": 3200 + }, + { + "epoch": 0.28070136977291943, + "grad_norm": 0.0859375, + "learning_rate": 0.0027126058247174826, + "loss": 1.1941, + "step": 3201 + }, + { + "epoch": 0.28078906154729394, + "grad_norm": 0.06787109375, + "learning_rate": 0.0027123543136449687, + "loss": 1.2341, + "step": 3202 + }, + { + "epoch": 0.2808767533216685, + "grad_norm": 0.07373046875, + "learning_rate": 0.002712102705688484, + "loss": 1.2625, + "step": 3203 + }, + { + "epoch": 0.28096444509604307, + "grad_norm": 0.057373046875, + "learning_rate": 0.002711851000870975, + "loss": 1.1388, + "step": 3204 + }, + { + "epoch": 0.2810521368704176, + "grad_norm": 0.10595703125, + "learning_rate": 0.0027115991992153953, + "loss": 1.1956, + "step": 3205 + }, + { + "epoch": 0.28113982864479214, + "grad_norm": 0.06689453125, + "learning_rate": 0.0027113473007447098, + "loss": 1.187, + "step": 3206 + }, + { + "epoch": 0.28122752041916665, + "grad_norm": 0.0751953125, + "learning_rate": 0.00271109530548189, + "loss": 1.243, + "step": 3207 + }, + { + "epoch": 0.2813152121935412, + "grad_norm": 0.06591796875, + "learning_rate": 0.002710843213449917, + "loss": 1.1944, + "step": 3208 + }, + { + "epoch": 0.2814029039679158, + "grad_norm": 0.0947265625, + "learning_rate": 0.0027105910246717825, + "loss": 1.1856, + "step": 3209 + }, + { + "epoch": 0.2814905957422903, + "grad_norm": 0.0634765625, + "learning_rate": 0.002710338739170484, + "loss": 1.1999, + "step": 3210 + }, + { + "epoch": 0.28157828751666486, + "grad_norm": 0.0771484375, + "learning_rate": 0.002710086356969029, + "loss": 1.2298, + "step": 3211 + }, + { + "epoch": 0.2816659792910394, + "grad_norm": 0.0576171875, + "learning_rate": 0.002709833878090436, + "loss": 1.166, + "step": 3212 + }, + { + "epoch": 0.28175367106541394, + "grad_norm": 0.080078125, + "learning_rate": 0.0027095813025577284, + "loss": 1.2138, + "step": 3213 + }, + { + "epoch": 0.2818413628397885, + "grad_norm": 0.06103515625, + "learning_rate": 0.002709328630393942, + "loss": 1.2286, + "step": 3214 + }, + { + "epoch": 0.28192905461416307, + "grad_norm": 0.057861328125, + "learning_rate": 0.0027090758616221183, + "loss": 1.1712, + "step": 3215 + }, + { + "epoch": 0.2820167463885376, + "grad_norm": 0.10693359375, + "learning_rate": 0.0027088229962653108, + "loss": 1.2474, + "step": 3216 + }, + { + "epoch": 0.28210443816291214, + "grad_norm": 0.06396484375, + "learning_rate": 0.002708570034346579, + "loss": 1.1507, + "step": 3217 + }, + { + "epoch": 0.2821921299372867, + "grad_norm": 0.0966796875, + "learning_rate": 0.002708316975888993, + "loss": 1.167, + "step": 3218 + }, + { + "epoch": 0.2822798217116612, + "grad_norm": 0.0595703125, + "learning_rate": 0.0027080638209156303, + "loss": 1.1839, + "step": 3219 + }, + { + "epoch": 0.2823675134860358, + "grad_norm": 0.10302734375, + "learning_rate": 0.0027078105694495792, + "loss": 1.1495, + "step": 3220 + }, + { + "epoch": 0.2824552052604103, + "grad_norm": 0.06005859375, + "learning_rate": 0.002707557221513935, + "loss": 1.219, + "step": 3221 + }, + { + "epoch": 0.28254289703478486, + "grad_norm": 0.0771484375, + "learning_rate": 0.0027073037771318015, + "loss": 1.1879, + "step": 3222 + }, + { + "epoch": 0.2826305888091594, + "grad_norm": 0.07080078125, + "learning_rate": 0.002707050236326293, + "loss": 1.1974, + "step": 3223 + }, + { + "epoch": 0.28271828058353393, + "grad_norm": 0.08154296875, + "learning_rate": 0.0027067965991205323, + "loss": 1.2573, + "step": 3224 + }, + { + "epoch": 0.2828059723579085, + "grad_norm": 0.06787109375, + "learning_rate": 0.002706542865537649, + "loss": 1.2002, + "step": 3225 + }, + { + "epoch": 0.28289366413228306, + "grad_norm": 0.0791015625, + "learning_rate": 0.002706289035600785, + "loss": 1.2129, + "step": 3226 + }, + { + "epoch": 0.2829813559066576, + "grad_norm": 0.06396484375, + "learning_rate": 0.002706035109333087, + "loss": 1.2783, + "step": 3227 + }, + { + "epoch": 0.28306904768103214, + "grad_norm": 0.0830078125, + "learning_rate": 0.002705781086757713, + "loss": 1.3446, + "step": 3228 + }, + { + "epoch": 0.2831567394554067, + "grad_norm": 0.06982421875, + "learning_rate": 0.00270552696789783, + "loss": 1.1802, + "step": 3229 + }, + { + "epoch": 0.2832444312297812, + "grad_norm": 0.06689453125, + "learning_rate": 0.0027052727527766118, + "loss": 1.2642, + "step": 3230 + }, + { + "epoch": 0.2833321230041558, + "grad_norm": 0.060302734375, + "learning_rate": 0.002705018441417243, + "loss": 1.2062, + "step": 3231 + }, + { + "epoch": 0.2834198147785303, + "grad_norm": 0.0791015625, + "learning_rate": 0.0027047640338429156, + "loss": 1.1977, + "step": 3232 + }, + { + "epoch": 0.28350750655290485, + "grad_norm": 0.07421875, + "learning_rate": 0.0027045095300768317, + "loss": 1.2683, + "step": 3233 + }, + { + "epoch": 0.2835951983272794, + "grad_norm": 0.140625, + "learning_rate": 0.0027042549301422004, + "loss": 1.2624, + "step": 3234 + }, + { + "epoch": 0.28368289010165393, + "grad_norm": 0.05810546875, + "learning_rate": 0.0027040002340622407, + "loss": 1.1793, + "step": 3235 + }, + { + "epoch": 0.2837705818760285, + "grad_norm": 0.11572265625, + "learning_rate": 0.0027037454418601813, + "loss": 1.2047, + "step": 3236 + }, + { + "epoch": 0.28385827365040306, + "grad_norm": 0.08251953125, + "learning_rate": 0.0027034905535592573, + "loss": 1.2205, + "step": 3237 + }, + { + "epoch": 0.28394596542477757, + "grad_norm": 0.0927734375, + "learning_rate": 0.002703235569182715, + "loss": 1.2565, + "step": 3238 + }, + { + "epoch": 0.28403365719915213, + "grad_norm": 0.080078125, + "learning_rate": 0.0027029804887538076, + "loss": 1.1787, + "step": 3239 + }, + { + "epoch": 0.2841213489735267, + "grad_norm": 0.05908203125, + "learning_rate": 0.0027027253122957973, + "loss": 1.1655, + "step": 3240 + }, + { + "epoch": 0.2842090407479012, + "grad_norm": 0.064453125, + "learning_rate": 0.0027024700398319565, + "loss": 1.2108, + "step": 3241 + }, + { + "epoch": 0.2842967325222758, + "grad_norm": 0.055908203125, + "learning_rate": 0.002702214671385565, + "loss": 1.2202, + "step": 3242 + }, + { + "epoch": 0.2843844242966503, + "grad_norm": 0.064453125, + "learning_rate": 0.002701959206979912, + "loss": 1.1777, + "step": 3243 + }, + { + "epoch": 0.28447211607102485, + "grad_norm": 0.06982421875, + "learning_rate": 0.0027017036466382943, + "loss": 1.2016, + "step": 3244 + }, + { + "epoch": 0.2845598078453994, + "grad_norm": 0.10791015625, + "learning_rate": 0.002701447990384019, + "loss": 1.2301, + "step": 3245 + }, + { + "epoch": 0.2846474996197739, + "grad_norm": 0.060791015625, + "learning_rate": 0.0027011922382404016, + "loss": 1.2284, + "step": 3246 + }, + { + "epoch": 0.2847351913941485, + "grad_norm": 0.146484375, + "learning_rate": 0.0027009363902307654, + "loss": 1.2319, + "step": 3247 + }, + { + "epoch": 0.28482288316852306, + "grad_norm": 0.055908203125, + "learning_rate": 0.002700680446378444, + "loss": 1.2084, + "step": 3248 + }, + { + "epoch": 0.28491057494289757, + "grad_norm": 0.12255859375, + "learning_rate": 0.002700424406706778, + "loss": 1.2149, + "step": 3249 + }, + { + "epoch": 0.28499826671727213, + "grad_norm": 0.068359375, + "learning_rate": 0.0027001682712391173, + "loss": 1.1756, + "step": 3250 + }, + { + "epoch": 0.2850859584916467, + "grad_norm": 0.07470703125, + "learning_rate": 0.0026999120399988216, + "loss": 1.1716, + "step": 3251 + }, + { + "epoch": 0.2851736502660212, + "grad_norm": 0.0771484375, + "learning_rate": 0.0026996557130092584, + "loss": 1.2102, + "step": 3252 + }, + { + "epoch": 0.28526134204039577, + "grad_norm": 0.10986328125, + "learning_rate": 0.0026993992902938036, + "loss": 1.2236, + "step": 3253 + }, + { + "epoch": 0.28534903381477034, + "grad_norm": 0.06298828125, + "learning_rate": 0.0026991427718758426, + "loss": 1.2205, + "step": 3254 + }, + { + "epoch": 0.28543672558914485, + "grad_norm": 0.08740234375, + "learning_rate": 0.002698886157778769, + "loss": 1.182, + "step": 3255 + }, + { + "epoch": 0.2855244173635194, + "grad_norm": 0.07958984375, + "learning_rate": 0.002698629448025986, + "loss": 1.1327, + "step": 3256 + }, + { + "epoch": 0.2856121091378939, + "grad_norm": 0.053955078125, + "learning_rate": 0.0026983726426409037, + "loss": 1.1479, + "step": 3257 + }, + { + "epoch": 0.2856998009122685, + "grad_norm": 0.107421875, + "learning_rate": 0.002698115741646943, + "loss": 1.1775, + "step": 3258 + }, + { + "epoch": 0.28578749268664305, + "grad_norm": 0.0634765625, + "learning_rate": 0.0026978587450675324, + "loss": 1.1728, + "step": 3259 + }, + { + "epoch": 0.28587518446101756, + "grad_norm": 0.053466796875, + "learning_rate": 0.00269760165292611, + "loss": 1.2283, + "step": 3260 + }, + { + "epoch": 0.2859628762353921, + "grad_norm": 0.0771484375, + "learning_rate": 0.00269734446524612, + "loss": 1.1522, + "step": 3261 + }, + { + "epoch": 0.2860505680097667, + "grad_norm": 0.0615234375, + "learning_rate": 0.0026970871820510187, + "loss": 1.2, + "step": 3262 + }, + { + "epoch": 0.2861382597841412, + "grad_norm": 0.0888671875, + "learning_rate": 0.00269682980336427, + "loss": 1.2017, + "step": 3263 + }, + { + "epoch": 0.28622595155851577, + "grad_norm": 0.0712890625, + "learning_rate": 0.0026965723292093455, + "loss": 1.2271, + "step": 3264 + }, + { + "epoch": 0.28631364333289033, + "grad_norm": 0.05810546875, + "learning_rate": 0.002696314759609726, + "loss": 1.1372, + "step": 3265 + }, + { + "epoch": 0.28640133510726484, + "grad_norm": 0.06884765625, + "learning_rate": 0.0026960570945889013, + "loss": 1.1639, + "step": 3266 + }, + { + "epoch": 0.2864890268816394, + "grad_norm": 0.0654296875, + "learning_rate": 0.0026957993341703702, + "loss": 1.1657, + "step": 3267 + }, + { + "epoch": 0.2865767186560139, + "grad_norm": 0.06396484375, + "learning_rate": 0.0026955414783776397, + "loss": 1.2948, + "step": 3268 + }, + { + "epoch": 0.2866644104303885, + "grad_norm": 0.061279296875, + "learning_rate": 0.002695283527234225, + "loss": 1.2274, + "step": 3269 + }, + { + "epoch": 0.28675210220476305, + "grad_norm": 0.0693359375, + "learning_rate": 0.0026950254807636517, + "loss": 1.1702, + "step": 3270 + }, + { + "epoch": 0.28683979397913756, + "grad_norm": 0.06689453125, + "learning_rate": 0.0026947673389894518, + "loss": 1.2168, + "step": 3271 + }, + { + "epoch": 0.2869274857535121, + "grad_norm": 0.056396484375, + "learning_rate": 0.002694509101935168, + "loss": 1.1172, + "step": 3272 + }, + { + "epoch": 0.2870151775278867, + "grad_norm": 0.06884765625, + "learning_rate": 0.0026942507696243495, + "loss": 1.2251, + "step": 3273 + }, + { + "epoch": 0.2871028693022612, + "grad_norm": 0.055908203125, + "learning_rate": 0.002693992342080558, + "loss": 1.2433, + "step": 3274 + }, + { + "epoch": 0.28719056107663576, + "grad_norm": 0.0830078125, + "learning_rate": 0.002693733819327359, + "loss": 1.2236, + "step": 3275 + }, + { + "epoch": 0.28727825285101033, + "grad_norm": 0.09765625, + "learning_rate": 0.00269347520138833, + "loss": 1.1721, + "step": 3276 + }, + { + "epoch": 0.28736594462538484, + "grad_norm": 0.06298828125, + "learning_rate": 0.0026932164882870566, + "loss": 1.2073, + "step": 3277 + }, + { + "epoch": 0.2874536363997594, + "grad_norm": 0.07177734375, + "learning_rate": 0.0026929576800471325, + "loss": 1.2224, + "step": 3278 + }, + { + "epoch": 0.2875413281741339, + "grad_norm": 0.0927734375, + "learning_rate": 0.00269269877669216, + "loss": 1.2111, + "step": 3279 + }, + { + "epoch": 0.2876290199485085, + "grad_norm": 0.1005859375, + "learning_rate": 0.0026924397782457517, + "loss": 1.1894, + "step": 3280 + }, + { + "epoch": 0.28771671172288305, + "grad_norm": 0.068359375, + "learning_rate": 0.002692180684731526, + "loss": 1.2371, + "step": 3281 + }, + { + "epoch": 0.28780440349725755, + "grad_norm": 0.10693359375, + "learning_rate": 0.0026919214961731125, + "loss": 1.1913, + "step": 3282 + }, + { + "epoch": 0.2878920952716321, + "grad_norm": 0.055419921875, + "learning_rate": 0.002691662212594148, + "loss": 1.2234, + "step": 3283 + }, + { + "epoch": 0.2879797870460067, + "grad_norm": 0.08349609375, + "learning_rate": 0.0026914028340182788, + "loss": 1.1804, + "step": 3284 + }, + { + "epoch": 0.2880674788203812, + "grad_norm": 0.1337890625, + "learning_rate": 0.0026911433604691593, + "loss": 1.1984, + "step": 3285 + }, + { + "epoch": 0.28815517059475576, + "grad_norm": 0.0947265625, + "learning_rate": 0.0026908837919704534, + "loss": 1.2014, + "step": 3286 + }, + { + "epoch": 0.2882428623691303, + "grad_norm": 0.11572265625, + "learning_rate": 0.002690624128545833, + "loss": 1.214, + "step": 3287 + }, + { + "epoch": 0.28833055414350484, + "grad_norm": 0.059814453125, + "learning_rate": 0.0026903643702189776, + "loss": 1.2022, + "step": 3288 + }, + { + "epoch": 0.2884182459178794, + "grad_norm": 0.083984375, + "learning_rate": 0.0026901045170135782, + "loss": 1.1854, + "step": 3289 + }, + { + "epoch": 0.28850593769225397, + "grad_norm": 0.07861328125, + "learning_rate": 0.0026898445689533313, + "loss": 1.2531, + "step": 3290 + }, + { + "epoch": 0.2885936294666285, + "grad_norm": 0.083984375, + "learning_rate": 0.002689584526061944, + "loss": 1.1876, + "step": 3291 + }, + { + "epoch": 0.28868132124100304, + "grad_norm": 0.05517578125, + "learning_rate": 0.0026893243883631315, + "loss": 1.2351, + "step": 3292 + }, + { + "epoch": 0.28876901301537755, + "grad_norm": 0.059326171875, + "learning_rate": 0.0026890641558806184, + "loss": 1.2028, + "step": 3293 + }, + { + "epoch": 0.2888567047897521, + "grad_norm": 0.054931640625, + "learning_rate": 0.0026888038286381364, + "loss": 1.2048, + "step": 3294 + }, + { + "epoch": 0.2889443965641267, + "grad_norm": 0.052734375, + "learning_rate": 0.002688543406659427, + "loss": 1.1785, + "step": 3295 + }, + { + "epoch": 0.2890320883385012, + "grad_norm": 0.07373046875, + "learning_rate": 0.0026882828899682395, + "loss": 1.1823, + "step": 3296 + }, + { + "epoch": 0.28911978011287576, + "grad_norm": 0.1044921875, + "learning_rate": 0.002688022278588332, + "loss": 1.2352, + "step": 3297 + }, + { + "epoch": 0.2892074718872503, + "grad_norm": 0.083984375, + "learning_rate": 0.0026877615725434735, + "loss": 1.2245, + "step": 3298 + }, + { + "epoch": 0.28929516366162483, + "grad_norm": 0.07861328125, + "learning_rate": 0.002687500771857439, + "loss": 1.2448, + "step": 3299 + }, + { + "epoch": 0.2893828554359994, + "grad_norm": 0.138671875, + "learning_rate": 0.002687239876554011, + "loss": 1.2425, + "step": 3300 + }, + { + "epoch": 0.28947054721037396, + "grad_norm": 0.060791015625, + "learning_rate": 0.002686978886656985, + "loss": 1.2207, + "step": 3301 + }, + { + "epoch": 0.2895582389847485, + "grad_norm": 0.10009765625, + "learning_rate": 0.002686717802190161, + "loss": 1.1927, + "step": 3302 + }, + { + "epoch": 0.28964593075912304, + "grad_norm": 0.0810546875, + "learning_rate": 0.00268645662317735, + "loss": 1.2497, + "step": 3303 + }, + { + "epoch": 0.28973362253349755, + "grad_norm": 0.095703125, + "learning_rate": 0.00268619534964237, + "loss": 1.185, + "step": 3304 + }, + { + "epoch": 0.2898213143078721, + "grad_norm": 0.09033203125, + "learning_rate": 0.00268593398160905, + "loss": 1.1985, + "step": 3305 + }, + { + "epoch": 0.2899090060822467, + "grad_norm": 0.130859375, + "learning_rate": 0.0026856725191012234, + "loss": 1.176, + "step": 3306 + }, + { + "epoch": 0.2899966978566212, + "grad_norm": 0.08837890625, + "learning_rate": 0.0026854109621427384, + "loss": 1.2014, + "step": 3307 + }, + { + "epoch": 0.29008438963099575, + "grad_norm": 0.146484375, + "learning_rate": 0.0026851493107574458, + "loss": 1.2225, + "step": 3308 + }, + { + "epoch": 0.2901720814053703, + "grad_norm": 0.07568359375, + "learning_rate": 0.002684887564969208, + "loss": 1.2287, + "step": 3309 + }, + { + "epoch": 0.29025977317974483, + "grad_norm": 0.09619140625, + "learning_rate": 0.002684625724801896, + "loss": 1.1743, + "step": 3310 + }, + { + "epoch": 0.2903474649541194, + "grad_norm": 0.0634765625, + "learning_rate": 0.002684363790279389, + "loss": 1.2122, + "step": 3311 + }, + { + "epoch": 0.29043515672849396, + "grad_norm": 0.0732421875, + "learning_rate": 0.0026841017614255744, + "loss": 1.2261, + "step": 3312 + }, + { + "epoch": 0.29052284850286847, + "grad_norm": 0.0810546875, + "learning_rate": 0.002683839638264349, + "loss": 1.2134, + "step": 3313 + }, + { + "epoch": 0.29061054027724303, + "grad_norm": 0.05859375, + "learning_rate": 0.0026835774208196174, + "loss": 1.2287, + "step": 3314 + }, + { + "epoch": 0.29069823205161754, + "grad_norm": 0.0927734375, + "learning_rate": 0.0026833151091152927, + "loss": 1.1888, + "step": 3315 + }, + { + "epoch": 0.2907859238259921, + "grad_norm": 0.059326171875, + "learning_rate": 0.002683052703175298, + "loss": 1.2475, + "step": 3316 + }, + { + "epoch": 0.2908736156003667, + "grad_norm": 0.130859375, + "learning_rate": 0.0026827902030235637, + "loss": 1.2276, + "step": 3317 + }, + { + "epoch": 0.2909613073747412, + "grad_norm": 0.06884765625, + "learning_rate": 0.002682527608684028, + "loss": 1.1959, + "step": 3318 + }, + { + "epoch": 0.29104899914911575, + "grad_norm": 0.08447265625, + "learning_rate": 0.002682264920180641, + "loss": 1.2085, + "step": 3319 + }, + { + "epoch": 0.2911366909234903, + "grad_norm": 0.057861328125, + "learning_rate": 0.0026820021375373575, + "loss": 1.2389, + "step": 3320 + }, + { + "epoch": 0.2912243826978648, + "grad_norm": 0.06884765625, + "learning_rate": 0.0026817392607781433, + "loss": 1.2551, + "step": 3321 + }, + { + "epoch": 0.2913120744722394, + "grad_norm": 0.062255859375, + "learning_rate": 0.002681476289926972, + "loss": 1.2067, + "step": 3322 + }, + { + "epoch": 0.29139976624661396, + "grad_norm": 0.0830078125, + "learning_rate": 0.0026812132250078257, + "loss": 1.2492, + "step": 3323 + }, + { + "epoch": 0.29148745802098847, + "grad_norm": 0.05712890625, + "learning_rate": 0.0026809500660446954, + "loss": 1.1585, + "step": 3324 + }, + { + "epoch": 0.29157514979536303, + "grad_norm": 0.0986328125, + "learning_rate": 0.0026806868130615806, + "loss": 1.2225, + "step": 3325 + }, + { + "epoch": 0.29166284156973754, + "grad_norm": 0.05615234375, + "learning_rate": 0.002680423466082489, + "loss": 1.2486, + "step": 3326 + }, + { + "epoch": 0.2917505333441121, + "grad_norm": 0.0732421875, + "learning_rate": 0.0026801600251314364, + "loss": 1.1767, + "step": 3327 + }, + { + "epoch": 0.29183822511848667, + "grad_norm": 0.07470703125, + "learning_rate": 0.00267989649023245, + "loss": 1.2791, + "step": 3328 + }, + { + "epoch": 0.2919259168928612, + "grad_norm": 0.07373046875, + "learning_rate": 0.0026796328614095625, + "loss": 1.19, + "step": 3329 + }, + { + "epoch": 0.29201360866723575, + "grad_norm": 0.058349609375, + "learning_rate": 0.002679369138686816, + "loss": 1.1732, + "step": 3330 + }, + { + "epoch": 0.2921013004416103, + "grad_norm": 0.061767578125, + "learning_rate": 0.0026791053220882603, + "loss": 1.1711, + "step": 3331 + }, + { + "epoch": 0.2921889922159848, + "grad_norm": 0.054443359375, + "learning_rate": 0.0026788414116379567, + "loss": 1.2378, + "step": 3332 + }, + { + "epoch": 0.2922766839903594, + "grad_norm": 0.07421875, + "learning_rate": 0.002678577407359972, + "loss": 1.2414, + "step": 3333 + }, + { + "epoch": 0.29236437576473395, + "grad_norm": 0.06982421875, + "learning_rate": 0.0026783133092783836, + "loss": 1.1671, + "step": 3334 + }, + { + "epoch": 0.29245206753910846, + "grad_norm": 0.1484375, + "learning_rate": 0.002678049117417275, + "loss": 1.2867, + "step": 3335 + }, + { + "epoch": 0.292539759313483, + "grad_norm": 0.0830078125, + "learning_rate": 0.0026777848318007415, + "loss": 1.1654, + "step": 3336 + }, + { + "epoch": 0.2926274510878576, + "grad_norm": 0.064453125, + "learning_rate": 0.0026775204524528843, + "loss": 1.1988, + "step": 3337 + }, + { + "epoch": 0.2927151428622321, + "grad_norm": 0.109375, + "learning_rate": 0.0026772559793978145, + "loss": 1.188, + "step": 3338 + }, + { + "epoch": 0.29280283463660667, + "grad_norm": 0.07763671875, + "learning_rate": 0.002676991412659651, + "loss": 1.1591, + "step": 3339 + }, + { + "epoch": 0.2928905264109812, + "grad_norm": 0.09130859375, + "learning_rate": 0.0026767267522625217, + "loss": 1.1497, + "step": 3340 + }, + { + "epoch": 0.29297821818535574, + "grad_norm": 0.05859375, + "learning_rate": 0.0026764619982305635, + "loss": 1.2675, + "step": 3341 + }, + { + "epoch": 0.2930659099597303, + "grad_norm": 0.08984375, + "learning_rate": 0.0026761971505879206, + "loss": 1.1982, + "step": 3342 + }, + { + "epoch": 0.2931536017341048, + "grad_norm": 0.059814453125, + "learning_rate": 0.002675932209358747, + "loss": 1.2707, + "step": 3343 + }, + { + "epoch": 0.2932412935084794, + "grad_norm": 0.12109375, + "learning_rate": 0.002675667174567204, + "loss": 1.1987, + "step": 3344 + }, + { + "epoch": 0.29332898528285395, + "grad_norm": 0.0634765625, + "learning_rate": 0.0026754020462374625, + "loss": 1.2035, + "step": 3345 + }, + { + "epoch": 0.29341667705722846, + "grad_norm": 0.11083984375, + "learning_rate": 0.0026751368243937016, + "loss": 1.1739, + "step": 3346 + }, + { + "epoch": 0.293504368831603, + "grad_norm": 0.054931640625, + "learning_rate": 0.0026748715090601084, + "loss": 1.2262, + "step": 3347 + }, + { + "epoch": 0.2935920606059776, + "grad_norm": 0.103515625, + "learning_rate": 0.00267460610026088, + "loss": 1.194, + "step": 3348 + }, + { + "epoch": 0.2936797523803521, + "grad_norm": 0.11767578125, + "learning_rate": 0.0026743405980202196, + "loss": 1.2192, + "step": 3349 + }, + { + "epoch": 0.29376744415472666, + "grad_norm": 0.1025390625, + "learning_rate": 0.002674075002362341, + "loss": 1.2768, + "step": 3350 + }, + { + "epoch": 0.2938551359291012, + "grad_norm": 0.130859375, + "learning_rate": 0.002673809313311466, + "loss": 1.202, + "step": 3351 + }, + { + "epoch": 0.29394282770347574, + "grad_norm": 0.07177734375, + "learning_rate": 0.002673543530891824, + "loss": 1.1624, + "step": 3352 + }, + { + "epoch": 0.2940305194778503, + "grad_norm": 0.1474609375, + "learning_rate": 0.0026732776551276548, + "loss": 1.1743, + "step": 3353 + }, + { + "epoch": 0.2941182112522248, + "grad_norm": 0.07568359375, + "learning_rate": 0.0026730116860432048, + "loss": 1.2838, + "step": 3354 + }, + { + "epoch": 0.2942059030265994, + "grad_norm": 0.09912109375, + "learning_rate": 0.00267274562366273, + "loss": 1.1809, + "step": 3355 + }, + { + "epoch": 0.29429359480097395, + "grad_norm": 0.054443359375, + "learning_rate": 0.002672479468010495, + "loss": 1.1581, + "step": 3356 + }, + { + "epoch": 0.29438128657534846, + "grad_norm": 0.068359375, + "learning_rate": 0.0026722132191107717, + "loss": 1.2221, + "step": 3357 + }, + { + "epoch": 0.294468978349723, + "grad_norm": 0.0673828125, + "learning_rate": 0.0026719468769878414, + "loss": 1.1325, + "step": 3358 + }, + { + "epoch": 0.2945566701240976, + "grad_norm": 0.0546875, + "learning_rate": 0.002671680441665994, + "loss": 1.2125, + "step": 3359 + }, + { + "epoch": 0.2946443618984721, + "grad_norm": 0.072265625, + "learning_rate": 0.002671413913169528, + "loss": 1.1941, + "step": 3360 + }, + { + "epoch": 0.29473205367284666, + "grad_norm": 0.0634765625, + "learning_rate": 0.00267114729152275, + "loss": 1.2597, + "step": 3361 + }, + { + "epoch": 0.29481974544722117, + "grad_norm": 0.06591796875, + "learning_rate": 0.0026708805767499755, + "loss": 1.2055, + "step": 3362 + }, + { + "epoch": 0.29490743722159574, + "grad_norm": 0.087890625, + "learning_rate": 0.0026706137688755267, + "loss": 1.2154, + "step": 3363 + }, + { + "epoch": 0.2949951289959703, + "grad_norm": 0.0654296875, + "learning_rate": 0.0026703468679237377, + "loss": 1.2426, + "step": 3364 + }, + { + "epoch": 0.2950828207703448, + "grad_norm": 0.06494140625, + "learning_rate": 0.002670079873918948, + "loss": 1.2014, + "step": 3365 + }, + { + "epoch": 0.2951705125447194, + "grad_norm": 0.05224609375, + "learning_rate": 0.0026698127868855074, + "loss": 1.127, + "step": 3366 + }, + { + "epoch": 0.29525820431909394, + "grad_norm": 0.08056640625, + "learning_rate": 0.002669545606847773, + "loss": 1.1882, + "step": 3367 + }, + { + "epoch": 0.29534589609346845, + "grad_norm": 0.06201171875, + "learning_rate": 0.0026692783338301117, + "loss": 1.1638, + "step": 3368 + }, + { + "epoch": 0.295433587867843, + "grad_norm": 0.0732421875, + "learning_rate": 0.002669010967856897, + "loss": 1.1676, + "step": 3369 + }, + { + "epoch": 0.2955212796422176, + "grad_norm": 0.09814453125, + "learning_rate": 0.002668743508952513, + "loss": 1.2229, + "step": 3370 + }, + { + "epoch": 0.2956089714165921, + "grad_norm": 0.08251953125, + "learning_rate": 0.00266847595714135, + "loss": 1.2415, + "step": 3371 + }, + { + "epoch": 0.29569666319096666, + "grad_norm": 0.06201171875, + "learning_rate": 0.00266820831244781, + "loss": 1.2129, + "step": 3372 + }, + { + "epoch": 0.2957843549653412, + "grad_norm": 0.07275390625, + "learning_rate": 0.0026679405748963, + "loss": 1.1786, + "step": 3373 + }, + { + "epoch": 0.29587204673971573, + "grad_norm": 0.06591796875, + "learning_rate": 0.0026676727445112372, + "loss": 1.2688, + "step": 3374 + }, + { + "epoch": 0.2959597385140903, + "grad_norm": 0.052978515625, + "learning_rate": 0.0026674048213170473, + "loss": 1.1816, + "step": 3375 + }, + { + "epoch": 0.2960474302884648, + "grad_norm": 0.053466796875, + "learning_rate": 0.0026671368053381643, + "loss": 1.2277, + "step": 3376 + }, + { + "epoch": 0.2961351220628394, + "grad_norm": 0.052490234375, + "learning_rate": 0.0026668686965990304, + "loss": 1.2234, + "step": 3377 + }, + { + "epoch": 0.29622281383721394, + "grad_norm": 0.053466796875, + "learning_rate": 0.002666600495124096, + "loss": 1.2073, + "step": 3378 + }, + { + "epoch": 0.29631050561158845, + "grad_norm": 0.0810546875, + "learning_rate": 0.002666332200937821, + "loss": 1.1695, + "step": 3379 + }, + { + "epoch": 0.296398197385963, + "grad_norm": 0.05224609375, + "learning_rate": 0.002666063814064673, + "loss": 1.1747, + "step": 3380 + }, + { + "epoch": 0.2964858891603376, + "grad_norm": 0.08447265625, + "learning_rate": 0.0026657953345291282, + "loss": 1.2501, + "step": 3381 + }, + { + "epoch": 0.2965735809347121, + "grad_norm": 0.0771484375, + "learning_rate": 0.0026655267623556713, + "loss": 1.1619, + "step": 3382 + }, + { + "epoch": 0.29666127270908665, + "grad_norm": 0.07958984375, + "learning_rate": 0.002665258097568795, + "loss": 1.226, + "step": 3383 + }, + { + "epoch": 0.2967489644834612, + "grad_norm": 0.0791015625, + "learning_rate": 0.002664989340193001, + "loss": 1.2084, + "step": 3384 + }, + { + "epoch": 0.29683665625783573, + "grad_norm": 0.07080078125, + "learning_rate": 0.002664720490252799, + "loss": 1.2736, + "step": 3385 + }, + { + "epoch": 0.2969243480322103, + "grad_norm": 0.0634765625, + "learning_rate": 0.0026644515477727082, + "loss": 1.1786, + "step": 3386 + }, + { + "epoch": 0.2970120398065848, + "grad_norm": 0.10693359375, + "learning_rate": 0.002664182512777255, + "loss": 1.1718, + "step": 3387 + }, + { + "epoch": 0.29709973158095937, + "grad_norm": 0.0703125, + "learning_rate": 0.002663913385290975, + "loss": 1.194, + "step": 3388 + }, + { + "epoch": 0.29718742335533394, + "grad_norm": 0.162109375, + "learning_rate": 0.002663644165338411, + "loss": 1.1978, + "step": 3389 + }, + { + "epoch": 0.29727511512970844, + "grad_norm": 0.0693359375, + "learning_rate": 0.0026633748529441164, + "loss": 1.2503, + "step": 3390 + }, + { + "epoch": 0.297362806904083, + "grad_norm": 0.1787109375, + "learning_rate": 0.002663105448132651, + "loss": 1.2191, + "step": 3391 + }, + { + "epoch": 0.2974504986784576, + "grad_norm": 0.080078125, + "learning_rate": 0.0026628359509285843, + "loss": 1.1757, + "step": 3392 + }, + { + "epoch": 0.2975381904528321, + "grad_norm": 0.126953125, + "learning_rate": 0.0026625663613564932, + "loss": 1.1931, + "step": 3393 + }, + { + "epoch": 0.29762588222720665, + "grad_norm": 0.0849609375, + "learning_rate": 0.0026622966794409638, + "loss": 1.2169, + "step": 3394 + }, + { + "epoch": 0.2977135740015812, + "grad_norm": 0.06787109375, + "learning_rate": 0.0026620269052065907, + "loss": 1.1785, + "step": 3395 + }, + { + "epoch": 0.2978012657759557, + "grad_norm": 0.09521484375, + "learning_rate": 0.0026617570386779765, + "loss": 1.1919, + "step": 3396 + }, + { + "epoch": 0.2978889575503303, + "grad_norm": 0.07275390625, + "learning_rate": 0.002661487079879732, + "loss": 1.2444, + "step": 3397 + }, + { + "epoch": 0.2979766493247048, + "grad_norm": 0.0908203125, + "learning_rate": 0.002661217028836477, + "loss": 1.1825, + "step": 3398 + }, + { + "epoch": 0.29806434109907937, + "grad_norm": 0.06591796875, + "learning_rate": 0.0026609468855728394, + "loss": 1.2251, + "step": 3399 + }, + { + "epoch": 0.29815203287345393, + "grad_norm": 0.10302734375, + "learning_rate": 0.0026606766501134555, + "loss": 1.1071, + "step": 3400 + }, + { + "epoch": 0.29823972464782844, + "grad_norm": 0.06298828125, + "learning_rate": 0.00266040632248297, + "loss": 1.1983, + "step": 3401 + }, + { + "epoch": 0.298327416422203, + "grad_norm": 0.130859375, + "learning_rate": 0.0026601359027060363, + "loss": 1.2267, + "step": 3402 + }, + { + "epoch": 0.29841510819657757, + "grad_norm": 0.06201171875, + "learning_rate": 0.0026598653908073166, + "loss": 1.2106, + "step": 3403 + }, + { + "epoch": 0.2985027999709521, + "grad_norm": 0.10009765625, + "learning_rate": 0.0026595947868114796, + "loss": 1.1717, + "step": 3404 + }, + { + "epoch": 0.29859049174532665, + "grad_norm": 0.09521484375, + "learning_rate": 0.0026593240907432044, + "loss": 1.2218, + "step": 3405 + }, + { + "epoch": 0.2986781835197012, + "grad_norm": 0.0546875, + "learning_rate": 0.002659053302627178, + "loss": 1.1307, + "step": 3406 + }, + { + "epoch": 0.2987658752940757, + "grad_norm": 0.11181640625, + "learning_rate": 0.0026587824224880946, + "loss": 1.2031, + "step": 3407 + }, + { + "epoch": 0.2988535670684503, + "grad_norm": 0.058837890625, + "learning_rate": 0.002658511450350659, + "loss": 1.2752, + "step": 3408 + }, + { + "epoch": 0.29894125884282485, + "grad_norm": 0.0703125, + "learning_rate": 0.0026582403862395825, + "loss": 1.2815, + "step": 3409 + }, + { + "epoch": 0.29902895061719936, + "grad_norm": 0.059326171875, + "learning_rate": 0.0026579692301795855, + "loss": 1.2104, + "step": 3410 + }, + { + "epoch": 0.29911664239157393, + "grad_norm": 0.064453125, + "learning_rate": 0.0026576979821953963, + "loss": 1.2245, + "step": 3411 + }, + { + "epoch": 0.29920433416594844, + "grad_norm": 0.08837890625, + "learning_rate": 0.0026574266423117525, + "loss": 1.1756, + "step": 3412 + }, + { + "epoch": 0.299292025940323, + "grad_norm": 0.0771484375, + "learning_rate": 0.0026571552105534003, + "loss": 1.2237, + "step": 3413 + }, + { + "epoch": 0.29937971771469757, + "grad_norm": 0.07373046875, + "learning_rate": 0.0026568836869450925, + "loss": 1.2208, + "step": 3414 + }, + { + "epoch": 0.2994674094890721, + "grad_norm": 0.08740234375, + "learning_rate": 0.0026566120715115914, + "loss": 1.2336, + "step": 3415 + }, + { + "epoch": 0.29955510126344664, + "grad_norm": 0.052001953125, + "learning_rate": 0.0026563403642776684, + "loss": 1.1912, + "step": 3416 + }, + { + "epoch": 0.2996427930378212, + "grad_norm": 0.08447265625, + "learning_rate": 0.002656068565268102, + "loss": 1.1874, + "step": 3417 + }, + { + "epoch": 0.2997304848121957, + "grad_norm": 0.0576171875, + "learning_rate": 0.002655796674507679, + "loss": 1.252, + "step": 3418 + }, + { + "epoch": 0.2998181765865703, + "grad_norm": 0.09130859375, + "learning_rate": 0.0026555246920211962, + "loss": 1.2643, + "step": 3419 + }, + { + "epoch": 0.29990586836094485, + "grad_norm": 0.0771484375, + "learning_rate": 0.0026552526178334574, + "loss": 1.2865, + "step": 3420 + }, + { + "epoch": 0.29999356013531936, + "grad_norm": 0.0625, + "learning_rate": 0.0026549804519692744, + "loss": 1.2064, + "step": 3421 + }, + { + "epoch": 0.3000812519096939, + "grad_norm": 0.0703125, + "learning_rate": 0.0026547081944534694, + "loss": 1.2559, + "step": 3422 + }, + { + "epoch": 0.30016894368406843, + "grad_norm": 0.07470703125, + "learning_rate": 0.0026544358453108705, + "loss": 1.1246, + "step": 3423 + }, + { + "epoch": 0.300256635458443, + "grad_norm": 0.0654296875, + "learning_rate": 0.0026541634045663147, + "loss": 1.1764, + "step": 3424 + }, + { + "epoch": 0.30034432723281757, + "grad_norm": 0.06884765625, + "learning_rate": 0.0026538908722446492, + "loss": 1.2398, + "step": 3425 + }, + { + "epoch": 0.3004320190071921, + "grad_norm": 0.06884765625, + "learning_rate": 0.0026536182483707276, + "loss": 1.2528, + "step": 3426 + }, + { + "epoch": 0.30051971078156664, + "grad_norm": 0.064453125, + "learning_rate": 0.002653345532969412, + "loss": 1.1639, + "step": 3427 + }, + { + "epoch": 0.3006074025559412, + "grad_norm": 0.0625, + "learning_rate": 0.0026530727260655747, + "loss": 1.2688, + "step": 3428 + }, + { + "epoch": 0.3006950943303157, + "grad_norm": 0.0810546875, + "learning_rate": 0.002652799827684094, + "loss": 1.1951, + "step": 3429 + }, + { + "epoch": 0.3007827861046903, + "grad_norm": 0.060791015625, + "learning_rate": 0.0026525268378498577, + "loss": 1.1979, + "step": 3430 + }, + { + "epoch": 0.30087047787906485, + "grad_norm": 0.0771484375, + "learning_rate": 0.0026522537565877614, + "loss": 1.1978, + "step": 3431 + }, + { + "epoch": 0.30095816965343936, + "grad_norm": 0.11279296875, + "learning_rate": 0.00265198058392271, + "loss": 1.2794, + "step": 3432 + }, + { + "epoch": 0.3010458614278139, + "grad_norm": 0.091796875, + "learning_rate": 0.0026517073198796164, + "loss": 1.1975, + "step": 3433 + }, + { + "epoch": 0.30113355320218843, + "grad_norm": 0.08740234375, + "learning_rate": 0.002651433964483401, + "loss": 1.2226, + "step": 3434 + }, + { + "epoch": 0.301221244976563, + "grad_norm": 0.07568359375, + "learning_rate": 0.0026511605177589933, + "loss": 1.1545, + "step": 3435 + }, + { + "epoch": 0.30130893675093756, + "grad_norm": 0.09375, + "learning_rate": 0.0026508869797313302, + "loss": 1.2286, + "step": 3436 + }, + { + "epoch": 0.30139662852531207, + "grad_norm": 0.061767578125, + "learning_rate": 0.002650613350425359, + "loss": 1.2159, + "step": 3437 + }, + { + "epoch": 0.30148432029968664, + "grad_norm": 0.11083984375, + "learning_rate": 0.002650339629866033, + "loss": 1.2146, + "step": 3438 + }, + { + "epoch": 0.3015720120740612, + "grad_norm": 0.05615234375, + "learning_rate": 0.0026500658180783152, + "loss": 1.2633, + "step": 3439 + }, + { + "epoch": 0.3016597038484357, + "grad_norm": 0.08251953125, + "learning_rate": 0.002649791915087176, + "loss": 1.2285, + "step": 3440 + }, + { + "epoch": 0.3017473956228103, + "grad_norm": 0.06640625, + "learning_rate": 0.0026495179209175958, + "loss": 1.1456, + "step": 3441 + }, + { + "epoch": 0.30183508739718484, + "grad_norm": 0.059814453125, + "learning_rate": 0.0026492438355945613, + "loss": 1.2146, + "step": 3442 + }, + { + "epoch": 0.30192277917155935, + "grad_norm": 0.054443359375, + "learning_rate": 0.0026489696591430684, + "loss": 1.1624, + "step": 3443 + }, + { + "epoch": 0.3020104709459339, + "grad_norm": 0.06689453125, + "learning_rate": 0.0026486953915881213, + "loss": 1.2009, + "step": 3444 + }, + { + "epoch": 0.3020981627203085, + "grad_norm": 0.05615234375, + "learning_rate": 0.0026484210329547325, + "loss": 1.1907, + "step": 3445 + }, + { + "epoch": 0.302185854494683, + "grad_norm": 0.05419921875, + "learning_rate": 0.002648146583267923, + "loss": 1.1996, + "step": 3446 + }, + { + "epoch": 0.30227354626905756, + "grad_norm": 0.06103515625, + "learning_rate": 0.002647872042552722, + "loss": 1.1837, + "step": 3447 + }, + { + "epoch": 0.30236123804343207, + "grad_norm": 0.08056640625, + "learning_rate": 0.0026475974108341664, + "loss": 1.1951, + "step": 3448 + }, + { + "epoch": 0.30244892981780663, + "grad_norm": 0.0634765625, + "learning_rate": 0.002647322688137303, + "loss": 1.2407, + "step": 3449 + }, + { + "epoch": 0.3025366215921812, + "grad_norm": 0.123046875, + "learning_rate": 0.002647047874487184, + "loss": 1.1998, + "step": 3450 + }, + { + "epoch": 0.3026243133665557, + "grad_norm": 0.05810546875, + "learning_rate": 0.0026467729699088736, + "loss": 1.2456, + "step": 3451 + }, + { + "epoch": 0.3027120051409303, + "grad_norm": 0.1318359375, + "learning_rate": 0.002646497974427441, + "loss": 1.1674, + "step": 3452 + }, + { + "epoch": 0.30279969691530484, + "grad_norm": 0.058837890625, + "learning_rate": 0.0026462228880679657, + "loss": 1.221, + "step": 3453 + }, + { + "epoch": 0.30288738868967935, + "grad_norm": 0.080078125, + "learning_rate": 0.0026459477108555346, + "loss": 1.2694, + "step": 3454 + }, + { + "epoch": 0.3029750804640539, + "grad_norm": 0.0537109375, + "learning_rate": 0.0026456724428152436, + "loss": 1.1538, + "step": 3455 + }, + { + "epoch": 0.3030627722384285, + "grad_norm": 0.11279296875, + "learning_rate": 0.002645397083972196, + "loss": 1.2451, + "step": 3456 + }, + { + "epoch": 0.303150464012803, + "grad_norm": 0.0576171875, + "learning_rate": 0.002645121634351504, + "loss": 1.1821, + "step": 3457 + }, + { + "epoch": 0.30323815578717755, + "grad_norm": 0.08056640625, + "learning_rate": 0.002644846093978287, + "loss": 1.2361, + "step": 3458 + }, + { + "epoch": 0.30332584756155206, + "grad_norm": 0.07763671875, + "learning_rate": 0.0026445704628776756, + "loss": 1.2057, + "step": 3459 + }, + { + "epoch": 0.30341353933592663, + "grad_norm": 0.0693359375, + "learning_rate": 0.002644294741074805, + "loss": 1.2027, + "step": 3460 + }, + { + "epoch": 0.3035012311103012, + "grad_norm": 0.10693359375, + "learning_rate": 0.002644018928594821, + "loss": 1.1607, + "step": 3461 + }, + { + "epoch": 0.3035889228846757, + "grad_norm": 0.05419921875, + "learning_rate": 0.002643743025462876, + "loss": 1.1377, + "step": 3462 + }, + { + "epoch": 0.30367661465905027, + "grad_norm": 0.0634765625, + "learning_rate": 0.002643467031704133, + "loss": 1.1791, + "step": 3463 + }, + { + "epoch": 0.30376430643342484, + "grad_norm": 0.054931640625, + "learning_rate": 0.0026431909473437613, + "loss": 1.211, + "step": 3464 + }, + { + "epoch": 0.30385199820779935, + "grad_norm": 0.057861328125, + "learning_rate": 0.0026429147724069393, + "loss": 1.2137, + "step": 3465 + }, + { + "epoch": 0.3039396899821739, + "grad_norm": 0.05126953125, + "learning_rate": 0.0026426385069188525, + "loss": 1.2094, + "step": 3466 + }, + { + "epoch": 0.3040273817565485, + "grad_norm": 0.060546875, + "learning_rate": 0.0026423621509046973, + "loss": 1.2065, + "step": 3467 + }, + { + "epoch": 0.304115073530923, + "grad_norm": 0.0634765625, + "learning_rate": 0.0026420857043896752, + "loss": 1.149, + "step": 3468 + }, + { + "epoch": 0.30420276530529755, + "grad_norm": 0.058349609375, + "learning_rate": 0.002641809167398998, + "loss": 1.2014, + "step": 3469 + }, + { + "epoch": 0.30429045707967206, + "grad_norm": 0.06884765625, + "learning_rate": 0.002641532539957885, + "loss": 1.1623, + "step": 3470 + }, + { + "epoch": 0.3043781488540466, + "grad_norm": 0.056396484375, + "learning_rate": 0.0026412558220915644, + "loss": 1.186, + "step": 3471 + }, + { + "epoch": 0.3044658406284212, + "grad_norm": 0.08935546875, + "learning_rate": 0.0026409790138252716, + "loss": 1.2664, + "step": 3472 + }, + { + "epoch": 0.3045535324027957, + "grad_norm": 0.072265625, + "learning_rate": 0.0026407021151842506, + "loss": 1.2906, + "step": 3473 + }, + { + "epoch": 0.30464122417717027, + "grad_norm": 0.07275390625, + "learning_rate": 0.002640425126193754, + "loss": 1.2422, + "step": 3474 + }, + { + "epoch": 0.30472891595154483, + "grad_norm": 0.10791015625, + "learning_rate": 0.002640148046879043, + "loss": 1.2103, + "step": 3475 + }, + { + "epoch": 0.30481660772591934, + "grad_norm": 0.056640625, + "learning_rate": 0.002639870877265386, + "loss": 1.2288, + "step": 3476 + }, + { + "epoch": 0.3049042995002939, + "grad_norm": 0.08642578125, + "learning_rate": 0.0026395936173780605, + "loss": 1.1978, + "step": 3477 + }, + { + "epoch": 0.3049919912746685, + "grad_norm": 0.064453125, + "learning_rate": 0.0026393162672423516, + "loss": 1.2104, + "step": 3478 + }, + { + "epoch": 0.305079683049043, + "grad_norm": 0.080078125, + "learning_rate": 0.002639038826883553, + "loss": 1.1958, + "step": 3479 + }, + { + "epoch": 0.30516737482341755, + "grad_norm": 0.055419921875, + "learning_rate": 0.002638761296326967, + "loss": 1.2448, + "step": 3480 + }, + { + "epoch": 0.3052550665977921, + "grad_norm": 0.12451171875, + "learning_rate": 0.0026384836755979036, + "loss": 1.226, + "step": 3481 + }, + { + "epoch": 0.3053427583721666, + "grad_norm": 0.06201171875, + "learning_rate": 0.00263820596472168, + "loss": 1.2219, + "step": 3482 + }, + { + "epoch": 0.3054304501465412, + "grad_norm": 0.09619140625, + "learning_rate": 0.002637928163723623, + "loss": 1.1962, + "step": 3483 + }, + { + "epoch": 0.3055181419209157, + "grad_norm": 0.08447265625, + "learning_rate": 0.0026376502726290683, + "loss": 1.2776, + "step": 3484 + }, + { + "epoch": 0.30560583369529026, + "grad_norm": 0.08984375, + "learning_rate": 0.0026373722914633585, + "loss": 1.2405, + "step": 3485 + }, + { + "epoch": 0.30569352546966483, + "grad_norm": 0.07373046875, + "learning_rate": 0.0026370942202518445, + "loss": 1.1839, + "step": 3486 + }, + { + "epoch": 0.30578121724403934, + "grad_norm": 0.06982421875, + "learning_rate": 0.002636816059019886, + "loss": 1.2358, + "step": 3487 + }, + { + "epoch": 0.3058689090184139, + "grad_norm": 0.0712890625, + "learning_rate": 0.00263653780779285, + "loss": 1.213, + "step": 3488 + }, + { + "epoch": 0.30595660079278847, + "grad_norm": 0.0712890625, + "learning_rate": 0.002636259466596113, + "loss": 1.2303, + "step": 3489 + }, + { + "epoch": 0.306044292567163, + "grad_norm": 0.060791015625, + "learning_rate": 0.0026359810354550585, + "loss": 1.1842, + "step": 3490 + }, + { + "epoch": 0.30613198434153754, + "grad_norm": 0.076171875, + "learning_rate": 0.002635702514395079, + "loss": 1.1999, + "step": 3491 + }, + { + "epoch": 0.3062196761159121, + "grad_norm": 0.09765625, + "learning_rate": 0.002635423903441574, + "loss": 1.2215, + "step": 3492 + }, + { + "epoch": 0.3063073678902866, + "grad_norm": 0.05810546875, + "learning_rate": 0.002635145202619954, + "loss": 1.1474, + "step": 3493 + }, + { + "epoch": 0.3063950596646612, + "grad_norm": 0.0986328125, + "learning_rate": 0.0026348664119556337, + "loss": 1.1982, + "step": 3494 + }, + { + "epoch": 0.3064827514390357, + "grad_norm": 0.057373046875, + "learning_rate": 0.00263458753147404, + "loss": 1.2316, + "step": 3495 + }, + { + "epoch": 0.30657044321341026, + "grad_norm": 0.1220703125, + "learning_rate": 0.0026343085612006044, + "loss": 1.2483, + "step": 3496 + }, + { + "epoch": 0.3066581349877848, + "grad_norm": 0.08154296875, + "learning_rate": 0.0026340295011607695, + "loss": 1.2385, + "step": 3497 + }, + { + "epoch": 0.30674582676215933, + "grad_norm": 0.1376953125, + "learning_rate": 0.0026337503513799844, + "loss": 1.2339, + "step": 3498 + }, + { + "epoch": 0.3068335185365339, + "grad_norm": 0.10546875, + "learning_rate": 0.0026334711118837055, + "loss": 1.2372, + "step": 3499 + }, + { + "epoch": 0.30692121031090847, + "grad_norm": 0.11083984375, + "learning_rate": 0.0026331917826974013, + "loss": 1.1969, + "step": 3500 + }, + { + "epoch": 0.30692121031090847, + "eval_loss": 1.2185108661651611, + "eval_runtime": 429.5269, + "eval_samples_per_second": 33.635, + "eval_steps_per_second": 8.409, + "step": 3500 + }, + { + "epoch": 0.307008902085283, + "grad_norm": 0.142578125, + "learning_rate": 0.0026329123638465443, + "loss": 1.2152, + "step": 3501 + }, + { + "epoch": 0.30709659385965754, + "grad_norm": 0.126953125, + "learning_rate": 0.0026326328553566173, + "loss": 1.2259, + "step": 3502 + }, + { + "epoch": 0.3071842856340321, + "grad_norm": 0.115234375, + "learning_rate": 0.0026323532572531103, + "loss": 1.2248, + "step": 3503 + }, + { + "epoch": 0.3072719774084066, + "grad_norm": 0.0634765625, + "learning_rate": 0.0026320735695615225, + "loss": 1.2517, + "step": 3504 + }, + { + "epoch": 0.3073596691827812, + "grad_norm": 0.10888671875, + "learning_rate": 0.00263179379230736, + "loss": 1.1755, + "step": 3505 + }, + { + "epoch": 0.3074473609571557, + "grad_norm": 0.06787109375, + "learning_rate": 0.002631513925516138, + "loss": 1.1728, + "step": 3506 + }, + { + "epoch": 0.30753505273153026, + "grad_norm": 0.07373046875, + "learning_rate": 0.00263123396921338, + "loss": 1.1534, + "step": 3507 + }, + { + "epoch": 0.3076227445059048, + "grad_norm": 0.11376953125, + "learning_rate": 0.002630953923424617, + "loss": 1.2124, + "step": 3508 + }, + { + "epoch": 0.30771043628027933, + "grad_norm": 0.09716796875, + "learning_rate": 0.0026306737881753887, + "loss": 1.2559, + "step": 3509 + }, + { + "epoch": 0.3077981280546539, + "grad_norm": 0.061767578125, + "learning_rate": 0.0026303935634912423, + "loss": 1.1662, + "step": 3510 + }, + { + "epoch": 0.30788581982902846, + "grad_norm": 0.0576171875, + "learning_rate": 0.0026301132493977333, + "loss": 1.2256, + "step": 3511 + }, + { + "epoch": 0.30797351160340297, + "grad_norm": 0.062255859375, + "learning_rate": 0.0026298328459204265, + "loss": 1.1922, + "step": 3512 + }, + { + "epoch": 0.30806120337777754, + "grad_norm": 0.064453125, + "learning_rate": 0.002629552353084894, + "loss": 1.2255, + "step": 3513 + }, + { + "epoch": 0.3081488951521521, + "grad_norm": 0.07470703125, + "learning_rate": 0.0026292717709167153, + "loss": 1.2495, + "step": 3514 + }, + { + "epoch": 0.3082365869265266, + "grad_norm": 0.06884765625, + "learning_rate": 0.002628991099441479, + "loss": 1.2112, + "step": 3515 + }, + { + "epoch": 0.3083242787009012, + "grad_norm": 0.064453125, + "learning_rate": 0.002628710338684782, + "loss": 1.1647, + "step": 3516 + }, + { + "epoch": 0.30841197047527574, + "grad_norm": 0.07080078125, + "learning_rate": 0.002628429488672228, + "loss": 1.2251, + "step": 3517 + }, + { + "epoch": 0.30849966224965025, + "grad_norm": 0.055419921875, + "learning_rate": 0.0026281485494294306, + "loss": 1.1533, + "step": 3518 + }, + { + "epoch": 0.3085873540240248, + "grad_norm": 0.0732421875, + "learning_rate": 0.002627867520982011, + "loss": 1.206, + "step": 3519 + }, + { + "epoch": 0.30867504579839933, + "grad_norm": 0.0556640625, + "learning_rate": 0.0026275864033555975, + "loss": 1.1906, + "step": 3520 + }, + { + "epoch": 0.3087627375727739, + "grad_norm": 0.0634765625, + "learning_rate": 0.0026273051965758284, + "loss": 1.1816, + "step": 3521 + }, + { + "epoch": 0.30885042934714846, + "grad_norm": 0.0888671875, + "learning_rate": 0.0026270239006683477, + "loss": 1.1512, + "step": 3522 + }, + { + "epoch": 0.30893812112152297, + "grad_norm": 0.0634765625, + "learning_rate": 0.002626742515658809, + "loss": 1.1593, + "step": 3523 + }, + { + "epoch": 0.30902581289589753, + "grad_norm": 0.099609375, + "learning_rate": 0.0026264610415728746, + "loss": 1.2025, + "step": 3524 + }, + { + "epoch": 0.3091135046702721, + "grad_norm": 0.061767578125, + "learning_rate": 0.0026261794784362144, + "loss": 1.2547, + "step": 3525 + }, + { + "epoch": 0.3092011964446466, + "grad_norm": 0.0986328125, + "learning_rate": 0.002625897826274505, + "loss": 1.1875, + "step": 3526 + }, + { + "epoch": 0.3092888882190212, + "grad_norm": 0.0673828125, + "learning_rate": 0.0026256160851134335, + "loss": 1.2426, + "step": 3527 + }, + { + "epoch": 0.30937657999339574, + "grad_norm": 0.125, + "learning_rate": 0.0026253342549786935, + "loss": 1.2302, + "step": 3528 + }, + { + "epoch": 0.30946427176777025, + "grad_norm": 0.053466796875, + "learning_rate": 0.002625052335895988, + "loss": 1.1756, + "step": 3529 + }, + { + "epoch": 0.3095519635421448, + "grad_norm": 0.09521484375, + "learning_rate": 0.0026247703278910253, + "loss": 1.256, + "step": 3530 + }, + { + "epoch": 0.3096396553165193, + "grad_norm": 0.0830078125, + "learning_rate": 0.002624488230989526, + "loss": 1.2101, + "step": 3531 + }, + { + "epoch": 0.3097273470908939, + "grad_norm": 0.0634765625, + "learning_rate": 0.0026242060452172154, + "loss": 1.1964, + "step": 3532 + }, + { + "epoch": 0.30981503886526846, + "grad_norm": 0.12451171875, + "learning_rate": 0.002623923770599828, + "loss": 1.1782, + "step": 3533 + }, + { + "epoch": 0.30990273063964296, + "grad_norm": 0.12353515625, + "learning_rate": 0.0026236414071631074, + "loss": 1.264, + "step": 3534 + }, + { + "epoch": 0.30999042241401753, + "grad_norm": 0.052734375, + "learning_rate": 0.0026233589549328035, + "loss": 1.1518, + "step": 3535 + }, + { + "epoch": 0.3100781141883921, + "grad_norm": 0.1396484375, + "learning_rate": 0.0026230764139346754, + "loss": 1.1993, + "step": 3536 + }, + { + "epoch": 0.3101658059627666, + "grad_norm": 0.05859375, + "learning_rate": 0.002622793784194491, + "loss": 1.2011, + "step": 3537 + }, + { + "epoch": 0.31025349773714117, + "grad_norm": 0.062255859375, + "learning_rate": 0.002622511065738024, + "loss": 1.2317, + "step": 3538 + }, + { + "epoch": 0.31034118951151574, + "grad_norm": 0.06982421875, + "learning_rate": 0.002622228258591059, + "loss": 1.2117, + "step": 3539 + }, + { + "epoch": 0.31042888128589025, + "grad_norm": 0.06396484375, + "learning_rate": 0.0026219453627793865, + "loss": 1.2107, + "step": 3540 + }, + { + "epoch": 0.3105165730602648, + "grad_norm": 0.08349609375, + "learning_rate": 0.002621662378328806, + "loss": 1.2337, + "step": 3541 + }, + { + "epoch": 0.3106042648346393, + "grad_norm": 0.08154296875, + "learning_rate": 0.002621379305265125, + "loss": 1.2475, + "step": 3542 + }, + { + "epoch": 0.3106919566090139, + "grad_norm": 0.09423828125, + "learning_rate": 0.0026210961436141586, + "loss": 1.2156, + "step": 3543 + }, + { + "epoch": 0.31077964838338845, + "grad_norm": 0.055419921875, + "learning_rate": 0.002620812893401731, + "loss": 1.1861, + "step": 3544 + }, + { + "epoch": 0.31086734015776296, + "grad_norm": 0.06494140625, + "learning_rate": 0.002620529554653674, + "loss": 1.1988, + "step": 3545 + }, + { + "epoch": 0.3109550319321375, + "grad_norm": 0.0791015625, + "learning_rate": 0.002620246127395826, + "loss": 1.2996, + "step": 3546 + }, + { + "epoch": 0.3110427237065121, + "grad_norm": 0.09619140625, + "learning_rate": 0.002619962611654037, + "loss": 1.1323, + "step": 3547 + }, + { + "epoch": 0.3111304154808866, + "grad_norm": 0.08056640625, + "learning_rate": 0.002619679007454161, + "loss": 1.1999, + "step": 3548 + }, + { + "epoch": 0.31121810725526117, + "grad_norm": 0.0546875, + "learning_rate": 0.002619395314822063, + "loss": 1.1919, + "step": 3549 + }, + { + "epoch": 0.31130579902963573, + "grad_norm": 0.0634765625, + "learning_rate": 0.002619111533783615, + "loss": 1.165, + "step": 3550 + }, + { + "epoch": 0.31139349080401024, + "grad_norm": 0.056640625, + "learning_rate": 0.0026188276643646963, + "loss": 1.1591, + "step": 3551 + }, + { + "epoch": 0.3114811825783848, + "grad_norm": 0.0712890625, + "learning_rate": 0.0026185437065911962, + "loss": 1.2056, + "step": 3552 + }, + { + "epoch": 0.3115688743527593, + "grad_norm": 0.08056640625, + "learning_rate": 0.0026182596604890102, + "loss": 1.1502, + "step": 3553 + }, + { + "epoch": 0.3116565661271339, + "grad_norm": 0.0986328125, + "learning_rate": 0.0026179755260840425, + "loss": 1.2779, + "step": 3554 + }, + { + "epoch": 0.31174425790150845, + "grad_norm": 0.068359375, + "learning_rate": 0.002617691303402206, + "loss": 1.226, + "step": 3555 + }, + { + "epoch": 0.31183194967588296, + "grad_norm": 0.054931640625, + "learning_rate": 0.00261740699246942, + "loss": 1.2453, + "step": 3556 + }, + { + "epoch": 0.3119196414502575, + "grad_norm": 0.058349609375, + "learning_rate": 0.0026171225933116138, + "loss": 1.2301, + "step": 3557 + }, + { + "epoch": 0.3120073332246321, + "grad_norm": 0.05908203125, + "learning_rate": 0.0026168381059547244, + "loss": 1.2133, + "step": 3558 + }, + { + "epoch": 0.3120950249990066, + "grad_norm": 0.076171875, + "learning_rate": 0.002616553530424695, + "loss": 1.1595, + "step": 3559 + }, + { + "epoch": 0.31218271677338116, + "grad_norm": 0.054931640625, + "learning_rate": 0.0026162688667474795, + "loss": 1.2748, + "step": 3560 + }, + { + "epoch": 0.31227040854775573, + "grad_norm": 0.08740234375, + "learning_rate": 0.0026159841149490373, + "loss": 1.1683, + "step": 3561 + }, + { + "epoch": 0.31235810032213024, + "grad_norm": 0.09423828125, + "learning_rate": 0.0026156992750553374, + "loss": 1.1743, + "step": 3562 + }, + { + "epoch": 0.3124457920965048, + "grad_norm": 0.072265625, + "learning_rate": 0.002615414347092356, + "loss": 1.2337, + "step": 3563 + }, + { + "epoch": 0.31253348387087937, + "grad_norm": 0.08447265625, + "learning_rate": 0.002615129331086079, + "loss": 1.2287, + "step": 3564 + }, + { + "epoch": 0.3126211756452539, + "grad_norm": 0.06494140625, + "learning_rate": 0.0026148442270624984, + "loss": 1.2678, + "step": 3565 + }, + { + "epoch": 0.31270886741962844, + "grad_norm": 0.10009765625, + "learning_rate": 0.0026145590350476152, + "loss": 1.1903, + "step": 3566 + }, + { + "epoch": 0.31279655919400295, + "grad_norm": 0.056396484375, + "learning_rate": 0.0026142737550674377, + "loss": 1.1978, + "step": 3567 + }, + { + "epoch": 0.3128842509683775, + "grad_norm": 0.10009765625, + "learning_rate": 0.0026139883871479836, + "loss": 1.2058, + "step": 3568 + }, + { + "epoch": 0.3129719427427521, + "grad_norm": 0.056884765625, + "learning_rate": 0.0026137029313152765, + "loss": 1.2322, + "step": 3569 + }, + { + "epoch": 0.3130596345171266, + "grad_norm": 0.08642578125, + "learning_rate": 0.0026134173875953497, + "loss": 1.1886, + "step": 3570 + }, + { + "epoch": 0.31314732629150116, + "grad_norm": 0.07275390625, + "learning_rate": 0.0026131317560142446, + "loss": 1.1288, + "step": 3571 + }, + { + "epoch": 0.3132350180658757, + "grad_norm": 0.08154296875, + "learning_rate": 0.0026128460365980096, + "loss": 1.2598, + "step": 3572 + }, + { + "epoch": 0.31332270984025024, + "grad_norm": 0.06396484375, + "learning_rate": 0.0026125602293727017, + "loss": 1.1925, + "step": 3573 + }, + { + "epoch": 0.3134104016146248, + "grad_norm": 0.076171875, + "learning_rate": 0.0026122743343643855, + "loss": 1.1877, + "step": 3574 + }, + { + "epoch": 0.31349809338899937, + "grad_norm": 0.10400390625, + "learning_rate": 0.0026119883515991348, + "loss": 1.1627, + "step": 3575 + }, + { + "epoch": 0.3135857851633739, + "grad_norm": 0.1103515625, + "learning_rate": 0.002611702281103029, + "loss": 1.1539, + "step": 3576 + }, + { + "epoch": 0.31367347693774844, + "grad_norm": 0.06396484375, + "learning_rate": 0.0026114161229021577, + "loss": 1.1983, + "step": 3577 + }, + { + "epoch": 0.31376116871212295, + "grad_norm": 0.11328125, + "learning_rate": 0.0026111298770226184, + "loss": 1.2329, + "step": 3578 + }, + { + "epoch": 0.3138488604864975, + "grad_norm": 0.0595703125, + "learning_rate": 0.002610843543490515, + "loss": 1.1944, + "step": 3579 + }, + { + "epoch": 0.3139365522608721, + "grad_norm": 0.07177734375, + "learning_rate": 0.0026105571223319613, + "loss": 1.1938, + "step": 3580 + }, + { + "epoch": 0.3140242440352466, + "grad_norm": 0.0703125, + "learning_rate": 0.0026102706135730774, + "loss": 1.2268, + "step": 3581 + }, + { + "epoch": 0.31411193580962116, + "grad_norm": 0.08740234375, + "learning_rate": 0.0026099840172399925, + "loss": 1.1975, + "step": 3582 + }, + { + "epoch": 0.3141996275839957, + "grad_norm": 0.07958984375, + "learning_rate": 0.0026096973333588434, + "loss": 1.2414, + "step": 3583 + }, + { + "epoch": 0.31428731935837023, + "grad_norm": 0.08544921875, + "learning_rate": 0.0026094105619557753, + "loss": 1.128, + "step": 3584 + }, + { + "epoch": 0.3143750111327448, + "grad_norm": 0.076171875, + "learning_rate": 0.0026091237030569404, + "loss": 1.1955, + "step": 3585 + }, + { + "epoch": 0.31446270290711936, + "grad_norm": 0.059814453125, + "learning_rate": 0.0026088367566885, + "loss": 1.2278, + "step": 3586 + }, + { + "epoch": 0.3145503946814939, + "grad_norm": 0.06787109375, + "learning_rate": 0.0026085497228766218, + "loss": 1.2631, + "step": 3587 + }, + { + "epoch": 0.31463808645586844, + "grad_norm": 0.07763671875, + "learning_rate": 0.002608262601647484, + "loss": 1.1882, + "step": 3588 + }, + { + "epoch": 0.31472577823024295, + "grad_norm": 0.1328125, + "learning_rate": 0.0026079753930272707, + "loss": 1.2073, + "step": 3589 + }, + { + "epoch": 0.3148134700046175, + "grad_norm": 0.10498046875, + "learning_rate": 0.002607688097042174, + "loss": 1.1955, + "step": 3590 + }, + { + "epoch": 0.3149011617789921, + "grad_norm": 0.1806640625, + "learning_rate": 0.002607400713718396, + "loss": 1.2355, + "step": 3591 + }, + { + "epoch": 0.3149888535533666, + "grad_norm": 0.11572265625, + "learning_rate": 0.002607113243082144, + "loss": 1.25, + "step": 3592 + }, + { + "epoch": 0.31507654532774115, + "grad_norm": 0.1279296875, + "learning_rate": 0.0026068256851596344, + "loss": 1.1725, + "step": 3593 + }, + { + "epoch": 0.3151642371021157, + "grad_norm": 0.1259765625, + "learning_rate": 0.0026065380399770935, + "loss": 1.2078, + "step": 3594 + }, + { + "epoch": 0.31525192887649023, + "grad_norm": 0.10986328125, + "learning_rate": 0.0026062503075607517, + "loss": 1.2682, + "step": 3595 + }, + { + "epoch": 0.3153396206508648, + "grad_norm": 0.12158203125, + "learning_rate": 0.002605962487936851, + "loss": 1.1699, + "step": 3596 + }, + { + "epoch": 0.31542731242523936, + "grad_norm": 0.1298828125, + "learning_rate": 0.0026056745811316386, + "loss": 1.1881, + "step": 3597 + }, + { + "epoch": 0.31551500419961387, + "grad_norm": 0.08642578125, + "learning_rate": 0.002605386587171372, + "loss": 1.19, + "step": 3598 + }, + { + "epoch": 0.31560269597398843, + "grad_norm": 0.078125, + "learning_rate": 0.0026050985060823146, + "loss": 1.2089, + "step": 3599 + }, + { + "epoch": 0.315690387748363, + "grad_norm": 0.09716796875, + "learning_rate": 0.002604810337890739, + "loss": 1.1829, + "step": 3600 + }, + { + "epoch": 0.3157780795227375, + "grad_norm": 0.06494140625, + "learning_rate": 0.002604522082622926, + "loss": 1.2315, + "step": 3601 + }, + { + "epoch": 0.3158657712971121, + "grad_norm": 0.06884765625, + "learning_rate": 0.0026042337403051627, + "loss": 1.2491, + "step": 3602 + }, + { + "epoch": 0.3159534630714866, + "grad_norm": 0.0615234375, + "learning_rate": 0.002603945310963746, + "loss": 1.2127, + "step": 3603 + }, + { + "epoch": 0.31604115484586115, + "grad_norm": 0.0751953125, + "learning_rate": 0.002603656794624979, + "loss": 1.1826, + "step": 3604 + }, + { + "epoch": 0.3161288466202357, + "grad_norm": 0.08154296875, + "learning_rate": 0.0026033681913151746, + "loss": 1.2227, + "step": 3605 + }, + { + "epoch": 0.3162165383946102, + "grad_norm": 0.07177734375, + "learning_rate": 0.002603079501060652, + "loss": 1.2382, + "step": 3606 + }, + { + "epoch": 0.3163042301689848, + "grad_norm": 0.06396484375, + "learning_rate": 0.0026027907238877395, + "loss": 1.1835, + "step": 3607 + }, + { + "epoch": 0.31639192194335936, + "grad_norm": 0.06298828125, + "learning_rate": 0.002602501859822772, + "loss": 1.2064, + "step": 3608 + }, + { + "epoch": 0.31647961371773387, + "grad_norm": 0.08203125, + "learning_rate": 0.0026022129088920944, + "loss": 1.1548, + "step": 3609 + }, + { + "epoch": 0.31656730549210843, + "grad_norm": 0.08251953125, + "learning_rate": 0.002601923871122057, + "loss": 1.1669, + "step": 3610 + }, + { + "epoch": 0.316654997266483, + "grad_norm": 0.08447265625, + "learning_rate": 0.0026016347465390205, + "loss": 1.2062, + "step": 3611 + }, + { + "epoch": 0.3167426890408575, + "grad_norm": 0.13671875, + "learning_rate": 0.0026013455351693517, + "loss": 1.2366, + "step": 3612 + }, + { + "epoch": 0.31683038081523207, + "grad_norm": 0.08984375, + "learning_rate": 0.002601056237039426, + "loss": 1.1988, + "step": 3613 + }, + { + "epoch": 0.3169180725896066, + "grad_norm": 0.1396484375, + "learning_rate": 0.0026007668521756265, + "loss": 1.2078, + "step": 3614 + }, + { + "epoch": 0.31700576436398115, + "grad_norm": 0.06787109375, + "learning_rate": 0.0026004773806043444, + "loss": 1.265, + "step": 3615 + }, + { + "epoch": 0.3170934561383557, + "grad_norm": 0.1376953125, + "learning_rate": 0.0026001878223519794, + "loss": 1.2037, + "step": 3616 + }, + { + "epoch": 0.3171811479127302, + "grad_norm": 0.11376953125, + "learning_rate": 0.0025998981774449374, + "loss": 1.2352, + "step": 3617 + }, + { + "epoch": 0.3172688396871048, + "grad_norm": 0.08349609375, + "learning_rate": 0.0025996084459096336, + "loss": 1.1897, + "step": 3618 + }, + { + "epoch": 0.31735653146147935, + "grad_norm": 0.0673828125, + "learning_rate": 0.002599318627772491, + "loss": 1.1959, + "step": 3619 + }, + { + "epoch": 0.31744422323585386, + "grad_norm": 0.06396484375, + "learning_rate": 0.0025990287230599402, + "loss": 1.2534, + "step": 3620 + }, + { + "epoch": 0.3175319150102284, + "grad_norm": 0.059814453125, + "learning_rate": 0.0025987387317984198, + "loss": 1.1965, + "step": 3621 + }, + { + "epoch": 0.317619606784603, + "grad_norm": 0.0556640625, + "learning_rate": 0.0025984486540143767, + "loss": 1.1945, + "step": 3622 + }, + { + "epoch": 0.3177072985589775, + "grad_norm": 0.056396484375, + "learning_rate": 0.002598158489734264, + "loss": 1.2277, + "step": 3623 + }, + { + "epoch": 0.31779499033335207, + "grad_norm": 0.05908203125, + "learning_rate": 0.0025978682389845454, + "loss": 1.2335, + "step": 3624 + }, + { + "epoch": 0.3178826821077266, + "grad_norm": 0.0546875, + "learning_rate": 0.0025975779017916897, + "loss": 1.188, + "step": 3625 + }, + { + "epoch": 0.31797037388210114, + "grad_norm": 0.06884765625, + "learning_rate": 0.002597287478182176, + "loss": 1.1692, + "step": 3626 + }, + { + "epoch": 0.3180580656564757, + "grad_norm": 0.072265625, + "learning_rate": 0.002596996968182489, + "loss": 1.2101, + "step": 3627 + }, + { + "epoch": 0.3181457574308502, + "grad_norm": 0.054931640625, + "learning_rate": 0.0025967063718191244, + "loss": 1.1681, + "step": 3628 + }, + { + "epoch": 0.3182334492052248, + "grad_norm": 0.0771484375, + "learning_rate": 0.0025964156891185816, + "loss": 1.1548, + "step": 3629 + }, + { + "epoch": 0.31832114097959935, + "grad_norm": 0.08154296875, + "learning_rate": 0.002596124920107371, + "loss": 1.2024, + "step": 3630 + }, + { + "epoch": 0.31840883275397386, + "grad_norm": 0.09912109375, + "learning_rate": 0.0025958340648120103, + "loss": 1.2057, + "step": 3631 + }, + { + "epoch": 0.3184965245283484, + "grad_norm": 0.12060546875, + "learning_rate": 0.002595543123259025, + "loss": 1.2492, + "step": 3632 + }, + { + "epoch": 0.318584216302723, + "grad_norm": 0.1337890625, + "learning_rate": 0.0025952520954749475, + "loss": 1.1861, + "step": 3633 + }, + { + "epoch": 0.3186719080770975, + "grad_norm": 0.12255859375, + "learning_rate": 0.002594960981486319, + "loss": 1.2378, + "step": 3634 + }, + { + "epoch": 0.31875959985147206, + "grad_norm": 0.1728515625, + "learning_rate": 0.002594669781319688, + "loss": 1.2508, + "step": 3635 + }, + { + "epoch": 0.31884729162584663, + "grad_norm": 0.08251953125, + "learning_rate": 0.0025943784950016122, + "loss": 1.2439, + "step": 3636 + }, + { + "epoch": 0.31893498340022114, + "grad_norm": 0.0712890625, + "learning_rate": 0.0025940871225586556, + "loss": 1.1863, + "step": 3637 + }, + { + "epoch": 0.3190226751745957, + "grad_norm": 0.12353515625, + "learning_rate": 0.00259379566401739, + "loss": 1.2902, + "step": 3638 + }, + { + "epoch": 0.3191103669489702, + "grad_norm": 0.06787109375, + "learning_rate": 0.002593504119404397, + "loss": 1.1508, + "step": 3639 + }, + { + "epoch": 0.3191980587233448, + "grad_norm": 0.08154296875, + "learning_rate": 0.002593212488746263, + "loss": 1.2137, + "step": 3640 + }, + { + "epoch": 0.31928575049771935, + "grad_norm": 0.09423828125, + "learning_rate": 0.002592920772069586, + "loss": 1.2045, + "step": 3641 + }, + { + "epoch": 0.31937344227209385, + "grad_norm": 0.07373046875, + "learning_rate": 0.0025926289694009675, + "loss": 1.1515, + "step": 3642 + }, + { + "epoch": 0.3194611340464684, + "grad_norm": 0.10986328125, + "learning_rate": 0.0025923370807670217, + "loss": 1.2219, + "step": 3643 + }, + { + "epoch": 0.319548825820843, + "grad_norm": 0.07373046875, + "learning_rate": 0.002592045106194366, + "loss": 1.1632, + "step": 3644 + }, + { + "epoch": 0.3196365175952175, + "grad_norm": 0.10595703125, + "learning_rate": 0.0025917530457096284, + "loss": 1.224, + "step": 3645 + }, + { + "epoch": 0.31972420936959206, + "grad_norm": 0.053955078125, + "learning_rate": 0.0025914608993394444, + "loss": 1.1729, + "step": 3646 + }, + { + "epoch": 0.3198119011439666, + "grad_norm": 0.078125, + "learning_rate": 0.0025911686671104563, + "loss": 1.1572, + "step": 3647 + }, + { + "epoch": 0.31989959291834114, + "grad_norm": 0.10400390625, + "learning_rate": 0.002590876349049316, + "loss": 1.2318, + "step": 3648 + }, + { + "epoch": 0.3199872846927157, + "grad_norm": 0.0546875, + "learning_rate": 0.0025905839451826817, + "loss": 1.2542, + "step": 3649 + }, + { + "epoch": 0.3200749764670902, + "grad_norm": 0.126953125, + "learning_rate": 0.0025902914555372193, + "loss": 1.2318, + "step": 3650 + }, + { + "epoch": 0.3201626682414648, + "grad_norm": 0.053955078125, + "learning_rate": 0.002589998880139603, + "loss": 1.201, + "step": 3651 + }, + { + "epoch": 0.32025036001583934, + "grad_norm": 0.0693359375, + "learning_rate": 0.002589706219016516, + "loss": 1.322, + "step": 3652 + }, + { + "epoch": 0.32033805179021385, + "grad_norm": 0.09716796875, + "learning_rate": 0.002589413472194648, + "loss": 1.1562, + "step": 3653 + }, + { + "epoch": 0.3204257435645884, + "grad_norm": 0.08056640625, + "learning_rate": 0.002589120639700696, + "loss": 1.1722, + "step": 3654 + }, + { + "epoch": 0.320513435338963, + "grad_norm": 0.07421875, + "learning_rate": 0.002588827721561366, + "loss": 1.1912, + "step": 3655 + }, + { + "epoch": 0.3206011271133375, + "grad_norm": 0.05517578125, + "learning_rate": 0.0025885347178033714, + "loss": 1.2354, + "step": 3656 + }, + { + "epoch": 0.32068881888771206, + "grad_norm": 0.0751953125, + "learning_rate": 0.002588241628453433, + "loss": 1.2044, + "step": 3657 + }, + { + "epoch": 0.3207765106620866, + "grad_norm": 0.059814453125, + "learning_rate": 0.0025879484535382804, + "loss": 1.2094, + "step": 3658 + }, + { + "epoch": 0.32086420243646113, + "grad_norm": 0.0986328125, + "learning_rate": 0.0025876551930846506, + "loss": 1.3157, + "step": 3659 + }, + { + "epoch": 0.3209518942108357, + "grad_norm": 0.0810546875, + "learning_rate": 0.0025873618471192873, + "loss": 1.2161, + "step": 3660 + }, + { + "epoch": 0.3210395859852102, + "grad_norm": 0.0703125, + "learning_rate": 0.002587068415668943, + "loss": 1.226, + "step": 3661 + }, + { + "epoch": 0.3211272777595848, + "grad_norm": 0.0595703125, + "learning_rate": 0.0025867748987603786, + "loss": 1.1852, + "step": 3662 + }, + { + "epoch": 0.32121496953395934, + "grad_norm": 0.07666015625, + "learning_rate": 0.0025864812964203616, + "loss": 1.171, + "step": 3663 + }, + { + "epoch": 0.32130266130833385, + "grad_norm": 0.0546875, + "learning_rate": 0.0025861876086756682, + "loss": 1.2078, + "step": 3664 + }, + { + "epoch": 0.3213903530827084, + "grad_norm": 0.08984375, + "learning_rate": 0.0025858938355530817, + "loss": 1.2128, + "step": 3665 + }, + { + "epoch": 0.321478044857083, + "grad_norm": 0.07470703125, + "learning_rate": 0.002585599977079393, + "loss": 1.182, + "step": 3666 + }, + { + "epoch": 0.3215657366314575, + "grad_norm": 0.07568359375, + "learning_rate": 0.002585306033281402, + "loss": 1.1406, + "step": 3667 + }, + { + "epoch": 0.32165342840583205, + "grad_norm": 0.0615234375, + "learning_rate": 0.002585012004185915, + "loss": 1.1854, + "step": 3668 + }, + { + "epoch": 0.3217411201802066, + "grad_norm": 0.1064453125, + "learning_rate": 0.002584717889819747, + "loss": 1.2627, + "step": 3669 + }, + { + "epoch": 0.32182881195458113, + "grad_norm": 0.054443359375, + "learning_rate": 0.0025844236902097203, + "loss": 1.3122, + "step": 3670 + }, + { + "epoch": 0.3219165037289557, + "grad_norm": 0.10205078125, + "learning_rate": 0.002584129405382665, + "loss": 1.1991, + "step": 3671 + }, + { + "epoch": 0.32200419550333026, + "grad_norm": 0.05615234375, + "learning_rate": 0.0025838350353654194, + "loss": 1.2346, + "step": 3672 + }, + { + "epoch": 0.32209188727770477, + "grad_norm": 0.058349609375, + "learning_rate": 0.00258354058018483, + "loss": 1.1088, + "step": 3673 + }, + { + "epoch": 0.32217957905207933, + "grad_norm": 0.10791015625, + "learning_rate": 0.0025832460398677493, + "loss": 1.1771, + "step": 3674 + }, + { + "epoch": 0.32226727082645384, + "grad_norm": 0.058349609375, + "learning_rate": 0.0025829514144410383, + "loss": 1.1773, + "step": 3675 + }, + { + "epoch": 0.3223549626008284, + "grad_norm": 0.10986328125, + "learning_rate": 0.002582656703931567, + "loss": 1.181, + "step": 3676 + }, + { + "epoch": 0.322442654375203, + "grad_norm": 0.06103515625, + "learning_rate": 0.002582361908366212, + "loss": 1.2021, + "step": 3677 + }, + { + "epoch": 0.3225303461495775, + "grad_norm": 0.05517578125, + "learning_rate": 0.002582067027771858, + "loss": 1.2342, + "step": 3678 + }, + { + "epoch": 0.32261803792395205, + "grad_norm": 0.056396484375, + "learning_rate": 0.002581772062175397, + "loss": 1.176, + "step": 3679 + }, + { + "epoch": 0.3227057296983266, + "grad_norm": 0.07080078125, + "learning_rate": 0.00258147701160373, + "loss": 1.2086, + "step": 3680 + }, + { + "epoch": 0.3227934214727011, + "grad_norm": 0.068359375, + "learning_rate": 0.0025811818760837634, + "loss": 1.2242, + "step": 3681 + }, + { + "epoch": 0.3228811132470757, + "grad_norm": 0.0556640625, + "learning_rate": 0.0025808866556424137, + "loss": 1.2311, + "step": 3682 + }, + { + "epoch": 0.32296880502145026, + "grad_norm": 0.06689453125, + "learning_rate": 0.002580591350306604, + "loss": 1.2864, + "step": 3683 + }, + { + "epoch": 0.32305649679582477, + "grad_norm": 0.09375, + "learning_rate": 0.0025802959601032654, + "loss": 1.2212, + "step": 3684 + }, + { + "epoch": 0.32314418857019933, + "grad_norm": 0.07373046875, + "learning_rate": 0.002580000485059337, + "loss": 1.1889, + "step": 3685 + }, + { + "epoch": 0.32323188034457384, + "grad_norm": 0.1064453125, + "learning_rate": 0.002579704925201765, + "loss": 1.2047, + "step": 3686 + }, + { + "epoch": 0.3233195721189484, + "grad_norm": 0.0927734375, + "learning_rate": 0.0025794092805575043, + "loss": 1.2856, + "step": 3687 + }, + { + "epoch": 0.32340726389332297, + "grad_norm": 0.056640625, + "learning_rate": 0.0025791135511535153, + "loss": 1.1939, + "step": 3688 + }, + { + "epoch": 0.3234949556676975, + "grad_norm": 0.17578125, + "learning_rate": 0.0025788177370167703, + "loss": 1.1674, + "step": 3689 + }, + { + "epoch": 0.32358264744207205, + "grad_norm": 0.064453125, + "learning_rate": 0.0025785218381742447, + "loss": 1.1952, + "step": 3690 + }, + { + "epoch": 0.3236703392164466, + "grad_norm": 0.154296875, + "learning_rate": 0.0025782258546529243, + "loss": 1.1901, + "step": 3691 + }, + { + "epoch": 0.3237580309908211, + "grad_norm": 0.08251953125, + "learning_rate": 0.002577929786479803, + "loss": 1.205, + "step": 3692 + }, + { + "epoch": 0.3238457227651957, + "grad_norm": 0.1533203125, + "learning_rate": 0.0025776336336818797, + "loss": 1.2244, + "step": 3693 + }, + { + "epoch": 0.32393341453957025, + "grad_norm": 0.10009765625, + "learning_rate": 0.002577337396286164, + "loss": 1.2313, + "step": 3694 + }, + { + "epoch": 0.32402110631394476, + "grad_norm": 0.15625, + "learning_rate": 0.0025770410743196715, + "loss": 1.1898, + "step": 3695 + }, + { + "epoch": 0.3241087980883193, + "grad_norm": 0.053955078125, + "learning_rate": 0.0025767446678094264, + "loss": 1.196, + "step": 3696 + }, + { + "epoch": 0.32419648986269384, + "grad_norm": 0.09033203125, + "learning_rate": 0.0025764481767824596, + "loss": 1.186, + "step": 3697 + }, + { + "epoch": 0.3242841816370684, + "grad_norm": 0.07470703125, + "learning_rate": 0.002576151601265811, + "loss": 1.2333, + "step": 3698 + }, + { + "epoch": 0.32437187341144297, + "grad_norm": 0.0888671875, + "learning_rate": 0.0025758549412865277, + "loss": 1.2599, + "step": 3699 + }, + { + "epoch": 0.3244595651858175, + "grad_norm": 0.072265625, + "learning_rate": 0.0025755581968716634, + "loss": 1.2609, + "step": 3700 + }, + { + "epoch": 0.32454725696019204, + "grad_norm": 0.07666015625, + "learning_rate": 0.002575261368048281, + "loss": 1.2195, + "step": 3701 + }, + { + "epoch": 0.3246349487345666, + "grad_norm": 0.07177734375, + "learning_rate": 0.0025749644548434497, + "loss": 1.1784, + "step": 3702 + }, + { + "epoch": 0.3247226405089411, + "grad_norm": 0.07763671875, + "learning_rate": 0.0025746674572842483, + "loss": 1.1971, + "step": 3703 + }, + { + "epoch": 0.3248103322833157, + "grad_norm": 0.062255859375, + "learning_rate": 0.002574370375397762, + "loss": 1.1759, + "step": 3704 + }, + { + "epoch": 0.32489802405769025, + "grad_norm": 0.095703125, + "learning_rate": 0.0025740732092110834, + "loss": 1.1879, + "step": 3705 + }, + { + "epoch": 0.32498571583206476, + "grad_norm": 0.0537109375, + "learning_rate": 0.0025737759587513144, + "loss": 1.2027, + "step": 3706 + }, + { + "epoch": 0.3250734076064393, + "grad_norm": 0.0771484375, + "learning_rate": 0.002573478624045562, + "loss": 1.2157, + "step": 3707 + }, + { + "epoch": 0.3251610993808139, + "grad_norm": 0.061279296875, + "learning_rate": 0.002573181205120943, + "loss": 1.1949, + "step": 3708 + }, + { + "epoch": 0.3252487911551884, + "grad_norm": 0.06982421875, + "learning_rate": 0.0025728837020045814, + "loss": 1.2089, + "step": 3709 + }, + { + "epoch": 0.32533648292956296, + "grad_norm": 0.061279296875, + "learning_rate": 0.0025725861147236086, + "loss": 1.2611, + "step": 3710 + }, + { + "epoch": 0.3254241747039375, + "grad_norm": 0.08935546875, + "learning_rate": 0.0025722884433051637, + "loss": 1.1872, + "step": 3711 + }, + { + "epoch": 0.32551186647831204, + "grad_norm": 0.0712890625, + "learning_rate": 0.002571990687776394, + "loss": 1.2293, + "step": 3712 + }, + { + "epoch": 0.3255995582526866, + "grad_norm": 0.1015625, + "learning_rate": 0.002571692848164453, + "loss": 1.1905, + "step": 3713 + }, + { + "epoch": 0.3256872500270611, + "grad_norm": 0.07763671875, + "learning_rate": 0.002571394924496504, + "loss": 1.2039, + "step": 3714 + }, + { + "epoch": 0.3257749418014357, + "grad_norm": 0.052734375, + "learning_rate": 0.002571096916799716, + "loss": 1.2498, + "step": 3715 + }, + { + "epoch": 0.32586263357581025, + "grad_norm": 0.0908203125, + "learning_rate": 0.002570798825101268, + "loss": 1.2058, + "step": 3716 + }, + { + "epoch": 0.32595032535018476, + "grad_norm": 0.054931640625, + "learning_rate": 0.002570500649428344, + "loss": 1.1403, + "step": 3717 + }, + { + "epoch": 0.3260380171245593, + "grad_norm": 0.055419921875, + "learning_rate": 0.002570202389808136, + "loss": 1.2172, + "step": 3718 + }, + { + "epoch": 0.3261257088989339, + "grad_norm": 0.05810546875, + "learning_rate": 0.0025699040462678464, + "loss": 1.2432, + "step": 3719 + }, + { + "epoch": 0.3262134006733084, + "grad_norm": 0.08837890625, + "learning_rate": 0.002569605618834682, + "loss": 1.1899, + "step": 3720 + }, + { + "epoch": 0.32630109244768296, + "grad_norm": 0.0537109375, + "learning_rate": 0.0025693071075358594, + "loss": 1.2001, + "step": 3721 + }, + { + "epoch": 0.32638878422205747, + "grad_norm": 0.0927734375, + "learning_rate": 0.002569008512398602, + "loss": 1.1853, + "step": 3722 + }, + { + "epoch": 0.32647647599643204, + "grad_norm": 0.10498046875, + "learning_rate": 0.0025687098334501402, + "loss": 1.2448, + "step": 3723 + }, + { + "epoch": 0.3265641677708066, + "grad_norm": 0.09716796875, + "learning_rate": 0.002568411070717714, + "loss": 1.1032, + "step": 3724 + }, + { + "epoch": 0.3266518595451811, + "grad_norm": 0.123046875, + "learning_rate": 0.002568112224228568, + "loss": 1.1911, + "step": 3725 + }, + { + "epoch": 0.3267395513195557, + "grad_norm": 0.064453125, + "learning_rate": 0.002567813294009958, + "loss": 1.2242, + "step": 3726 + }, + { + "epoch": 0.32682724309393024, + "grad_norm": 0.10986328125, + "learning_rate": 0.002567514280089144, + "loss": 1.246, + "step": 3727 + }, + { + "epoch": 0.32691493486830475, + "grad_norm": 0.052978515625, + "learning_rate": 0.0025672151824933974, + "loss": 1.1884, + "step": 3728 + }, + { + "epoch": 0.3270026266426793, + "grad_norm": 0.07275390625, + "learning_rate": 0.002566916001249993, + "loss": 1.189, + "step": 3729 + }, + { + "epoch": 0.3270903184170539, + "grad_norm": 0.061767578125, + "learning_rate": 0.002566616736386217, + "loss": 1.18, + "step": 3730 + }, + { + "epoch": 0.3271780101914284, + "grad_norm": 0.0576171875, + "learning_rate": 0.0025663173879293603, + "loss": 1.2033, + "step": 3731 + }, + { + "epoch": 0.32726570196580296, + "grad_norm": 0.04736328125, + "learning_rate": 0.002566017955906724, + "loss": 1.1358, + "step": 3732 + }, + { + "epoch": 0.32735339374017747, + "grad_norm": 0.07373046875, + "learning_rate": 0.002565718440345614, + "loss": 1.3151, + "step": 3733 + }, + { + "epoch": 0.32744108551455203, + "grad_norm": 0.0849609375, + "learning_rate": 0.002565418841273347, + "loss": 1.1943, + "step": 3734 + }, + { + "epoch": 0.3275287772889266, + "grad_norm": 0.0771484375, + "learning_rate": 0.002565119158717244, + "loss": 1.2604, + "step": 3735 + }, + { + "epoch": 0.3276164690633011, + "grad_norm": 0.0986328125, + "learning_rate": 0.002564819392704636, + "loss": 1.1909, + "step": 3736 + }, + { + "epoch": 0.3277041608376757, + "grad_norm": 0.10693359375, + "learning_rate": 0.0025645195432628617, + "loss": 1.2115, + "step": 3737 + }, + { + "epoch": 0.32779185261205024, + "grad_norm": 0.07666015625, + "learning_rate": 0.0025642196104192654, + "loss": 1.2065, + "step": 3738 + }, + { + "epoch": 0.32787954438642475, + "grad_norm": 0.08935546875, + "learning_rate": 0.002563919594201201, + "loss": 1.151, + "step": 3739 + }, + { + "epoch": 0.3279672361607993, + "grad_norm": 0.053466796875, + "learning_rate": 0.002563619494636029, + "loss": 1.2012, + "step": 3740 + }, + { + "epoch": 0.3280549279351739, + "grad_norm": 0.0654296875, + "learning_rate": 0.002563319311751117, + "loss": 1.1898, + "step": 3741 + }, + { + "epoch": 0.3281426197095484, + "grad_norm": 0.0537109375, + "learning_rate": 0.0025630190455738417, + "loss": 1.1545, + "step": 3742 + }, + { + "epoch": 0.32823031148392295, + "grad_norm": 0.06298828125, + "learning_rate": 0.0025627186961315867, + "loss": 1.2197, + "step": 3743 + }, + { + "epoch": 0.32831800325829746, + "grad_norm": 0.0537109375, + "learning_rate": 0.0025624182634517425, + "loss": 1.1481, + "step": 3744 + }, + { + "epoch": 0.32840569503267203, + "grad_norm": 0.0712890625, + "learning_rate": 0.0025621177475617083, + "loss": 1.2178, + "step": 3745 + }, + { + "epoch": 0.3284933868070466, + "grad_norm": 0.11181640625, + "learning_rate": 0.00256181714848889, + "loss": 1.2462, + "step": 3746 + }, + { + "epoch": 0.3285810785814211, + "grad_norm": 0.057861328125, + "learning_rate": 0.0025615164662607013, + "loss": 1.1869, + "step": 3747 + }, + { + "epoch": 0.32866877035579567, + "grad_norm": 0.0869140625, + "learning_rate": 0.0025612157009045644, + "loss": 1.1707, + "step": 3748 + }, + { + "epoch": 0.32875646213017024, + "grad_norm": 0.06689453125, + "learning_rate": 0.0025609148524479075, + "loss": 1.1469, + "step": 3749 + }, + { + "epoch": 0.32884415390454474, + "grad_norm": 0.055419921875, + "learning_rate": 0.002560613920918168, + "loss": 1.2231, + "step": 3750 + }, + { + "epoch": 0.3289318456789193, + "grad_norm": 0.0888671875, + "learning_rate": 0.0025603129063427894, + "loss": 1.1746, + "step": 3751 + }, + { + "epoch": 0.3290195374532939, + "grad_norm": 0.1015625, + "learning_rate": 0.0025600118087492235, + "loss": 1.1842, + "step": 3752 + }, + { + "epoch": 0.3291072292276684, + "grad_norm": 0.0927734375, + "learning_rate": 0.00255971062816493, + "loss": 1.196, + "step": 3753 + }, + { + "epoch": 0.32919492100204295, + "grad_norm": 0.064453125, + "learning_rate": 0.0025594093646173757, + "loss": 1.197, + "step": 3754 + }, + { + "epoch": 0.3292826127764175, + "grad_norm": 0.154296875, + "learning_rate": 0.0025591080181340355, + "loss": 1.1952, + "step": 3755 + }, + { + "epoch": 0.329370304550792, + "grad_norm": 0.09033203125, + "learning_rate": 0.0025588065887423898, + "loss": 1.2179, + "step": 3756 + }, + { + "epoch": 0.3294579963251666, + "grad_norm": 0.1513671875, + "learning_rate": 0.00255850507646993, + "loss": 1.1883, + "step": 3757 + }, + { + "epoch": 0.3295456880995411, + "grad_norm": 0.056640625, + "learning_rate": 0.0025582034813441522, + "loss": 1.1891, + "step": 3758 + }, + { + "epoch": 0.32963337987391567, + "grad_norm": 0.1279296875, + "learning_rate": 0.0025579018033925617, + "loss": 1.1942, + "step": 3759 + }, + { + "epoch": 0.32972107164829023, + "grad_norm": 0.162109375, + "learning_rate": 0.0025576000426426704, + "loss": 1.1921, + "step": 3760 + }, + { + "epoch": 0.32980876342266474, + "grad_norm": 0.09814453125, + "learning_rate": 0.002557298199121998, + "loss": 1.2723, + "step": 3761 + }, + { + "epoch": 0.3298964551970393, + "grad_norm": 0.2314453125, + "learning_rate": 0.002556996272858072, + "loss": 1.2081, + "step": 3762 + }, + { + "epoch": 0.32998414697141387, + "grad_norm": 0.072265625, + "learning_rate": 0.0025566942638784273, + "loss": 1.2951, + "step": 3763 + }, + { + "epoch": 0.3300718387457884, + "grad_norm": 0.1337890625, + "learning_rate": 0.0025563921722106067, + "loss": 1.1731, + "step": 3764 + }, + { + "epoch": 0.33015953052016295, + "grad_norm": 0.0810546875, + "learning_rate": 0.0025560899978821594, + "loss": 1.2414, + "step": 3765 + }, + { + "epoch": 0.3302472222945375, + "grad_norm": 0.09814453125, + "learning_rate": 0.0025557877409206438, + "loss": 1.2604, + "step": 3766 + }, + { + "epoch": 0.330334914068912, + "grad_norm": 0.060546875, + "learning_rate": 0.002555485401353624, + "loss": 1.1924, + "step": 3767 + }, + { + "epoch": 0.3304226058432866, + "grad_norm": 0.055908203125, + "learning_rate": 0.002555182979208674, + "loss": 1.2083, + "step": 3768 + }, + { + "epoch": 0.3305102976176611, + "grad_norm": 0.05712890625, + "learning_rate": 0.002554880474513372, + "loss": 1.2136, + "step": 3769 + }, + { + "epoch": 0.33059798939203566, + "grad_norm": 0.0830078125, + "learning_rate": 0.0025545778872953073, + "loss": 1.2399, + "step": 3770 + }, + { + "epoch": 0.33068568116641023, + "grad_norm": 0.06298828125, + "learning_rate": 0.002554275217582074, + "loss": 1.2094, + "step": 3771 + }, + { + "epoch": 0.33077337294078474, + "grad_norm": 0.08203125, + "learning_rate": 0.002553972465401275, + "loss": 1.2202, + "step": 3772 + }, + { + "epoch": 0.3308610647151593, + "grad_norm": 0.06982421875, + "learning_rate": 0.002553669630780521, + "loss": 1.2021, + "step": 3773 + }, + { + "epoch": 0.33094875648953387, + "grad_norm": 0.057373046875, + "learning_rate": 0.00255336671374743, + "loss": 1.1743, + "step": 3774 + }, + { + "epoch": 0.3310364482639084, + "grad_norm": 0.061767578125, + "learning_rate": 0.002553063714329626, + "loss": 1.2048, + "step": 3775 + }, + { + "epoch": 0.33112414003828294, + "grad_norm": 0.06005859375, + "learning_rate": 0.0025527606325547432, + "loss": 1.2572, + "step": 3776 + }, + { + "epoch": 0.3312118318126575, + "grad_norm": 0.0712890625, + "learning_rate": 0.002552457468450421, + "loss": 1.1929, + "step": 3777 + }, + { + "epoch": 0.331299523587032, + "grad_norm": 0.05810546875, + "learning_rate": 0.002552154222044307, + "loss": 1.259, + "step": 3778 + }, + { + "epoch": 0.3313872153614066, + "grad_norm": 0.0595703125, + "learning_rate": 0.0025518508933640572, + "loss": 1.2576, + "step": 3779 + }, + { + "epoch": 0.3314749071357811, + "grad_norm": 0.0595703125, + "learning_rate": 0.002551547482437334, + "loss": 1.205, + "step": 3780 + }, + { + "epoch": 0.33156259891015566, + "grad_norm": 0.060546875, + "learning_rate": 0.0025512439892918072, + "loss": 1.1563, + "step": 3781 + }, + { + "epoch": 0.3316502906845302, + "grad_norm": 0.0654296875, + "learning_rate": 0.0025509404139551555, + "loss": 1.2718, + "step": 3782 + }, + { + "epoch": 0.33173798245890473, + "grad_norm": 0.059326171875, + "learning_rate": 0.002550636756455064, + "loss": 1.1583, + "step": 3783 + }, + { + "epoch": 0.3318256742332793, + "grad_norm": 0.0556640625, + "learning_rate": 0.0025503330168192254, + "loss": 1.2247, + "step": 3784 + }, + { + "epoch": 0.33191336600765386, + "grad_norm": 0.068359375, + "learning_rate": 0.0025500291950753392, + "loss": 1.2187, + "step": 3785 + }, + { + "epoch": 0.3320010577820284, + "grad_norm": 0.0810546875, + "learning_rate": 0.0025497252912511145, + "loss": 1.1745, + "step": 3786 + }, + { + "epoch": 0.33208874955640294, + "grad_norm": 0.06103515625, + "learning_rate": 0.0025494213053742654, + "loss": 1.2522, + "step": 3787 + }, + { + "epoch": 0.3321764413307775, + "grad_norm": 0.0849609375, + "learning_rate": 0.0025491172374725157, + "loss": 1.2106, + "step": 3788 + }, + { + "epoch": 0.332264133105152, + "grad_norm": 0.0615234375, + "learning_rate": 0.0025488130875735947, + "loss": 1.1873, + "step": 3789 + }, + { + "epoch": 0.3323518248795266, + "grad_norm": 0.049072265625, + "learning_rate": 0.0025485088557052403, + "loss": 1.189, + "step": 3790 + }, + { + "epoch": 0.33243951665390115, + "grad_norm": 0.08349609375, + "learning_rate": 0.0025482045418951976, + "loss": 1.2034, + "step": 3791 + }, + { + "epoch": 0.33252720842827566, + "grad_norm": 0.053466796875, + "learning_rate": 0.0025479001461712193, + "loss": 1.2346, + "step": 3792 + }, + { + "epoch": 0.3326149002026502, + "grad_norm": 0.05615234375, + "learning_rate": 0.002547595668561066, + "loss": 1.1971, + "step": 3793 + }, + { + "epoch": 0.33270259197702473, + "grad_norm": 0.053955078125, + "learning_rate": 0.002547291109092505, + "loss": 1.17, + "step": 3794 + }, + { + "epoch": 0.3327902837513993, + "grad_norm": 0.0791015625, + "learning_rate": 0.002546986467793311, + "loss": 1.1872, + "step": 3795 + }, + { + "epoch": 0.33287797552577386, + "grad_norm": 0.061279296875, + "learning_rate": 0.0025466817446912664, + "loss": 1.1714, + "step": 3796 + }, + { + "epoch": 0.33296566730014837, + "grad_norm": 0.08740234375, + "learning_rate": 0.0025463769398141613, + "loss": 1.2313, + "step": 3797 + }, + { + "epoch": 0.33305335907452294, + "grad_norm": 0.080078125, + "learning_rate": 0.002546072053189793, + "loss": 1.2128, + "step": 3798 + }, + { + "epoch": 0.3331410508488975, + "grad_norm": 0.11279296875, + "learning_rate": 0.002545767084845968, + "loss": 1.2158, + "step": 3799 + }, + { + "epoch": 0.333228742623272, + "grad_norm": 0.09130859375, + "learning_rate": 0.002545462034810496, + "loss": 1.2332, + "step": 3800 + }, + { + "epoch": 0.3333164343976466, + "grad_norm": 0.09716796875, + "learning_rate": 0.002545156903111199, + "loss": 1.226, + "step": 3801 + }, + { + "epoch": 0.33340412617202114, + "grad_norm": 0.08203125, + "learning_rate": 0.002544851689775903, + "loss": 1.1971, + "step": 3802 + }, + { + "epoch": 0.33349181794639565, + "grad_norm": 0.0771484375, + "learning_rate": 0.0025445463948324423, + "loss": 1.1959, + "step": 3803 + }, + { + "epoch": 0.3335795097207702, + "grad_norm": 0.09228515625, + "learning_rate": 0.00254424101830866, + "loss": 1.216, + "step": 3804 + }, + { + "epoch": 0.3336672014951447, + "grad_norm": 0.10546875, + "learning_rate": 0.002543935560232406, + "loss": 1.2151, + "step": 3805 + }, + { + "epoch": 0.3337548932695193, + "grad_norm": 0.078125, + "learning_rate": 0.002543630020631536, + "loss": 1.2374, + "step": 3806 + }, + { + "epoch": 0.33384258504389386, + "grad_norm": 0.07666015625, + "learning_rate": 0.0025433243995339147, + "loss": 1.1865, + "step": 3807 + }, + { + "epoch": 0.33393027681826837, + "grad_norm": 0.07080078125, + "learning_rate": 0.0025430186969674145, + "loss": 1.2509, + "step": 3808 + }, + { + "epoch": 0.33401796859264293, + "grad_norm": 0.064453125, + "learning_rate": 0.0025427129129599146, + "loss": 1.2154, + "step": 3809 + }, + { + "epoch": 0.3341056603670175, + "grad_norm": 0.056396484375, + "learning_rate": 0.0025424070475393015, + "loss": 1.2328, + "step": 3810 + }, + { + "epoch": 0.334193352141392, + "grad_norm": 0.06103515625, + "learning_rate": 0.0025421011007334695, + "loss": 1.2164, + "step": 3811 + }, + { + "epoch": 0.3342810439157666, + "grad_norm": 0.06591796875, + "learning_rate": 0.0025417950725703203, + "loss": 1.2128, + "step": 3812 + }, + { + "epoch": 0.33436873569014114, + "grad_norm": 0.06591796875, + "learning_rate": 0.0025414889630777622, + "loss": 1.1729, + "step": 3813 + }, + { + "epoch": 0.33445642746451565, + "grad_norm": 0.08642578125, + "learning_rate": 0.002541182772283712, + "loss": 1.1531, + "step": 3814 + }, + { + "epoch": 0.3345441192388902, + "grad_norm": 0.0625, + "learning_rate": 0.0025408765002160934, + "loss": 1.155, + "step": 3815 + }, + { + "epoch": 0.3346318110132647, + "grad_norm": 0.1279296875, + "learning_rate": 0.0025405701469028382, + "loss": 1.2115, + "step": 3816 + }, + { + "epoch": 0.3347195027876393, + "grad_norm": 0.06884765625, + "learning_rate": 0.0025402637123718846, + "loss": 1.2175, + "step": 3817 + }, + { + "epoch": 0.33480719456201385, + "grad_norm": 0.130859375, + "learning_rate": 0.0025399571966511787, + "loss": 1.2269, + "step": 3818 + }, + { + "epoch": 0.33489488633638836, + "grad_norm": 0.06640625, + "learning_rate": 0.0025396505997686736, + "loss": 1.1774, + "step": 3819 + }, + { + "epoch": 0.33498257811076293, + "grad_norm": 0.0673828125, + "learning_rate": 0.00253934392175233, + "loss": 1.1798, + "step": 3820 + }, + { + "epoch": 0.3350702698851375, + "grad_norm": 0.080078125, + "learning_rate": 0.0025390371626301173, + "loss": 1.201, + "step": 3821 + }, + { + "epoch": 0.335157961659512, + "grad_norm": 0.068359375, + "learning_rate": 0.00253873032243001, + "loss": 1.2311, + "step": 3822 + }, + { + "epoch": 0.33524565343388657, + "grad_norm": 0.06201171875, + "learning_rate": 0.002538423401179992, + "loss": 1.2466, + "step": 3823 + }, + { + "epoch": 0.33533334520826114, + "grad_norm": 0.0654296875, + "learning_rate": 0.0025381163989080533, + "loss": 1.2409, + "step": 3824 + }, + { + "epoch": 0.33542103698263565, + "grad_norm": 0.06494140625, + "learning_rate": 0.002537809315642191, + "loss": 1.2727, + "step": 3825 + }, + { + "epoch": 0.3355087287570102, + "grad_norm": 0.078125, + "learning_rate": 0.002537502151410411, + "loss": 1.1955, + "step": 3826 + }, + { + "epoch": 0.3355964205313848, + "grad_norm": 0.0791015625, + "learning_rate": 0.0025371949062407266, + "loss": 1.2691, + "step": 3827 + }, + { + "epoch": 0.3356841123057593, + "grad_norm": 0.0751953125, + "learning_rate": 0.002536887580161157, + "loss": 1.1967, + "step": 3828 + }, + { + "epoch": 0.33577180408013385, + "grad_norm": 0.0859375, + "learning_rate": 0.002536580173199729, + "loss": 1.1999, + "step": 3829 + }, + { + "epoch": 0.33585949585450836, + "grad_norm": 0.057373046875, + "learning_rate": 0.002536272685384478, + "loss": 1.2114, + "step": 3830 + }, + { + "epoch": 0.3359471876288829, + "grad_norm": 0.0830078125, + "learning_rate": 0.002535965116743446, + "loss": 1.2365, + "step": 3831 + }, + { + "epoch": 0.3360348794032575, + "grad_norm": 0.058837890625, + "learning_rate": 0.0025356574673046826, + "loss": 1.1783, + "step": 3832 + }, + { + "epoch": 0.336122571177632, + "grad_norm": 0.0712890625, + "learning_rate": 0.002535349737096244, + "loss": 1.1973, + "step": 3833 + }, + { + "epoch": 0.33621026295200657, + "grad_norm": 0.06640625, + "learning_rate": 0.0025350419261461962, + "loss": 1.1656, + "step": 3834 + }, + { + "epoch": 0.33629795472638113, + "grad_norm": 0.05810546875, + "learning_rate": 0.002534734034482608, + "loss": 1.2391, + "step": 3835 + }, + { + "epoch": 0.33638564650075564, + "grad_norm": 0.0859375, + "learning_rate": 0.0025344260621335607, + "loss": 1.1438, + "step": 3836 + }, + { + "epoch": 0.3364733382751302, + "grad_norm": 0.059814453125, + "learning_rate": 0.0025341180091271393, + "loss": 1.2051, + "step": 3837 + }, + { + "epoch": 0.3365610300495048, + "grad_norm": 0.06591796875, + "learning_rate": 0.0025338098754914377, + "loss": 1.1704, + "step": 3838 + }, + { + "epoch": 0.3366487218238793, + "grad_norm": 0.052978515625, + "learning_rate": 0.0025335016612545573, + "loss": 1.1827, + "step": 3839 + }, + { + "epoch": 0.33673641359825385, + "grad_norm": 0.061279296875, + "learning_rate": 0.002533193366444606, + "loss": 1.2276, + "step": 3840 + }, + { + "epoch": 0.33682410537262836, + "grad_norm": 0.06640625, + "learning_rate": 0.0025328849910896995, + "loss": 1.2361, + "step": 3841 + }, + { + "epoch": 0.3369117971470029, + "grad_norm": 0.05322265625, + "learning_rate": 0.002532576535217961, + "loss": 1.1686, + "step": 3842 + }, + { + "epoch": 0.3369994889213775, + "grad_norm": 0.05517578125, + "learning_rate": 0.0025322679988575214, + "loss": 1.2593, + "step": 3843 + }, + { + "epoch": 0.337087180695752, + "grad_norm": 0.062255859375, + "learning_rate": 0.0025319593820365177, + "loss": 1.1464, + "step": 3844 + }, + { + "epoch": 0.33717487247012656, + "grad_norm": 0.061767578125, + "learning_rate": 0.0025316506847830947, + "loss": 1.2232, + "step": 3845 + }, + { + "epoch": 0.33726256424450113, + "grad_norm": 0.049072265625, + "learning_rate": 0.002531341907125405, + "loss": 1.1705, + "step": 3846 + }, + { + "epoch": 0.33735025601887564, + "grad_norm": 0.061767578125, + "learning_rate": 0.002531033049091609, + "loss": 1.1531, + "step": 3847 + }, + { + "epoch": 0.3374379477932502, + "grad_norm": 0.0810546875, + "learning_rate": 0.002530724110709873, + "loss": 1.1871, + "step": 3848 + }, + { + "epoch": 0.33752563956762477, + "grad_norm": 0.0703125, + "learning_rate": 0.0025304150920083717, + "loss": 1.228, + "step": 3849 + }, + { + "epoch": 0.3376133313419993, + "grad_norm": 0.0947265625, + "learning_rate": 0.0025301059930152876, + "loss": 1.2601, + "step": 3850 + }, + { + "epoch": 0.33770102311637384, + "grad_norm": 0.06396484375, + "learning_rate": 0.002529796813758808, + "loss": 1.2028, + "step": 3851 + }, + { + "epoch": 0.33778871489074835, + "grad_norm": 0.06689453125, + "learning_rate": 0.0025294875542671304, + "loss": 1.1734, + "step": 3852 + }, + { + "epoch": 0.3378764066651229, + "grad_norm": 0.08251953125, + "learning_rate": 0.0025291782145684585, + "loss": 1.1952, + "step": 3853 + }, + { + "epoch": 0.3379640984394975, + "grad_norm": 0.068359375, + "learning_rate": 0.0025288687946910028, + "loss": 1.2404, + "step": 3854 + }, + { + "epoch": 0.338051790213872, + "grad_norm": 0.1259765625, + "learning_rate": 0.0025285592946629816, + "loss": 1.2223, + "step": 3855 + }, + { + "epoch": 0.33813948198824656, + "grad_norm": 0.076171875, + "learning_rate": 0.002528249714512621, + "loss": 1.2059, + "step": 3856 + }, + { + "epoch": 0.3382271737626211, + "grad_norm": 0.1201171875, + "learning_rate": 0.0025279400542681536, + "loss": 1.1894, + "step": 3857 + }, + { + "epoch": 0.33831486553699563, + "grad_norm": 0.057861328125, + "learning_rate": 0.00252763031395782, + "loss": 1.2305, + "step": 3858 + }, + { + "epoch": 0.3384025573113702, + "grad_norm": 0.134765625, + "learning_rate": 0.002527320493609867, + "loss": 1.2219, + "step": 3859 + }, + { + "epoch": 0.33849024908574477, + "grad_norm": 0.05712890625, + "learning_rate": 0.002527010593252549, + "loss": 1.1756, + "step": 3860 + }, + { + "epoch": 0.3385779408601193, + "grad_norm": 0.0712890625, + "learning_rate": 0.0025267006129141303, + "loss": 1.1833, + "step": 3861 + }, + { + "epoch": 0.33866563263449384, + "grad_norm": 0.055419921875, + "learning_rate": 0.0025263905526228783, + "loss": 1.169, + "step": 3862 + }, + { + "epoch": 0.3387533244088684, + "grad_norm": 0.0791015625, + "learning_rate": 0.00252608041240707, + "loss": 1.2373, + "step": 3863 + }, + { + "epoch": 0.3388410161832429, + "grad_norm": 0.061767578125, + "learning_rate": 0.00252577019229499, + "loss": 1.1814, + "step": 3864 + }, + { + "epoch": 0.3389287079576175, + "grad_norm": 0.0771484375, + "learning_rate": 0.0025254598923149293, + "loss": 1.2448, + "step": 3865 + }, + { + "epoch": 0.339016399731992, + "grad_norm": 0.076171875, + "learning_rate": 0.002525149512495186, + "loss": 1.211, + "step": 3866 + }, + { + "epoch": 0.33910409150636656, + "grad_norm": 0.087890625, + "learning_rate": 0.0025248390528640665, + "loss": 1.1562, + "step": 3867 + }, + { + "epoch": 0.3391917832807411, + "grad_norm": 0.06005859375, + "learning_rate": 0.002524528513449884, + "loss": 1.2184, + "step": 3868 + }, + { + "epoch": 0.33927947505511563, + "grad_norm": 0.119140625, + "learning_rate": 0.0025242178942809584, + "loss": 1.169, + "step": 3869 + }, + { + "epoch": 0.3393671668294902, + "grad_norm": 0.08056640625, + "learning_rate": 0.002523907195385618, + "loss": 1.1899, + "step": 3870 + }, + { + "epoch": 0.33945485860386476, + "grad_norm": 0.08837890625, + "learning_rate": 0.002523596416792197, + "loss": 1.2032, + "step": 3871 + }, + { + "epoch": 0.33954255037823927, + "grad_norm": 0.06103515625, + "learning_rate": 0.002523285558529038, + "loss": 1.2399, + "step": 3872 + }, + { + "epoch": 0.33963024215261384, + "grad_norm": 0.07275390625, + "learning_rate": 0.0025229746206244905, + "loss": 1.2305, + "step": 3873 + }, + { + "epoch": 0.3397179339269884, + "grad_norm": 0.052001953125, + "learning_rate": 0.0025226636031069117, + "loss": 1.2, + "step": 3874 + }, + { + "epoch": 0.3398056257013629, + "grad_norm": 0.095703125, + "learning_rate": 0.002522352506004664, + "loss": 1.2619, + "step": 3875 + }, + { + "epoch": 0.3398933174757375, + "grad_norm": 0.07177734375, + "learning_rate": 0.0025220413293461203, + "loss": 1.184, + "step": 3876 + }, + { + "epoch": 0.339981009250112, + "grad_norm": 0.11474609375, + "learning_rate": 0.002521730073159659, + "loss": 1.1855, + "step": 3877 + }, + { + "epoch": 0.34006870102448655, + "grad_norm": 0.0703125, + "learning_rate": 0.0025214187374736647, + "loss": 1.1762, + "step": 3878 + }, + { + "epoch": 0.3401563927988611, + "grad_norm": 0.06640625, + "learning_rate": 0.002521107322316531, + "loss": 1.225, + "step": 3879 + }, + { + "epoch": 0.34024408457323563, + "grad_norm": 0.1083984375, + "learning_rate": 0.002520795827716659, + "loss": 1.2175, + "step": 3880 + }, + { + "epoch": 0.3403317763476102, + "grad_norm": 0.0537109375, + "learning_rate": 0.002520484253702455, + "loss": 1.2177, + "step": 3881 + }, + { + "epoch": 0.34041946812198476, + "grad_norm": 0.1083984375, + "learning_rate": 0.0025201726003023337, + "loss": 1.1474, + "step": 3882 + }, + { + "epoch": 0.34050715989635927, + "grad_norm": 0.0595703125, + "learning_rate": 0.0025198608675447184, + "loss": 1.2183, + "step": 3883 + }, + { + "epoch": 0.34059485167073383, + "grad_norm": 0.10205078125, + "learning_rate": 0.002519549055458037, + "loss": 1.258, + "step": 3884 + }, + { + "epoch": 0.3406825434451084, + "grad_norm": 0.0732421875, + "learning_rate": 0.0025192371640707267, + "loss": 1.2241, + "step": 3885 + }, + { + "epoch": 0.3407702352194829, + "grad_norm": 0.0859375, + "learning_rate": 0.002518925193411231, + "loss": 1.2153, + "step": 3886 + }, + { + "epoch": 0.3408579269938575, + "grad_norm": 0.060546875, + "learning_rate": 0.002518613143508001, + "loss": 1.1849, + "step": 3887 + }, + { + "epoch": 0.340945618768232, + "grad_norm": 0.0703125, + "learning_rate": 0.002518301014389494, + "loss": 1.1763, + "step": 3888 + }, + { + "epoch": 0.34103331054260655, + "grad_norm": 0.059814453125, + "learning_rate": 0.002517988806084176, + "loss": 1.2465, + "step": 3889 + }, + { + "epoch": 0.3411210023169811, + "grad_norm": 0.06640625, + "learning_rate": 0.00251767651862052, + "loss": 1.1731, + "step": 3890 + }, + { + "epoch": 0.3412086940913556, + "grad_norm": 0.0654296875, + "learning_rate": 0.0025173641520270054, + "loss": 1.1931, + "step": 3891 + }, + { + "epoch": 0.3412963858657302, + "grad_norm": 0.05810546875, + "learning_rate": 0.0025170517063321197, + "loss": 1.1758, + "step": 3892 + }, + { + "epoch": 0.34138407764010475, + "grad_norm": 0.0595703125, + "learning_rate": 0.0025167391815643556, + "loss": 1.165, + "step": 3893 + }, + { + "epoch": 0.34147176941447926, + "grad_norm": 0.07177734375, + "learning_rate": 0.0025164265777522163, + "loss": 1.1666, + "step": 3894 + }, + { + "epoch": 0.34155946118885383, + "grad_norm": 0.078125, + "learning_rate": 0.0025161138949242098, + "loss": 1.1887, + "step": 3895 + }, + { + "epoch": 0.3416471529632284, + "grad_norm": 0.0966796875, + "learning_rate": 0.0025158011331088514, + "loss": 1.2408, + "step": 3896 + }, + { + "epoch": 0.3417348447376029, + "grad_norm": 0.052001953125, + "learning_rate": 0.0025154882923346658, + "loss": 1.2252, + "step": 3897 + }, + { + "epoch": 0.34182253651197747, + "grad_norm": 0.0595703125, + "learning_rate": 0.0025151753726301808, + "loss": 1.2157, + "step": 3898 + }, + { + "epoch": 0.34191022828635204, + "grad_norm": 0.08056640625, + "learning_rate": 0.002514862374023936, + "loss": 1.2426, + "step": 3899 + }, + { + "epoch": 0.34199792006072655, + "grad_norm": 0.0810546875, + "learning_rate": 0.0025145492965444756, + "loss": 1.2843, + "step": 3900 + }, + { + "epoch": 0.3420856118351011, + "grad_norm": 0.054931640625, + "learning_rate": 0.0025142361402203505, + "loss": 1.1935, + "step": 3901 + }, + { + "epoch": 0.3421733036094756, + "grad_norm": 0.050537109375, + "learning_rate": 0.0025139229050801207, + "loss": 1.1665, + "step": 3902 + }, + { + "epoch": 0.3422609953838502, + "grad_norm": 0.054931640625, + "learning_rate": 0.0025136095911523523, + "loss": 1.1793, + "step": 3903 + }, + { + "epoch": 0.34234868715822475, + "grad_norm": 0.059814453125, + "learning_rate": 0.002513296198465619, + "loss": 1.1819, + "step": 3904 + }, + { + "epoch": 0.34243637893259926, + "grad_norm": 0.0478515625, + "learning_rate": 0.0025129827270485, + "loss": 1.2102, + "step": 3905 + }, + { + "epoch": 0.3425240707069738, + "grad_norm": 0.051513671875, + "learning_rate": 0.0025126691769295846, + "loss": 1.2154, + "step": 3906 + }, + { + "epoch": 0.3426117624813484, + "grad_norm": 0.07080078125, + "learning_rate": 0.002512355548137467, + "loss": 1.2066, + "step": 3907 + }, + { + "epoch": 0.3426994542557229, + "grad_norm": 0.0712890625, + "learning_rate": 0.00251204184070075, + "loss": 1.1542, + "step": 3908 + }, + { + "epoch": 0.34278714603009747, + "grad_norm": 0.0693359375, + "learning_rate": 0.0025117280546480417, + "loss": 1.1944, + "step": 3909 + }, + { + "epoch": 0.34287483780447203, + "grad_norm": 0.0673828125, + "learning_rate": 0.0025114141900079603, + "loss": 1.1956, + "step": 3910 + }, + { + "epoch": 0.34296252957884654, + "grad_norm": 0.0908203125, + "learning_rate": 0.002511100246809128, + "loss": 1.2062, + "step": 3911 + }, + { + "epoch": 0.3430502213532211, + "grad_norm": 0.060791015625, + "learning_rate": 0.0025107862250801764, + "loss": 1.2028, + "step": 3912 + }, + { + "epoch": 0.3431379131275956, + "grad_norm": 0.1328125, + "learning_rate": 0.002510472124849743, + "loss": 1.2182, + "step": 3913 + }, + { + "epoch": 0.3432256049019702, + "grad_norm": 0.061767578125, + "learning_rate": 0.002510157946146473, + "loss": 1.1644, + "step": 3914 + }, + { + "epoch": 0.34331329667634475, + "grad_norm": 0.103515625, + "learning_rate": 0.0025098436889990187, + "loss": 1.2194, + "step": 3915 + }, + { + "epoch": 0.34340098845071926, + "grad_norm": 0.06689453125, + "learning_rate": 0.0025095293534360395, + "loss": 1.2093, + "step": 3916 + }, + { + "epoch": 0.3434886802250938, + "grad_norm": 0.0771484375, + "learning_rate": 0.0025092149394862024, + "loss": 1.2212, + "step": 3917 + }, + { + "epoch": 0.3435763719994684, + "grad_norm": 0.08251953125, + "learning_rate": 0.0025089004471781813, + "loss": 1.1971, + "step": 3918 + }, + { + "epoch": 0.3436640637738429, + "grad_norm": 0.054443359375, + "learning_rate": 0.0025085858765406567, + "loss": 1.1094, + "step": 3919 + }, + { + "epoch": 0.34375175554821746, + "grad_norm": 0.06494140625, + "learning_rate": 0.002508271227602316, + "loss": 1.2273, + "step": 3920 + }, + { + "epoch": 0.34383944732259203, + "grad_norm": 0.06982421875, + "learning_rate": 0.002507956500391855, + "loss": 1.217, + "step": 3921 + }, + { + "epoch": 0.34392713909696654, + "grad_norm": 0.08154296875, + "learning_rate": 0.0025076416949379758, + "loss": 1.2243, + "step": 3922 + }, + { + "epoch": 0.3440148308713411, + "grad_norm": 0.0810546875, + "learning_rate": 0.0025073268112693887, + "loss": 1.1952, + "step": 3923 + }, + { + "epoch": 0.3441025226457156, + "grad_norm": 0.058837890625, + "learning_rate": 0.002507011849414809, + "loss": 1.1919, + "step": 3924 + }, + { + "epoch": 0.3441902144200902, + "grad_norm": 0.0751953125, + "learning_rate": 0.002506696809402961, + "loss": 1.2253, + "step": 3925 + }, + { + "epoch": 0.34427790619446474, + "grad_norm": 0.087890625, + "learning_rate": 0.002506381691262576, + "loss": 1.2012, + "step": 3926 + }, + { + "epoch": 0.34436559796883925, + "grad_norm": 0.0595703125, + "learning_rate": 0.0025060664950223915, + "loss": 1.1712, + "step": 3927 + }, + { + "epoch": 0.3444532897432138, + "grad_norm": 0.09375, + "learning_rate": 0.002505751220711152, + "loss": 1.1488, + "step": 3928 + }, + { + "epoch": 0.3445409815175884, + "grad_norm": 0.058837890625, + "learning_rate": 0.0025054358683576105, + "loss": 1.1479, + "step": 3929 + }, + { + "epoch": 0.3446286732919629, + "grad_norm": 0.0634765625, + "learning_rate": 0.0025051204379905262, + "loss": 1.2139, + "step": 3930 + }, + { + "epoch": 0.34471636506633746, + "grad_norm": 0.06787109375, + "learning_rate": 0.0025048049296386658, + "loss": 1.2481, + "step": 3931 + }, + { + "epoch": 0.344804056840712, + "grad_norm": 0.0751953125, + "learning_rate": 0.0025044893433308017, + "loss": 1.1538, + "step": 3932 + }, + { + "epoch": 0.34489174861508654, + "grad_norm": 0.0771484375, + "learning_rate": 0.0025041736790957156, + "loss": 1.194, + "step": 3933 + }, + { + "epoch": 0.3449794403894611, + "grad_norm": 0.062255859375, + "learning_rate": 0.0025038579369621953, + "loss": 1.1806, + "step": 3934 + }, + { + "epoch": 0.34506713216383567, + "grad_norm": 0.1240234375, + "learning_rate": 0.002503542116959035, + "loss": 1.245, + "step": 3935 + }, + { + "epoch": 0.3451548239382102, + "grad_norm": 0.07177734375, + "learning_rate": 0.0025032262191150376, + "loss": 1.1789, + "step": 3936 + }, + { + "epoch": 0.34524251571258474, + "grad_norm": 0.158203125, + "learning_rate": 0.0025029102434590113, + "loss": 1.2087, + "step": 3937 + }, + { + "epoch": 0.34533020748695925, + "grad_norm": 0.10302734375, + "learning_rate": 0.002502594190019773, + "loss": 1.19, + "step": 3938 + }, + { + "epoch": 0.3454178992613338, + "grad_norm": 0.150390625, + "learning_rate": 0.0025022780588261446, + "loss": 1.1945, + "step": 3939 + }, + { + "epoch": 0.3455055910357084, + "grad_norm": 0.0712890625, + "learning_rate": 0.0025019618499069578, + "loss": 1.1989, + "step": 3940 + }, + { + "epoch": 0.3455932828100829, + "grad_norm": 0.12109375, + "learning_rate": 0.00250164556329105, + "loss": 1.1456, + "step": 3941 + }, + { + "epoch": 0.34568097458445746, + "grad_norm": 0.06201171875, + "learning_rate": 0.0025013291990072647, + "loss": 1.1782, + "step": 3942 + }, + { + "epoch": 0.345768666358832, + "grad_norm": 0.06640625, + "learning_rate": 0.0025010127570844545, + "loss": 1.2424, + "step": 3943 + }, + { + "epoch": 0.34585635813320653, + "grad_norm": 0.056884765625, + "learning_rate": 0.002500696237551478, + "loss": 1.1906, + "step": 3944 + }, + { + "epoch": 0.3459440499075811, + "grad_norm": 0.060791015625, + "learning_rate": 0.0025003796404372003, + "loss": 1.1946, + "step": 3945 + }, + { + "epoch": 0.34603174168195566, + "grad_norm": 0.0576171875, + "learning_rate": 0.0025000629657704943, + "loss": 1.1226, + "step": 3946 + }, + { + "epoch": 0.34611943345633017, + "grad_norm": 0.08056640625, + "learning_rate": 0.0024997462135802413, + "loss": 1.233, + "step": 3947 + }, + { + "epoch": 0.34620712523070474, + "grad_norm": 0.08203125, + "learning_rate": 0.002499429383895326, + "loss": 1.2009, + "step": 3948 + }, + { + "epoch": 0.34629481700507925, + "grad_norm": 0.1083984375, + "learning_rate": 0.0024991124767446446, + "loss": 1.2342, + "step": 3949 + }, + { + "epoch": 0.3463825087794538, + "grad_norm": 0.06298828125, + "learning_rate": 0.0024987954921570966, + "loss": 1.1713, + "step": 3950 + }, + { + "epoch": 0.3464702005538284, + "grad_norm": 0.162109375, + "learning_rate": 0.0024984784301615914, + "loss": 1.1788, + "step": 3951 + }, + { + "epoch": 0.3465578923282029, + "grad_norm": 0.0986328125, + "learning_rate": 0.0024981612907870434, + "loss": 1.1809, + "step": 3952 + }, + { + "epoch": 0.34664558410257745, + "grad_norm": 0.080078125, + "learning_rate": 0.0024978440740623755, + "loss": 1.2026, + "step": 3953 + }, + { + "epoch": 0.346733275876952, + "grad_norm": 0.052734375, + "learning_rate": 0.002497526780016517, + "loss": 1.1263, + "step": 3954 + }, + { + "epoch": 0.34682096765132653, + "grad_norm": 0.07177734375, + "learning_rate": 0.002497209408678403, + "loss": 1.1873, + "step": 3955 + }, + { + "epoch": 0.3469086594257011, + "grad_norm": 0.06298828125, + "learning_rate": 0.0024968919600769786, + "loss": 1.2177, + "step": 3956 + }, + { + "epoch": 0.34699635120007566, + "grad_norm": 0.07568359375, + "learning_rate": 0.0024965744342411943, + "loss": 1.2806, + "step": 3957 + }, + { + "epoch": 0.34708404297445017, + "grad_norm": 0.0908203125, + "learning_rate": 0.002496256831200006, + "loss": 1.2113, + "step": 3958 + }, + { + "epoch": 0.34717173474882473, + "grad_norm": 0.06298828125, + "learning_rate": 0.0024959391509823798, + "loss": 1.2125, + "step": 3959 + }, + { + "epoch": 0.34725942652319924, + "grad_norm": 0.1044921875, + "learning_rate": 0.0024956213936172863, + "loss": 1.2437, + "step": 3960 + }, + { + "epoch": 0.3473471182975738, + "grad_norm": 0.061767578125, + "learning_rate": 0.002495303559133706, + "loss": 1.2195, + "step": 3961 + }, + { + "epoch": 0.3474348100719484, + "grad_norm": 0.130859375, + "learning_rate": 0.002494985647560622, + "loss": 1.2298, + "step": 3962 + }, + { + "epoch": 0.3475225018463229, + "grad_norm": 0.055908203125, + "learning_rate": 0.0024946676589270283, + "loss": 1.2144, + "step": 3963 + }, + { + "epoch": 0.34761019362069745, + "grad_norm": 0.095703125, + "learning_rate": 0.0024943495932619247, + "loss": 1.2405, + "step": 3964 + }, + { + "epoch": 0.347697885395072, + "grad_norm": 0.10546875, + "learning_rate": 0.0024940314505943187, + "loss": 1.1543, + "step": 3965 + }, + { + "epoch": 0.3477855771694465, + "grad_norm": 0.053466796875, + "learning_rate": 0.002493713230953222, + "loss": 1.2461, + "step": 3966 + }, + { + "epoch": 0.3478732689438211, + "grad_norm": 0.12060546875, + "learning_rate": 0.002493394934367658, + "loss": 1.1611, + "step": 3967 + }, + { + "epoch": 0.34796096071819566, + "grad_norm": 0.06298828125, + "learning_rate": 0.002493076560866652, + "loss": 1.2033, + "step": 3968 + }, + { + "epoch": 0.34804865249257017, + "grad_norm": 0.0517578125, + "learning_rate": 0.0024927581104792406, + "loss": 1.1924, + "step": 3969 + }, + { + "epoch": 0.34813634426694473, + "grad_norm": 0.08740234375, + "learning_rate": 0.0024924395832344654, + "loss": 1.2847, + "step": 3970 + }, + { + "epoch": 0.34822403604131924, + "grad_norm": 0.0634765625, + "learning_rate": 0.0024921209791613744, + "loss": 1.1938, + "step": 3971 + }, + { + "epoch": 0.3483117278156938, + "grad_norm": 0.055419921875, + "learning_rate": 0.002491802298289024, + "loss": 1.2244, + "step": 3972 + }, + { + "epoch": 0.34839941959006837, + "grad_norm": 0.06201171875, + "learning_rate": 0.0024914835406464773, + "loss": 1.2591, + "step": 3973 + }, + { + "epoch": 0.3484871113644429, + "grad_norm": 0.10107421875, + "learning_rate": 0.002491164706262804, + "loss": 1.1762, + "step": 3974 + }, + { + "epoch": 0.34857480313881745, + "grad_norm": 0.0869140625, + "learning_rate": 0.002490845795167081, + "loss": 1.2163, + "step": 3975 + }, + { + "epoch": 0.348662494913192, + "grad_norm": 0.10009765625, + "learning_rate": 0.0024905268073883925, + "loss": 1.2662, + "step": 3976 + }, + { + "epoch": 0.3487501866875665, + "grad_norm": 0.0556640625, + "learning_rate": 0.002490207742955828, + "loss": 1.208, + "step": 3977 + }, + { + "epoch": 0.3488378784619411, + "grad_norm": 0.08349609375, + "learning_rate": 0.0024898886018984868, + "loss": 1.2244, + "step": 3978 + }, + { + "epoch": 0.34892557023631565, + "grad_norm": 0.09375, + "learning_rate": 0.0024895693842454736, + "loss": 1.1687, + "step": 3979 + }, + { + "epoch": 0.34901326201069016, + "grad_norm": 0.0888671875, + "learning_rate": 0.002489250090025899, + "loss": 1.169, + "step": 3980 + }, + { + "epoch": 0.3491009537850647, + "grad_norm": 0.08642578125, + "learning_rate": 0.002488930719268884, + "loss": 1.1745, + "step": 3981 + }, + { + "epoch": 0.3491886455594393, + "grad_norm": 0.06640625, + "learning_rate": 0.0024886112720035524, + "loss": 1.1344, + "step": 3982 + }, + { + "epoch": 0.3492763373338138, + "grad_norm": 0.08203125, + "learning_rate": 0.0024882917482590377, + "loss": 1.1921, + "step": 3983 + }, + { + "epoch": 0.34936402910818837, + "grad_norm": 0.06689453125, + "learning_rate": 0.0024879721480644793, + "loss": 1.1843, + "step": 3984 + }, + { + "epoch": 0.3494517208825629, + "grad_norm": 0.08056640625, + "learning_rate": 0.0024876524714490247, + "loss": 1.2066, + "step": 3985 + }, + { + "epoch": 0.34953941265693744, + "grad_norm": 0.06640625, + "learning_rate": 0.002487332718441827, + "loss": 1.1675, + "step": 3986 + }, + { + "epoch": 0.349627104431312, + "grad_norm": 0.06689453125, + "learning_rate": 0.0024870128890720467, + "loss": 1.1768, + "step": 3987 + }, + { + "epoch": 0.3497147962056865, + "grad_norm": 0.08837890625, + "learning_rate": 0.002486692983368852, + "loss": 1.2232, + "step": 3988 + }, + { + "epoch": 0.3498024879800611, + "grad_norm": 0.06689453125, + "learning_rate": 0.0024863730013614167, + "loss": 1.1969, + "step": 3989 + }, + { + "epoch": 0.34989017975443565, + "grad_norm": 0.0595703125, + "learning_rate": 0.0024860529430789226, + "loss": 1.1865, + "step": 3990 + }, + { + "epoch": 0.34997787152881016, + "grad_norm": 0.058349609375, + "learning_rate": 0.0024857328085505586, + "loss": 1.1836, + "step": 3991 + }, + { + "epoch": 0.3500655633031847, + "grad_norm": 0.07275390625, + "learning_rate": 0.00248541259780552, + "loss": 1.1888, + "step": 3992 + }, + { + "epoch": 0.3501532550775593, + "grad_norm": 0.058837890625, + "learning_rate": 0.0024850923108730096, + "loss": 1.1849, + "step": 3993 + }, + { + "epoch": 0.3502409468519338, + "grad_norm": 0.1005859375, + "learning_rate": 0.0024847719477822353, + "loss": 1.2671, + "step": 3994 + }, + { + "epoch": 0.35032863862630836, + "grad_norm": 0.06396484375, + "learning_rate": 0.0024844515085624143, + "loss": 1.2104, + "step": 3995 + }, + { + "epoch": 0.3504163304006829, + "grad_norm": 0.0888671875, + "learning_rate": 0.002484130993242771, + "loss": 1.1946, + "step": 3996 + }, + { + "epoch": 0.35050402217505744, + "grad_norm": 0.0654296875, + "learning_rate": 0.002483810401852534, + "loss": 1.1576, + "step": 3997 + }, + { + "epoch": 0.350591713949432, + "grad_norm": 0.05712890625, + "learning_rate": 0.00248348973442094, + "loss": 1.2008, + "step": 3998 + }, + { + "epoch": 0.3506794057238065, + "grad_norm": 0.06396484375, + "learning_rate": 0.0024831689909772343, + "loss": 1.2239, + "step": 3999 + }, + { + "epoch": 0.3507670974981811, + "grad_norm": 0.07080078125, + "learning_rate": 0.002482848171550668, + "loss": 1.2102, + "step": 4000 + }, + { + "epoch": 0.3507670974981811, + "eval_loss": 1.2098665237426758, + "eval_runtime": 429.0914, + "eval_samples_per_second": 33.669, + "eval_steps_per_second": 8.418, + "step": 4000 + }, + { + "epoch": 0.35085478927255564, + "grad_norm": 0.0556640625, + "learning_rate": 0.0024825272761704975, + "loss": 1.1917, + "step": 4001 + }, + { + "epoch": 0.35094248104693015, + "grad_norm": 0.05322265625, + "learning_rate": 0.0024822063048659886, + "loss": 1.2528, + "step": 4002 + }, + { + "epoch": 0.3510301728213047, + "grad_norm": 0.054931640625, + "learning_rate": 0.0024818852576664135, + "loss": 1.1819, + "step": 4003 + }, + { + "epoch": 0.3511178645956793, + "grad_norm": 0.057373046875, + "learning_rate": 0.0024815641346010507, + "loss": 1.1739, + "step": 4004 + }, + { + "epoch": 0.3512055563700538, + "grad_norm": 0.06005859375, + "learning_rate": 0.002481242935699185, + "loss": 1.211, + "step": 4005 + }, + { + "epoch": 0.35129324814442836, + "grad_norm": 0.059814453125, + "learning_rate": 0.00248092166099011, + "loss": 1.1861, + "step": 4006 + }, + { + "epoch": 0.35138093991880287, + "grad_norm": 0.0986328125, + "learning_rate": 0.0024806003105031236, + "loss": 1.2647, + "step": 4007 + }, + { + "epoch": 0.35146863169317744, + "grad_norm": 0.061767578125, + "learning_rate": 0.0024802788842675336, + "loss": 1.1157, + "step": 4008 + }, + { + "epoch": 0.351556323467552, + "grad_norm": 0.059326171875, + "learning_rate": 0.0024799573823126525, + "loss": 1.236, + "step": 4009 + }, + { + "epoch": 0.3516440152419265, + "grad_norm": 0.054443359375, + "learning_rate": 0.002479635804667801, + "loss": 1.1695, + "step": 4010 + }, + { + "epoch": 0.3517317070163011, + "grad_norm": 0.056396484375, + "learning_rate": 0.0024793141513623053, + "loss": 1.1794, + "step": 4011 + }, + { + "epoch": 0.35181939879067564, + "grad_norm": 0.052734375, + "learning_rate": 0.0024789924224255006, + "loss": 1.255, + "step": 4012 + }, + { + "epoch": 0.35190709056505015, + "grad_norm": 0.0634765625, + "learning_rate": 0.0024786706178867267, + "loss": 1.1786, + "step": 4013 + }, + { + "epoch": 0.3519947823394247, + "grad_norm": 0.061279296875, + "learning_rate": 0.0024783487377753313, + "loss": 1.2247, + "step": 4014 + }, + { + "epoch": 0.3520824741137993, + "grad_norm": 0.0693359375, + "learning_rate": 0.00247802678212067, + "loss": 1.2098, + "step": 4015 + }, + { + "epoch": 0.3521701658881738, + "grad_norm": 0.06396484375, + "learning_rate": 0.002477704750952103, + "loss": 1.2144, + "step": 4016 + }, + { + "epoch": 0.35225785766254836, + "grad_norm": 0.0791015625, + "learning_rate": 0.0024773826442989998, + "loss": 1.2155, + "step": 4017 + }, + { + "epoch": 0.3523455494369229, + "grad_norm": 0.07861328125, + "learning_rate": 0.0024770604621907354, + "loss": 1.2848, + "step": 4018 + }, + { + "epoch": 0.35243324121129743, + "grad_norm": 0.0654296875, + "learning_rate": 0.002476738204656692, + "loss": 1.2201, + "step": 4019 + }, + { + "epoch": 0.352520932985672, + "grad_norm": 0.107421875, + "learning_rate": 0.0024764158717262576, + "loss": 1.2494, + "step": 4020 + }, + { + "epoch": 0.3526086247600465, + "grad_norm": 0.07080078125, + "learning_rate": 0.0024760934634288304, + "loss": 1.2228, + "step": 4021 + }, + { + "epoch": 0.3526963165344211, + "grad_norm": 0.1611328125, + "learning_rate": 0.002475770979793811, + "loss": 1.1534, + "step": 4022 + }, + { + "epoch": 0.35278400830879564, + "grad_norm": 0.058837890625, + "learning_rate": 0.0024754484208506103, + "loss": 1.1702, + "step": 4023 + }, + { + "epoch": 0.35287170008317015, + "grad_norm": 0.11083984375, + "learning_rate": 0.0024751257866286445, + "loss": 1.231, + "step": 4024 + }, + { + "epoch": 0.3529593918575447, + "grad_norm": 0.07470703125, + "learning_rate": 0.0024748030771573368, + "loss": 1.183, + "step": 4025 + }, + { + "epoch": 0.3530470836319193, + "grad_norm": 0.07958984375, + "learning_rate": 0.002474480292466118, + "loss": 1.1784, + "step": 4026 + }, + { + "epoch": 0.3531347754062938, + "grad_norm": 0.07470703125, + "learning_rate": 0.0024741574325844247, + "loss": 1.1829, + "step": 4027 + }, + { + "epoch": 0.35322246718066835, + "grad_norm": 0.06884765625, + "learning_rate": 0.002473834497541701, + "loss": 1.1981, + "step": 4028 + }, + { + "epoch": 0.3533101589550429, + "grad_norm": 0.0888671875, + "learning_rate": 0.002473511487367398, + "loss": 1.2156, + "step": 4029 + }, + { + "epoch": 0.35339785072941743, + "grad_norm": 0.064453125, + "learning_rate": 0.0024731884020909737, + "loss": 1.1608, + "step": 4030 + }, + { + "epoch": 0.353485542503792, + "grad_norm": 0.09228515625, + "learning_rate": 0.0024728652417418916, + "loss": 1.2088, + "step": 4031 + }, + { + "epoch": 0.3535732342781665, + "grad_norm": 0.07080078125, + "learning_rate": 0.0024725420063496234, + "loss": 1.1753, + "step": 4032 + }, + { + "epoch": 0.35366092605254107, + "grad_norm": 0.09521484375, + "learning_rate": 0.002472218695943648, + "loss": 1.1922, + "step": 4033 + }, + { + "epoch": 0.35374861782691563, + "grad_norm": 0.076171875, + "learning_rate": 0.00247189531055345, + "loss": 1.1724, + "step": 4034 + }, + { + "epoch": 0.35383630960129014, + "grad_norm": 0.0712890625, + "learning_rate": 0.0024715718502085213, + "loss": 1.1824, + "step": 4035 + }, + { + "epoch": 0.3539240013756647, + "grad_norm": 0.07763671875, + "learning_rate": 0.0024712483149383615, + "loss": 1.176, + "step": 4036 + }, + { + "epoch": 0.3540116931500393, + "grad_norm": 0.04931640625, + "learning_rate": 0.002470924704772474, + "loss": 1.2307, + "step": 4037 + }, + { + "epoch": 0.3540993849244138, + "grad_norm": 0.08056640625, + "learning_rate": 0.002470601019740374, + "loss": 1.1297, + "step": 4038 + }, + { + "epoch": 0.35418707669878835, + "grad_norm": 0.068359375, + "learning_rate": 0.002470277259871579, + "loss": 1.208, + "step": 4039 + }, + { + "epoch": 0.3542747684731629, + "grad_norm": 0.057861328125, + "learning_rate": 0.002469953425195615, + "loss": 1.2233, + "step": 4040 + }, + { + "epoch": 0.3543624602475374, + "grad_norm": 0.10888671875, + "learning_rate": 0.002469629515742015, + "loss": 1.1881, + "step": 4041 + }, + { + "epoch": 0.354450152021912, + "grad_norm": 0.0595703125, + "learning_rate": 0.00246930553154032, + "loss": 1.2324, + "step": 4042 + }, + { + "epoch": 0.3545378437962865, + "grad_norm": 0.119140625, + "learning_rate": 0.0024689814726200753, + "loss": 1.1999, + "step": 4043 + }, + { + "epoch": 0.35462553557066107, + "grad_norm": 0.072265625, + "learning_rate": 0.002468657339010834, + "loss": 1.1888, + "step": 4044 + }, + { + "epoch": 0.35471322734503563, + "grad_norm": 0.08056640625, + "learning_rate": 0.002468333130742157, + "loss": 1.1923, + "step": 4045 + }, + { + "epoch": 0.35480091911941014, + "grad_norm": 0.0673828125, + "learning_rate": 0.0024680088478436116, + "loss": 1.2299, + "step": 4046 + }, + { + "epoch": 0.3548886108937847, + "grad_norm": 0.06201171875, + "learning_rate": 0.00246768449034477, + "loss": 1.1515, + "step": 4047 + }, + { + "epoch": 0.35497630266815927, + "grad_norm": 0.0537109375, + "learning_rate": 0.002467360058275214, + "loss": 1.2143, + "step": 4048 + }, + { + "epoch": 0.3550639944425338, + "grad_norm": 0.05908203125, + "learning_rate": 0.002467035551664531, + "loss": 1.2015, + "step": 4049 + }, + { + "epoch": 0.35515168621690835, + "grad_norm": 0.056640625, + "learning_rate": 0.0024667109705423145, + "loss": 1.2023, + "step": 4050 + }, + { + "epoch": 0.3552393779912829, + "grad_norm": 0.05810546875, + "learning_rate": 0.002466386314938166, + "loss": 1.1811, + "step": 4051 + }, + { + "epoch": 0.3553270697656574, + "grad_norm": 0.0615234375, + "learning_rate": 0.0024660615848816933, + "loss": 1.1924, + "step": 4052 + }, + { + "epoch": 0.355414761540032, + "grad_norm": 0.052001953125, + "learning_rate": 0.0024657367804025105, + "loss": 1.2035, + "step": 4053 + }, + { + "epoch": 0.35550245331440655, + "grad_norm": 0.060302734375, + "learning_rate": 0.002465411901530239, + "loss": 1.1671, + "step": 4054 + }, + { + "epoch": 0.35559014508878106, + "grad_norm": 0.057373046875, + "learning_rate": 0.002465086948294507, + "loss": 1.2076, + "step": 4055 + }, + { + "epoch": 0.3556778368631556, + "grad_norm": 0.06396484375, + "learning_rate": 0.00246476192072495, + "loss": 1.175, + "step": 4056 + }, + { + "epoch": 0.35576552863753014, + "grad_norm": 0.0595703125, + "learning_rate": 0.0024644368188512086, + "loss": 1.1606, + "step": 4057 + }, + { + "epoch": 0.3558532204119047, + "grad_norm": 0.057373046875, + "learning_rate": 0.0024641116427029314, + "loss": 1.1418, + "step": 4058 + }, + { + "epoch": 0.35594091218627927, + "grad_norm": 0.06884765625, + "learning_rate": 0.0024637863923097753, + "loss": 1.1985, + "step": 4059 + }, + { + "epoch": 0.3560286039606538, + "grad_norm": 0.060302734375, + "learning_rate": 0.0024634610677013997, + "loss": 1.1962, + "step": 4060 + }, + { + "epoch": 0.35611629573502834, + "grad_norm": 0.05908203125, + "learning_rate": 0.0024631356689074754, + "loss": 1.1436, + "step": 4061 + }, + { + "epoch": 0.3562039875094029, + "grad_norm": 0.0478515625, + "learning_rate": 0.002462810195957677, + "loss": 1.1962, + "step": 4062 + }, + { + "epoch": 0.3562916792837774, + "grad_norm": 0.06396484375, + "learning_rate": 0.002462484648881687, + "loss": 1.1953, + "step": 4063 + }, + { + "epoch": 0.356379371058152, + "grad_norm": 0.07177734375, + "learning_rate": 0.002462159027709194, + "loss": 1.2035, + "step": 4064 + }, + { + "epoch": 0.35646706283252655, + "grad_norm": 0.058837890625, + "learning_rate": 0.0024618333324698954, + "loss": 1.2382, + "step": 4065 + }, + { + "epoch": 0.35655475460690106, + "grad_norm": 0.058349609375, + "learning_rate": 0.0024615075631934913, + "loss": 1.2347, + "step": 4066 + }, + { + "epoch": 0.3566424463812756, + "grad_norm": 0.055419921875, + "learning_rate": 0.002461181719909693, + "loss": 1.2166, + "step": 4067 + }, + { + "epoch": 0.35673013815565013, + "grad_norm": 0.07470703125, + "learning_rate": 0.0024608558026482158, + "loss": 1.217, + "step": 4068 + }, + { + "epoch": 0.3568178299300247, + "grad_norm": 0.0712890625, + "learning_rate": 0.0024605298114387824, + "loss": 1.1648, + "step": 4069 + }, + { + "epoch": 0.35690552170439926, + "grad_norm": 0.056396484375, + "learning_rate": 0.002460203746311123, + "loss": 1.1743, + "step": 4070 + }, + { + "epoch": 0.3569932134787738, + "grad_norm": 0.109375, + "learning_rate": 0.0024598776072949735, + "loss": 1.2306, + "step": 4071 + }, + { + "epoch": 0.35708090525314834, + "grad_norm": 0.0673828125, + "learning_rate": 0.002459551394420077, + "loss": 1.1495, + "step": 4072 + }, + { + "epoch": 0.3571685970275229, + "grad_norm": 0.12353515625, + "learning_rate": 0.0024592251077161825, + "loss": 1.1746, + "step": 4073 + }, + { + "epoch": 0.3572562888018974, + "grad_norm": 0.0537109375, + "learning_rate": 0.0024588987472130472, + "loss": 1.2092, + "step": 4074 + }, + { + "epoch": 0.357343980576272, + "grad_norm": 0.07763671875, + "learning_rate": 0.002458572312940435, + "loss": 1.2551, + "step": 4075 + }, + { + "epoch": 0.35743167235064655, + "grad_norm": 0.07275390625, + "learning_rate": 0.0024582458049281146, + "loss": 1.1381, + "step": 4076 + }, + { + "epoch": 0.35751936412502106, + "grad_norm": 0.08154296875, + "learning_rate": 0.0024579192232058638, + "loss": 1.2417, + "step": 4077 + }, + { + "epoch": 0.3576070558993956, + "grad_norm": 0.0693359375, + "learning_rate": 0.002457592567803465, + "loss": 1.1758, + "step": 4078 + }, + { + "epoch": 0.35769474767377013, + "grad_norm": 0.07080078125, + "learning_rate": 0.002457265838750709, + "loss": 1.2283, + "step": 4079 + }, + { + "epoch": 0.3577824394481447, + "grad_norm": 0.064453125, + "learning_rate": 0.002456939036077392, + "loss": 1.2084, + "step": 4080 + }, + { + "epoch": 0.35787013122251926, + "grad_norm": 0.080078125, + "learning_rate": 0.0024566121598133183, + "loss": 1.1438, + "step": 4081 + }, + { + "epoch": 0.35795782299689377, + "grad_norm": 0.0654296875, + "learning_rate": 0.0024562852099882983, + "loss": 1.2152, + "step": 4082 + }, + { + "epoch": 0.35804551477126834, + "grad_norm": 0.0732421875, + "learning_rate": 0.002455958186632148, + "loss": 1.2063, + "step": 4083 + }, + { + "epoch": 0.3581332065456429, + "grad_norm": 0.060546875, + "learning_rate": 0.0024556310897746922, + "loss": 1.2277, + "step": 4084 + }, + { + "epoch": 0.3582208983200174, + "grad_norm": 0.057373046875, + "learning_rate": 0.0024553039194457602, + "loss": 1.1478, + "step": 4085 + }, + { + "epoch": 0.358308590094392, + "grad_norm": 0.06982421875, + "learning_rate": 0.00245497667567519, + "loss": 1.2032, + "step": 4086 + }, + { + "epoch": 0.35839628186876654, + "grad_norm": 0.061767578125, + "learning_rate": 0.002454649358492825, + "loss": 1.1712, + "step": 4087 + }, + { + "epoch": 0.35848397364314105, + "grad_norm": 0.064453125, + "learning_rate": 0.0024543219679285153, + "loss": 1.1696, + "step": 4088 + }, + { + "epoch": 0.3585716654175156, + "grad_norm": 0.06591796875, + "learning_rate": 0.002453994504012119, + "loss": 1.2239, + "step": 4089 + }, + { + "epoch": 0.3586593571918902, + "grad_norm": 0.0751953125, + "learning_rate": 0.002453666966773499, + "loss": 1.1189, + "step": 4090 + }, + { + "epoch": 0.3587470489662647, + "grad_norm": 0.0966796875, + "learning_rate": 0.002453339356242527, + "loss": 1.2107, + "step": 4091 + }, + { + "epoch": 0.35883474074063926, + "grad_norm": 0.1650390625, + "learning_rate": 0.002453011672449079, + "loss": 1.1908, + "step": 4092 + }, + { + "epoch": 0.35892243251501377, + "grad_norm": 0.06640625, + "learning_rate": 0.0024526839154230392, + "loss": 1.2217, + "step": 4093 + }, + { + "epoch": 0.35901012428938833, + "grad_norm": 0.076171875, + "learning_rate": 0.0024523560851942984, + "loss": 1.2338, + "step": 4094 + }, + { + "epoch": 0.3590978160637629, + "grad_norm": 0.1591796875, + "learning_rate": 0.0024520281817927542, + "loss": 1.2004, + "step": 4095 + }, + { + "epoch": 0.3591855078381374, + "grad_norm": 0.05712890625, + "learning_rate": 0.0024517002052483103, + "loss": 1.1644, + "step": 4096 + }, + { + "epoch": 0.359273199612512, + "grad_norm": 0.1318359375, + "learning_rate": 0.0024513721555908773, + "loss": 1.1475, + "step": 4097 + }, + { + "epoch": 0.35936089138688654, + "grad_norm": 0.080078125, + "learning_rate": 0.002451044032850372, + "loss": 1.2503, + "step": 4098 + }, + { + "epoch": 0.35944858316126105, + "grad_norm": 0.08349609375, + "learning_rate": 0.0024507158370567195, + "loss": 1.1619, + "step": 4099 + }, + { + "epoch": 0.3595362749356356, + "grad_norm": 0.1669921875, + "learning_rate": 0.0024503875682398493, + "loss": 1.1601, + "step": 4100 + }, + { + "epoch": 0.3596239667100102, + "grad_norm": 0.0654296875, + "learning_rate": 0.002450059226429699, + "loss": 1.159, + "step": 4101 + }, + { + "epoch": 0.3597116584843847, + "grad_norm": 0.1591796875, + "learning_rate": 0.002449730811656213, + "loss": 1.2004, + "step": 4102 + }, + { + "epoch": 0.35979935025875925, + "grad_norm": 0.0556640625, + "learning_rate": 0.0024494023239493407, + "loss": 1.153, + "step": 4103 + }, + { + "epoch": 0.35988704203313376, + "grad_norm": 0.0859375, + "learning_rate": 0.002449073763339041, + "loss": 1.1981, + "step": 4104 + }, + { + "epoch": 0.35997473380750833, + "grad_norm": 0.10546875, + "learning_rate": 0.0024487451298552763, + "loss": 1.2117, + "step": 4105 + }, + { + "epoch": 0.3600624255818829, + "grad_norm": 0.06884765625, + "learning_rate": 0.002448416423528018, + "loss": 1.1941, + "step": 4106 + }, + { + "epoch": 0.3601501173562574, + "grad_norm": 0.08740234375, + "learning_rate": 0.0024480876443872426, + "loss": 1.148, + "step": 4107 + }, + { + "epoch": 0.36023780913063197, + "grad_norm": 0.05419921875, + "learning_rate": 0.002447758792462934, + "loss": 1.2041, + "step": 4108 + }, + { + "epoch": 0.36032550090500653, + "grad_norm": 0.057373046875, + "learning_rate": 0.002447429867785084, + "loss": 1.2234, + "step": 4109 + }, + { + "epoch": 0.36041319267938104, + "grad_norm": 0.0732421875, + "learning_rate": 0.0024471008703836877, + "loss": 1.1527, + "step": 4110 + }, + { + "epoch": 0.3605008844537556, + "grad_norm": 0.062255859375, + "learning_rate": 0.00244677180028875, + "loss": 1.2224, + "step": 4111 + }, + { + "epoch": 0.3605885762281302, + "grad_norm": 0.057861328125, + "learning_rate": 0.002446442657530281, + "loss": 1.1912, + "step": 4112 + }, + { + "epoch": 0.3606762680025047, + "grad_norm": 0.06298828125, + "learning_rate": 0.0024461134421382975, + "loss": 1.1434, + "step": 4113 + }, + { + "epoch": 0.36076395977687925, + "grad_norm": 0.06787109375, + "learning_rate": 0.002445784154142824, + "loss": 1.1893, + "step": 4114 + }, + { + "epoch": 0.36085165155125376, + "grad_norm": 0.05224609375, + "learning_rate": 0.0024454547935738894, + "loss": 1.1631, + "step": 4115 + }, + { + "epoch": 0.3609393433256283, + "grad_norm": 0.05615234375, + "learning_rate": 0.002445125360461531, + "loss": 1.191, + "step": 4116 + }, + { + "epoch": 0.3610270351000029, + "grad_norm": 0.06884765625, + "learning_rate": 0.002444795854835792, + "loss": 1.1859, + "step": 4117 + }, + { + "epoch": 0.3611147268743774, + "grad_norm": 0.051025390625, + "learning_rate": 0.0024444662767267233, + "loss": 1.1426, + "step": 4118 + }, + { + "epoch": 0.36120241864875197, + "grad_norm": 0.055908203125, + "learning_rate": 0.002444136626164381, + "loss": 1.1814, + "step": 4119 + }, + { + "epoch": 0.36129011042312653, + "grad_norm": 0.06396484375, + "learning_rate": 0.002443806903178829, + "loss": 1.2003, + "step": 4120 + }, + { + "epoch": 0.36137780219750104, + "grad_norm": 0.05712890625, + "learning_rate": 0.0024434771078001355, + "loss": 1.1729, + "step": 4121 + }, + { + "epoch": 0.3614654939718756, + "grad_norm": 0.050537109375, + "learning_rate": 0.0024431472400583787, + "loss": 1.1712, + "step": 4122 + }, + { + "epoch": 0.36155318574625017, + "grad_norm": 0.052001953125, + "learning_rate": 0.0024428172999836407, + "loss": 1.1327, + "step": 4123 + }, + { + "epoch": 0.3616408775206247, + "grad_norm": 0.064453125, + "learning_rate": 0.002442487287606012, + "loss": 1.185, + "step": 4124 + }, + { + "epoch": 0.36172856929499925, + "grad_norm": 0.060302734375, + "learning_rate": 0.002442157202955588, + "loss": 1.2227, + "step": 4125 + }, + { + "epoch": 0.3618162610693738, + "grad_norm": 0.056884765625, + "learning_rate": 0.002441827046062473, + "loss": 1.222, + "step": 4126 + }, + { + "epoch": 0.3619039528437483, + "grad_norm": 0.06640625, + "learning_rate": 0.002441496816956775, + "loss": 1.2575, + "step": 4127 + }, + { + "epoch": 0.3619916446181229, + "grad_norm": 0.07177734375, + "learning_rate": 0.00244116651566861, + "loss": 1.1627, + "step": 4128 + }, + { + "epoch": 0.3620793363924974, + "grad_norm": 0.076171875, + "learning_rate": 0.0024408361422281017, + "loss": 1.2641, + "step": 4129 + }, + { + "epoch": 0.36216702816687196, + "grad_norm": 0.06640625, + "learning_rate": 0.0024405056966653785, + "loss": 1.1924, + "step": 4130 + }, + { + "epoch": 0.36225471994124653, + "grad_norm": 0.07568359375, + "learning_rate": 0.002440175179010577, + "loss": 1.229, + "step": 4131 + }, + { + "epoch": 0.36234241171562104, + "grad_norm": 0.07763671875, + "learning_rate": 0.002439844589293838, + "loss": 1.1924, + "step": 4132 + }, + { + "epoch": 0.3624301034899956, + "grad_norm": 0.07470703125, + "learning_rate": 0.0024395139275453126, + "loss": 1.1221, + "step": 4133 + }, + { + "epoch": 0.36251779526437017, + "grad_norm": 0.057373046875, + "learning_rate": 0.0024391831937951546, + "loss": 1.2069, + "step": 4134 + }, + { + "epoch": 0.3626054870387447, + "grad_norm": 0.0556640625, + "learning_rate": 0.002438852388073526, + "loss": 1.1549, + "step": 4135 + }, + { + "epoch": 0.36269317881311924, + "grad_norm": 0.05712890625, + "learning_rate": 0.002438521510410597, + "loss": 1.2077, + "step": 4136 + }, + { + "epoch": 0.3627808705874938, + "grad_norm": 0.0810546875, + "learning_rate": 0.002438190560836541, + "loss": 1.2363, + "step": 4137 + }, + { + "epoch": 0.3628685623618683, + "grad_norm": 0.08740234375, + "learning_rate": 0.0024378595393815413, + "loss": 1.1664, + "step": 4138 + }, + { + "epoch": 0.3629562541362429, + "grad_norm": 0.0732421875, + "learning_rate": 0.002437528446075785, + "loss": 1.2101, + "step": 4139 + }, + { + "epoch": 0.3630439459106174, + "grad_norm": 0.07958984375, + "learning_rate": 0.0024371972809494676, + "loss": 1.1892, + "step": 4140 + }, + { + "epoch": 0.36313163768499196, + "grad_norm": 0.0869140625, + "learning_rate": 0.00243686604403279, + "loss": 1.2095, + "step": 4141 + }, + { + "epoch": 0.3632193294593665, + "grad_norm": 0.08544921875, + "learning_rate": 0.002436534735355961, + "loss": 1.2164, + "step": 4142 + }, + { + "epoch": 0.36330702123374103, + "grad_norm": 0.08203125, + "learning_rate": 0.0024362033549491942, + "loss": 1.2092, + "step": 4143 + }, + { + "epoch": 0.3633947130081156, + "grad_norm": 0.05029296875, + "learning_rate": 0.0024358719028427112, + "loss": 1.1836, + "step": 4144 + }, + { + "epoch": 0.36348240478249016, + "grad_norm": 0.08837890625, + "learning_rate": 0.0024355403790667393, + "loss": 1.2167, + "step": 4145 + }, + { + "epoch": 0.3635700965568647, + "grad_norm": 0.05322265625, + "learning_rate": 0.0024352087836515125, + "loss": 1.1796, + "step": 4146 + }, + { + "epoch": 0.36365778833123924, + "grad_norm": 0.087890625, + "learning_rate": 0.0024348771166272714, + "loss": 1.1778, + "step": 4147 + }, + { + "epoch": 0.3637454801056138, + "grad_norm": 0.0576171875, + "learning_rate": 0.0024345453780242644, + "loss": 1.2545, + "step": 4148 + }, + { + "epoch": 0.3638331718799883, + "grad_norm": 0.11376953125, + "learning_rate": 0.0024342135678727434, + "loss": 1.1539, + "step": 4149 + }, + { + "epoch": 0.3639208636543629, + "grad_norm": 0.0546875, + "learning_rate": 0.0024338816862029696, + "loss": 1.2414, + "step": 4150 + }, + { + "epoch": 0.3640085554287374, + "grad_norm": 0.0693359375, + "learning_rate": 0.0024335497330452097, + "loss": 1.2082, + "step": 4151 + }, + { + "epoch": 0.36409624720311196, + "grad_norm": 0.0615234375, + "learning_rate": 0.0024332177084297364, + "loss": 1.1856, + "step": 4152 + }, + { + "epoch": 0.3641839389774865, + "grad_norm": 0.12353515625, + "learning_rate": 0.0024328856123868306, + "loss": 1.1912, + "step": 4153 + }, + { + "epoch": 0.36427163075186103, + "grad_norm": 0.0546875, + "learning_rate": 0.002432553444946778, + "loss": 1.2292, + "step": 4154 + }, + { + "epoch": 0.3643593225262356, + "grad_norm": 0.10009765625, + "learning_rate": 0.002432221206139871, + "loss": 1.225, + "step": 4155 + }, + { + "epoch": 0.36444701430061016, + "grad_norm": 0.058837890625, + "learning_rate": 0.0024318888959964096, + "loss": 1.1632, + "step": 4156 + }, + { + "epoch": 0.36453470607498467, + "grad_norm": 0.09716796875, + "learning_rate": 0.0024315565145466987, + "loss": 1.2808, + "step": 4157 + }, + { + "epoch": 0.36462239784935924, + "grad_norm": 0.064453125, + "learning_rate": 0.002431224061821052, + "loss": 1.2205, + "step": 4158 + }, + { + "epoch": 0.3647100896237338, + "grad_norm": 0.07177734375, + "learning_rate": 0.0024308915378497874, + "loss": 1.2312, + "step": 4159 + }, + { + "epoch": 0.3647977813981083, + "grad_norm": 0.072265625, + "learning_rate": 0.0024305589426632302, + "loss": 1.1715, + "step": 4160 + }, + { + "epoch": 0.3648854731724829, + "grad_norm": 0.06884765625, + "learning_rate": 0.0024302262762917123, + "loss": 1.2137, + "step": 4161 + }, + { + "epoch": 0.3649731649468574, + "grad_norm": 0.1279296875, + "learning_rate": 0.002429893538765572, + "loss": 1.2057, + "step": 4162 + }, + { + "epoch": 0.36506085672123195, + "grad_norm": 0.05419921875, + "learning_rate": 0.002429560730115155, + "loss": 1.2417, + "step": 4163 + }, + { + "epoch": 0.3651485484956065, + "grad_norm": 0.05712890625, + "learning_rate": 0.002429227850370811, + "loss": 1.1862, + "step": 4164 + }, + { + "epoch": 0.365236240269981, + "grad_norm": 0.058837890625, + "learning_rate": 0.0024288948995628994, + "loss": 1.2219, + "step": 4165 + }, + { + "epoch": 0.3653239320443556, + "grad_norm": 0.06591796875, + "learning_rate": 0.0024285618777217834, + "loss": 1.2605, + "step": 4166 + }, + { + "epoch": 0.36541162381873016, + "grad_norm": 0.052734375, + "learning_rate": 0.002428228784877834, + "loss": 1.1386, + "step": 4167 + }, + { + "epoch": 0.36549931559310467, + "grad_norm": 0.06494140625, + "learning_rate": 0.002427895621061428, + "loss": 1.1934, + "step": 4168 + }, + { + "epoch": 0.36558700736747923, + "grad_norm": 0.07763671875, + "learning_rate": 0.00242756238630295, + "loss": 1.1323, + "step": 4169 + }, + { + "epoch": 0.3656746991418538, + "grad_norm": 0.059814453125, + "learning_rate": 0.002427229080632789, + "loss": 1.2227, + "step": 4170 + }, + { + "epoch": 0.3657623909162283, + "grad_norm": 0.0986328125, + "learning_rate": 0.002426895704081343, + "loss": 1.1657, + "step": 4171 + }, + { + "epoch": 0.3658500826906029, + "grad_norm": 0.054443359375, + "learning_rate": 0.002426562256679014, + "loss": 1.2945, + "step": 4172 + }, + { + "epoch": 0.36593777446497744, + "grad_norm": 0.1259765625, + "learning_rate": 0.002426228738456212, + "loss": 1.189, + "step": 4173 + }, + { + "epoch": 0.36602546623935195, + "grad_norm": 0.06689453125, + "learning_rate": 0.0024258951494433526, + "loss": 1.1634, + "step": 4174 + }, + { + "epoch": 0.3661131580137265, + "grad_norm": 0.08349609375, + "learning_rate": 0.0024255614896708587, + "loss": 1.2358, + "step": 4175 + }, + { + "epoch": 0.366200849788101, + "grad_norm": 0.060546875, + "learning_rate": 0.0024252277591691592, + "loss": 1.2129, + "step": 4176 + }, + { + "epoch": 0.3662885415624756, + "grad_norm": 0.07080078125, + "learning_rate": 0.002424893957968689, + "loss": 1.1566, + "step": 4177 + }, + { + "epoch": 0.36637623333685015, + "grad_norm": 0.09423828125, + "learning_rate": 0.00242456008609989, + "loss": 1.2257, + "step": 4178 + }, + { + "epoch": 0.36646392511122466, + "grad_norm": 0.052490234375, + "learning_rate": 0.0024242261435932114, + "loss": 1.1792, + "step": 4179 + }, + { + "epoch": 0.36655161688559923, + "grad_norm": 0.0625, + "learning_rate": 0.0024238921304791067, + "loss": 1.1862, + "step": 4180 + }, + { + "epoch": 0.3666393086599738, + "grad_norm": 0.08447265625, + "learning_rate": 0.0024235580467880376, + "loss": 1.1976, + "step": 4181 + }, + { + "epoch": 0.3667270004343483, + "grad_norm": 0.06298828125, + "learning_rate": 0.002423223892550472, + "loss": 1.1824, + "step": 4182 + }, + { + "epoch": 0.36681469220872287, + "grad_norm": 0.0869140625, + "learning_rate": 0.0024228896677968826, + "loss": 1.2103, + "step": 4183 + }, + { + "epoch": 0.36690238398309744, + "grad_norm": 0.06982421875, + "learning_rate": 0.002422555372557751, + "loss": 1.2096, + "step": 4184 + }, + { + "epoch": 0.36699007575747195, + "grad_norm": 0.052490234375, + "learning_rate": 0.0024222210068635635, + "loss": 1.1659, + "step": 4185 + }, + { + "epoch": 0.3670777675318465, + "grad_norm": 0.06396484375, + "learning_rate": 0.0024218865707448145, + "loss": 1.2103, + "step": 4186 + }, + { + "epoch": 0.367165459306221, + "grad_norm": 0.0546875, + "learning_rate": 0.0024215520642320025, + "loss": 1.1318, + "step": 4187 + }, + { + "epoch": 0.3672531510805956, + "grad_norm": 0.1220703125, + "learning_rate": 0.0024212174873556335, + "loss": 1.1934, + "step": 4188 + }, + { + "epoch": 0.36734084285497015, + "grad_norm": 0.05517578125, + "learning_rate": 0.002420882840146221, + "loss": 1.2553, + "step": 4189 + }, + { + "epoch": 0.36742853462934466, + "grad_norm": 0.1240234375, + "learning_rate": 0.0024205481226342836, + "loss": 1.1886, + "step": 4190 + }, + { + "epoch": 0.3675162264037192, + "grad_norm": 0.078125, + "learning_rate": 0.0024202133348503463, + "loss": 1.1369, + "step": 4191 + }, + { + "epoch": 0.3676039181780938, + "grad_norm": 0.07421875, + "learning_rate": 0.002419878476824942, + "loss": 1.1955, + "step": 4192 + }, + { + "epoch": 0.3676916099524683, + "grad_norm": 0.0859375, + "learning_rate": 0.0024195435485886065, + "loss": 1.2252, + "step": 4193 + }, + { + "epoch": 0.36777930172684287, + "grad_norm": 0.07080078125, + "learning_rate": 0.0024192085501718867, + "loss": 1.2567, + "step": 4194 + }, + { + "epoch": 0.36786699350121743, + "grad_norm": 0.06396484375, + "learning_rate": 0.002418873481605333, + "loss": 1.2069, + "step": 4195 + }, + { + "epoch": 0.36795468527559194, + "grad_norm": 0.06494140625, + "learning_rate": 0.0024185383429195027, + "loss": 1.1973, + "step": 4196 + }, + { + "epoch": 0.3680423770499665, + "grad_norm": 0.064453125, + "learning_rate": 0.0024182031341449593, + "loss": 1.2204, + "step": 4197 + }, + { + "epoch": 0.368130068824341, + "grad_norm": 0.060546875, + "learning_rate": 0.0024178678553122735, + "loss": 1.1874, + "step": 4198 + }, + { + "epoch": 0.3682177605987156, + "grad_norm": 0.05712890625, + "learning_rate": 0.002417532506452021, + "loss": 1.2007, + "step": 4199 + }, + { + "epoch": 0.36830545237309015, + "grad_norm": 0.0771484375, + "learning_rate": 0.0024171970875947854, + "loss": 1.2274, + "step": 4200 + }, + { + "epoch": 0.36839314414746466, + "grad_norm": 0.05712890625, + "learning_rate": 0.0024168615987711557, + "loss": 1.186, + "step": 4201 + }, + { + "epoch": 0.3684808359218392, + "grad_norm": 0.0712890625, + "learning_rate": 0.0024165260400117283, + "loss": 1.2233, + "step": 4202 + }, + { + "epoch": 0.3685685276962138, + "grad_norm": 0.068359375, + "learning_rate": 0.0024161904113471043, + "loss": 1.2034, + "step": 4203 + }, + { + "epoch": 0.3686562194705883, + "grad_norm": 0.06884765625, + "learning_rate": 0.0024158547128078933, + "loss": 1.1557, + "step": 4204 + }, + { + "epoch": 0.36874391124496286, + "grad_norm": 0.08984375, + "learning_rate": 0.002415518944424709, + "loss": 1.2064, + "step": 4205 + }, + { + "epoch": 0.36883160301933743, + "grad_norm": 0.11376953125, + "learning_rate": 0.002415183106228174, + "loss": 1.1588, + "step": 4206 + }, + { + "epoch": 0.36891929479371194, + "grad_norm": 0.0703125, + "learning_rate": 0.0024148471982489146, + "loss": 1.2246, + "step": 4207 + }, + { + "epoch": 0.3690069865680865, + "grad_norm": 0.11181640625, + "learning_rate": 0.0024145112205175645, + "loss": 1.2152, + "step": 4208 + }, + { + "epoch": 0.36909467834246107, + "grad_norm": 0.0927734375, + "learning_rate": 0.0024141751730647654, + "loss": 1.187, + "step": 4209 + }, + { + "epoch": 0.3691823701168356, + "grad_norm": 0.09228515625, + "learning_rate": 0.0024138390559211628, + "loss": 1.2557, + "step": 4210 + }, + { + "epoch": 0.36927006189121014, + "grad_norm": 0.11865234375, + "learning_rate": 0.0024135028691174105, + "loss": 1.2463, + "step": 4211 + }, + { + "epoch": 0.36935775366558465, + "grad_norm": 0.06591796875, + "learning_rate": 0.0024131666126841673, + "loss": 1.1331, + "step": 4212 + }, + { + "epoch": 0.3694454454399592, + "grad_norm": 0.07568359375, + "learning_rate": 0.002412830286652099, + "loss": 1.1649, + "step": 4213 + }, + { + "epoch": 0.3695331372143338, + "grad_norm": 0.0654296875, + "learning_rate": 0.0024124938910518785, + "loss": 1.1731, + "step": 4214 + }, + { + "epoch": 0.3696208289887083, + "grad_norm": 0.09619140625, + "learning_rate": 0.002412157425914183, + "loss": 1.2278, + "step": 4215 + }, + { + "epoch": 0.36970852076308286, + "grad_norm": 0.056640625, + "learning_rate": 0.0024118208912696977, + "loss": 1.1799, + "step": 4216 + }, + { + "epoch": 0.3697962125374574, + "grad_norm": 0.055419921875, + "learning_rate": 0.0024114842871491137, + "loss": 1.19, + "step": 4217 + }, + { + "epoch": 0.36988390431183193, + "grad_norm": 0.07080078125, + "learning_rate": 0.0024111476135831285, + "loss": 1.1761, + "step": 4218 + }, + { + "epoch": 0.3699715960862065, + "grad_norm": 0.06298828125, + "learning_rate": 0.002410810870602446, + "loss": 1.2772, + "step": 4219 + }, + { + "epoch": 0.37005928786058107, + "grad_norm": 0.052001953125, + "learning_rate": 0.002410474058237776, + "loss": 1.1806, + "step": 4220 + }, + { + "epoch": 0.3701469796349556, + "grad_norm": 0.05615234375, + "learning_rate": 0.002410137176519835, + "loss": 1.1487, + "step": 4221 + }, + { + "epoch": 0.37023467140933014, + "grad_norm": 0.056396484375, + "learning_rate": 0.002409800225479346, + "loss": 1.242, + "step": 4222 + }, + { + "epoch": 0.37032236318370465, + "grad_norm": 0.11181640625, + "learning_rate": 0.0024094632051470375, + "loss": 1.2088, + "step": 4223 + }, + { + "epoch": 0.3704100549580792, + "grad_norm": 0.06103515625, + "learning_rate": 0.002409126115553645, + "loss": 1.1997, + "step": 4224 + }, + { + "epoch": 0.3704977467324538, + "grad_norm": 0.11669921875, + "learning_rate": 0.0024087889567299113, + "loss": 1.1768, + "step": 4225 + }, + { + "epoch": 0.3705854385068283, + "grad_norm": 0.056640625, + "learning_rate": 0.002408451728706583, + "loss": 1.2187, + "step": 4226 + }, + { + "epoch": 0.37067313028120286, + "grad_norm": 0.05712890625, + "learning_rate": 0.002408114431514415, + "loss": 1.1996, + "step": 4227 + }, + { + "epoch": 0.3707608220555774, + "grad_norm": 0.062255859375, + "learning_rate": 0.0024077770651841682, + "loss": 1.2108, + "step": 4228 + }, + { + "epoch": 0.37084851382995193, + "grad_norm": 0.06689453125, + "learning_rate": 0.002407439629746609, + "loss": 1.1962, + "step": 4229 + }, + { + "epoch": 0.3709362056043265, + "grad_norm": 0.09765625, + "learning_rate": 0.0024071021252325105, + "loss": 1.2024, + "step": 4230 + }, + { + "epoch": 0.37102389737870106, + "grad_norm": 0.06298828125, + "learning_rate": 0.0024067645516726533, + "loss": 1.1889, + "step": 4231 + }, + { + "epoch": 0.37111158915307557, + "grad_norm": 0.10791015625, + "learning_rate": 0.0024064269090978228, + "loss": 1.2876, + "step": 4232 + }, + { + "epoch": 0.37119928092745014, + "grad_norm": 0.07568359375, + "learning_rate": 0.00240608919753881, + "loss": 1.2961, + "step": 4233 + }, + { + "epoch": 0.37128697270182465, + "grad_norm": 0.05517578125, + "learning_rate": 0.002405751417026415, + "loss": 1.2306, + "step": 4234 + }, + { + "epoch": 0.3713746644761992, + "grad_norm": 0.05712890625, + "learning_rate": 0.002405413567591441, + "loss": 1.2144, + "step": 4235 + }, + { + "epoch": 0.3714623562505738, + "grad_norm": 0.064453125, + "learning_rate": 0.0024050756492647, + "loss": 1.2143, + "step": 4236 + }, + { + "epoch": 0.3715500480249483, + "grad_norm": 0.05859375, + "learning_rate": 0.002404737662077009, + "loss": 1.2175, + "step": 4237 + }, + { + "epoch": 0.37163773979932285, + "grad_norm": 0.0703125, + "learning_rate": 0.002404399606059192, + "loss": 1.1913, + "step": 4238 + }, + { + "epoch": 0.3717254315736974, + "grad_norm": 0.053466796875, + "learning_rate": 0.002404061481242078, + "loss": 1.2011, + "step": 4239 + }, + { + "epoch": 0.37181312334807193, + "grad_norm": 0.05810546875, + "learning_rate": 0.0024037232876565036, + "loss": 1.1957, + "step": 4240 + }, + { + "epoch": 0.3719008151224465, + "grad_norm": 0.0673828125, + "learning_rate": 0.002403385025333311, + "loss": 1.1492, + "step": 4241 + }, + { + "epoch": 0.37198850689682106, + "grad_norm": 0.056640625, + "learning_rate": 0.0024030466943033495, + "loss": 1.2376, + "step": 4242 + }, + { + "epoch": 0.37207619867119557, + "grad_norm": 0.052734375, + "learning_rate": 0.002402708294597472, + "loss": 1.1493, + "step": 4243 + }, + { + "epoch": 0.37216389044557013, + "grad_norm": 0.1279296875, + "learning_rate": 0.0024023698262465422, + "loss": 1.16, + "step": 4244 + }, + { + "epoch": 0.3722515822199447, + "grad_norm": 0.057861328125, + "learning_rate": 0.0024020312892814267, + "loss": 1.1355, + "step": 4245 + }, + { + "epoch": 0.3723392739943192, + "grad_norm": 0.09521484375, + "learning_rate": 0.0024016926837329986, + "loss": 1.1972, + "step": 4246 + }, + { + "epoch": 0.3724269657686938, + "grad_norm": 0.07763671875, + "learning_rate": 0.0024013540096321385, + "loss": 1.1791, + "step": 4247 + }, + { + "epoch": 0.3725146575430683, + "grad_norm": 0.057373046875, + "learning_rate": 0.0024010152670097323, + "loss": 1.2102, + "step": 4248 + }, + { + "epoch": 0.37260234931744285, + "grad_norm": 0.09912109375, + "learning_rate": 0.002400676455896672, + "loss": 1.2079, + "step": 4249 + }, + { + "epoch": 0.3726900410918174, + "grad_norm": 0.080078125, + "learning_rate": 0.0024003375763238573, + "loss": 1.2348, + "step": 4250 + }, + { + "epoch": 0.3727777328661919, + "grad_norm": 0.10791015625, + "learning_rate": 0.0023999986283221924, + "loss": 1.164, + "step": 4251 + }, + { + "epoch": 0.3728654246405665, + "grad_norm": 0.1044921875, + "learning_rate": 0.002399659611922589, + "loss": 1.2384, + "step": 4252 + }, + { + "epoch": 0.37295311641494105, + "grad_norm": 0.10986328125, + "learning_rate": 0.0023993205271559638, + "loss": 1.1913, + "step": 4253 + }, + { + "epoch": 0.37304080818931556, + "grad_norm": 0.06640625, + "learning_rate": 0.002398981374053241, + "loss": 1.1624, + "step": 4254 + }, + { + "epoch": 0.37312849996369013, + "grad_norm": 0.154296875, + "learning_rate": 0.0023986421526453502, + "loss": 1.2163, + "step": 4255 + }, + { + "epoch": 0.3732161917380647, + "grad_norm": 0.08154296875, + "learning_rate": 0.002398302862963228, + "loss": 1.1505, + "step": 4256 + }, + { + "epoch": 0.3733038835124392, + "grad_norm": 0.0712890625, + "learning_rate": 0.002397963505037816, + "loss": 1.174, + "step": 4257 + }, + { + "epoch": 0.37339157528681377, + "grad_norm": 0.1123046875, + "learning_rate": 0.002397624078900064, + "loss": 1.2025, + "step": 4258 + }, + { + "epoch": 0.3734792670611883, + "grad_norm": 0.057373046875, + "learning_rate": 0.0023972845845809257, + "loss": 1.2265, + "step": 4259 + }, + { + "epoch": 0.37356695883556285, + "grad_norm": 0.0654296875, + "learning_rate": 0.002396945022111362, + "loss": 1.2228, + "step": 4260 + }, + { + "epoch": 0.3736546506099374, + "grad_norm": 0.057861328125, + "learning_rate": 0.0023966053915223406, + "loss": 1.1801, + "step": 4261 + }, + { + "epoch": 0.3737423423843119, + "grad_norm": 0.055419921875, + "learning_rate": 0.002396265692844835, + "loss": 1.135, + "step": 4262 + }, + { + "epoch": 0.3738300341586865, + "grad_norm": 0.07177734375, + "learning_rate": 0.002395925926109825, + "loss": 1.277, + "step": 4263 + }, + { + "epoch": 0.37391772593306105, + "grad_norm": 0.08837890625, + "learning_rate": 0.0023955860913482956, + "loss": 1.2154, + "step": 4264 + }, + { + "epoch": 0.37400541770743556, + "grad_norm": 0.0576171875, + "learning_rate": 0.0023952461885912396, + "loss": 1.2157, + "step": 4265 + }, + { + "epoch": 0.3740931094818101, + "grad_norm": 0.1279296875, + "learning_rate": 0.002394906217869655, + "loss": 1.2412, + "step": 4266 + }, + { + "epoch": 0.3741808012561847, + "grad_norm": 0.044677734375, + "learning_rate": 0.0023945661792145463, + "loss": 1.1501, + "step": 4267 + }, + { + "epoch": 0.3742684930305592, + "grad_norm": 0.103515625, + "learning_rate": 0.0023942260726569245, + "loss": 1.1912, + "step": 4268 + }, + { + "epoch": 0.37435618480493377, + "grad_norm": 0.058837890625, + "learning_rate": 0.002393885898227805, + "loss": 1.1913, + "step": 4269 + }, + { + "epoch": 0.3744438765793083, + "grad_norm": 0.054931640625, + "learning_rate": 0.0023935456559582135, + "loss": 1.171, + "step": 4270 + }, + { + "epoch": 0.37453156835368284, + "grad_norm": 0.07373046875, + "learning_rate": 0.0023932053458791762, + "loss": 1.1646, + "step": 4271 + }, + { + "epoch": 0.3746192601280574, + "grad_norm": 0.054931640625, + "learning_rate": 0.0023928649680217302, + "loss": 1.2058, + "step": 4272 + }, + { + "epoch": 0.3747069519024319, + "grad_norm": 0.06689453125, + "learning_rate": 0.0023925245224169175, + "loss": 1.1787, + "step": 4273 + }, + { + "epoch": 0.3747946436768065, + "grad_norm": 0.064453125, + "learning_rate": 0.0023921840090957848, + "loss": 1.2181, + "step": 4274 + }, + { + "epoch": 0.37488233545118105, + "grad_norm": 0.05615234375, + "learning_rate": 0.0023918434280893863, + "loss": 1.2035, + "step": 4275 + }, + { + "epoch": 0.37497002722555556, + "grad_norm": 0.0654296875, + "learning_rate": 0.0023915027794287826, + "loss": 1.1696, + "step": 4276 + }, + { + "epoch": 0.3750577189999301, + "grad_norm": 0.0537109375, + "learning_rate": 0.002391162063145039, + "loss": 1.1982, + "step": 4277 + }, + { + "epoch": 0.3751454107743047, + "grad_norm": 0.056396484375, + "learning_rate": 0.002390821279269229, + "loss": 1.2026, + "step": 4278 + }, + { + "epoch": 0.3752331025486792, + "grad_norm": 0.05712890625, + "learning_rate": 0.002390480427832431, + "loss": 1.1698, + "step": 4279 + }, + { + "epoch": 0.37532079432305376, + "grad_norm": 0.05224609375, + "learning_rate": 0.0023901395088657285, + "loss": 1.1955, + "step": 4280 + }, + { + "epoch": 0.37540848609742833, + "grad_norm": 0.055908203125, + "learning_rate": 0.002389798522400214, + "loss": 1.1708, + "step": 4281 + }, + { + "epoch": 0.37549617787180284, + "grad_norm": 0.068359375, + "learning_rate": 0.0023894574684669846, + "loss": 1.1912, + "step": 4282 + }, + { + "epoch": 0.3755838696461774, + "grad_norm": 0.07958984375, + "learning_rate": 0.0023891163470971424, + "loss": 1.2321, + "step": 4283 + }, + { + "epoch": 0.3756715614205519, + "grad_norm": 0.06787109375, + "learning_rate": 0.0023887751583217976, + "loss": 1.1552, + "step": 4284 + }, + { + "epoch": 0.3757592531949265, + "grad_norm": 0.059326171875, + "learning_rate": 0.002388433902172065, + "loss": 1.1564, + "step": 4285 + }, + { + "epoch": 0.37584694496930104, + "grad_norm": 0.0791015625, + "learning_rate": 0.0023880925786790674, + "loss": 1.1676, + "step": 4286 + }, + { + "epoch": 0.37593463674367555, + "grad_norm": 0.06982421875, + "learning_rate": 0.0023877511878739317, + "loss": 1.2311, + "step": 4287 + }, + { + "epoch": 0.3760223285180501, + "grad_norm": 0.061767578125, + "learning_rate": 0.0023874097297877923, + "loss": 1.2259, + "step": 4288 + }, + { + "epoch": 0.3761100202924247, + "grad_norm": 0.08984375, + "learning_rate": 0.002387068204451789, + "loss": 1.1995, + "step": 4289 + }, + { + "epoch": 0.3761977120667992, + "grad_norm": 0.05517578125, + "learning_rate": 0.002386726611897068, + "loss": 1.1985, + "step": 4290 + }, + { + "epoch": 0.37628540384117376, + "grad_norm": 0.057861328125, + "learning_rate": 0.0023863849521547826, + "loss": 1.1792, + "step": 4291 + }, + { + "epoch": 0.3763730956155483, + "grad_norm": 0.058837890625, + "learning_rate": 0.0023860432252560903, + "loss": 1.1654, + "step": 4292 + }, + { + "epoch": 0.37646078738992284, + "grad_norm": 0.06689453125, + "learning_rate": 0.002385701431232155, + "loss": 1.2859, + "step": 4293 + }, + { + "epoch": 0.3765484791642974, + "grad_norm": 0.052490234375, + "learning_rate": 0.0023853595701141496, + "loss": 1.1748, + "step": 4294 + }, + { + "epoch": 0.3766361709386719, + "grad_norm": 0.06201171875, + "learning_rate": 0.0023850176419332493, + "loss": 1.1836, + "step": 4295 + }, + { + "epoch": 0.3767238627130465, + "grad_norm": 0.064453125, + "learning_rate": 0.0023846756467206373, + "loss": 1.2103, + "step": 4296 + }, + { + "epoch": 0.37681155448742104, + "grad_norm": 0.072265625, + "learning_rate": 0.0023843335845075025, + "loss": 1.2232, + "step": 4297 + }, + { + "epoch": 0.37689924626179555, + "grad_norm": 0.10791015625, + "learning_rate": 0.002383991455325041, + "loss": 1.1305, + "step": 4298 + }, + { + "epoch": 0.3769869380361701, + "grad_norm": 0.052734375, + "learning_rate": 0.0023836492592044533, + "loss": 1.2417, + "step": 4299 + }, + { + "epoch": 0.3770746298105447, + "grad_norm": 0.0810546875, + "learning_rate": 0.0023833069961769478, + "loss": 1.2355, + "step": 4300 + }, + { + "epoch": 0.3771623215849192, + "grad_norm": 0.05322265625, + "learning_rate": 0.0023829646662737367, + "loss": 1.1322, + "step": 4301 + }, + { + "epoch": 0.37725001335929376, + "grad_norm": 0.052978515625, + "learning_rate": 0.0023826222695260395, + "loss": 1.2267, + "step": 4302 + }, + { + "epoch": 0.3773377051336683, + "grad_norm": 0.054931640625, + "learning_rate": 0.0023822798059650835, + "loss": 1.2061, + "step": 4303 + }, + { + "epoch": 0.37742539690804283, + "grad_norm": 0.05859375, + "learning_rate": 0.0023819372756220996, + "loss": 1.1751, + "step": 4304 + }, + { + "epoch": 0.3775130886824174, + "grad_norm": 0.05810546875, + "learning_rate": 0.002381594678528325, + "loss": 1.1727, + "step": 4305 + }, + { + "epoch": 0.3776007804567919, + "grad_norm": 0.06689453125, + "learning_rate": 0.0023812520147150048, + "loss": 1.1614, + "step": 4306 + }, + { + "epoch": 0.37768847223116647, + "grad_norm": 0.0537109375, + "learning_rate": 0.0023809092842133886, + "loss": 1.1658, + "step": 4307 + }, + { + "epoch": 0.37777616400554104, + "grad_norm": 0.08251953125, + "learning_rate": 0.0023805664870547322, + "loss": 1.208, + "step": 4308 + }, + { + "epoch": 0.37786385577991555, + "grad_norm": 0.076171875, + "learning_rate": 0.0023802236232702985, + "loss": 1.2419, + "step": 4309 + }, + { + "epoch": 0.3779515475542901, + "grad_norm": 0.08203125, + "learning_rate": 0.0023798806928913554, + "loss": 1.1895, + "step": 4310 + }, + { + "epoch": 0.3780392393286647, + "grad_norm": 0.1015625, + "learning_rate": 0.002379537695949177, + "loss": 1.2066, + "step": 4311 + }, + { + "epoch": 0.3781269311030392, + "grad_norm": 0.0830078125, + "learning_rate": 0.002379194632475044, + "loss": 1.2361, + "step": 4312 + }, + { + "epoch": 0.37821462287741375, + "grad_norm": 0.05810546875, + "learning_rate": 0.002378851502500243, + "loss": 1.2504, + "step": 4313 + }, + { + "epoch": 0.3783023146517883, + "grad_norm": 0.057373046875, + "learning_rate": 0.0023785083060560664, + "loss": 1.1892, + "step": 4314 + }, + { + "epoch": 0.37839000642616283, + "grad_norm": 0.078125, + "learning_rate": 0.002378165043173813, + "loss": 1.175, + "step": 4315 + }, + { + "epoch": 0.3784776982005374, + "grad_norm": 0.06103515625, + "learning_rate": 0.0023778217138847877, + "loss": 1.1791, + "step": 4316 + }, + { + "epoch": 0.37856538997491196, + "grad_norm": 0.0634765625, + "learning_rate": 0.002377478318220301, + "loss": 1.1963, + "step": 4317 + }, + { + "epoch": 0.37865308174928647, + "grad_norm": 0.09912109375, + "learning_rate": 0.0023771348562116696, + "loss": 1.2317, + "step": 4318 + }, + { + "epoch": 0.37874077352366103, + "grad_norm": 0.0556640625, + "learning_rate": 0.0023767913278902163, + "loss": 1.1678, + "step": 4319 + }, + { + "epoch": 0.37882846529803554, + "grad_norm": 0.126953125, + "learning_rate": 0.0023764477332872697, + "loss": 1.1759, + "step": 4320 + }, + { + "epoch": 0.3789161570724101, + "grad_norm": 0.07421875, + "learning_rate": 0.002376104072434165, + "loss": 1.188, + "step": 4321 + }, + { + "epoch": 0.3790038488467847, + "grad_norm": 0.061279296875, + "learning_rate": 0.002375760345362244, + "loss": 1.1986, + "step": 4322 + }, + { + "epoch": 0.3790915406211592, + "grad_norm": 0.158203125, + "learning_rate": 0.0023754165521028526, + "loss": 1.2153, + "step": 4323 + }, + { + "epoch": 0.37917923239553375, + "grad_norm": 0.052734375, + "learning_rate": 0.0023750726926873443, + "loss": 1.1526, + "step": 4324 + }, + { + "epoch": 0.3792669241699083, + "grad_norm": 0.14453125, + "learning_rate": 0.002374728767147079, + "loss": 1.1888, + "step": 4325 + }, + { + "epoch": 0.3793546159442828, + "grad_norm": 0.1181640625, + "learning_rate": 0.00237438477551342, + "loss": 1.235, + "step": 4326 + }, + { + "epoch": 0.3794423077186574, + "grad_norm": 0.1181640625, + "learning_rate": 0.0023740407178177406, + "loss": 1.167, + "step": 4327 + }, + { + "epoch": 0.37952999949303196, + "grad_norm": 0.1279296875, + "learning_rate": 0.0023736965940914156, + "loss": 1.222, + "step": 4328 + }, + { + "epoch": 0.37961769126740647, + "grad_norm": 0.10107421875, + "learning_rate": 0.00237335240436583, + "loss": 1.2199, + "step": 4329 + }, + { + "epoch": 0.37970538304178103, + "grad_norm": 0.10498046875, + "learning_rate": 0.0023730081486723724, + "loss": 1.2215, + "step": 4330 + }, + { + "epoch": 0.37979307481615554, + "grad_norm": 0.06298828125, + "learning_rate": 0.002372663827042438, + "loss": 1.2528, + "step": 4331 + }, + { + "epoch": 0.3798807665905301, + "grad_norm": 0.130859375, + "learning_rate": 0.002372319439507428, + "loss": 1.2067, + "step": 4332 + }, + { + "epoch": 0.37996845836490467, + "grad_norm": 0.061279296875, + "learning_rate": 0.00237197498609875, + "loss": 1.1944, + "step": 4333 + }, + { + "epoch": 0.3800561501392792, + "grad_norm": 0.0869140625, + "learning_rate": 0.0023716304668478174, + "loss": 1.1633, + "step": 4334 + }, + { + "epoch": 0.38014384191365375, + "grad_norm": 0.12158203125, + "learning_rate": 0.0023712858817860486, + "loss": 1.2245, + "step": 4335 + }, + { + "epoch": 0.3802315336880283, + "grad_norm": 0.05859375, + "learning_rate": 0.0023709412309448692, + "loss": 1.2107, + "step": 4336 + }, + { + "epoch": 0.3803192254624028, + "grad_norm": 0.10400390625, + "learning_rate": 0.0023705965143557112, + "loss": 1.1796, + "step": 4337 + }, + { + "epoch": 0.3804069172367774, + "grad_norm": 0.1220703125, + "learning_rate": 0.002370251732050011, + "loss": 1.2441, + "step": 4338 + }, + { + "epoch": 0.38049460901115195, + "grad_norm": 0.08984375, + "learning_rate": 0.0023699068840592123, + "loss": 1.1879, + "step": 4339 + }, + { + "epoch": 0.38058230078552646, + "grad_norm": 0.162109375, + "learning_rate": 0.002369561970414764, + "loss": 1.1904, + "step": 4340 + }, + { + "epoch": 0.380669992559901, + "grad_norm": 0.06396484375, + "learning_rate": 0.0023692169911481214, + "loss": 1.2185, + "step": 4341 + }, + { + "epoch": 0.38075768433427554, + "grad_norm": 0.134765625, + "learning_rate": 0.002368871946290746, + "loss": 1.1401, + "step": 4342 + }, + { + "epoch": 0.3808453761086501, + "grad_norm": 0.1376953125, + "learning_rate": 0.002368526835874105, + "loss": 1.2125, + "step": 4343 + }, + { + "epoch": 0.38093306788302467, + "grad_norm": 0.052734375, + "learning_rate": 0.0023681816599296717, + "loss": 1.2209, + "step": 4344 + }, + { + "epoch": 0.3810207596573992, + "grad_norm": 0.1533203125, + "learning_rate": 0.0023678364184889244, + "loss": 1.2007, + "step": 4345 + }, + { + "epoch": 0.38110845143177374, + "grad_norm": 0.07763671875, + "learning_rate": 0.0023674911115833487, + "loss": 1.1789, + "step": 4346 + }, + { + "epoch": 0.3811961432061483, + "grad_norm": 0.11083984375, + "learning_rate": 0.0023671457392444363, + "loss": 1.163, + "step": 4347 + }, + { + "epoch": 0.3812838349805228, + "grad_norm": 0.07373046875, + "learning_rate": 0.002366800301503684, + "loss": 1.1625, + "step": 4348 + }, + { + "epoch": 0.3813715267548974, + "grad_norm": 0.07373046875, + "learning_rate": 0.0023664547983925943, + "loss": 1.1849, + "step": 4349 + }, + { + "epoch": 0.38145921852927195, + "grad_norm": 0.08935546875, + "learning_rate": 0.0023661092299426764, + "loss": 1.2015, + "step": 4350 + }, + { + "epoch": 0.38154691030364646, + "grad_norm": 0.0625, + "learning_rate": 0.0023657635961854455, + "loss": 1.1743, + "step": 4351 + }, + { + "epoch": 0.381634602078021, + "grad_norm": 0.10205078125, + "learning_rate": 0.0023654178971524225, + "loss": 1.1563, + "step": 4352 + }, + { + "epoch": 0.3817222938523956, + "grad_norm": 0.05859375, + "learning_rate": 0.002365072132875134, + "loss": 1.1945, + "step": 4353 + }, + { + "epoch": 0.3818099856267701, + "grad_norm": 0.07861328125, + "learning_rate": 0.0023647263033851128, + "loss": 1.1769, + "step": 4354 + }, + { + "epoch": 0.38189767740114466, + "grad_norm": 0.064453125, + "learning_rate": 0.0023643804087138975, + "loss": 1.2177, + "step": 4355 + }, + { + "epoch": 0.3819853691755192, + "grad_norm": 0.058349609375, + "learning_rate": 0.0023640344488930335, + "loss": 1.1471, + "step": 4356 + }, + { + "epoch": 0.38207306094989374, + "grad_norm": 0.057373046875, + "learning_rate": 0.002363688423954071, + "loss": 1.1954, + "step": 4357 + }, + { + "epoch": 0.3821607527242683, + "grad_norm": 0.06494140625, + "learning_rate": 0.0023633423339285667, + "loss": 1.2234, + "step": 4358 + }, + { + "epoch": 0.3822484444986428, + "grad_norm": 0.072265625, + "learning_rate": 0.0023629961788480826, + "loss": 1.2009, + "step": 4359 + }, + { + "epoch": 0.3823361362730174, + "grad_norm": 0.07080078125, + "learning_rate": 0.0023626499587441883, + "loss": 1.1995, + "step": 4360 + }, + { + "epoch": 0.38242382804739194, + "grad_norm": 0.05712890625, + "learning_rate": 0.002362303673648457, + "loss": 1.1698, + "step": 4361 + }, + { + "epoch": 0.38251151982176645, + "grad_norm": 0.08740234375, + "learning_rate": 0.0023619573235924697, + "loss": 1.2508, + "step": 4362 + }, + { + "epoch": 0.382599211596141, + "grad_norm": 0.08203125, + "learning_rate": 0.0023616109086078124, + "loss": 1.142, + "step": 4363 + }, + { + "epoch": 0.3826869033705156, + "grad_norm": 0.07373046875, + "learning_rate": 0.0023612644287260764, + "loss": 1.1909, + "step": 4364 + }, + { + "epoch": 0.3827745951448901, + "grad_norm": 0.0732421875, + "learning_rate": 0.002360917883978861, + "loss": 1.217, + "step": 4365 + }, + { + "epoch": 0.38286228691926466, + "grad_norm": 0.052490234375, + "learning_rate": 0.0023605712743977702, + "loss": 1.1299, + "step": 4366 + }, + { + "epoch": 0.38294997869363917, + "grad_norm": 0.08447265625, + "learning_rate": 0.0023602246000144133, + "loss": 1.2982, + "step": 4367 + }, + { + "epoch": 0.38303767046801374, + "grad_norm": 0.068359375, + "learning_rate": 0.0023598778608604066, + "loss": 1.2395, + "step": 4368 + }, + { + "epoch": 0.3831253622423883, + "grad_norm": 0.1044921875, + "learning_rate": 0.002359531056967372, + "loss": 1.1757, + "step": 4369 + }, + { + "epoch": 0.3832130540167628, + "grad_norm": 0.049072265625, + "learning_rate": 0.0023591841883669356, + "loss": 1.1603, + "step": 4370 + }, + { + "epoch": 0.3833007457911374, + "grad_norm": 0.09228515625, + "learning_rate": 0.0023588372550907323, + "loss": 1.2002, + "step": 4371 + }, + { + "epoch": 0.38338843756551194, + "grad_norm": 0.06005859375, + "learning_rate": 0.0023584902571704013, + "loss": 1.1883, + "step": 4372 + }, + { + "epoch": 0.38347612933988645, + "grad_norm": 0.0732421875, + "learning_rate": 0.0023581431946375873, + "loss": 1.1866, + "step": 4373 + }, + { + "epoch": 0.383563821114261, + "grad_norm": 0.056884765625, + "learning_rate": 0.002357796067523943, + "loss": 1.1918, + "step": 4374 + }, + { + "epoch": 0.3836515128886356, + "grad_norm": 0.056884765625, + "learning_rate": 0.002357448875861124, + "loss": 1.1796, + "step": 4375 + }, + { + "epoch": 0.3837392046630101, + "grad_norm": 0.0576171875, + "learning_rate": 0.0023571016196807936, + "loss": 1.1758, + "step": 4376 + }, + { + "epoch": 0.38382689643738466, + "grad_norm": 0.06005859375, + "learning_rate": 0.0023567542990146208, + "loss": 1.1749, + "step": 4377 + }, + { + "epoch": 0.38391458821175917, + "grad_norm": 0.052734375, + "learning_rate": 0.0023564069138942807, + "loss": 1.1758, + "step": 4378 + }, + { + "epoch": 0.38400227998613373, + "grad_norm": 0.0703125, + "learning_rate": 0.0023560594643514537, + "loss": 1.2357, + "step": 4379 + }, + { + "epoch": 0.3840899717605083, + "grad_norm": 0.068359375, + "learning_rate": 0.002355711950417826, + "loss": 1.1883, + "step": 4380 + }, + { + "epoch": 0.3841776635348828, + "grad_norm": 0.0654296875, + "learning_rate": 0.0023553643721250903, + "loss": 1.1925, + "step": 4381 + }, + { + "epoch": 0.3842653553092574, + "grad_norm": 0.0986328125, + "learning_rate": 0.002355016729504945, + "loss": 1.1407, + "step": 4382 + }, + { + "epoch": 0.38435304708363194, + "grad_norm": 0.06396484375, + "learning_rate": 0.0023546690225890936, + "loss": 1.1061, + "step": 4383 + }, + { + "epoch": 0.38444073885800645, + "grad_norm": 0.08251953125, + "learning_rate": 0.0023543212514092467, + "loss": 1.1525, + "step": 4384 + }, + { + "epoch": 0.384528430632381, + "grad_norm": 0.0615234375, + "learning_rate": 0.0023539734159971197, + "loss": 1.1491, + "step": 4385 + }, + { + "epoch": 0.3846161224067556, + "grad_norm": 0.095703125, + "learning_rate": 0.002353625516384434, + "loss": 1.2202, + "step": 4386 + }, + { + "epoch": 0.3847038141811301, + "grad_norm": 0.05517578125, + "learning_rate": 0.0023532775526029183, + "loss": 1.2202, + "step": 4387 + }, + { + "epoch": 0.38479150595550465, + "grad_norm": 0.1533203125, + "learning_rate": 0.0023529295246843052, + "loss": 1.2204, + "step": 4388 + }, + { + "epoch": 0.38487919772987916, + "grad_norm": 0.058837890625, + "learning_rate": 0.002352581432660334, + "loss": 1.2238, + "step": 4389 + }, + { + "epoch": 0.38496688950425373, + "grad_norm": 0.09326171875, + "learning_rate": 0.0023522332765627493, + "loss": 1.1857, + "step": 4390 + }, + { + "epoch": 0.3850545812786283, + "grad_norm": 0.07275390625, + "learning_rate": 0.0023518850564233033, + "loss": 1.1549, + "step": 4391 + }, + { + "epoch": 0.3851422730530028, + "grad_norm": 0.056640625, + "learning_rate": 0.0023515367722737514, + "loss": 1.1915, + "step": 4392 + }, + { + "epoch": 0.38522996482737737, + "grad_norm": 0.06396484375, + "learning_rate": 0.0023511884241458575, + "loss": 1.2433, + "step": 4393 + }, + { + "epoch": 0.38531765660175193, + "grad_norm": 0.06005859375, + "learning_rate": 0.0023508400120713898, + "loss": 1.2096, + "step": 4394 + }, + { + "epoch": 0.38540534837612644, + "grad_norm": 0.0654296875, + "learning_rate": 0.0023504915360821215, + "loss": 1.2293, + "step": 4395 + }, + { + "epoch": 0.385493040150501, + "grad_norm": 0.076171875, + "learning_rate": 0.0023501429962098333, + "loss": 1.2381, + "step": 4396 + }, + { + "epoch": 0.3855807319248756, + "grad_norm": 0.0869140625, + "learning_rate": 0.0023497943924863118, + "loss": 1.1618, + "step": 4397 + }, + { + "epoch": 0.3856684236992501, + "grad_norm": 0.057861328125, + "learning_rate": 0.002349445724943348, + "loss": 1.1659, + "step": 4398 + }, + { + "epoch": 0.38575611547362465, + "grad_norm": 0.068359375, + "learning_rate": 0.002349096993612739, + "loss": 1.245, + "step": 4399 + }, + { + "epoch": 0.3858438072479992, + "grad_norm": 0.0810546875, + "learning_rate": 0.00234874819852629, + "loss": 1.2502, + "step": 4400 + }, + { + "epoch": 0.3859314990223737, + "grad_norm": 0.06640625, + "learning_rate": 0.0023483993397158086, + "loss": 1.159, + "step": 4401 + }, + { + "epoch": 0.3860191907967483, + "grad_norm": 0.099609375, + "learning_rate": 0.002348050417213111, + "loss": 1.2358, + "step": 4402 + }, + { + "epoch": 0.3861068825711228, + "grad_norm": 0.078125, + "learning_rate": 0.002347701431050016, + "loss": 1.1797, + "step": 4403 + }, + { + "epoch": 0.38619457434549737, + "grad_norm": 0.06982421875, + "learning_rate": 0.0023473523812583526, + "loss": 1.1931, + "step": 4404 + }, + { + "epoch": 0.38628226611987193, + "grad_norm": 0.068359375, + "learning_rate": 0.0023470032678699518, + "loss": 1.1632, + "step": 4405 + }, + { + "epoch": 0.38636995789424644, + "grad_norm": 0.061767578125, + "learning_rate": 0.0023466540909166523, + "loss": 1.1668, + "step": 4406 + }, + { + "epoch": 0.386457649668621, + "grad_norm": 0.056884765625, + "learning_rate": 0.0023463048504302982, + "loss": 1.2111, + "step": 4407 + }, + { + "epoch": 0.38654534144299557, + "grad_norm": 0.058349609375, + "learning_rate": 0.002345955546442739, + "loss": 1.2508, + "step": 4408 + }, + { + "epoch": 0.3866330332173701, + "grad_norm": 0.06396484375, + "learning_rate": 0.002345606178985831, + "loss": 1.2213, + "step": 4409 + }, + { + "epoch": 0.38672072499174465, + "grad_norm": 0.060302734375, + "learning_rate": 0.002345256748091435, + "loss": 1.1249, + "step": 4410 + }, + { + "epoch": 0.3868084167661192, + "grad_norm": 0.0634765625, + "learning_rate": 0.0023449072537914182, + "loss": 1.2265, + "step": 4411 + }, + { + "epoch": 0.3868961085404937, + "grad_norm": 0.06787109375, + "learning_rate": 0.002344557696117654, + "loss": 1.165, + "step": 4412 + }, + { + "epoch": 0.3869838003148683, + "grad_norm": 0.050048828125, + "learning_rate": 0.002344208075102021, + "loss": 1.2191, + "step": 4413 + }, + { + "epoch": 0.3870714920892428, + "grad_norm": 0.06591796875, + "learning_rate": 0.0023438583907764033, + "loss": 1.2516, + "step": 4414 + }, + { + "epoch": 0.38715918386361736, + "grad_norm": 0.057861328125, + "learning_rate": 0.002343508643172692, + "loss": 1.1963, + "step": 4415 + }, + { + "epoch": 0.3872468756379919, + "grad_norm": 0.0712890625, + "learning_rate": 0.0023431588323227824, + "loss": 1.2059, + "step": 4416 + }, + { + "epoch": 0.38733456741236644, + "grad_norm": 0.06884765625, + "learning_rate": 0.0023428089582585767, + "loss": 1.2504, + "step": 4417 + }, + { + "epoch": 0.387422259186741, + "grad_norm": 0.05517578125, + "learning_rate": 0.002342459021011983, + "loss": 1.1835, + "step": 4418 + }, + { + "epoch": 0.38750995096111557, + "grad_norm": 0.08251953125, + "learning_rate": 0.0023421090206149128, + "loss": 1.1912, + "step": 4419 + }, + { + "epoch": 0.3875976427354901, + "grad_norm": 0.057861328125, + "learning_rate": 0.002341758957099288, + "loss": 1.173, + "step": 4420 + }, + { + "epoch": 0.38768533450986464, + "grad_norm": 0.056640625, + "learning_rate": 0.002341408830497031, + "loss": 1.2235, + "step": 4421 + }, + { + "epoch": 0.3877730262842392, + "grad_norm": 0.06103515625, + "learning_rate": 0.0023410586408400746, + "loss": 1.2032, + "step": 4422 + }, + { + "epoch": 0.3878607180586137, + "grad_norm": 0.07421875, + "learning_rate": 0.002340708388160353, + "loss": 1.2516, + "step": 4423 + }, + { + "epoch": 0.3879484098329883, + "grad_norm": 0.05224609375, + "learning_rate": 0.0023403580724898104, + "loss": 1.1956, + "step": 4424 + }, + { + "epoch": 0.3880361016073628, + "grad_norm": 0.055908203125, + "learning_rate": 0.002340007693860393, + "loss": 1.151, + "step": 4425 + }, + { + "epoch": 0.38812379338173736, + "grad_norm": 0.0556640625, + "learning_rate": 0.002339657252304056, + "loss": 1.2015, + "step": 4426 + }, + { + "epoch": 0.3882114851561119, + "grad_norm": 0.0634765625, + "learning_rate": 0.002339306747852757, + "loss": 1.1914, + "step": 4427 + }, + { + "epoch": 0.38829917693048643, + "grad_norm": 0.0625, + "learning_rate": 0.0023389561805384628, + "loss": 1.2163, + "step": 4428 + }, + { + "epoch": 0.388386868704861, + "grad_norm": 0.061279296875, + "learning_rate": 0.002338605550393143, + "loss": 1.1853, + "step": 4429 + }, + { + "epoch": 0.38847456047923556, + "grad_norm": 0.057373046875, + "learning_rate": 0.002338254857448775, + "loss": 1.2342, + "step": 4430 + }, + { + "epoch": 0.3885622522536101, + "grad_norm": 0.09423828125, + "learning_rate": 0.00233790410173734, + "loss": 1.2106, + "step": 4431 + }, + { + "epoch": 0.38864994402798464, + "grad_norm": 0.061767578125, + "learning_rate": 0.0023375532832908275, + "loss": 1.2122, + "step": 4432 + }, + { + "epoch": 0.3887376358023592, + "grad_norm": 0.09130859375, + "learning_rate": 0.00233720240214123, + "loss": 1.144, + "step": 4433 + }, + { + "epoch": 0.3888253275767337, + "grad_norm": 0.06005859375, + "learning_rate": 0.0023368514583205478, + "loss": 1.2472, + "step": 4434 + }, + { + "epoch": 0.3889130193511083, + "grad_norm": 0.051513671875, + "learning_rate": 0.0023365004518607854, + "loss": 1.1903, + "step": 4435 + }, + { + "epoch": 0.38900071112548285, + "grad_norm": 0.0751953125, + "learning_rate": 0.002336149382793954, + "loss": 1.1802, + "step": 4436 + }, + { + "epoch": 0.38908840289985736, + "grad_norm": 0.052490234375, + "learning_rate": 0.0023357982511520705, + "loss": 1.2091, + "step": 4437 + }, + { + "epoch": 0.3891760946742319, + "grad_norm": 0.0966796875, + "learning_rate": 0.0023354470569671567, + "loss": 1.2581, + "step": 4438 + }, + { + "epoch": 0.38926378644860643, + "grad_norm": 0.057373046875, + "learning_rate": 0.0023350958002712405, + "loss": 1.2063, + "step": 4439 + }, + { + "epoch": 0.389351478222981, + "grad_norm": 0.08447265625, + "learning_rate": 0.0023347444810963565, + "loss": 1.1633, + "step": 4440 + }, + { + "epoch": 0.38943916999735556, + "grad_norm": 0.05322265625, + "learning_rate": 0.002334393099474543, + "loss": 1.1685, + "step": 4441 + }, + { + "epoch": 0.38952686177173007, + "grad_norm": 0.08447265625, + "learning_rate": 0.0023340416554378457, + "loss": 1.1964, + "step": 4442 + }, + { + "epoch": 0.38961455354610464, + "grad_norm": 0.053955078125, + "learning_rate": 0.0023336901490183153, + "loss": 1.2225, + "step": 4443 + }, + { + "epoch": 0.3897022453204792, + "grad_norm": 0.099609375, + "learning_rate": 0.0023333385802480084, + "loss": 1.2217, + "step": 4444 + }, + { + "epoch": 0.3897899370948537, + "grad_norm": 0.04931640625, + "learning_rate": 0.0023329869491589862, + "loss": 1.1519, + "step": 4445 + }, + { + "epoch": 0.3898776288692283, + "grad_norm": 0.1044921875, + "learning_rate": 0.0023326352557833186, + "loss": 1.1687, + "step": 4446 + }, + { + "epoch": 0.38996532064360284, + "grad_norm": 0.052490234375, + "learning_rate": 0.002332283500153077, + "loss": 1.1582, + "step": 4447 + }, + { + "epoch": 0.39005301241797735, + "grad_norm": 0.07275390625, + "learning_rate": 0.0023319316823003414, + "loss": 1.1993, + "step": 4448 + }, + { + "epoch": 0.3901407041923519, + "grad_norm": 0.06005859375, + "learning_rate": 0.0023315798022571976, + "loss": 1.2458, + "step": 4449 + }, + { + "epoch": 0.3902283959667264, + "grad_norm": 0.061279296875, + "learning_rate": 0.0023312278600557345, + "loss": 1.1381, + "step": 4450 + }, + { + "epoch": 0.390316087741101, + "grad_norm": 0.056884765625, + "learning_rate": 0.0023308758557280497, + "loss": 1.1344, + "step": 4451 + }, + { + "epoch": 0.39040377951547556, + "grad_norm": 0.0556640625, + "learning_rate": 0.002330523789306244, + "loss": 1.1845, + "step": 4452 + }, + { + "epoch": 0.39049147128985007, + "grad_norm": 0.05517578125, + "learning_rate": 0.002330171660822426, + "loss": 1.1683, + "step": 4453 + }, + { + "epoch": 0.39057916306422463, + "grad_norm": 0.05712890625, + "learning_rate": 0.0023298194703087085, + "loss": 1.1937, + "step": 4454 + }, + { + "epoch": 0.3906668548385992, + "grad_norm": 0.07470703125, + "learning_rate": 0.00232946721779721, + "loss": 1.206, + "step": 4455 + }, + { + "epoch": 0.3907545466129737, + "grad_norm": 0.055419921875, + "learning_rate": 0.0023291149033200552, + "loss": 1.1844, + "step": 4456 + }, + { + "epoch": 0.3908422383873483, + "grad_norm": 0.053955078125, + "learning_rate": 0.002328762526909374, + "loss": 1.188, + "step": 4457 + }, + { + "epoch": 0.39092993016172284, + "grad_norm": 0.0595703125, + "learning_rate": 0.002328410088597303, + "loss": 1.2321, + "step": 4458 + }, + { + "epoch": 0.39101762193609735, + "grad_norm": 0.056640625, + "learning_rate": 0.002328057588415983, + "loss": 1.1785, + "step": 4459 + }, + { + "epoch": 0.3911053137104719, + "grad_norm": 0.07568359375, + "learning_rate": 0.0023277050263975614, + "loss": 1.2156, + "step": 4460 + }, + { + "epoch": 0.3911930054848464, + "grad_norm": 0.057373046875, + "learning_rate": 0.002327352402574191, + "loss": 1.1433, + "step": 4461 + }, + { + "epoch": 0.391280697259221, + "grad_norm": 0.08251953125, + "learning_rate": 0.0023269997169780305, + "loss": 1.2131, + "step": 4462 + }, + { + "epoch": 0.39136838903359555, + "grad_norm": 0.07080078125, + "learning_rate": 0.0023266469696412427, + "loss": 1.205, + "step": 4463 + }, + { + "epoch": 0.39145608080797006, + "grad_norm": 0.052734375, + "learning_rate": 0.0023262941605959983, + "loss": 1.1669, + "step": 4464 + }, + { + "epoch": 0.39154377258234463, + "grad_norm": 0.05224609375, + "learning_rate": 0.0023259412898744723, + "loss": 1.1667, + "step": 4465 + }, + { + "epoch": 0.3916314643567192, + "grad_norm": 0.111328125, + "learning_rate": 0.002325588357508846, + "loss": 1.2205, + "step": 4466 + }, + { + "epoch": 0.3917191561310937, + "grad_norm": 0.060791015625, + "learning_rate": 0.0023252353635313046, + "loss": 1.3022, + "step": 4467 + }, + { + "epoch": 0.39180684790546827, + "grad_norm": 0.0703125, + "learning_rate": 0.002324882307974042, + "loss": 1.1683, + "step": 4468 + }, + { + "epoch": 0.39189453967984283, + "grad_norm": 0.0869140625, + "learning_rate": 0.002324529190869254, + "loss": 1.1594, + "step": 4469 + }, + { + "epoch": 0.39198223145421734, + "grad_norm": 0.059326171875, + "learning_rate": 0.0023241760122491464, + "loss": 1.196, + "step": 4470 + }, + { + "epoch": 0.3920699232285919, + "grad_norm": 0.09326171875, + "learning_rate": 0.0023238227721459268, + "loss": 1.18, + "step": 4471 + }, + { + "epoch": 0.3921576150029665, + "grad_norm": 0.060546875, + "learning_rate": 0.002323469470591809, + "loss": 1.1662, + "step": 4472 + }, + { + "epoch": 0.392245306777341, + "grad_norm": 0.0693359375, + "learning_rate": 0.0023231161076190145, + "loss": 1.214, + "step": 4473 + }, + { + "epoch": 0.39233299855171555, + "grad_norm": 0.0966796875, + "learning_rate": 0.0023227626832597686, + "loss": 1.2058, + "step": 4474 + }, + { + "epoch": 0.39242069032609006, + "grad_norm": 0.056396484375, + "learning_rate": 0.0023224091975463028, + "loss": 1.2157, + "step": 4475 + }, + { + "epoch": 0.3925083821004646, + "grad_norm": 0.07763671875, + "learning_rate": 0.0023220556505108535, + "loss": 1.2262, + "step": 4476 + }, + { + "epoch": 0.3925960738748392, + "grad_norm": 0.0654296875, + "learning_rate": 0.002321702042185664, + "loss": 1.1816, + "step": 4477 + }, + { + "epoch": 0.3926837656492137, + "grad_norm": 0.062255859375, + "learning_rate": 0.0023213483726029827, + "loss": 1.1763, + "step": 4478 + }, + { + "epoch": 0.39277145742358827, + "grad_norm": 0.05859375, + "learning_rate": 0.0023209946417950627, + "loss": 1.1918, + "step": 4479 + }, + { + "epoch": 0.39285914919796283, + "grad_norm": 0.0703125, + "learning_rate": 0.002320640849794163, + "loss": 1.124, + "step": 4480 + }, + { + "epoch": 0.39294684097233734, + "grad_norm": 0.07177734375, + "learning_rate": 0.0023202869966325495, + "loss": 1.2206, + "step": 4481 + }, + { + "epoch": 0.3930345327467119, + "grad_norm": 0.0771484375, + "learning_rate": 0.002319933082342492, + "loss": 1.2051, + "step": 4482 + }, + { + "epoch": 0.39312222452108647, + "grad_norm": 0.056884765625, + "learning_rate": 0.002319579106956267, + "loss": 1.2819, + "step": 4483 + }, + { + "epoch": 0.393209916295461, + "grad_norm": 0.087890625, + "learning_rate": 0.0023192250705061563, + "loss": 1.2025, + "step": 4484 + }, + { + "epoch": 0.39329760806983555, + "grad_norm": 0.07958984375, + "learning_rate": 0.0023188709730244464, + "loss": 1.1656, + "step": 4485 + }, + { + "epoch": 0.39338529984421006, + "grad_norm": 0.061279296875, + "learning_rate": 0.0023185168145434306, + "loss": 1.259, + "step": 4486 + }, + { + "epoch": 0.3934729916185846, + "grad_norm": 0.07958984375, + "learning_rate": 0.0023181625950954073, + "loss": 1.2147, + "step": 4487 + }, + { + "epoch": 0.3935606833929592, + "grad_norm": 0.058349609375, + "learning_rate": 0.0023178083147126793, + "loss": 1.2315, + "step": 4488 + }, + { + "epoch": 0.3936483751673337, + "grad_norm": 0.07666015625, + "learning_rate": 0.0023174539734275574, + "loss": 1.1948, + "step": 4489 + }, + { + "epoch": 0.39373606694170826, + "grad_norm": 0.06689453125, + "learning_rate": 0.0023170995712723567, + "loss": 1.1979, + "step": 4490 + }, + { + "epoch": 0.39382375871608283, + "grad_norm": 0.0869140625, + "learning_rate": 0.0023167451082793966, + "loss": 1.2401, + "step": 4491 + }, + { + "epoch": 0.39391145049045734, + "grad_norm": 0.07666015625, + "learning_rate": 0.0023163905844810037, + "loss": 1.2192, + "step": 4492 + }, + { + "epoch": 0.3939991422648319, + "grad_norm": 0.091796875, + "learning_rate": 0.0023160359999095104, + "loss": 1.183, + "step": 4493 + }, + { + "epoch": 0.39408683403920647, + "grad_norm": 0.0771484375, + "learning_rate": 0.0023156813545972526, + "loss": 1.154, + "step": 4494 + }, + { + "epoch": 0.394174525813581, + "grad_norm": 0.0654296875, + "learning_rate": 0.0023153266485765743, + "loss": 1.2804, + "step": 4495 + }, + { + "epoch": 0.39426221758795554, + "grad_norm": 0.0703125, + "learning_rate": 0.0023149718818798225, + "loss": 1.1826, + "step": 4496 + }, + { + "epoch": 0.39434990936233005, + "grad_norm": 0.0654296875, + "learning_rate": 0.0023146170545393524, + "loss": 1.1874, + "step": 4497 + }, + { + "epoch": 0.3944376011367046, + "grad_norm": 0.0693359375, + "learning_rate": 0.002314262166587522, + "loss": 1.1727, + "step": 4498 + }, + { + "epoch": 0.3945252929110792, + "grad_norm": 0.061279296875, + "learning_rate": 0.0023139072180566973, + "loss": 1.2275, + "step": 4499 + }, + { + "epoch": 0.3946129846854537, + "grad_norm": 0.05615234375, + "learning_rate": 0.002313552208979248, + "loss": 1.1908, + "step": 4500 + }, + { + "epoch": 0.3946129846854537, + "eval_loss": 1.2015705108642578, + "eval_runtime": 452.923, + "eval_samples_per_second": 31.897, + "eval_steps_per_second": 7.975, + "step": 4500 + }, + { + "epoch": 0.39470067645982826, + "grad_norm": 0.0634765625, + "learning_rate": 0.00231319713938755, + "loss": 1.1734, + "step": 4501 + }, + { + "epoch": 0.3947883682342028, + "grad_norm": 0.05126953125, + "learning_rate": 0.0023128420093139848, + "loss": 1.1641, + "step": 4502 + }, + { + "epoch": 0.39487606000857733, + "grad_norm": 0.06298828125, + "learning_rate": 0.0023124868187909403, + "loss": 1.1557, + "step": 4503 + }, + { + "epoch": 0.3949637517829519, + "grad_norm": 0.07421875, + "learning_rate": 0.0023121315678508074, + "loss": 1.1579, + "step": 4504 + }, + { + "epoch": 0.39505144355732646, + "grad_norm": 0.072265625, + "learning_rate": 0.002311776256525985, + "loss": 1.1644, + "step": 4505 + }, + { + "epoch": 0.395139135331701, + "grad_norm": 0.0732421875, + "learning_rate": 0.0023114208848488763, + "loss": 1.1869, + "step": 4506 + }, + { + "epoch": 0.39522682710607554, + "grad_norm": 0.0673828125, + "learning_rate": 0.0023110654528518903, + "loss": 1.2109, + "step": 4507 + }, + { + "epoch": 0.3953145188804501, + "grad_norm": 0.052490234375, + "learning_rate": 0.0023107099605674418, + "loss": 1.2391, + "step": 4508 + }, + { + "epoch": 0.3954022106548246, + "grad_norm": 0.057373046875, + "learning_rate": 0.00231035440802795, + "loss": 1.2041, + "step": 4509 + }, + { + "epoch": 0.3954899024291992, + "grad_norm": 0.06982421875, + "learning_rate": 0.002309998795265841, + "loss": 1.2319, + "step": 4510 + }, + { + "epoch": 0.3955775942035737, + "grad_norm": 0.05810546875, + "learning_rate": 0.002309643122313546, + "loss": 1.2718, + "step": 4511 + }, + { + "epoch": 0.39566528597794826, + "grad_norm": 0.06103515625, + "learning_rate": 0.002309287389203501, + "loss": 1.2, + "step": 4512 + }, + { + "epoch": 0.3957529777523228, + "grad_norm": 0.061279296875, + "learning_rate": 0.0023089315959681483, + "loss": 1.2292, + "step": 4513 + }, + { + "epoch": 0.39584066952669733, + "grad_norm": 0.06787109375, + "learning_rate": 0.0023085757426399346, + "loss": 1.2115, + "step": 4514 + }, + { + "epoch": 0.3959283613010719, + "grad_norm": 0.07958984375, + "learning_rate": 0.0023082198292513137, + "loss": 1.1656, + "step": 4515 + }, + { + "epoch": 0.39601605307544646, + "grad_norm": 0.0888671875, + "learning_rate": 0.002307863855834743, + "loss": 1.2009, + "step": 4516 + }, + { + "epoch": 0.39610374484982097, + "grad_norm": 0.07666015625, + "learning_rate": 0.0023075078224226876, + "loss": 1.1837, + "step": 4517 + }, + { + "epoch": 0.39619143662419554, + "grad_norm": 0.09521484375, + "learning_rate": 0.0023071517290476156, + "loss": 1.174, + "step": 4518 + }, + { + "epoch": 0.3962791283985701, + "grad_norm": 0.07568359375, + "learning_rate": 0.0023067955757420026, + "loss": 1.1315, + "step": 4519 + }, + { + "epoch": 0.3963668201729446, + "grad_norm": 0.08349609375, + "learning_rate": 0.0023064393625383288, + "loss": 1.1989, + "step": 4520 + }, + { + "epoch": 0.3964545119473192, + "grad_norm": 0.06396484375, + "learning_rate": 0.0023060830894690793, + "loss": 1.178, + "step": 4521 + }, + { + "epoch": 0.3965422037216937, + "grad_norm": 0.1181640625, + "learning_rate": 0.002305726756566746, + "loss": 1.1502, + "step": 4522 + }, + { + "epoch": 0.39662989549606825, + "grad_norm": 0.0771484375, + "learning_rate": 0.0023053703638638246, + "loss": 1.2172, + "step": 4523 + }, + { + "epoch": 0.3967175872704428, + "grad_norm": 0.1220703125, + "learning_rate": 0.0023050139113928187, + "loss": 1.1811, + "step": 4524 + }, + { + "epoch": 0.3968052790448173, + "grad_norm": 0.06494140625, + "learning_rate": 0.0023046573991862347, + "loss": 1.1775, + "step": 4525 + }, + { + "epoch": 0.3968929708191919, + "grad_norm": 0.07177734375, + "learning_rate": 0.002304300827276586, + "loss": 1.1931, + "step": 4526 + }, + { + "epoch": 0.39698066259356646, + "grad_norm": 0.060546875, + "learning_rate": 0.0023039441956963906, + "loss": 1.2344, + "step": 4527 + }, + { + "epoch": 0.39706835436794097, + "grad_norm": 0.05419921875, + "learning_rate": 0.002303587504478173, + "loss": 1.2004, + "step": 4528 + }, + { + "epoch": 0.39715604614231553, + "grad_norm": 0.072265625, + "learning_rate": 0.002303230753654462, + "loss": 1.2093, + "step": 4529 + }, + { + "epoch": 0.3972437379166901, + "grad_norm": 0.062255859375, + "learning_rate": 0.0023028739432577926, + "loss": 1.1811, + "step": 4530 + }, + { + "epoch": 0.3973314296910646, + "grad_norm": 0.061767578125, + "learning_rate": 0.002302517073320705, + "loss": 1.0967, + "step": 4531 + }, + { + "epoch": 0.3974191214654392, + "grad_norm": 0.0673828125, + "learning_rate": 0.0023021601438757444, + "loss": 1.2165, + "step": 4532 + }, + { + "epoch": 0.3975068132398137, + "grad_norm": 0.060791015625, + "learning_rate": 0.002301803154955463, + "loss": 1.1675, + "step": 4533 + }, + { + "epoch": 0.39759450501418825, + "grad_norm": 0.0615234375, + "learning_rate": 0.002301446106592416, + "loss": 1.1931, + "step": 4534 + }, + { + "epoch": 0.3976821967885628, + "grad_norm": 0.06689453125, + "learning_rate": 0.0023010889988191656, + "loss": 1.2032, + "step": 4535 + }, + { + "epoch": 0.3977698885629373, + "grad_norm": 0.059814453125, + "learning_rate": 0.0023007318316682797, + "loss": 1.1837, + "step": 4536 + }, + { + "epoch": 0.3978575803373119, + "grad_norm": 0.10400390625, + "learning_rate": 0.0023003746051723306, + "loss": 1.1994, + "step": 4537 + }, + { + "epoch": 0.39794527211168645, + "grad_norm": 0.0732421875, + "learning_rate": 0.002300017319363896, + "loss": 1.1971, + "step": 4538 + }, + { + "epoch": 0.39803296388606096, + "grad_norm": 0.0791015625, + "learning_rate": 0.0022996599742755607, + "loss": 1.1948, + "step": 4539 + }, + { + "epoch": 0.39812065566043553, + "grad_norm": 0.07373046875, + "learning_rate": 0.002299302569939912, + "loss": 1.1891, + "step": 4540 + }, + { + "epoch": 0.3982083474348101, + "grad_norm": 0.052001953125, + "learning_rate": 0.0022989451063895453, + "loss": 1.188, + "step": 4541 + }, + { + "epoch": 0.3982960392091846, + "grad_norm": 0.10400390625, + "learning_rate": 0.0022985875836570606, + "loss": 1.1454, + "step": 4542 + }, + { + "epoch": 0.39838373098355917, + "grad_norm": 0.05517578125, + "learning_rate": 0.002298230001775062, + "loss": 1.136, + "step": 4543 + }, + { + "epoch": 0.39847142275793374, + "grad_norm": 0.08349609375, + "learning_rate": 0.002297872360776161, + "loss": 1.2469, + "step": 4544 + }, + { + "epoch": 0.39855911453230825, + "grad_norm": 0.09375, + "learning_rate": 0.002297514660692973, + "loss": 1.2082, + "step": 4545 + }, + { + "epoch": 0.3986468063066828, + "grad_norm": 0.1162109375, + "learning_rate": 0.0022971569015581197, + "loss": 1.1765, + "step": 4546 + }, + { + "epoch": 0.3987344980810573, + "grad_norm": 0.0830078125, + "learning_rate": 0.0022967990834042275, + "loss": 1.1949, + "step": 4547 + }, + { + "epoch": 0.3988221898554319, + "grad_norm": 0.0556640625, + "learning_rate": 0.0022964412062639284, + "loss": 1.1734, + "step": 4548 + }, + { + "epoch": 0.39890988162980645, + "grad_norm": 0.05078125, + "learning_rate": 0.00229608327016986, + "loss": 1.2095, + "step": 4549 + }, + { + "epoch": 0.39899757340418096, + "grad_norm": 0.126953125, + "learning_rate": 0.0022957252751546657, + "loss": 1.1749, + "step": 4550 + }, + { + "epoch": 0.3990852651785555, + "grad_norm": 0.068359375, + "learning_rate": 0.0022953672212509928, + "loss": 1.2266, + "step": 4551 + }, + { + "epoch": 0.3991729569529301, + "grad_norm": 0.08984375, + "learning_rate": 0.002295009108491496, + "loss": 1.1793, + "step": 4552 + }, + { + "epoch": 0.3992606487273046, + "grad_norm": 0.052734375, + "learning_rate": 0.0022946509369088327, + "loss": 1.1168, + "step": 4553 + }, + { + "epoch": 0.39934834050167917, + "grad_norm": 0.06298828125, + "learning_rate": 0.0022942927065356684, + "loss": 1.2145, + "step": 4554 + }, + { + "epoch": 0.39943603227605373, + "grad_norm": 0.04931640625, + "learning_rate": 0.002293934417404673, + "loss": 1.1621, + "step": 4555 + }, + { + "epoch": 0.39952372405042824, + "grad_norm": 0.0576171875, + "learning_rate": 0.0022935760695485204, + "loss": 1.1743, + "step": 4556 + }, + { + "epoch": 0.3996114158248028, + "grad_norm": 0.0654296875, + "learning_rate": 0.002293217662999892, + "loss": 1.2235, + "step": 4557 + }, + { + "epoch": 0.3996991075991773, + "grad_norm": 0.059326171875, + "learning_rate": 0.0022928591977914733, + "loss": 1.2135, + "step": 4558 + }, + { + "epoch": 0.3997867993735519, + "grad_norm": 0.05029296875, + "learning_rate": 0.002292500673955955, + "loss": 1.1473, + "step": 4559 + }, + { + "epoch": 0.39987449114792645, + "grad_norm": 0.0517578125, + "learning_rate": 0.0022921420915260338, + "loss": 1.186, + "step": 4560 + }, + { + "epoch": 0.39996218292230096, + "grad_norm": 0.053466796875, + "learning_rate": 0.0022917834505344114, + "loss": 1.1226, + "step": 4561 + }, + { + "epoch": 0.4000498746966755, + "grad_norm": 0.055419921875, + "learning_rate": 0.0022914247510137952, + "loss": 1.1331, + "step": 4562 + }, + { + "epoch": 0.4001375664710501, + "grad_norm": 0.0673828125, + "learning_rate": 0.002291065992996898, + "loss": 1.1763, + "step": 4563 + }, + { + "epoch": 0.4002252582454246, + "grad_norm": 0.0634765625, + "learning_rate": 0.0022907071765164372, + "loss": 1.1668, + "step": 4564 + }, + { + "epoch": 0.40031295001979916, + "grad_norm": 0.058837890625, + "learning_rate": 0.0022903483016051347, + "loss": 1.2092, + "step": 4565 + }, + { + "epoch": 0.40040064179417373, + "grad_norm": 0.062255859375, + "learning_rate": 0.002289989368295721, + "loss": 1.1662, + "step": 4566 + }, + { + "epoch": 0.40048833356854824, + "grad_norm": 0.0869140625, + "learning_rate": 0.002289630376620929, + "loss": 1.2176, + "step": 4567 + }, + { + "epoch": 0.4005760253429228, + "grad_norm": 0.06494140625, + "learning_rate": 0.002289271326613498, + "loss": 1.183, + "step": 4568 + }, + { + "epoch": 0.4006637171172973, + "grad_norm": 0.056396484375, + "learning_rate": 0.002288912218306172, + "loss": 1.1982, + "step": 4569 + }, + { + "epoch": 0.4007514088916719, + "grad_norm": 0.1494140625, + "learning_rate": 0.002288553051731701, + "loss": 1.2129, + "step": 4570 + }, + { + "epoch": 0.40083910066604644, + "grad_norm": 0.0859375, + "learning_rate": 0.0022881938269228405, + "loss": 1.2557, + "step": 4571 + }, + { + "epoch": 0.40092679244042095, + "grad_norm": 0.06201171875, + "learning_rate": 0.00228783454391235, + "loss": 1.1269, + "step": 4572 + }, + { + "epoch": 0.4010144842147955, + "grad_norm": 0.0556640625, + "learning_rate": 0.002287475202732996, + "loss": 1.1852, + "step": 4573 + }, + { + "epoch": 0.4011021759891701, + "grad_norm": 0.0712890625, + "learning_rate": 0.0022871158034175484, + "loss": 1.2035, + "step": 4574 + }, + { + "epoch": 0.4011898677635446, + "grad_norm": 0.05859375, + "learning_rate": 0.0022867563459987856, + "loss": 1.2152, + "step": 4575 + }, + { + "epoch": 0.40127755953791916, + "grad_norm": 0.0634765625, + "learning_rate": 0.0022863968305094867, + "loss": 1.1928, + "step": 4576 + }, + { + "epoch": 0.4013652513122937, + "grad_norm": 0.059814453125, + "learning_rate": 0.00228603725698244, + "loss": 1.1563, + "step": 4577 + }, + { + "epoch": 0.40145294308666823, + "grad_norm": 0.059814453125, + "learning_rate": 0.0022856776254504374, + "loss": 1.1701, + "step": 4578 + }, + { + "epoch": 0.4015406348610428, + "grad_norm": 0.060302734375, + "learning_rate": 0.0022853179359462765, + "loss": 1.2062, + "step": 4579 + }, + { + "epoch": 0.4016283266354173, + "grad_norm": 0.07275390625, + "learning_rate": 0.00228495818850276, + "loss": 1.19, + "step": 4580 + }, + { + "epoch": 0.4017160184097919, + "grad_norm": 0.0732421875, + "learning_rate": 0.0022845983831526954, + "loss": 1.2226, + "step": 4581 + }, + { + "epoch": 0.40180371018416644, + "grad_norm": 0.0595703125, + "learning_rate": 0.002284238519928897, + "loss": 1.1557, + "step": 4582 + }, + { + "epoch": 0.40189140195854095, + "grad_norm": 0.06298828125, + "learning_rate": 0.0022838785988641822, + "loss": 1.1986, + "step": 4583 + }, + { + "epoch": 0.4019790937329155, + "grad_norm": 0.0517578125, + "learning_rate": 0.002283518619991376, + "loss": 1.189, + "step": 4584 + }, + { + "epoch": 0.4020667855072901, + "grad_norm": 0.06396484375, + "learning_rate": 0.002283158583343307, + "loss": 1.2521, + "step": 4585 + }, + { + "epoch": 0.4021544772816646, + "grad_norm": 0.054931640625, + "learning_rate": 0.0022827984889528096, + "loss": 1.2153, + "step": 4586 + }, + { + "epoch": 0.40224216905603916, + "grad_norm": 0.0576171875, + "learning_rate": 0.002282438336852723, + "loss": 1.1644, + "step": 4587 + }, + { + "epoch": 0.4023298608304137, + "grad_norm": 0.10009765625, + "learning_rate": 0.0022820781270758944, + "loss": 1.2138, + "step": 4588 + }, + { + "epoch": 0.40241755260478823, + "grad_norm": 0.0517578125, + "learning_rate": 0.0022817178596551708, + "loss": 1.2278, + "step": 4589 + }, + { + "epoch": 0.4025052443791628, + "grad_norm": 0.0771484375, + "learning_rate": 0.0022813575346234097, + "loss": 1.1527, + "step": 4590 + }, + { + "epoch": 0.40259293615353736, + "grad_norm": 0.0556640625, + "learning_rate": 0.002280997152013471, + "loss": 1.2198, + "step": 4591 + }, + { + "epoch": 0.40268062792791187, + "grad_norm": 0.08984375, + "learning_rate": 0.0022806367118582216, + "loss": 1.1774, + "step": 4592 + }, + { + "epoch": 0.40276831970228644, + "grad_norm": 0.09130859375, + "learning_rate": 0.002280276214190531, + "loss": 1.201, + "step": 4593 + }, + { + "epoch": 0.40285601147666095, + "grad_norm": 0.1025390625, + "learning_rate": 0.0022799156590432774, + "loss": 1.149, + "step": 4594 + }, + { + "epoch": 0.4029437032510355, + "grad_norm": 0.06298828125, + "learning_rate": 0.0022795550464493417, + "loss": 1.1615, + "step": 4595 + }, + { + "epoch": 0.4030313950254101, + "grad_norm": 0.12255859375, + "learning_rate": 0.0022791943764416114, + "loss": 1.2375, + "step": 4596 + }, + { + "epoch": 0.4031190867997846, + "grad_norm": 0.05517578125, + "learning_rate": 0.0022788336490529777, + "loss": 1.1875, + "step": 4597 + }, + { + "epoch": 0.40320677857415915, + "grad_norm": 0.134765625, + "learning_rate": 0.0022784728643163387, + "loss": 1.2051, + "step": 4598 + }, + { + "epoch": 0.4032944703485337, + "grad_norm": 0.05078125, + "learning_rate": 0.002278112022264597, + "loss": 1.2171, + "step": 4599 + }, + { + "epoch": 0.4033821621229082, + "grad_norm": 0.08984375, + "learning_rate": 0.0022777511229306597, + "loss": 1.1783, + "step": 4600 + }, + { + "epoch": 0.4034698538972828, + "grad_norm": 0.1201171875, + "learning_rate": 0.0022773901663474413, + "loss": 1.139, + "step": 4601 + }, + { + "epoch": 0.40355754567165736, + "grad_norm": 0.11376953125, + "learning_rate": 0.002277029152547859, + "loss": 1.1809, + "step": 4602 + }, + { + "epoch": 0.40364523744603187, + "grad_norm": 0.05712890625, + "learning_rate": 0.002276668081564837, + "loss": 1.1842, + "step": 4603 + }, + { + "epoch": 0.40373292922040643, + "grad_norm": 0.05078125, + "learning_rate": 0.0022763069534313027, + "loss": 1.229, + "step": 4604 + }, + { + "epoch": 0.40382062099478094, + "grad_norm": 0.05126953125, + "learning_rate": 0.002275945768180192, + "loss": 1.2391, + "step": 4605 + }, + { + "epoch": 0.4039083127691555, + "grad_norm": 0.058837890625, + "learning_rate": 0.002275584525844443, + "loss": 1.2351, + "step": 4606 + }, + { + "epoch": 0.4039960045435301, + "grad_norm": 0.05029296875, + "learning_rate": 0.0022752232264569997, + "loss": 1.2537, + "step": 4607 + }, + { + "epoch": 0.4040836963179046, + "grad_norm": 0.050048828125, + "learning_rate": 0.0022748618700508126, + "loss": 1.1731, + "step": 4608 + }, + { + "epoch": 0.40417138809227915, + "grad_norm": 0.06640625, + "learning_rate": 0.002274500456658836, + "loss": 1.2042, + "step": 4609 + }, + { + "epoch": 0.4042590798666537, + "grad_norm": 0.0703125, + "learning_rate": 0.0022741389863140293, + "loss": 1.1686, + "step": 4610 + }, + { + "epoch": 0.4043467716410282, + "grad_norm": 0.04931640625, + "learning_rate": 0.002273777459049359, + "loss": 1.139, + "step": 4611 + }, + { + "epoch": 0.4044344634154028, + "grad_norm": 0.0712890625, + "learning_rate": 0.0022734158748977944, + "loss": 1.176, + "step": 4612 + }, + { + "epoch": 0.40452215518977735, + "grad_norm": 0.11474609375, + "learning_rate": 0.0022730542338923114, + "loss": 1.2132, + "step": 4613 + }, + { + "epoch": 0.40460984696415186, + "grad_norm": 0.061279296875, + "learning_rate": 0.0022726925360658902, + "loss": 1.1708, + "step": 4614 + }, + { + "epoch": 0.40469753873852643, + "grad_norm": 0.052978515625, + "learning_rate": 0.0022723307814515184, + "loss": 1.1551, + "step": 4615 + }, + { + "epoch": 0.40478523051290094, + "grad_norm": 0.0947265625, + "learning_rate": 0.0022719689700821852, + "loss": 1.1605, + "step": 4616 + }, + { + "epoch": 0.4048729222872755, + "grad_norm": 0.05810546875, + "learning_rate": 0.0022716071019908872, + "loss": 1.2081, + "step": 4617 + }, + { + "epoch": 0.40496061406165007, + "grad_norm": 0.09326171875, + "learning_rate": 0.0022712451772106262, + "loss": 1.2364, + "step": 4618 + }, + { + "epoch": 0.4050483058360246, + "grad_norm": 0.07177734375, + "learning_rate": 0.002270883195774409, + "loss": 1.2434, + "step": 4619 + }, + { + "epoch": 0.40513599761039915, + "grad_norm": 0.06982421875, + "learning_rate": 0.002270521157715247, + "loss": 1.189, + "step": 4620 + }, + { + "epoch": 0.4052236893847737, + "grad_norm": 0.06689453125, + "learning_rate": 0.0022701590630661577, + "loss": 1.1543, + "step": 4621 + }, + { + "epoch": 0.4053113811591482, + "grad_norm": 0.052001953125, + "learning_rate": 0.0022697969118601626, + "loss": 1.1583, + "step": 4622 + }, + { + "epoch": 0.4053990729335228, + "grad_norm": 0.06201171875, + "learning_rate": 0.0022694347041302887, + "loss": 1.2092, + "step": 4623 + }, + { + "epoch": 0.40548676470789735, + "grad_norm": 0.07568359375, + "learning_rate": 0.0022690724399095693, + "loss": 1.2342, + "step": 4624 + }, + { + "epoch": 0.40557445648227186, + "grad_norm": 0.05810546875, + "learning_rate": 0.0022687101192310414, + "loss": 1.1907, + "step": 4625 + }, + { + "epoch": 0.4056621482566464, + "grad_norm": 0.056884765625, + "learning_rate": 0.0022683477421277477, + "loss": 1.2634, + "step": 4626 + }, + { + "epoch": 0.405749840031021, + "grad_norm": 0.060546875, + "learning_rate": 0.0022679853086327363, + "loss": 1.2333, + "step": 4627 + }, + { + "epoch": 0.4058375318053955, + "grad_norm": 0.045654296875, + "learning_rate": 0.00226762281877906, + "loss": 1.1411, + "step": 4628 + }, + { + "epoch": 0.40592522357977007, + "grad_norm": 0.0556640625, + "learning_rate": 0.002267260272599777, + "loss": 1.2024, + "step": 4629 + }, + { + "epoch": 0.4060129153541446, + "grad_norm": 0.07861328125, + "learning_rate": 0.002266897670127951, + "loss": 1.1921, + "step": 4630 + }, + { + "epoch": 0.40610060712851914, + "grad_norm": 0.05615234375, + "learning_rate": 0.0022665350113966498, + "loss": 1.2146, + "step": 4631 + }, + { + "epoch": 0.4061882989028937, + "grad_norm": 0.10693359375, + "learning_rate": 0.0022661722964389466, + "loss": 1.1346, + "step": 4632 + }, + { + "epoch": 0.4062759906772682, + "grad_norm": 0.050537109375, + "learning_rate": 0.0022658095252879214, + "loss": 1.1703, + "step": 4633 + }, + { + "epoch": 0.4063636824516428, + "grad_norm": 0.06689453125, + "learning_rate": 0.0022654466979766565, + "loss": 1.1633, + "step": 4634 + }, + { + "epoch": 0.40645137422601735, + "grad_norm": 0.053955078125, + "learning_rate": 0.0022650838145382417, + "loss": 1.1517, + "step": 4635 + }, + { + "epoch": 0.40653906600039186, + "grad_norm": 0.06640625, + "learning_rate": 0.002264720875005771, + "loss": 1.2363, + "step": 4636 + }, + { + "epoch": 0.4066267577747664, + "grad_norm": 0.0908203125, + "learning_rate": 0.0022643578794123436, + "loss": 1.2092, + "step": 4637 + }, + { + "epoch": 0.406714449549141, + "grad_norm": 0.0703125, + "learning_rate": 0.0022639948277910628, + "loss": 1.2184, + "step": 4638 + }, + { + "epoch": 0.4068021413235155, + "grad_norm": 0.055908203125, + "learning_rate": 0.0022636317201750397, + "loss": 1.1903, + "step": 4639 + }, + { + "epoch": 0.40688983309789006, + "grad_norm": 0.06689453125, + "learning_rate": 0.0022632685565973872, + "loss": 1.1191, + "step": 4640 + }, + { + "epoch": 0.4069775248722646, + "grad_norm": 0.054443359375, + "learning_rate": 0.0022629053370912255, + "loss": 1.1288, + "step": 4641 + }, + { + "epoch": 0.40706521664663914, + "grad_norm": 0.07666015625, + "learning_rate": 0.002262542061689679, + "loss": 1.2015, + "step": 4642 + }, + { + "epoch": 0.4071529084210137, + "grad_norm": 0.054931640625, + "learning_rate": 0.0022621787304258783, + "loss": 1.2369, + "step": 4643 + }, + { + "epoch": 0.4072406001953882, + "grad_norm": 0.048583984375, + "learning_rate": 0.0022618153433329575, + "loss": 1.2255, + "step": 4644 + }, + { + "epoch": 0.4073282919697628, + "grad_norm": 0.0546875, + "learning_rate": 0.0022614519004440573, + "loss": 1.201, + "step": 4645 + }, + { + "epoch": 0.40741598374413734, + "grad_norm": 0.0673828125, + "learning_rate": 0.0022610884017923217, + "loss": 1.1814, + "step": 4646 + }, + { + "epoch": 0.40750367551851185, + "grad_norm": 0.060791015625, + "learning_rate": 0.002260724847410901, + "loss": 1.212, + "step": 4647 + }, + { + "epoch": 0.4075913672928864, + "grad_norm": 0.05810546875, + "learning_rate": 0.0022603612373329517, + "loss": 1.2029, + "step": 4648 + }, + { + "epoch": 0.407679059067261, + "grad_norm": 0.06787109375, + "learning_rate": 0.0022599975715916327, + "loss": 1.1847, + "step": 4649 + }, + { + "epoch": 0.4077667508416355, + "grad_norm": 0.058349609375, + "learning_rate": 0.00225963385022011, + "loss": 1.2206, + "step": 4650 + }, + { + "epoch": 0.40785444261601006, + "grad_norm": 0.07666015625, + "learning_rate": 0.002259270073251553, + "loss": 1.1588, + "step": 4651 + }, + { + "epoch": 0.40794213439038457, + "grad_norm": 0.05224609375, + "learning_rate": 0.0022589062407191397, + "loss": 1.2069, + "step": 4652 + }, + { + "epoch": 0.40802982616475914, + "grad_norm": 0.08154296875, + "learning_rate": 0.002258542352656048, + "loss": 1.1887, + "step": 4653 + }, + { + "epoch": 0.4081175179391337, + "grad_norm": 0.050537109375, + "learning_rate": 0.0022581784090954654, + "loss": 1.1534, + "step": 4654 + }, + { + "epoch": 0.4082052097135082, + "grad_norm": 0.05322265625, + "learning_rate": 0.002257814410070581, + "loss": 1.2116, + "step": 4655 + }, + { + "epoch": 0.4082929014878828, + "grad_norm": 0.05126953125, + "learning_rate": 0.0022574503556145923, + "loss": 1.1905, + "step": 4656 + }, + { + "epoch": 0.40838059326225734, + "grad_norm": 0.056884765625, + "learning_rate": 0.0022570862457606993, + "loss": 1.2149, + "step": 4657 + }, + { + "epoch": 0.40846828503663185, + "grad_norm": 0.052734375, + "learning_rate": 0.0022567220805421073, + "loss": 1.1825, + "step": 4658 + }, + { + "epoch": 0.4085559768110064, + "grad_norm": 0.052001953125, + "learning_rate": 0.0022563578599920277, + "loss": 1.1967, + "step": 4659 + }, + { + "epoch": 0.408643668585381, + "grad_norm": 0.078125, + "learning_rate": 0.002255993584143677, + "loss": 1.1686, + "step": 4660 + }, + { + "epoch": 0.4087313603597555, + "grad_norm": 0.078125, + "learning_rate": 0.002255629253030276, + "loss": 1.1378, + "step": 4661 + }, + { + "epoch": 0.40881905213413006, + "grad_norm": 0.1279296875, + "learning_rate": 0.00225526486668505, + "loss": 1.1769, + "step": 4662 + }, + { + "epoch": 0.4089067439085046, + "grad_norm": 0.05078125, + "learning_rate": 0.0022549004251412315, + "loss": 1.2198, + "step": 4663 + }, + { + "epoch": 0.40899443568287913, + "grad_norm": 0.05615234375, + "learning_rate": 0.0022545359284320554, + "loss": 1.2296, + "step": 4664 + }, + { + "epoch": 0.4090821274572537, + "grad_norm": 0.06494140625, + "learning_rate": 0.0022541713765907626, + "loss": 1.2574, + "step": 4665 + }, + { + "epoch": 0.4091698192316282, + "grad_norm": 0.06591796875, + "learning_rate": 0.0022538067696506003, + "loss": 1.1995, + "step": 4666 + }, + { + "epoch": 0.40925751100600277, + "grad_norm": 0.06005859375, + "learning_rate": 0.0022534421076448197, + "loss": 1.1303, + "step": 4667 + }, + { + "epoch": 0.40934520278037734, + "grad_norm": 0.05810546875, + "learning_rate": 0.0022530773906066764, + "loss": 1.2421, + "step": 4668 + }, + { + "epoch": 0.40943289455475185, + "grad_norm": 0.09716796875, + "learning_rate": 0.0022527126185694314, + "loss": 1.2229, + "step": 4669 + }, + { + "epoch": 0.4095205863291264, + "grad_norm": 0.06689453125, + "learning_rate": 0.002252347791566352, + "loss": 1.1714, + "step": 4670 + }, + { + "epoch": 0.409608278103501, + "grad_norm": 0.07958984375, + "learning_rate": 0.002251982909630709, + "loss": 1.1859, + "step": 4671 + }, + { + "epoch": 0.4096959698778755, + "grad_norm": 0.07373046875, + "learning_rate": 0.0022516179727957784, + "loss": 1.163, + "step": 4672 + }, + { + "epoch": 0.40978366165225005, + "grad_norm": 0.055908203125, + "learning_rate": 0.002251252981094842, + "loss": 1.211, + "step": 4673 + }, + { + "epoch": 0.4098713534266246, + "grad_norm": 0.059814453125, + "learning_rate": 0.002250887934561186, + "loss": 1.2209, + "step": 4674 + }, + { + "epoch": 0.40995904520099913, + "grad_norm": 0.12158203125, + "learning_rate": 0.002250522833228101, + "loss": 1.1775, + "step": 4675 + }, + { + "epoch": 0.4100467369753737, + "grad_norm": 0.058837890625, + "learning_rate": 0.002250157677128884, + "loss": 1.1566, + "step": 4676 + }, + { + "epoch": 0.4101344287497482, + "grad_norm": 0.12109375, + "learning_rate": 0.0022497924662968362, + "loss": 1.1601, + "step": 4677 + }, + { + "epoch": 0.41022212052412277, + "grad_norm": 0.0712890625, + "learning_rate": 0.002249427200765264, + "loss": 1.1969, + "step": 4678 + }, + { + "epoch": 0.41030981229849733, + "grad_norm": 0.08154296875, + "learning_rate": 0.0022490618805674784, + "loss": 1.1831, + "step": 4679 + }, + { + "epoch": 0.41039750407287184, + "grad_norm": 0.07470703125, + "learning_rate": 0.002248696505736796, + "loss": 1.2024, + "step": 4680 + }, + { + "epoch": 0.4104851958472464, + "grad_norm": 0.0595703125, + "learning_rate": 0.0022483310763065376, + "loss": 1.1946, + "step": 4681 + }, + { + "epoch": 0.410572887621621, + "grad_norm": 0.16015625, + "learning_rate": 0.00224796559231003, + "loss": 1.1869, + "step": 4682 + }, + { + "epoch": 0.4106605793959955, + "grad_norm": 0.06640625, + "learning_rate": 0.0022476000537806034, + "loss": 1.2125, + "step": 4683 + }, + { + "epoch": 0.41074827117037005, + "grad_norm": 0.1142578125, + "learning_rate": 0.0022472344607515947, + "loss": 1.1468, + "step": 4684 + }, + { + "epoch": 0.4108359629447446, + "grad_norm": 0.07373046875, + "learning_rate": 0.002246868813256345, + "loss": 1.1662, + "step": 4685 + }, + { + "epoch": 0.4109236547191191, + "grad_norm": 0.0732421875, + "learning_rate": 0.0022465031113282005, + "loss": 1.2063, + "step": 4686 + }, + { + "epoch": 0.4110113464934937, + "grad_norm": 0.12158203125, + "learning_rate": 0.002246137355000512, + "loss": 1.1611, + "step": 4687 + }, + { + "epoch": 0.4110990382678682, + "grad_norm": 0.0615234375, + "learning_rate": 0.002245771544306636, + "loss": 1.1623, + "step": 4688 + }, + { + "epoch": 0.41118673004224277, + "grad_norm": 0.1484375, + "learning_rate": 0.002245405679279932, + "loss": 1.1834, + "step": 4689 + }, + { + "epoch": 0.41127442181661733, + "grad_norm": 0.060302734375, + "learning_rate": 0.002245039759953768, + "loss": 1.2068, + "step": 4690 + }, + { + "epoch": 0.41136211359099184, + "grad_norm": 0.138671875, + "learning_rate": 0.0022446737863615145, + "loss": 1.1509, + "step": 4691 + }, + { + "epoch": 0.4114498053653664, + "grad_norm": 0.09765625, + "learning_rate": 0.002244307758536546, + "loss": 1.1721, + "step": 4692 + }, + { + "epoch": 0.41153749713974097, + "grad_norm": 0.0693359375, + "learning_rate": 0.002243941676512244, + "loss": 1.1564, + "step": 4693 + }, + { + "epoch": 0.4116251889141155, + "grad_norm": 0.1591796875, + "learning_rate": 0.002243575540321995, + "loss": 1.1817, + "step": 4694 + }, + { + "epoch": 0.41171288068849005, + "grad_norm": 0.0693359375, + "learning_rate": 0.002243209349999189, + "loss": 1.1845, + "step": 4695 + }, + { + "epoch": 0.4118005724628646, + "grad_norm": 0.134765625, + "learning_rate": 0.002242843105577221, + "loss": 1.1246, + "step": 4696 + }, + { + "epoch": 0.4118882642372391, + "grad_norm": 0.1142578125, + "learning_rate": 0.0022424768070894922, + "loss": 1.1809, + "step": 4697 + }, + { + "epoch": 0.4119759560116137, + "grad_norm": 0.087890625, + "learning_rate": 0.0022421104545694084, + "loss": 1.1924, + "step": 4698 + }, + { + "epoch": 0.41206364778598825, + "grad_norm": 0.12158203125, + "learning_rate": 0.00224174404805038, + "loss": 1.1817, + "step": 4699 + }, + { + "epoch": 0.41215133956036276, + "grad_norm": 0.08544921875, + "learning_rate": 0.0022413775875658216, + "loss": 1.178, + "step": 4700 + }, + { + "epoch": 0.4122390313347373, + "grad_norm": 0.09423828125, + "learning_rate": 0.0022410110731491536, + "loss": 1.1739, + "step": 4701 + }, + { + "epoch": 0.41232672310911184, + "grad_norm": 0.11279296875, + "learning_rate": 0.0022406445048338015, + "loss": 1.1165, + "step": 4702 + }, + { + "epoch": 0.4124144148834864, + "grad_norm": 0.05078125, + "learning_rate": 0.002240277882653196, + "loss": 1.1801, + "step": 4703 + }, + { + "epoch": 0.41250210665786097, + "grad_norm": 0.134765625, + "learning_rate": 0.0022399112066407707, + "loss": 1.1847, + "step": 4704 + }, + { + "epoch": 0.4125897984322355, + "grad_norm": 0.06640625, + "learning_rate": 0.0022395444768299666, + "loss": 1.1788, + "step": 4705 + }, + { + "epoch": 0.41267749020661004, + "grad_norm": 0.11279296875, + "learning_rate": 0.0022391776932542276, + "loss": 1.2305, + "step": 4706 + }, + { + "epoch": 0.4127651819809846, + "grad_norm": 0.0751953125, + "learning_rate": 0.002238810855947004, + "loss": 1.2245, + "step": 4707 + }, + { + "epoch": 0.4128528737553591, + "grad_norm": 0.0673828125, + "learning_rate": 0.002238443964941751, + "loss": 1.2323, + "step": 4708 + }, + { + "epoch": 0.4129405655297337, + "grad_norm": 0.08544921875, + "learning_rate": 0.002238077020271927, + "loss": 1.1676, + "step": 4709 + }, + { + "epoch": 0.41302825730410825, + "grad_norm": 0.08251953125, + "learning_rate": 0.002237710021970997, + "loss": 1.1967, + "step": 4710 + }, + { + "epoch": 0.41311594907848276, + "grad_norm": 0.051513671875, + "learning_rate": 0.00223734297007243, + "loss": 1.1317, + "step": 4711 + }, + { + "epoch": 0.4132036408528573, + "grad_norm": 0.09619140625, + "learning_rate": 0.0022369758646097007, + "loss": 1.1089, + "step": 4712 + }, + { + "epoch": 0.41329133262723183, + "grad_norm": 0.0888671875, + "learning_rate": 0.002236608705616287, + "loss": 1.233, + "step": 4713 + }, + { + "epoch": 0.4133790244016064, + "grad_norm": 0.057861328125, + "learning_rate": 0.0022362414931256748, + "loss": 1.1774, + "step": 4714 + }, + { + "epoch": 0.41346671617598096, + "grad_norm": 0.1044921875, + "learning_rate": 0.002235874227171351, + "loss": 1.154, + "step": 4715 + }, + { + "epoch": 0.4135544079503555, + "grad_norm": 0.09716796875, + "learning_rate": 0.0022355069077868105, + "loss": 1.2476, + "step": 4716 + }, + { + "epoch": 0.41364209972473004, + "grad_norm": 0.0498046875, + "learning_rate": 0.0022351395350055513, + "loss": 1.1992, + "step": 4717 + }, + { + "epoch": 0.4137297914991046, + "grad_norm": 0.11181640625, + "learning_rate": 0.002234772108861077, + "loss": 1.1768, + "step": 4718 + }, + { + "epoch": 0.4138174832734791, + "grad_norm": 0.1201171875, + "learning_rate": 0.002234404629386896, + "loss": 1.1977, + "step": 4719 + }, + { + "epoch": 0.4139051750478537, + "grad_norm": 0.061279296875, + "learning_rate": 0.002234037096616521, + "loss": 1.1995, + "step": 4720 + }, + { + "epoch": 0.41399286682222824, + "grad_norm": 0.11474609375, + "learning_rate": 0.0022336695105834715, + "loss": 1.1652, + "step": 4721 + }, + { + "epoch": 0.41408055859660275, + "grad_norm": 0.060302734375, + "learning_rate": 0.0022333018713212686, + "loss": 1.1718, + "step": 4722 + }, + { + "epoch": 0.4141682503709773, + "grad_norm": 0.0732421875, + "learning_rate": 0.002232934178863441, + "loss": 1.1668, + "step": 4723 + }, + { + "epoch": 0.41425594214535183, + "grad_norm": 0.05517578125, + "learning_rate": 0.0022325664332435214, + "loss": 1.2276, + "step": 4724 + }, + { + "epoch": 0.4143436339197264, + "grad_norm": 0.056640625, + "learning_rate": 0.0022321986344950467, + "loss": 1.1783, + "step": 4725 + }, + { + "epoch": 0.41443132569410096, + "grad_norm": 0.0634765625, + "learning_rate": 0.0022318307826515596, + "loss": 1.211, + "step": 4726 + }, + { + "epoch": 0.41451901746847547, + "grad_norm": 0.05029296875, + "learning_rate": 0.0022314628777466068, + "loss": 1.2016, + "step": 4727 + }, + { + "epoch": 0.41460670924285004, + "grad_norm": 0.056640625, + "learning_rate": 0.0022310949198137407, + "loss": 1.1836, + "step": 4728 + }, + { + "epoch": 0.4146944010172246, + "grad_norm": 0.05419921875, + "learning_rate": 0.002230726908886518, + "loss": 1.177, + "step": 4729 + }, + { + "epoch": 0.4147820927915991, + "grad_norm": 0.057373046875, + "learning_rate": 0.0022303588449985, + "loss": 1.1801, + "step": 4730 + }, + { + "epoch": 0.4148697845659737, + "grad_norm": 0.0576171875, + "learning_rate": 0.0022299907281832544, + "loss": 1.1692, + "step": 4731 + }, + { + "epoch": 0.41495747634034824, + "grad_norm": 0.053466796875, + "learning_rate": 0.002229622558474351, + "loss": 1.1842, + "step": 4732 + }, + { + "epoch": 0.41504516811472275, + "grad_norm": 0.05810546875, + "learning_rate": 0.0022292543359053673, + "loss": 1.192, + "step": 4733 + }, + { + "epoch": 0.4151328598890973, + "grad_norm": 0.06005859375, + "learning_rate": 0.0022288860605098825, + "loss": 1.1334, + "step": 4734 + }, + { + "epoch": 0.4152205516634719, + "grad_norm": 0.05419921875, + "learning_rate": 0.0022285177323214836, + "loss": 1.1655, + "step": 4735 + }, + { + "epoch": 0.4153082434378464, + "grad_norm": 0.07958984375, + "learning_rate": 0.0022281493513737613, + "loss": 1.2125, + "step": 4736 + }, + { + "epoch": 0.41539593521222096, + "grad_norm": 0.080078125, + "learning_rate": 0.0022277809177003102, + "loss": 1.2054, + "step": 4737 + }, + { + "epoch": 0.41548362698659547, + "grad_norm": 0.083984375, + "learning_rate": 0.0022274124313347316, + "loss": 1.1648, + "step": 4738 + }, + { + "epoch": 0.41557131876097003, + "grad_norm": 0.05078125, + "learning_rate": 0.002227043892310629, + "loss": 1.1985, + "step": 4739 + }, + { + "epoch": 0.4156590105353446, + "grad_norm": 0.1357421875, + "learning_rate": 0.002226675300661614, + "loss": 1.1521, + "step": 4740 + }, + { + "epoch": 0.4157467023097191, + "grad_norm": 0.06494140625, + "learning_rate": 0.0022263066564212996, + "loss": 1.137, + "step": 4741 + }, + { + "epoch": 0.4158343940840937, + "grad_norm": 0.11474609375, + "learning_rate": 0.0022259379596233065, + "loss": 1.2149, + "step": 4742 + }, + { + "epoch": 0.41592208585846824, + "grad_norm": 0.06982421875, + "learning_rate": 0.0022255692103012575, + "loss": 1.2328, + "step": 4743 + }, + { + "epoch": 0.41600977763284275, + "grad_norm": 0.055419921875, + "learning_rate": 0.0022252004084887824, + "loss": 1.2007, + "step": 4744 + }, + { + "epoch": 0.4160974694072173, + "grad_norm": 0.1064453125, + "learning_rate": 0.0022248315542195154, + "loss": 1.2149, + "step": 4745 + }, + { + "epoch": 0.4161851611815919, + "grad_norm": 0.059814453125, + "learning_rate": 0.002224462647527094, + "loss": 1.1812, + "step": 4746 + }, + { + "epoch": 0.4162728529559664, + "grad_norm": 0.062255859375, + "learning_rate": 0.0022240936884451628, + "loss": 1.2071, + "step": 4747 + }, + { + "epoch": 0.41636054473034095, + "grad_norm": 0.11572265625, + "learning_rate": 0.002223724677007369, + "loss": 1.2017, + "step": 4748 + }, + { + "epoch": 0.41644823650471546, + "grad_norm": 0.07275390625, + "learning_rate": 0.0022233556132473654, + "loss": 1.1635, + "step": 4749 + }, + { + "epoch": 0.41653592827909003, + "grad_norm": 0.10888671875, + "learning_rate": 0.0022229864971988105, + "loss": 1.1337, + "step": 4750 + }, + { + "epoch": 0.4166236200534646, + "grad_norm": 0.130859375, + "learning_rate": 0.002222617328895366, + "loss": 1.1905, + "step": 4751 + }, + { + "epoch": 0.4167113118278391, + "grad_norm": 0.0703125, + "learning_rate": 0.002222248108370699, + "loss": 1.1773, + "step": 4752 + }, + { + "epoch": 0.41679900360221367, + "grad_norm": 0.107421875, + "learning_rate": 0.0022218788356584817, + "loss": 1.229, + "step": 4753 + }, + { + "epoch": 0.41688669537658823, + "grad_norm": 0.0576171875, + "learning_rate": 0.002221509510792391, + "loss": 1.1346, + "step": 4754 + }, + { + "epoch": 0.41697438715096274, + "grad_norm": 0.07861328125, + "learning_rate": 0.002221140133806109, + "loss": 1.1473, + "step": 4755 + }, + { + "epoch": 0.4170620789253373, + "grad_norm": 0.0703125, + "learning_rate": 0.0022207707047333203, + "loss": 1.1951, + "step": 4756 + }, + { + "epoch": 0.4171497706997119, + "grad_norm": 0.05810546875, + "learning_rate": 0.0022204012236077173, + "loss": 1.2017, + "step": 4757 + }, + { + "epoch": 0.4172374624740864, + "grad_norm": 0.09521484375, + "learning_rate": 0.002220031690462995, + "loss": 1.1507, + "step": 4758 + }, + { + "epoch": 0.41732515424846095, + "grad_norm": 0.056396484375, + "learning_rate": 0.0022196621053328543, + "loss": 1.1191, + "step": 4759 + }, + { + "epoch": 0.41741284602283546, + "grad_norm": 0.0634765625, + "learning_rate": 0.0022192924682509995, + "loss": 1.1995, + "step": 4760 + }, + { + "epoch": 0.41750053779721, + "grad_norm": 0.099609375, + "learning_rate": 0.002218922779251142, + "loss": 1.1791, + "step": 4761 + }, + { + "epoch": 0.4175882295715846, + "grad_norm": 0.055908203125, + "learning_rate": 0.0022185530383669948, + "loss": 1.1984, + "step": 4762 + }, + { + "epoch": 0.4176759213459591, + "grad_norm": 0.10009765625, + "learning_rate": 0.002218183245632279, + "loss": 1.1598, + "step": 4763 + }, + { + "epoch": 0.41776361312033367, + "grad_norm": 0.06689453125, + "learning_rate": 0.0022178134010807176, + "loss": 1.1963, + "step": 4764 + }, + { + "epoch": 0.41785130489470823, + "grad_norm": 0.0986328125, + "learning_rate": 0.00221744350474604, + "loss": 1.1663, + "step": 4765 + }, + { + "epoch": 0.41793899666908274, + "grad_norm": 0.058837890625, + "learning_rate": 0.0022170735566619795, + "loss": 1.1885, + "step": 4766 + }, + { + "epoch": 0.4180266884434573, + "grad_norm": 0.10546875, + "learning_rate": 0.0022167035568622746, + "loss": 1.2168, + "step": 4767 + }, + { + "epoch": 0.41811438021783187, + "grad_norm": 0.05859375, + "learning_rate": 0.002216333505380668, + "loss": 1.1708, + "step": 4768 + }, + { + "epoch": 0.4182020719922064, + "grad_norm": 0.07177734375, + "learning_rate": 0.002215963402250908, + "loss": 1.1844, + "step": 4769 + }, + { + "epoch": 0.41828976376658095, + "grad_norm": 0.07568359375, + "learning_rate": 0.0022155932475067465, + "loss": 1.1693, + "step": 4770 + }, + { + "epoch": 0.4183774555409555, + "grad_norm": 0.057861328125, + "learning_rate": 0.0022152230411819408, + "loss": 1.2251, + "step": 4771 + }, + { + "epoch": 0.41846514731533, + "grad_norm": 0.1123046875, + "learning_rate": 0.0022148527833102533, + "loss": 1.1673, + "step": 4772 + }, + { + "epoch": 0.4185528390897046, + "grad_norm": 0.06494140625, + "learning_rate": 0.0022144824739254495, + "loss": 1.264, + "step": 4773 + }, + { + "epoch": 0.4186405308640791, + "grad_norm": 0.04931640625, + "learning_rate": 0.002214112113061302, + "loss": 1.2484, + "step": 4774 + }, + { + "epoch": 0.41872822263845366, + "grad_norm": 0.0830078125, + "learning_rate": 0.002213741700751586, + "loss": 1.224, + "step": 4775 + }, + { + "epoch": 0.4188159144128282, + "grad_norm": 0.05322265625, + "learning_rate": 0.002213371237030082, + "loss": 1.2451, + "step": 4776 + }, + { + "epoch": 0.41890360618720274, + "grad_norm": 0.11669921875, + "learning_rate": 0.0022130007219305753, + "loss": 1.2618, + "step": 4777 + }, + { + "epoch": 0.4189912979615773, + "grad_norm": 0.06982421875, + "learning_rate": 0.0022126301554868566, + "loss": 1.2119, + "step": 4778 + }, + { + "epoch": 0.41907898973595187, + "grad_norm": 0.10107421875, + "learning_rate": 0.0022122595377327193, + "loss": 1.156, + "step": 4779 + }, + { + "epoch": 0.4191666815103264, + "grad_norm": 0.083984375, + "learning_rate": 0.0022118888687019645, + "loss": 1.1695, + "step": 4780 + }, + { + "epoch": 0.41925437328470094, + "grad_norm": 0.05322265625, + "learning_rate": 0.0022115181484283946, + "loss": 1.1473, + "step": 4781 + }, + { + "epoch": 0.4193420650590755, + "grad_norm": 0.09765625, + "learning_rate": 0.0022111473769458197, + "loss": 1.2108, + "step": 4782 + }, + { + "epoch": 0.41942975683345, + "grad_norm": 0.0654296875, + "learning_rate": 0.0022107765542880528, + "loss": 1.1664, + "step": 4783 + }, + { + "epoch": 0.4195174486078246, + "grad_norm": 0.09375, + "learning_rate": 0.002210405680488911, + "loss": 1.1956, + "step": 4784 + }, + { + "epoch": 0.4196051403821991, + "grad_norm": 0.07666015625, + "learning_rate": 0.002210034755582219, + "loss": 1.2688, + "step": 4785 + }, + { + "epoch": 0.41969283215657366, + "grad_norm": 0.060302734375, + "learning_rate": 0.0022096637796018017, + "loss": 1.1942, + "step": 4786 + }, + { + "epoch": 0.4197805239309482, + "grad_norm": 0.061767578125, + "learning_rate": 0.002209292752581493, + "loss": 1.1719, + "step": 4787 + }, + { + "epoch": 0.41986821570532273, + "grad_norm": 0.064453125, + "learning_rate": 0.0022089216745551287, + "loss": 1.1974, + "step": 4788 + }, + { + "epoch": 0.4199559074796973, + "grad_norm": 0.048828125, + "learning_rate": 0.0022085505455565514, + "loss": 1.1684, + "step": 4789 + }, + { + "epoch": 0.42004359925407186, + "grad_norm": 0.0751953125, + "learning_rate": 0.0022081793656196056, + "loss": 1.2018, + "step": 4790 + }, + { + "epoch": 0.4201312910284464, + "grad_norm": 0.06201171875, + "learning_rate": 0.002207808134778143, + "loss": 1.1625, + "step": 4791 + }, + { + "epoch": 0.42021898280282094, + "grad_norm": 0.05908203125, + "learning_rate": 0.002207436853066018, + "loss": 1.1541, + "step": 4792 + }, + { + "epoch": 0.4203066745771955, + "grad_norm": 0.051025390625, + "learning_rate": 0.0022070655205170915, + "loss": 1.1677, + "step": 4793 + }, + { + "epoch": 0.42039436635157, + "grad_norm": 0.05517578125, + "learning_rate": 0.0022066941371652275, + "loss": 1.1971, + "step": 4794 + }, + { + "epoch": 0.4204820581259446, + "grad_norm": 0.052001953125, + "learning_rate": 0.002206322703044295, + "loss": 1.1807, + "step": 4795 + }, + { + "epoch": 0.4205697499003191, + "grad_norm": 0.05029296875, + "learning_rate": 0.0022059512181881683, + "loss": 1.2052, + "step": 4796 + }, + { + "epoch": 0.42065744167469366, + "grad_norm": 0.07080078125, + "learning_rate": 0.0022055796826307254, + "loss": 1.1778, + "step": 4797 + }, + { + "epoch": 0.4207451334490682, + "grad_norm": 0.052490234375, + "learning_rate": 0.00220520809640585, + "loss": 1.1647, + "step": 4798 + }, + { + "epoch": 0.42083282522344273, + "grad_norm": 0.05078125, + "learning_rate": 0.002204836459547429, + "loss": 1.2059, + "step": 4799 + }, + { + "epoch": 0.4209205169978173, + "grad_norm": 0.06640625, + "learning_rate": 0.0022044647720893557, + "loss": 1.1849, + "step": 4800 + }, + { + "epoch": 0.42100820877219186, + "grad_norm": 0.06201171875, + "learning_rate": 0.0022040930340655267, + "loss": 1.2287, + "step": 4801 + }, + { + "epoch": 0.42109590054656637, + "grad_norm": 0.0673828125, + "learning_rate": 0.002203721245509843, + "loss": 1.1924, + "step": 4802 + }, + { + "epoch": 0.42118359232094094, + "grad_norm": 0.05615234375, + "learning_rate": 0.002203349406456211, + "loss": 1.1663, + "step": 4803 + }, + { + "epoch": 0.4212712840953155, + "grad_norm": 0.0673828125, + "learning_rate": 0.002202977516938542, + "loss": 1.1895, + "step": 4804 + }, + { + "epoch": 0.42135897586969, + "grad_norm": 0.057373046875, + "learning_rate": 0.002202605576990751, + "loss": 1.1886, + "step": 4805 + }, + { + "epoch": 0.4214466676440646, + "grad_norm": 0.1142578125, + "learning_rate": 0.0022022335866467573, + "loss": 1.1478, + "step": 4806 + }, + { + "epoch": 0.4215343594184391, + "grad_norm": 0.051513671875, + "learning_rate": 0.0022018615459404867, + "loss": 1.1595, + "step": 4807 + }, + { + "epoch": 0.42162205119281365, + "grad_norm": 0.0654296875, + "learning_rate": 0.0022014894549058674, + "loss": 1.1607, + "step": 4808 + }, + { + "epoch": 0.4217097429671882, + "grad_norm": 0.06103515625, + "learning_rate": 0.0022011173135768336, + "loss": 1.2429, + "step": 4809 + }, + { + "epoch": 0.4217974347415627, + "grad_norm": 0.057861328125, + "learning_rate": 0.0022007451219873235, + "loss": 1.1888, + "step": 4810 + }, + { + "epoch": 0.4218851265159373, + "grad_norm": 0.068359375, + "learning_rate": 0.00220037288017128, + "loss": 1.1911, + "step": 4811 + }, + { + "epoch": 0.42197281829031186, + "grad_norm": 0.072265625, + "learning_rate": 0.0022000005881626507, + "loss": 1.2029, + "step": 4812 + }, + { + "epoch": 0.42206051006468637, + "grad_norm": 0.068359375, + "learning_rate": 0.002199628245995387, + "loss": 1.2099, + "step": 4813 + }, + { + "epoch": 0.42214820183906093, + "grad_norm": 0.0546875, + "learning_rate": 0.002199255853703447, + "loss": 1.1832, + "step": 4814 + }, + { + "epoch": 0.4222358936134355, + "grad_norm": 0.06982421875, + "learning_rate": 0.002198883411320791, + "loss": 1.2283, + "step": 4815 + }, + { + "epoch": 0.42232358538781, + "grad_norm": 0.052978515625, + "learning_rate": 0.002198510918881384, + "loss": 1.208, + "step": 4816 + }, + { + "epoch": 0.4224112771621846, + "grad_norm": 0.052490234375, + "learning_rate": 0.0021981383764191975, + "loss": 1.146, + "step": 4817 + }, + { + "epoch": 0.42249896893655914, + "grad_norm": 0.060791015625, + "learning_rate": 0.0021977657839682067, + "loss": 1.2165, + "step": 4818 + }, + { + "epoch": 0.42258666071093365, + "grad_norm": 0.053955078125, + "learning_rate": 0.0021973931415623896, + "loss": 1.2154, + "step": 4819 + }, + { + "epoch": 0.4226743524853082, + "grad_norm": 0.06298828125, + "learning_rate": 0.0021970204492357316, + "loss": 1.1796, + "step": 4820 + }, + { + "epoch": 0.4227620442596827, + "grad_norm": 0.049560546875, + "learning_rate": 0.0021966477070222206, + "loss": 1.1872, + "step": 4821 + }, + { + "epoch": 0.4228497360340573, + "grad_norm": 0.058349609375, + "learning_rate": 0.00219627491495585, + "loss": 1.1753, + "step": 4822 + }, + { + "epoch": 0.42293742780843185, + "grad_norm": 0.0517578125, + "learning_rate": 0.0021959020730706173, + "loss": 1.1361, + "step": 4823 + }, + { + "epoch": 0.42302511958280636, + "grad_norm": 0.0810546875, + "learning_rate": 0.0021955291814005243, + "loss": 1.2429, + "step": 4824 + }, + { + "epoch": 0.42311281135718093, + "grad_norm": 0.050048828125, + "learning_rate": 0.002195156239979579, + "loss": 1.1787, + "step": 4825 + }, + { + "epoch": 0.4232005031315555, + "grad_norm": 0.0966796875, + "learning_rate": 0.0021947832488417914, + "loss": 1.1422, + "step": 4826 + }, + { + "epoch": 0.42328819490593, + "grad_norm": 0.052734375, + "learning_rate": 0.0021944102080211783, + "loss": 1.1502, + "step": 4827 + }, + { + "epoch": 0.42337588668030457, + "grad_norm": 0.060302734375, + "learning_rate": 0.002194037117551759, + "loss": 1.1687, + "step": 4828 + }, + { + "epoch": 0.42346357845467913, + "grad_norm": 0.061279296875, + "learning_rate": 0.00219366397746756, + "loss": 1.1991, + "step": 4829 + }, + { + "epoch": 0.42355127022905364, + "grad_norm": 0.06396484375, + "learning_rate": 0.0021932907878026084, + "loss": 1.2254, + "step": 4830 + }, + { + "epoch": 0.4236389620034282, + "grad_norm": 0.08740234375, + "learning_rate": 0.0021929175485909404, + "loss": 1.2396, + "step": 4831 + }, + { + "epoch": 0.4237266537778027, + "grad_norm": 0.09130859375, + "learning_rate": 0.002192544259866593, + "loss": 1.1812, + "step": 4832 + }, + { + "epoch": 0.4238143455521773, + "grad_norm": 0.10546875, + "learning_rate": 0.0021921709216636102, + "loss": 1.2107, + "step": 4833 + }, + { + "epoch": 0.42390203732655185, + "grad_norm": 0.1162109375, + "learning_rate": 0.002191797534016039, + "loss": 1.2215, + "step": 4834 + }, + { + "epoch": 0.42398972910092636, + "grad_norm": 0.119140625, + "learning_rate": 0.0021914240969579304, + "loss": 1.2262, + "step": 4835 + }, + { + "epoch": 0.4240774208753009, + "grad_norm": 0.1015625, + "learning_rate": 0.0021910506105233428, + "loss": 1.2075, + "step": 4836 + }, + { + "epoch": 0.4241651126496755, + "grad_norm": 0.0703125, + "learning_rate": 0.0021906770747463357, + "loss": 1.1603, + "step": 4837 + }, + { + "epoch": 0.42425280442405, + "grad_norm": 0.10400390625, + "learning_rate": 0.002190303489660975, + "loss": 1.1846, + "step": 4838 + }, + { + "epoch": 0.42434049619842457, + "grad_norm": 0.048095703125, + "learning_rate": 0.0021899298553013312, + "loss": 1.2529, + "step": 4839 + }, + { + "epoch": 0.42442818797279913, + "grad_norm": 0.058837890625, + "learning_rate": 0.0021895561717014786, + "loss": 1.1983, + "step": 4840 + }, + { + "epoch": 0.42451587974717364, + "grad_norm": 0.06689453125, + "learning_rate": 0.0021891824388954955, + "loss": 1.2227, + "step": 4841 + }, + { + "epoch": 0.4246035715215482, + "grad_norm": 0.0517578125, + "learning_rate": 0.002188808656917466, + "loss": 1.1087, + "step": 4842 + }, + { + "epoch": 0.4246912632959227, + "grad_norm": 0.058349609375, + "learning_rate": 0.002188434825801478, + "loss": 1.2238, + "step": 4843 + }, + { + "epoch": 0.4247789550702973, + "grad_norm": 0.076171875, + "learning_rate": 0.002188060945581624, + "loss": 1.1898, + "step": 4844 + }, + { + "epoch": 0.42486664684467185, + "grad_norm": 0.055419921875, + "learning_rate": 0.002187687016292001, + "loss": 1.1934, + "step": 4845 + }, + { + "epoch": 0.42495433861904636, + "grad_norm": 0.08984375, + "learning_rate": 0.0021873130379667094, + "loss": 1.1658, + "step": 4846 + }, + { + "epoch": 0.4250420303934209, + "grad_norm": 0.06103515625, + "learning_rate": 0.0021869390106398563, + "loss": 1.1675, + "step": 4847 + }, + { + "epoch": 0.4251297221677955, + "grad_norm": 0.052734375, + "learning_rate": 0.0021865649343455512, + "loss": 1.2309, + "step": 4848 + }, + { + "epoch": 0.42521741394217, + "grad_norm": 0.049072265625, + "learning_rate": 0.0021861908091179096, + "loss": 1.2049, + "step": 4849 + }, + { + "epoch": 0.42530510571654456, + "grad_norm": 0.0625, + "learning_rate": 0.00218581663499105, + "loss": 1.1983, + "step": 4850 + }, + { + "epoch": 0.42539279749091913, + "grad_norm": 0.053466796875, + "learning_rate": 0.0021854424119990973, + "loss": 1.2293, + "step": 4851 + }, + { + "epoch": 0.42548048926529364, + "grad_norm": 0.064453125, + "learning_rate": 0.0021850681401761785, + "loss": 1.1584, + "step": 4852 + }, + { + "epoch": 0.4255681810396682, + "grad_norm": 0.05810546875, + "learning_rate": 0.0021846938195564255, + "loss": 1.1314, + "step": 4853 + }, + { + "epoch": 0.42565587281404277, + "grad_norm": 0.07275390625, + "learning_rate": 0.0021843194501739776, + "loss": 1.1285, + "step": 4854 + }, + { + "epoch": 0.4257435645884173, + "grad_norm": 0.053466796875, + "learning_rate": 0.0021839450320629747, + "loss": 1.1013, + "step": 4855 + }, + { + "epoch": 0.42583125636279184, + "grad_norm": 0.07275390625, + "learning_rate": 0.002183570565257564, + "loss": 1.1649, + "step": 4856 + }, + { + "epoch": 0.42591894813716635, + "grad_norm": 0.04931640625, + "learning_rate": 0.0021831960497918942, + "loss": 1.231, + "step": 4857 + }, + { + "epoch": 0.4260066399115409, + "grad_norm": 0.053466796875, + "learning_rate": 0.0021828214857001213, + "loss": 1.1807, + "step": 4858 + }, + { + "epoch": 0.4260943316859155, + "grad_norm": 0.058837890625, + "learning_rate": 0.002182446873016405, + "loss": 1.1603, + "step": 4859 + }, + { + "epoch": 0.42618202346029, + "grad_norm": 0.099609375, + "learning_rate": 0.0021820722117749077, + "loss": 1.1681, + "step": 4860 + }, + { + "epoch": 0.42626971523466456, + "grad_norm": 0.052734375, + "learning_rate": 0.002181697502009799, + "loss": 1.1983, + "step": 4861 + }, + { + "epoch": 0.4263574070090391, + "grad_norm": 0.1279296875, + "learning_rate": 0.00218132274375525, + "loss": 1.1868, + "step": 4862 + }, + { + "epoch": 0.42644509878341363, + "grad_norm": 0.05224609375, + "learning_rate": 0.0021809479370454386, + "loss": 1.1626, + "step": 4863 + }, + { + "epoch": 0.4265327905577882, + "grad_norm": 0.0595703125, + "learning_rate": 0.0021805730819145458, + "loss": 1.1947, + "step": 4864 + }, + { + "epoch": 0.42662048233216276, + "grad_norm": 0.05419921875, + "learning_rate": 0.002180198178396758, + "loss": 1.2031, + "step": 4865 + }, + { + "epoch": 0.4267081741065373, + "grad_norm": 0.050048828125, + "learning_rate": 0.0021798232265262643, + "loss": 1.1203, + "step": 4866 + }, + { + "epoch": 0.42679586588091184, + "grad_norm": 0.06201171875, + "learning_rate": 0.0021794482263372606, + "loss": 1.1481, + "step": 4867 + }, + { + "epoch": 0.42688355765528635, + "grad_norm": 0.0537109375, + "learning_rate": 0.0021790731778639455, + "loss": 1.199, + "step": 4868 + }, + { + "epoch": 0.4269712494296609, + "grad_norm": 0.05615234375, + "learning_rate": 0.0021786980811405224, + "loss": 1.1754, + "step": 4869 + }, + { + "epoch": 0.4270589412040355, + "grad_norm": 0.078125, + "learning_rate": 0.0021783229362011985, + "loss": 1.1882, + "step": 4870 + }, + { + "epoch": 0.42714663297841, + "grad_norm": 0.0556640625, + "learning_rate": 0.0021779477430801876, + "loss": 1.1599, + "step": 4871 + }, + { + "epoch": 0.42723432475278456, + "grad_norm": 0.083984375, + "learning_rate": 0.0021775725018117046, + "loss": 1.1987, + "step": 4872 + }, + { + "epoch": 0.4273220165271591, + "grad_norm": 0.06591796875, + "learning_rate": 0.002177197212429972, + "loss": 1.1567, + "step": 4873 + }, + { + "epoch": 0.42740970830153363, + "grad_norm": 0.051025390625, + "learning_rate": 0.0021768218749692136, + "loss": 1.1874, + "step": 4874 + }, + { + "epoch": 0.4274974000759082, + "grad_norm": 0.05224609375, + "learning_rate": 0.0021764464894636617, + "loss": 1.241, + "step": 4875 + }, + { + "epoch": 0.42758509185028276, + "grad_norm": 0.0517578125, + "learning_rate": 0.0021760710559475477, + "loss": 1.1836, + "step": 4876 + }, + { + "epoch": 0.42767278362465727, + "grad_norm": 0.05859375, + "learning_rate": 0.002175695574455112, + "loss": 1.1566, + "step": 4877 + }, + { + "epoch": 0.42776047539903184, + "grad_norm": 0.0556640625, + "learning_rate": 0.002175320045020597, + "loss": 1.1462, + "step": 4878 + }, + { + "epoch": 0.42784816717340635, + "grad_norm": 0.0546875, + "learning_rate": 0.0021749444676782497, + "loss": 1.2024, + "step": 4879 + }, + { + "epoch": 0.4279358589477809, + "grad_norm": 0.0556640625, + "learning_rate": 0.002174568842462322, + "loss": 1.2287, + "step": 4880 + }, + { + "epoch": 0.4280235507221555, + "grad_norm": 0.064453125, + "learning_rate": 0.0021741931694070704, + "loss": 1.2622, + "step": 4881 + }, + { + "epoch": 0.42811124249653, + "grad_norm": 0.06298828125, + "learning_rate": 0.002173817448546755, + "loss": 1.1871, + "step": 4882 + }, + { + "epoch": 0.42819893427090455, + "grad_norm": 0.0517578125, + "learning_rate": 0.00217344167991564, + "loss": 1.1956, + "step": 4883 + }, + { + "epoch": 0.4282866260452791, + "grad_norm": 0.052001953125, + "learning_rate": 0.0021730658635479958, + "loss": 1.1308, + "step": 4884 + }, + { + "epoch": 0.4283743178196536, + "grad_norm": 0.06884765625, + "learning_rate": 0.0021726899994780947, + "loss": 1.2655, + "step": 4885 + }, + { + "epoch": 0.4284620095940282, + "grad_norm": 0.06884765625, + "learning_rate": 0.002172314087740215, + "loss": 1.1991, + "step": 4886 + }, + { + "epoch": 0.42854970136840276, + "grad_norm": 0.052734375, + "learning_rate": 0.002171938128368639, + "loss": 1.1886, + "step": 4887 + }, + { + "epoch": 0.42863739314277727, + "grad_norm": 0.07373046875, + "learning_rate": 0.0021715621213976525, + "loss": 1.1792, + "step": 4888 + }, + { + "epoch": 0.42872508491715183, + "grad_norm": 0.058349609375, + "learning_rate": 0.002171186066861548, + "loss": 1.1964, + "step": 4889 + }, + { + "epoch": 0.4288127766915264, + "grad_norm": 0.06103515625, + "learning_rate": 0.0021708099647946184, + "loss": 1.1481, + "step": 4890 + }, + { + "epoch": 0.4289004684659009, + "grad_norm": 0.06787109375, + "learning_rate": 0.0021704338152311654, + "loss": 1.192, + "step": 4891 + }, + { + "epoch": 0.4289881602402755, + "grad_norm": 0.061767578125, + "learning_rate": 0.002170057618205491, + "loss": 1.2446, + "step": 4892 + }, + { + "epoch": 0.42907585201465, + "grad_norm": 0.08740234375, + "learning_rate": 0.002169681373751906, + "loss": 1.1884, + "step": 4893 + }, + { + "epoch": 0.42916354378902455, + "grad_norm": 0.06689453125, + "learning_rate": 0.00216930508190472, + "loss": 1.2559, + "step": 4894 + }, + { + "epoch": 0.4292512355633991, + "grad_norm": 0.12890625, + "learning_rate": 0.0021689287426982506, + "loss": 1.1841, + "step": 4895 + }, + { + "epoch": 0.4293389273377736, + "grad_norm": 0.0830078125, + "learning_rate": 0.0021685523561668207, + "loss": 1.2165, + "step": 4896 + }, + { + "epoch": 0.4294266191121482, + "grad_norm": 0.15234375, + "learning_rate": 0.002168175922344754, + "loss": 1.2199, + "step": 4897 + }, + { + "epoch": 0.42951431088652275, + "grad_norm": 0.05810546875, + "learning_rate": 0.002167799441266381, + "loss": 1.1957, + "step": 4898 + }, + { + "epoch": 0.42960200266089726, + "grad_norm": 0.08935546875, + "learning_rate": 0.002167422912966035, + "loss": 1.2114, + "step": 4899 + }, + { + "epoch": 0.42968969443527183, + "grad_norm": 0.076171875, + "learning_rate": 0.0021670463374780556, + "loss": 1.1836, + "step": 4900 + }, + { + "epoch": 0.4297773862096464, + "grad_norm": 0.07177734375, + "learning_rate": 0.002166669714836785, + "loss": 1.1967, + "step": 4901 + }, + { + "epoch": 0.4298650779840209, + "grad_norm": 0.06298828125, + "learning_rate": 0.0021662930450765706, + "loss": 1.1702, + "step": 4902 + }, + { + "epoch": 0.42995276975839547, + "grad_norm": 0.06884765625, + "learning_rate": 0.002165916328231763, + "loss": 1.1802, + "step": 4903 + }, + { + "epoch": 0.43004046153277, + "grad_norm": 0.0888671875, + "learning_rate": 0.0021655395643367177, + "loss": 1.1757, + "step": 4904 + }, + { + "epoch": 0.43012815330714455, + "grad_norm": 0.06005859375, + "learning_rate": 0.0021651627534257953, + "loss": 1.1943, + "step": 4905 + }, + { + "epoch": 0.4302158450815191, + "grad_norm": 0.068359375, + "learning_rate": 0.0021647858955333595, + "loss": 1.1604, + "step": 4906 + }, + { + "epoch": 0.4303035368558936, + "grad_norm": 0.05029296875, + "learning_rate": 0.002164408990693779, + "loss": 1.1118, + "step": 4907 + }, + { + "epoch": 0.4303912286302682, + "grad_norm": 0.06640625, + "learning_rate": 0.0021640320389414262, + "loss": 1.2022, + "step": 4908 + }, + { + "epoch": 0.43047892040464275, + "grad_norm": 0.060546875, + "learning_rate": 0.002163655040310679, + "loss": 1.1361, + "step": 4909 + }, + { + "epoch": 0.43056661217901726, + "grad_norm": 0.054443359375, + "learning_rate": 0.002163277994835918, + "loss": 1.2088, + "step": 4910 + }, + { + "epoch": 0.4306543039533918, + "grad_norm": 0.0576171875, + "learning_rate": 0.002162900902551529, + "loss": 1.1912, + "step": 4911 + }, + { + "epoch": 0.4307419957277664, + "grad_norm": 0.06005859375, + "learning_rate": 0.002162523763491901, + "loss": 1.1541, + "step": 4912 + }, + { + "epoch": 0.4308296875021409, + "grad_norm": 0.0751953125, + "learning_rate": 0.0021621465776914296, + "loss": 1.2025, + "step": 4913 + }, + { + "epoch": 0.43091737927651547, + "grad_norm": 0.049560546875, + "learning_rate": 0.0021617693451845125, + "loss": 1.1347, + "step": 4914 + }, + { + "epoch": 0.43100507105089, + "grad_norm": 0.0556640625, + "learning_rate": 0.0021613920660055515, + "loss": 1.1467, + "step": 4915 + }, + { + "epoch": 0.43109276282526454, + "grad_norm": 0.0654296875, + "learning_rate": 0.0021610147401889546, + "loss": 1.1837, + "step": 4916 + }, + { + "epoch": 0.4311804545996391, + "grad_norm": 0.0478515625, + "learning_rate": 0.002160637367769133, + "loss": 1.1613, + "step": 4917 + }, + { + "epoch": 0.4312681463740136, + "grad_norm": 0.09033203125, + "learning_rate": 0.002160259948780501, + "loss": 1.1833, + "step": 4918 + }, + { + "epoch": 0.4313558381483882, + "grad_norm": 0.05224609375, + "learning_rate": 0.0021598824832574793, + "loss": 1.2067, + "step": 4919 + }, + { + "epoch": 0.43144352992276275, + "grad_norm": 0.08251953125, + "learning_rate": 0.0021595049712344913, + "loss": 1.2314, + "step": 4920 + }, + { + "epoch": 0.43153122169713726, + "grad_norm": 0.05224609375, + "learning_rate": 0.002159127412745965, + "loss": 1.1826, + "step": 4921 + }, + { + "epoch": 0.4316189134715118, + "grad_norm": 0.06494140625, + "learning_rate": 0.002158749807826333, + "loss": 1.2132, + "step": 4922 + }, + { + "epoch": 0.4317066052458864, + "grad_norm": 0.07666015625, + "learning_rate": 0.002158372156510032, + "loss": 1.1945, + "step": 4923 + }, + { + "epoch": 0.4317942970202609, + "grad_norm": 0.06591796875, + "learning_rate": 0.0021579944588315026, + "loss": 1.2673, + "step": 4924 + }, + { + "epoch": 0.43188198879463546, + "grad_norm": 0.07373046875, + "learning_rate": 0.00215761671482519, + "loss": 1.163, + "step": 4925 + }, + { + "epoch": 0.43196968056901003, + "grad_norm": 0.053955078125, + "learning_rate": 0.0021572389245255435, + "loss": 1.2013, + "step": 4926 + }, + { + "epoch": 0.43205737234338454, + "grad_norm": 0.057861328125, + "learning_rate": 0.002156861087967017, + "loss": 1.1685, + "step": 4927 + }, + { + "epoch": 0.4321450641177591, + "grad_norm": 0.06103515625, + "learning_rate": 0.002156483205184066, + "loss": 1.152, + "step": 4928 + }, + { + "epoch": 0.4322327558921336, + "grad_norm": 0.052490234375, + "learning_rate": 0.0021561052762111566, + "loss": 1.181, + "step": 4929 + }, + { + "epoch": 0.4323204476665082, + "grad_norm": 0.05712890625, + "learning_rate": 0.002155727301082751, + "loss": 1.2436, + "step": 4930 + }, + { + "epoch": 0.43240813944088274, + "grad_norm": 0.05517578125, + "learning_rate": 0.002155349279833321, + "loss": 1.1037, + "step": 4931 + }, + { + "epoch": 0.43249583121525725, + "grad_norm": 0.06005859375, + "learning_rate": 0.0021549712124973417, + "loss": 1.1749, + "step": 4932 + }, + { + "epoch": 0.4325835229896318, + "grad_norm": 0.058349609375, + "learning_rate": 0.002154593099109291, + "loss": 1.1924, + "step": 4933 + }, + { + "epoch": 0.4326712147640064, + "grad_norm": 0.05712890625, + "learning_rate": 0.002154214939703652, + "loss": 1.1579, + "step": 4934 + }, + { + "epoch": 0.4327589065383809, + "grad_norm": 0.0634765625, + "learning_rate": 0.0021538367343149132, + "loss": 1.1479, + "step": 4935 + }, + { + "epoch": 0.43284659831275546, + "grad_norm": 0.0615234375, + "learning_rate": 0.002153458482977564, + "loss": 1.1738, + "step": 4936 + }, + { + "epoch": 0.43293429008713, + "grad_norm": 0.06298828125, + "learning_rate": 0.0021530801857261015, + "loss": 1.2221, + "step": 4937 + }, + { + "epoch": 0.43302198186150453, + "grad_norm": 0.072265625, + "learning_rate": 0.0021527018425950245, + "loss": 1.1382, + "step": 4938 + }, + { + "epoch": 0.4331096736358791, + "grad_norm": 0.06201171875, + "learning_rate": 0.0021523234536188368, + "loss": 1.198, + "step": 4939 + }, + { + "epoch": 0.4331973654102536, + "grad_norm": 0.07080078125, + "learning_rate": 0.002151945018832047, + "loss": 1.1888, + "step": 4940 + }, + { + "epoch": 0.4332850571846282, + "grad_norm": 0.05908203125, + "learning_rate": 0.0021515665382691672, + "loss": 1.2007, + "step": 4941 + }, + { + "epoch": 0.43337274895900274, + "grad_norm": 0.10302734375, + "learning_rate": 0.002151188011964714, + "loss": 1.1962, + "step": 4942 + }, + { + "epoch": 0.43346044073337725, + "grad_norm": 0.06298828125, + "learning_rate": 0.002150809439953208, + "loss": 1.2089, + "step": 4943 + }, + { + "epoch": 0.4335481325077518, + "grad_norm": 0.123046875, + "learning_rate": 0.0021504308222691742, + "loss": 1.1976, + "step": 4944 + }, + { + "epoch": 0.4336358242821264, + "grad_norm": 0.059814453125, + "learning_rate": 0.002150052158947141, + "loss": 1.1382, + "step": 4945 + }, + { + "epoch": 0.4337235160565009, + "grad_norm": 0.09130859375, + "learning_rate": 0.0021496734500216414, + "loss": 1.1823, + "step": 4946 + }, + { + "epoch": 0.43381120783087546, + "grad_norm": 0.052978515625, + "learning_rate": 0.002149294695527214, + "loss": 1.1644, + "step": 4947 + }, + { + "epoch": 0.43389889960525, + "grad_norm": 0.068359375, + "learning_rate": 0.0021489158954983985, + "loss": 1.2014, + "step": 4948 + }, + { + "epoch": 0.43398659137962453, + "grad_norm": 0.06396484375, + "learning_rate": 0.0021485370499697413, + "loss": 1.1426, + "step": 4949 + }, + { + "epoch": 0.4340742831539991, + "grad_norm": 0.076171875, + "learning_rate": 0.002148158158975792, + "loss": 1.1685, + "step": 4950 + }, + { + "epoch": 0.4341619749283736, + "grad_norm": 0.06689453125, + "learning_rate": 0.002147779222551105, + "loss": 1.1762, + "step": 4951 + }, + { + "epoch": 0.43424966670274817, + "grad_norm": 0.11669921875, + "learning_rate": 0.002147400240730238, + "loss": 1.1752, + "step": 4952 + }, + { + "epoch": 0.43433735847712274, + "grad_norm": 0.06640625, + "learning_rate": 0.0021470212135477527, + "loss": 1.1845, + "step": 4953 + }, + { + "epoch": 0.43442505025149725, + "grad_norm": 0.076171875, + "learning_rate": 0.002146642141038216, + "loss": 1.1951, + "step": 4954 + }, + { + "epoch": 0.4345127420258718, + "grad_norm": 0.09130859375, + "learning_rate": 0.0021462630232361984, + "loss": 1.1493, + "step": 4955 + }, + { + "epoch": 0.4346004338002464, + "grad_norm": 0.055419921875, + "learning_rate": 0.0021458838601762734, + "loss": 1.2026, + "step": 4956 + }, + { + "epoch": 0.4346881255746209, + "grad_norm": 0.099609375, + "learning_rate": 0.0021455046518930212, + "loss": 1.1481, + "step": 4957 + }, + { + "epoch": 0.43477581734899545, + "grad_norm": 0.059814453125, + "learning_rate": 0.002145125398421023, + "loss": 1.198, + "step": 4958 + }, + { + "epoch": 0.43486350912337, + "grad_norm": 0.055908203125, + "learning_rate": 0.002144746099794867, + "loss": 1.1794, + "step": 4959 + }, + { + "epoch": 0.4349512008977445, + "grad_norm": 0.08837890625, + "learning_rate": 0.0021443667560491447, + "loss": 1.1483, + "step": 4960 + }, + { + "epoch": 0.4350388926721191, + "grad_norm": 0.05712890625, + "learning_rate": 0.00214398736721845, + "loss": 1.1619, + "step": 4961 + }, + { + "epoch": 0.43512658444649366, + "grad_norm": 0.056396484375, + "learning_rate": 0.0021436079333373823, + "loss": 1.1995, + "step": 4962 + }, + { + "epoch": 0.43521427622086817, + "grad_norm": 0.08740234375, + "learning_rate": 0.002143228454440545, + "loss": 1.1979, + "step": 4963 + }, + { + "epoch": 0.43530196799524273, + "grad_norm": 0.076171875, + "learning_rate": 0.002142848930562547, + "loss": 1.1015, + "step": 4964 + }, + { + "epoch": 0.43538965976961724, + "grad_norm": 0.0576171875, + "learning_rate": 0.0021424693617379978, + "loss": 1.1995, + "step": 4965 + }, + { + "epoch": 0.4354773515439918, + "grad_norm": 0.09619140625, + "learning_rate": 0.0021420897480015147, + "loss": 1.2003, + "step": 4966 + }, + { + "epoch": 0.4355650433183664, + "grad_norm": 0.0537109375, + "learning_rate": 0.0021417100893877166, + "loss": 1.2204, + "step": 4967 + }, + { + "epoch": 0.4356527350927409, + "grad_norm": 0.0986328125, + "learning_rate": 0.002141330385931228, + "loss": 1.1681, + "step": 4968 + }, + { + "epoch": 0.43574042686711545, + "grad_norm": 0.06103515625, + "learning_rate": 0.0021409506376666764, + "loss": 1.1656, + "step": 4969 + }, + { + "epoch": 0.43582811864149, + "grad_norm": 0.056884765625, + "learning_rate": 0.0021405708446286943, + "loss": 1.1585, + "step": 4970 + }, + { + "epoch": 0.4359158104158645, + "grad_norm": 0.07861328125, + "learning_rate": 0.002140191006851917, + "loss": 1.155, + "step": 4971 + }, + { + "epoch": 0.4360035021902391, + "grad_norm": 0.06591796875, + "learning_rate": 0.0021398111243709854, + "loss": 1.1486, + "step": 4972 + }, + { + "epoch": 0.43609119396461365, + "grad_norm": 0.0732421875, + "learning_rate": 0.002139431197220544, + "loss": 1.2036, + "step": 4973 + }, + { + "epoch": 0.43617888573898816, + "grad_norm": 0.091796875, + "learning_rate": 0.0021390512254352407, + "loss": 1.1558, + "step": 4974 + }, + { + "epoch": 0.43626657751336273, + "grad_norm": 0.05029296875, + "learning_rate": 0.0021386712090497285, + "loss": 1.1237, + "step": 4975 + }, + { + "epoch": 0.43635426928773724, + "grad_norm": 0.078125, + "learning_rate": 0.0021382911480986627, + "loss": 1.2211, + "step": 4976 + }, + { + "epoch": 0.4364419610621118, + "grad_norm": 0.07421875, + "learning_rate": 0.002137911042616706, + "loss": 1.18, + "step": 4977 + }, + { + "epoch": 0.43652965283648637, + "grad_norm": 0.060546875, + "learning_rate": 0.0021375308926385207, + "loss": 1.2392, + "step": 4978 + }, + { + "epoch": 0.4366173446108609, + "grad_norm": 0.05859375, + "learning_rate": 0.002137150698198777, + "loss": 1.1148, + "step": 4979 + }, + { + "epoch": 0.43670503638523545, + "grad_norm": 0.052490234375, + "learning_rate": 0.0021367704593321466, + "loss": 1.1992, + "step": 4980 + }, + { + "epoch": 0.43679272815961, + "grad_norm": 0.049560546875, + "learning_rate": 0.002136390176073307, + "loss": 1.1714, + "step": 4981 + }, + { + "epoch": 0.4368804199339845, + "grad_norm": 0.06396484375, + "learning_rate": 0.0021360098484569396, + "loss": 1.1588, + "step": 4982 + }, + { + "epoch": 0.4369681117083591, + "grad_norm": 0.0791015625, + "learning_rate": 0.002135629476517728, + "loss": 1.265, + "step": 4983 + }, + { + "epoch": 0.43705580348273365, + "grad_norm": 0.0693359375, + "learning_rate": 0.0021352490602903626, + "loss": 1.1021, + "step": 4984 + }, + { + "epoch": 0.43714349525710816, + "grad_norm": 0.07421875, + "learning_rate": 0.0021348685998095348, + "loss": 1.1914, + "step": 4985 + }, + { + "epoch": 0.4372311870314827, + "grad_norm": 0.08544921875, + "learning_rate": 0.002134488095109943, + "loss": 1.1854, + "step": 4986 + }, + { + "epoch": 0.43731887880585724, + "grad_norm": 0.0517578125, + "learning_rate": 0.002134107546226287, + "loss": 1.1916, + "step": 4987 + }, + { + "epoch": 0.4374065705802318, + "grad_norm": 0.142578125, + "learning_rate": 0.002133726953193273, + "loss": 1.1949, + "step": 4988 + }, + { + "epoch": 0.43749426235460637, + "grad_norm": 0.07177734375, + "learning_rate": 0.0021333463160456093, + "loss": 1.1549, + "step": 4989 + }, + { + "epoch": 0.4375819541289809, + "grad_norm": 0.11572265625, + "learning_rate": 0.002132965634818009, + "loss": 1.2031, + "step": 4990 + }, + { + "epoch": 0.43766964590335544, + "grad_norm": 0.0576171875, + "learning_rate": 0.0021325849095451903, + "loss": 1.2499, + "step": 4991 + }, + { + "epoch": 0.43775733767773, + "grad_norm": 0.14453125, + "learning_rate": 0.0021322041402618733, + "loss": 1.2724, + "step": 4992 + }, + { + "epoch": 0.4378450294521045, + "grad_norm": 0.060791015625, + "learning_rate": 0.002131823327002784, + "loss": 1.237, + "step": 4993 + }, + { + "epoch": 0.4379327212264791, + "grad_norm": 0.111328125, + "learning_rate": 0.0021314424698026502, + "loss": 1.1802, + "step": 4994 + }, + { + "epoch": 0.43802041300085365, + "grad_norm": 0.053466796875, + "learning_rate": 0.002131061568696207, + "loss": 1.2068, + "step": 4995 + }, + { + "epoch": 0.43810810477522816, + "grad_norm": 0.12353515625, + "learning_rate": 0.0021306806237181893, + "loss": 1.1827, + "step": 4996 + }, + { + "epoch": 0.4381957965496027, + "grad_norm": 0.051513671875, + "learning_rate": 0.002130299634903341, + "loss": 1.1625, + "step": 4997 + }, + { + "epoch": 0.43828348832397723, + "grad_norm": 0.09521484375, + "learning_rate": 0.0021299186022864045, + "loss": 1.1956, + "step": 4998 + }, + { + "epoch": 0.4383711800983518, + "grad_norm": 0.12060546875, + "learning_rate": 0.0021295375259021312, + "loss": 1.1608, + "step": 4999 + }, + { + "epoch": 0.43845887187272636, + "grad_norm": 0.06396484375, + "learning_rate": 0.002129156405785273, + "loss": 1.1611, + "step": 5000 + }, + { + "epoch": 0.43845887187272636, + "eval_loss": 1.1928588151931763, + "eval_runtime": 428.8018, + "eval_samples_per_second": 33.692, + "eval_steps_per_second": 8.423, + "step": 5000 + }, + { + "epoch": 0.4385465636471009, + "grad_norm": 0.09228515625, + "learning_rate": 0.002128775241970588, + "loss": 1.2164, + "step": 5001 + }, + { + "epoch": 0.43863425542147544, + "grad_norm": 0.0634765625, + "learning_rate": 0.002128394034492837, + "loss": 1.1381, + "step": 5002 + }, + { + "epoch": 0.43872194719585, + "grad_norm": 0.052734375, + "learning_rate": 0.0021280127833867846, + "loss": 1.1917, + "step": 5003 + }, + { + "epoch": 0.4388096389702245, + "grad_norm": 0.058349609375, + "learning_rate": 0.0021276314886872006, + "loss": 1.1573, + "step": 5004 + }, + { + "epoch": 0.4388973307445991, + "grad_norm": 0.04541015625, + "learning_rate": 0.0021272501504288575, + "loss": 1.1918, + "step": 5005 + }, + { + "epoch": 0.43898502251897364, + "grad_norm": 0.06591796875, + "learning_rate": 0.0021268687686465328, + "loss": 1.1492, + "step": 5006 + }, + { + "epoch": 0.43907271429334815, + "grad_norm": 0.06298828125, + "learning_rate": 0.002126487343375008, + "loss": 1.1615, + "step": 5007 + }, + { + "epoch": 0.4391604060677227, + "grad_norm": 0.05712890625, + "learning_rate": 0.002126105874649067, + "loss": 1.1276, + "step": 5008 + }, + { + "epoch": 0.4392480978420973, + "grad_norm": 0.064453125, + "learning_rate": 0.002125724362503499, + "loss": 1.1589, + "step": 5009 + }, + { + "epoch": 0.4393357896164718, + "grad_norm": 0.057861328125, + "learning_rate": 0.002125342806973098, + "loss": 1.1759, + "step": 5010 + }, + { + "epoch": 0.43942348139084636, + "grad_norm": 0.0751953125, + "learning_rate": 0.0021249612080926593, + "loss": 1.2428, + "step": 5011 + }, + { + "epoch": 0.43951117316522087, + "grad_norm": 0.0517578125, + "learning_rate": 0.0021245795658969852, + "loss": 1.1711, + "step": 5012 + }, + { + "epoch": 0.43959886493959544, + "grad_norm": 0.0556640625, + "learning_rate": 0.00212419788042088, + "loss": 1.1816, + "step": 5013 + }, + { + "epoch": 0.43968655671397, + "grad_norm": 0.0849609375, + "learning_rate": 0.0021238161516991512, + "loss": 1.1812, + "step": 5014 + }, + { + "epoch": 0.4397742484883445, + "grad_norm": 0.07763671875, + "learning_rate": 0.002123434379766613, + "loss": 1.2394, + "step": 5015 + }, + { + "epoch": 0.4398619402627191, + "grad_norm": 0.0537109375, + "learning_rate": 0.0021230525646580814, + "loss": 1.1626, + "step": 5016 + }, + { + "epoch": 0.43994963203709364, + "grad_norm": 0.10693359375, + "learning_rate": 0.0021226707064083772, + "loss": 1.1584, + "step": 5017 + }, + { + "epoch": 0.44003732381146815, + "grad_norm": 0.1298828125, + "learning_rate": 0.002122288805052325, + "loss": 1.2637, + "step": 5018 + }, + { + "epoch": 0.4401250155858427, + "grad_norm": 0.060302734375, + "learning_rate": 0.002121906860624753, + "loss": 1.1642, + "step": 5019 + }, + { + "epoch": 0.4402127073602173, + "grad_norm": 0.06298828125, + "learning_rate": 0.0021215248731604936, + "loss": 1.167, + "step": 5020 + }, + { + "epoch": 0.4403003991345918, + "grad_norm": 0.0517578125, + "learning_rate": 0.002121142842694383, + "loss": 1.1973, + "step": 5021 + }, + { + "epoch": 0.44038809090896636, + "grad_norm": 0.0576171875, + "learning_rate": 0.002120760769261261, + "loss": 1.1185, + "step": 5022 + }, + { + "epoch": 0.44047578268334087, + "grad_norm": 0.052490234375, + "learning_rate": 0.0021203786528959717, + "loss": 1.2457, + "step": 5023 + }, + { + "epoch": 0.44056347445771543, + "grad_norm": 0.05078125, + "learning_rate": 0.0021199964936333645, + "loss": 1.1167, + "step": 5024 + }, + { + "epoch": 0.44065116623209, + "grad_norm": 0.05810546875, + "learning_rate": 0.00211961429150829, + "loss": 1.1496, + "step": 5025 + }, + { + "epoch": 0.4407388580064645, + "grad_norm": 0.05517578125, + "learning_rate": 0.0021192320465556044, + "loss": 1.1974, + "step": 5026 + }, + { + "epoch": 0.44082654978083907, + "grad_norm": 0.0771484375, + "learning_rate": 0.0021188497588101674, + "loss": 1.204, + "step": 5027 + }, + { + "epoch": 0.44091424155521364, + "grad_norm": 0.05078125, + "learning_rate": 0.002118467428306843, + "loss": 1.1446, + "step": 5028 + }, + { + "epoch": 0.44100193332958815, + "grad_norm": 0.10205078125, + "learning_rate": 0.0021180850550804983, + "loss": 1.2387, + "step": 5029 + }, + { + "epoch": 0.4410896251039627, + "grad_norm": 0.07666015625, + "learning_rate": 0.0021177026391660044, + "loss": 1.1794, + "step": 5030 + }, + { + "epoch": 0.4411773168783373, + "grad_norm": 0.16015625, + "learning_rate": 0.002117320180598238, + "loss": 1.2515, + "step": 5031 + }, + { + "epoch": 0.4412650086527118, + "grad_norm": 0.10400390625, + "learning_rate": 0.0021169376794120767, + "loss": 1.2179, + "step": 5032 + }, + { + "epoch": 0.44135270042708635, + "grad_norm": 0.10107421875, + "learning_rate": 0.0021165551356424053, + "loss": 1.1867, + "step": 5033 + }, + { + "epoch": 0.44144039220146086, + "grad_norm": 0.1396484375, + "learning_rate": 0.0021161725493241096, + "loss": 1.2549, + "step": 5034 + }, + { + "epoch": 0.44152808397583543, + "grad_norm": 0.055419921875, + "learning_rate": 0.002115789920492081, + "loss": 1.15, + "step": 5035 + }, + { + "epoch": 0.44161577575021, + "grad_norm": 0.10791015625, + "learning_rate": 0.0021154072491812136, + "loss": 1.216, + "step": 5036 + }, + { + "epoch": 0.4417034675245845, + "grad_norm": 0.0537109375, + "learning_rate": 0.0021150245354264073, + "loss": 1.1308, + "step": 5037 + }, + { + "epoch": 0.44179115929895907, + "grad_norm": 0.0791015625, + "learning_rate": 0.002114641779262563, + "loss": 1.2273, + "step": 5038 + }, + { + "epoch": 0.44187885107333363, + "grad_norm": 0.057861328125, + "learning_rate": 0.002114258980724588, + "loss": 1.23, + "step": 5039 + }, + { + "epoch": 0.44196654284770814, + "grad_norm": 0.057373046875, + "learning_rate": 0.002113876139847393, + "loss": 1.19, + "step": 5040 + }, + { + "epoch": 0.4420542346220827, + "grad_norm": 0.06884765625, + "learning_rate": 0.0021134932566658905, + "loss": 1.1712, + "step": 5041 + }, + { + "epoch": 0.4421419263964573, + "grad_norm": 0.0615234375, + "learning_rate": 0.002113110331215001, + "loss": 1.177, + "step": 5042 + }, + { + "epoch": 0.4422296181708318, + "grad_norm": 0.07421875, + "learning_rate": 0.0021127273635296435, + "loss": 1.2157, + "step": 5043 + }, + { + "epoch": 0.44231730994520635, + "grad_norm": 0.0673828125, + "learning_rate": 0.0021123443536447454, + "loss": 1.1866, + "step": 5044 + }, + { + "epoch": 0.4424050017195809, + "grad_norm": 0.06201171875, + "learning_rate": 0.0021119613015952356, + "loss": 1.1193, + "step": 5045 + }, + { + "epoch": 0.4424926934939554, + "grad_norm": 0.0693359375, + "learning_rate": 0.002111578207416048, + "loss": 1.1052, + "step": 5046 + }, + { + "epoch": 0.44258038526833, + "grad_norm": 0.061279296875, + "learning_rate": 0.0021111950711421197, + "loss": 1.2488, + "step": 5047 + }, + { + "epoch": 0.4426680770427045, + "grad_norm": 0.07275390625, + "learning_rate": 0.0021108118928083912, + "loss": 1.2453, + "step": 5048 + }, + { + "epoch": 0.44275576881707907, + "grad_norm": 0.08837890625, + "learning_rate": 0.0021104286724498073, + "loss": 1.2521, + "step": 5049 + }, + { + "epoch": 0.44284346059145363, + "grad_norm": 0.06689453125, + "learning_rate": 0.0021100454101013176, + "loss": 1.1598, + "step": 5050 + }, + { + "epoch": 0.44293115236582814, + "grad_norm": 0.08447265625, + "learning_rate": 0.0021096621057978735, + "loss": 1.1938, + "step": 5051 + }, + { + "epoch": 0.4430188441402027, + "grad_norm": 0.1259765625, + "learning_rate": 0.002109278759574432, + "loss": 1.1463, + "step": 5052 + }, + { + "epoch": 0.44310653591457727, + "grad_norm": 0.056884765625, + "learning_rate": 0.002108895371465954, + "loss": 1.1279, + "step": 5053 + }, + { + "epoch": 0.4431942276889518, + "grad_norm": 0.09326171875, + "learning_rate": 0.002108511941507402, + "loss": 1.1682, + "step": 5054 + }, + { + "epoch": 0.44328191946332635, + "grad_norm": 0.054931640625, + "learning_rate": 0.002108128469733745, + "loss": 1.1934, + "step": 5055 + }, + { + "epoch": 0.4433696112377009, + "grad_norm": 0.055419921875, + "learning_rate": 0.0021077449561799537, + "loss": 1.1987, + "step": 5056 + }, + { + "epoch": 0.4434573030120754, + "grad_norm": 0.08935546875, + "learning_rate": 0.002107361400881005, + "loss": 1.1953, + "step": 5057 + }, + { + "epoch": 0.44354499478645, + "grad_norm": 0.087890625, + "learning_rate": 0.0021069778038718765, + "loss": 1.2273, + "step": 5058 + }, + { + "epoch": 0.4436326865608245, + "grad_norm": 0.0791015625, + "learning_rate": 0.002106594165187552, + "loss": 1.1607, + "step": 5059 + }, + { + "epoch": 0.44372037833519906, + "grad_norm": 0.107421875, + "learning_rate": 0.002106210484863018, + "loss": 1.1598, + "step": 5060 + }, + { + "epoch": 0.4438080701095736, + "grad_norm": 0.0595703125, + "learning_rate": 0.0021058267629332664, + "loss": 1.2293, + "step": 5061 + }, + { + "epoch": 0.44389576188394814, + "grad_norm": 0.07568359375, + "learning_rate": 0.00210544299943329, + "loss": 1.2128, + "step": 5062 + }, + { + "epoch": 0.4439834536583227, + "grad_norm": 0.0478515625, + "learning_rate": 0.002105059194398088, + "loss": 1.1795, + "step": 5063 + }, + { + "epoch": 0.44407114543269727, + "grad_norm": 0.052978515625, + "learning_rate": 0.0021046753478626626, + "loss": 1.217, + "step": 5064 + }, + { + "epoch": 0.4441588372070718, + "grad_norm": 0.0869140625, + "learning_rate": 0.002104291459862018, + "loss": 1.1526, + "step": 5065 + }, + { + "epoch": 0.44424652898144634, + "grad_norm": 0.0947265625, + "learning_rate": 0.0021039075304311664, + "loss": 1.2137, + "step": 5066 + }, + { + "epoch": 0.4443342207558209, + "grad_norm": 0.05517578125, + "learning_rate": 0.002103523559605119, + "loss": 1.2328, + "step": 5067 + }, + { + "epoch": 0.4444219125301954, + "grad_norm": 0.0556640625, + "learning_rate": 0.002103139547418894, + "loss": 1.1448, + "step": 5068 + }, + { + "epoch": 0.44450960430457, + "grad_norm": 0.058837890625, + "learning_rate": 0.002102755493907512, + "loss": 1.1743, + "step": 5069 + }, + { + "epoch": 0.4445972960789445, + "grad_norm": 0.0849609375, + "learning_rate": 0.002102371399105998, + "loss": 1.2346, + "step": 5070 + }, + { + "epoch": 0.44468498785331906, + "grad_norm": 0.058349609375, + "learning_rate": 0.00210198726304938, + "loss": 1.1625, + "step": 5071 + }, + { + "epoch": 0.4447726796276936, + "grad_norm": 0.057861328125, + "learning_rate": 0.0021016030857726902, + "loss": 1.1455, + "step": 5072 + }, + { + "epoch": 0.44486037140206813, + "grad_norm": 0.0751953125, + "learning_rate": 0.002101218867310965, + "loss": 1.1301, + "step": 5073 + }, + { + "epoch": 0.4449480631764427, + "grad_norm": 0.05322265625, + "learning_rate": 0.0021008346076992436, + "loss": 1.1499, + "step": 5074 + }, + { + "epoch": 0.44503575495081726, + "grad_norm": 0.048095703125, + "learning_rate": 0.0021004503069725704, + "loss": 1.1597, + "step": 5075 + }, + { + "epoch": 0.4451234467251918, + "grad_norm": 0.0732421875, + "learning_rate": 0.0021000659651659913, + "loss": 1.1519, + "step": 5076 + }, + { + "epoch": 0.44521113849956634, + "grad_norm": 0.059814453125, + "learning_rate": 0.0020996815823145585, + "loss": 1.1608, + "step": 5077 + }, + { + "epoch": 0.4452988302739409, + "grad_norm": 0.05224609375, + "learning_rate": 0.002099297158453326, + "loss": 1.2376, + "step": 5078 + }, + { + "epoch": 0.4453865220483154, + "grad_norm": 0.07177734375, + "learning_rate": 0.0020989126936173524, + "loss": 1.1929, + "step": 5079 + }, + { + "epoch": 0.44547421382269, + "grad_norm": 0.0517578125, + "learning_rate": 0.0020985281878417006, + "loss": 1.2082, + "step": 5080 + }, + { + "epoch": 0.44556190559706454, + "grad_norm": 0.07861328125, + "learning_rate": 0.002098143641161436, + "loss": 1.2023, + "step": 5081 + }, + { + "epoch": 0.44564959737143905, + "grad_norm": 0.06982421875, + "learning_rate": 0.002097759053611627, + "loss": 1.216, + "step": 5082 + }, + { + "epoch": 0.4457372891458136, + "grad_norm": 0.07861328125, + "learning_rate": 0.0020973744252273494, + "loss": 1.1852, + "step": 5083 + }, + { + "epoch": 0.44582498092018813, + "grad_norm": 0.07470703125, + "learning_rate": 0.0020969897560436787, + "loss": 1.2432, + "step": 5084 + }, + { + "epoch": 0.4459126726945627, + "grad_norm": 0.057373046875, + "learning_rate": 0.002096605046095696, + "loss": 1.2106, + "step": 5085 + }, + { + "epoch": 0.44600036446893726, + "grad_norm": 0.057861328125, + "learning_rate": 0.0020962202954184863, + "loss": 1.1587, + "step": 5086 + }, + { + "epoch": 0.44608805624331177, + "grad_norm": 0.050537109375, + "learning_rate": 0.002095835504047137, + "loss": 1.1774, + "step": 5087 + }, + { + "epoch": 0.44617574801768634, + "grad_norm": 0.05810546875, + "learning_rate": 0.0020954506720167417, + "loss": 1.1778, + "step": 5088 + }, + { + "epoch": 0.4462634397920609, + "grad_norm": 0.048828125, + "learning_rate": 0.002095065799362394, + "loss": 1.1658, + "step": 5089 + }, + { + "epoch": 0.4463511315664354, + "grad_norm": 0.049560546875, + "learning_rate": 0.0020946808861191948, + "loss": 1.1408, + "step": 5090 + }, + { + "epoch": 0.44643882334081, + "grad_norm": 0.0537109375, + "learning_rate": 0.002094295932322247, + "loss": 1.1782, + "step": 5091 + }, + { + "epoch": 0.44652651511518454, + "grad_norm": 0.0546875, + "learning_rate": 0.0020939109380066568, + "loss": 1.1967, + "step": 5092 + }, + { + "epoch": 0.44661420688955905, + "grad_norm": 0.049560546875, + "learning_rate": 0.0020935259032075345, + "loss": 1.1205, + "step": 5093 + }, + { + "epoch": 0.4467018986639336, + "grad_norm": 0.054931640625, + "learning_rate": 0.002093140827959995, + "loss": 1.1427, + "step": 5094 + }, + { + "epoch": 0.4467895904383081, + "grad_norm": 0.05517578125, + "learning_rate": 0.0020927557122991563, + "loss": 1.1853, + "step": 5095 + }, + { + "epoch": 0.4468772822126827, + "grad_norm": 0.0986328125, + "learning_rate": 0.00209237055626014, + "loss": 1.126, + "step": 5096 + }, + { + "epoch": 0.44696497398705726, + "grad_norm": 0.058349609375, + "learning_rate": 0.0020919853598780703, + "loss": 1.1208, + "step": 5097 + }, + { + "epoch": 0.44705266576143177, + "grad_norm": 0.068359375, + "learning_rate": 0.002091600123188077, + "loss": 1.2101, + "step": 5098 + }, + { + "epoch": 0.44714035753580633, + "grad_norm": 0.056884765625, + "learning_rate": 0.002091214846225292, + "loss": 1.1911, + "step": 5099 + }, + { + "epoch": 0.4472280493101809, + "grad_norm": 0.10302734375, + "learning_rate": 0.0020908295290248528, + "loss": 1.1845, + "step": 5100 + }, + { + "epoch": 0.4473157410845554, + "grad_norm": 0.1103515625, + "learning_rate": 0.002090444171621898, + "loss": 1.2307, + "step": 5101 + }, + { + "epoch": 0.44740343285893, + "grad_norm": 0.052001953125, + "learning_rate": 0.0020900587740515716, + "loss": 1.1727, + "step": 5102 + }, + { + "epoch": 0.44749112463330454, + "grad_norm": 0.06591796875, + "learning_rate": 0.0020896733363490218, + "loss": 1.1761, + "step": 5103 + }, + { + "epoch": 0.44757881640767905, + "grad_norm": 0.06640625, + "learning_rate": 0.0020892878585493974, + "loss": 1.2429, + "step": 5104 + }, + { + "epoch": 0.4476665081820536, + "grad_norm": 0.09716796875, + "learning_rate": 0.0020889023406878557, + "loss": 1.2114, + "step": 5105 + }, + { + "epoch": 0.4477541999564281, + "grad_norm": 0.07177734375, + "learning_rate": 0.002088516782799553, + "loss": 1.2543, + "step": 5106 + }, + { + "epoch": 0.4478418917308027, + "grad_norm": 0.060546875, + "learning_rate": 0.0020881311849196517, + "loss": 1.2172, + "step": 5107 + }, + { + "epoch": 0.44792958350517725, + "grad_norm": 0.1015625, + "learning_rate": 0.0020877455470833177, + "loss": 1.1936, + "step": 5108 + }, + { + "epoch": 0.44801727527955176, + "grad_norm": 0.061767578125, + "learning_rate": 0.002087359869325719, + "loss": 1.2439, + "step": 5109 + }, + { + "epoch": 0.44810496705392633, + "grad_norm": 0.0791015625, + "learning_rate": 0.00208697415168203, + "loss": 1.2146, + "step": 5110 + }, + { + "epoch": 0.4481926588283009, + "grad_norm": 0.08544921875, + "learning_rate": 0.0020865883941874257, + "loss": 1.1201, + "step": 5111 + }, + { + "epoch": 0.4482803506026754, + "grad_norm": 0.04931640625, + "learning_rate": 0.002086202596877088, + "loss": 1.1521, + "step": 5112 + }, + { + "epoch": 0.44836804237704997, + "grad_norm": 0.08642578125, + "learning_rate": 0.0020858167597861987, + "loss": 1.1847, + "step": 5113 + }, + { + "epoch": 0.44845573415142453, + "grad_norm": 0.076171875, + "learning_rate": 0.0020854308829499463, + "loss": 1.2262, + "step": 5114 + }, + { + "epoch": 0.44854342592579904, + "grad_norm": 0.052001953125, + "learning_rate": 0.0020850449664035213, + "loss": 1.1292, + "step": 5115 + }, + { + "epoch": 0.4486311177001736, + "grad_norm": 0.06689453125, + "learning_rate": 0.002084659010182118, + "loss": 1.2237, + "step": 5116 + }, + { + "epoch": 0.4487188094745482, + "grad_norm": 0.05517578125, + "learning_rate": 0.0020842730143209357, + "loss": 1.2155, + "step": 5117 + }, + { + "epoch": 0.4488065012489227, + "grad_norm": 0.051513671875, + "learning_rate": 0.0020838869788551754, + "loss": 1.1337, + "step": 5118 + }, + { + "epoch": 0.44889419302329725, + "grad_norm": 0.054931640625, + "learning_rate": 0.0020835009038200425, + "loss": 1.173, + "step": 5119 + }, + { + "epoch": 0.44898188479767176, + "grad_norm": 0.04833984375, + "learning_rate": 0.002083114789250746, + "loss": 1.1426, + "step": 5120 + }, + { + "epoch": 0.4490695765720463, + "grad_norm": 0.055419921875, + "learning_rate": 0.0020827286351824997, + "loss": 1.115, + "step": 5121 + }, + { + "epoch": 0.4491572683464209, + "grad_norm": 0.054443359375, + "learning_rate": 0.0020823424416505187, + "loss": 1.1145, + "step": 5122 + }, + { + "epoch": 0.4492449601207954, + "grad_norm": 0.07275390625, + "learning_rate": 0.002081956208690022, + "loss": 1.1264, + "step": 5123 + }, + { + "epoch": 0.44933265189516997, + "grad_norm": 0.06298828125, + "learning_rate": 0.0020815699363362357, + "loss": 1.1835, + "step": 5124 + }, + { + "epoch": 0.44942034366954453, + "grad_norm": 0.0546875, + "learning_rate": 0.0020811836246243842, + "loss": 1.1939, + "step": 5125 + }, + { + "epoch": 0.44950803544391904, + "grad_norm": 0.05712890625, + "learning_rate": 0.0020807972735896996, + "loss": 1.2149, + "step": 5126 + }, + { + "epoch": 0.4495957272182936, + "grad_norm": 0.053466796875, + "learning_rate": 0.0020804108832674162, + "loss": 1.2162, + "step": 5127 + }, + { + "epoch": 0.44968341899266817, + "grad_norm": 0.06884765625, + "learning_rate": 0.0020800244536927706, + "loss": 1.1516, + "step": 5128 + }, + { + "epoch": 0.4497711107670427, + "grad_norm": 0.080078125, + "learning_rate": 0.0020796379849010054, + "loss": 1.1648, + "step": 5129 + }, + { + "epoch": 0.44985880254141725, + "grad_norm": 0.06591796875, + "learning_rate": 0.002079251476927365, + "loss": 1.146, + "step": 5130 + }, + { + "epoch": 0.44994649431579176, + "grad_norm": 0.08984375, + "learning_rate": 0.0020788649298070978, + "loss": 1.1734, + "step": 5131 + }, + { + "epoch": 0.4500341860901663, + "grad_norm": 0.07763671875, + "learning_rate": 0.002078478343575456, + "loss": 1.1656, + "step": 5132 + }, + { + "epoch": 0.4501218778645409, + "grad_norm": 0.080078125, + "learning_rate": 0.0020780917182676955, + "loss": 1.2233, + "step": 5133 + }, + { + "epoch": 0.4502095696389154, + "grad_norm": 0.0771484375, + "learning_rate": 0.002077705053919076, + "loss": 1.1914, + "step": 5134 + }, + { + "epoch": 0.45029726141328996, + "grad_norm": 0.060302734375, + "learning_rate": 0.002077318350564859, + "loss": 1.1821, + "step": 5135 + }, + { + "epoch": 0.4503849531876645, + "grad_norm": 0.0556640625, + "learning_rate": 0.002076931608240312, + "loss": 1.2019, + "step": 5136 + }, + { + "epoch": 0.45047264496203904, + "grad_norm": 0.061767578125, + "learning_rate": 0.002076544826980704, + "loss": 1.1701, + "step": 5137 + }, + { + "epoch": 0.4505603367364136, + "grad_norm": 0.054443359375, + "learning_rate": 0.002076158006821309, + "loss": 1.1287, + "step": 5138 + }, + { + "epoch": 0.45064802851078817, + "grad_norm": 0.06201171875, + "learning_rate": 0.0020757711477974046, + "loss": 1.2193, + "step": 5139 + }, + { + "epoch": 0.4507357202851627, + "grad_norm": 0.050048828125, + "learning_rate": 0.0020753842499442697, + "loss": 1.2386, + "step": 5140 + }, + { + "epoch": 0.45082341205953724, + "grad_norm": 0.060302734375, + "learning_rate": 0.0020749973132971897, + "loss": 1.1771, + "step": 5141 + }, + { + "epoch": 0.45091110383391175, + "grad_norm": 0.05078125, + "learning_rate": 0.002074610337891452, + "loss": 1.1241, + "step": 5142 + }, + { + "epoch": 0.4509987956082863, + "grad_norm": 0.07421875, + "learning_rate": 0.0020742233237623473, + "loss": 1.1377, + "step": 5143 + }, + { + "epoch": 0.4510864873826609, + "grad_norm": 0.056640625, + "learning_rate": 0.0020738362709451707, + "loss": 1.1751, + "step": 5144 + }, + { + "epoch": 0.4511741791570354, + "grad_norm": 0.06396484375, + "learning_rate": 0.002073449179475221, + "loss": 1.1675, + "step": 5145 + }, + { + "epoch": 0.45126187093140996, + "grad_norm": 0.0712890625, + "learning_rate": 0.0020730620493877978, + "loss": 1.1716, + "step": 5146 + }, + { + "epoch": 0.4513495627057845, + "grad_norm": 0.06787109375, + "learning_rate": 0.002072674880718209, + "loss": 1.1977, + "step": 5147 + }, + { + "epoch": 0.45143725448015903, + "grad_norm": 0.0478515625, + "learning_rate": 0.0020722876735017617, + "loss": 1.1476, + "step": 5148 + }, + { + "epoch": 0.4515249462545336, + "grad_norm": 0.06884765625, + "learning_rate": 0.002071900427773769, + "loss": 1.2445, + "step": 5149 + }, + { + "epoch": 0.45161263802890816, + "grad_norm": 0.052001953125, + "learning_rate": 0.0020715131435695465, + "loss": 1.1702, + "step": 5150 + }, + { + "epoch": 0.4517003298032827, + "grad_norm": 0.057861328125, + "learning_rate": 0.0020711258209244127, + "loss": 1.138, + "step": 5151 + }, + { + "epoch": 0.45178802157765724, + "grad_norm": 0.057861328125, + "learning_rate": 0.002070738459873692, + "loss": 1.1649, + "step": 5152 + }, + { + "epoch": 0.4518757133520318, + "grad_norm": 0.050048828125, + "learning_rate": 0.0020703510604527095, + "loss": 1.1415, + "step": 5153 + }, + { + "epoch": 0.4519634051264063, + "grad_norm": 0.057861328125, + "learning_rate": 0.0020699636226967954, + "loss": 1.1556, + "step": 5154 + }, + { + "epoch": 0.4520510969007809, + "grad_norm": 0.057861328125, + "learning_rate": 0.002069576146641283, + "loss": 1.1622, + "step": 5155 + }, + { + "epoch": 0.4521387886751554, + "grad_norm": 0.052001953125, + "learning_rate": 0.0020691886323215094, + "loss": 1.1854, + "step": 5156 + }, + { + "epoch": 0.45222648044952996, + "grad_norm": 0.05078125, + "learning_rate": 0.002068801079772815, + "loss": 1.1665, + "step": 5157 + }, + { + "epoch": 0.4523141722239045, + "grad_norm": 0.056396484375, + "learning_rate": 0.0020684134890305425, + "loss": 1.2226, + "step": 5158 + }, + { + "epoch": 0.45240186399827903, + "grad_norm": 0.09130859375, + "learning_rate": 0.00206802586013004, + "loss": 1.206, + "step": 5159 + }, + { + "epoch": 0.4524895557726536, + "grad_norm": 0.049072265625, + "learning_rate": 0.0020676381931066584, + "loss": 1.2103, + "step": 5160 + }, + { + "epoch": 0.45257724754702816, + "grad_norm": 0.07177734375, + "learning_rate": 0.002067250487995752, + "loss": 1.1733, + "step": 5161 + }, + { + "epoch": 0.45266493932140267, + "grad_norm": 0.05126953125, + "learning_rate": 0.0020668627448326778, + "loss": 1.1935, + "step": 5162 + }, + { + "epoch": 0.45275263109577724, + "grad_norm": 0.078125, + "learning_rate": 0.0020664749636527973, + "loss": 1.1838, + "step": 5163 + }, + { + "epoch": 0.4528403228701518, + "grad_norm": 0.050048828125, + "learning_rate": 0.0020660871444914764, + "loss": 1.1831, + "step": 5164 + }, + { + "epoch": 0.4529280146445263, + "grad_norm": 0.053955078125, + "learning_rate": 0.0020656992873840803, + "loss": 1.2171, + "step": 5165 + }, + { + "epoch": 0.4530157064189009, + "grad_norm": 0.05859375, + "learning_rate": 0.0020653113923659837, + "loss": 1.2114, + "step": 5166 + }, + { + "epoch": 0.4531033981932754, + "grad_norm": 0.055908203125, + "learning_rate": 0.0020649234594725596, + "loss": 1.172, + "step": 5167 + }, + { + "epoch": 0.45319108996764995, + "grad_norm": 0.052001953125, + "learning_rate": 0.0020645354887391873, + "loss": 1.1647, + "step": 5168 + }, + { + "epoch": 0.4532787817420245, + "grad_norm": 0.052978515625, + "learning_rate": 0.0020641474802012482, + "loss": 1.2189, + "step": 5169 + }, + { + "epoch": 0.453366473516399, + "grad_norm": 0.057373046875, + "learning_rate": 0.0020637594338941286, + "loss": 1.167, + "step": 5170 + }, + { + "epoch": 0.4534541652907736, + "grad_norm": 0.09375, + "learning_rate": 0.002063371349853216, + "loss": 1.1042, + "step": 5171 + }, + { + "epoch": 0.45354185706514816, + "grad_norm": 0.058349609375, + "learning_rate": 0.002062983228113904, + "loss": 1.1901, + "step": 5172 + }, + { + "epoch": 0.45362954883952267, + "grad_norm": 0.08837890625, + "learning_rate": 0.0020625950687115876, + "loss": 1.1705, + "step": 5173 + }, + { + "epoch": 0.45371724061389723, + "grad_norm": 0.051513671875, + "learning_rate": 0.002062206871681666, + "loss": 1.1499, + "step": 5174 + }, + { + "epoch": 0.4538049323882718, + "grad_norm": 0.054443359375, + "learning_rate": 0.002061818637059542, + "loss": 1.1395, + "step": 5175 + }, + { + "epoch": 0.4538926241626463, + "grad_norm": 0.054443359375, + "learning_rate": 0.002061430364880621, + "loss": 1.1806, + "step": 5176 + }, + { + "epoch": 0.4539803159370209, + "grad_norm": 0.055419921875, + "learning_rate": 0.002061042055180313, + "loss": 1.2428, + "step": 5177 + }, + { + "epoch": 0.4540680077113954, + "grad_norm": 0.07080078125, + "learning_rate": 0.0020606537079940306, + "loss": 1.177, + "step": 5178 + }, + { + "epoch": 0.45415569948576995, + "grad_norm": 0.060791015625, + "learning_rate": 0.00206026532335719, + "loss": 1.1747, + "step": 5179 + }, + { + "epoch": 0.4542433912601445, + "grad_norm": 0.0537109375, + "learning_rate": 0.002059876901305211, + "loss": 1.1627, + "step": 5180 + }, + { + "epoch": 0.454331083034519, + "grad_norm": 0.049560546875, + "learning_rate": 0.0020594884418735173, + "loss": 1.1975, + "step": 5181 + }, + { + "epoch": 0.4544187748088936, + "grad_norm": 0.052734375, + "learning_rate": 0.0020590999450975345, + "loss": 1.1292, + "step": 5182 + }, + { + "epoch": 0.45450646658326815, + "grad_norm": 0.059814453125, + "learning_rate": 0.0020587114110126926, + "loss": 1.2401, + "step": 5183 + }, + { + "epoch": 0.45459415835764266, + "grad_norm": 0.053466796875, + "learning_rate": 0.0020583228396544254, + "loss": 1.1774, + "step": 5184 + }, + { + "epoch": 0.45468185013201723, + "grad_norm": 0.05078125, + "learning_rate": 0.002057934231058169, + "loss": 1.2155, + "step": 5185 + }, + { + "epoch": 0.4547695419063918, + "grad_norm": 0.057861328125, + "learning_rate": 0.002057545585259364, + "loss": 1.1958, + "step": 5186 + }, + { + "epoch": 0.4548572336807663, + "grad_norm": 0.04638671875, + "learning_rate": 0.0020571569022934536, + "loss": 1.1075, + "step": 5187 + }, + { + "epoch": 0.45494492545514087, + "grad_norm": 0.057861328125, + "learning_rate": 0.0020567681821958843, + "loss": 1.1641, + "step": 5188 + }, + { + "epoch": 0.45503261722951543, + "grad_norm": 0.07958984375, + "learning_rate": 0.0020563794250021074, + "loss": 1.1856, + "step": 5189 + }, + { + "epoch": 0.45512030900388994, + "grad_norm": 0.06494140625, + "learning_rate": 0.0020559906307475767, + "loss": 1.1647, + "step": 5190 + }, + { + "epoch": 0.4552080007782645, + "grad_norm": 0.12109375, + "learning_rate": 0.0020556017994677474, + "loss": 1.186, + "step": 5191 + }, + { + "epoch": 0.455295692552639, + "grad_norm": 0.060302734375, + "learning_rate": 0.002055212931198081, + "loss": 1.1897, + "step": 5192 + }, + { + "epoch": 0.4553833843270136, + "grad_norm": 0.1279296875, + "learning_rate": 0.0020548240259740417, + "loss": 1.1956, + "step": 5193 + }, + { + "epoch": 0.45547107610138815, + "grad_norm": 0.050048828125, + "learning_rate": 0.0020544350838310964, + "loss": 1.2482, + "step": 5194 + }, + { + "epoch": 0.45555876787576266, + "grad_norm": 0.09130859375, + "learning_rate": 0.002054046104804715, + "loss": 1.2236, + "step": 5195 + }, + { + "epoch": 0.4556464596501372, + "grad_norm": 0.09375, + "learning_rate": 0.0020536570889303726, + "loss": 1.2121, + "step": 5196 + }, + { + "epoch": 0.4557341514245118, + "grad_norm": 0.07177734375, + "learning_rate": 0.0020532680362435455, + "loss": 1.2086, + "step": 5197 + }, + { + "epoch": 0.4558218431988863, + "grad_norm": 0.08154296875, + "learning_rate": 0.0020528789467797147, + "loss": 1.1654, + "step": 5198 + }, + { + "epoch": 0.45590953497326087, + "grad_norm": 0.060302734375, + "learning_rate": 0.002052489820574364, + "loss": 1.1572, + "step": 5199 + }, + { + "epoch": 0.45599722674763543, + "grad_norm": 0.0673828125, + "learning_rate": 0.00205210065766298, + "loss": 1.12, + "step": 5200 + }, + { + "epoch": 0.45608491852200994, + "grad_norm": 0.0732421875, + "learning_rate": 0.0020517114580810543, + "loss": 1.1034, + "step": 5201 + }, + { + "epoch": 0.4561726102963845, + "grad_norm": 0.06298828125, + "learning_rate": 0.0020513222218640813, + "loss": 1.203, + "step": 5202 + }, + { + "epoch": 0.456260302070759, + "grad_norm": 0.0673828125, + "learning_rate": 0.002050932949047557, + "loss": 1.2056, + "step": 5203 + }, + { + "epoch": 0.4563479938451336, + "grad_norm": 0.05712890625, + "learning_rate": 0.0020505436396669827, + "loss": 1.2128, + "step": 5204 + }, + { + "epoch": 0.45643568561950815, + "grad_norm": 0.049560546875, + "learning_rate": 0.002050154293757863, + "loss": 1.1468, + "step": 5205 + }, + { + "epoch": 0.45652337739388266, + "grad_norm": 0.0615234375, + "learning_rate": 0.002049764911355704, + "loss": 1.1581, + "step": 5206 + }, + { + "epoch": 0.4566110691682572, + "grad_norm": 0.0537109375, + "learning_rate": 0.0020493754924960175, + "loss": 1.222, + "step": 5207 + }, + { + "epoch": 0.4566987609426318, + "grad_norm": 0.08154296875, + "learning_rate": 0.002048986037214317, + "loss": 1.1168, + "step": 5208 + }, + { + "epoch": 0.4567864527170063, + "grad_norm": 0.06640625, + "learning_rate": 0.0020485965455461197, + "loss": 1.1635, + "step": 5209 + }, + { + "epoch": 0.45687414449138086, + "grad_norm": 0.05712890625, + "learning_rate": 0.0020482070175269467, + "loss": 1.1941, + "step": 5210 + }, + { + "epoch": 0.45696183626575543, + "grad_norm": 0.060302734375, + "learning_rate": 0.0020478174531923213, + "loss": 1.1895, + "step": 5211 + }, + { + "epoch": 0.45704952804012994, + "grad_norm": 0.10546875, + "learning_rate": 0.002047427852577772, + "loss": 1.1612, + "step": 5212 + }, + { + "epoch": 0.4571372198145045, + "grad_norm": 0.06591796875, + "learning_rate": 0.0020470382157188275, + "loss": 1.1698, + "step": 5213 + }, + { + "epoch": 0.457224911588879, + "grad_norm": 0.08447265625, + "learning_rate": 0.0020466485426510234, + "loss": 1.1466, + "step": 5214 + }, + { + "epoch": 0.4573126033632536, + "grad_norm": 0.09228515625, + "learning_rate": 0.002046258833409896, + "loss": 1.1319, + "step": 5215 + }, + { + "epoch": 0.45740029513762814, + "grad_norm": 0.08544921875, + "learning_rate": 0.0020458690880309854, + "loss": 1.2753, + "step": 5216 + }, + { + "epoch": 0.45748798691200265, + "grad_norm": 0.052734375, + "learning_rate": 0.002045479306549836, + "loss": 1.2515, + "step": 5217 + }, + { + "epoch": 0.4575756786863772, + "grad_norm": 0.0615234375, + "learning_rate": 0.002045089489001995, + "loss": 1.242, + "step": 5218 + }, + { + "epoch": 0.4576633704607518, + "grad_norm": 0.05859375, + "learning_rate": 0.002044699635423013, + "loss": 1.1769, + "step": 5219 + }, + { + "epoch": 0.4577510622351263, + "grad_norm": 0.068359375, + "learning_rate": 0.0020443097458484427, + "loss": 1.2065, + "step": 5220 + }, + { + "epoch": 0.45783875400950086, + "grad_norm": 0.09423828125, + "learning_rate": 0.0020439198203138415, + "loss": 1.1994, + "step": 5221 + }, + { + "epoch": 0.4579264457838754, + "grad_norm": 0.0546875, + "learning_rate": 0.0020435298588547695, + "loss": 1.1597, + "step": 5222 + }, + { + "epoch": 0.45801413755824993, + "grad_norm": 0.08642578125, + "learning_rate": 0.0020431398615067906, + "loss": 1.1836, + "step": 5223 + }, + { + "epoch": 0.4581018293326245, + "grad_norm": 0.09033203125, + "learning_rate": 0.0020427498283054706, + "loss": 1.1805, + "step": 5224 + }, + { + "epoch": 0.458189521106999, + "grad_norm": 0.0537109375, + "learning_rate": 0.002042359759286381, + "loss": 1.1553, + "step": 5225 + }, + { + "epoch": 0.4582772128813736, + "grad_norm": 0.1201171875, + "learning_rate": 0.0020419696544850935, + "loss": 1.2174, + "step": 5226 + }, + { + "epoch": 0.45836490465574814, + "grad_norm": 0.0537109375, + "learning_rate": 0.0020415795139371857, + "loss": 1.1505, + "step": 5227 + }, + { + "epoch": 0.45845259643012265, + "grad_norm": 0.060302734375, + "learning_rate": 0.0020411893376782366, + "loss": 1.149, + "step": 5228 + }, + { + "epoch": 0.4585402882044972, + "grad_norm": 0.052001953125, + "learning_rate": 0.00204079912574383, + "loss": 1.1881, + "step": 5229 + }, + { + "epoch": 0.4586279799788718, + "grad_norm": 0.04931640625, + "learning_rate": 0.0020404088781695523, + "loss": 1.2628, + "step": 5230 + }, + { + "epoch": 0.4587156717532463, + "grad_norm": 0.053955078125, + "learning_rate": 0.0020400185949909925, + "loss": 1.2306, + "step": 5231 + }, + { + "epoch": 0.45880336352762086, + "grad_norm": 0.055419921875, + "learning_rate": 0.002039628276243744, + "loss": 1.1691, + "step": 5232 + }, + { + "epoch": 0.4588910553019954, + "grad_norm": 0.06103515625, + "learning_rate": 0.002039237921963402, + "loss": 1.1744, + "step": 5233 + }, + { + "epoch": 0.45897874707636993, + "grad_norm": 0.058349609375, + "learning_rate": 0.0020388475321855668, + "loss": 1.2127, + "step": 5234 + }, + { + "epoch": 0.4590664388507445, + "grad_norm": 0.1162109375, + "learning_rate": 0.0020384571069458402, + "loss": 1.1825, + "step": 5235 + }, + { + "epoch": 0.45915413062511906, + "grad_norm": 0.061767578125, + "learning_rate": 0.002038066646279828, + "loss": 1.185, + "step": 5236 + }, + { + "epoch": 0.45924182239949357, + "grad_norm": 0.12158203125, + "learning_rate": 0.00203767615022314, + "loss": 1.1696, + "step": 5237 + }, + { + "epoch": 0.45932951417386814, + "grad_norm": 0.09228515625, + "learning_rate": 0.0020372856188113878, + "loss": 1.1827, + "step": 5238 + }, + { + "epoch": 0.45941720594824265, + "grad_norm": 0.05859375, + "learning_rate": 0.0020368950520801865, + "loss": 1.1566, + "step": 5239 + }, + { + "epoch": 0.4595048977226172, + "grad_norm": 0.1416015625, + "learning_rate": 0.0020365044500651555, + "loss": 1.1677, + "step": 5240 + }, + { + "epoch": 0.4595925894969918, + "grad_norm": 0.053955078125, + "learning_rate": 0.0020361138128019165, + "loss": 1.1707, + "step": 5241 + }, + { + "epoch": 0.4596802812713663, + "grad_norm": 0.06787109375, + "learning_rate": 0.002035723140326094, + "loss": 1.1899, + "step": 5242 + }, + { + "epoch": 0.45976797304574085, + "grad_norm": 0.08544921875, + "learning_rate": 0.002035332432673318, + "loss": 1.1317, + "step": 5243 + }, + { + "epoch": 0.4598556648201154, + "grad_norm": 0.051513671875, + "learning_rate": 0.0020349416898792177, + "loss": 1.257, + "step": 5244 + }, + { + "epoch": 0.4599433565944899, + "grad_norm": 0.05322265625, + "learning_rate": 0.0020345509119794295, + "loss": 1.1831, + "step": 5245 + }, + { + "epoch": 0.4600310483688645, + "grad_norm": 0.056396484375, + "learning_rate": 0.0020341600990095906, + "loss": 1.2278, + "step": 5246 + }, + { + "epoch": 0.46011874014323906, + "grad_norm": 0.057373046875, + "learning_rate": 0.0020337692510053428, + "loss": 1.1497, + "step": 5247 + }, + { + "epoch": 0.46020643191761357, + "grad_norm": 0.08349609375, + "learning_rate": 0.0020333783680023293, + "loss": 1.1507, + "step": 5248 + }, + { + "epoch": 0.46029412369198813, + "grad_norm": 0.061767578125, + "learning_rate": 0.0020329874500361993, + "loss": 1.2228, + "step": 5249 + }, + { + "epoch": 0.46038181546636264, + "grad_norm": 0.06591796875, + "learning_rate": 0.002032596497142602, + "loss": 1.1897, + "step": 5250 + }, + { + "epoch": 0.4604695072407372, + "grad_norm": 0.05859375, + "learning_rate": 0.002032205509357192, + "loss": 1.1816, + "step": 5251 + }, + { + "epoch": 0.4605571990151118, + "grad_norm": 0.05712890625, + "learning_rate": 0.002031814486715626, + "loss": 1.171, + "step": 5252 + }, + { + "epoch": 0.4606448907894863, + "grad_norm": 0.07177734375, + "learning_rate": 0.002031423429253564, + "loss": 1.2107, + "step": 5253 + }, + { + "epoch": 0.46073258256386085, + "grad_norm": 0.05908203125, + "learning_rate": 0.0020310323370066707, + "loss": 1.1908, + "step": 5254 + }, + { + "epoch": 0.4608202743382354, + "grad_norm": 0.052001953125, + "learning_rate": 0.0020306412100106115, + "loss": 1.1246, + "step": 5255 + }, + { + "epoch": 0.4609079661126099, + "grad_norm": 0.11083984375, + "learning_rate": 0.0020302500483010567, + "loss": 1.1751, + "step": 5256 + }, + { + "epoch": 0.4609956578869845, + "grad_norm": 0.051025390625, + "learning_rate": 0.002029858851913679, + "loss": 1.1497, + "step": 5257 + }, + { + "epoch": 0.46108334966135905, + "grad_norm": 0.0986328125, + "learning_rate": 0.002029467620884155, + "loss": 1.1777, + "step": 5258 + }, + { + "epoch": 0.46117104143573356, + "grad_norm": 0.07177734375, + "learning_rate": 0.002029076355248163, + "loss": 1.2054, + "step": 5259 + }, + { + "epoch": 0.46125873321010813, + "grad_norm": 0.09228515625, + "learning_rate": 0.002028685055041386, + "loss": 1.1278, + "step": 5260 + }, + { + "epoch": 0.46134642498448264, + "grad_norm": 0.0634765625, + "learning_rate": 0.0020282937202995097, + "loss": 1.1275, + "step": 5261 + }, + { + "epoch": 0.4614341167588572, + "grad_norm": 0.0498046875, + "learning_rate": 0.002027902351058223, + "loss": 1.2013, + "step": 5262 + }, + { + "epoch": 0.46152180853323177, + "grad_norm": 0.056640625, + "learning_rate": 0.0020275109473532173, + "loss": 1.1834, + "step": 5263 + }, + { + "epoch": 0.4616095003076063, + "grad_norm": 0.05908203125, + "learning_rate": 0.0020271195092201873, + "loss": 1.1776, + "step": 5264 + }, + { + "epoch": 0.46169719208198085, + "grad_norm": 0.0546875, + "learning_rate": 0.0020267280366948323, + "loss": 1.2324, + "step": 5265 + }, + { + "epoch": 0.4617848838563554, + "grad_norm": 0.053955078125, + "learning_rate": 0.002026336529812852, + "loss": 1.1748, + "step": 5266 + }, + { + "epoch": 0.4618725756307299, + "grad_norm": 0.052734375, + "learning_rate": 0.0020259449886099526, + "loss": 1.1385, + "step": 5267 + }, + { + "epoch": 0.4619602674051045, + "grad_norm": 0.0498046875, + "learning_rate": 0.0020255534131218404, + "loss": 1.1508, + "step": 5268 + }, + { + "epoch": 0.46204795917947905, + "grad_norm": 0.055908203125, + "learning_rate": 0.002025161803384226, + "loss": 1.159, + "step": 5269 + }, + { + "epoch": 0.46213565095385356, + "grad_norm": 0.060546875, + "learning_rate": 0.002024770159432824, + "loss": 1.1769, + "step": 5270 + }, + { + "epoch": 0.4622233427282281, + "grad_norm": 0.052001953125, + "learning_rate": 0.0020243784813033503, + "loss": 1.2052, + "step": 5271 + }, + { + "epoch": 0.4623110345026027, + "grad_norm": 0.0615234375, + "learning_rate": 0.0020239867690315266, + "loss": 1.111, + "step": 5272 + }, + { + "epoch": 0.4623987262769772, + "grad_norm": 0.051025390625, + "learning_rate": 0.002023595022653074, + "loss": 1.1928, + "step": 5273 + }, + { + "epoch": 0.46248641805135177, + "grad_norm": 0.0703125, + "learning_rate": 0.002023203242203721, + "loss": 1.1503, + "step": 5274 + }, + { + "epoch": 0.4625741098257263, + "grad_norm": 0.055908203125, + "learning_rate": 0.0020228114277191945, + "loss": 1.2033, + "step": 5275 + }, + { + "epoch": 0.46266180160010084, + "grad_norm": 0.05517578125, + "learning_rate": 0.002022419579235229, + "loss": 1.2272, + "step": 5276 + }, + { + "epoch": 0.4627494933744754, + "grad_norm": 0.057373046875, + "learning_rate": 0.0020220276967875587, + "loss": 1.1007, + "step": 5277 + }, + { + "epoch": 0.4628371851488499, + "grad_norm": 0.091796875, + "learning_rate": 0.0020216357804119234, + "loss": 1.1784, + "step": 5278 + }, + { + "epoch": 0.4629248769232245, + "grad_norm": 0.060546875, + "learning_rate": 0.0020212438301440636, + "loss": 1.2131, + "step": 5279 + }, + { + "epoch": 0.46301256869759905, + "grad_norm": 0.0517578125, + "learning_rate": 0.002020851846019726, + "loss": 1.1642, + "step": 5280 + }, + { + "epoch": 0.46310026047197356, + "grad_norm": 0.05810546875, + "learning_rate": 0.002020459828074656, + "loss": 1.1888, + "step": 5281 + }, + { + "epoch": 0.4631879522463481, + "grad_norm": 0.048095703125, + "learning_rate": 0.002020067776344606, + "loss": 1.0788, + "step": 5282 + }, + { + "epoch": 0.4632756440207227, + "grad_norm": 0.05517578125, + "learning_rate": 0.0020196756908653314, + "loss": 1.2003, + "step": 5283 + }, + { + "epoch": 0.4633633357950972, + "grad_norm": 0.052978515625, + "learning_rate": 0.0020192835716725875, + "loss": 1.2512, + "step": 5284 + }, + { + "epoch": 0.46345102756947176, + "grad_norm": 0.05908203125, + "learning_rate": 0.0020188914188021354, + "loss": 1.1943, + "step": 5285 + }, + { + "epoch": 0.4635387193438463, + "grad_norm": 0.060791015625, + "learning_rate": 0.0020184992322897378, + "loss": 1.1704, + "step": 5286 + }, + { + "epoch": 0.46362641111822084, + "grad_norm": 0.0595703125, + "learning_rate": 0.0020181070121711615, + "loss": 1.1282, + "step": 5287 + }, + { + "epoch": 0.4637141028925954, + "grad_norm": 0.05517578125, + "learning_rate": 0.0020177147584821763, + "loss": 1.1656, + "step": 5288 + }, + { + "epoch": 0.4638017946669699, + "grad_norm": 0.05224609375, + "learning_rate": 0.002017322471258554, + "loss": 1.1281, + "step": 5289 + }, + { + "epoch": 0.4638894864413445, + "grad_norm": 0.08935546875, + "learning_rate": 0.0020169301505360707, + "loss": 1.1506, + "step": 5290 + }, + { + "epoch": 0.46397717821571904, + "grad_norm": 0.056396484375, + "learning_rate": 0.0020165377963505047, + "loss": 1.2141, + "step": 5291 + }, + { + "epoch": 0.46406486999009355, + "grad_norm": 0.078125, + "learning_rate": 0.002016145408737639, + "loss": 1.1921, + "step": 5292 + }, + { + "epoch": 0.4641525617644681, + "grad_norm": 0.062255859375, + "learning_rate": 0.002015752987733256, + "loss": 1.2243, + "step": 5293 + }, + { + "epoch": 0.4642402535388427, + "grad_norm": 0.06396484375, + "learning_rate": 0.002015360533373145, + "loss": 1.2605, + "step": 5294 + }, + { + "epoch": 0.4643279453132172, + "grad_norm": 0.06494140625, + "learning_rate": 0.0020149680456930962, + "loss": 1.197, + "step": 5295 + }, + { + "epoch": 0.46441563708759176, + "grad_norm": 0.083984375, + "learning_rate": 0.002014575524728904, + "loss": 1.1938, + "step": 5296 + }, + { + "epoch": 0.46450332886196627, + "grad_norm": 0.061279296875, + "learning_rate": 0.0020141829705163654, + "loss": 1.1694, + "step": 5297 + }, + { + "epoch": 0.46459102063634083, + "grad_norm": 0.091796875, + "learning_rate": 0.00201379038309128, + "loss": 1.1704, + "step": 5298 + }, + { + "epoch": 0.4646787124107154, + "grad_norm": 0.059814453125, + "learning_rate": 0.00201339776248945, + "loss": 1.1751, + "step": 5299 + }, + { + "epoch": 0.4647664041850899, + "grad_norm": 0.0673828125, + "learning_rate": 0.0020130051087466835, + "loss": 1.1607, + "step": 5300 + }, + { + "epoch": 0.4648540959594645, + "grad_norm": 0.056884765625, + "learning_rate": 0.0020126124218987876, + "loss": 1.1614, + "step": 5301 + }, + { + "epoch": 0.46494178773383904, + "grad_norm": 0.0576171875, + "learning_rate": 0.002012219701981575, + "loss": 1.1535, + "step": 5302 + }, + { + "epoch": 0.46502947950821355, + "grad_norm": 0.0546875, + "learning_rate": 0.00201182694903086, + "loss": 1.1149, + "step": 5303 + }, + { + "epoch": 0.4651171712825881, + "grad_norm": 0.0712890625, + "learning_rate": 0.0020114341630824615, + "loss": 1.1813, + "step": 5304 + }, + { + "epoch": 0.4652048630569627, + "grad_norm": 0.05419921875, + "learning_rate": 0.002011041344172201, + "loss": 1.1763, + "step": 5305 + }, + { + "epoch": 0.4652925548313372, + "grad_norm": 0.07177734375, + "learning_rate": 0.002010648492335901, + "loss": 1.1094, + "step": 5306 + }, + { + "epoch": 0.46538024660571176, + "grad_norm": 0.056396484375, + "learning_rate": 0.0020102556076093906, + "loss": 1.1906, + "step": 5307 + }, + { + "epoch": 0.4654679383800863, + "grad_norm": 0.052978515625, + "learning_rate": 0.002009862690028498, + "loss": 1.218, + "step": 5308 + }, + { + "epoch": 0.46555563015446083, + "grad_norm": 0.052734375, + "learning_rate": 0.0020094697396290575, + "loss": 1.2312, + "step": 5309 + }, + { + "epoch": 0.4656433219288354, + "grad_norm": 0.08154296875, + "learning_rate": 0.0020090767564469045, + "loss": 1.1815, + "step": 5310 + }, + { + "epoch": 0.4657310137032099, + "grad_norm": 0.052490234375, + "learning_rate": 0.002008683740517878, + "loss": 1.2485, + "step": 5311 + }, + { + "epoch": 0.46581870547758447, + "grad_norm": 0.05322265625, + "learning_rate": 0.002008290691877821, + "loss": 1.2737, + "step": 5312 + }, + { + "epoch": 0.46590639725195904, + "grad_norm": 0.07421875, + "learning_rate": 0.0020078976105625773, + "loss": 1.1637, + "step": 5313 + }, + { + "epoch": 0.46599408902633355, + "grad_norm": 0.060546875, + "learning_rate": 0.0020075044966079954, + "loss": 1.1778, + "step": 5314 + }, + { + "epoch": 0.4660817808007081, + "grad_norm": 0.05126953125, + "learning_rate": 0.0020071113500499263, + "loss": 1.1683, + "step": 5315 + }, + { + "epoch": 0.4661694725750827, + "grad_norm": 0.047607421875, + "learning_rate": 0.002006718170924224, + "loss": 1.1503, + "step": 5316 + }, + { + "epoch": 0.4662571643494572, + "grad_norm": 0.078125, + "learning_rate": 0.0020063249592667458, + "loss": 1.2301, + "step": 5317 + }, + { + "epoch": 0.46634485612383175, + "grad_norm": 0.052001953125, + "learning_rate": 0.0020059317151133508, + "loss": 1.1232, + "step": 5318 + }, + { + "epoch": 0.4664325478982063, + "grad_norm": 0.072265625, + "learning_rate": 0.002005538438499902, + "loss": 1.2855, + "step": 5319 + }, + { + "epoch": 0.4665202396725808, + "grad_norm": 0.064453125, + "learning_rate": 0.002005145129462266, + "loss": 1.1561, + "step": 5320 + }, + { + "epoch": 0.4666079314469554, + "grad_norm": 0.0634765625, + "learning_rate": 0.0020047517880363106, + "loss": 1.1757, + "step": 5321 + }, + { + "epoch": 0.4666956232213299, + "grad_norm": 0.07958984375, + "learning_rate": 0.002004358414257908, + "loss": 1.1372, + "step": 5322 + }, + { + "epoch": 0.46678331499570447, + "grad_norm": 0.05615234375, + "learning_rate": 0.002003965008162932, + "loss": 1.2237, + "step": 5323 + }, + { + "epoch": 0.46687100677007903, + "grad_norm": 0.0791015625, + "learning_rate": 0.002003571569787262, + "loss": 1.1755, + "step": 5324 + }, + { + "epoch": 0.46695869854445354, + "grad_norm": 0.058349609375, + "learning_rate": 0.0020031780991667776, + "loss": 1.1883, + "step": 5325 + }, + { + "epoch": 0.4670463903188281, + "grad_norm": 0.08837890625, + "learning_rate": 0.002002784596337362, + "loss": 1.2047, + "step": 5326 + }, + { + "epoch": 0.4671340820932027, + "grad_norm": 0.07666015625, + "learning_rate": 0.002002391061334902, + "loss": 1.2028, + "step": 5327 + }, + { + "epoch": 0.4672217738675772, + "grad_norm": 0.06396484375, + "learning_rate": 0.002001997494195287, + "loss": 1.1238, + "step": 5328 + }, + { + "epoch": 0.46730946564195175, + "grad_norm": 0.1240234375, + "learning_rate": 0.002001603894954409, + "loss": 1.1169, + "step": 5329 + }, + { + "epoch": 0.4673971574163263, + "grad_norm": 0.047119140625, + "learning_rate": 0.002001210263648163, + "loss": 1.1403, + "step": 5330 + }, + { + "epoch": 0.4674848491907008, + "grad_norm": 0.0986328125, + "learning_rate": 0.0020008166003124485, + "loss": 1.1873, + "step": 5331 + }, + { + "epoch": 0.4675725409650754, + "grad_norm": 0.060302734375, + "learning_rate": 0.002000422904983165, + "loss": 1.1732, + "step": 5332 + }, + { + "epoch": 0.4676602327394499, + "grad_norm": 0.0517578125, + "learning_rate": 0.002000029177696218, + "loss": 1.1879, + "step": 5333 + }, + { + "epoch": 0.46774792451382446, + "grad_norm": 0.072265625, + "learning_rate": 0.001999635418487513, + "loss": 1.2192, + "step": 5334 + }, + { + "epoch": 0.46783561628819903, + "grad_norm": 0.09521484375, + "learning_rate": 0.00199924162739296, + "loss": 1.1529, + "step": 5335 + }, + { + "epoch": 0.46792330806257354, + "grad_norm": 0.064453125, + "learning_rate": 0.001998847804448473, + "loss": 1.1675, + "step": 5336 + }, + { + "epoch": 0.4680109998369481, + "grad_norm": 0.061279296875, + "learning_rate": 0.001998453949689966, + "loss": 1.251, + "step": 5337 + }, + { + "epoch": 0.46809869161132267, + "grad_norm": 0.06103515625, + "learning_rate": 0.0019980600631533587, + "loss": 1.1895, + "step": 5338 + }, + { + "epoch": 0.4681863833856972, + "grad_norm": 0.055908203125, + "learning_rate": 0.001997666144874572, + "loss": 1.1433, + "step": 5339 + }, + { + "epoch": 0.46827407516007175, + "grad_norm": 0.062255859375, + "learning_rate": 0.0019972721948895308, + "loss": 1.1403, + "step": 5340 + }, + { + "epoch": 0.4683617669344463, + "grad_norm": 0.059814453125, + "learning_rate": 0.0019968782132341613, + "loss": 1.2059, + "step": 5341 + }, + { + "epoch": 0.4684494587088208, + "grad_norm": 0.061279296875, + "learning_rate": 0.001996484199944395, + "loss": 1.1731, + "step": 5342 + }, + { + "epoch": 0.4685371504831954, + "grad_norm": 0.059814453125, + "learning_rate": 0.0019960901550561635, + "loss": 1.1422, + "step": 5343 + }, + { + "epoch": 0.46862484225756995, + "grad_norm": 0.08935546875, + "learning_rate": 0.001995696078605403, + "loss": 1.185, + "step": 5344 + }, + { + "epoch": 0.46871253403194446, + "grad_norm": 0.058349609375, + "learning_rate": 0.0019953019706280537, + "loss": 1.1869, + "step": 5345 + }, + { + "epoch": 0.468800225806319, + "grad_norm": 0.07958984375, + "learning_rate": 0.001994907831160055, + "loss": 1.1721, + "step": 5346 + }, + { + "epoch": 0.46888791758069354, + "grad_norm": 0.056640625, + "learning_rate": 0.0019945136602373533, + "loss": 1.2218, + "step": 5347 + }, + { + "epoch": 0.4689756093550681, + "grad_norm": 0.04833984375, + "learning_rate": 0.001994119457895894, + "loss": 1.1583, + "step": 5348 + }, + { + "epoch": 0.46906330112944267, + "grad_norm": 0.07275390625, + "learning_rate": 0.001993725224171629, + "loss": 1.1574, + "step": 5349 + }, + { + "epoch": 0.4691509929038172, + "grad_norm": 0.05078125, + "learning_rate": 0.001993330959100511, + "loss": 1.1463, + "step": 5350 + }, + { + "epoch": 0.46923868467819174, + "grad_norm": 0.052734375, + "learning_rate": 0.0019929366627184967, + "loss": 1.2273, + "step": 5351 + }, + { + "epoch": 0.4693263764525663, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019925423350615427, + "loss": 1.2788, + "step": 5352 + }, + { + "epoch": 0.4694140682269408, + "grad_norm": 0.064453125, + "learning_rate": 0.0019921479761656124, + "loss": 1.1574, + "step": 5353 + }, + { + "epoch": 0.4695017600013154, + "grad_norm": 0.049072265625, + "learning_rate": 0.00199175358606667, + "loss": 1.1618, + "step": 5354 + }, + { + "epoch": 0.46958945177568995, + "grad_norm": 0.09033203125, + "learning_rate": 0.001991359164800683, + "loss": 1.1096, + "step": 5355 + }, + { + "epoch": 0.46967714355006446, + "grad_norm": 0.0625, + "learning_rate": 0.0019909647124036214, + "loss": 1.2371, + "step": 5356 + }, + { + "epoch": 0.469764835324439, + "grad_norm": 0.048828125, + "learning_rate": 0.0019905702289114576, + "loss": 1.1693, + "step": 5357 + }, + { + "epoch": 0.46985252709881353, + "grad_norm": 0.10888671875, + "learning_rate": 0.0019901757143601685, + "loss": 1.2045, + "step": 5358 + }, + { + "epoch": 0.4699402188731881, + "grad_norm": 0.06884765625, + "learning_rate": 0.0019897811687857327, + "loss": 1.2171, + "step": 5359 + }, + { + "epoch": 0.47002791064756266, + "grad_norm": 0.06591796875, + "learning_rate": 0.0019893865922241314, + "loss": 1.2155, + "step": 5360 + }, + { + "epoch": 0.4701156024219372, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019889919847113483, + "loss": 1.1602, + "step": 5361 + }, + { + "epoch": 0.47020329419631174, + "grad_norm": 0.05126953125, + "learning_rate": 0.001988597346283372, + "loss": 1.1818, + "step": 5362 + }, + { + "epoch": 0.4702909859706863, + "grad_norm": 0.0537109375, + "learning_rate": 0.001988202676976192, + "loss": 1.1876, + "step": 5363 + }, + { + "epoch": 0.4703786777450608, + "grad_norm": 0.059326171875, + "learning_rate": 0.0019878079768258, + "loss": 1.2834, + "step": 5364 + }, + { + "epoch": 0.4704663695194354, + "grad_norm": 0.05419921875, + "learning_rate": 0.0019874132458681934, + "loss": 1.2145, + "step": 5365 + }, + { + "epoch": 0.47055406129380994, + "grad_norm": 0.051513671875, + "learning_rate": 0.0019870184841393693, + "loss": 1.1583, + "step": 5366 + }, + { + "epoch": 0.47064175306818445, + "grad_norm": 0.05810546875, + "learning_rate": 0.0019866236916753297, + "loss": 1.1915, + "step": 5367 + }, + { + "epoch": 0.470729444842559, + "grad_norm": 0.07958984375, + "learning_rate": 0.0019862288685120787, + "loss": 1.1231, + "step": 5368 + }, + { + "epoch": 0.47081713661693353, + "grad_norm": 0.056396484375, + "learning_rate": 0.0019858340146856226, + "loss": 1.1834, + "step": 5369 + }, + { + "epoch": 0.4709048283913081, + "grad_norm": 0.06591796875, + "learning_rate": 0.001985439130231971, + "loss": 1.1888, + "step": 5370 + }, + { + "epoch": 0.47099252016568266, + "grad_norm": 0.0625, + "learning_rate": 0.001985044215187137, + "loss": 1.1398, + "step": 5371 + }, + { + "epoch": 0.47108021194005717, + "grad_norm": 0.05029296875, + "learning_rate": 0.001984649269587135, + "loss": 1.1932, + "step": 5372 + }, + { + "epoch": 0.47116790371443174, + "grad_norm": 0.0517578125, + "learning_rate": 0.0019842542934679843, + "loss": 1.2102, + "step": 5373 + }, + { + "epoch": 0.4712555954888063, + "grad_norm": 0.072265625, + "learning_rate": 0.001983859286865704, + "loss": 1.1804, + "step": 5374 + }, + { + "epoch": 0.4713432872631808, + "grad_norm": 0.0517578125, + "learning_rate": 0.0019834642498163194, + "loss": 1.1624, + "step": 5375 + }, + { + "epoch": 0.4714309790375554, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019830691823558557, + "loss": 1.1903, + "step": 5376 + }, + { + "epoch": 0.47151867081192994, + "grad_norm": 0.0625, + "learning_rate": 0.0019826740845203424, + "loss": 1.171, + "step": 5377 + }, + { + "epoch": 0.47160636258630445, + "grad_norm": 0.076171875, + "learning_rate": 0.001982278956345811, + "loss": 1.1894, + "step": 5378 + }, + { + "epoch": 0.471694054360679, + "grad_norm": 0.07470703125, + "learning_rate": 0.0019818837978682973, + "loss": 1.169, + "step": 5379 + }, + { + "epoch": 0.4717817461350536, + "grad_norm": 0.06396484375, + "learning_rate": 0.0019814886091238377, + "loss": 1.2408, + "step": 5380 + }, + { + "epoch": 0.4718694379094281, + "grad_norm": 0.059326171875, + "learning_rate": 0.0019810933901484723, + "loss": 1.1323, + "step": 5381 + }, + { + "epoch": 0.47195712968380266, + "grad_norm": 0.054443359375, + "learning_rate": 0.001980698140978245, + "loss": 1.1996, + "step": 5382 + }, + { + "epoch": 0.47204482145817717, + "grad_norm": 0.060546875, + "learning_rate": 0.0019803028616492005, + "loss": 1.1505, + "step": 5383 + }, + { + "epoch": 0.47213251323255173, + "grad_norm": 0.05224609375, + "learning_rate": 0.001979907552197388, + "loss": 1.16, + "step": 5384 + }, + { + "epoch": 0.4722202050069263, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019795122126588585, + "loss": 1.1524, + "step": 5385 + }, + { + "epoch": 0.4723078967813008, + "grad_norm": 0.0634765625, + "learning_rate": 0.0019791168430696652, + "loss": 1.2051, + "step": 5386 + }, + { + "epoch": 0.47239558855567537, + "grad_norm": 0.08154296875, + "learning_rate": 0.001978721443465866, + "loss": 1.1658, + "step": 5387 + }, + { + "epoch": 0.47248328033004994, + "grad_norm": 0.06884765625, + "learning_rate": 0.0019783260138835196, + "loss": 1.2829, + "step": 5388 + }, + { + "epoch": 0.47257097210442445, + "grad_norm": 0.06591796875, + "learning_rate": 0.001977930554358688, + "loss": 1.1749, + "step": 5389 + }, + { + "epoch": 0.472658663878799, + "grad_norm": 0.05810546875, + "learning_rate": 0.0019775350649274366, + "loss": 1.1354, + "step": 5390 + }, + { + "epoch": 0.4727463556531736, + "grad_norm": 0.050048828125, + "learning_rate": 0.0019771395456258333, + "loss": 1.1723, + "step": 5391 + }, + { + "epoch": 0.4728340474275481, + "grad_norm": 0.05078125, + "learning_rate": 0.0019767439964899474, + "loss": 1.1718, + "step": 5392 + }, + { + "epoch": 0.47292173920192265, + "grad_norm": 0.053955078125, + "learning_rate": 0.001976348417555853, + "loss": 1.194, + "step": 5393 + }, + { + "epoch": 0.47300943097629716, + "grad_norm": 0.04833984375, + "learning_rate": 0.001975952808859625, + "loss": 1.1772, + "step": 5394 + }, + { + "epoch": 0.47309712275067173, + "grad_norm": 0.049560546875, + "learning_rate": 0.0019755571704373424, + "loss": 1.1829, + "step": 5395 + }, + { + "epoch": 0.4731848145250463, + "grad_norm": 0.05029296875, + "learning_rate": 0.0019751615023250856, + "loss": 1.202, + "step": 5396 + }, + { + "epoch": 0.4732725062994208, + "grad_norm": 0.0478515625, + "learning_rate": 0.0019747658045589397, + "loss": 1.1656, + "step": 5397 + }, + { + "epoch": 0.47336019807379537, + "grad_norm": 0.048583984375, + "learning_rate": 0.0019743700771749913, + "loss": 1.1332, + "step": 5398 + }, + { + "epoch": 0.47344788984816993, + "grad_norm": 0.056640625, + "learning_rate": 0.0019739743202093285, + "loss": 1.1497, + "step": 5399 + }, + { + "epoch": 0.47353558162254444, + "grad_norm": 0.051025390625, + "learning_rate": 0.0019735785336980447, + "loss": 1.1316, + "step": 5400 + }, + { + "epoch": 0.473623273396919, + "grad_norm": 0.0595703125, + "learning_rate": 0.001973182717677233, + "loss": 1.1576, + "step": 5401 + }, + { + "epoch": 0.4737109651712936, + "grad_norm": 0.055908203125, + "learning_rate": 0.0019727868721829923, + "loss": 1.1987, + "step": 5402 + }, + { + "epoch": 0.4737986569456681, + "grad_norm": 0.057373046875, + "learning_rate": 0.001972390997251422, + "loss": 1.2304, + "step": 5403 + }, + { + "epoch": 0.47388634872004265, + "grad_norm": 0.076171875, + "learning_rate": 0.0019719950929186253, + "loss": 1.2244, + "step": 5404 + }, + { + "epoch": 0.47397404049441716, + "grad_norm": 0.0615234375, + "learning_rate": 0.001971599159220707, + "loss": 1.1542, + "step": 5405 + }, + { + "epoch": 0.4740617322687917, + "grad_norm": 0.09033203125, + "learning_rate": 0.0019712031961937756, + "loss": 1.1553, + "step": 5406 + }, + { + "epoch": 0.4741494240431663, + "grad_norm": 0.057861328125, + "learning_rate": 0.0019708072038739422, + "loss": 1.1995, + "step": 5407 + }, + { + "epoch": 0.4742371158175408, + "grad_norm": 0.07275390625, + "learning_rate": 0.00197041118229732, + "loss": 1.2107, + "step": 5408 + }, + { + "epoch": 0.47432480759191537, + "grad_norm": 0.0546875, + "learning_rate": 0.0019700151315000247, + "loss": 1.1998, + "step": 5409 + }, + { + "epoch": 0.47441249936628993, + "grad_norm": 0.057373046875, + "learning_rate": 0.001969619051518176, + "loss": 1.1977, + "step": 5410 + }, + { + "epoch": 0.47450019114066444, + "grad_norm": 0.052734375, + "learning_rate": 0.001969222942387895, + "loss": 1.1719, + "step": 5411 + }, + { + "epoch": 0.474587882915039, + "grad_norm": 0.07080078125, + "learning_rate": 0.001968826804145305, + "loss": 1.1768, + "step": 5412 + }, + { + "epoch": 0.47467557468941357, + "grad_norm": 0.0537109375, + "learning_rate": 0.001968430636826534, + "loss": 1.1132, + "step": 5413 + }, + { + "epoch": 0.4747632664637881, + "grad_norm": 0.04931640625, + "learning_rate": 0.0019680344404677105, + "loss": 1.1454, + "step": 5414 + }, + { + "epoch": 0.47485095823816265, + "grad_norm": 0.054931640625, + "learning_rate": 0.0019676382151049675, + "loss": 1.2117, + "step": 5415 + }, + { + "epoch": 0.47493865001253716, + "grad_norm": 0.078125, + "learning_rate": 0.001967241960774439, + "loss": 1.247, + "step": 5416 + }, + { + "epoch": 0.4750263417869117, + "grad_norm": 0.06005859375, + "learning_rate": 0.001966845677512263, + "loss": 1.1018, + "step": 5417 + }, + { + "epoch": 0.4751140335612863, + "grad_norm": 0.06494140625, + "learning_rate": 0.001966449365354579, + "loss": 1.1767, + "step": 5418 + }, + { + "epoch": 0.4752017253356608, + "grad_norm": 0.06201171875, + "learning_rate": 0.0019660530243375295, + "loss": 1.165, + "step": 5419 + }, + { + "epoch": 0.47528941711003536, + "grad_norm": 0.049560546875, + "learning_rate": 0.0019656566544972603, + "loss": 1.1854, + "step": 5420 + }, + { + "epoch": 0.4753771088844099, + "grad_norm": 0.0654296875, + "learning_rate": 0.001965260255869919, + "loss": 1.1484, + "step": 5421 + }, + { + "epoch": 0.47546480065878444, + "grad_norm": 0.056884765625, + "learning_rate": 0.001964863828491656, + "loss": 1.1347, + "step": 5422 + }, + { + "epoch": 0.475552492433159, + "grad_norm": 0.0498046875, + "learning_rate": 0.0019644673723986246, + "loss": 1.1231, + "step": 5423 + }, + { + "epoch": 0.47564018420753357, + "grad_norm": 0.054931640625, + "learning_rate": 0.001964070887626981, + "loss": 1.1108, + "step": 5424 + }, + { + "epoch": 0.4757278759819081, + "grad_norm": 0.048583984375, + "learning_rate": 0.0019636743742128832, + "loss": 1.1696, + "step": 5425 + }, + { + "epoch": 0.47581556775628264, + "grad_norm": 0.056884765625, + "learning_rate": 0.0019632778321924923, + "loss": 1.1713, + "step": 5426 + }, + { + "epoch": 0.4759032595306572, + "grad_norm": 0.051513671875, + "learning_rate": 0.001962881261601972, + "loss": 1.1142, + "step": 5427 + }, + { + "epoch": 0.4759909513050317, + "grad_norm": 0.06689453125, + "learning_rate": 0.0019624846624774874, + "loss": 1.1895, + "step": 5428 + }, + { + "epoch": 0.4760786430794063, + "grad_norm": 0.04833984375, + "learning_rate": 0.0019620880348552093, + "loss": 1.1786, + "step": 5429 + }, + { + "epoch": 0.4761663348537808, + "grad_norm": 0.06640625, + "learning_rate": 0.0019616913787713075, + "loss": 1.1989, + "step": 5430 + }, + { + "epoch": 0.47625402662815536, + "grad_norm": 0.052978515625, + "learning_rate": 0.001961294694261957, + "loss": 1.2001, + "step": 5431 + }, + { + "epoch": 0.4763417184025299, + "grad_norm": 0.04833984375, + "learning_rate": 0.001960897981363333, + "loss": 1.1706, + "step": 5432 + }, + { + "epoch": 0.47642941017690443, + "grad_norm": 0.06298828125, + "learning_rate": 0.0019605012401116167, + "loss": 1.2116, + "step": 5433 + }, + { + "epoch": 0.476517101951279, + "grad_norm": 0.07080078125, + "learning_rate": 0.0019601044705429885, + "loss": 1.253, + "step": 5434 + }, + { + "epoch": 0.47660479372565356, + "grad_norm": 0.0712890625, + "learning_rate": 0.001959707672693633, + "loss": 1.2066, + "step": 5435 + }, + { + "epoch": 0.4766924855000281, + "grad_norm": 0.06982421875, + "learning_rate": 0.001959310846599738, + "loss": 1.2256, + "step": 5436 + }, + { + "epoch": 0.47678017727440264, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019589139922974916, + "loss": 1.2722, + "step": 5437 + }, + { + "epoch": 0.4768678690487772, + "grad_norm": 0.0537109375, + "learning_rate": 0.0019585171098230867, + "loss": 1.1843, + "step": 5438 + }, + { + "epoch": 0.4769555608231517, + "grad_norm": 0.0654296875, + "learning_rate": 0.0019581201992127173, + "loss": 1.1852, + "step": 5439 + }, + { + "epoch": 0.4770432525975263, + "grad_norm": 0.068359375, + "learning_rate": 0.001957723260502582, + "loss": 1.1629, + "step": 5440 + }, + { + "epoch": 0.4771309443719008, + "grad_norm": 0.058837890625, + "learning_rate": 0.001957326293728879, + "loss": 1.1815, + "step": 5441 + }, + { + "epoch": 0.47721863614627535, + "grad_norm": 0.057373046875, + "learning_rate": 0.001956929298927812, + "loss": 1.2097, + "step": 5442 + }, + { + "epoch": 0.4773063279206499, + "grad_norm": 0.0703125, + "learning_rate": 0.0019565322761355844, + "loss": 1.1467, + "step": 5443 + }, + { + "epoch": 0.47739401969502443, + "grad_norm": 0.06005859375, + "learning_rate": 0.0019561352253884058, + "loss": 1.1584, + "step": 5444 + }, + { + "epoch": 0.477481711469399, + "grad_norm": 0.05419921875, + "learning_rate": 0.001955738146722484, + "loss": 1.1315, + "step": 5445 + }, + { + "epoch": 0.47756940324377356, + "grad_norm": 0.08203125, + "learning_rate": 0.0019553410401740323, + "loss": 1.1802, + "step": 5446 + }, + { + "epoch": 0.47765709501814807, + "grad_norm": 0.0556640625, + "learning_rate": 0.0019549439057792665, + "loss": 1.1644, + "step": 5447 + }, + { + "epoch": 0.47774478679252264, + "grad_norm": 0.0791015625, + "learning_rate": 0.0019545467435744036, + "loss": 1.138, + "step": 5448 + }, + { + "epoch": 0.4778324785668972, + "grad_norm": 0.04931640625, + "learning_rate": 0.0019541495535956637, + "loss": 1.1394, + "step": 5449 + }, + { + "epoch": 0.4779201703412717, + "grad_norm": 0.07177734375, + "learning_rate": 0.0019537523358792697, + "loss": 1.1708, + "step": 5450 + }, + { + "epoch": 0.4780078621156463, + "grad_norm": 0.05859375, + "learning_rate": 0.0019533550904614473, + "loss": 1.1702, + "step": 5451 + }, + { + "epoch": 0.4780955538900208, + "grad_norm": 0.0576171875, + "learning_rate": 0.0019529578173784234, + "loss": 1.1644, + "step": 5452 + }, + { + "epoch": 0.47818324566439535, + "grad_norm": 0.052490234375, + "learning_rate": 0.0019525605166664285, + "loss": 1.1508, + "step": 5453 + }, + { + "epoch": 0.4782709374387699, + "grad_norm": 0.080078125, + "learning_rate": 0.0019521631883616956, + "loss": 1.1322, + "step": 5454 + }, + { + "epoch": 0.4783586292131444, + "grad_norm": 0.0771484375, + "learning_rate": 0.0019517658325004603, + "loss": 1.1928, + "step": 5455 + }, + { + "epoch": 0.478446320987519, + "grad_norm": 0.06494140625, + "learning_rate": 0.0019513684491189595, + "loss": 1.1934, + "step": 5456 + }, + { + "epoch": 0.47853401276189356, + "grad_norm": 0.059326171875, + "learning_rate": 0.0019509710382534346, + "loss": 1.1654, + "step": 5457 + }, + { + "epoch": 0.47862170453626807, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019505735999401275, + "loss": 1.1728, + "step": 5458 + }, + { + "epoch": 0.47870939631064263, + "grad_norm": 0.0771484375, + "learning_rate": 0.0019501761342152844, + "loss": 1.1689, + "step": 5459 + }, + { + "epoch": 0.4787970880850172, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019497786411151524, + "loss": 1.2006, + "step": 5460 + }, + { + "epoch": 0.4788847798593917, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019493811206759827, + "loss": 1.1617, + "step": 5461 + }, + { + "epoch": 0.4789724716337663, + "grad_norm": 0.0634765625, + "learning_rate": 0.0019489835729340273, + "loss": 1.2235, + "step": 5462 + }, + { + "epoch": 0.47906016340814084, + "grad_norm": 0.1259765625, + "learning_rate": 0.001948585997925542, + "loss": 1.1347, + "step": 5463 + }, + { + "epoch": 0.47914785518251535, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019481883956867846, + "loss": 1.1704, + "step": 5464 + }, + { + "epoch": 0.4792355469568899, + "grad_norm": 0.0517578125, + "learning_rate": 0.001947790766254015, + "loss": 1.1714, + "step": 5465 + }, + { + "epoch": 0.4793232387312644, + "grad_norm": 0.09765625, + "learning_rate": 0.0019473931096634963, + "loss": 1.2116, + "step": 5466 + }, + { + "epoch": 0.479410930505639, + "grad_norm": 0.05517578125, + "learning_rate": 0.0019469954259514939, + "loss": 1.2081, + "step": 5467 + }, + { + "epoch": 0.47949862228001355, + "grad_norm": 0.087890625, + "learning_rate": 0.0019465977151542752, + "loss": 1.1785, + "step": 5468 + }, + { + "epoch": 0.47958631405438806, + "grad_norm": 0.055419921875, + "learning_rate": 0.0019461999773081108, + "loss": 1.1121, + "step": 5469 + }, + { + "epoch": 0.47967400582876263, + "grad_norm": 0.057373046875, + "learning_rate": 0.0019458022124492727, + "loss": 1.1052, + "step": 5470 + }, + { + "epoch": 0.4797616976031372, + "grad_norm": 0.0703125, + "learning_rate": 0.0019454044206140368, + "loss": 1.1249, + "step": 5471 + }, + { + "epoch": 0.4798493893775117, + "grad_norm": 0.0673828125, + "learning_rate": 0.0019450066018386798, + "loss": 1.2033, + "step": 5472 + }, + { + "epoch": 0.47993708115188627, + "grad_norm": 0.08203125, + "learning_rate": 0.001944608756159483, + "loss": 1.118, + "step": 5473 + }, + { + "epoch": 0.48002477292626083, + "grad_norm": 0.058349609375, + "learning_rate": 0.0019442108836127275, + "loss": 1.189, + "step": 5474 + }, + { + "epoch": 0.48011246470063534, + "grad_norm": 0.10498046875, + "learning_rate": 0.0019438129842346992, + "loss": 1.1394, + "step": 5475 + }, + { + "epoch": 0.4802001564750099, + "grad_norm": 0.05615234375, + "learning_rate": 0.001943415058061685, + "loss": 1.1106, + "step": 5476 + }, + { + "epoch": 0.4802878482493844, + "grad_norm": 0.0927734375, + "learning_rate": 0.0019430171051299753, + "loss": 1.1651, + "step": 5477 + }, + { + "epoch": 0.480375540023759, + "grad_norm": 0.09912109375, + "learning_rate": 0.001942619125475862, + "loss": 1.1356, + "step": 5478 + }, + { + "epoch": 0.48046323179813355, + "grad_norm": 0.068359375, + "learning_rate": 0.0019422211191356396, + "loss": 1.1435, + "step": 5479 + }, + { + "epoch": 0.48055092357250806, + "grad_norm": 0.09130859375, + "learning_rate": 0.0019418230861456055, + "loss": 1.1986, + "step": 5480 + }, + { + "epoch": 0.4806386153468826, + "grad_norm": 0.0576171875, + "learning_rate": 0.0019414250265420594, + "loss": 1.1725, + "step": 5481 + }, + { + "epoch": 0.4807263071212572, + "grad_norm": 0.0556640625, + "learning_rate": 0.0019410269403613031, + "loss": 1.1358, + "step": 5482 + }, + { + "epoch": 0.4808139988956317, + "grad_norm": 0.07080078125, + "learning_rate": 0.0019406288276396415, + "loss": 1.1441, + "step": 5483 + }, + { + "epoch": 0.48090169067000627, + "grad_norm": 0.0732421875, + "learning_rate": 0.0019402306884133806, + "loss": 1.1403, + "step": 5484 + }, + { + "epoch": 0.48098938244438083, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019398325227188305, + "loss": 1.1858, + "step": 5485 + }, + { + "epoch": 0.48107707421875534, + "grad_norm": 0.06640625, + "learning_rate": 0.0019394343305923025, + "loss": 1.2389, + "step": 5486 + }, + { + "epoch": 0.4811647659931299, + "grad_norm": 0.053955078125, + "learning_rate": 0.0019390361120701105, + "loss": 1.212, + "step": 5487 + }, + { + "epoch": 0.4812524577675044, + "grad_norm": 0.052734375, + "learning_rate": 0.0019386378671885712, + "loss": 1.1715, + "step": 5488 + }, + { + "epoch": 0.481340149541879, + "grad_norm": 0.054443359375, + "learning_rate": 0.001938239595984004, + "loss": 1.1256, + "step": 5489 + }, + { + "epoch": 0.48142784131625355, + "grad_norm": 0.0654296875, + "learning_rate": 0.001937841298492729, + "loss": 1.1571, + "step": 5490 + }, + { + "epoch": 0.48151553309062806, + "grad_norm": 0.04931640625, + "learning_rate": 0.0019374429747510712, + "loss": 1.1195, + "step": 5491 + }, + { + "epoch": 0.4816032248650026, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019370446247953558, + "loss": 1.132, + "step": 5492 + }, + { + "epoch": 0.4816909166393772, + "grad_norm": 0.060791015625, + "learning_rate": 0.0019366462486619118, + "loss": 1.1541, + "step": 5493 + }, + { + "epoch": 0.4817786084137517, + "grad_norm": 0.07421875, + "learning_rate": 0.0019362478463870697, + "loss": 1.2695, + "step": 5494 + }, + { + "epoch": 0.48186630018812626, + "grad_norm": 0.052734375, + "learning_rate": 0.0019358494180071635, + "loss": 1.1426, + "step": 5495 + }, + { + "epoch": 0.4819539919625008, + "grad_norm": 0.05029296875, + "learning_rate": 0.0019354509635585277, + "loss": 1.2493, + "step": 5496 + }, + { + "epoch": 0.48204168373687534, + "grad_norm": 0.058349609375, + "learning_rate": 0.0019350524830775014, + "loss": 1.2335, + "step": 5497 + }, + { + "epoch": 0.4821293755112499, + "grad_norm": 0.07568359375, + "learning_rate": 0.0019346539766004242, + "loss": 1.1303, + "step": 5498 + }, + { + "epoch": 0.48221706728562447, + "grad_norm": 0.06201171875, + "learning_rate": 0.0019342554441636388, + "loss": 1.1783, + "step": 5499 + }, + { + "epoch": 0.482304759059999, + "grad_norm": 0.0751953125, + "learning_rate": 0.001933856885803491, + "loss": 1.2055, + "step": 5500 + }, + { + "epoch": 0.482304759059999, + "eval_loss": 1.1834715604782104, + "eval_runtime": 428.3748, + "eval_samples_per_second": 33.725, + "eval_steps_per_second": 8.432, + "step": 5500 + }, + { + "epoch": 0.48239245083437354, + "grad_norm": 0.0634765625, + "learning_rate": 0.001933458301556328, + "loss": 1.1604, + "step": 5501 + }, + { + "epoch": 0.48248014260874805, + "grad_norm": 0.08447265625, + "learning_rate": 0.0019330596914585, + "loss": 1.1278, + "step": 5502 + }, + { + "epoch": 0.4825678343831226, + "grad_norm": 0.09716796875, + "learning_rate": 0.0019326610555463588, + "loss": 1.133, + "step": 5503 + }, + { + "epoch": 0.4826555261574972, + "grad_norm": 0.0595703125, + "learning_rate": 0.001932262393856259, + "loss": 1.1856, + "step": 5504 + }, + { + "epoch": 0.4827432179318717, + "grad_norm": 0.08544921875, + "learning_rate": 0.0019318637064245571, + "loss": 1.1695, + "step": 5505 + }, + { + "epoch": 0.48283090970624626, + "grad_norm": 0.0732421875, + "learning_rate": 0.001931464993287613, + "loss": 1.1824, + "step": 5506 + }, + { + "epoch": 0.4829186014806208, + "grad_norm": 0.053466796875, + "learning_rate": 0.001931066254481788, + "loss": 1.1753, + "step": 5507 + }, + { + "epoch": 0.48300629325499533, + "grad_norm": 0.07763671875, + "learning_rate": 0.0019306674900434464, + "loss": 1.1074, + "step": 5508 + }, + { + "epoch": 0.4830939850293699, + "grad_norm": 0.0732421875, + "learning_rate": 0.001930268700008954, + "loss": 1.1815, + "step": 5509 + }, + { + "epoch": 0.48318167680374446, + "grad_norm": 0.07763671875, + "learning_rate": 0.0019298698844146798, + "loss": 1.1893, + "step": 5510 + }, + { + "epoch": 0.483269368578119, + "grad_norm": 0.07080078125, + "learning_rate": 0.0019294710432969946, + "loss": 1.1488, + "step": 5511 + }, + { + "epoch": 0.48335706035249354, + "grad_norm": 0.054931640625, + "learning_rate": 0.001929072176692272, + "loss": 1.1032, + "step": 5512 + }, + { + "epoch": 0.48344475212686805, + "grad_norm": 0.07666015625, + "learning_rate": 0.001928673284636887, + "loss": 1.2194, + "step": 5513 + }, + { + "epoch": 0.4835324439012426, + "grad_norm": 0.0712890625, + "learning_rate": 0.0019282743671672174, + "loss": 1.1615, + "step": 5514 + }, + { + "epoch": 0.4836201356756172, + "grad_norm": 0.06103515625, + "learning_rate": 0.0019278754243196449, + "loss": 1.1459, + "step": 5515 + }, + { + "epoch": 0.4837078274499917, + "grad_norm": 0.0615234375, + "learning_rate": 0.0019274764561305502, + "loss": 1.1523, + "step": 5516 + }, + { + "epoch": 0.48379551922436625, + "grad_norm": 0.0615234375, + "learning_rate": 0.0019270774626363194, + "loss": 1.2307, + "step": 5517 + }, + { + "epoch": 0.4838832109987408, + "grad_norm": 0.10400390625, + "learning_rate": 0.0019266784438733387, + "loss": 1.1652, + "step": 5518 + }, + { + "epoch": 0.48397090277311533, + "grad_norm": 0.0693359375, + "learning_rate": 0.001926279399877999, + "loss": 1.1742, + "step": 5519 + }, + { + "epoch": 0.4840585945474899, + "grad_norm": 0.0869140625, + "learning_rate": 0.001925880330686691, + "loss": 1.2336, + "step": 5520 + }, + { + "epoch": 0.48414628632186446, + "grad_norm": 0.062255859375, + "learning_rate": 0.0019254812363358087, + "loss": 1.1945, + "step": 5521 + }, + { + "epoch": 0.48423397809623897, + "grad_norm": 0.06396484375, + "learning_rate": 0.0019250821168617493, + "loss": 1.1766, + "step": 5522 + }, + { + "epoch": 0.48432166987061354, + "grad_norm": 0.080078125, + "learning_rate": 0.0019246829723009102, + "loss": 1.1674, + "step": 5523 + }, + { + "epoch": 0.48440936164498805, + "grad_norm": 0.06787109375, + "learning_rate": 0.0019242838026896935, + "loss": 1.1515, + "step": 5524 + }, + { + "epoch": 0.4844970534193626, + "grad_norm": 0.052490234375, + "learning_rate": 0.0019238846080645022, + "loss": 1.1357, + "step": 5525 + }, + { + "epoch": 0.4845847451937372, + "grad_norm": 0.05615234375, + "learning_rate": 0.0019234853884617417, + "loss": 1.1147, + "step": 5526 + }, + { + "epoch": 0.4846724369681117, + "grad_norm": 0.0595703125, + "learning_rate": 0.0019230861439178194, + "loss": 1.1837, + "step": 5527 + }, + { + "epoch": 0.48476012874248625, + "grad_norm": 0.051025390625, + "learning_rate": 0.0019226868744691462, + "loss": 1.228, + "step": 5528 + }, + { + "epoch": 0.4848478205168608, + "grad_norm": 0.05419921875, + "learning_rate": 0.001922287580152134, + "loss": 1.1785, + "step": 5529 + }, + { + "epoch": 0.4849355122912353, + "grad_norm": 0.048828125, + "learning_rate": 0.001921888261003197, + "loss": 1.1373, + "step": 5530 + }, + { + "epoch": 0.4850232040656099, + "grad_norm": 0.054443359375, + "learning_rate": 0.0019214889170587528, + "loss": 1.1214, + "step": 5531 + }, + { + "epoch": 0.48511089583998446, + "grad_norm": 0.0830078125, + "learning_rate": 0.00192108954835522, + "loss": 1.2121, + "step": 5532 + }, + { + "epoch": 0.48519858761435897, + "grad_norm": 0.0966796875, + "learning_rate": 0.0019206901549290202, + "loss": 1.1636, + "step": 5533 + }, + { + "epoch": 0.48528627938873353, + "grad_norm": 0.1279296875, + "learning_rate": 0.001920290736816577, + "loss": 1.1863, + "step": 5534 + }, + { + "epoch": 0.4853739711631081, + "grad_norm": 0.05517578125, + "learning_rate": 0.0019198912940543167, + "loss": 1.1505, + "step": 5535 + }, + { + "epoch": 0.4854616629374826, + "grad_norm": 0.12255859375, + "learning_rate": 0.0019194918266786667, + "loss": 1.2243, + "step": 5536 + }, + { + "epoch": 0.4855493547118572, + "grad_norm": 0.07177734375, + "learning_rate": 0.0019190923347260586, + "loss": 1.1104, + "step": 5537 + }, + { + "epoch": 0.4856370464862317, + "grad_norm": 0.061767578125, + "learning_rate": 0.001918692818232923, + "loss": 1.2346, + "step": 5538 + }, + { + "epoch": 0.48572473826060625, + "grad_norm": 0.095703125, + "learning_rate": 0.001918293277235697, + "loss": 1.1643, + "step": 5539 + }, + { + "epoch": 0.4858124300349808, + "grad_norm": 0.0654296875, + "learning_rate": 0.0019178937117708163, + "loss": 1.1439, + "step": 5540 + }, + { + "epoch": 0.4859001218093553, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019174941218747204, + "loss": 1.1538, + "step": 5541 + }, + { + "epoch": 0.4859878135837299, + "grad_norm": 0.08154296875, + "learning_rate": 0.0019170945075838515, + "loss": 1.1435, + "step": 5542 + }, + { + "epoch": 0.48607550535810445, + "grad_norm": 0.0673828125, + "learning_rate": 0.0019166948689346528, + "loss": 1.1268, + "step": 5543 + }, + { + "epoch": 0.48616319713247896, + "grad_norm": 0.08203125, + "learning_rate": 0.0019162952059635706, + "loss": 1.204, + "step": 5544 + }, + { + "epoch": 0.48625088890685353, + "grad_norm": 0.08642578125, + "learning_rate": 0.0019158955187070536, + "loss": 1.1307, + "step": 5545 + }, + { + "epoch": 0.4863385806812281, + "grad_norm": 0.05224609375, + "learning_rate": 0.0019154958072015512, + "loss": 1.2306, + "step": 5546 + }, + { + "epoch": 0.4864262724556026, + "grad_norm": 0.056396484375, + "learning_rate": 0.0019150960714835162, + "loss": 1.2036, + "step": 5547 + }, + { + "epoch": 0.48651396422997717, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019146963115894045, + "loss": 1.1675, + "step": 5548 + }, + { + "epoch": 0.4866016560043517, + "grad_norm": 0.050048828125, + "learning_rate": 0.0019142965275556717, + "loss": 1.1721, + "step": 5549 + }, + { + "epoch": 0.48668934777872624, + "grad_norm": 0.050537109375, + "learning_rate": 0.0019138967194187785, + "loss": 1.1753, + "step": 5550 + }, + { + "epoch": 0.4867770395531008, + "grad_norm": 0.061279296875, + "learning_rate": 0.0019134968872151856, + "loss": 1.1615, + "step": 5551 + }, + { + "epoch": 0.4868647313274753, + "grad_norm": 0.056884765625, + "learning_rate": 0.001913097030981357, + "loss": 1.159, + "step": 5552 + }, + { + "epoch": 0.4869524231018499, + "grad_norm": 0.05029296875, + "learning_rate": 0.0019126971507537577, + "loss": 1.1597, + "step": 5553 + }, + { + "epoch": 0.48704011487622445, + "grad_norm": 0.05224609375, + "learning_rate": 0.0019122972465688568, + "loss": 1.1994, + "step": 5554 + }, + { + "epoch": 0.48712780665059896, + "grad_norm": 0.07421875, + "learning_rate": 0.0019118973184631244, + "loss": 1.1902, + "step": 5555 + }, + { + "epoch": 0.4872154984249735, + "grad_norm": 0.05419921875, + "learning_rate": 0.001911497366473032, + "loss": 1.1935, + "step": 5556 + }, + { + "epoch": 0.4873031901993481, + "grad_norm": 0.07666015625, + "learning_rate": 0.0019110973906350556, + "loss": 1.169, + "step": 5557 + }, + { + "epoch": 0.4873908819737226, + "grad_norm": 0.0673828125, + "learning_rate": 0.0019106973909856708, + "loss": 1.1402, + "step": 5558 + }, + { + "epoch": 0.48747857374809717, + "grad_norm": 0.055908203125, + "learning_rate": 0.0019102973675613568, + "loss": 1.1357, + "step": 5559 + }, + { + "epoch": 0.4875662655224717, + "grad_norm": 0.11572265625, + "learning_rate": 0.0019098973203985948, + "loss": 1.1902, + "step": 5560 + }, + { + "epoch": 0.48765395729684624, + "grad_norm": 0.058349609375, + "learning_rate": 0.0019094972495338684, + "loss": 1.1765, + "step": 5561 + }, + { + "epoch": 0.4877416490712208, + "grad_norm": 0.06982421875, + "learning_rate": 0.0019090971550036627, + "loss": 1.1093, + "step": 5562 + }, + { + "epoch": 0.4878293408455953, + "grad_norm": 0.048828125, + "learning_rate": 0.0019086970368444654, + "loss": 1.1568, + "step": 5563 + }, + { + "epoch": 0.4879170326199699, + "grad_norm": 0.054443359375, + "learning_rate": 0.001908296895092766, + "loss": 1.1526, + "step": 5564 + }, + { + "epoch": 0.48800472439434445, + "grad_norm": 0.0810546875, + "learning_rate": 0.0019078967297850562, + "loss": 1.2091, + "step": 5565 + }, + { + "epoch": 0.48809241616871896, + "grad_norm": 0.05615234375, + "learning_rate": 0.0019074965409578307, + "loss": 1.226, + "step": 5566 + }, + { + "epoch": 0.4881801079430935, + "grad_norm": 0.09423828125, + "learning_rate": 0.0019070963286475858, + "loss": 1.1625, + "step": 5567 + }, + { + "epoch": 0.4882677997174681, + "grad_norm": 0.08544921875, + "learning_rate": 0.001906696092890819, + "loss": 1.1822, + "step": 5568 + }, + { + "epoch": 0.4883554914918426, + "grad_norm": 0.0654296875, + "learning_rate": 0.0019062958337240311, + "loss": 1.1377, + "step": 5569 + }, + { + "epoch": 0.48844318326621716, + "grad_norm": 0.07421875, + "learning_rate": 0.0019058955511837246, + "loss": 1.1434, + "step": 5570 + }, + { + "epoch": 0.4885308750405917, + "grad_norm": 0.07763671875, + "learning_rate": 0.001905495245306405, + "loss": 1.1226, + "step": 5571 + }, + { + "epoch": 0.48861856681496624, + "grad_norm": 0.0615234375, + "learning_rate": 0.0019050949161285781, + "loss": 1.1819, + "step": 5572 + }, + { + "epoch": 0.4887062585893408, + "grad_norm": 0.0791015625, + "learning_rate": 0.001904694563686754, + "loss": 1.152, + "step": 5573 + }, + { + "epoch": 0.4887939503637153, + "grad_norm": 0.06201171875, + "learning_rate": 0.0019042941880174425, + "loss": 1.2202, + "step": 5574 + }, + { + "epoch": 0.4888816421380899, + "grad_norm": 0.0732421875, + "learning_rate": 0.0019038937891571578, + "loss": 1.2113, + "step": 5575 + }, + { + "epoch": 0.48896933391246444, + "grad_norm": 0.12060546875, + "learning_rate": 0.001903493367142415, + "loss": 1.1988, + "step": 5576 + }, + { + "epoch": 0.48905702568683895, + "grad_norm": 0.056396484375, + "learning_rate": 0.0019030929220097317, + "loss": 1.1405, + "step": 5577 + }, + { + "epoch": 0.4891447174612135, + "grad_norm": 0.0849609375, + "learning_rate": 0.0019026924537956266, + "loss": 1.1408, + "step": 5578 + }, + { + "epoch": 0.4892324092355881, + "grad_norm": 0.08349609375, + "learning_rate": 0.001902291962536623, + "loss": 1.1574, + "step": 5579 + }, + { + "epoch": 0.4893201010099626, + "grad_norm": 0.052734375, + "learning_rate": 0.0019018914482692433, + "loss": 1.1742, + "step": 5580 + }, + { + "epoch": 0.48940779278433716, + "grad_norm": 0.068359375, + "learning_rate": 0.0019014909110300132, + "loss": 1.2344, + "step": 5581 + }, + { + "epoch": 0.4894954845587117, + "grad_norm": 0.057373046875, + "learning_rate": 0.0019010903508554617, + "loss": 1.1444, + "step": 5582 + }, + { + "epoch": 0.48958317633308623, + "grad_norm": 0.056884765625, + "learning_rate": 0.0019006897677821182, + "loss": 1.209, + "step": 5583 + }, + { + "epoch": 0.4896708681074608, + "grad_norm": 0.061767578125, + "learning_rate": 0.0019002891618465152, + "loss": 1.1928, + "step": 5584 + }, + { + "epoch": 0.4897585598818353, + "grad_norm": 0.05078125, + "learning_rate": 0.0018998885330851866, + "loss": 1.0958, + "step": 5585 + }, + { + "epoch": 0.4898462516562099, + "grad_norm": 0.049560546875, + "learning_rate": 0.0018994878815346695, + "loss": 1.1861, + "step": 5586 + }, + { + "epoch": 0.48993394343058444, + "grad_norm": 0.04638671875, + "learning_rate": 0.0018990872072315009, + "loss": 1.1376, + "step": 5587 + }, + { + "epoch": 0.49002163520495895, + "grad_norm": 0.06103515625, + "learning_rate": 0.0018986865102122226, + "loss": 1.1151, + "step": 5588 + }, + { + "epoch": 0.4901093269793335, + "grad_norm": 0.06787109375, + "learning_rate": 0.001898285790513376, + "loss": 1.1595, + "step": 5589 + }, + { + "epoch": 0.4901970187537081, + "grad_norm": 0.059326171875, + "learning_rate": 0.001897885048171507, + "loss": 1.1631, + "step": 5590 + }, + { + "epoch": 0.4902847105280826, + "grad_norm": 0.0703125, + "learning_rate": 0.0018974842832231603, + "loss": 1.1789, + "step": 5591 + }, + { + "epoch": 0.49037240230245716, + "grad_norm": 0.055419921875, + "learning_rate": 0.001897083495704887, + "loss": 1.1362, + "step": 5592 + }, + { + "epoch": 0.4904600940768317, + "grad_norm": 0.054443359375, + "learning_rate": 0.001896682685653236, + "loss": 1.1265, + "step": 5593 + }, + { + "epoch": 0.49054778585120623, + "grad_norm": 0.057373046875, + "learning_rate": 0.0018962818531047612, + "loss": 1.1857, + "step": 5594 + }, + { + "epoch": 0.4906354776255808, + "grad_norm": 0.061767578125, + "learning_rate": 0.001895880998096017, + "loss": 1.0938, + "step": 5595 + }, + { + "epoch": 0.4907231693999553, + "grad_norm": 0.052001953125, + "learning_rate": 0.0018954801206635607, + "loss": 1.1917, + "step": 5596 + }, + { + "epoch": 0.49081086117432987, + "grad_norm": 0.06396484375, + "learning_rate": 0.0018950792208439515, + "loss": 1.1448, + "step": 5597 + }, + { + "epoch": 0.49089855294870444, + "grad_norm": 0.07275390625, + "learning_rate": 0.0018946782986737495, + "loss": 1.1591, + "step": 5598 + }, + { + "epoch": 0.49098624472307895, + "grad_norm": 0.051025390625, + "learning_rate": 0.001894277354189518, + "loss": 1.1763, + "step": 5599 + }, + { + "epoch": 0.4910739364974535, + "grad_norm": 0.08544921875, + "learning_rate": 0.0018938763874278227, + "loss": 1.159, + "step": 5600 + }, + { + "epoch": 0.4911616282718281, + "grad_norm": 0.0576171875, + "learning_rate": 0.0018934753984252309, + "loss": 1.1632, + "step": 5601 + }, + { + "epoch": 0.4912493200462026, + "grad_norm": 0.06201171875, + "learning_rate": 0.0018930743872183104, + "loss": 1.1699, + "step": 5602 + }, + { + "epoch": 0.49133701182057715, + "grad_norm": 0.0576171875, + "learning_rate": 0.001892673353843634, + "loss": 1.1563, + "step": 5603 + }, + { + "epoch": 0.4914247035949517, + "grad_norm": 0.061279296875, + "learning_rate": 0.0018922722983377737, + "loss": 1.1437, + "step": 5604 + }, + { + "epoch": 0.4915123953693262, + "grad_norm": 0.06591796875, + "learning_rate": 0.0018918712207373059, + "loss": 1.1474, + "step": 5605 + }, + { + "epoch": 0.4916000871437008, + "grad_norm": 0.103515625, + "learning_rate": 0.0018914701210788066, + "loss": 1.2111, + "step": 5606 + }, + { + "epoch": 0.4916877789180753, + "grad_norm": 0.052734375, + "learning_rate": 0.0018910689993988559, + "loss": 1.1501, + "step": 5607 + }, + { + "epoch": 0.49177547069244987, + "grad_norm": 0.061279296875, + "learning_rate": 0.0018906678557340344, + "loss": 1.1374, + "step": 5608 + }, + { + "epoch": 0.49186316246682443, + "grad_norm": 0.0654296875, + "learning_rate": 0.0018902666901209257, + "loss": 1.1501, + "step": 5609 + }, + { + "epoch": 0.49195085424119894, + "grad_norm": 0.068359375, + "learning_rate": 0.0018898655025961155, + "loss": 1.1997, + "step": 5610 + }, + { + "epoch": 0.4920385460155735, + "grad_norm": 0.055908203125, + "learning_rate": 0.0018894642931961904, + "loss": 1.1713, + "step": 5611 + }, + { + "epoch": 0.4921262377899481, + "grad_norm": 0.0908203125, + "learning_rate": 0.0018890630619577402, + "loss": 1.2223, + "step": 5612 + }, + { + "epoch": 0.4922139295643226, + "grad_norm": 0.052490234375, + "learning_rate": 0.0018886618089173559, + "loss": 1.1973, + "step": 5613 + }, + { + "epoch": 0.49230162133869715, + "grad_norm": 0.0673828125, + "learning_rate": 0.0018882605341116305, + "loss": 1.1854, + "step": 5614 + }, + { + "epoch": 0.4923893131130717, + "grad_norm": 0.06640625, + "learning_rate": 0.0018878592375771595, + "loss": 1.1021, + "step": 5615 + }, + { + "epoch": 0.4924770048874462, + "grad_norm": 0.064453125, + "learning_rate": 0.00188745791935054, + "loss": 1.2012, + "step": 5616 + }, + { + "epoch": 0.4925646966618208, + "grad_norm": 0.0517578125, + "learning_rate": 0.0018870565794683715, + "loss": 1.0881, + "step": 5617 + }, + { + "epoch": 0.49265238843619535, + "grad_norm": 0.0830078125, + "learning_rate": 0.0018866552179672548, + "loss": 1.1713, + "step": 5618 + }, + { + "epoch": 0.49274008021056986, + "grad_norm": 0.06982421875, + "learning_rate": 0.001886253834883793, + "loss": 1.133, + "step": 5619 + }, + { + "epoch": 0.49282777198494443, + "grad_norm": 0.050048828125, + "learning_rate": 0.001885852430254591, + "loss": 1.1271, + "step": 5620 + }, + { + "epoch": 0.49291546375931894, + "grad_norm": 0.09130859375, + "learning_rate": 0.001885451004116257, + "loss": 1.1898, + "step": 5621 + }, + { + "epoch": 0.4930031555336935, + "grad_norm": 0.09375, + "learning_rate": 0.0018850495565053992, + "loss": 1.1786, + "step": 5622 + }, + { + "epoch": 0.49309084730806807, + "grad_norm": 0.058349609375, + "learning_rate": 0.001884648087458628, + "loss": 1.1575, + "step": 5623 + }, + { + "epoch": 0.4931785390824426, + "grad_norm": 0.1025390625, + "learning_rate": 0.0018842465970125572, + "loss": 1.2206, + "step": 5624 + }, + { + "epoch": 0.49326623085681714, + "grad_norm": 0.06103515625, + "learning_rate": 0.0018838450852038018, + "loss": 1.1465, + "step": 5625 + }, + { + "epoch": 0.4933539226311917, + "grad_norm": 0.051025390625, + "learning_rate": 0.001883443552068978, + "loss": 1.1343, + "step": 5626 + }, + { + "epoch": 0.4934416144055662, + "grad_norm": 0.0810546875, + "learning_rate": 0.0018830419976447047, + "loss": 1.1781, + "step": 5627 + }, + { + "epoch": 0.4935293061799408, + "grad_norm": 0.048095703125, + "learning_rate": 0.001882640421967603, + "loss": 1.157, + "step": 5628 + }, + { + "epoch": 0.49361699795431535, + "grad_norm": 0.057861328125, + "learning_rate": 0.0018822388250742954, + "loss": 1.1914, + "step": 5629 + }, + { + "epoch": 0.49370468972868986, + "grad_norm": 0.049560546875, + "learning_rate": 0.0018818372070014073, + "loss": 1.1222, + "step": 5630 + }, + { + "epoch": 0.4937923815030644, + "grad_norm": 0.0625, + "learning_rate": 0.0018814355677855632, + "loss": 1.1642, + "step": 5631 + }, + { + "epoch": 0.49388007327743894, + "grad_norm": 0.055419921875, + "learning_rate": 0.0018810339074633931, + "loss": 1.217, + "step": 5632 + }, + { + "epoch": 0.4939677650518135, + "grad_norm": 0.052978515625, + "learning_rate": 0.001880632226071527, + "loss": 1.173, + "step": 5633 + }, + { + "epoch": 0.49405545682618807, + "grad_norm": 0.080078125, + "learning_rate": 0.0018802305236465973, + "loss": 1.1862, + "step": 5634 + }, + { + "epoch": 0.4941431486005626, + "grad_norm": 0.06982421875, + "learning_rate": 0.0018798288002252381, + "loss": 1.1268, + "step": 5635 + }, + { + "epoch": 0.49423084037493714, + "grad_norm": 0.07666015625, + "learning_rate": 0.0018794270558440858, + "loss": 1.1543, + "step": 5636 + }, + { + "epoch": 0.4943185321493117, + "grad_norm": 0.052001953125, + "learning_rate": 0.001879025290539778, + "loss": 1.1281, + "step": 5637 + }, + { + "epoch": 0.4944062239236862, + "grad_norm": 0.0634765625, + "learning_rate": 0.0018786235043489552, + "loss": 1.1674, + "step": 5638 + }, + { + "epoch": 0.4944939156980608, + "grad_norm": 0.05419921875, + "learning_rate": 0.0018782216973082593, + "loss": 1.2175, + "step": 5639 + }, + { + "epoch": 0.49458160747243535, + "grad_norm": 0.053466796875, + "learning_rate": 0.0018778198694543332, + "loss": 1.2016, + "step": 5640 + }, + { + "epoch": 0.49466929924680986, + "grad_norm": 0.048583984375, + "learning_rate": 0.0018774180208238232, + "loss": 1.1328, + "step": 5641 + }, + { + "epoch": 0.4947569910211844, + "grad_norm": 0.04833984375, + "learning_rate": 0.0018770161514533772, + "loss": 1.2101, + "step": 5642 + }, + { + "epoch": 0.49484468279555893, + "grad_norm": 0.053955078125, + "learning_rate": 0.0018766142613796438, + "loss": 1.1545, + "step": 5643 + }, + { + "epoch": 0.4949323745699335, + "grad_norm": 0.0595703125, + "learning_rate": 0.001876212350639275, + "loss": 1.1378, + "step": 5644 + }, + { + "epoch": 0.49502006634430806, + "grad_norm": 0.0634765625, + "learning_rate": 0.0018758104192689235, + "loss": 1.1898, + "step": 5645 + }, + { + "epoch": 0.4951077581186826, + "grad_norm": 0.0888671875, + "learning_rate": 0.001875408467305245, + "loss": 1.2606, + "step": 5646 + }, + { + "epoch": 0.49519544989305714, + "grad_norm": 0.07666015625, + "learning_rate": 0.001875006494784896, + "loss": 1.1996, + "step": 5647 + }, + { + "epoch": 0.4952831416674317, + "grad_norm": 0.049072265625, + "learning_rate": 0.0018746045017445359, + "loss": 1.1701, + "step": 5648 + }, + { + "epoch": 0.4953708334418062, + "grad_norm": 0.07763671875, + "learning_rate": 0.0018742024882208242, + "loss": 1.1601, + "step": 5649 + }, + { + "epoch": 0.4954585252161808, + "grad_norm": 0.09423828125, + "learning_rate": 0.0018738004542504252, + "loss": 1.166, + "step": 5650 + }, + { + "epoch": 0.49554621699055534, + "grad_norm": 0.060791015625, + "learning_rate": 0.001873398399870002, + "loss": 1.1934, + "step": 5651 + }, + { + "epoch": 0.49563390876492985, + "grad_norm": 0.07666015625, + "learning_rate": 0.0018729963251162217, + "loss": 1.1283, + "step": 5652 + }, + { + "epoch": 0.4957216005393044, + "grad_norm": 0.0595703125, + "learning_rate": 0.001872594230025752, + "loss": 1.1343, + "step": 5653 + }, + { + "epoch": 0.495809292313679, + "grad_norm": 0.05224609375, + "learning_rate": 0.0018721921146352635, + "loss": 1.1333, + "step": 5654 + }, + { + "epoch": 0.4958969840880535, + "grad_norm": 0.107421875, + "learning_rate": 0.0018717899789814274, + "loss": 1.1926, + "step": 5655 + }, + { + "epoch": 0.49598467586242806, + "grad_norm": 0.0625, + "learning_rate": 0.0018713878231009177, + "loss": 1.1943, + "step": 5656 + }, + { + "epoch": 0.49607236763680257, + "grad_norm": 0.08251953125, + "learning_rate": 0.0018709856470304103, + "loss": 1.171, + "step": 5657 + }, + { + "epoch": 0.49616005941117713, + "grad_norm": 0.05810546875, + "learning_rate": 0.0018705834508065823, + "loss": 1.1957, + "step": 5658 + }, + { + "epoch": 0.4962477511855517, + "grad_norm": 0.059814453125, + "learning_rate": 0.001870181234466113, + "loss": 1.1959, + "step": 5659 + }, + { + "epoch": 0.4963354429599262, + "grad_norm": 0.054931640625, + "learning_rate": 0.0018697789980456833, + "loss": 1.1978, + "step": 5660 + }, + { + "epoch": 0.4964231347343008, + "grad_norm": 0.10009765625, + "learning_rate": 0.0018693767415819769, + "loss": 1.2062, + "step": 5661 + }, + { + "epoch": 0.49651082650867534, + "grad_norm": 0.048583984375, + "learning_rate": 0.0018689744651116773, + "loss": 1.1486, + "step": 5662 + }, + { + "epoch": 0.49659851828304985, + "grad_norm": 0.051025390625, + "learning_rate": 0.0018685721686714724, + "loss": 1.1489, + "step": 5663 + }, + { + "epoch": 0.4966862100574244, + "grad_norm": 0.0927734375, + "learning_rate": 0.0018681698522980497, + "loss": 1.1639, + "step": 5664 + }, + { + "epoch": 0.496773901831799, + "grad_norm": 0.1083984375, + "learning_rate": 0.0018677675160280992, + "loss": 1.165, + "step": 5665 + }, + { + "epoch": 0.4968615936061735, + "grad_norm": 0.046875, + "learning_rate": 0.001867365159898314, + "loss": 1.1317, + "step": 5666 + }, + { + "epoch": 0.49694928538054806, + "grad_norm": 0.076171875, + "learning_rate": 0.001866962783945387, + "loss": 1.1819, + "step": 5667 + }, + { + "epoch": 0.49703697715492257, + "grad_norm": 0.048095703125, + "learning_rate": 0.0018665603882060144, + "loss": 1.1626, + "step": 5668 + }, + { + "epoch": 0.49712466892929713, + "grad_norm": 0.060546875, + "learning_rate": 0.001866157972716893, + "loss": 1.1676, + "step": 5669 + }, + { + "epoch": 0.4972123607036717, + "grad_norm": 0.049072265625, + "learning_rate": 0.0018657555375147232, + "loss": 1.1656, + "step": 5670 + }, + { + "epoch": 0.4973000524780462, + "grad_norm": 0.05126953125, + "learning_rate": 0.0018653530826362047, + "loss": 1.1896, + "step": 5671 + }, + { + "epoch": 0.49738774425242077, + "grad_norm": 0.051025390625, + "learning_rate": 0.0018649506081180418, + "loss": 1.1408, + "step": 5672 + }, + { + "epoch": 0.49747543602679534, + "grad_norm": 0.059326171875, + "learning_rate": 0.0018645481139969377, + "loss": 1.1446, + "step": 5673 + }, + { + "epoch": 0.49756312780116985, + "grad_norm": 0.0615234375, + "learning_rate": 0.0018641456003096002, + "loss": 1.1638, + "step": 5674 + }, + { + "epoch": 0.4976508195755444, + "grad_norm": 0.048828125, + "learning_rate": 0.001863743067092736, + "loss": 1.1686, + "step": 5675 + }, + { + "epoch": 0.497738511349919, + "grad_norm": 0.08154296875, + "learning_rate": 0.0018633405143830557, + "loss": 1.1248, + "step": 5676 + }, + { + "epoch": 0.4978262031242935, + "grad_norm": 0.0517578125, + "learning_rate": 0.0018629379422172715, + "loss": 1.1806, + "step": 5677 + }, + { + "epoch": 0.49791389489866805, + "grad_norm": 0.05712890625, + "learning_rate": 0.0018625353506320967, + "loss": 1.1103, + "step": 5678 + }, + { + "epoch": 0.49800158667304256, + "grad_norm": 0.06689453125, + "learning_rate": 0.0018621327396642468, + "loss": 1.1912, + "step": 5679 + }, + { + "epoch": 0.4980892784474171, + "grad_norm": 0.064453125, + "learning_rate": 0.001861730109350438, + "loss": 1.2087, + "step": 5680 + }, + { + "epoch": 0.4981769702217917, + "grad_norm": 0.0654296875, + "learning_rate": 0.0018613274597273912, + "loss": 1.2358, + "step": 5681 + }, + { + "epoch": 0.4982646619961662, + "grad_norm": 0.050048828125, + "learning_rate": 0.0018609247908318244, + "loss": 1.1343, + "step": 5682 + }, + { + "epoch": 0.49835235377054077, + "grad_norm": 0.07666015625, + "learning_rate": 0.0018605221027004615, + "loss": 1.1756, + "step": 5683 + }, + { + "epoch": 0.49844004554491533, + "grad_norm": 0.046142578125, + "learning_rate": 0.0018601193953700261, + "loss": 1.1562, + "step": 5684 + }, + { + "epoch": 0.49852773731928984, + "grad_norm": 0.061767578125, + "learning_rate": 0.0018597166688772446, + "loss": 1.1202, + "step": 5685 + }, + { + "epoch": 0.4986154290936644, + "grad_norm": 0.1064453125, + "learning_rate": 0.001859313923258844, + "loss": 1.1693, + "step": 5686 + }, + { + "epoch": 0.498703120868039, + "grad_norm": 0.05126953125, + "learning_rate": 0.0018589111585515538, + "loss": 1.179, + "step": 5687 + }, + { + "epoch": 0.4987908126424135, + "grad_norm": 0.050048828125, + "learning_rate": 0.0018585083747921056, + "loss": 1.144, + "step": 5688 + }, + { + "epoch": 0.49887850441678805, + "grad_norm": 0.05126953125, + "learning_rate": 0.0018581055720172318, + "loss": 1.1149, + "step": 5689 + }, + { + "epoch": 0.4989661961911626, + "grad_norm": 0.055419921875, + "learning_rate": 0.001857702750263667, + "loss": 1.1048, + "step": 5690 + }, + { + "epoch": 0.4990538879655371, + "grad_norm": 0.07080078125, + "learning_rate": 0.0018572999095681476, + "loss": 1.2094, + "step": 5691 + }, + { + "epoch": 0.4991415797399117, + "grad_norm": 0.0673828125, + "learning_rate": 0.0018568970499674119, + "loss": 1.2009, + "step": 5692 + }, + { + "epoch": 0.4992292715142862, + "grad_norm": 0.056396484375, + "learning_rate": 0.0018564941714981987, + "loss": 1.1597, + "step": 5693 + }, + { + "epoch": 0.49931696328866076, + "grad_norm": 0.08251953125, + "learning_rate": 0.0018560912741972504, + "loss": 1.2405, + "step": 5694 + }, + { + "epoch": 0.49940465506303533, + "grad_norm": 0.08447265625, + "learning_rate": 0.0018556883581013098, + "loss": 1.2059, + "step": 5695 + }, + { + "epoch": 0.49949234683740984, + "grad_norm": 0.10546875, + "learning_rate": 0.0018552854232471222, + "loss": 1.1652, + "step": 5696 + }, + { + "epoch": 0.4995800386117844, + "grad_norm": 0.0751953125, + "learning_rate": 0.0018548824696714334, + "loss": 1.1485, + "step": 5697 + }, + { + "epoch": 0.49966773038615897, + "grad_norm": 0.053955078125, + "learning_rate": 0.001854479497410993, + "loss": 1.2003, + "step": 5698 + }, + { + "epoch": 0.4997554221605335, + "grad_norm": 0.119140625, + "learning_rate": 0.0018540765065025498, + "loss": 1.1798, + "step": 5699 + }, + { + "epoch": 0.49984311393490805, + "grad_norm": 0.045654296875, + "learning_rate": 0.001853673496982856, + "loss": 1.1801, + "step": 5700 + }, + { + "epoch": 0.4999308057092826, + "grad_norm": 0.11865234375, + "learning_rate": 0.0018532704688886657, + "loss": 1.1632, + "step": 5701 + }, + { + "epoch": 0.5000184974836571, + "grad_norm": 0.08642578125, + "learning_rate": 0.0018528674222567328, + "loss": 1.2307, + "step": 5702 + }, + { + "epoch": 0.5001061892580316, + "grad_norm": 0.09228515625, + "learning_rate": 0.001852464357123815, + "loss": 1.2115, + "step": 5703 + }, + { + "epoch": 0.5001938810324063, + "grad_norm": 0.1708984375, + "learning_rate": 0.0018520612735266702, + "loss": 1.1467, + "step": 5704 + }, + { + "epoch": 0.5002815728067808, + "grad_norm": 0.052490234375, + "learning_rate": 0.001851658171502059, + "loss": 1.1028, + "step": 5705 + }, + { + "epoch": 0.5003692645811553, + "grad_norm": 0.1162109375, + "learning_rate": 0.0018512550510867431, + "loss": 1.1291, + "step": 5706 + }, + { + "epoch": 0.5004569563555299, + "grad_norm": 0.0791015625, + "learning_rate": 0.001850851912317486, + "loss": 1.1355, + "step": 5707 + }, + { + "epoch": 0.5005446481299044, + "grad_norm": 0.04541015625, + "learning_rate": 0.0018504487552310535, + "loss": 1.119, + "step": 5708 + }, + { + "epoch": 0.5006323399042789, + "grad_norm": 0.1162109375, + "learning_rate": 0.0018500455798642114, + "loss": 1.1954, + "step": 5709 + }, + { + "epoch": 0.5007200316786535, + "grad_norm": 0.07470703125, + "learning_rate": 0.001849642386253729, + "loss": 1.2129, + "step": 5710 + }, + { + "epoch": 0.500807723453028, + "grad_norm": 0.052978515625, + "learning_rate": 0.001849239174436376, + "loss": 1.1345, + "step": 5711 + }, + { + "epoch": 0.5008954152274026, + "grad_norm": 0.10400390625, + "learning_rate": 0.0018488359444489252, + "loss": 1.182, + "step": 5712 + }, + { + "epoch": 0.5009831070017772, + "grad_norm": 0.0732421875, + "learning_rate": 0.0018484326963281493, + "loss": 1.192, + "step": 5713 + }, + { + "epoch": 0.5010707987761517, + "grad_norm": 0.078125, + "learning_rate": 0.001848029430110824, + "loss": 1.1425, + "step": 5714 + }, + { + "epoch": 0.5011584905505262, + "grad_norm": 0.07568359375, + "learning_rate": 0.0018476261458337255, + "loss": 1.1525, + "step": 5715 + }, + { + "epoch": 0.5012461823249008, + "grad_norm": 0.0576171875, + "learning_rate": 0.0018472228435336328, + "loss": 1.1223, + "step": 5716 + }, + { + "epoch": 0.5013338740992753, + "grad_norm": 0.0703125, + "learning_rate": 0.0018468195232473263, + "loss": 1.1235, + "step": 5717 + }, + { + "epoch": 0.5014215658736498, + "grad_norm": 0.0673828125, + "learning_rate": 0.0018464161850115864, + "loss": 1.1074, + "step": 5718 + }, + { + "epoch": 0.5015092576480245, + "grad_norm": 0.08837890625, + "learning_rate": 0.0018460128288631983, + "loss": 1.199, + "step": 5719 + }, + { + "epoch": 0.501596949422399, + "grad_norm": 0.05322265625, + "learning_rate": 0.001845609454838946, + "loss": 1.1578, + "step": 5720 + }, + { + "epoch": 0.5016846411967735, + "grad_norm": 0.072265625, + "learning_rate": 0.0018452060629756165, + "loss": 1.1661, + "step": 5721 + }, + { + "epoch": 0.501772332971148, + "grad_norm": 0.0556640625, + "learning_rate": 0.0018448026533099975, + "loss": 1.2222, + "step": 5722 + }, + { + "epoch": 0.5018600247455226, + "grad_norm": 0.04833984375, + "learning_rate": 0.0018443992258788805, + "loss": 1.1709, + "step": 5723 + }, + { + "epoch": 0.5019477165198971, + "grad_norm": 0.049560546875, + "learning_rate": 0.001843995780719055, + "loss": 1.1649, + "step": 5724 + }, + { + "epoch": 0.5020354082942716, + "grad_norm": 0.0498046875, + "learning_rate": 0.0018435923178673152, + "loss": 1.173, + "step": 5725 + }, + { + "epoch": 0.5021231000686462, + "grad_norm": 0.053466796875, + "learning_rate": 0.0018431888373604559, + "loss": 1.2286, + "step": 5726 + }, + { + "epoch": 0.5022107918430208, + "grad_norm": 0.05810546875, + "learning_rate": 0.0018427853392352731, + "loss": 1.1544, + "step": 5727 + }, + { + "epoch": 0.5022984836173953, + "grad_norm": 0.058837890625, + "learning_rate": 0.0018423818235285654, + "loss": 1.1648, + "step": 5728 + }, + { + "epoch": 0.5023861753917699, + "grad_norm": 0.05029296875, + "learning_rate": 0.0018419782902771315, + "loss": 1.1256, + "step": 5729 + }, + { + "epoch": 0.5024738671661444, + "grad_norm": 0.0595703125, + "learning_rate": 0.0018415747395177737, + "loss": 1.1708, + "step": 5730 + }, + { + "epoch": 0.5025615589405189, + "grad_norm": 0.064453125, + "learning_rate": 0.0018411711712872938, + "loss": 1.1864, + "step": 5731 + }, + { + "epoch": 0.5026492507148935, + "grad_norm": 0.06298828125, + "learning_rate": 0.001840767585622497, + "loss": 1.1364, + "step": 5732 + }, + { + "epoch": 0.502736942489268, + "grad_norm": 0.09521484375, + "learning_rate": 0.0018403639825601886, + "loss": 1.1742, + "step": 5733 + }, + { + "epoch": 0.5028246342636425, + "grad_norm": 0.050537109375, + "learning_rate": 0.0018399603621371764, + "loss": 1.1361, + "step": 5734 + }, + { + "epoch": 0.5029123260380172, + "grad_norm": 0.0537109375, + "learning_rate": 0.0018395567243902696, + "loss": 1.1774, + "step": 5735 + }, + { + "epoch": 0.5030000178123917, + "grad_norm": 0.050048828125, + "learning_rate": 0.0018391530693562788, + "loss": 1.0974, + "step": 5736 + }, + { + "epoch": 0.5030877095867662, + "grad_norm": 0.07080078125, + "learning_rate": 0.0018387493970720165, + "loss": 1.2111, + "step": 5737 + }, + { + "epoch": 0.5031754013611408, + "grad_norm": 0.0498046875, + "learning_rate": 0.0018383457075742962, + "loss": 1.1339, + "step": 5738 + }, + { + "epoch": 0.5032630931355153, + "grad_norm": 0.06494140625, + "learning_rate": 0.0018379420008999337, + "loss": 1.1121, + "step": 5739 + }, + { + "epoch": 0.5033507849098898, + "grad_norm": 0.056640625, + "learning_rate": 0.0018375382770857462, + "loss": 1.2145, + "step": 5740 + }, + { + "epoch": 0.5034384766842644, + "grad_norm": 0.04736328125, + "learning_rate": 0.0018371345361685514, + "loss": 1.1664, + "step": 5741 + }, + { + "epoch": 0.503526168458639, + "grad_norm": 0.047607421875, + "learning_rate": 0.0018367307781851701, + "loss": 1.1216, + "step": 5742 + }, + { + "epoch": 0.5036138602330135, + "grad_norm": 0.04736328125, + "learning_rate": 0.0018363270031724244, + "loss": 1.1755, + "step": 5743 + }, + { + "epoch": 0.5037015520073881, + "grad_norm": 0.055419921875, + "learning_rate": 0.0018359232111671367, + "loss": 1.1663, + "step": 5744 + }, + { + "epoch": 0.5037892437817626, + "grad_norm": 0.048095703125, + "learning_rate": 0.0018355194022061322, + "loss": 1.154, + "step": 5745 + }, + { + "epoch": 0.5038769355561371, + "grad_norm": 0.047607421875, + "learning_rate": 0.0018351155763262365, + "loss": 1.1686, + "step": 5746 + }, + { + "epoch": 0.5039646273305116, + "grad_norm": 0.054931640625, + "learning_rate": 0.001834711733564279, + "loss": 1.155, + "step": 5747 + }, + { + "epoch": 0.5040523191048862, + "grad_norm": 0.047607421875, + "learning_rate": 0.0018343078739570876, + "loss": 1.166, + "step": 5748 + }, + { + "epoch": 0.5041400108792607, + "grad_norm": 0.10400390625, + "learning_rate": 0.001833903997541494, + "loss": 1.2249, + "step": 5749 + }, + { + "epoch": 0.5042277026536353, + "grad_norm": 0.0517578125, + "learning_rate": 0.0018335001043543312, + "loss": 1.1903, + "step": 5750 + }, + { + "epoch": 0.5043153944280099, + "grad_norm": 0.09228515625, + "learning_rate": 0.0018330961944324315, + "loss": 1.1876, + "step": 5751 + }, + { + "epoch": 0.5044030862023844, + "grad_norm": 0.072265625, + "learning_rate": 0.0018326922678126322, + "loss": 1.1586, + "step": 5752 + }, + { + "epoch": 0.5044907779767589, + "grad_norm": 0.0576171875, + "learning_rate": 0.0018322883245317696, + "loss": 1.2384, + "step": 5753 + }, + { + "epoch": 0.5045784697511335, + "grad_norm": 0.07373046875, + "learning_rate": 0.0018318843646266823, + "loss": 1.2219, + "step": 5754 + }, + { + "epoch": 0.504666161525508, + "grad_norm": 0.07763671875, + "learning_rate": 0.0018314803881342104, + "loss": 1.1529, + "step": 5755 + }, + { + "epoch": 0.5047538532998825, + "grad_norm": 0.052490234375, + "learning_rate": 0.0018310763950911964, + "loss": 1.1267, + "step": 5756 + }, + { + "epoch": 0.5048415450742572, + "grad_norm": 0.048828125, + "learning_rate": 0.0018306723855344817, + "loss": 1.2313, + "step": 5757 + }, + { + "epoch": 0.5049292368486317, + "grad_norm": 0.0576171875, + "learning_rate": 0.0018302683595009122, + "loss": 1.161, + "step": 5758 + }, + { + "epoch": 0.5050169286230062, + "grad_norm": 0.055908203125, + "learning_rate": 0.0018298643170273333, + "loss": 1.1566, + "step": 5759 + }, + { + "epoch": 0.5051046203973808, + "grad_norm": 0.059326171875, + "learning_rate": 0.0018294602581505933, + "loss": 1.2054, + "step": 5760 + }, + { + "epoch": 0.5051923121717553, + "grad_norm": 0.068359375, + "learning_rate": 0.0018290561829075415, + "loss": 1.2319, + "step": 5761 + }, + { + "epoch": 0.5052800039461298, + "grad_norm": 0.0673828125, + "learning_rate": 0.0018286520913350273, + "loss": 1.1668, + "step": 5762 + }, + { + "epoch": 0.5053676957205044, + "grad_norm": 0.054443359375, + "learning_rate": 0.001828247983469904, + "loss": 1.1772, + "step": 5763 + }, + { + "epoch": 0.505455387494879, + "grad_norm": 0.0595703125, + "learning_rate": 0.0018278438593490238, + "loss": 1.2506, + "step": 5764 + }, + { + "epoch": 0.5055430792692535, + "grad_norm": 0.05224609375, + "learning_rate": 0.0018274397190092438, + "loss": 1.1798, + "step": 5765 + }, + { + "epoch": 0.5056307710436281, + "grad_norm": 0.0654296875, + "learning_rate": 0.0018270355624874196, + "loss": 1.2016, + "step": 5766 + }, + { + "epoch": 0.5057184628180026, + "grad_norm": 0.06201171875, + "learning_rate": 0.001826631389820409, + "loss": 1.1744, + "step": 5767 + }, + { + "epoch": 0.5058061545923771, + "grad_norm": 0.07568359375, + "learning_rate": 0.0018262272010450707, + "loss": 1.1798, + "step": 5768 + }, + { + "epoch": 0.5058938463667516, + "grad_norm": 0.0537109375, + "learning_rate": 0.0018258229961982675, + "loss": 1.1568, + "step": 5769 + }, + { + "epoch": 0.5059815381411262, + "grad_norm": 0.08642578125, + "learning_rate": 0.0018254187753168605, + "loss": 1.225, + "step": 5770 + }, + { + "epoch": 0.5060692299155007, + "grad_norm": 0.06103515625, + "learning_rate": 0.001825014538437714, + "loss": 1.1546, + "step": 5771 + }, + { + "epoch": 0.5061569216898752, + "grad_norm": 0.052978515625, + "learning_rate": 0.0018246102855976934, + "loss": 1.1973, + "step": 5772 + }, + { + "epoch": 0.5062446134642499, + "grad_norm": 0.050048828125, + "learning_rate": 0.0018242060168336656, + "loss": 1.135, + "step": 5773 + }, + { + "epoch": 0.5063323052386244, + "grad_norm": 0.047607421875, + "learning_rate": 0.0018238017321824994, + "loss": 1.1889, + "step": 5774 + }, + { + "epoch": 0.5064199970129989, + "grad_norm": 0.060302734375, + "learning_rate": 0.0018233974316810631, + "loss": 1.1415, + "step": 5775 + }, + { + "epoch": 0.5065076887873735, + "grad_norm": 0.0947265625, + "learning_rate": 0.0018229931153662293, + "loss": 1.1746, + "step": 5776 + }, + { + "epoch": 0.506595380561748, + "grad_norm": 0.052001953125, + "learning_rate": 0.0018225887832748694, + "loss": 1.1884, + "step": 5777 + }, + { + "epoch": 0.5066830723361225, + "grad_norm": 0.053466796875, + "learning_rate": 0.0018221844354438582, + "loss": 1.2115, + "step": 5778 + }, + { + "epoch": 0.5067707641104972, + "grad_norm": 0.0537109375, + "learning_rate": 0.0018217800719100706, + "loss": 1.1784, + "step": 5779 + }, + { + "epoch": 0.5068584558848717, + "grad_norm": 0.0693359375, + "learning_rate": 0.0018213756927103846, + "loss": 1.1439, + "step": 5780 + }, + { + "epoch": 0.5069461476592462, + "grad_norm": 0.0634765625, + "learning_rate": 0.0018209712978816772, + "loss": 1.1463, + "step": 5781 + }, + { + "epoch": 0.5070338394336208, + "grad_norm": 0.06982421875, + "learning_rate": 0.0018205668874608288, + "loss": 1.1772, + "step": 5782 + }, + { + "epoch": 0.5071215312079953, + "grad_norm": 0.04443359375, + "learning_rate": 0.0018201624614847213, + "loss": 1.1902, + "step": 5783 + }, + { + "epoch": 0.5072092229823698, + "grad_norm": 0.056884765625, + "learning_rate": 0.0018197580199902361, + "loss": 1.1643, + "step": 5784 + }, + { + "epoch": 0.5072969147567444, + "grad_norm": 0.058837890625, + "learning_rate": 0.0018193535630142577, + "loss": 1.1249, + "step": 5785 + }, + { + "epoch": 0.5073846065311189, + "grad_norm": 0.059814453125, + "learning_rate": 0.0018189490905936713, + "loss": 1.1695, + "step": 5786 + }, + { + "epoch": 0.5074722983054935, + "grad_norm": 0.166015625, + "learning_rate": 0.0018185446027653644, + "loss": 1.1812, + "step": 5787 + }, + { + "epoch": 0.5075599900798681, + "grad_norm": 0.049072265625, + "learning_rate": 0.0018181400995662244, + "loss": 1.1458, + "step": 5788 + }, + { + "epoch": 0.5076476818542426, + "grad_norm": 0.05126953125, + "learning_rate": 0.0018177355810331417, + "loss": 1.0803, + "step": 5789 + }, + { + "epoch": 0.5077353736286171, + "grad_norm": 0.05419921875, + "learning_rate": 0.0018173310472030068, + "loss": 1.1709, + "step": 5790 + }, + { + "epoch": 0.5078230654029917, + "grad_norm": 0.0556640625, + "learning_rate": 0.001816926498112713, + "loss": 1.1859, + "step": 5791 + }, + { + "epoch": 0.5079107571773662, + "grad_norm": 0.076171875, + "learning_rate": 0.0018165219337991528, + "loss": 1.1926, + "step": 5792 + }, + { + "epoch": 0.5079984489517407, + "grad_norm": 0.0830078125, + "learning_rate": 0.0018161173542992222, + "loss": 1.2352, + "step": 5793 + }, + { + "epoch": 0.5080861407261152, + "grad_norm": 0.051025390625, + "learning_rate": 0.0018157127596498182, + "loss": 1.1169, + "step": 5794 + }, + { + "epoch": 0.5081738325004899, + "grad_norm": 0.11474609375, + "learning_rate": 0.0018153081498878375, + "loss": 1.152, + "step": 5795 + }, + { + "epoch": 0.5082615242748644, + "grad_norm": 0.0732421875, + "learning_rate": 0.0018149035250501807, + "loss": 1.1563, + "step": 5796 + }, + { + "epoch": 0.5083492160492389, + "grad_norm": 0.1025390625, + "learning_rate": 0.001814498885173748, + "loss": 1.168, + "step": 5797 + }, + { + "epoch": 0.5084369078236135, + "grad_norm": 0.07666015625, + "learning_rate": 0.001814094230295442, + "loss": 1.1188, + "step": 5798 + }, + { + "epoch": 0.508524599597988, + "grad_norm": 0.08154296875, + "learning_rate": 0.0018136895604521656, + "loss": 1.1922, + "step": 5799 + }, + { + "epoch": 0.5086122913723625, + "grad_norm": 0.0732421875, + "learning_rate": 0.0018132848756808237, + "loss": 1.1117, + "step": 5800 + }, + { + "epoch": 0.5086999831467371, + "grad_norm": 0.0712890625, + "learning_rate": 0.001812880176018323, + "loss": 1.2078, + "step": 5801 + }, + { + "epoch": 0.5087876749211117, + "grad_norm": 0.054931640625, + "learning_rate": 0.0018124754615015705, + "loss": 1.1425, + "step": 5802 + }, + { + "epoch": 0.5088753666954862, + "grad_norm": 0.05126953125, + "learning_rate": 0.0018120707321674751, + "loss": 1.1504, + "step": 5803 + }, + { + "epoch": 0.5089630584698608, + "grad_norm": 0.05419921875, + "learning_rate": 0.0018116659880529472, + "loss": 1.1957, + "step": 5804 + }, + { + "epoch": 0.5090507502442353, + "grad_norm": 0.06640625, + "learning_rate": 0.0018112612291948994, + "loss": 1.1273, + "step": 5805 + }, + { + "epoch": 0.5091384420186098, + "grad_norm": 0.05517578125, + "learning_rate": 0.0018108564556302425, + "loss": 1.155, + "step": 5806 + }, + { + "epoch": 0.5092261337929844, + "grad_norm": 0.06884765625, + "learning_rate": 0.0018104516673958931, + "loss": 1.1446, + "step": 5807 + }, + { + "epoch": 0.5093138255673589, + "grad_norm": 0.051025390625, + "learning_rate": 0.001810046864528766, + "loss": 1.1482, + "step": 5808 + }, + { + "epoch": 0.5094015173417334, + "grad_norm": 0.048828125, + "learning_rate": 0.0018096420470657777, + "loss": 1.1814, + "step": 5809 + }, + { + "epoch": 0.5094892091161081, + "grad_norm": 0.05029296875, + "learning_rate": 0.0018092372150438464, + "loss": 1.2135, + "step": 5810 + }, + { + "epoch": 0.5095769008904826, + "grad_norm": 0.052978515625, + "learning_rate": 0.001808832368499892, + "loss": 1.1625, + "step": 5811 + }, + { + "epoch": 0.5096645926648571, + "grad_norm": 0.076171875, + "learning_rate": 0.0018084275074708364, + "loss": 1.186, + "step": 5812 + }, + { + "epoch": 0.5097522844392317, + "grad_norm": 0.046142578125, + "learning_rate": 0.0018080226319936005, + "loss": 1.1213, + "step": 5813 + }, + { + "epoch": 0.5098399762136062, + "grad_norm": 0.09423828125, + "learning_rate": 0.0018076177421051088, + "loss": 1.139, + "step": 5814 + }, + { + "epoch": 0.5099276679879807, + "grad_norm": 0.0615234375, + "learning_rate": 0.001807212837842286, + "loss": 1.2451, + "step": 5815 + }, + { + "epoch": 0.5100153597623552, + "grad_norm": 0.051025390625, + "learning_rate": 0.001806807919242058, + "loss": 1.1949, + "step": 5816 + }, + { + "epoch": 0.5101030515367299, + "grad_norm": 0.06982421875, + "learning_rate": 0.0018064029863413527, + "loss": 1.1656, + "step": 5817 + }, + { + "epoch": 0.5101907433111044, + "grad_norm": 0.0478515625, + "learning_rate": 0.0018059980391770993, + "loss": 1.1062, + "step": 5818 + }, + { + "epoch": 0.5102784350854789, + "grad_norm": 0.056396484375, + "learning_rate": 0.0018055930777862265, + "loss": 1.1592, + "step": 5819 + }, + { + "epoch": 0.5103661268598535, + "grad_norm": 0.052001953125, + "learning_rate": 0.0018051881022056672, + "loss": 1.1576, + "step": 5820 + }, + { + "epoch": 0.510453818634228, + "grad_norm": 0.052490234375, + "learning_rate": 0.0018047831124723534, + "loss": 1.1511, + "step": 5821 + }, + { + "epoch": 0.5105415104086025, + "grad_norm": 0.0556640625, + "learning_rate": 0.0018043781086232195, + "loss": 1.1957, + "step": 5822 + }, + { + "epoch": 0.5106292021829771, + "grad_norm": 0.050048828125, + "learning_rate": 0.0018039730906952004, + "loss": 1.1676, + "step": 5823 + }, + { + "epoch": 0.5107168939573516, + "grad_norm": 0.060791015625, + "learning_rate": 0.0018035680587252328, + "loss": 1.2499, + "step": 5824 + }, + { + "epoch": 0.5108045857317262, + "grad_norm": 0.05126953125, + "learning_rate": 0.0018031630127502553, + "loss": 1.1384, + "step": 5825 + }, + { + "epoch": 0.5108922775061008, + "grad_norm": 0.0830078125, + "learning_rate": 0.0018027579528072058, + "loss": 1.1521, + "step": 5826 + }, + { + "epoch": 0.5109799692804753, + "grad_norm": 0.052490234375, + "learning_rate": 0.0018023528789330257, + "loss": 1.208, + "step": 5827 + }, + { + "epoch": 0.5110676610548498, + "grad_norm": 0.06689453125, + "learning_rate": 0.0018019477911646558, + "loss": 1.1748, + "step": 5828 + }, + { + "epoch": 0.5111553528292244, + "grad_norm": 0.059326171875, + "learning_rate": 0.0018015426895390402, + "loss": 1.1991, + "step": 5829 + }, + { + "epoch": 0.5112430446035989, + "grad_norm": 0.05078125, + "learning_rate": 0.001801137574093122, + "loss": 1.1822, + "step": 5830 + }, + { + "epoch": 0.5113307363779734, + "grad_norm": 0.04931640625, + "learning_rate": 0.0018007324448638482, + "loss": 1.1399, + "step": 5831 + }, + { + "epoch": 0.5114184281523481, + "grad_norm": 0.058349609375, + "learning_rate": 0.0018003273018881631, + "loss": 1.2482, + "step": 5832 + }, + { + "epoch": 0.5115061199267226, + "grad_norm": 0.07275390625, + "learning_rate": 0.0017999221452030174, + "loss": 1.2276, + "step": 5833 + }, + { + "epoch": 0.5115938117010971, + "grad_norm": 0.051513671875, + "learning_rate": 0.0017995169748453584, + "loss": 1.1534, + "step": 5834 + }, + { + "epoch": 0.5116815034754717, + "grad_norm": 0.049072265625, + "learning_rate": 0.0017991117908521374, + "loss": 1.1785, + "step": 5835 + }, + { + "epoch": 0.5117691952498462, + "grad_norm": 0.05029296875, + "learning_rate": 0.0017987065932603065, + "loss": 1.1173, + "step": 5836 + }, + { + "epoch": 0.5118568870242207, + "grad_norm": 0.049072265625, + "learning_rate": 0.0017983013821068175, + "loss": 1.1945, + "step": 5837 + }, + { + "epoch": 0.5119445787985953, + "grad_norm": 0.0712890625, + "learning_rate": 0.0017978961574286255, + "loss": 1.2266, + "step": 5838 + }, + { + "epoch": 0.5120322705729698, + "grad_norm": 0.05419921875, + "learning_rate": 0.0017974909192626858, + "loss": 1.2387, + "step": 5839 + }, + { + "epoch": 0.5121199623473444, + "grad_norm": 0.068359375, + "learning_rate": 0.0017970856676459547, + "loss": 1.1835, + "step": 5840 + }, + { + "epoch": 0.5122076541217189, + "grad_norm": 0.0517578125, + "learning_rate": 0.0017966804026153906, + "loss": 1.162, + "step": 5841 + }, + { + "epoch": 0.5122953458960935, + "grad_norm": 0.05078125, + "learning_rate": 0.001796275124207953, + "loss": 1.2007, + "step": 5842 + }, + { + "epoch": 0.512383037670468, + "grad_norm": 0.051025390625, + "learning_rate": 0.001795869832460601, + "loss": 1.2048, + "step": 5843 + }, + { + "epoch": 0.5124707294448425, + "grad_norm": 0.05029296875, + "learning_rate": 0.001795464527410297, + "loss": 1.1028, + "step": 5844 + }, + { + "epoch": 0.5125584212192171, + "grad_norm": 0.054931640625, + "learning_rate": 0.0017950592090940035, + "loss": 1.1461, + "step": 5845 + }, + { + "epoch": 0.5126461129935916, + "grad_norm": 0.049560546875, + "learning_rate": 0.0017946538775486848, + "loss": 1.1955, + "step": 5846 + }, + { + "epoch": 0.5127338047679662, + "grad_norm": 0.06298828125, + "learning_rate": 0.0017942485328113057, + "loss": 1.2078, + "step": 5847 + }, + { + "epoch": 0.5128214965423408, + "grad_norm": 0.056640625, + "learning_rate": 0.0017938431749188327, + "loss": 1.134, + "step": 5848 + }, + { + "epoch": 0.5129091883167153, + "grad_norm": 0.061767578125, + "learning_rate": 0.0017934378039082342, + "loss": 1.1885, + "step": 5849 + }, + { + "epoch": 0.5129968800910898, + "grad_norm": 0.0712890625, + "learning_rate": 0.0017930324198164775, + "loss": 1.1321, + "step": 5850 + }, + { + "epoch": 0.5130845718654644, + "grad_norm": 0.051025390625, + "learning_rate": 0.0017926270226805334, + "loss": 1.125, + "step": 5851 + }, + { + "epoch": 0.5131722636398389, + "grad_norm": 0.0693359375, + "learning_rate": 0.0017922216125373733, + "loss": 1.1672, + "step": 5852 + }, + { + "epoch": 0.5132599554142134, + "grad_norm": 0.08447265625, + "learning_rate": 0.0017918161894239692, + "loss": 1.0977, + "step": 5853 + }, + { + "epoch": 0.513347647188588, + "grad_norm": 0.053955078125, + "learning_rate": 0.0017914107533772948, + "loss": 1.2088, + "step": 5854 + }, + { + "epoch": 0.5134353389629626, + "grad_norm": 0.053466796875, + "learning_rate": 0.0017910053044343248, + "loss": 1.1303, + "step": 5855 + }, + { + "epoch": 0.5135230307373371, + "grad_norm": 0.0517578125, + "learning_rate": 0.001790599842632035, + "loss": 1.1778, + "step": 5856 + }, + { + "epoch": 0.5136107225117117, + "grad_norm": 0.051513671875, + "learning_rate": 0.001790194368007402, + "loss": 1.2318, + "step": 5857 + }, + { + "epoch": 0.5136984142860862, + "grad_norm": 0.057861328125, + "learning_rate": 0.0017897888805974052, + "loss": 1.2004, + "step": 5858 + }, + { + "epoch": 0.5137861060604607, + "grad_norm": 0.057373046875, + "learning_rate": 0.0017893833804390235, + "loss": 1.1498, + "step": 5859 + }, + { + "epoch": 0.5138737978348353, + "grad_norm": 0.05859375, + "learning_rate": 0.0017889778675692367, + "loss": 1.1901, + "step": 5860 + }, + { + "epoch": 0.5139614896092098, + "grad_norm": 0.05078125, + "learning_rate": 0.0017885723420250272, + "loss": 1.1503, + "step": 5861 + }, + { + "epoch": 0.5140491813835844, + "grad_norm": 0.053466796875, + "learning_rate": 0.0017881668038433785, + "loss": 1.1612, + "step": 5862 + }, + { + "epoch": 0.514136873157959, + "grad_norm": 0.0546875, + "learning_rate": 0.0017877612530612732, + "loss": 1.2007, + "step": 5863 + }, + { + "epoch": 0.5142245649323335, + "grad_norm": 0.052001953125, + "learning_rate": 0.001787355689715697, + "loss": 1.2136, + "step": 5864 + }, + { + "epoch": 0.514312256706708, + "grad_norm": 0.06201171875, + "learning_rate": 0.0017869501138436372, + "loss": 1.1456, + "step": 5865 + }, + { + "epoch": 0.5143999484810825, + "grad_norm": 0.07666015625, + "learning_rate": 0.00178654452548208, + "loss": 1.1664, + "step": 5866 + }, + { + "epoch": 0.5144876402554571, + "grad_norm": 0.060546875, + "learning_rate": 0.0017861389246680156, + "loss": 1.0804, + "step": 5867 + }, + { + "epoch": 0.5145753320298316, + "grad_norm": 0.107421875, + "learning_rate": 0.001785733311438432, + "loss": 1.198, + "step": 5868 + }, + { + "epoch": 0.5146630238042061, + "grad_norm": 0.0517578125, + "learning_rate": 0.0017853276858303209, + "loss": 1.1833, + "step": 5869 + }, + { + "epoch": 0.5147507155785808, + "grad_norm": 0.04736328125, + "learning_rate": 0.0017849220478806738, + "loss": 1.1416, + "step": 5870 + }, + { + "epoch": 0.5148384073529553, + "grad_norm": 0.10107421875, + "learning_rate": 0.0017845163976264846, + "loss": 1.17, + "step": 5871 + }, + { + "epoch": 0.5149260991273298, + "grad_norm": 0.045654296875, + "learning_rate": 0.0017841107351047472, + "loss": 1.1496, + "step": 5872 + }, + { + "epoch": 0.5150137909017044, + "grad_norm": 0.06689453125, + "learning_rate": 0.0017837050603524568, + "loss": 1.1755, + "step": 5873 + }, + { + "epoch": 0.5151014826760789, + "grad_norm": 0.04833984375, + "learning_rate": 0.0017832993734066102, + "loss": 1.2067, + "step": 5874 + }, + { + "epoch": 0.5151891744504534, + "grad_norm": 0.11279296875, + "learning_rate": 0.001782893674304205, + "loss": 1.1894, + "step": 5875 + }, + { + "epoch": 0.515276866224828, + "grad_norm": 0.0849609375, + "learning_rate": 0.0017824879630822397, + "loss": 1.1842, + "step": 5876 + }, + { + "epoch": 0.5153645579992026, + "grad_norm": 0.05615234375, + "learning_rate": 0.0017820822397777137, + "loss": 1.1571, + "step": 5877 + }, + { + "epoch": 0.5154522497735771, + "grad_norm": 0.046630859375, + "learning_rate": 0.001781676504427629, + "loss": 1.146, + "step": 5878 + }, + { + "epoch": 0.5155399415479517, + "grad_norm": 0.046875, + "learning_rate": 0.0017812707570689867, + "loss": 1.1602, + "step": 5879 + }, + { + "epoch": 0.5156276333223262, + "grad_norm": 0.0537109375, + "learning_rate": 0.0017808649977387903, + "loss": 1.205, + "step": 5880 + }, + { + "epoch": 0.5157153250967007, + "grad_norm": 0.07666015625, + "learning_rate": 0.0017804592264740438, + "loss": 1.2132, + "step": 5881 + }, + { + "epoch": 0.5158030168710753, + "grad_norm": 0.056640625, + "learning_rate": 0.001780053443311753, + "loss": 1.1783, + "step": 5882 + }, + { + "epoch": 0.5158907086454498, + "grad_norm": 0.064453125, + "learning_rate": 0.0017796476482889233, + "loss": 1.2132, + "step": 5883 + }, + { + "epoch": 0.5159784004198243, + "grad_norm": 0.0498046875, + "learning_rate": 0.0017792418414425634, + "loss": 1.118, + "step": 5884 + }, + { + "epoch": 0.516066092194199, + "grad_norm": 0.049560546875, + "learning_rate": 0.0017788360228096812, + "loss": 1.125, + "step": 5885 + }, + { + "epoch": 0.5161537839685735, + "grad_norm": 0.06103515625, + "learning_rate": 0.0017784301924272856, + "loss": 1.1587, + "step": 5886 + }, + { + "epoch": 0.516241475742948, + "grad_norm": 0.08642578125, + "learning_rate": 0.0017780243503323888, + "loss": 1.2328, + "step": 5887 + }, + { + "epoch": 0.5163291675173225, + "grad_norm": 0.052001953125, + "learning_rate": 0.001777618496562001, + "loss": 1.1395, + "step": 5888 + }, + { + "epoch": 0.5164168592916971, + "grad_norm": 0.06396484375, + "learning_rate": 0.0017772126311531363, + "loss": 1.174, + "step": 5889 + }, + { + "epoch": 0.5165045510660716, + "grad_norm": 0.06494140625, + "learning_rate": 0.0017768067541428078, + "loss": 1.2274, + "step": 5890 + }, + { + "epoch": 0.5165922428404461, + "grad_norm": 0.048828125, + "learning_rate": 0.001776400865568031, + "loss": 1.1515, + "step": 5891 + }, + { + "epoch": 0.5166799346148208, + "grad_norm": 0.0517578125, + "learning_rate": 0.0017759949654658207, + "loss": 1.1719, + "step": 5892 + }, + { + "epoch": 0.5167676263891953, + "grad_norm": 0.0546875, + "learning_rate": 0.0017755890538731958, + "loss": 1.1532, + "step": 5893 + }, + { + "epoch": 0.5168553181635698, + "grad_norm": 0.05859375, + "learning_rate": 0.0017751831308271732, + "loss": 1.2069, + "step": 5894 + }, + { + "epoch": 0.5169430099379444, + "grad_norm": 0.05859375, + "learning_rate": 0.0017747771963647716, + "loss": 1.1432, + "step": 5895 + }, + { + "epoch": 0.5170307017123189, + "grad_norm": 0.05126953125, + "learning_rate": 0.0017743712505230123, + "loss": 1.2202, + "step": 5896 + }, + { + "epoch": 0.5171183934866934, + "grad_norm": 0.0595703125, + "learning_rate": 0.0017739652933389158, + "loss": 1.1342, + "step": 5897 + }, + { + "epoch": 0.517206085261068, + "grad_norm": 0.0498046875, + "learning_rate": 0.0017735593248495048, + "loss": 1.2046, + "step": 5898 + }, + { + "epoch": 0.5172937770354425, + "grad_norm": 0.04931640625, + "learning_rate": 0.001773153345091802, + "loss": 1.1277, + "step": 5899 + }, + { + "epoch": 0.5173814688098171, + "grad_norm": 0.06787109375, + "learning_rate": 0.0017727473541028326, + "loss": 1.1266, + "step": 5900 + }, + { + "epoch": 0.5174691605841917, + "grad_norm": 0.06005859375, + "learning_rate": 0.0017723413519196212, + "loss": 1.2666, + "step": 5901 + }, + { + "epoch": 0.5175568523585662, + "grad_norm": 0.076171875, + "learning_rate": 0.0017719353385791943, + "loss": 1.1376, + "step": 5902 + }, + { + "epoch": 0.5176445441329407, + "grad_norm": 0.049560546875, + "learning_rate": 0.001771529314118579, + "loss": 1.1263, + "step": 5903 + }, + { + "epoch": 0.5177322359073153, + "grad_norm": 0.0712890625, + "learning_rate": 0.0017711232785748042, + "loss": 1.1738, + "step": 5904 + }, + { + "epoch": 0.5178199276816898, + "grad_norm": 0.061279296875, + "learning_rate": 0.0017707172319848992, + "loss": 1.1593, + "step": 5905 + }, + { + "epoch": 0.5179076194560643, + "grad_norm": 0.07763671875, + "learning_rate": 0.0017703111743858939, + "loss": 1.1658, + "step": 5906 + }, + { + "epoch": 0.517995311230439, + "grad_norm": 0.048583984375, + "learning_rate": 0.0017699051058148207, + "loss": 1.1897, + "step": 5907 + }, + { + "epoch": 0.5180830030048135, + "grad_norm": 0.0947265625, + "learning_rate": 0.0017694990263087103, + "loss": 1.1685, + "step": 5908 + }, + { + "epoch": 0.518170694779188, + "grad_norm": 0.061767578125, + "learning_rate": 0.001769092935904598, + "loss": 1.2157, + "step": 5909 + }, + { + "epoch": 0.5182583865535626, + "grad_norm": 0.05419921875, + "learning_rate": 0.0017686868346395177, + "loss": 1.1714, + "step": 5910 + }, + { + "epoch": 0.5183460783279371, + "grad_norm": 0.109375, + "learning_rate": 0.0017682807225505037, + "loss": 1.2533, + "step": 5911 + }, + { + "epoch": 0.5184337701023116, + "grad_norm": 0.0673828125, + "learning_rate": 0.0017678745996745936, + "loss": 1.1112, + "step": 5912 + }, + { + "epoch": 0.5185214618766861, + "grad_norm": 0.0595703125, + "learning_rate": 0.001767468466048824, + "loss": 1.0989, + "step": 5913 + }, + { + "epoch": 0.5186091536510608, + "grad_norm": 0.130859375, + "learning_rate": 0.0017670623217102338, + "loss": 1.1558, + "step": 5914 + }, + { + "epoch": 0.5186968454254353, + "grad_norm": 0.080078125, + "learning_rate": 0.0017666561666958619, + "loss": 1.1009, + "step": 5915 + }, + { + "epoch": 0.5187845371998098, + "grad_norm": 0.060546875, + "learning_rate": 0.0017662500010427488, + "loss": 1.1835, + "step": 5916 + }, + { + "epoch": 0.5188722289741844, + "grad_norm": 0.1337890625, + "learning_rate": 0.0017658438247879354, + "loss": 1.1869, + "step": 5917 + }, + { + "epoch": 0.5189599207485589, + "grad_norm": 0.05517578125, + "learning_rate": 0.001765437637968465, + "loss": 1.1594, + "step": 5918 + }, + { + "epoch": 0.5190476125229334, + "grad_norm": 0.050537109375, + "learning_rate": 0.0017650314406213795, + "loss": 1.1837, + "step": 5919 + }, + { + "epoch": 0.519135304297308, + "grad_norm": 0.07421875, + "learning_rate": 0.0017646252327837236, + "loss": 1.1915, + "step": 5920 + }, + { + "epoch": 0.5192229960716825, + "grad_norm": 0.060791015625, + "learning_rate": 0.0017642190144925422, + "loss": 1.2011, + "step": 5921 + }, + { + "epoch": 0.519310687846057, + "grad_norm": 0.0693359375, + "learning_rate": 0.001763812785784882, + "loss": 1.1376, + "step": 5922 + }, + { + "epoch": 0.5193983796204317, + "grad_norm": 0.07763671875, + "learning_rate": 0.001763406546697789, + "loss": 1.168, + "step": 5923 + }, + { + "epoch": 0.5194860713948062, + "grad_norm": 0.047607421875, + "learning_rate": 0.001763000297268312, + "loss": 1.1994, + "step": 5924 + }, + { + "epoch": 0.5195737631691807, + "grad_norm": 0.0791015625, + "learning_rate": 0.0017625940375334997, + "loss": 1.1415, + "step": 5925 + }, + { + "epoch": 0.5196614549435553, + "grad_norm": 0.05029296875, + "learning_rate": 0.0017621877675304016, + "loss": 1.1392, + "step": 5926 + }, + { + "epoch": 0.5197491467179298, + "grad_norm": 0.05322265625, + "learning_rate": 0.001761781487296069, + "loss": 1.1923, + "step": 5927 + }, + { + "epoch": 0.5198368384923043, + "grad_norm": 0.04833984375, + "learning_rate": 0.001761375196867553, + "loss": 1.211, + "step": 5928 + }, + { + "epoch": 0.519924530266679, + "grad_norm": 0.0615234375, + "learning_rate": 0.0017609688962819069, + "loss": 1.161, + "step": 5929 + }, + { + "epoch": 0.5200122220410535, + "grad_norm": 0.05615234375, + "learning_rate": 0.0017605625855761833, + "loss": 1.1306, + "step": 5930 + }, + { + "epoch": 0.520099913815428, + "grad_norm": 0.053466796875, + "learning_rate": 0.0017601562647874378, + "loss": 1.134, + "step": 5931 + }, + { + "epoch": 0.5201876055898026, + "grad_norm": 0.07373046875, + "learning_rate": 0.0017597499339527252, + "loss": 1.163, + "step": 5932 + }, + { + "epoch": 0.5202752973641771, + "grad_norm": 0.0673828125, + "learning_rate": 0.001759343593109102, + "loss": 1.1614, + "step": 5933 + }, + { + "epoch": 0.5203629891385516, + "grad_norm": 0.048095703125, + "learning_rate": 0.0017589372422936252, + "loss": 1.1535, + "step": 5934 + }, + { + "epoch": 0.5204506809129262, + "grad_norm": 0.1259765625, + "learning_rate": 0.0017585308815433536, + "loss": 1.1652, + "step": 5935 + }, + { + "epoch": 0.5205383726873007, + "grad_norm": 0.055908203125, + "learning_rate": 0.0017581245108953454, + "loss": 1.1668, + "step": 5936 + }, + { + "epoch": 0.5206260644616753, + "grad_norm": 0.0654296875, + "learning_rate": 0.001757718130386661, + "loss": 1.1596, + "step": 5937 + }, + { + "epoch": 0.5207137562360498, + "grad_norm": 0.07421875, + "learning_rate": 0.001757311740054361, + "loss": 1.1838, + "step": 5938 + }, + { + "epoch": 0.5208014480104244, + "grad_norm": 0.08349609375, + "learning_rate": 0.0017569053399355075, + "loss": 1.25, + "step": 5939 + }, + { + "epoch": 0.5208891397847989, + "grad_norm": 0.05419921875, + "learning_rate": 0.001756498930067163, + "loss": 1.1483, + "step": 5940 + }, + { + "epoch": 0.5209768315591734, + "grad_norm": 0.06494140625, + "learning_rate": 0.001756092510486391, + "loss": 1.1425, + "step": 5941 + }, + { + "epoch": 0.521064523333548, + "grad_norm": 0.04736328125, + "learning_rate": 0.0017556860812302558, + "loss": 1.1259, + "step": 5942 + }, + { + "epoch": 0.5211522151079225, + "grad_norm": 0.068359375, + "learning_rate": 0.0017552796423358228, + "loss": 1.2174, + "step": 5943 + }, + { + "epoch": 0.521239906882297, + "grad_norm": 0.056396484375, + "learning_rate": 0.0017548731938401587, + "loss": 1.1255, + "step": 5944 + }, + { + "epoch": 0.5213275986566717, + "grad_norm": 0.048095703125, + "learning_rate": 0.0017544667357803296, + "loss": 1.2208, + "step": 5945 + }, + { + "epoch": 0.5214152904310462, + "grad_norm": 0.04541015625, + "learning_rate": 0.0017540602681934037, + "loss": 1.1505, + "step": 5946 + }, + { + "epoch": 0.5215029822054207, + "grad_norm": 0.052490234375, + "learning_rate": 0.0017536537911164505, + "loss": 1.2064, + "step": 5947 + }, + { + "epoch": 0.5215906739797953, + "grad_norm": 0.05126953125, + "learning_rate": 0.0017532473045865386, + "loss": 1.1832, + "step": 5948 + }, + { + "epoch": 0.5216783657541698, + "grad_norm": 0.06640625, + "learning_rate": 0.0017528408086407395, + "loss": 1.1826, + "step": 5949 + }, + { + "epoch": 0.5217660575285443, + "grad_norm": 0.060546875, + "learning_rate": 0.001752434303316124, + "loss": 1.1855, + "step": 5950 + }, + { + "epoch": 0.521853749302919, + "grad_norm": 0.049072265625, + "learning_rate": 0.0017520277886497642, + "loss": 1.1448, + "step": 5951 + }, + { + "epoch": 0.5219414410772935, + "grad_norm": 0.05859375, + "learning_rate": 0.0017516212646787339, + "loss": 1.1677, + "step": 5952 + }, + { + "epoch": 0.522029132851668, + "grad_norm": 0.07373046875, + "learning_rate": 0.001751214731440106, + "loss": 1.1793, + "step": 5953 + }, + { + "epoch": 0.5221168246260426, + "grad_norm": 0.091796875, + "learning_rate": 0.0017508081889709565, + "loss": 1.1917, + "step": 5954 + }, + { + "epoch": 0.5222045164004171, + "grad_norm": 0.060791015625, + "learning_rate": 0.00175040163730836, + "loss": 1.2077, + "step": 5955 + }, + { + "epoch": 0.5222922081747916, + "grad_norm": 0.12353515625, + "learning_rate": 0.0017499950764893937, + "loss": 1.1763, + "step": 5956 + }, + { + "epoch": 0.5223798999491662, + "grad_norm": 0.0478515625, + "learning_rate": 0.0017495885065511342, + "loss": 1.1958, + "step": 5957 + }, + { + "epoch": 0.5224675917235407, + "grad_norm": 0.072265625, + "learning_rate": 0.0017491819275306602, + "loss": 1.1691, + "step": 5958 + }, + { + "epoch": 0.5225552834979152, + "grad_norm": 0.1240234375, + "learning_rate": 0.0017487753394650498, + "loss": 1.1708, + "step": 5959 + }, + { + "epoch": 0.5226429752722898, + "grad_norm": 0.0498046875, + "learning_rate": 0.0017483687423913842, + "loss": 1.1886, + "step": 5960 + }, + { + "epoch": 0.5227306670466644, + "grad_norm": 0.109375, + "learning_rate": 0.0017479621363467432, + "loss": 1.1713, + "step": 5961 + }, + { + "epoch": 0.5228183588210389, + "grad_norm": 0.0947265625, + "learning_rate": 0.0017475555213682084, + "loss": 1.1956, + "step": 5962 + }, + { + "epoch": 0.5229060505954134, + "grad_norm": 0.050048828125, + "learning_rate": 0.0017471488974928613, + "loss": 1.1315, + "step": 5963 + }, + { + "epoch": 0.522993742369788, + "grad_norm": 0.11083984375, + "learning_rate": 0.0017467422647577861, + "loss": 1.1548, + "step": 5964 + }, + { + "epoch": 0.5230814341441625, + "grad_norm": 0.064453125, + "learning_rate": 0.0017463356232000657, + "loss": 1.1998, + "step": 5965 + }, + { + "epoch": 0.523169125918537, + "grad_norm": 0.06396484375, + "learning_rate": 0.0017459289728567853, + "loss": 1.1694, + "step": 5966 + }, + { + "epoch": 0.5232568176929117, + "grad_norm": 0.06640625, + "learning_rate": 0.00174552231376503, + "loss": 1.1099, + "step": 5967 + }, + { + "epoch": 0.5233445094672862, + "grad_norm": 0.056640625, + "learning_rate": 0.0017451156459618864, + "loss": 1.1725, + "step": 5968 + }, + { + "epoch": 0.5234322012416607, + "grad_norm": 0.056884765625, + "learning_rate": 0.001744708969484442, + "loss": 1.163, + "step": 5969 + }, + { + "epoch": 0.5235198930160353, + "grad_norm": 0.08056640625, + "learning_rate": 0.001744302284369783, + "loss": 1.1783, + "step": 5970 + }, + { + "epoch": 0.5236075847904098, + "grad_norm": 0.058837890625, + "learning_rate": 0.001743895590655, + "loss": 1.1826, + "step": 5971 + }, + { + "epoch": 0.5236952765647843, + "grad_norm": 0.0556640625, + "learning_rate": 0.001743488888377181, + "loss": 1.2116, + "step": 5972 + }, + { + "epoch": 0.5237829683391589, + "grad_norm": 0.052001953125, + "learning_rate": 0.001743082177573417, + "loss": 1.1832, + "step": 5973 + }, + { + "epoch": 0.5238706601135334, + "grad_norm": 0.062255859375, + "learning_rate": 0.0017426754582807987, + "loss": 1.1156, + "step": 5974 + }, + { + "epoch": 0.523958351887908, + "grad_norm": 0.05224609375, + "learning_rate": 0.0017422687305364175, + "loss": 1.1881, + "step": 5975 + }, + { + "epoch": 0.5240460436622826, + "grad_norm": 0.047119140625, + "learning_rate": 0.0017418619943773664, + "loss": 1.1664, + "step": 5976 + }, + { + "epoch": 0.5241337354366571, + "grad_norm": 0.10400390625, + "learning_rate": 0.0017414552498407389, + "loss": 1.2041, + "step": 5977 + }, + { + "epoch": 0.5242214272110316, + "grad_norm": 0.06982421875, + "learning_rate": 0.0017410484969636287, + "loss": 1.1877, + "step": 5978 + }, + { + "epoch": 0.5243091189854062, + "grad_norm": 0.05859375, + "learning_rate": 0.0017406417357831304, + "loss": 1.087, + "step": 5979 + }, + { + "epoch": 0.5243968107597807, + "grad_norm": 0.09521484375, + "learning_rate": 0.0017402349663363395, + "loss": 1.1674, + "step": 5980 + }, + { + "epoch": 0.5244845025341552, + "grad_norm": 0.054931640625, + "learning_rate": 0.0017398281886603529, + "loss": 1.1599, + "step": 5981 + }, + { + "epoch": 0.5245721943085299, + "grad_norm": 0.050048828125, + "learning_rate": 0.0017394214027922675, + "loss": 1.2198, + "step": 5982 + }, + { + "epoch": 0.5246598860829044, + "grad_norm": 0.09912109375, + "learning_rate": 0.0017390146087691808, + "loss": 1.1682, + "step": 5983 + }, + { + "epoch": 0.5247475778572789, + "grad_norm": 0.058837890625, + "learning_rate": 0.0017386078066281916, + "loss": 1.2219, + "step": 5984 + }, + { + "epoch": 0.5248352696316534, + "grad_norm": 0.05615234375, + "learning_rate": 0.001738200996406399, + "loss": 1.1836, + "step": 5985 + }, + { + "epoch": 0.524922961406028, + "grad_norm": 0.059814453125, + "learning_rate": 0.001737794178140904, + "loss": 1.1525, + "step": 5986 + }, + { + "epoch": 0.5250106531804025, + "grad_norm": 0.054931640625, + "learning_rate": 0.0017373873518688058, + "loss": 1.189, + "step": 5987 + }, + { + "epoch": 0.525098344954777, + "grad_norm": 0.07958984375, + "learning_rate": 0.0017369805176272069, + "loss": 1.2234, + "step": 5988 + }, + { + "epoch": 0.5251860367291517, + "grad_norm": 0.07080078125, + "learning_rate": 0.0017365736754532096, + "loss": 1.1839, + "step": 5989 + }, + { + "epoch": 0.5252737285035262, + "grad_norm": 0.0546875, + "learning_rate": 0.001736166825383916, + "loss": 1.1544, + "step": 5990 + }, + { + "epoch": 0.5253614202779007, + "grad_norm": 0.10546875, + "learning_rate": 0.0017357599674564309, + "loss": 1.1166, + "step": 5991 + }, + { + "epoch": 0.5254491120522753, + "grad_norm": 0.08251953125, + "learning_rate": 0.001735353101707858, + "loss": 1.1344, + "step": 5992 + }, + { + "epoch": 0.5255368038266498, + "grad_norm": 0.061279296875, + "learning_rate": 0.0017349462281753022, + "loss": 1.1934, + "step": 5993 + }, + { + "epoch": 0.5256244956010243, + "grad_norm": 0.10009765625, + "learning_rate": 0.0017345393468958703, + "loss": 1.1764, + "step": 5994 + }, + { + "epoch": 0.5257121873753989, + "grad_norm": 0.083984375, + "learning_rate": 0.0017341324579066683, + "loss": 1.1373, + "step": 5995 + }, + { + "epoch": 0.5257998791497734, + "grad_norm": 0.050048828125, + "learning_rate": 0.0017337255612448023, + "loss": 1.156, + "step": 5996 + }, + { + "epoch": 0.525887570924148, + "grad_norm": 0.0556640625, + "learning_rate": 0.0017333186569473823, + "loss": 1.1707, + "step": 5997 + }, + { + "epoch": 0.5259752626985226, + "grad_norm": 0.07421875, + "learning_rate": 0.0017329117450515154, + "loss": 1.1315, + "step": 5998 + }, + { + "epoch": 0.5260629544728971, + "grad_norm": 0.04638671875, + "learning_rate": 0.0017325048255943112, + "loss": 1.1613, + "step": 5999 + }, + { + "epoch": 0.5261506462472716, + "grad_norm": 0.049072265625, + "learning_rate": 0.0017320978986128802, + "loss": 1.1743, + "step": 6000 + }, + { + "epoch": 0.5261506462472716, + "eval_loss": 1.1750010251998901, + "eval_runtime": 428.5284, + "eval_samples_per_second": 33.713, + "eval_steps_per_second": 8.429, + "step": 6000 + }, + { + "epoch": 0.5262383380216462, + "grad_norm": 0.0712890625, + "learning_rate": 0.0017316909641443329, + "loss": 1.1835, + "step": 6001 + }, + { + "epoch": 0.5263260297960207, + "grad_norm": 0.0673828125, + "learning_rate": 0.0017312840222257803, + "loss": 1.1314, + "step": 6002 + }, + { + "epoch": 0.5264137215703952, + "grad_norm": 0.0654296875, + "learning_rate": 0.001730877072894335, + "loss": 1.1287, + "step": 6003 + }, + { + "epoch": 0.5265014133447699, + "grad_norm": 0.047119140625, + "learning_rate": 0.001730470116187109, + "loss": 1.1468, + "step": 6004 + }, + { + "epoch": 0.5265891051191444, + "grad_norm": 0.1201171875, + "learning_rate": 0.0017300631521412165, + "loss": 1.1755, + "step": 6005 + }, + { + "epoch": 0.5266767968935189, + "grad_norm": 0.060302734375, + "learning_rate": 0.0017296561807937708, + "loss": 1.1622, + "step": 6006 + }, + { + "epoch": 0.5267644886678935, + "grad_norm": 0.0458984375, + "learning_rate": 0.001729249202181887, + "loss": 1.2125, + "step": 6007 + }, + { + "epoch": 0.526852180442268, + "grad_norm": 0.07470703125, + "learning_rate": 0.0017288422163426814, + "loss": 1.1493, + "step": 6008 + }, + { + "epoch": 0.5269398722166425, + "grad_norm": 0.05126953125, + "learning_rate": 0.0017284352233132682, + "loss": 1.2026, + "step": 6009 + }, + { + "epoch": 0.527027563991017, + "grad_norm": 0.057373046875, + "learning_rate": 0.001728028223130766, + "loss": 1.1812, + "step": 6010 + }, + { + "epoch": 0.5271152557653916, + "grad_norm": 0.0703125, + "learning_rate": 0.001727621215832291, + "loss": 1.1587, + "step": 6011 + }, + { + "epoch": 0.5272029475397662, + "grad_norm": 0.05419921875, + "learning_rate": 0.001727214201454961, + "loss": 1.1317, + "step": 6012 + }, + { + "epoch": 0.5272906393141407, + "grad_norm": 0.0546875, + "learning_rate": 0.0017268071800358956, + "loss": 1.1448, + "step": 6013 + }, + { + "epoch": 0.5273783310885153, + "grad_norm": 0.055908203125, + "learning_rate": 0.0017264001516122136, + "loss": 1.2169, + "step": 6014 + }, + { + "epoch": 0.5274660228628898, + "grad_norm": 0.055908203125, + "learning_rate": 0.0017259931162210349, + "loss": 1.1486, + "step": 6015 + }, + { + "epoch": 0.5275537146372643, + "grad_norm": 0.050537109375, + "learning_rate": 0.0017255860738994802, + "loss": 1.1254, + "step": 6016 + }, + { + "epoch": 0.5276414064116389, + "grad_norm": 0.059814453125, + "learning_rate": 0.0017251790246846706, + "loss": 1.1978, + "step": 6017 + }, + { + "epoch": 0.5277290981860134, + "grad_norm": 0.049072265625, + "learning_rate": 0.001724771968613728, + "loss": 1.1444, + "step": 6018 + }, + { + "epoch": 0.5278167899603879, + "grad_norm": 0.054443359375, + "learning_rate": 0.0017243649057237752, + "loss": 1.2183, + "step": 6019 + }, + { + "epoch": 0.5279044817347626, + "grad_norm": 0.052001953125, + "learning_rate": 0.001723957836051935, + "loss": 1.1968, + "step": 6020 + }, + { + "epoch": 0.5279921735091371, + "grad_norm": 0.05615234375, + "learning_rate": 0.0017235507596353307, + "loss": 1.1243, + "step": 6021 + }, + { + "epoch": 0.5280798652835116, + "grad_norm": 0.07080078125, + "learning_rate": 0.0017231436765110868, + "loss": 1.1691, + "step": 6022 + }, + { + "epoch": 0.5281675570578862, + "grad_norm": 0.055908203125, + "learning_rate": 0.001722736586716329, + "loss": 1.1984, + "step": 6023 + }, + { + "epoch": 0.5282552488322607, + "grad_norm": 0.083984375, + "learning_rate": 0.0017223294902881819, + "loss": 1.11, + "step": 6024 + }, + { + "epoch": 0.5283429406066352, + "grad_norm": 0.0625, + "learning_rate": 0.0017219223872637721, + "loss": 1.1576, + "step": 6025 + }, + { + "epoch": 0.5284306323810098, + "grad_norm": 0.103515625, + "learning_rate": 0.0017215152776802265, + "loss": 1.1859, + "step": 6026 + }, + { + "epoch": 0.5285183241553844, + "grad_norm": 0.058837890625, + "learning_rate": 0.0017211081615746717, + "loss": 1.1828, + "step": 6027 + }, + { + "epoch": 0.5286060159297589, + "grad_norm": 0.06591796875, + "learning_rate": 0.0017207010389842367, + "loss": 1.1305, + "step": 6028 + }, + { + "epoch": 0.5286937077041335, + "grad_norm": 0.052978515625, + "learning_rate": 0.0017202939099460492, + "loss": 1.1895, + "step": 6029 + }, + { + "epoch": 0.528781399478508, + "grad_norm": 0.0693359375, + "learning_rate": 0.0017198867744972386, + "loss": 1.1717, + "step": 6030 + }, + { + "epoch": 0.5288690912528825, + "grad_norm": 0.061279296875, + "learning_rate": 0.0017194796326749342, + "loss": 1.2066, + "step": 6031 + }, + { + "epoch": 0.528956783027257, + "grad_norm": 0.0771484375, + "learning_rate": 0.001719072484516267, + "loss": 1.1947, + "step": 6032 + }, + { + "epoch": 0.5290444748016316, + "grad_norm": 0.06640625, + "learning_rate": 0.0017186653300583677, + "loss": 1.1564, + "step": 6033 + }, + { + "epoch": 0.5291321665760061, + "grad_norm": 0.06298828125, + "learning_rate": 0.0017182581693383672, + "loss": 1.1513, + "step": 6034 + }, + { + "epoch": 0.5292198583503807, + "grad_norm": 0.0654296875, + "learning_rate": 0.001717851002393398, + "loss": 1.1876, + "step": 6035 + }, + { + "epoch": 0.5293075501247553, + "grad_norm": 0.08447265625, + "learning_rate": 0.0017174438292605922, + "loss": 1.1827, + "step": 6036 + }, + { + "epoch": 0.5293952418991298, + "grad_norm": 0.0673828125, + "learning_rate": 0.0017170366499770842, + "loss": 1.2665, + "step": 6037 + }, + { + "epoch": 0.5294829336735043, + "grad_norm": 0.07421875, + "learning_rate": 0.0017166294645800063, + "loss": 1.1667, + "step": 6038 + }, + { + "epoch": 0.5295706254478789, + "grad_norm": 0.05859375, + "learning_rate": 0.001716222273106493, + "loss": 1.1468, + "step": 6039 + }, + { + "epoch": 0.5296583172222534, + "grad_norm": 0.0673828125, + "learning_rate": 0.0017158150755936794, + "loss": 1.1207, + "step": 6040 + }, + { + "epoch": 0.5297460089966279, + "grad_norm": 0.06689453125, + "learning_rate": 0.0017154078720787004, + "loss": 1.1247, + "step": 6041 + }, + { + "epoch": 0.5298337007710026, + "grad_norm": 0.0498046875, + "learning_rate": 0.0017150006625986933, + "loss": 1.113, + "step": 6042 + }, + { + "epoch": 0.5299213925453771, + "grad_norm": 0.10009765625, + "learning_rate": 0.0017145934471907932, + "loss": 1.1456, + "step": 6043 + }, + { + "epoch": 0.5300090843197516, + "grad_norm": 0.06640625, + "learning_rate": 0.0017141862258921372, + "loss": 1.1194, + "step": 6044 + }, + { + "epoch": 0.5300967760941262, + "grad_norm": 0.053466796875, + "learning_rate": 0.0017137789987398632, + "loss": 1.1798, + "step": 6045 + }, + { + "epoch": 0.5301844678685007, + "grad_norm": 0.078125, + "learning_rate": 0.0017133717657711097, + "loss": 1.1831, + "step": 6046 + }, + { + "epoch": 0.5302721596428752, + "grad_norm": 0.059814453125, + "learning_rate": 0.0017129645270230138, + "loss": 1.1408, + "step": 6047 + }, + { + "epoch": 0.5303598514172498, + "grad_norm": 0.0517578125, + "learning_rate": 0.0017125572825327162, + "loss": 1.1667, + "step": 6048 + }, + { + "epoch": 0.5304475431916243, + "grad_norm": 0.046875, + "learning_rate": 0.0017121500323373558, + "loss": 1.1412, + "step": 6049 + }, + { + "epoch": 0.5305352349659989, + "grad_norm": 0.05615234375, + "learning_rate": 0.001711742776474073, + "loss": 1.0961, + "step": 6050 + }, + { + "epoch": 0.5306229267403735, + "grad_norm": 0.0927734375, + "learning_rate": 0.0017113355149800077, + "loss": 1.1647, + "step": 6051 + }, + { + "epoch": 0.530710618514748, + "grad_norm": 0.061767578125, + "learning_rate": 0.0017109282478923024, + "loss": 1.1693, + "step": 6052 + }, + { + "epoch": 0.5307983102891225, + "grad_norm": 0.08740234375, + "learning_rate": 0.0017105209752480979, + "loss": 1.1893, + "step": 6053 + }, + { + "epoch": 0.5308860020634971, + "grad_norm": 0.0654296875, + "learning_rate": 0.001710113697084537, + "loss": 1.1365, + "step": 6054 + }, + { + "epoch": 0.5309736938378716, + "grad_norm": 0.049560546875, + "learning_rate": 0.001709706413438762, + "loss": 1.1853, + "step": 6055 + }, + { + "epoch": 0.5310613856122461, + "grad_norm": 0.064453125, + "learning_rate": 0.0017092991243479161, + "loss": 1.1712, + "step": 6056 + }, + { + "epoch": 0.5311490773866206, + "grad_norm": 0.047607421875, + "learning_rate": 0.0017088918298491435, + "loss": 1.1231, + "step": 6057 + }, + { + "epoch": 0.5312367691609953, + "grad_norm": 0.05859375, + "learning_rate": 0.0017084845299795875, + "loss": 1.2111, + "step": 6058 + }, + { + "epoch": 0.5313244609353698, + "grad_norm": 0.09765625, + "learning_rate": 0.0017080772247763938, + "loss": 1.205, + "step": 6059 + }, + { + "epoch": 0.5314121527097443, + "grad_norm": 0.059814453125, + "learning_rate": 0.0017076699142767068, + "loss": 1.1468, + "step": 6060 + }, + { + "epoch": 0.5314998444841189, + "grad_norm": 0.09375, + "learning_rate": 0.0017072625985176727, + "loss": 1.0453, + "step": 6061 + }, + { + "epoch": 0.5315875362584934, + "grad_norm": 0.0693359375, + "learning_rate": 0.0017068552775364374, + "loss": 1.223, + "step": 6062 + }, + { + "epoch": 0.5316752280328679, + "grad_norm": 0.051513671875, + "learning_rate": 0.0017064479513701474, + "loss": 1.1231, + "step": 6063 + }, + { + "epoch": 0.5317629198072426, + "grad_norm": 0.07861328125, + "learning_rate": 0.0017060406200559502, + "loss": 1.1232, + "step": 6064 + }, + { + "epoch": 0.5318506115816171, + "grad_norm": 0.06103515625, + "learning_rate": 0.001705633283630993, + "loss": 1.1985, + "step": 6065 + }, + { + "epoch": 0.5319383033559916, + "grad_norm": 0.0654296875, + "learning_rate": 0.001705225942132424, + "loss": 1.1715, + "step": 6066 + }, + { + "epoch": 0.5320259951303662, + "grad_norm": 0.052001953125, + "learning_rate": 0.0017048185955973914, + "loss": 1.236, + "step": 6067 + }, + { + "epoch": 0.5321136869047407, + "grad_norm": 0.058349609375, + "learning_rate": 0.0017044112440630448, + "loss": 1.0972, + "step": 6068 + }, + { + "epoch": 0.5322013786791152, + "grad_norm": 0.0537109375, + "learning_rate": 0.0017040038875665328, + "loss": 1.1446, + "step": 6069 + }, + { + "epoch": 0.5322890704534898, + "grad_norm": 0.050048828125, + "learning_rate": 0.001703596526145006, + "loss": 1.1626, + "step": 6070 + }, + { + "epoch": 0.5323767622278643, + "grad_norm": 0.05712890625, + "learning_rate": 0.0017031891598356145, + "loss": 1.1777, + "step": 6071 + }, + { + "epoch": 0.5324644540022389, + "grad_norm": 0.0517578125, + "learning_rate": 0.0017027817886755086, + "loss": 1.1236, + "step": 6072 + }, + { + "epoch": 0.5325521457766135, + "grad_norm": 0.0556640625, + "learning_rate": 0.0017023744127018397, + "loss": 1.1375, + "step": 6073 + }, + { + "epoch": 0.532639837550988, + "grad_norm": 0.04931640625, + "learning_rate": 0.0017019670319517598, + "loss": 1.1734, + "step": 6074 + }, + { + "epoch": 0.5327275293253625, + "grad_norm": 0.052001953125, + "learning_rate": 0.0017015596464624208, + "loss": 1.2084, + "step": 6075 + }, + { + "epoch": 0.5328152210997371, + "grad_norm": 0.05029296875, + "learning_rate": 0.001701152256270975, + "loss": 1.1874, + "step": 6076 + }, + { + "epoch": 0.5329029128741116, + "grad_norm": 0.048828125, + "learning_rate": 0.0017007448614145754, + "loss": 1.1296, + "step": 6077 + }, + { + "epoch": 0.5329906046484861, + "grad_norm": 0.04638671875, + "learning_rate": 0.0017003374619303754, + "loss": 1.1186, + "step": 6078 + }, + { + "epoch": 0.5330782964228606, + "grad_norm": 0.061279296875, + "learning_rate": 0.0016999300578555295, + "loss": 1.1818, + "step": 6079 + }, + { + "epoch": 0.5331659881972353, + "grad_norm": 0.049560546875, + "learning_rate": 0.0016995226492271904, + "loss": 1.179, + "step": 6080 + }, + { + "epoch": 0.5332536799716098, + "grad_norm": 0.046875, + "learning_rate": 0.0016991152360825137, + "loss": 1.2066, + "step": 6081 + }, + { + "epoch": 0.5333413717459843, + "grad_norm": 0.09765625, + "learning_rate": 0.0016987078184586546, + "loss": 1.2245, + "step": 6082 + }, + { + "epoch": 0.5334290635203589, + "grad_norm": 0.07763671875, + "learning_rate": 0.0016983003963927678, + "loss": 1.1402, + "step": 6083 + }, + { + "epoch": 0.5335167552947334, + "grad_norm": 0.05712890625, + "learning_rate": 0.0016978929699220098, + "loss": 1.1463, + "step": 6084 + }, + { + "epoch": 0.5336044470691079, + "grad_norm": 0.10791015625, + "learning_rate": 0.0016974855390835364, + "loss": 1.1086, + "step": 6085 + }, + { + "epoch": 0.5336921388434825, + "grad_norm": 0.0712890625, + "learning_rate": 0.0016970781039145045, + "loss": 1.2318, + "step": 6086 + }, + { + "epoch": 0.533779830617857, + "grad_norm": 0.057861328125, + "learning_rate": 0.0016966706644520708, + "loss": 1.1516, + "step": 6087 + }, + { + "epoch": 0.5338675223922316, + "grad_norm": 0.05419921875, + "learning_rate": 0.0016962632207333937, + "loss": 1.1711, + "step": 6088 + }, + { + "epoch": 0.5339552141666062, + "grad_norm": 0.05126953125, + "learning_rate": 0.0016958557727956296, + "loss": 1.1232, + "step": 6089 + }, + { + "epoch": 0.5340429059409807, + "grad_norm": 0.05419921875, + "learning_rate": 0.001695448320675938, + "loss": 1.2053, + "step": 6090 + }, + { + "epoch": 0.5341305977153552, + "grad_norm": 0.055419921875, + "learning_rate": 0.0016950408644114765, + "loss": 1.2034, + "step": 6091 + }, + { + "epoch": 0.5342182894897298, + "grad_norm": 0.0537109375, + "learning_rate": 0.0016946334040394044, + "loss": 1.1347, + "step": 6092 + }, + { + "epoch": 0.5343059812641043, + "grad_norm": 0.0556640625, + "learning_rate": 0.0016942259395968811, + "loss": 1.1686, + "step": 6093 + }, + { + "epoch": 0.5343936730384788, + "grad_norm": 0.0712890625, + "learning_rate": 0.001693818471121066, + "loss": 1.1984, + "step": 6094 + }, + { + "epoch": 0.5344813648128535, + "grad_norm": 0.05615234375, + "learning_rate": 0.0016934109986491198, + "loss": 1.1266, + "step": 6095 + }, + { + "epoch": 0.534569056587228, + "grad_norm": 0.06787109375, + "learning_rate": 0.001693003522218203, + "loss": 1.116, + "step": 6096 + }, + { + "epoch": 0.5346567483616025, + "grad_norm": 0.0537109375, + "learning_rate": 0.0016925960418654753, + "loss": 1.168, + "step": 6097 + }, + { + "epoch": 0.5347444401359771, + "grad_norm": 0.06884765625, + "learning_rate": 0.0016921885576280985, + "loss": 1.1856, + "step": 6098 + }, + { + "epoch": 0.5348321319103516, + "grad_norm": 0.05078125, + "learning_rate": 0.0016917810695432347, + "loss": 1.1623, + "step": 6099 + }, + { + "epoch": 0.5349198236847261, + "grad_norm": 0.06787109375, + "learning_rate": 0.001691373577648045, + "loss": 1.1376, + "step": 6100 + }, + { + "epoch": 0.5350075154591007, + "grad_norm": 0.051513671875, + "learning_rate": 0.0016909660819796916, + "loss": 1.1783, + "step": 6101 + }, + { + "epoch": 0.5350952072334753, + "grad_norm": 0.05224609375, + "learning_rate": 0.0016905585825753374, + "loss": 1.1521, + "step": 6102 + }, + { + "epoch": 0.5351828990078498, + "grad_norm": 0.046630859375, + "learning_rate": 0.0016901510794721454, + "loss": 1.2314, + "step": 6103 + }, + { + "epoch": 0.5352705907822243, + "grad_norm": 0.05615234375, + "learning_rate": 0.0016897435727072781, + "loss": 1.1634, + "step": 6104 + }, + { + "epoch": 0.5353582825565989, + "grad_norm": 0.0458984375, + "learning_rate": 0.0016893360623179005, + "loss": 1.1481, + "step": 6105 + }, + { + "epoch": 0.5354459743309734, + "grad_norm": 0.054443359375, + "learning_rate": 0.0016889285483411747, + "loss": 1.1689, + "step": 6106 + }, + { + "epoch": 0.5355336661053479, + "grad_norm": 0.048095703125, + "learning_rate": 0.0016885210308142658, + "loss": 1.1625, + "step": 6107 + }, + { + "epoch": 0.5356213578797225, + "grad_norm": 0.05078125, + "learning_rate": 0.0016881135097743388, + "loss": 1.1612, + "step": 6108 + }, + { + "epoch": 0.535709049654097, + "grad_norm": 0.05517578125, + "learning_rate": 0.0016877059852585577, + "loss": 1.158, + "step": 6109 + }, + { + "epoch": 0.5357967414284716, + "grad_norm": 0.052734375, + "learning_rate": 0.0016872984573040883, + "loss": 1.1712, + "step": 6110 + }, + { + "epoch": 0.5358844332028462, + "grad_norm": 0.061279296875, + "learning_rate": 0.0016868909259480955, + "loss": 1.1936, + "step": 6111 + }, + { + "epoch": 0.5359721249772207, + "grad_norm": 0.0546875, + "learning_rate": 0.001686483391227746, + "loss": 1.2165, + "step": 6112 + }, + { + "epoch": 0.5360598167515952, + "grad_norm": 0.06494140625, + "learning_rate": 0.0016860758531802055, + "loss": 1.1493, + "step": 6113 + }, + { + "epoch": 0.5361475085259698, + "grad_norm": 0.08056640625, + "learning_rate": 0.0016856683118426398, + "loss": 1.1423, + "step": 6114 + }, + { + "epoch": 0.5362352003003443, + "grad_norm": 0.07275390625, + "learning_rate": 0.0016852607672522165, + "loss": 1.1809, + "step": 6115 + }, + { + "epoch": 0.5363228920747188, + "grad_norm": 0.07958984375, + "learning_rate": 0.0016848532194461017, + "loss": 1.1543, + "step": 6116 + }, + { + "epoch": 0.5364105838490935, + "grad_norm": 0.047119140625, + "learning_rate": 0.0016844456684614638, + "loss": 1.1536, + "step": 6117 + }, + { + "epoch": 0.536498275623468, + "grad_norm": 0.06396484375, + "learning_rate": 0.0016840381143354696, + "loss": 1.175, + "step": 6118 + }, + { + "epoch": 0.5365859673978425, + "grad_norm": 0.07080078125, + "learning_rate": 0.0016836305571052879, + "loss": 1.1169, + "step": 6119 + }, + { + "epoch": 0.5366736591722171, + "grad_norm": 0.09716796875, + "learning_rate": 0.0016832229968080854, + "loss": 1.1344, + "step": 6120 + }, + { + "epoch": 0.5367613509465916, + "grad_norm": 0.057861328125, + "learning_rate": 0.0016828154334810317, + "loss": 1.1725, + "step": 6121 + }, + { + "epoch": 0.5368490427209661, + "grad_norm": 0.047607421875, + "learning_rate": 0.0016824078671612954, + "loss": 1.1193, + "step": 6122 + }, + { + "epoch": 0.5369367344953407, + "grad_norm": 0.08251953125, + "learning_rate": 0.0016820002978860445, + "loss": 1.1892, + "step": 6123 + }, + { + "epoch": 0.5370244262697152, + "grad_norm": 0.0615234375, + "learning_rate": 0.0016815927256924501, + "loss": 1.1075, + "step": 6124 + }, + { + "epoch": 0.5371121180440898, + "grad_norm": 0.0517578125, + "learning_rate": 0.0016811851506176795, + "loss": 1.1074, + "step": 6125 + }, + { + "epoch": 0.5371998098184644, + "grad_norm": 0.04931640625, + "learning_rate": 0.0016807775726989046, + "loss": 1.1119, + "step": 6126 + }, + { + "epoch": 0.5372875015928389, + "grad_norm": 0.05322265625, + "learning_rate": 0.001680369991973294, + "loss": 1.1076, + "step": 6127 + }, + { + "epoch": 0.5373751933672134, + "grad_norm": 0.058349609375, + "learning_rate": 0.001679962408478019, + "loss": 1.208, + "step": 6128 + }, + { + "epoch": 0.5374628851415879, + "grad_norm": 0.0634765625, + "learning_rate": 0.001679554822250249, + "loss": 1.1426, + "step": 6129 + }, + { + "epoch": 0.5375505769159625, + "grad_norm": 0.045654296875, + "learning_rate": 0.0016791472333271564, + "loss": 1.1756, + "step": 6130 + }, + { + "epoch": 0.537638268690337, + "grad_norm": 0.0712890625, + "learning_rate": 0.0016787396417459108, + "loss": 1.1522, + "step": 6131 + }, + { + "epoch": 0.5377259604647115, + "grad_norm": 0.08544921875, + "learning_rate": 0.001678332047543684, + "loss": 1.125, + "step": 6132 + }, + { + "epoch": 0.5378136522390862, + "grad_norm": 0.05517578125, + "learning_rate": 0.0016779244507576477, + "loss": 1.1209, + "step": 6133 + }, + { + "epoch": 0.5379013440134607, + "grad_norm": 0.07763671875, + "learning_rate": 0.0016775168514249735, + "loss": 1.1574, + "step": 6134 + }, + { + "epoch": 0.5379890357878352, + "grad_norm": 0.056884765625, + "learning_rate": 0.0016771092495828338, + "loss": 1.1463, + "step": 6135 + }, + { + "epoch": 0.5380767275622098, + "grad_norm": 0.056640625, + "learning_rate": 0.0016767016452684, + "loss": 1.1986, + "step": 6136 + }, + { + "epoch": 0.5381644193365843, + "grad_norm": 0.046875, + "learning_rate": 0.001676294038518846, + "loss": 1.2035, + "step": 6137 + }, + { + "epoch": 0.5382521111109588, + "grad_norm": 0.052734375, + "learning_rate": 0.0016758864293713429, + "loss": 1.2038, + "step": 6138 + }, + { + "epoch": 0.5383398028853335, + "grad_norm": 0.04833984375, + "learning_rate": 0.0016754788178630646, + "loss": 1.1651, + "step": 6139 + }, + { + "epoch": 0.538427494659708, + "grad_norm": 0.0458984375, + "learning_rate": 0.0016750712040311832, + "loss": 1.1258, + "step": 6140 + }, + { + "epoch": 0.5385151864340825, + "grad_norm": 0.0634765625, + "learning_rate": 0.0016746635879128734, + "loss": 1.1763, + "step": 6141 + }, + { + "epoch": 0.5386028782084571, + "grad_norm": 0.048828125, + "learning_rate": 0.0016742559695453075, + "loss": 1.1739, + "step": 6142 + }, + { + "epoch": 0.5386905699828316, + "grad_norm": 0.058349609375, + "learning_rate": 0.0016738483489656603, + "loss": 1.1347, + "step": 6143 + }, + { + "epoch": 0.5387782617572061, + "grad_norm": 0.0517578125, + "learning_rate": 0.0016734407262111047, + "loss": 1.13, + "step": 6144 + }, + { + "epoch": 0.5388659535315807, + "grad_norm": 0.057373046875, + "learning_rate": 0.0016730331013188162, + "loss": 1.1444, + "step": 6145 + }, + { + "epoch": 0.5389536453059552, + "grad_norm": 0.058349609375, + "learning_rate": 0.0016726254743259677, + "loss": 1.1331, + "step": 6146 + }, + { + "epoch": 0.5390413370803298, + "grad_norm": 0.051025390625, + "learning_rate": 0.0016722178452697343, + "loss": 1.1253, + "step": 6147 + }, + { + "epoch": 0.5391290288547044, + "grad_norm": 0.05712890625, + "learning_rate": 0.0016718102141872913, + "loss": 1.1512, + "step": 6148 + }, + { + "epoch": 0.5392167206290789, + "grad_norm": 0.0732421875, + "learning_rate": 0.0016714025811158126, + "loss": 1.087, + "step": 6149 + }, + { + "epoch": 0.5393044124034534, + "grad_norm": 0.0556640625, + "learning_rate": 0.001670994946092474, + "loss": 1.1883, + "step": 6150 + }, + { + "epoch": 0.5393921041778279, + "grad_norm": 0.057373046875, + "learning_rate": 0.0016705873091544502, + "loss": 1.0774, + "step": 6151 + }, + { + "epoch": 0.5394797959522025, + "grad_norm": 0.056396484375, + "learning_rate": 0.0016701796703389173, + "loss": 1.1376, + "step": 6152 + }, + { + "epoch": 0.539567487726577, + "grad_norm": 0.051513671875, + "learning_rate": 0.00166977202968305, + "loss": 1.1503, + "step": 6153 + }, + { + "epoch": 0.5396551795009515, + "grad_norm": 0.052001953125, + "learning_rate": 0.0016693643872240252, + "loss": 1.1777, + "step": 6154 + }, + { + "epoch": 0.5397428712753262, + "grad_norm": 0.047607421875, + "learning_rate": 0.0016689567429990185, + "loss": 1.2055, + "step": 6155 + }, + { + "epoch": 0.5398305630497007, + "grad_norm": 0.051025390625, + "learning_rate": 0.0016685490970452053, + "loss": 1.171, + "step": 6156 + }, + { + "epoch": 0.5399182548240752, + "grad_norm": 0.04833984375, + "learning_rate": 0.0016681414493997624, + "loss": 1.1589, + "step": 6157 + }, + { + "epoch": 0.5400059465984498, + "grad_norm": 0.06103515625, + "learning_rate": 0.001667733800099866, + "loss": 1.1231, + "step": 6158 + }, + { + "epoch": 0.5400936383728243, + "grad_norm": 0.049560546875, + "learning_rate": 0.0016673261491826931, + "loss": 1.1593, + "step": 6159 + }, + { + "epoch": 0.5401813301471988, + "grad_norm": 0.0517578125, + "learning_rate": 0.0016669184966854197, + "loss": 1.1881, + "step": 6160 + }, + { + "epoch": 0.5402690219215734, + "grad_norm": 0.06591796875, + "learning_rate": 0.0016665108426452237, + "loss": 1.1642, + "step": 6161 + }, + { + "epoch": 0.540356713695948, + "grad_norm": 0.049072265625, + "learning_rate": 0.0016661031870992809, + "loss": 1.1686, + "step": 6162 + }, + { + "epoch": 0.5404444054703225, + "grad_norm": 0.06591796875, + "learning_rate": 0.0016656955300847695, + "loss": 1.1913, + "step": 6163 + }, + { + "epoch": 0.5405320972446971, + "grad_norm": 0.046875, + "learning_rate": 0.001665287871638866, + "loss": 1.1588, + "step": 6164 + }, + { + "epoch": 0.5406197890190716, + "grad_norm": 0.048095703125, + "learning_rate": 0.001664880211798748, + "loss": 1.1768, + "step": 6165 + }, + { + "epoch": 0.5407074807934461, + "grad_norm": 0.050537109375, + "learning_rate": 0.001664472550601593, + "loss": 1.2207, + "step": 6166 + }, + { + "epoch": 0.5407951725678207, + "grad_norm": 0.064453125, + "learning_rate": 0.0016640648880845792, + "loss": 1.1983, + "step": 6167 + }, + { + "epoch": 0.5408828643421952, + "grad_norm": 0.06884765625, + "learning_rate": 0.0016636572242848837, + "loss": 1.1084, + "step": 6168 + }, + { + "epoch": 0.5409705561165697, + "grad_norm": 0.05322265625, + "learning_rate": 0.0016632495592396842, + "loss": 1.1205, + "step": 6169 + }, + { + "epoch": 0.5410582478909444, + "grad_norm": 0.0615234375, + "learning_rate": 0.0016628418929861596, + "loss": 1.2338, + "step": 6170 + }, + { + "epoch": 0.5411459396653189, + "grad_norm": 0.091796875, + "learning_rate": 0.001662434225561487, + "loss": 1.1588, + "step": 6171 + }, + { + "epoch": 0.5412336314396934, + "grad_norm": 0.0556640625, + "learning_rate": 0.0016620265570028462, + "loss": 1.182, + "step": 6172 + }, + { + "epoch": 0.541321323214068, + "grad_norm": 0.0537109375, + "learning_rate": 0.0016616188873474133, + "loss": 1.1188, + "step": 6173 + }, + { + "epoch": 0.5414090149884425, + "grad_norm": 0.0986328125, + "learning_rate": 0.0016612112166323688, + "loss": 1.1278, + "step": 6174 + }, + { + "epoch": 0.541496706762817, + "grad_norm": 0.06689453125, + "learning_rate": 0.0016608035448948896, + "loss": 1.188, + "step": 6175 + }, + { + "epoch": 0.5415843985371915, + "grad_norm": 0.05859375, + "learning_rate": 0.001660395872172155, + "loss": 1.1424, + "step": 6176 + }, + { + "epoch": 0.5416720903115662, + "grad_norm": 0.09228515625, + "learning_rate": 0.001659988198501344, + "loss": 1.1199, + "step": 6177 + }, + { + "epoch": 0.5417597820859407, + "grad_norm": 0.08740234375, + "learning_rate": 0.0016595805239196346, + "loss": 1.167, + "step": 6178 + }, + { + "epoch": 0.5418474738603152, + "grad_norm": 0.058837890625, + "learning_rate": 0.0016591728484642069, + "loss": 1.2168, + "step": 6179 + }, + { + "epoch": 0.5419351656346898, + "grad_norm": 0.0791015625, + "learning_rate": 0.0016587651721722385, + "loss": 1.1293, + "step": 6180 + }, + { + "epoch": 0.5420228574090643, + "grad_norm": 0.0634765625, + "learning_rate": 0.0016583574950809095, + "loss": 1.1482, + "step": 6181 + }, + { + "epoch": 0.5421105491834388, + "grad_norm": 0.0556640625, + "learning_rate": 0.0016579498172273983, + "loss": 1.1544, + "step": 6182 + }, + { + "epoch": 0.5421982409578134, + "grad_norm": 0.0703125, + "learning_rate": 0.0016575421386488842, + "loss": 1.1129, + "step": 6183 + }, + { + "epoch": 0.542285932732188, + "grad_norm": 0.0693359375, + "learning_rate": 0.0016571344593825465, + "loss": 1.1664, + "step": 6184 + }, + { + "epoch": 0.5423736245065625, + "grad_norm": 0.06689453125, + "learning_rate": 0.0016567267794655649, + "loss": 1.1701, + "step": 6185 + }, + { + "epoch": 0.5424613162809371, + "grad_norm": 0.07763671875, + "learning_rate": 0.0016563190989351175, + "loss": 1.1784, + "step": 6186 + }, + { + "epoch": 0.5425490080553116, + "grad_norm": 0.049560546875, + "learning_rate": 0.0016559114178283853, + "loss": 1.185, + "step": 6187 + }, + { + "epoch": 0.5426366998296861, + "grad_norm": 0.07373046875, + "learning_rate": 0.001655503736182547, + "loss": 1.1976, + "step": 6188 + }, + { + "epoch": 0.5427243916040607, + "grad_norm": 0.08154296875, + "learning_rate": 0.0016550960540347823, + "loss": 1.1419, + "step": 6189 + }, + { + "epoch": 0.5428120833784352, + "grad_norm": 0.0625, + "learning_rate": 0.0016546883714222706, + "loss": 1.2035, + "step": 6190 + }, + { + "epoch": 0.5428997751528097, + "grad_norm": 0.08984375, + "learning_rate": 0.0016542806883821913, + "loss": 1.1298, + "step": 6191 + }, + { + "epoch": 0.5429874669271844, + "grad_norm": 0.0556640625, + "learning_rate": 0.0016538730049517243, + "loss": 1.159, + "step": 6192 + }, + { + "epoch": 0.5430751587015589, + "grad_norm": 0.06689453125, + "learning_rate": 0.001653465321168049, + "loss": 1.1406, + "step": 6193 + }, + { + "epoch": 0.5431628504759334, + "grad_norm": 0.0888671875, + "learning_rate": 0.0016530576370683459, + "loss": 1.2321, + "step": 6194 + }, + { + "epoch": 0.543250542250308, + "grad_norm": 0.045654296875, + "learning_rate": 0.0016526499526897935, + "loss": 1.1539, + "step": 6195 + }, + { + "epoch": 0.5433382340246825, + "grad_norm": 0.052001953125, + "learning_rate": 0.001652242268069573, + "loss": 1.1746, + "step": 6196 + }, + { + "epoch": 0.543425925799057, + "grad_norm": 0.1044921875, + "learning_rate": 0.0016518345832448628, + "loss": 1.1594, + "step": 6197 + }, + { + "epoch": 0.5435136175734316, + "grad_norm": 0.048095703125, + "learning_rate": 0.0016514268982528442, + "loss": 1.1595, + "step": 6198 + }, + { + "epoch": 0.5436013093478061, + "grad_norm": 0.08544921875, + "learning_rate": 0.001651019213130695, + "loss": 1.1401, + "step": 6199 + }, + { + "epoch": 0.5436890011221807, + "grad_norm": 0.09619140625, + "learning_rate": 0.0016506115279155968, + "loss": 1.1848, + "step": 6200 + }, + { + "epoch": 0.5437766928965552, + "grad_norm": 0.050048828125, + "learning_rate": 0.0016502038426447292, + "loss": 1.1586, + "step": 6201 + }, + { + "epoch": 0.5438643846709298, + "grad_norm": 0.12890625, + "learning_rate": 0.0016497961573552712, + "loss": 1.1362, + "step": 6202 + }, + { + "epoch": 0.5439520764453043, + "grad_norm": 0.0966796875, + "learning_rate": 0.0016493884720844034, + "loss": 1.1869, + "step": 6203 + }, + { + "epoch": 0.5440397682196788, + "grad_norm": 0.09619140625, + "learning_rate": 0.001648980786869305, + "loss": 1.1318, + "step": 6204 + }, + { + "epoch": 0.5441274599940534, + "grad_norm": 0.07177734375, + "learning_rate": 0.0016485731017471567, + "loss": 1.2033, + "step": 6205 + }, + { + "epoch": 0.5442151517684279, + "grad_norm": 0.057861328125, + "learning_rate": 0.0016481654167551372, + "loss": 1.1822, + "step": 6206 + }, + { + "epoch": 0.5443028435428025, + "grad_norm": 0.087890625, + "learning_rate": 0.0016477577319304274, + "loss": 1.1775, + "step": 6207 + }, + { + "epoch": 0.5443905353171771, + "grad_norm": 0.08056640625, + "learning_rate": 0.0016473500473102062, + "loss": 1.1539, + "step": 6208 + }, + { + "epoch": 0.5444782270915516, + "grad_norm": 0.056640625, + "learning_rate": 0.0016469423629316545, + "loss": 1.1698, + "step": 6209 + }, + { + "epoch": 0.5445659188659261, + "grad_norm": 0.0888671875, + "learning_rate": 0.0016465346788319516, + "loss": 1.1285, + "step": 6210 + }, + { + "epoch": 0.5446536106403007, + "grad_norm": 0.05419921875, + "learning_rate": 0.0016461269950482763, + "loss": 1.1193, + "step": 6211 + }, + { + "epoch": 0.5447413024146752, + "grad_norm": 0.0517578125, + "learning_rate": 0.0016457193116178096, + "loss": 1.1623, + "step": 6212 + }, + { + "epoch": 0.5448289941890497, + "grad_norm": 0.09375, + "learning_rate": 0.00164531162857773, + "loss": 1.1908, + "step": 6213 + }, + { + "epoch": 0.5449166859634244, + "grad_norm": 0.053955078125, + "learning_rate": 0.0016449039459652183, + "loss": 1.1165, + "step": 6214 + }, + { + "epoch": 0.5450043777377989, + "grad_norm": 0.05810546875, + "learning_rate": 0.0016444962638174534, + "loss": 1.1602, + "step": 6215 + }, + { + "epoch": 0.5450920695121734, + "grad_norm": 0.0986328125, + "learning_rate": 0.0016440885821716147, + "loss": 1.1961, + "step": 6216 + }, + { + "epoch": 0.545179761286548, + "grad_norm": 0.054443359375, + "learning_rate": 0.0016436809010648824, + "loss": 1.1823, + "step": 6217 + }, + { + "epoch": 0.5452674530609225, + "grad_norm": 0.060546875, + "learning_rate": 0.0016432732205344355, + "loss": 1.1681, + "step": 6218 + }, + { + "epoch": 0.545355144835297, + "grad_norm": 0.107421875, + "learning_rate": 0.0016428655406174539, + "loss": 1.1674, + "step": 6219 + }, + { + "epoch": 0.5454428366096716, + "grad_norm": 0.060302734375, + "learning_rate": 0.001642457861351116, + "loss": 1.1003, + "step": 6220 + }, + { + "epoch": 0.5455305283840461, + "grad_norm": 0.046142578125, + "learning_rate": 0.001642050182772602, + "loss": 1.1554, + "step": 6221 + }, + { + "epoch": 0.5456182201584207, + "grad_norm": 0.078125, + "learning_rate": 0.0016416425049190905, + "loss": 1.1374, + "step": 6222 + }, + { + "epoch": 0.5457059119327952, + "grad_norm": 0.07861328125, + "learning_rate": 0.0016412348278277615, + "loss": 1.1634, + "step": 6223 + }, + { + "epoch": 0.5457936037071698, + "grad_norm": 0.048095703125, + "learning_rate": 0.0016408271515357931, + "loss": 1.1173, + "step": 6224 + }, + { + "epoch": 0.5458812954815443, + "grad_norm": 0.0634765625, + "learning_rate": 0.0016404194760803654, + "loss": 1.1473, + "step": 6225 + }, + { + "epoch": 0.5459689872559188, + "grad_norm": 0.0517578125, + "learning_rate": 0.0016400118014986559, + "loss": 1.0985, + "step": 6226 + }, + { + "epoch": 0.5460566790302934, + "grad_norm": 0.05810546875, + "learning_rate": 0.0016396041278278455, + "loss": 1.1385, + "step": 6227 + }, + { + "epoch": 0.5461443708046679, + "grad_norm": 0.0625, + "learning_rate": 0.001639196455105111, + "loss": 1.1643, + "step": 6228 + }, + { + "epoch": 0.5462320625790424, + "grad_norm": 0.04296875, + "learning_rate": 0.0016387887833676318, + "loss": 1.1503, + "step": 6229 + }, + { + "epoch": 0.5463197543534171, + "grad_norm": 0.08740234375, + "learning_rate": 0.001638381112652587, + "loss": 1.1416, + "step": 6230 + }, + { + "epoch": 0.5464074461277916, + "grad_norm": 0.09326171875, + "learning_rate": 0.0016379734429971544, + "loss": 1.1623, + "step": 6231 + }, + { + "epoch": 0.5464951379021661, + "grad_norm": 0.064453125, + "learning_rate": 0.001637565774438513, + "loss": 1.1652, + "step": 6232 + }, + { + "epoch": 0.5465828296765407, + "grad_norm": 0.08984375, + "learning_rate": 0.0016371581070138408, + "loss": 1.1798, + "step": 6233 + }, + { + "epoch": 0.5466705214509152, + "grad_norm": 0.07421875, + "learning_rate": 0.0016367504407603162, + "loss": 1.1315, + "step": 6234 + }, + { + "epoch": 0.5467582132252897, + "grad_norm": 0.055419921875, + "learning_rate": 0.0016363427757151168, + "loss": 1.21, + "step": 6235 + }, + { + "epoch": 0.5468459049996643, + "grad_norm": 0.05859375, + "learning_rate": 0.0016359351119154216, + "loss": 1.1493, + "step": 6236 + }, + { + "epoch": 0.5469335967740389, + "grad_norm": 0.0556640625, + "learning_rate": 0.001635527449398407, + "loss": 1.1037, + "step": 6237 + }, + { + "epoch": 0.5470212885484134, + "grad_norm": 0.05078125, + "learning_rate": 0.0016351197882012523, + "loss": 1.1149, + "step": 6238 + }, + { + "epoch": 0.547108980322788, + "grad_norm": 0.049560546875, + "learning_rate": 0.0016347121283611342, + "loss": 1.2202, + "step": 6239 + }, + { + "epoch": 0.5471966720971625, + "grad_norm": 0.0625, + "learning_rate": 0.0016343044699152311, + "loss": 1.1511, + "step": 6240 + }, + { + "epoch": 0.547284363871537, + "grad_norm": 0.056396484375, + "learning_rate": 0.001633896812900719, + "loss": 1.12, + "step": 6241 + }, + { + "epoch": 0.5473720556459116, + "grad_norm": 0.06494140625, + "learning_rate": 0.0016334891573547762, + "loss": 1.145, + "step": 6242 + }, + { + "epoch": 0.5474597474202861, + "grad_norm": 0.058837890625, + "learning_rate": 0.0016330815033145803, + "loss": 1.2593, + "step": 6243 + }, + { + "epoch": 0.5475474391946606, + "grad_norm": 0.05126953125, + "learning_rate": 0.0016326738508173075, + "loss": 1.2497, + "step": 6244 + }, + { + "epoch": 0.5476351309690353, + "grad_norm": 0.05322265625, + "learning_rate": 0.0016322661999001345, + "loss": 1.2126, + "step": 6245 + }, + { + "epoch": 0.5477228227434098, + "grad_norm": 0.0654296875, + "learning_rate": 0.0016318585506002383, + "loss": 1.1608, + "step": 6246 + }, + { + "epoch": 0.5478105145177843, + "grad_norm": 0.049072265625, + "learning_rate": 0.0016314509029547951, + "loss": 1.1759, + "step": 6247 + }, + { + "epoch": 0.5478982062921588, + "grad_norm": 0.08740234375, + "learning_rate": 0.0016310432570009823, + "loss": 1.1361, + "step": 6248 + }, + { + "epoch": 0.5479858980665334, + "grad_norm": 0.06640625, + "learning_rate": 0.0016306356127759752, + "loss": 1.1059, + "step": 6249 + }, + { + "epoch": 0.5480735898409079, + "grad_norm": 0.06884765625, + "learning_rate": 0.0016302279703169502, + "loss": 1.1806, + "step": 6250 + }, + { + "epoch": 0.5481612816152824, + "grad_norm": 0.06591796875, + "learning_rate": 0.0016298203296610829, + "loss": 1.1273, + "step": 6251 + }, + { + "epoch": 0.5482489733896571, + "grad_norm": 0.048828125, + "learning_rate": 0.0016294126908455503, + "loss": 1.1676, + "step": 6252 + }, + { + "epoch": 0.5483366651640316, + "grad_norm": 0.07470703125, + "learning_rate": 0.0016290050539075262, + "loss": 1.1975, + "step": 6253 + }, + { + "epoch": 0.5484243569384061, + "grad_norm": 0.06591796875, + "learning_rate": 0.0016285974188841878, + "loss": 1.1697, + "step": 6254 + }, + { + "epoch": 0.5485120487127807, + "grad_norm": 0.076171875, + "learning_rate": 0.001628189785812709, + "loss": 1.163, + "step": 6255 + }, + { + "epoch": 0.5485997404871552, + "grad_norm": 0.06201171875, + "learning_rate": 0.0016277821547302656, + "loss": 1.1997, + "step": 6256 + }, + { + "epoch": 0.5486874322615297, + "grad_norm": 0.052734375, + "learning_rate": 0.0016273745256740323, + "loss": 1.1365, + "step": 6257 + }, + { + "epoch": 0.5487751240359043, + "grad_norm": 0.052734375, + "learning_rate": 0.001626966898681184, + "loss": 1.1422, + "step": 6258 + }, + { + "epoch": 0.5488628158102788, + "grad_norm": 0.109375, + "learning_rate": 0.0016265592737888953, + "loss": 1.1426, + "step": 6259 + }, + { + "epoch": 0.5489505075846534, + "grad_norm": 0.064453125, + "learning_rate": 0.0016261516510343397, + "loss": 1.1918, + "step": 6260 + }, + { + "epoch": 0.549038199359028, + "grad_norm": 0.11865234375, + "learning_rate": 0.0016257440304546927, + "loss": 1.1411, + "step": 6261 + }, + { + "epoch": 0.5491258911334025, + "grad_norm": 0.0908203125, + "learning_rate": 0.0016253364120871275, + "loss": 1.1349, + "step": 6262 + }, + { + "epoch": 0.549213582907777, + "grad_norm": 0.0498046875, + "learning_rate": 0.0016249287959688174, + "loss": 1.1885, + "step": 6263 + }, + { + "epoch": 0.5493012746821516, + "grad_norm": 0.05712890625, + "learning_rate": 0.001624521182136936, + "loss": 1.1467, + "step": 6264 + }, + { + "epoch": 0.5493889664565261, + "grad_norm": 0.12109375, + "learning_rate": 0.0016241135706286578, + "loss": 1.1815, + "step": 6265 + }, + { + "epoch": 0.5494766582309006, + "grad_norm": 0.076171875, + "learning_rate": 0.0016237059614811545, + "loss": 1.139, + "step": 6266 + }, + { + "epoch": 0.5495643500052753, + "grad_norm": 0.049560546875, + "learning_rate": 0.0016232983547316002, + "loss": 1.1717, + "step": 6267 + }, + { + "epoch": 0.5496520417796498, + "grad_norm": 0.06787109375, + "learning_rate": 0.0016228907504171664, + "loss": 1.1009, + "step": 6268 + }, + { + "epoch": 0.5497397335540243, + "grad_norm": 0.06787109375, + "learning_rate": 0.0016224831485750267, + "loss": 1.1357, + "step": 6269 + }, + { + "epoch": 0.5498274253283988, + "grad_norm": 0.09814453125, + "learning_rate": 0.0016220755492423527, + "loss": 1.1442, + "step": 6270 + }, + { + "epoch": 0.5499151171027734, + "grad_norm": 0.057373046875, + "learning_rate": 0.001621667952456316, + "loss": 1.1538, + "step": 6271 + }, + { + "epoch": 0.5500028088771479, + "grad_norm": 0.119140625, + "learning_rate": 0.0016212603582540894, + "loss": 1.1771, + "step": 6272 + }, + { + "epoch": 0.5500905006515224, + "grad_norm": 0.051025390625, + "learning_rate": 0.0016208527666728438, + "loss": 1.1882, + "step": 6273 + }, + { + "epoch": 0.550178192425897, + "grad_norm": 0.056640625, + "learning_rate": 0.001620445177749751, + "loss": 1.135, + "step": 6274 + }, + { + "epoch": 0.5502658842002716, + "grad_norm": 0.059814453125, + "learning_rate": 0.001620037591521981, + "loss": 1.1446, + "step": 6275 + }, + { + "epoch": 0.5503535759746461, + "grad_norm": 0.060546875, + "learning_rate": 0.001619630008026706, + "loss": 1.214, + "step": 6276 + }, + { + "epoch": 0.5504412677490207, + "grad_norm": 0.060302734375, + "learning_rate": 0.0016192224273010956, + "loss": 1.1768, + "step": 6277 + }, + { + "epoch": 0.5505289595233952, + "grad_norm": 0.05029296875, + "learning_rate": 0.0016188148493823207, + "loss": 1.2054, + "step": 6278 + }, + { + "epoch": 0.5506166512977697, + "grad_norm": 0.060302734375, + "learning_rate": 0.001618407274307551, + "loss": 1.1467, + "step": 6279 + }, + { + "epoch": 0.5507043430721443, + "grad_norm": 0.0517578125, + "learning_rate": 0.0016179997021139555, + "loss": 1.1381, + "step": 6280 + }, + { + "epoch": 0.5507920348465188, + "grad_norm": 0.05126953125, + "learning_rate": 0.0016175921328387053, + "loss": 1.1963, + "step": 6281 + }, + { + "epoch": 0.5508797266208934, + "grad_norm": 0.0546875, + "learning_rate": 0.0016171845665189687, + "loss": 1.1804, + "step": 6282 + }, + { + "epoch": 0.550967418395268, + "grad_norm": 0.05126953125, + "learning_rate": 0.001616777003191915, + "loss": 1.1913, + "step": 6283 + }, + { + "epoch": 0.5510551101696425, + "grad_norm": 0.04736328125, + "learning_rate": 0.0016163694428947128, + "loss": 1.1054, + "step": 6284 + }, + { + "epoch": 0.551142801944017, + "grad_norm": 0.056396484375, + "learning_rate": 0.0016159618856645306, + "loss": 1.1939, + "step": 6285 + }, + { + "epoch": 0.5512304937183916, + "grad_norm": 0.051513671875, + "learning_rate": 0.0016155543315385362, + "loss": 1.1101, + "step": 6286 + }, + { + "epoch": 0.5513181854927661, + "grad_norm": 0.0517578125, + "learning_rate": 0.0016151467805538982, + "loss": 1.1556, + "step": 6287 + }, + { + "epoch": 0.5514058772671406, + "grad_norm": 0.05126953125, + "learning_rate": 0.001614739232747784, + "loss": 1.1463, + "step": 6288 + }, + { + "epoch": 0.5514935690415153, + "grad_norm": 0.07958984375, + "learning_rate": 0.0016143316881573606, + "loss": 1.1454, + "step": 6289 + }, + { + "epoch": 0.5515812608158898, + "grad_norm": 0.0498046875, + "learning_rate": 0.001613924146819795, + "loss": 1.1797, + "step": 6290 + }, + { + "epoch": 0.5516689525902643, + "grad_norm": 0.08056640625, + "learning_rate": 0.0016135166087722541, + "loss": 1.1276, + "step": 6291 + }, + { + "epoch": 0.5517566443646389, + "grad_norm": 0.04931640625, + "learning_rate": 0.0016131090740519045, + "loss": 1.1592, + "step": 6292 + }, + { + "epoch": 0.5518443361390134, + "grad_norm": 0.0625, + "learning_rate": 0.0016127015426959119, + "loss": 1.1215, + "step": 6293 + }, + { + "epoch": 0.5519320279133879, + "grad_norm": 0.0830078125, + "learning_rate": 0.0016122940147414422, + "loss": 1.1942, + "step": 6294 + }, + { + "epoch": 0.5520197196877624, + "grad_norm": 0.0517578125, + "learning_rate": 0.0016118864902256618, + "loss": 1.255, + "step": 6295 + }, + { + "epoch": 0.552107411462137, + "grad_norm": 0.06005859375, + "learning_rate": 0.0016114789691857347, + "loss": 1.2133, + "step": 6296 + }, + { + "epoch": 0.5521951032365116, + "grad_norm": 0.056884765625, + "learning_rate": 0.0016110714516588257, + "loss": 1.1311, + "step": 6297 + }, + { + "epoch": 0.5522827950108861, + "grad_norm": 0.0478515625, + "learning_rate": 0.0016106639376821004, + "loss": 1.173, + "step": 6298 + }, + { + "epoch": 0.5523704867852607, + "grad_norm": 0.05029296875, + "learning_rate": 0.0016102564272927223, + "loss": 1.1729, + "step": 6299 + }, + { + "epoch": 0.5524581785596352, + "grad_norm": 0.0546875, + "learning_rate": 0.001609848920527855, + "loss": 1.1597, + "step": 6300 + }, + { + "epoch": 0.5525458703340097, + "grad_norm": 0.06591796875, + "learning_rate": 0.0016094414174246628, + "loss": 1.2032, + "step": 6301 + }, + { + "epoch": 0.5526335621083843, + "grad_norm": 0.05908203125, + "learning_rate": 0.0016090339180203084, + "loss": 1.2268, + "step": 6302 + }, + { + "epoch": 0.5527212538827588, + "grad_norm": 0.09423828125, + "learning_rate": 0.0016086264223519554, + "loss": 1.1406, + "step": 6303 + }, + { + "epoch": 0.5528089456571333, + "grad_norm": 0.06396484375, + "learning_rate": 0.0016082189304567657, + "loss": 1.1744, + "step": 6304 + }, + { + "epoch": 0.552896637431508, + "grad_norm": 0.052978515625, + "learning_rate": 0.0016078114423719015, + "loss": 1.1374, + "step": 6305 + }, + { + "epoch": 0.5529843292058825, + "grad_norm": 0.07275390625, + "learning_rate": 0.001607403958134525, + "loss": 1.1721, + "step": 6306 + }, + { + "epoch": 0.553072020980257, + "grad_norm": 0.054931640625, + "learning_rate": 0.0016069964777817975, + "loss": 1.1685, + "step": 6307 + }, + { + "epoch": 0.5531597127546316, + "grad_norm": 0.0556640625, + "learning_rate": 0.0016065890013508802, + "loss": 1.1869, + "step": 6308 + }, + { + "epoch": 0.5532474045290061, + "grad_norm": 0.046142578125, + "learning_rate": 0.001606181528878934, + "loss": 1.1795, + "step": 6309 + }, + { + "epoch": 0.5533350963033806, + "grad_norm": 0.0615234375, + "learning_rate": 0.0016057740604031193, + "loss": 1.2014, + "step": 6310 + }, + { + "epoch": 0.5534227880777552, + "grad_norm": 0.072265625, + "learning_rate": 0.0016053665959605958, + "loss": 1.1581, + "step": 6311 + }, + { + "epoch": 0.5535104798521298, + "grad_norm": 0.050048828125, + "learning_rate": 0.0016049591355885246, + "loss": 1.1228, + "step": 6312 + }, + { + "epoch": 0.5535981716265043, + "grad_norm": 0.07177734375, + "learning_rate": 0.0016045516793240626, + "loss": 1.1614, + "step": 6313 + }, + { + "epoch": 0.5536858634008789, + "grad_norm": 0.05859375, + "learning_rate": 0.001604144227204371, + "loss": 1.1347, + "step": 6314 + }, + { + "epoch": 0.5537735551752534, + "grad_norm": 0.0673828125, + "learning_rate": 0.001603736779266607, + "loss": 1.1603, + "step": 6315 + }, + { + "epoch": 0.5538612469496279, + "grad_norm": 0.0732421875, + "learning_rate": 0.0016033293355479294, + "loss": 1.1814, + "step": 6316 + }, + { + "epoch": 0.5539489387240025, + "grad_norm": 0.048828125, + "learning_rate": 0.0016029218960854957, + "loss": 1.1171, + "step": 6317 + }, + { + "epoch": 0.554036630498377, + "grad_norm": 0.080078125, + "learning_rate": 0.001602514460916464, + "loss": 1.116, + "step": 6318 + }, + { + "epoch": 0.5541243222727515, + "grad_norm": 0.05322265625, + "learning_rate": 0.0016021070300779906, + "loss": 1.1405, + "step": 6319 + }, + { + "epoch": 0.5542120140471261, + "grad_norm": 0.07666015625, + "learning_rate": 0.0016016996036072326, + "loss": 1.1298, + "step": 6320 + }, + { + "epoch": 0.5542997058215007, + "grad_norm": 0.058349609375, + "learning_rate": 0.0016012921815413458, + "loss": 1.1875, + "step": 6321 + }, + { + "epoch": 0.5543873975958752, + "grad_norm": 0.05029296875, + "learning_rate": 0.0016008847639174867, + "loss": 1.1621, + "step": 6322 + }, + { + "epoch": 0.5544750893702497, + "grad_norm": 0.052978515625, + "learning_rate": 0.0016004773507728098, + "loss": 1.1173, + "step": 6323 + }, + { + "epoch": 0.5545627811446243, + "grad_norm": 0.053955078125, + "learning_rate": 0.0016000699421444711, + "loss": 1.1682, + "step": 6324 + }, + { + "epoch": 0.5546504729189988, + "grad_norm": 0.056396484375, + "learning_rate": 0.0015996625380696246, + "loss": 1.1739, + "step": 6325 + }, + { + "epoch": 0.5547381646933733, + "grad_norm": 0.06640625, + "learning_rate": 0.0015992551385854246, + "loss": 1.1476, + "step": 6326 + }, + { + "epoch": 0.554825856467748, + "grad_norm": 0.1015625, + "learning_rate": 0.001598847743729025, + "loss": 1.1328, + "step": 6327 + }, + { + "epoch": 0.5549135482421225, + "grad_norm": 0.0810546875, + "learning_rate": 0.0015984403535375792, + "loss": 1.2484, + "step": 6328 + }, + { + "epoch": 0.555001240016497, + "grad_norm": 0.051513671875, + "learning_rate": 0.0015980329680482406, + "loss": 1.1521, + "step": 6329 + }, + { + "epoch": 0.5550889317908716, + "grad_norm": 0.08642578125, + "learning_rate": 0.001597625587298161, + "loss": 1.1866, + "step": 6330 + }, + { + "epoch": 0.5551766235652461, + "grad_norm": 0.0576171875, + "learning_rate": 0.001597218211324492, + "loss": 1.1049, + "step": 6331 + }, + { + "epoch": 0.5552643153396206, + "grad_norm": 0.0625, + "learning_rate": 0.0015968108401643861, + "loss": 1.1274, + "step": 6332 + }, + { + "epoch": 0.5553520071139952, + "grad_norm": 0.05224609375, + "learning_rate": 0.001596403473854994, + "loss": 1.1283, + "step": 6333 + }, + { + "epoch": 0.5554396988883697, + "grad_norm": 0.052978515625, + "learning_rate": 0.0015959961124334674, + "loss": 1.1537, + "step": 6334 + }, + { + "epoch": 0.5555273906627443, + "grad_norm": 0.051513671875, + "learning_rate": 0.0015955887559369556, + "loss": 1.0928, + "step": 6335 + }, + { + "epoch": 0.5556150824371189, + "grad_norm": 0.0693359375, + "learning_rate": 0.0015951814044026086, + "loss": 1.1594, + "step": 6336 + }, + { + "epoch": 0.5557027742114934, + "grad_norm": 0.0517578125, + "learning_rate": 0.0015947740578675764, + "loss": 1.1512, + "step": 6337 + }, + { + "epoch": 0.5557904659858679, + "grad_norm": 0.072265625, + "learning_rate": 0.0015943667163690074, + "loss": 1.1504, + "step": 6338 + }, + { + "epoch": 0.5558781577602425, + "grad_norm": 0.07373046875, + "learning_rate": 0.0015939593799440502, + "loss": 1.1153, + "step": 6339 + }, + { + "epoch": 0.555965849534617, + "grad_norm": 0.048583984375, + "learning_rate": 0.0015935520486298528, + "loss": 1.1693, + "step": 6340 + }, + { + "epoch": 0.5560535413089915, + "grad_norm": 0.08740234375, + "learning_rate": 0.0015931447224635628, + "loss": 1.2001, + "step": 6341 + }, + { + "epoch": 0.556141233083366, + "grad_norm": 0.10400390625, + "learning_rate": 0.0015927374014823273, + "loss": 1.1423, + "step": 6342 + }, + { + "epoch": 0.5562289248577407, + "grad_norm": 0.051025390625, + "learning_rate": 0.0015923300857232931, + "loss": 1.1328, + "step": 6343 + }, + { + "epoch": 0.5563166166321152, + "grad_norm": 0.07373046875, + "learning_rate": 0.0015919227752236066, + "loss": 1.1577, + "step": 6344 + }, + { + "epoch": 0.5564043084064897, + "grad_norm": 0.0986328125, + "learning_rate": 0.0015915154700204123, + "loss": 1.1518, + "step": 6345 + }, + { + "epoch": 0.5564920001808643, + "grad_norm": 0.046875, + "learning_rate": 0.001591108170150857, + "loss": 1.1548, + "step": 6346 + }, + { + "epoch": 0.5565796919552388, + "grad_norm": 0.0703125, + "learning_rate": 0.0015907008756520843, + "loss": 1.2151, + "step": 6347 + }, + { + "epoch": 0.5566673837296133, + "grad_norm": 0.05859375, + "learning_rate": 0.0015902935865612384, + "loss": 1.1432, + "step": 6348 + }, + { + "epoch": 0.556755075503988, + "grad_norm": 0.049560546875, + "learning_rate": 0.0015898863029154635, + "loss": 1.081, + "step": 6349 + }, + { + "epoch": 0.5568427672783625, + "grad_norm": 0.04931640625, + "learning_rate": 0.0015894790247519021, + "loss": 1.1976, + "step": 6350 + }, + { + "epoch": 0.556930459052737, + "grad_norm": 0.060791015625, + "learning_rate": 0.0015890717521076976, + "loss": 1.2402, + "step": 6351 + }, + { + "epoch": 0.5570181508271116, + "grad_norm": 0.05126953125, + "learning_rate": 0.0015886644850199923, + "loss": 1.1517, + "step": 6352 + }, + { + "epoch": 0.5571058426014861, + "grad_norm": 0.047119140625, + "learning_rate": 0.0015882572235259276, + "loss": 1.1267, + "step": 6353 + }, + { + "epoch": 0.5571935343758606, + "grad_norm": 0.0703125, + "learning_rate": 0.0015878499676626448, + "loss": 1.249, + "step": 6354 + }, + { + "epoch": 0.5572812261502352, + "grad_norm": 0.056396484375, + "learning_rate": 0.0015874427174672843, + "loss": 1.1747, + "step": 6355 + }, + { + "epoch": 0.5573689179246097, + "grad_norm": 0.051513671875, + "learning_rate": 0.0015870354729769867, + "loss": 1.1809, + "step": 6356 + }, + { + "epoch": 0.5574566096989843, + "grad_norm": 0.05029296875, + "learning_rate": 0.001586628234228891, + "loss": 1.1567, + "step": 6357 + }, + { + "epoch": 0.5575443014733589, + "grad_norm": 0.062255859375, + "learning_rate": 0.0015862210012601368, + "loss": 1.1509, + "step": 6358 + }, + { + "epoch": 0.5576319932477334, + "grad_norm": 0.080078125, + "learning_rate": 0.0015858137741078628, + "loss": 1.1619, + "step": 6359 + }, + { + "epoch": 0.5577196850221079, + "grad_norm": 0.050048828125, + "learning_rate": 0.0015854065528092072, + "loss": 1.1199, + "step": 6360 + }, + { + "epoch": 0.5578073767964825, + "grad_norm": 0.0732421875, + "learning_rate": 0.0015849993374013067, + "loss": 1.1476, + "step": 6361 + }, + { + "epoch": 0.557895068570857, + "grad_norm": 0.06396484375, + "learning_rate": 0.001584592127921299, + "loss": 1.181, + "step": 6362 + }, + { + "epoch": 0.5579827603452315, + "grad_norm": 0.047607421875, + "learning_rate": 0.0015841849244063214, + "loss": 1.1602, + "step": 6363 + }, + { + "epoch": 0.5580704521196062, + "grad_norm": 0.07421875, + "learning_rate": 0.0015837777268935076, + "loss": 1.1877, + "step": 6364 + }, + { + "epoch": 0.5581581438939807, + "grad_norm": 0.078125, + "learning_rate": 0.0015833705354199943, + "loss": 1.1736, + "step": 6365 + }, + { + "epoch": 0.5582458356683552, + "grad_norm": 0.0634765625, + "learning_rate": 0.0015829633500229164, + "loss": 1.1355, + "step": 6366 + }, + { + "epoch": 0.5583335274427297, + "grad_norm": 0.04736328125, + "learning_rate": 0.0015825561707394078, + "loss": 1.1769, + "step": 6367 + }, + { + "epoch": 0.5584212192171043, + "grad_norm": 0.050048828125, + "learning_rate": 0.0015821489976066024, + "loss": 1.1726, + "step": 6368 + }, + { + "epoch": 0.5585089109914788, + "grad_norm": 0.048583984375, + "learning_rate": 0.0015817418306616332, + "loss": 1.157, + "step": 6369 + }, + { + "epoch": 0.5585966027658533, + "grad_norm": 0.05322265625, + "learning_rate": 0.0015813346699416328, + "loss": 1.1732, + "step": 6370 + }, + { + "epoch": 0.5586842945402279, + "grad_norm": 0.0556640625, + "learning_rate": 0.0015809275154837332, + "loss": 1.1495, + "step": 6371 + }, + { + "epoch": 0.5587719863146025, + "grad_norm": 0.0498046875, + "learning_rate": 0.001580520367325066, + "loss": 1.2056, + "step": 6372 + }, + { + "epoch": 0.558859678088977, + "grad_norm": 0.05517578125, + "learning_rate": 0.0015801132255027618, + "loss": 1.1562, + "step": 6373 + }, + { + "epoch": 0.5589473698633516, + "grad_norm": 0.061279296875, + "learning_rate": 0.001579706090053951, + "loss": 1.1355, + "step": 6374 + }, + { + "epoch": 0.5590350616377261, + "grad_norm": 0.061279296875, + "learning_rate": 0.0015792989610157637, + "loss": 1.1778, + "step": 6375 + }, + { + "epoch": 0.5591227534121006, + "grad_norm": 0.052978515625, + "learning_rate": 0.0015788918384253283, + "loss": 1.1674, + "step": 6376 + }, + { + "epoch": 0.5592104451864752, + "grad_norm": 0.048095703125, + "learning_rate": 0.0015784847223197734, + "loss": 1.216, + "step": 6377 + }, + { + "epoch": 0.5592981369608497, + "grad_norm": 0.05126953125, + "learning_rate": 0.001578077612736228, + "loss": 1.1611, + "step": 6378 + }, + { + "epoch": 0.5593858287352242, + "grad_norm": 0.0634765625, + "learning_rate": 0.001577670509711818, + "loss": 1.1882, + "step": 6379 + }, + { + "epoch": 0.5594735205095989, + "grad_norm": 0.060302734375, + "learning_rate": 0.0015772634132836714, + "loss": 1.1964, + "step": 6380 + }, + { + "epoch": 0.5595612122839734, + "grad_norm": 0.0634765625, + "learning_rate": 0.0015768563234889134, + "loss": 1.1675, + "step": 6381 + }, + { + "epoch": 0.5596489040583479, + "grad_norm": 0.060791015625, + "learning_rate": 0.0015764492403646697, + "loss": 1.184, + "step": 6382 + }, + { + "epoch": 0.5597365958327225, + "grad_norm": 0.053466796875, + "learning_rate": 0.0015760421639480656, + "loss": 1.1824, + "step": 6383 + }, + { + "epoch": 0.559824287607097, + "grad_norm": 0.056640625, + "learning_rate": 0.001575635094276225, + "loss": 1.1667, + "step": 6384 + }, + { + "epoch": 0.5599119793814715, + "grad_norm": 0.046875, + "learning_rate": 0.0015752280313862722, + "loss": 1.1928, + "step": 6385 + }, + { + "epoch": 0.5599996711558461, + "grad_norm": 0.08349609375, + "learning_rate": 0.0015748209753153294, + "loss": 1.1528, + "step": 6386 + }, + { + "epoch": 0.5600873629302207, + "grad_norm": 0.06982421875, + "learning_rate": 0.0015744139261005203, + "loss": 1.1643, + "step": 6387 + }, + { + "epoch": 0.5601750547045952, + "grad_norm": 0.049072265625, + "learning_rate": 0.0015740068837789653, + "loss": 1.2557, + "step": 6388 + }, + { + "epoch": 0.5602627464789698, + "grad_norm": 0.06689453125, + "learning_rate": 0.0015735998483877869, + "loss": 1.1257, + "step": 6389 + }, + { + "epoch": 0.5603504382533443, + "grad_norm": 0.0888671875, + "learning_rate": 0.0015731928199641044, + "loss": 1.115, + "step": 6390 + }, + { + "epoch": 0.5604381300277188, + "grad_norm": 0.06591796875, + "learning_rate": 0.0015727857985450393, + "loss": 1.1729, + "step": 6391 + }, + { + "epoch": 0.5605258218020933, + "grad_norm": 0.05859375, + "learning_rate": 0.0015723787841677094, + "loss": 1.1319, + "step": 6392 + }, + { + "epoch": 0.5606135135764679, + "grad_norm": 0.0986328125, + "learning_rate": 0.0015719717768692342, + "loss": 1.1822, + "step": 6393 + }, + { + "epoch": 0.5607012053508424, + "grad_norm": 0.061767578125, + "learning_rate": 0.0015715647766867318, + "loss": 1.1574, + "step": 6394 + }, + { + "epoch": 0.560788897125217, + "grad_norm": 0.0576171875, + "learning_rate": 0.0015711577836573188, + "loss": 1.1939, + "step": 6395 + }, + { + "epoch": 0.5608765888995916, + "grad_norm": 0.07080078125, + "learning_rate": 0.0015707507978181129, + "loss": 1.1728, + "step": 6396 + }, + { + "epoch": 0.5609642806739661, + "grad_norm": 0.0634765625, + "learning_rate": 0.0015703438192062294, + "loss": 1.1533, + "step": 6397 + }, + { + "epoch": 0.5610519724483406, + "grad_norm": 0.0634765625, + "learning_rate": 0.0015699368478587841, + "loss": 1.1258, + "step": 6398 + }, + { + "epoch": 0.5611396642227152, + "grad_norm": 0.06689453125, + "learning_rate": 0.0015695298838128916, + "loss": 1.1756, + "step": 6399 + }, + { + "epoch": 0.5612273559970897, + "grad_norm": 0.111328125, + "learning_rate": 0.0015691229271056657, + "loss": 1.131, + "step": 6400 + }, + { + "epoch": 0.5613150477714642, + "grad_norm": 0.058349609375, + "learning_rate": 0.0015687159777742201, + "loss": 1.1651, + "step": 6401 + }, + { + "epoch": 0.5614027395458389, + "grad_norm": 0.07568359375, + "learning_rate": 0.0015683090358556675, + "loss": 1.1808, + "step": 6402 + }, + { + "epoch": 0.5614904313202134, + "grad_norm": 0.083984375, + "learning_rate": 0.0015679021013871202, + "loss": 1.2043, + "step": 6403 + }, + { + "epoch": 0.5615781230945879, + "grad_norm": 0.058837890625, + "learning_rate": 0.001567495174405689, + "loss": 1.1812, + "step": 6404 + }, + { + "epoch": 0.5616658148689625, + "grad_norm": 0.064453125, + "learning_rate": 0.001567088254948485, + "loss": 1.2238, + "step": 6405 + }, + { + "epoch": 0.561753506643337, + "grad_norm": 0.080078125, + "learning_rate": 0.001566681343052618, + "loss": 1.1851, + "step": 6406 + }, + { + "epoch": 0.5618411984177115, + "grad_norm": 0.0791015625, + "learning_rate": 0.0015662744387551976, + "loss": 1.1656, + "step": 6407 + }, + { + "epoch": 0.5619288901920861, + "grad_norm": 0.052734375, + "learning_rate": 0.0015658675420933321, + "loss": 1.1281, + "step": 6408 + }, + { + "epoch": 0.5620165819664606, + "grad_norm": 0.08837890625, + "learning_rate": 0.00156546065310413, + "loss": 1.1559, + "step": 6409 + }, + { + "epoch": 0.5621042737408352, + "grad_norm": 0.06884765625, + "learning_rate": 0.0015650537718246977, + "loss": 1.1692, + "step": 6410 + }, + { + "epoch": 0.5621919655152098, + "grad_norm": 0.05029296875, + "learning_rate": 0.0015646468982921423, + "loss": 1.1127, + "step": 6411 + }, + { + "epoch": 0.5622796572895843, + "grad_norm": 0.049560546875, + "learning_rate": 0.0015642400325435693, + "loss": 1.1707, + "step": 6412 + }, + { + "epoch": 0.5623673490639588, + "grad_norm": 0.0888671875, + "learning_rate": 0.001563833174616084, + "loss": 1.1612, + "step": 6413 + }, + { + "epoch": 0.5624550408383333, + "grad_norm": 0.1005859375, + "learning_rate": 0.001563426324546791, + "loss": 1.2445, + "step": 6414 + }, + { + "epoch": 0.5625427326127079, + "grad_norm": 0.05029296875, + "learning_rate": 0.0015630194823727935, + "loss": 1.2119, + "step": 6415 + }, + { + "epoch": 0.5626304243870824, + "grad_norm": 0.07763671875, + "learning_rate": 0.0015626126481311949, + "loss": 1.1181, + "step": 6416 + }, + { + "epoch": 0.562718116161457, + "grad_norm": 0.11279296875, + "learning_rate": 0.0015622058218590967, + "loss": 1.1875, + "step": 6417 + }, + { + "epoch": 0.5628058079358316, + "grad_norm": 0.07177734375, + "learning_rate": 0.0015617990035936011, + "loss": 1.1615, + "step": 6418 + }, + { + "epoch": 0.5628934997102061, + "grad_norm": 0.103515625, + "learning_rate": 0.0015613921933718088, + "loss": 1.2038, + "step": 6419 + }, + { + "epoch": 0.5629811914845806, + "grad_norm": 0.08056640625, + "learning_rate": 0.0015609853912308196, + "loss": 1.1852, + "step": 6420 + }, + { + "epoch": 0.5630688832589552, + "grad_norm": 0.06884765625, + "learning_rate": 0.0015605785972077329, + "loss": 1.1208, + "step": 6421 + }, + { + "epoch": 0.5631565750333297, + "grad_norm": 0.060791015625, + "learning_rate": 0.0015601718113396475, + "loss": 1.184, + "step": 6422 + }, + { + "epoch": 0.5632442668077042, + "grad_norm": 0.046875, + "learning_rate": 0.0015597650336636607, + "loss": 1.1531, + "step": 6423 + }, + { + "epoch": 0.5633319585820789, + "grad_norm": 0.07861328125, + "learning_rate": 0.00155935826421687, + "loss": 1.116, + "step": 6424 + }, + { + "epoch": 0.5634196503564534, + "grad_norm": 0.07275390625, + "learning_rate": 0.0015589515030363715, + "loss": 1.2072, + "step": 6425 + }, + { + "epoch": 0.5635073421308279, + "grad_norm": 0.057373046875, + "learning_rate": 0.0015585447501592613, + "loss": 1.0931, + "step": 6426 + }, + { + "epoch": 0.5635950339052025, + "grad_norm": 0.09814453125, + "learning_rate": 0.0015581380056226334, + "loss": 1.1671, + "step": 6427 + }, + { + "epoch": 0.563682725679577, + "grad_norm": 0.04638671875, + "learning_rate": 0.0015577312694635824, + "loss": 1.0912, + "step": 6428 + }, + { + "epoch": 0.5637704174539515, + "grad_norm": 0.04541015625, + "learning_rate": 0.0015573245417192015, + "loss": 1.1193, + "step": 6429 + }, + { + "epoch": 0.5638581092283261, + "grad_norm": 0.050537109375, + "learning_rate": 0.001556917822426583, + "loss": 1.1215, + "step": 6430 + }, + { + "epoch": 0.5639458010027006, + "grad_norm": 0.058349609375, + "learning_rate": 0.0015565111116228197, + "loss": 1.1901, + "step": 6431 + }, + { + "epoch": 0.5640334927770752, + "grad_norm": 0.048583984375, + "learning_rate": 0.0015561044093450006, + "loss": 1.1357, + "step": 6432 + }, + { + "epoch": 0.5641211845514498, + "grad_norm": 0.052001953125, + "learning_rate": 0.0015556977156302172, + "loss": 1.2096, + "step": 6433 + }, + { + "epoch": 0.5642088763258243, + "grad_norm": 0.04736328125, + "learning_rate": 0.0015552910305155587, + "loss": 1.1243, + "step": 6434 + }, + { + "epoch": 0.5642965681001988, + "grad_norm": 0.055908203125, + "learning_rate": 0.0015548843540381136, + "loss": 1.1539, + "step": 6435 + }, + { + "epoch": 0.5643842598745734, + "grad_norm": 0.05078125, + "learning_rate": 0.0015544776862349704, + "loss": 1.1485, + "step": 6436 + }, + { + "epoch": 0.5644719516489479, + "grad_norm": 0.04833984375, + "learning_rate": 0.001554071027143215, + "loss": 1.1624, + "step": 6437 + }, + { + "epoch": 0.5645596434233224, + "grad_norm": 0.058837890625, + "learning_rate": 0.0015536643767999347, + "loss": 1.1725, + "step": 6438 + }, + { + "epoch": 0.5646473351976969, + "grad_norm": 0.060791015625, + "learning_rate": 0.0015532577352422143, + "loss": 1.1978, + "step": 6439 + }, + { + "epoch": 0.5647350269720716, + "grad_norm": 0.07568359375, + "learning_rate": 0.0015528511025071387, + "loss": 1.1565, + "step": 6440 + }, + { + "epoch": 0.5648227187464461, + "grad_norm": 0.050048828125, + "learning_rate": 0.001552444478631792, + "loss": 1.1567, + "step": 6441 + }, + { + "epoch": 0.5649104105208206, + "grad_norm": 0.052978515625, + "learning_rate": 0.001552037863653257, + "loss": 1.1374, + "step": 6442 + }, + { + "epoch": 0.5649981022951952, + "grad_norm": 0.047119140625, + "learning_rate": 0.0015516312576086158, + "loss": 1.1368, + "step": 6443 + }, + { + "epoch": 0.5650857940695697, + "grad_norm": 0.04931640625, + "learning_rate": 0.0015512246605349502, + "loss": 1.1687, + "step": 6444 + }, + { + "epoch": 0.5651734858439442, + "grad_norm": 0.048828125, + "learning_rate": 0.0015508180724693398, + "loss": 1.1501, + "step": 6445 + }, + { + "epoch": 0.5652611776183188, + "grad_norm": 0.0830078125, + "learning_rate": 0.0015504114934488662, + "loss": 1.1564, + "step": 6446 + }, + { + "epoch": 0.5653488693926934, + "grad_norm": 0.0703125, + "learning_rate": 0.0015500049235106068, + "loss": 1.1258, + "step": 6447 + }, + { + "epoch": 0.5654365611670679, + "grad_norm": 0.044677734375, + "learning_rate": 0.0015495983626916403, + "loss": 1.1579, + "step": 6448 + }, + { + "epoch": 0.5655242529414425, + "grad_norm": 0.08447265625, + "learning_rate": 0.0015491918110290444, + "loss": 1.1412, + "step": 6449 + }, + { + "epoch": 0.565611944715817, + "grad_norm": 0.059326171875, + "learning_rate": 0.0015487852685598942, + "loss": 1.1863, + "step": 6450 + }, + { + "epoch": 0.5656996364901915, + "grad_norm": 0.046875, + "learning_rate": 0.0015483787353212668, + "loss": 1.1376, + "step": 6451 + }, + { + "epoch": 0.5657873282645661, + "grad_norm": 0.047607421875, + "learning_rate": 0.001547972211350236, + "loss": 1.1793, + "step": 6452 + }, + { + "epoch": 0.5658750200389406, + "grad_norm": 0.0654296875, + "learning_rate": 0.001547565696683877, + "loss": 1.1658, + "step": 6453 + }, + { + "epoch": 0.5659627118133151, + "grad_norm": 0.0859375, + "learning_rate": 0.001547159191359261, + "loss": 1.1813, + "step": 6454 + }, + { + "epoch": 0.5660504035876898, + "grad_norm": 0.0556640625, + "learning_rate": 0.0015467526954134616, + "loss": 1.1809, + "step": 6455 + }, + { + "epoch": 0.5661380953620643, + "grad_norm": 0.07275390625, + "learning_rate": 0.0015463462088835499, + "loss": 1.1622, + "step": 6456 + }, + { + "epoch": 0.5662257871364388, + "grad_norm": 0.1044921875, + "learning_rate": 0.0015459397318065965, + "loss": 1.1251, + "step": 6457 + }, + { + "epoch": 0.5663134789108134, + "grad_norm": 0.050048828125, + "learning_rate": 0.0015455332642196709, + "loss": 1.14, + "step": 6458 + }, + { + "epoch": 0.5664011706851879, + "grad_norm": 0.0615234375, + "learning_rate": 0.0015451268061598415, + "loss": 1.1875, + "step": 6459 + }, + { + "epoch": 0.5664888624595624, + "grad_norm": 0.09814453125, + "learning_rate": 0.0015447203576641772, + "loss": 1.1589, + "step": 6460 + }, + { + "epoch": 0.5665765542339369, + "grad_norm": 0.064453125, + "learning_rate": 0.001544313918769744, + "loss": 1.1756, + "step": 6461 + }, + { + "epoch": 0.5666642460083116, + "grad_norm": 0.1318359375, + "learning_rate": 0.001543907489513609, + "loss": 1.1708, + "step": 6462 + }, + { + "epoch": 0.5667519377826861, + "grad_norm": 0.050537109375, + "learning_rate": 0.001543501069932837, + "loss": 1.1668, + "step": 6463 + }, + { + "epoch": 0.5668396295570606, + "grad_norm": 0.05126953125, + "learning_rate": 0.0015430946600644925, + "loss": 1.1647, + "step": 6464 + }, + { + "epoch": 0.5669273213314352, + "grad_norm": 0.078125, + "learning_rate": 0.0015426882599456394, + "loss": 1.239, + "step": 6465 + }, + { + "epoch": 0.5670150131058097, + "grad_norm": 0.053955078125, + "learning_rate": 0.0015422818696133394, + "loss": 1.1028, + "step": 6466 + }, + { + "epoch": 0.5671027048801842, + "grad_norm": 0.07373046875, + "learning_rate": 0.001541875489104655, + "loss": 1.1704, + "step": 6467 + }, + { + "epoch": 0.5671903966545588, + "grad_norm": 0.11962890625, + "learning_rate": 0.001541469118456647, + "loss": 1.2021, + "step": 6468 + }, + { + "epoch": 0.5672780884289333, + "grad_norm": 0.0654296875, + "learning_rate": 0.0015410627577063752, + "loss": 1.1609, + "step": 6469 + }, + { + "epoch": 0.5673657802033079, + "grad_norm": 0.08154296875, + "learning_rate": 0.001540656406890898, + "loss": 1.1175, + "step": 6470 + }, + { + "epoch": 0.5674534719776825, + "grad_norm": 0.09228515625, + "learning_rate": 0.0015402500660472752, + "loss": 1.1487, + "step": 6471 + }, + { + "epoch": 0.567541163752057, + "grad_norm": 0.057861328125, + "learning_rate": 0.0015398437352125624, + "loss": 1.2183, + "step": 6472 + }, + { + "epoch": 0.5676288555264315, + "grad_norm": 0.07666015625, + "learning_rate": 0.0015394374144238167, + "loss": 1.0994, + "step": 6473 + }, + { + "epoch": 0.5677165473008061, + "grad_norm": 0.08251953125, + "learning_rate": 0.0015390311037180935, + "loss": 1.1235, + "step": 6474 + }, + { + "epoch": 0.5678042390751806, + "grad_norm": 0.0595703125, + "learning_rate": 0.0015386248031324474, + "loss": 1.1824, + "step": 6475 + }, + { + "epoch": 0.5678919308495551, + "grad_norm": 0.06640625, + "learning_rate": 0.0015382185127039312, + "loss": 1.1131, + "step": 6476 + }, + { + "epoch": 0.5679796226239298, + "grad_norm": 0.09765625, + "learning_rate": 0.0015378122324695989, + "loss": 1.1558, + "step": 6477 + }, + { + "epoch": 0.5680673143983043, + "grad_norm": 0.045166015625, + "learning_rate": 0.0015374059624665005, + "loss": 1.1311, + "step": 6478 + }, + { + "epoch": 0.5681550061726788, + "grad_norm": 0.0498046875, + "learning_rate": 0.0015369997027316884, + "loss": 1.1621, + "step": 6479 + }, + { + "epoch": 0.5682426979470534, + "grad_norm": 0.050537109375, + "learning_rate": 0.001536593453302211, + "loss": 1.1048, + "step": 6480 + }, + { + "epoch": 0.5683303897214279, + "grad_norm": 0.056640625, + "learning_rate": 0.0015361872142151186, + "loss": 1.1657, + "step": 6481 + }, + { + "epoch": 0.5684180814958024, + "grad_norm": 0.045654296875, + "learning_rate": 0.0015357809855074582, + "loss": 1.1411, + "step": 6482 + }, + { + "epoch": 0.568505773270177, + "grad_norm": 0.0712890625, + "learning_rate": 0.001535374767216277, + "loss": 1.1566, + "step": 6483 + }, + { + "epoch": 0.5685934650445515, + "grad_norm": 0.04638671875, + "learning_rate": 0.0015349685593786214, + "loss": 1.0925, + "step": 6484 + }, + { + "epoch": 0.5686811568189261, + "grad_norm": 0.049560546875, + "learning_rate": 0.0015345623620315356, + "loss": 1.1369, + "step": 6485 + }, + { + "epoch": 0.5687688485933006, + "grad_norm": 0.06591796875, + "learning_rate": 0.001534156175212065, + "loss": 1.2089, + "step": 6486 + }, + { + "epoch": 0.5688565403676752, + "grad_norm": 0.052734375, + "learning_rate": 0.0015337499989572518, + "loss": 1.1559, + "step": 6487 + }, + { + "epoch": 0.5689442321420497, + "grad_norm": 0.06103515625, + "learning_rate": 0.0015333438333041383, + "loss": 1.1969, + "step": 6488 + }, + { + "epoch": 0.5690319239164242, + "grad_norm": 0.052490234375, + "learning_rate": 0.0015329376782897666, + "loss": 1.1695, + "step": 6489 + }, + { + "epoch": 0.5691196156907988, + "grad_norm": 0.0458984375, + "learning_rate": 0.001532531533951176, + "loss": 1.1342, + "step": 6490 + }, + { + "epoch": 0.5692073074651733, + "grad_norm": 0.051025390625, + "learning_rate": 0.0015321254003254068, + "loss": 1.1141, + "step": 6491 + }, + { + "epoch": 0.5692949992395478, + "grad_norm": 0.052734375, + "learning_rate": 0.0015317192774494963, + "loss": 1.1399, + "step": 6492 + }, + { + "epoch": 0.5693826910139225, + "grad_norm": 0.04736328125, + "learning_rate": 0.0015313131653604828, + "loss": 1.1925, + "step": 6493 + }, + { + "epoch": 0.569470382788297, + "grad_norm": 0.048583984375, + "learning_rate": 0.001530907064095402, + "loss": 1.1537, + "step": 6494 + }, + { + "epoch": 0.5695580745626715, + "grad_norm": 0.051025390625, + "learning_rate": 0.0015305009736912897, + "loss": 1.1201, + "step": 6495 + }, + { + "epoch": 0.5696457663370461, + "grad_norm": 0.0546875, + "learning_rate": 0.0015300948941851797, + "loss": 1.1923, + "step": 6496 + }, + { + "epoch": 0.5697334581114206, + "grad_norm": 0.04638671875, + "learning_rate": 0.0015296888256141061, + "loss": 1.1451, + "step": 6497 + }, + { + "epoch": 0.5698211498857951, + "grad_norm": 0.07373046875, + "learning_rate": 0.001529282768015101, + "loss": 1.1178, + "step": 6498 + }, + { + "epoch": 0.5699088416601698, + "grad_norm": 0.043701171875, + "learning_rate": 0.0015288767214251965, + "loss": 1.0961, + "step": 6499 + }, + { + "epoch": 0.5699965334345443, + "grad_norm": 0.054443359375, + "learning_rate": 0.0015284706858814214, + "loss": 1.1519, + "step": 6500 + }, + { + "epoch": 0.5699965334345443, + "eval_loss": 1.1673738956451416, + "eval_runtime": 428.6209, + "eval_samples_per_second": 33.706, + "eval_steps_per_second": 8.427, + "step": 6500 + }, + { + "epoch": 0.5700842252089188, + "grad_norm": 0.046630859375, + "learning_rate": 0.0015280646614208061, + "loss": 1.1473, + "step": 6501 + }, + { + "epoch": 0.5701719169832934, + "grad_norm": 0.07666015625, + "learning_rate": 0.0015276586480803794, + "loss": 1.1324, + "step": 6502 + }, + { + "epoch": 0.5702596087576679, + "grad_norm": 0.0654296875, + "learning_rate": 0.0015272526458971678, + "loss": 1.1759, + "step": 6503 + }, + { + "epoch": 0.5703473005320424, + "grad_norm": 0.06884765625, + "learning_rate": 0.001526846654908198, + "loss": 1.1068, + "step": 6504 + }, + { + "epoch": 0.570434992306417, + "grad_norm": 0.0517578125, + "learning_rate": 0.0015264406751504952, + "loss": 1.1239, + "step": 6505 + }, + { + "epoch": 0.5705226840807915, + "grad_norm": 0.0888671875, + "learning_rate": 0.0015260347066610841, + "loss": 1.1505, + "step": 6506 + }, + { + "epoch": 0.570610375855166, + "grad_norm": 0.080078125, + "learning_rate": 0.0015256287494769879, + "loss": 1.1984, + "step": 6507 + }, + { + "epoch": 0.5706980676295407, + "grad_norm": 0.057373046875, + "learning_rate": 0.0015252228036352288, + "loss": 1.19, + "step": 6508 + }, + { + "epoch": 0.5707857594039152, + "grad_norm": 0.06689453125, + "learning_rate": 0.0015248168691728273, + "loss": 1.0757, + "step": 6509 + }, + { + "epoch": 0.5708734511782897, + "grad_norm": 0.10400390625, + "learning_rate": 0.0015244109461268046, + "loss": 1.1678, + "step": 6510 + }, + { + "epoch": 0.5709611429526642, + "grad_norm": 0.048828125, + "learning_rate": 0.0015240050345341793, + "loss": 1.2016, + "step": 6511 + }, + { + "epoch": 0.5710488347270388, + "grad_norm": 0.1240234375, + "learning_rate": 0.001523599134431969, + "loss": 1.1747, + "step": 6512 + }, + { + "epoch": 0.5711365265014133, + "grad_norm": 0.10546875, + "learning_rate": 0.0015231932458571924, + "loss": 1.12, + "step": 6513 + }, + { + "epoch": 0.5712242182757878, + "grad_norm": 0.052001953125, + "learning_rate": 0.0015227873688468637, + "loss": 1.0671, + "step": 6514 + }, + { + "epoch": 0.5713119100501625, + "grad_norm": 0.091796875, + "learning_rate": 0.0015223815034379991, + "loss": 1.1723, + "step": 6515 + }, + { + "epoch": 0.571399601824537, + "grad_norm": 0.10888671875, + "learning_rate": 0.001521975649667612, + "loss": 1.1501, + "step": 6516 + }, + { + "epoch": 0.5714872935989115, + "grad_norm": 0.0478515625, + "learning_rate": 0.0015215698075727146, + "loss": 1.1721, + "step": 6517 + }, + { + "epoch": 0.5715749853732861, + "grad_norm": 0.056640625, + "learning_rate": 0.0015211639771903196, + "loss": 1.2306, + "step": 6518 + }, + { + "epoch": 0.5716626771476606, + "grad_norm": 0.0673828125, + "learning_rate": 0.001520758158557437, + "loss": 1.1278, + "step": 6519 + }, + { + "epoch": 0.5717503689220351, + "grad_norm": 0.05419921875, + "learning_rate": 0.001520352351711077, + "loss": 1.1728, + "step": 6520 + }, + { + "epoch": 0.5718380606964097, + "grad_norm": 0.047119140625, + "learning_rate": 0.0015199465566882475, + "loss": 1.1506, + "step": 6521 + }, + { + "epoch": 0.5719257524707843, + "grad_norm": 0.0849609375, + "learning_rate": 0.0015195407735259564, + "loss": 1.1374, + "step": 6522 + }, + { + "epoch": 0.5720134442451588, + "grad_norm": 0.062255859375, + "learning_rate": 0.0015191350022612101, + "loss": 1.1839, + "step": 6523 + }, + { + "epoch": 0.5721011360195334, + "grad_norm": 0.0615234375, + "learning_rate": 0.0015187292429310138, + "loss": 1.1396, + "step": 6524 + }, + { + "epoch": 0.5721888277939079, + "grad_norm": 0.052001953125, + "learning_rate": 0.0015183234955723714, + "loss": 1.1299, + "step": 6525 + }, + { + "epoch": 0.5722765195682824, + "grad_norm": 0.056396484375, + "learning_rate": 0.0015179177602222865, + "loss": 1.1961, + "step": 6526 + }, + { + "epoch": 0.572364211342657, + "grad_norm": 0.049072265625, + "learning_rate": 0.0015175120369177607, + "loss": 1.1657, + "step": 6527 + }, + { + "epoch": 0.5724519031170315, + "grad_norm": 0.047119140625, + "learning_rate": 0.0015171063256957957, + "loss": 1.1423, + "step": 6528 + }, + { + "epoch": 0.572539594891406, + "grad_norm": 0.07666015625, + "learning_rate": 0.0015167006265933898, + "loss": 1.2199, + "step": 6529 + }, + { + "epoch": 0.5726272866657807, + "grad_norm": 0.0576171875, + "learning_rate": 0.0015162949396475432, + "loss": 1.1024, + "step": 6530 + }, + { + "epoch": 0.5727149784401552, + "grad_norm": 0.07763671875, + "learning_rate": 0.0015158892648952527, + "loss": 1.0885, + "step": 6531 + }, + { + "epoch": 0.5728026702145297, + "grad_norm": 0.07470703125, + "learning_rate": 0.0015154836023735156, + "loss": 1.2168, + "step": 6532 + }, + { + "epoch": 0.5728903619889042, + "grad_norm": 0.05078125, + "learning_rate": 0.0015150779521193266, + "loss": 1.1093, + "step": 6533 + }, + { + "epoch": 0.5729780537632788, + "grad_norm": 0.07373046875, + "learning_rate": 0.0015146723141696797, + "loss": 1.2044, + "step": 6534 + }, + { + "epoch": 0.5730657455376533, + "grad_norm": 0.0673828125, + "learning_rate": 0.0015142666885615689, + "loss": 1.211, + "step": 6535 + }, + { + "epoch": 0.5731534373120278, + "grad_norm": 0.052734375, + "learning_rate": 0.001513861075331985, + "loss": 1.1308, + "step": 6536 + }, + { + "epoch": 0.5732411290864025, + "grad_norm": 0.095703125, + "learning_rate": 0.0015134554745179199, + "loss": 1.1379, + "step": 6537 + }, + { + "epoch": 0.573328820860777, + "grad_norm": 0.047119140625, + "learning_rate": 0.0015130498861563632, + "loss": 1.1147, + "step": 6538 + }, + { + "epoch": 0.5734165126351515, + "grad_norm": 0.05908203125, + "learning_rate": 0.001512644310284303, + "loss": 1.2587, + "step": 6539 + }, + { + "epoch": 0.5735042044095261, + "grad_norm": 0.08447265625, + "learning_rate": 0.0015122387469387272, + "loss": 1.1573, + "step": 6540 + }, + { + "epoch": 0.5735918961839006, + "grad_norm": 0.056396484375, + "learning_rate": 0.001511833196156622, + "loss": 1.1539, + "step": 6541 + }, + { + "epoch": 0.5736795879582751, + "grad_norm": 0.07080078125, + "learning_rate": 0.001511427657974973, + "loss": 1.1387, + "step": 6542 + }, + { + "epoch": 0.5737672797326497, + "grad_norm": 0.05517578125, + "learning_rate": 0.0015110221324307635, + "loss": 1.1737, + "step": 6543 + }, + { + "epoch": 0.5738549715070242, + "grad_norm": 0.054443359375, + "learning_rate": 0.001510616619560977, + "loss": 1.1281, + "step": 6544 + }, + { + "epoch": 0.5739426632813988, + "grad_norm": 0.051025390625, + "learning_rate": 0.0015102111194025948, + "loss": 1.153, + "step": 6545 + }, + { + "epoch": 0.5740303550557734, + "grad_norm": 0.0673828125, + "learning_rate": 0.001509805631992598, + "loss": 1.1813, + "step": 6546 + }, + { + "epoch": 0.5741180468301479, + "grad_norm": 0.046875, + "learning_rate": 0.0015094001573679654, + "loss": 1.1682, + "step": 6547 + }, + { + "epoch": 0.5742057386045224, + "grad_norm": 0.054931640625, + "learning_rate": 0.0015089946955656752, + "loss": 1.1304, + "step": 6548 + }, + { + "epoch": 0.574293430378897, + "grad_norm": 0.048095703125, + "learning_rate": 0.0015085892466227058, + "loss": 1.1013, + "step": 6549 + }, + { + "epoch": 0.5743811221532715, + "grad_norm": 0.078125, + "learning_rate": 0.001508183810576031, + "loss": 1.1896, + "step": 6550 + }, + { + "epoch": 0.574468813927646, + "grad_norm": 0.05029296875, + "learning_rate": 0.001507778387462627, + "loss": 1.1672, + "step": 6551 + }, + { + "epoch": 0.5745565057020207, + "grad_norm": 0.06494140625, + "learning_rate": 0.0015073729773194666, + "loss": 1.1243, + "step": 6552 + }, + { + "epoch": 0.5746441974763952, + "grad_norm": 0.05419921875, + "learning_rate": 0.0015069675801835229, + "loss": 1.1437, + "step": 6553 + }, + { + "epoch": 0.5747318892507697, + "grad_norm": 0.048828125, + "learning_rate": 0.0015065621960917664, + "loss": 1.1048, + "step": 6554 + }, + { + "epoch": 0.5748195810251443, + "grad_norm": 0.0751953125, + "learning_rate": 0.0015061568250811675, + "loss": 1.1311, + "step": 6555 + }, + { + "epoch": 0.5749072727995188, + "grad_norm": 0.05322265625, + "learning_rate": 0.0015057514671886947, + "loss": 1.1287, + "step": 6556 + }, + { + "epoch": 0.5749949645738933, + "grad_norm": 0.051025390625, + "learning_rate": 0.0015053461224513157, + "loss": 1.2245, + "step": 6557 + }, + { + "epoch": 0.5750826563482678, + "grad_norm": 0.06640625, + "learning_rate": 0.0015049407909059967, + "loss": 1.1767, + "step": 6558 + }, + { + "epoch": 0.5751703481226424, + "grad_norm": 0.045654296875, + "learning_rate": 0.0015045354725897035, + "loss": 1.1353, + "step": 6559 + }, + { + "epoch": 0.575258039897017, + "grad_norm": 0.04931640625, + "learning_rate": 0.001504130167539399, + "loss": 1.1364, + "step": 6560 + }, + { + "epoch": 0.5753457316713915, + "grad_norm": 0.05078125, + "learning_rate": 0.001503724875792048, + "loss": 1.0904, + "step": 6561 + }, + { + "epoch": 0.5754334234457661, + "grad_norm": 0.044677734375, + "learning_rate": 0.0015033195973846094, + "loss": 1.113, + "step": 6562 + }, + { + "epoch": 0.5755211152201406, + "grad_norm": 0.07275390625, + "learning_rate": 0.0015029143323540453, + "loss": 1.151, + "step": 6563 + }, + { + "epoch": 0.5756088069945151, + "grad_norm": 0.05908203125, + "learning_rate": 0.0015025090807373146, + "loss": 1.1726, + "step": 6564 + }, + { + "epoch": 0.5756964987688897, + "grad_norm": 0.04638671875, + "learning_rate": 0.0015021038425713745, + "loss": 1.1951, + "step": 6565 + }, + { + "epoch": 0.5757841905432642, + "grad_norm": 0.05712890625, + "learning_rate": 0.0015016986178931835, + "loss": 1.1474, + "step": 6566 + }, + { + "epoch": 0.5758718823176388, + "grad_norm": 0.06005859375, + "learning_rate": 0.0015012934067396946, + "loss": 1.1454, + "step": 6567 + }, + { + "epoch": 0.5759595740920134, + "grad_norm": 0.10546875, + "learning_rate": 0.0015008882091478626, + "loss": 1.1748, + "step": 6568 + }, + { + "epoch": 0.5760472658663879, + "grad_norm": 0.0830078125, + "learning_rate": 0.001500483025154642, + "loss": 1.1602, + "step": 6569 + }, + { + "epoch": 0.5761349576407624, + "grad_norm": 0.06494140625, + "learning_rate": 0.001500077854796983, + "loss": 1.18, + "step": 6570 + }, + { + "epoch": 0.576222649415137, + "grad_norm": 0.10400390625, + "learning_rate": 0.0014996726981118369, + "loss": 1.1792, + "step": 6571 + }, + { + "epoch": 0.5763103411895115, + "grad_norm": 0.05029296875, + "learning_rate": 0.0014992675551361522, + "loss": 1.1792, + "step": 6572 + }, + { + "epoch": 0.576398032963886, + "grad_norm": 0.0859375, + "learning_rate": 0.001498862425906878, + "loss": 1.1694, + "step": 6573 + }, + { + "epoch": 0.5764857247382607, + "grad_norm": 0.08447265625, + "learning_rate": 0.00149845731046096, + "loss": 1.1887, + "step": 6574 + }, + { + "epoch": 0.5765734165126352, + "grad_norm": 0.049560546875, + "learning_rate": 0.0014980522088353441, + "loss": 1.1877, + "step": 6575 + }, + { + "epoch": 0.5766611082870097, + "grad_norm": 0.0849609375, + "learning_rate": 0.0014976471210669745, + "loss": 1.2154, + "step": 6576 + }, + { + "epoch": 0.5767488000613843, + "grad_norm": 0.052490234375, + "learning_rate": 0.0014972420471927946, + "loss": 1.1618, + "step": 6577 + }, + { + "epoch": 0.5768364918357588, + "grad_norm": 0.052734375, + "learning_rate": 0.0014968369872497451, + "loss": 1.122, + "step": 6578 + }, + { + "epoch": 0.5769241836101333, + "grad_norm": 0.0439453125, + "learning_rate": 0.0014964319412747672, + "loss": 1.0998, + "step": 6579 + }, + { + "epoch": 0.5770118753845079, + "grad_norm": 0.07373046875, + "learning_rate": 0.0014960269093047996, + "loss": 1.1228, + "step": 6580 + }, + { + "epoch": 0.5770995671588824, + "grad_norm": 0.08203125, + "learning_rate": 0.001495621891376781, + "loss": 1.1265, + "step": 6581 + }, + { + "epoch": 0.577187258933257, + "grad_norm": 0.046875, + "learning_rate": 0.0014952168875276466, + "loss": 1.1559, + "step": 6582 + }, + { + "epoch": 0.5772749507076315, + "grad_norm": 0.103515625, + "learning_rate": 0.0014948118977943334, + "loss": 1.1404, + "step": 6583 + }, + { + "epoch": 0.5773626424820061, + "grad_norm": 0.07275390625, + "learning_rate": 0.0014944069222137739, + "loss": 1.1058, + "step": 6584 + }, + { + "epoch": 0.5774503342563806, + "grad_norm": 0.045654296875, + "learning_rate": 0.0014940019608229015, + "loss": 1.097, + "step": 6585 + }, + { + "epoch": 0.5775380260307551, + "grad_norm": 0.05615234375, + "learning_rate": 0.0014935970136586475, + "loss": 1.1689, + "step": 6586 + }, + { + "epoch": 0.5776257178051297, + "grad_norm": 0.072265625, + "learning_rate": 0.001493192080757942, + "loss": 1.1258, + "step": 6587 + }, + { + "epoch": 0.5777134095795042, + "grad_norm": 0.064453125, + "learning_rate": 0.0014927871621577143, + "loss": 1.1758, + "step": 6588 + }, + { + "epoch": 0.5778011013538787, + "grad_norm": 0.050048828125, + "learning_rate": 0.0014923822578948912, + "loss": 1.1779, + "step": 6589 + }, + { + "epoch": 0.5778887931282534, + "grad_norm": 0.068359375, + "learning_rate": 0.0014919773680063997, + "loss": 1.1491, + "step": 6590 + }, + { + "epoch": 0.5779764849026279, + "grad_norm": 0.05615234375, + "learning_rate": 0.0014915724925291638, + "loss": 1.1562, + "step": 6591 + }, + { + "epoch": 0.5780641766770024, + "grad_norm": 0.048828125, + "learning_rate": 0.001491167631500108, + "loss": 1.2082, + "step": 6592 + }, + { + "epoch": 0.578151868451377, + "grad_norm": 0.050048828125, + "learning_rate": 0.0014907627849561538, + "loss": 1.159, + "step": 6593 + }, + { + "epoch": 0.5782395602257515, + "grad_norm": 0.056884765625, + "learning_rate": 0.001490357952934223, + "loss": 1.1082, + "step": 6594 + }, + { + "epoch": 0.578327252000126, + "grad_norm": 0.045654296875, + "learning_rate": 0.0014899531354712345, + "loss": 1.0696, + "step": 6595 + }, + { + "epoch": 0.5784149437745006, + "grad_norm": 0.049560546875, + "learning_rate": 0.0014895483326041068, + "loss": 1.14, + "step": 6596 + }, + { + "epoch": 0.5785026355488752, + "grad_norm": 0.055908203125, + "learning_rate": 0.0014891435443697572, + "loss": 1.1619, + "step": 6597 + }, + { + "epoch": 0.5785903273232497, + "grad_norm": 0.07470703125, + "learning_rate": 0.001488738770805101, + "loss": 1.217, + "step": 6598 + }, + { + "epoch": 0.5786780190976243, + "grad_norm": 0.047119140625, + "learning_rate": 0.0014883340119470525, + "loss": 1.1591, + "step": 6599 + }, + { + "epoch": 0.5787657108719988, + "grad_norm": 0.048828125, + "learning_rate": 0.0014879292678325255, + "loss": 1.1941, + "step": 6600 + }, + { + "epoch": 0.5788534026463733, + "grad_norm": 0.06689453125, + "learning_rate": 0.0014875245384984304, + "loss": 1.1776, + "step": 6601 + }, + { + "epoch": 0.5789410944207479, + "grad_norm": 0.0615234375, + "learning_rate": 0.0014871198239816777, + "loss": 1.1477, + "step": 6602 + }, + { + "epoch": 0.5790287861951224, + "grad_norm": 0.056884765625, + "learning_rate": 0.0014867151243191765, + "loss": 1.1426, + "step": 6603 + }, + { + "epoch": 0.579116477969497, + "grad_norm": 0.0751953125, + "learning_rate": 0.0014863104395478348, + "loss": 1.1707, + "step": 6604 + }, + { + "epoch": 0.5792041697438715, + "grad_norm": 0.0927734375, + "learning_rate": 0.0014859057697045584, + "loss": 1.1674, + "step": 6605 + }, + { + "epoch": 0.5792918615182461, + "grad_norm": 0.051513671875, + "learning_rate": 0.001485501114826252, + "loss": 1.1671, + "step": 6606 + }, + { + "epoch": 0.5793795532926206, + "grad_norm": 0.0927734375, + "learning_rate": 0.0014850964749498193, + "loss": 1.1706, + "step": 6607 + }, + { + "epoch": 0.5794672450669951, + "grad_norm": 0.08837890625, + "learning_rate": 0.0014846918501121627, + "loss": 1.1243, + "step": 6608 + }, + { + "epoch": 0.5795549368413697, + "grad_norm": 0.053466796875, + "learning_rate": 0.0014842872403501824, + "loss": 1.1424, + "step": 6609 + }, + { + "epoch": 0.5796426286157442, + "grad_norm": 0.09326171875, + "learning_rate": 0.0014838826457007782, + "loss": 1.1872, + "step": 6610 + }, + { + "epoch": 0.5797303203901187, + "grad_norm": 0.05322265625, + "learning_rate": 0.0014834780662008472, + "loss": 1.1556, + "step": 6611 + }, + { + "epoch": 0.5798180121644934, + "grad_norm": 0.06005859375, + "learning_rate": 0.0014830735018872875, + "loss": 1.1546, + "step": 6612 + }, + { + "epoch": 0.5799057039388679, + "grad_norm": 0.0458984375, + "learning_rate": 0.0014826689527969931, + "loss": 1.1947, + "step": 6613 + }, + { + "epoch": 0.5799933957132424, + "grad_norm": 0.07177734375, + "learning_rate": 0.0014822644189668584, + "loss": 1.1886, + "step": 6614 + }, + { + "epoch": 0.580081087487617, + "grad_norm": 0.046142578125, + "learning_rate": 0.0014818599004337756, + "loss": 1.1989, + "step": 6615 + }, + { + "epoch": 0.5801687792619915, + "grad_norm": 0.0478515625, + "learning_rate": 0.001481455397234636, + "loss": 1.1278, + "step": 6616 + }, + { + "epoch": 0.580256471036366, + "grad_norm": 0.050048828125, + "learning_rate": 0.0014810509094063291, + "loss": 1.1574, + "step": 6617 + }, + { + "epoch": 0.5803441628107406, + "grad_norm": 0.06103515625, + "learning_rate": 0.001480646436985743, + "loss": 1.184, + "step": 6618 + }, + { + "epoch": 0.5804318545851151, + "grad_norm": 0.06298828125, + "learning_rate": 0.0014802419800097647, + "loss": 1.1273, + "step": 6619 + }, + { + "epoch": 0.5805195463594897, + "grad_norm": 0.04931640625, + "learning_rate": 0.0014798375385152792, + "loss": 1.1614, + "step": 6620 + }, + { + "epoch": 0.5806072381338643, + "grad_norm": 0.095703125, + "learning_rate": 0.0014794331125391712, + "loss": 1.1572, + "step": 6621 + }, + { + "epoch": 0.5806949299082388, + "grad_norm": 0.051025390625, + "learning_rate": 0.001479028702118323, + "loss": 1.114, + "step": 6622 + }, + { + "epoch": 0.5807826216826133, + "grad_norm": 0.05810546875, + "learning_rate": 0.0014786243072896158, + "loss": 1.1975, + "step": 6623 + }, + { + "epoch": 0.5808703134569879, + "grad_norm": 0.053955078125, + "learning_rate": 0.0014782199280899296, + "loss": 1.1623, + "step": 6624 + }, + { + "epoch": 0.5809580052313624, + "grad_norm": 0.0498046875, + "learning_rate": 0.001477815564556142, + "loss": 1.1613, + "step": 6625 + }, + { + "epoch": 0.5810456970057369, + "grad_norm": 0.0517578125, + "learning_rate": 0.001477411216725131, + "loss": 1.2044, + "step": 6626 + }, + { + "epoch": 0.5811333887801116, + "grad_norm": 0.07568359375, + "learning_rate": 0.001477006884633771, + "loss": 1.1286, + "step": 6627 + }, + { + "epoch": 0.5812210805544861, + "grad_norm": 0.044921875, + "learning_rate": 0.001476602568318937, + "loss": 1.1279, + "step": 6628 + }, + { + "epoch": 0.5813087723288606, + "grad_norm": 0.0498046875, + "learning_rate": 0.0014761982678175008, + "loss": 1.1675, + "step": 6629 + }, + { + "epoch": 0.5813964641032351, + "grad_norm": 0.041015625, + "learning_rate": 0.0014757939831663341, + "loss": 1.1099, + "step": 6630 + }, + { + "epoch": 0.5814841558776097, + "grad_norm": 0.05029296875, + "learning_rate": 0.001475389714402306, + "loss": 1.1385, + "step": 6631 + }, + { + "epoch": 0.5815718476519842, + "grad_norm": 0.047607421875, + "learning_rate": 0.0014749854615622858, + "loss": 1.1145, + "step": 6632 + }, + { + "epoch": 0.5816595394263587, + "grad_norm": 0.08251953125, + "learning_rate": 0.0014745812246831397, + "loss": 1.171, + "step": 6633 + }, + { + "epoch": 0.5817472312007334, + "grad_norm": 0.04736328125, + "learning_rate": 0.0014741770038017334, + "loss": 1.1571, + "step": 6634 + }, + { + "epoch": 0.5818349229751079, + "grad_norm": 0.06884765625, + "learning_rate": 0.0014737727989549295, + "loss": 1.07, + "step": 6635 + }, + { + "epoch": 0.5819226147494824, + "grad_norm": 0.061279296875, + "learning_rate": 0.001473368610179592, + "loss": 1.1658, + "step": 6636 + }, + { + "epoch": 0.582010306523857, + "grad_norm": 0.072265625, + "learning_rate": 0.001472964437512581, + "loss": 1.1448, + "step": 6637 + }, + { + "epoch": 0.5820979982982315, + "grad_norm": 0.05322265625, + "learning_rate": 0.0014725602809907562, + "loss": 1.1345, + "step": 6638 + }, + { + "epoch": 0.582185690072606, + "grad_norm": 0.076171875, + "learning_rate": 0.0014721561406509762, + "loss": 1.1409, + "step": 6639 + }, + { + "epoch": 0.5822733818469806, + "grad_norm": 0.06689453125, + "learning_rate": 0.0014717520165300967, + "loss": 1.162, + "step": 6640 + }, + { + "epoch": 0.5823610736213551, + "grad_norm": 0.05615234375, + "learning_rate": 0.0014713479086649734, + "loss": 1.1569, + "step": 6641 + }, + { + "epoch": 0.5824487653957297, + "grad_norm": 0.0498046875, + "learning_rate": 0.001470943817092459, + "loss": 1.1567, + "step": 6642 + }, + { + "epoch": 0.5825364571701043, + "grad_norm": 0.05615234375, + "learning_rate": 0.001470539741849407, + "loss": 1.1385, + "step": 6643 + }, + { + "epoch": 0.5826241489444788, + "grad_norm": 0.0537109375, + "learning_rate": 0.0014701356829726667, + "loss": 1.1444, + "step": 6644 + }, + { + "epoch": 0.5827118407188533, + "grad_norm": 0.05712890625, + "learning_rate": 0.0014697316404990882, + "loss": 1.1737, + "step": 6645 + }, + { + "epoch": 0.5827995324932279, + "grad_norm": 0.0625, + "learning_rate": 0.0014693276144655187, + "loss": 1.1545, + "step": 6646 + }, + { + "epoch": 0.5828872242676024, + "grad_norm": 0.048828125, + "learning_rate": 0.001468923604908804, + "loss": 1.1808, + "step": 6647 + }, + { + "epoch": 0.5829749160419769, + "grad_norm": 0.055908203125, + "learning_rate": 0.001468519611865789, + "loss": 1.175, + "step": 6648 + }, + { + "epoch": 0.5830626078163516, + "grad_norm": 0.0615234375, + "learning_rate": 0.0014681156353733176, + "loss": 1.1567, + "step": 6649 + }, + { + "epoch": 0.5831502995907261, + "grad_norm": 0.050537109375, + "learning_rate": 0.0014677116754682302, + "loss": 1.1398, + "step": 6650 + }, + { + "epoch": 0.5832379913651006, + "grad_norm": 0.0693359375, + "learning_rate": 0.001467307732187368, + "loss": 1.1527, + "step": 6651 + }, + { + "epoch": 0.5833256831394751, + "grad_norm": 0.051025390625, + "learning_rate": 0.0014669038055675687, + "loss": 1.1722, + "step": 6652 + }, + { + "epoch": 0.5834133749138497, + "grad_norm": 0.04736328125, + "learning_rate": 0.0014664998956456696, + "loss": 1.1584, + "step": 6653 + }, + { + "epoch": 0.5835010666882242, + "grad_norm": 0.05322265625, + "learning_rate": 0.001466096002458506, + "loss": 1.1718, + "step": 6654 + }, + { + "epoch": 0.5835887584625987, + "grad_norm": 0.05224609375, + "learning_rate": 0.0014656921260429129, + "loss": 1.1168, + "step": 6655 + }, + { + "epoch": 0.5836764502369733, + "grad_norm": 0.1005859375, + "learning_rate": 0.0014652882664357217, + "loss": 1.1906, + "step": 6656 + }, + { + "epoch": 0.5837641420113479, + "grad_norm": 0.05859375, + "learning_rate": 0.0014648844236737637, + "loss": 1.1417, + "step": 6657 + }, + { + "epoch": 0.5838518337857224, + "grad_norm": 0.10498046875, + "learning_rate": 0.0014644805977938682, + "loss": 1.1412, + "step": 6658 + }, + { + "epoch": 0.583939525560097, + "grad_norm": 0.0654296875, + "learning_rate": 0.0014640767888328637, + "loss": 1.173, + "step": 6659 + }, + { + "epoch": 0.5840272173344715, + "grad_norm": 0.053955078125, + "learning_rate": 0.0014636729968275758, + "loss": 1.1438, + "step": 6660 + }, + { + "epoch": 0.584114909108846, + "grad_norm": 0.0751953125, + "learning_rate": 0.0014632692218148299, + "loss": 1.1821, + "step": 6661 + }, + { + "epoch": 0.5842026008832206, + "grad_norm": 0.047607421875, + "learning_rate": 0.0014628654638314486, + "loss": 1.1465, + "step": 6662 + }, + { + "epoch": 0.5842902926575951, + "grad_norm": 0.049560546875, + "learning_rate": 0.0014624617229142543, + "loss": 1.1731, + "step": 6663 + }, + { + "epoch": 0.5843779844319696, + "grad_norm": 0.049560546875, + "learning_rate": 0.0014620579991000665, + "loss": 1.1443, + "step": 6664 + }, + { + "epoch": 0.5844656762063443, + "grad_norm": 0.05419921875, + "learning_rate": 0.001461654292425704, + "loss": 1.1478, + "step": 6665 + }, + { + "epoch": 0.5845533679807188, + "grad_norm": 0.052978515625, + "learning_rate": 0.0014612506029279839, + "loss": 1.1897, + "step": 6666 + }, + { + "epoch": 0.5846410597550933, + "grad_norm": 0.052490234375, + "learning_rate": 0.0014608469306437216, + "loss": 1.2188, + "step": 6667 + }, + { + "epoch": 0.5847287515294679, + "grad_norm": 0.04638671875, + "learning_rate": 0.001460443275609731, + "loss": 1.0837, + "step": 6668 + }, + { + "epoch": 0.5848164433038424, + "grad_norm": 0.052490234375, + "learning_rate": 0.0014600396378628242, + "loss": 1.1272, + "step": 6669 + }, + { + "epoch": 0.5849041350782169, + "grad_norm": 0.05078125, + "learning_rate": 0.001459636017439812, + "loss": 1.1098, + "step": 6670 + }, + { + "epoch": 0.5849918268525915, + "grad_norm": 0.05078125, + "learning_rate": 0.0014592324143775036, + "loss": 1.1893, + "step": 6671 + }, + { + "epoch": 0.585079518626966, + "grad_norm": 0.046630859375, + "learning_rate": 0.0014588288287127067, + "loss": 1.1451, + "step": 6672 + }, + { + "epoch": 0.5851672104013406, + "grad_norm": 0.06787109375, + "learning_rate": 0.0014584252604822267, + "loss": 1.1472, + "step": 6673 + }, + { + "epoch": 0.5852549021757152, + "grad_norm": 0.0595703125, + "learning_rate": 0.0014580217097228687, + "loss": 1.1446, + "step": 6674 + }, + { + "epoch": 0.5853425939500897, + "grad_norm": 0.05615234375, + "learning_rate": 0.001457618176471435, + "loss": 1.1293, + "step": 6675 + }, + { + "epoch": 0.5854302857244642, + "grad_norm": 0.08935546875, + "learning_rate": 0.0014572146607647269, + "loss": 1.1971, + "step": 6676 + }, + { + "epoch": 0.5855179774988387, + "grad_norm": 0.076171875, + "learning_rate": 0.0014568111626395448, + "loss": 1.1173, + "step": 6677 + }, + { + "epoch": 0.5856056692732133, + "grad_norm": 0.0654296875, + "learning_rate": 0.0014564076821326852, + "loss": 1.154, + "step": 6678 + }, + { + "epoch": 0.5856933610475878, + "grad_norm": 0.08154296875, + "learning_rate": 0.0014560042192809455, + "loss": 1.0997, + "step": 6679 + }, + { + "epoch": 0.5857810528219624, + "grad_norm": 0.068359375, + "learning_rate": 0.00145560077412112, + "loss": 1.1704, + "step": 6680 + }, + { + "epoch": 0.585868744596337, + "grad_norm": 0.047607421875, + "learning_rate": 0.0014551973466900023, + "loss": 1.1583, + "step": 6681 + }, + { + "epoch": 0.5859564363707115, + "grad_norm": 0.05322265625, + "learning_rate": 0.0014547939370243834, + "loss": 1.152, + "step": 6682 + }, + { + "epoch": 0.586044128145086, + "grad_norm": 0.052978515625, + "learning_rate": 0.001454390545161054, + "loss": 1.1871, + "step": 6683 + }, + { + "epoch": 0.5861318199194606, + "grad_norm": 0.05615234375, + "learning_rate": 0.0014539871711368017, + "loss": 1.158, + "step": 6684 + }, + { + "epoch": 0.5862195116938351, + "grad_norm": 0.049072265625, + "learning_rate": 0.0014535838149884136, + "loss": 1.2047, + "step": 6685 + }, + { + "epoch": 0.5863072034682096, + "grad_norm": 0.05517578125, + "learning_rate": 0.0014531804767526748, + "loss": 1.1308, + "step": 6686 + }, + { + "epoch": 0.5863948952425843, + "grad_norm": 0.057861328125, + "learning_rate": 0.0014527771564663676, + "loss": 1.2001, + "step": 6687 + }, + { + "epoch": 0.5864825870169588, + "grad_norm": 0.045654296875, + "learning_rate": 0.001452373854166275, + "loss": 1.1281, + "step": 6688 + }, + { + "epoch": 0.5865702787913333, + "grad_norm": 0.0546875, + "learning_rate": 0.0014519705698891763, + "loss": 1.1435, + "step": 6689 + }, + { + "epoch": 0.5866579705657079, + "grad_norm": 0.052490234375, + "learning_rate": 0.001451567303671851, + "loss": 1.1643, + "step": 6690 + }, + { + "epoch": 0.5867456623400824, + "grad_norm": 0.052490234375, + "learning_rate": 0.001451164055551075, + "loss": 1.1658, + "step": 6691 + }, + { + "epoch": 0.5868333541144569, + "grad_norm": 0.068359375, + "learning_rate": 0.001450760825563624, + "loss": 1.1337, + "step": 6692 + }, + { + "epoch": 0.5869210458888315, + "grad_norm": 0.052978515625, + "learning_rate": 0.0014503576137462712, + "loss": 1.1698, + "step": 6693 + }, + { + "epoch": 0.587008737663206, + "grad_norm": 0.052978515625, + "learning_rate": 0.0014499544201357888, + "loss": 1.1794, + "step": 6694 + }, + { + "epoch": 0.5870964294375806, + "grad_norm": 0.052490234375, + "learning_rate": 0.0014495512447689467, + "loss": 1.1848, + "step": 6695 + }, + { + "epoch": 0.5871841212119552, + "grad_norm": 0.052978515625, + "learning_rate": 0.001449148087682514, + "loss": 1.1707, + "step": 6696 + }, + { + "epoch": 0.5872718129863297, + "grad_norm": 0.060302734375, + "learning_rate": 0.0014487449489132569, + "loss": 1.1317, + "step": 6697 + }, + { + "epoch": 0.5873595047607042, + "grad_norm": 0.061279296875, + "learning_rate": 0.001448341828497941, + "loss": 1.1461, + "step": 6698 + }, + { + "epoch": 0.5874471965350788, + "grad_norm": 0.054931640625, + "learning_rate": 0.00144793872647333, + "loss": 1.178, + "step": 6699 + }, + { + "epoch": 0.5875348883094533, + "grad_norm": 0.0615234375, + "learning_rate": 0.001447535642876185, + "loss": 1.1699, + "step": 6700 + }, + { + "epoch": 0.5876225800838278, + "grad_norm": 0.053466796875, + "learning_rate": 0.0014471325777432671, + "loss": 1.1414, + "step": 6701 + }, + { + "epoch": 0.5877102718582023, + "grad_norm": 0.0654296875, + "learning_rate": 0.001446729531111335, + "loss": 1.1146, + "step": 6702 + }, + { + "epoch": 0.587797963632577, + "grad_norm": 0.052001953125, + "learning_rate": 0.001446326503017144, + "loss": 1.1659, + "step": 6703 + }, + { + "epoch": 0.5878856554069515, + "grad_norm": 0.051025390625, + "learning_rate": 0.0014459234934974504, + "loss": 1.1631, + "step": 6704 + }, + { + "epoch": 0.587973347181326, + "grad_norm": 0.0546875, + "learning_rate": 0.0014455205025890071, + "loss": 1.1495, + "step": 6705 + }, + { + "epoch": 0.5880610389557006, + "grad_norm": 0.056884765625, + "learning_rate": 0.0014451175303285665, + "loss": 1.1394, + "step": 6706 + }, + { + "epoch": 0.5881487307300751, + "grad_norm": 0.048828125, + "learning_rate": 0.0014447145767528778, + "loss": 1.1444, + "step": 6707 + }, + { + "epoch": 0.5882364225044496, + "grad_norm": 0.052734375, + "learning_rate": 0.0014443116418986902, + "loss": 1.1674, + "step": 6708 + }, + { + "epoch": 0.5883241142788243, + "grad_norm": 0.08056640625, + "learning_rate": 0.00144390872580275, + "loss": 1.2399, + "step": 6709 + }, + { + "epoch": 0.5884118060531988, + "grad_norm": 0.0732421875, + "learning_rate": 0.0014435058285018018, + "loss": 1.1569, + "step": 6710 + }, + { + "epoch": 0.5884994978275733, + "grad_norm": 0.054931640625, + "learning_rate": 0.0014431029500325885, + "loss": 1.13, + "step": 6711 + }, + { + "epoch": 0.5885871896019479, + "grad_norm": 0.1552734375, + "learning_rate": 0.0014427000904318526, + "loss": 1.1484, + "step": 6712 + }, + { + "epoch": 0.5886748813763224, + "grad_norm": 0.07421875, + "learning_rate": 0.0014422972497363333, + "loss": 1.12, + "step": 6713 + }, + { + "epoch": 0.5887625731506969, + "grad_norm": 0.061767578125, + "learning_rate": 0.0014418944279827686, + "loss": 1.1149, + "step": 6714 + }, + { + "epoch": 0.5888502649250715, + "grad_norm": 0.107421875, + "learning_rate": 0.0014414916252078944, + "loss": 1.0776, + "step": 6715 + }, + { + "epoch": 0.588937956699446, + "grad_norm": 0.08349609375, + "learning_rate": 0.0014410888414484464, + "loss": 1.1272, + "step": 6716 + }, + { + "epoch": 0.5890256484738206, + "grad_norm": 0.056884765625, + "learning_rate": 0.001440686076741156, + "loss": 1.2084, + "step": 6717 + }, + { + "epoch": 0.5891133402481952, + "grad_norm": 0.052734375, + "learning_rate": 0.0014402833311227558, + "loss": 1.1663, + "step": 6718 + }, + { + "epoch": 0.5892010320225697, + "grad_norm": 0.0771484375, + "learning_rate": 0.0014398806046299747, + "loss": 1.1657, + "step": 6719 + }, + { + "epoch": 0.5892887237969442, + "grad_norm": 0.053955078125, + "learning_rate": 0.0014394778972995391, + "loss": 1.1435, + "step": 6720 + }, + { + "epoch": 0.5893764155713188, + "grad_norm": 0.04736328125, + "learning_rate": 0.0014390752091681764, + "loss": 1.1213, + "step": 6721 + }, + { + "epoch": 0.5894641073456933, + "grad_norm": 0.0478515625, + "learning_rate": 0.0014386725402726094, + "loss": 1.0895, + "step": 6722 + }, + { + "epoch": 0.5895517991200678, + "grad_norm": 0.05712890625, + "learning_rate": 0.001438269890649562, + "loss": 1.1917, + "step": 6723 + }, + { + "epoch": 0.5896394908944423, + "grad_norm": 0.0546875, + "learning_rate": 0.0014378672603357536, + "loss": 1.1347, + "step": 6724 + }, + { + "epoch": 0.589727182668817, + "grad_norm": 0.0517578125, + "learning_rate": 0.0014374646493679037, + "loss": 1.1518, + "step": 6725 + }, + { + "epoch": 0.5898148744431915, + "grad_norm": 0.06787109375, + "learning_rate": 0.0014370620577827285, + "loss": 1.1036, + "step": 6726 + }, + { + "epoch": 0.589902566217566, + "grad_norm": 0.049560546875, + "learning_rate": 0.0014366594856169445, + "loss": 1.1732, + "step": 6727 + }, + { + "epoch": 0.5899902579919406, + "grad_norm": 0.04736328125, + "learning_rate": 0.0014362569329072646, + "loss": 1.1447, + "step": 6728 + }, + { + "epoch": 0.5900779497663151, + "grad_norm": 0.05810546875, + "learning_rate": 0.0014358543996904004, + "loss": 1.136, + "step": 6729 + }, + { + "epoch": 0.5901656415406896, + "grad_norm": 0.0478515625, + "learning_rate": 0.0014354518860030623, + "loss": 1.1331, + "step": 6730 + }, + { + "epoch": 0.5902533333150642, + "grad_norm": 0.045654296875, + "learning_rate": 0.0014350493918819582, + "loss": 1.139, + "step": 6731 + }, + { + "epoch": 0.5903410250894388, + "grad_norm": 0.06396484375, + "learning_rate": 0.0014346469173637949, + "loss": 1.1862, + "step": 6732 + }, + { + "epoch": 0.5904287168638133, + "grad_norm": 0.06396484375, + "learning_rate": 0.0014342444624852767, + "loss": 1.1603, + "step": 6733 + }, + { + "epoch": 0.5905164086381879, + "grad_norm": 0.047119140625, + "learning_rate": 0.0014338420272831066, + "loss": 1.1724, + "step": 6734 + }, + { + "epoch": 0.5906041004125624, + "grad_norm": 0.051025390625, + "learning_rate": 0.0014334396117939856, + "loss": 1.1519, + "step": 6735 + }, + { + "epoch": 0.5906917921869369, + "grad_norm": 0.062255859375, + "learning_rate": 0.0014330372160546135, + "loss": 1.1185, + "step": 6736 + }, + { + "epoch": 0.5907794839613115, + "grad_norm": 0.059814453125, + "learning_rate": 0.0014326348401016867, + "loss": 1.1543, + "step": 6737 + }, + { + "epoch": 0.590867175735686, + "grad_norm": 0.04443359375, + "learning_rate": 0.0014322324839719008, + "loss": 1.1257, + "step": 6738 + }, + { + "epoch": 0.5909548675100605, + "grad_norm": 0.06201171875, + "learning_rate": 0.0014318301477019512, + "loss": 1.1664, + "step": 6739 + }, + { + "epoch": 0.5910425592844352, + "grad_norm": 0.046142578125, + "learning_rate": 0.001431427831328528, + "loss": 1.1064, + "step": 6740 + }, + { + "epoch": 0.5911302510588097, + "grad_norm": 0.05078125, + "learning_rate": 0.0014310255348883231, + "loss": 1.1521, + "step": 6741 + }, + { + "epoch": 0.5912179428331842, + "grad_norm": 0.04833984375, + "learning_rate": 0.0014306232584180235, + "loss": 1.1836, + "step": 6742 + }, + { + "epoch": 0.5913056346075588, + "grad_norm": 0.046142578125, + "learning_rate": 0.001430221001954317, + "loss": 1.161, + "step": 6743 + }, + { + "epoch": 0.5913933263819333, + "grad_norm": 0.045654296875, + "learning_rate": 0.0014298187655338872, + "loss": 1.1726, + "step": 6744 + }, + { + "epoch": 0.5914810181563078, + "grad_norm": 0.049072265625, + "learning_rate": 0.0014294165491934181, + "loss": 1.1718, + "step": 6745 + }, + { + "epoch": 0.5915687099306824, + "grad_norm": 0.0625, + "learning_rate": 0.0014290143529695901, + "loss": 1.1438, + "step": 6746 + }, + { + "epoch": 0.591656401705057, + "grad_norm": 0.046630859375, + "learning_rate": 0.0014286121768990827, + "loss": 1.1601, + "step": 6747 + }, + { + "epoch": 0.5917440934794315, + "grad_norm": 0.0654296875, + "learning_rate": 0.0014282100210185728, + "loss": 1.1424, + "step": 6748 + }, + { + "epoch": 0.591831785253806, + "grad_norm": 0.060791015625, + "learning_rate": 0.0014278078853647372, + "loss": 1.2146, + "step": 6749 + }, + { + "epoch": 0.5919194770281806, + "grad_norm": 0.0791015625, + "learning_rate": 0.001427405769974248, + "loss": 1.1657, + "step": 6750 + }, + { + "epoch": 0.5920071688025551, + "grad_norm": 0.047607421875, + "learning_rate": 0.0014270036748837787, + "loss": 1.1562, + "step": 6751 + }, + { + "epoch": 0.5920948605769296, + "grad_norm": 0.051513671875, + "learning_rate": 0.001426601600129998, + "loss": 1.1418, + "step": 6752 + }, + { + "epoch": 0.5921825523513042, + "grad_norm": 0.051025390625, + "learning_rate": 0.0014261995457495755, + "loss": 1.1452, + "step": 6753 + }, + { + "epoch": 0.5922702441256787, + "grad_norm": 0.059326171875, + "learning_rate": 0.0014257975117791762, + "loss": 1.1553, + "step": 6754 + }, + { + "epoch": 0.5923579359000533, + "grad_norm": 0.05029296875, + "learning_rate": 0.0014253954982554652, + "loss": 1.2102, + "step": 6755 + }, + { + "epoch": 0.5924456276744279, + "grad_norm": 0.04638671875, + "learning_rate": 0.0014249935052151044, + "loss": 1.1401, + "step": 6756 + }, + { + "epoch": 0.5925333194488024, + "grad_norm": 0.04833984375, + "learning_rate": 0.0014245915326947553, + "loss": 1.1454, + "step": 6757 + }, + { + "epoch": 0.5926210112231769, + "grad_norm": 0.051025390625, + "learning_rate": 0.0014241895807310767, + "loss": 1.1654, + "step": 6758 + }, + { + "epoch": 0.5927087029975515, + "grad_norm": 0.051025390625, + "learning_rate": 0.0014237876493607255, + "loss": 1.149, + "step": 6759 + }, + { + "epoch": 0.592796394771926, + "grad_norm": 0.05419921875, + "learning_rate": 0.0014233857386203562, + "loss": 1.1226, + "step": 6760 + }, + { + "epoch": 0.5928840865463005, + "grad_norm": 0.050537109375, + "learning_rate": 0.001422983848546623, + "loss": 1.1733, + "step": 6761 + }, + { + "epoch": 0.5929717783206752, + "grad_norm": 0.07080078125, + "learning_rate": 0.0014225819791761768, + "loss": 1.2006, + "step": 6762 + }, + { + "epoch": 0.5930594700950497, + "grad_norm": 0.06640625, + "learning_rate": 0.001422180130545667, + "loss": 1.1561, + "step": 6763 + }, + { + "epoch": 0.5931471618694242, + "grad_norm": 0.049072265625, + "learning_rate": 0.0014217783026917409, + "loss": 1.1179, + "step": 6764 + }, + { + "epoch": 0.5932348536437988, + "grad_norm": 0.06640625, + "learning_rate": 0.001421376495651045, + "loss": 1.1723, + "step": 6765 + }, + { + "epoch": 0.5933225454181733, + "grad_norm": 0.053955078125, + "learning_rate": 0.0014209747094602217, + "loss": 1.1122, + "step": 6766 + }, + { + "epoch": 0.5934102371925478, + "grad_norm": 0.0615234375, + "learning_rate": 0.0014205729441559144, + "loss": 1.1787, + "step": 6767 + }, + { + "epoch": 0.5934979289669224, + "grad_norm": 0.08203125, + "learning_rate": 0.0014201711997747619, + "loss": 1.1629, + "step": 6768 + }, + { + "epoch": 0.593585620741297, + "grad_norm": 0.051513671875, + "learning_rate": 0.0014197694763534029, + "loss": 1.1946, + "step": 6769 + }, + { + "epoch": 0.5936733125156715, + "grad_norm": 0.050048828125, + "learning_rate": 0.0014193677739284735, + "loss": 1.1731, + "step": 6770 + }, + { + "epoch": 0.5937610042900461, + "grad_norm": 0.04833984375, + "learning_rate": 0.0014189660925366075, + "loss": 1.1258, + "step": 6771 + }, + { + "epoch": 0.5938486960644206, + "grad_norm": 0.058349609375, + "learning_rate": 0.0014185644322144376, + "loss": 1.1935, + "step": 6772 + }, + { + "epoch": 0.5939363878387951, + "grad_norm": 0.052490234375, + "learning_rate": 0.0014181627929985936, + "loss": 1.1101, + "step": 6773 + }, + { + "epoch": 0.5940240796131696, + "grad_norm": 0.052490234375, + "learning_rate": 0.0014177611749257048, + "loss": 1.1173, + "step": 6774 + }, + { + "epoch": 0.5941117713875442, + "grad_norm": 0.0625, + "learning_rate": 0.001417359578032397, + "loss": 1.172, + "step": 6775 + }, + { + "epoch": 0.5941994631619187, + "grad_norm": 0.048583984375, + "learning_rate": 0.0014169580023552955, + "loss": 1.1546, + "step": 6776 + }, + { + "epoch": 0.5942871549362932, + "grad_norm": 0.0576171875, + "learning_rate": 0.0014165564479310224, + "loss": 1.1183, + "step": 6777 + }, + { + "epoch": 0.5943748467106679, + "grad_norm": 0.06494140625, + "learning_rate": 0.0014161549147961986, + "loss": 1.1704, + "step": 6778 + }, + { + "epoch": 0.5944625384850424, + "grad_norm": 0.0673828125, + "learning_rate": 0.0014157534029874427, + "loss": 1.1868, + "step": 6779 + }, + { + "epoch": 0.5945502302594169, + "grad_norm": 0.06884765625, + "learning_rate": 0.0014153519125413723, + "loss": 1.2295, + "step": 6780 + }, + { + "epoch": 0.5946379220337915, + "grad_norm": 0.08447265625, + "learning_rate": 0.0014149504434946012, + "loss": 1.1211, + "step": 6781 + }, + { + "epoch": 0.594725613808166, + "grad_norm": 0.047607421875, + "learning_rate": 0.001414548995883743, + "loss": 1.1301, + "step": 6782 + }, + { + "epoch": 0.5948133055825405, + "grad_norm": 0.05859375, + "learning_rate": 0.0014141475697454086, + "loss": 1.1345, + "step": 6783 + }, + { + "epoch": 0.5949009973569152, + "grad_norm": 0.05517578125, + "learning_rate": 0.001413746165116207, + "loss": 1.1623, + "step": 6784 + }, + { + "epoch": 0.5949886891312897, + "grad_norm": 0.060791015625, + "learning_rate": 0.0014133447820327454, + "loss": 1.2222, + "step": 6785 + }, + { + "epoch": 0.5950763809056642, + "grad_norm": 0.05419921875, + "learning_rate": 0.0014129434205316285, + "loss": 1.1341, + "step": 6786 + }, + { + "epoch": 0.5951640726800388, + "grad_norm": 0.0556640625, + "learning_rate": 0.0014125420806494603, + "loss": 1.1263, + "step": 6787 + }, + { + "epoch": 0.5952517644544133, + "grad_norm": 0.044677734375, + "learning_rate": 0.0014121407624228409, + "loss": 1.0746, + "step": 6788 + }, + { + "epoch": 0.5953394562287878, + "grad_norm": 0.05419921875, + "learning_rate": 0.0014117394658883697, + "loss": 1.1617, + "step": 6789 + }, + { + "epoch": 0.5954271480031624, + "grad_norm": 0.0869140625, + "learning_rate": 0.0014113381910826445, + "loss": 1.194, + "step": 6790 + }, + { + "epoch": 0.5955148397775369, + "grad_norm": 0.050537109375, + "learning_rate": 0.0014109369380422602, + "loss": 1.1704, + "step": 6791 + }, + { + "epoch": 0.5956025315519115, + "grad_norm": 0.048095703125, + "learning_rate": 0.0014105357068038096, + "loss": 1.1137, + "step": 6792 + }, + { + "epoch": 0.5956902233262861, + "grad_norm": 0.0576171875, + "learning_rate": 0.0014101344974038847, + "loss": 1.1629, + "step": 6793 + }, + { + "epoch": 0.5957779151006606, + "grad_norm": 0.0869140625, + "learning_rate": 0.0014097333098790743, + "loss": 1.1639, + "step": 6794 + }, + { + "epoch": 0.5958656068750351, + "grad_norm": 0.047119140625, + "learning_rate": 0.0014093321442659656, + "loss": 1.1031, + "step": 6795 + }, + { + "epoch": 0.5959532986494096, + "grad_norm": 0.09619140625, + "learning_rate": 0.0014089310006011446, + "loss": 1.1601, + "step": 6796 + }, + { + "epoch": 0.5960409904237842, + "grad_norm": 0.0625, + "learning_rate": 0.0014085298789211936, + "loss": 1.1727, + "step": 6797 + }, + { + "epoch": 0.5961286821981587, + "grad_norm": 0.054931640625, + "learning_rate": 0.0014081287792626945, + "loss": 1.1469, + "step": 6798 + }, + { + "epoch": 0.5962163739725332, + "grad_norm": 0.0576171875, + "learning_rate": 0.001407727701662226, + "loss": 1.214, + "step": 6799 + }, + { + "epoch": 0.5963040657469079, + "grad_norm": 0.04736328125, + "learning_rate": 0.0014073266461563662, + "loss": 1.1608, + "step": 6800 + }, + { + "epoch": 0.5963917575212824, + "grad_norm": 0.04541015625, + "learning_rate": 0.0014069256127816896, + "loss": 1.152, + "step": 6801 + }, + { + "epoch": 0.5964794492956569, + "grad_norm": 0.05029296875, + "learning_rate": 0.0014065246015747697, + "loss": 1.1424, + "step": 6802 + }, + { + "epoch": 0.5965671410700315, + "grad_norm": 0.06005859375, + "learning_rate": 0.0014061236125721777, + "loss": 1.1651, + "step": 6803 + }, + { + "epoch": 0.596654832844406, + "grad_norm": 0.05029296875, + "learning_rate": 0.0014057226458104822, + "loss": 1.0881, + "step": 6804 + }, + { + "epoch": 0.5967425246187805, + "grad_norm": 0.08251953125, + "learning_rate": 0.0014053217013262514, + "loss": 1.1691, + "step": 6805 + }, + { + "epoch": 0.5968302163931551, + "grad_norm": 0.06396484375, + "learning_rate": 0.0014049207791560494, + "loss": 1.1236, + "step": 6806 + }, + { + "epoch": 0.5969179081675297, + "grad_norm": 0.05517578125, + "learning_rate": 0.00140451987933644, + "loss": 1.1734, + "step": 6807 + }, + { + "epoch": 0.5970055999419042, + "grad_norm": 0.048583984375, + "learning_rate": 0.0014041190019039832, + "loss": 1.1557, + "step": 6808 + }, + { + "epoch": 0.5970932917162788, + "grad_norm": 0.06982421875, + "learning_rate": 0.0014037181468952394, + "loss": 1.1358, + "step": 6809 + }, + { + "epoch": 0.5971809834906533, + "grad_norm": 0.05419921875, + "learning_rate": 0.0014033173143467644, + "loss": 1.1523, + "step": 6810 + }, + { + "epoch": 0.5972686752650278, + "grad_norm": 0.05419921875, + "learning_rate": 0.0014029165042951134, + "loss": 1.1464, + "step": 6811 + }, + { + "epoch": 0.5973563670394024, + "grad_norm": 0.054443359375, + "learning_rate": 0.0014025157167768397, + "loss": 1.0916, + "step": 6812 + }, + { + "epoch": 0.5974440588137769, + "grad_norm": 0.04443359375, + "learning_rate": 0.0014021149518284935, + "loss": 1.1473, + "step": 6813 + }, + { + "epoch": 0.5975317505881514, + "grad_norm": 0.050048828125, + "learning_rate": 0.001401714209486624, + "loss": 1.1554, + "step": 6814 + }, + { + "epoch": 0.5976194423625261, + "grad_norm": 0.049560546875, + "learning_rate": 0.0014013134897877778, + "loss": 1.1871, + "step": 6815 + }, + { + "epoch": 0.5977071341369006, + "grad_norm": 0.043212890625, + "learning_rate": 0.001400912792768499, + "loss": 1.1417, + "step": 6816 + }, + { + "epoch": 0.5977948259112751, + "grad_norm": 0.0654296875, + "learning_rate": 0.0014005121184653309, + "loss": 1.1749, + "step": 6817 + }, + { + "epoch": 0.5978825176856497, + "grad_norm": 0.0673828125, + "learning_rate": 0.0014001114669148131, + "loss": 1.2356, + "step": 6818 + }, + { + "epoch": 0.5979702094600242, + "grad_norm": 0.07080078125, + "learning_rate": 0.0013997108381534845, + "loss": 1.18, + "step": 6819 + }, + { + "epoch": 0.5980579012343987, + "grad_norm": 0.06787109375, + "learning_rate": 0.001399310232217882, + "loss": 1.18, + "step": 6820 + }, + { + "epoch": 0.5981455930087732, + "grad_norm": 0.05908203125, + "learning_rate": 0.001398909649144539, + "loss": 1.1364, + "step": 6821 + }, + { + "epoch": 0.5982332847831479, + "grad_norm": 0.048095703125, + "learning_rate": 0.0013985090889699872, + "loss": 1.165, + "step": 6822 + }, + { + "epoch": 0.5983209765575224, + "grad_norm": 0.09716796875, + "learning_rate": 0.0013981085517307578, + "loss": 1.1581, + "step": 6823 + }, + { + "epoch": 0.5984086683318969, + "grad_norm": 0.056884765625, + "learning_rate": 0.0013977080374633776, + "loss": 1.1309, + "step": 6824 + }, + { + "epoch": 0.5984963601062715, + "grad_norm": 0.051513671875, + "learning_rate": 0.0013973075462043738, + "loss": 1.2084, + "step": 6825 + }, + { + "epoch": 0.598584051880646, + "grad_norm": 0.051513671875, + "learning_rate": 0.0013969070779902687, + "loss": 1.1102, + "step": 6826 + }, + { + "epoch": 0.5986717436550205, + "grad_norm": 0.048095703125, + "learning_rate": 0.0013965066328575852, + "loss": 1.1294, + "step": 6827 + }, + { + "epoch": 0.5987594354293951, + "grad_norm": 0.053955078125, + "learning_rate": 0.0013961062108428421, + "loss": 1.1641, + "step": 6828 + }, + { + "epoch": 0.5988471272037696, + "grad_norm": 0.04638671875, + "learning_rate": 0.0013957058119825577, + "loss": 1.0509, + "step": 6829 + }, + { + "epoch": 0.5989348189781442, + "grad_norm": 0.049560546875, + "learning_rate": 0.0013953054363132465, + "loss": 1.087, + "step": 6830 + }, + { + "epoch": 0.5990225107525188, + "grad_norm": 0.05126953125, + "learning_rate": 0.001394905083871422, + "loss": 1.1567, + "step": 6831 + }, + { + "epoch": 0.5991102025268933, + "grad_norm": 0.04833984375, + "learning_rate": 0.0013945047546935951, + "loss": 1.1862, + "step": 6832 + }, + { + "epoch": 0.5991978943012678, + "grad_norm": 0.05517578125, + "learning_rate": 0.0013941044488162753, + "loss": 1.1225, + "step": 6833 + }, + { + "epoch": 0.5992855860756424, + "grad_norm": 0.05712890625, + "learning_rate": 0.001393704166275969, + "loss": 1.1721, + "step": 6834 + }, + { + "epoch": 0.5993732778500169, + "grad_norm": 0.07177734375, + "learning_rate": 0.001393303907109181, + "loss": 1.1816, + "step": 6835 + }, + { + "epoch": 0.5994609696243914, + "grad_norm": 0.051513671875, + "learning_rate": 0.0013929036713524147, + "loss": 1.1805, + "step": 6836 + }, + { + "epoch": 0.5995486613987661, + "grad_norm": 0.060302734375, + "learning_rate": 0.0013925034590421695, + "loss": 1.1387, + "step": 6837 + }, + { + "epoch": 0.5996363531731406, + "grad_norm": 0.060302734375, + "learning_rate": 0.0013921032702149444, + "loss": 1.2183, + "step": 6838 + }, + { + "epoch": 0.5997240449475151, + "grad_norm": 0.068359375, + "learning_rate": 0.0013917031049072346, + "loss": 1.1666, + "step": 6839 + }, + { + "epoch": 0.5998117367218897, + "grad_norm": 0.0625, + "learning_rate": 0.0013913029631555352, + "loss": 1.2364, + "step": 6840 + }, + { + "epoch": 0.5998994284962642, + "grad_norm": 0.045166015625, + "learning_rate": 0.001390902844996338, + "loss": 1.1031, + "step": 6841 + }, + { + "epoch": 0.5999871202706387, + "grad_norm": 0.054443359375, + "learning_rate": 0.0013905027504661316, + "loss": 1.1253, + "step": 6842 + }, + { + "epoch": 0.6000748120450133, + "grad_norm": 0.058837890625, + "learning_rate": 0.0013901026796014051, + "loss": 1.1478, + "step": 6843 + }, + { + "epoch": 0.6001625038193878, + "grad_norm": 0.054931640625, + "learning_rate": 0.0013897026324386436, + "loss": 1.1008, + "step": 6844 + }, + { + "epoch": 0.6002501955937624, + "grad_norm": 0.049072265625, + "learning_rate": 0.0013893026090143299, + "loss": 1.2723, + "step": 6845 + }, + { + "epoch": 0.6003378873681369, + "grad_norm": 0.051025390625, + "learning_rate": 0.0013889026093649448, + "loss": 1.1811, + "step": 6846 + }, + { + "epoch": 0.6004255791425115, + "grad_norm": 0.048095703125, + "learning_rate": 0.0013885026335269679, + "loss": 1.1386, + "step": 6847 + }, + { + "epoch": 0.600513270916886, + "grad_norm": 0.06982421875, + "learning_rate": 0.001388102681536876, + "loss": 1.1425, + "step": 6848 + }, + { + "epoch": 0.6006009626912605, + "grad_norm": 0.04541015625, + "learning_rate": 0.0013877027534311434, + "loss": 1.1828, + "step": 6849 + }, + { + "epoch": 0.6006886544656351, + "grad_norm": 0.056396484375, + "learning_rate": 0.0013873028492462423, + "loss": 1.1733, + "step": 6850 + }, + { + "epoch": 0.6007763462400096, + "grad_norm": 0.055419921875, + "learning_rate": 0.0013869029690186435, + "loss": 1.168, + "step": 6851 + }, + { + "epoch": 0.6008640380143841, + "grad_norm": 0.05224609375, + "learning_rate": 0.0013865031127848143, + "loss": 1.0719, + "step": 6852 + }, + { + "epoch": 0.6009517297887588, + "grad_norm": 0.068359375, + "learning_rate": 0.0013861032805812215, + "loss": 1.1597, + "step": 6853 + }, + { + "epoch": 0.6010394215631333, + "grad_norm": 0.05517578125, + "learning_rate": 0.0013857034724443287, + "loss": 1.1289, + "step": 6854 + }, + { + "epoch": 0.6011271133375078, + "grad_norm": 0.0537109375, + "learning_rate": 0.0013853036884105964, + "loss": 1.1274, + "step": 6855 + }, + { + "epoch": 0.6012148051118824, + "grad_norm": 0.06884765625, + "learning_rate": 0.0013849039285164844, + "loss": 1.1377, + "step": 6856 + }, + { + "epoch": 0.6013024968862569, + "grad_norm": 0.05224609375, + "learning_rate": 0.0013845041927984495, + "loss": 1.2187, + "step": 6857 + }, + { + "epoch": 0.6013901886606314, + "grad_norm": 0.05859375, + "learning_rate": 0.0013841044812929473, + "loss": 1.1356, + "step": 6858 + }, + { + "epoch": 0.601477880435006, + "grad_norm": 0.04736328125, + "learning_rate": 0.0013837047940364296, + "loss": 1.148, + "step": 6859 + }, + { + "epoch": 0.6015655722093806, + "grad_norm": 0.059326171875, + "learning_rate": 0.0013833051310653474, + "loss": 1.1463, + "step": 6860 + }, + { + "epoch": 0.6016532639837551, + "grad_norm": 0.078125, + "learning_rate": 0.0013829054924161485, + "loss": 1.1551, + "step": 6861 + }, + { + "epoch": 0.6017409557581297, + "grad_norm": 0.04638671875, + "learning_rate": 0.0013825058781252798, + "loss": 1.1378, + "step": 6862 + }, + { + "epoch": 0.6018286475325042, + "grad_norm": 0.056396484375, + "learning_rate": 0.0013821062882291841, + "loss": 1.1071, + "step": 6863 + }, + { + "epoch": 0.6019163393068787, + "grad_norm": 0.053955078125, + "learning_rate": 0.0013817067227643034, + "loss": 1.1459, + "step": 6864 + }, + { + "epoch": 0.6020040310812533, + "grad_norm": 0.0546875, + "learning_rate": 0.0013813071817670769, + "loss": 1.1573, + "step": 6865 + }, + { + "epoch": 0.6020917228556278, + "grad_norm": 0.05029296875, + "learning_rate": 0.0013809076652739416, + "loss": 1.156, + "step": 6866 + }, + { + "epoch": 0.6021794146300024, + "grad_norm": 0.048095703125, + "learning_rate": 0.0013805081733213333, + "loss": 1.1583, + "step": 6867 + }, + { + "epoch": 0.6022671064043769, + "grad_norm": 0.04638671875, + "learning_rate": 0.0013801087059456833, + "loss": 1.1862, + "step": 6868 + }, + { + "epoch": 0.6023547981787515, + "grad_norm": 0.054931640625, + "learning_rate": 0.001379709263183423, + "loss": 1.1801, + "step": 6869 + }, + { + "epoch": 0.602442489953126, + "grad_norm": 0.07666015625, + "learning_rate": 0.0013793098450709798, + "loss": 1.1178, + "step": 6870 + }, + { + "epoch": 0.6025301817275005, + "grad_norm": 0.049072265625, + "learning_rate": 0.0013789104516447803, + "loss": 1.1991, + "step": 6871 + }, + { + "epoch": 0.6026178735018751, + "grad_norm": 0.048828125, + "learning_rate": 0.0013785110829412479, + "loss": 1.0913, + "step": 6872 + }, + { + "epoch": 0.6027055652762496, + "grad_norm": 0.10205078125, + "learning_rate": 0.001378111738996803, + "loss": 1.1135, + "step": 6873 + }, + { + "epoch": 0.6027932570506241, + "grad_norm": 0.064453125, + "learning_rate": 0.0013777124198478667, + "loss": 1.129, + "step": 6874 + }, + { + "epoch": 0.6028809488249988, + "grad_norm": 0.055419921875, + "learning_rate": 0.0013773131255308542, + "loss": 1.1324, + "step": 6875 + }, + { + "epoch": 0.6029686405993733, + "grad_norm": 0.05078125, + "learning_rate": 0.0013769138560821806, + "loss": 1.1065, + "step": 6876 + }, + { + "epoch": 0.6030563323737478, + "grad_norm": 0.052001953125, + "learning_rate": 0.0013765146115382587, + "loss": 1.1309, + "step": 6877 + }, + { + "epoch": 0.6031440241481224, + "grad_norm": 0.04638671875, + "learning_rate": 0.0013761153919354982, + "loss": 1.1465, + "step": 6878 + }, + { + "epoch": 0.6032317159224969, + "grad_norm": 0.052978515625, + "learning_rate": 0.0013757161973103065, + "loss": 1.2025, + "step": 6879 + }, + { + "epoch": 0.6033194076968714, + "grad_norm": 0.0771484375, + "learning_rate": 0.0013753170276990898, + "loss": 1.1748, + "step": 6880 + }, + { + "epoch": 0.603407099471246, + "grad_norm": 0.06640625, + "learning_rate": 0.0013749178831382514, + "loss": 1.136, + "step": 6881 + }, + { + "epoch": 0.6034947912456206, + "grad_norm": 0.0830078125, + "learning_rate": 0.0013745187636641917, + "loss": 1.1192, + "step": 6882 + }, + { + "epoch": 0.6035824830199951, + "grad_norm": 0.053955078125, + "learning_rate": 0.0013741196693133092, + "loss": 1.1606, + "step": 6883 + }, + { + "epoch": 0.6036701747943697, + "grad_norm": 0.05126953125, + "learning_rate": 0.001373720600122001, + "loss": 1.1478, + "step": 6884 + }, + { + "epoch": 0.6037578665687442, + "grad_norm": 0.05078125, + "learning_rate": 0.001373321556126661, + "loss": 1.0844, + "step": 6885 + }, + { + "epoch": 0.6038455583431187, + "grad_norm": 0.058837890625, + "learning_rate": 0.001372922537363681, + "loss": 1.1169, + "step": 6886 + }, + { + "epoch": 0.6039332501174933, + "grad_norm": 0.08544921875, + "learning_rate": 0.0013725235438694498, + "loss": 1.2101, + "step": 6887 + }, + { + "epoch": 0.6040209418918678, + "grad_norm": 0.06298828125, + "learning_rate": 0.0013721245756803558, + "loss": 1.1506, + "step": 6888 + }, + { + "epoch": 0.6041086336662423, + "grad_norm": 0.09716796875, + "learning_rate": 0.0013717256328327828, + "loss": 1.2201, + "step": 6889 + }, + { + "epoch": 0.604196325440617, + "grad_norm": 0.068359375, + "learning_rate": 0.0013713267153631135, + "loss": 1.1558, + "step": 6890 + }, + { + "epoch": 0.6042840172149915, + "grad_norm": 0.091796875, + "learning_rate": 0.0013709278233077287, + "loss": 1.0958, + "step": 6891 + }, + { + "epoch": 0.604371708989366, + "grad_norm": 0.07470703125, + "learning_rate": 0.0013705289567030056, + "loss": 1.1832, + "step": 6892 + }, + { + "epoch": 0.6044594007637405, + "grad_norm": 0.0478515625, + "learning_rate": 0.0013701301155853202, + "loss": 1.1421, + "step": 6893 + }, + { + "epoch": 0.6045470925381151, + "grad_norm": 0.0517578125, + "learning_rate": 0.0013697312999910464, + "loss": 1.1767, + "step": 6894 + }, + { + "epoch": 0.6046347843124896, + "grad_norm": 0.060791015625, + "learning_rate": 0.001369332509956554, + "loss": 1.281, + "step": 6895 + }, + { + "epoch": 0.6047224760868641, + "grad_norm": 0.05908203125, + "learning_rate": 0.0013689337455182122, + "loss": 1.1507, + "step": 6896 + }, + { + "epoch": 0.6048101678612388, + "grad_norm": 0.048095703125, + "learning_rate": 0.0013685350067123873, + "loss": 1.1057, + "step": 6897 + }, + { + "epoch": 0.6048978596356133, + "grad_norm": 0.057373046875, + "learning_rate": 0.0013681362935754435, + "loss": 1.126, + "step": 6898 + }, + { + "epoch": 0.6049855514099878, + "grad_norm": 0.07421875, + "learning_rate": 0.0013677376061437417, + "loss": 1.1488, + "step": 6899 + }, + { + "epoch": 0.6050732431843624, + "grad_norm": 0.05078125, + "learning_rate": 0.0013673389444536417, + "loss": 1.1829, + "step": 6900 + }, + { + "epoch": 0.6051609349587369, + "grad_norm": 0.06640625, + "learning_rate": 0.0013669403085415, + "loss": 1.2205, + "step": 6901 + }, + { + "epoch": 0.6052486267331114, + "grad_norm": 0.109375, + "learning_rate": 0.0013665416984436718, + "loss": 1.1251, + "step": 6902 + }, + { + "epoch": 0.605336318507486, + "grad_norm": 0.06787109375, + "learning_rate": 0.0013661431141965088, + "loss": 1.1633, + "step": 6903 + }, + { + "epoch": 0.6054240102818605, + "grad_norm": 0.06591796875, + "learning_rate": 0.0013657445558363612, + "loss": 1.153, + "step": 6904 + }, + { + "epoch": 0.6055117020562351, + "grad_norm": 0.08837890625, + "learning_rate": 0.0013653460233995767, + "loss": 1.1302, + "step": 6905 + }, + { + "epoch": 0.6055993938306097, + "grad_norm": 0.05615234375, + "learning_rate": 0.0013649475169224992, + "loss": 1.1041, + "step": 6906 + }, + { + "epoch": 0.6056870856049842, + "grad_norm": 0.08740234375, + "learning_rate": 0.0013645490364414727, + "loss": 1.1579, + "step": 6907 + }, + { + "epoch": 0.6057747773793587, + "grad_norm": 0.06640625, + "learning_rate": 0.001364150581992837, + "loss": 1.1275, + "step": 6908 + }, + { + "epoch": 0.6058624691537333, + "grad_norm": 0.054931640625, + "learning_rate": 0.0013637521536129303, + "loss": 1.1988, + "step": 6909 + }, + { + "epoch": 0.6059501609281078, + "grad_norm": 0.06591796875, + "learning_rate": 0.0013633537513380886, + "loss": 1.1497, + "step": 6910 + }, + { + "epoch": 0.6060378527024823, + "grad_norm": 0.0615234375, + "learning_rate": 0.0013629553752046446, + "loss": 1.2842, + "step": 6911 + }, + { + "epoch": 0.606125544476857, + "grad_norm": 0.0625, + "learning_rate": 0.0013625570252489292, + "loss": 1.1625, + "step": 6912 + }, + { + "epoch": 0.6062132362512315, + "grad_norm": 0.0478515625, + "learning_rate": 0.0013621587015072714, + "loss": 1.1526, + "step": 6913 + }, + { + "epoch": 0.606300928025606, + "grad_norm": 0.05517578125, + "learning_rate": 0.0013617604040159967, + "loss": 1.1649, + "step": 6914 + }, + { + "epoch": 0.6063886197999805, + "grad_norm": 0.0732421875, + "learning_rate": 0.0013613621328114292, + "loss": 1.1654, + "step": 6915 + }, + { + "epoch": 0.6064763115743551, + "grad_norm": 0.050048828125, + "learning_rate": 0.0013609638879298897, + "loss": 1.1149, + "step": 6916 + }, + { + "epoch": 0.6065640033487296, + "grad_norm": 0.046142578125, + "learning_rate": 0.0013605656694076975, + "loss": 1.1657, + "step": 6917 + }, + { + "epoch": 0.6066516951231041, + "grad_norm": 0.060302734375, + "learning_rate": 0.0013601674772811697, + "loss": 1.1605, + "step": 6918 + }, + { + "epoch": 0.6067393868974787, + "grad_norm": 0.0703125, + "learning_rate": 0.001359769311586619, + "loss": 1.154, + "step": 6919 + }, + { + "epoch": 0.6068270786718533, + "grad_norm": 0.046630859375, + "learning_rate": 0.0013593711723603585, + "loss": 1.1168, + "step": 6920 + }, + { + "epoch": 0.6069147704462278, + "grad_norm": 0.046142578125, + "learning_rate": 0.0013589730596386964, + "loss": 1.169, + "step": 6921 + }, + { + "epoch": 0.6070024622206024, + "grad_norm": 0.05126953125, + "learning_rate": 0.001358574973457941, + "loss": 1.1701, + "step": 6922 + }, + { + "epoch": 0.6070901539949769, + "grad_norm": 0.05517578125, + "learning_rate": 0.0013581769138543947, + "loss": 1.1789, + "step": 6923 + }, + { + "epoch": 0.6071778457693514, + "grad_norm": 0.054931640625, + "learning_rate": 0.0013577788808643606, + "loss": 1.1562, + "step": 6924 + }, + { + "epoch": 0.607265537543726, + "grad_norm": 0.05029296875, + "learning_rate": 0.0013573808745241385, + "loss": 1.1661, + "step": 6925 + }, + { + "epoch": 0.6073532293181005, + "grad_norm": 0.05615234375, + "learning_rate": 0.001356982894870025, + "loss": 1.1582, + "step": 6926 + }, + { + "epoch": 0.607440921092475, + "grad_norm": 0.05126953125, + "learning_rate": 0.0013565849419383152, + "loss": 1.139, + "step": 6927 + }, + { + "epoch": 0.6075286128668497, + "grad_norm": 0.047607421875, + "learning_rate": 0.0013561870157653012, + "loss": 1.1224, + "step": 6928 + }, + { + "epoch": 0.6076163046412242, + "grad_norm": 0.0458984375, + "learning_rate": 0.001355789116387273, + "loss": 1.0936, + "step": 6929 + }, + { + "epoch": 0.6077039964155987, + "grad_norm": 0.06982421875, + "learning_rate": 0.0013553912438405175, + "loss": 1.1331, + "step": 6930 + }, + { + "epoch": 0.6077916881899733, + "grad_norm": 0.055419921875, + "learning_rate": 0.0013549933981613202, + "loss": 1.1388, + "step": 6931 + }, + { + "epoch": 0.6078793799643478, + "grad_norm": 0.05517578125, + "learning_rate": 0.0013545955793859636, + "loss": 1.2244, + "step": 6932 + }, + { + "epoch": 0.6079670717387223, + "grad_norm": 0.049072265625, + "learning_rate": 0.0013541977875507277, + "loss": 1.1788, + "step": 6933 + }, + { + "epoch": 0.608054763513097, + "grad_norm": 0.046875, + "learning_rate": 0.0013538000226918894, + "loss": 1.1546, + "step": 6934 + }, + { + "epoch": 0.6081424552874715, + "grad_norm": 0.0498046875, + "learning_rate": 0.0013534022848457252, + "loss": 1.1026, + "step": 6935 + }, + { + "epoch": 0.608230147061846, + "grad_norm": 0.050537109375, + "learning_rate": 0.001353004574048506, + "loss": 1.2076, + "step": 6936 + }, + { + "epoch": 0.6083178388362206, + "grad_norm": 0.047607421875, + "learning_rate": 0.0013526068903365039, + "loss": 1.1365, + "step": 6937 + }, + { + "epoch": 0.6084055306105951, + "grad_norm": 0.057861328125, + "learning_rate": 0.001352209233745985, + "loss": 1.1471, + "step": 6938 + }, + { + "epoch": 0.6084932223849696, + "grad_norm": 0.0625, + "learning_rate": 0.001351811604313216, + "loss": 1.1913, + "step": 6939 + }, + { + "epoch": 0.6085809141593441, + "grad_norm": 0.048095703125, + "learning_rate": 0.0013514140020744587, + "loss": 1.1136, + "step": 6940 + }, + { + "epoch": 0.6086686059337187, + "grad_norm": 0.047607421875, + "learning_rate": 0.0013510164270659729, + "loss": 1.1642, + "step": 6941 + }, + { + "epoch": 0.6087562977080933, + "grad_norm": 0.047607421875, + "learning_rate": 0.001350618879324018, + "loss": 1.0913, + "step": 6942 + }, + { + "epoch": 0.6088439894824678, + "grad_norm": 0.0439453125, + "learning_rate": 0.0013502213588848476, + "loss": 1.1066, + "step": 6943 + }, + { + "epoch": 0.6089316812568424, + "grad_norm": 0.0517578125, + "learning_rate": 0.001349823865784716, + "loss": 1.2153, + "step": 6944 + }, + { + "epoch": 0.6090193730312169, + "grad_norm": 0.04345703125, + "learning_rate": 0.0013494264000598725, + "loss": 1.1746, + "step": 6945 + }, + { + "epoch": 0.6091070648055914, + "grad_norm": 0.043701171875, + "learning_rate": 0.0013490289617465656, + "loss": 1.1373, + "step": 6946 + }, + { + "epoch": 0.609194756579966, + "grad_norm": 0.05126953125, + "learning_rate": 0.0013486315508810407, + "loss": 1.1602, + "step": 6947 + }, + { + "epoch": 0.6092824483543405, + "grad_norm": 0.05029296875, + "learning_rate": 0.00134823416749954, + "loss": 1.1731, + "step": 6948 + }, + { + "epoch": 0.609370140128715, + "grad_norm": 0.0673828125, + "learning_rate": 0.0013478368116383044, + "loss": 1.138, + "step": 6949 + }, + { + "epoch": 0.6094578319030897, + "grad_norm": 0.058837890625, + "learning_rate": 0.0013474394833335715, + "loss": 1.071, + "step": 6950 + }, + { + "epoch": 0.6095455236774642, + "grad_norm": 0.08544921875, + "learning_rate": 0.001347042182621577, + "loss": 1.1846, + "step": 6951 + }, + { + "epoch": 0.6096332154518387, + "grad_norm": 0.0693359375, + "learning_rate": 0.001346644909538553, + "loss": 1.1784, + "step": 6952 + }, + { + "epoch": 0.6097209072262133, + "grad_norm": 0.0693359375, + "learning_rate": 0.00134624766412073, + "loss": 1.1712, + "step": 6953 + }, + { + "epoch": 0.6098085990005878, + "grad_norm": 0.07177734375, + "learning_rate": 0.0013458504464043358, + "loss": 1.125, + "step": 6954 + }, + { + "epoch": 0.6098962907749623, + "grad_norm": 0.0849609375, + "learning_rate": 0.0013454532564255966, + "loss": 1.1824, + "step": 6955 + }, + { + "epoch": 0.609983982549337, + "grad_norm": 0.044189453125, + "learning_rate": 0.001345056094220734, + "loss": 1.1512, + "step": 6956 + }, + { + "epoch": 0.6100716743237115, + "grad_norm": 0.06884765625, + "learning_rate": 0.0013446589598259677, + "loss": 1.1292, + "step": 6957 + }, + { + "epoch": 0.610159366098086, + "grad_norm": 0.05224609375, + "learning_rate": 0.0013442618532775168, + "loss": 1.1319, + "step": 6958 + }, + { + "epoch": 0.6102470578724606, + "grad_norm": 0.046142578125, + "learning_rate": 0.0013438647746115947, + "loss": 1.1561, + "step": 6959 + }, + { + "epoch": 0.6103347496468351, + "grad_norm": 0.0498046875, + "learning_rate": 0.0013434677238644158, + "loss": 1.1709, + "step": 6960 + }, + { + "epoch": 0.6104224414212096, + "grad_norm": 0.0703125, + "learning_rate": 0.0013430707010721884, + "loss": 1.1731, + "step": 6961 + }, + { + "epoch": 0.6105101331955842, + "grad_norm": 0.06103515625, + "learning_rate": 0.0013426737062711212, + "loss": 1.1645, + "step": 6962 + }, + { + "epoch": 0.6105978249699587, + "grad_norm": 0.053466796875, + "learning_rate": 0.0013422767394974183, + "loss": 1.1936, + "step": 6963 + }, + { + "epoch": 0.6106855167443332, + "grad_norm": 0.0859375, + "learning_rate": 0.001341879800787283, + "loss": 1.1678, + "step": 6964 + }, + { + "epoch": 0.6107732085187078, + "grad_norm": 0.061767578125, + "learning_rate": 0.001341482890176914, + "loss": 1.1192, + "step": 6965 + }, + { + "epoch": 0.6108609002930824, + "grad_norm": 0.0654296875, + "learning_rate": 0.0013410860077025088, + "loss": 1.1294, + "step": 6966 + }, + { + "epoch": 0.6109485920674569, + "grad_norm": 0.06689453125, + "learning_rate": 0.0013406891534002625, + "loss": 1.1399, + "step": 6967 + }, + { + "epoch": 0.6110362838418314, + "grad_norm": 0.0498046875, + "learning_rate": 0.0013402923273063672, + "loss": 1.1161, + "step": 6968 + }, + { + "epoch": 0.611123975616206, + "grad_norm": 0.04638671875, + "learning_rate": 0.0013398955294570115, + "loss": 1.2074, + "step": 6969 + }, + { + "epoch": 0.6112116673905805, + "grad_norm": 0.0771484375, + "learning_rate": 0.0013394987598883833, + "loss": 1.132, + "step": 6970 + }, + { + "epoch": 0.611299359164955, + "grad_norm": 0.055419921875, + "learning_rate": 0.0013391020186366666, + "loss": 1.1569, + "step": 6971 + }, + { + "epoch": 0.6113870509393297, + "grad_norm": 0.044921875, + "learning_rate": 0.0013387053057380433, + "loss": 1.2112, + "step": 6972 + }, + { + "epoch": 0.6114747427137042, + "grad_norm": 0.04638671875, + "learning_rate": 0.0013383086212286931, + "loss": 1.1633, + "step": 6973 + }, + { + "epoch": 0.6115624344880787, + "grad_norm": 0.047119140625, + "learning_rate": 0.0013379119651447916, + "loss": 1.1634, + "step": 6974 + }, + { + "epoch": 0.6116501262624533, + "grad_norm": 0.0419921875, + "learning_rate": 0.0013375153375225126, + "loss": 1.1658, + "step": 6975 + }, + { + "epoch": 0.6117378180368278, + "grad_norm": 0.048583984375, + "learning_rate": 0.001337118738398029, + "loss": 1.1337, + "step": 6976 + }, + { + "epoch": 0.6118255098112023, + "grad_norm": 0.049072265625, + "learning_rate": 0.001336722167807508, + "loss": 1.2006, + "step": 6977 + }, + { + "epoch": 0.6119132015855769, + "grad_norm": 0.045166015625, + "learning_rate": 0.001336325625787117, + "loss": 1.149, + "step": 6978 + }, + { + "epoch": 0.6120008933599514, + "grad_norm": 0.048095703125, + "learning_rate": 0.0013359291123730191, + "loss": 1.1229, + "step": 6979 + }, + { + "epoch": 0.612088585134326, + "grad_norm": 0.054443359375, + "learning_rate": 0.0013355326276013756, + "loss": 1.1943, + "step": 6980 + }, + { + "epoch": 0.6121762769087006, + "grad_norm": 0.04833984375, + "learning_rate": 0.0013351361715083442, + "loss": 1.1539, + "step": 6981 + }, + { + "epoch": 0.6122639686830751, + "grad_norm": 0.05712890625, + "learning_rate": 0.0013347397441300815, + "loss": 1.1314, + "step": 6982 + }, + { + "epoch": 0.6123516604574496, + "grad_norm": 0.053466796875, + "learning_rate": 0.0013343433455027401, + "loss": 1.1423, + "step": 6983 + }, + { + "epoch": 0.6124393522318242, + "grad_norm": 0.05078125, + "learning_rate": 0.0013339469756624709, + "loss": 1.1411, + "step": 6984 + }, + { + "epoch": 0.6125270440061987, + "grad_norm": 0.051513671875, + "learning_rate": 0.0013335506346454213, + "loss": 1.2055, + "step": 6985 + }, + { + "epoch": 0.6126147357805732, + "grad_norm": 0.048095703125, + "learning_rate": 0.001333154322487737, + "loss": 1.1789, + "step": 6986 + }, + { + "epoch": 0.6127024275549477, + "grad_norm": 0.045166015625, + "learning_rate": 0.001332758039225561, + "loss": 1.089, + "step": 6987 + }, + { + "epoch": 0.6127901193293224, + "grad_norm": 0.0654296875, + "learning_rate": 0.0013323617848950327, + "loss": 1.1798, + "step": 6988 + }, + { + "epoch": 0.6128778111036969, + "grad_norm": 0.053955078125, + "learning_rate": 0.001331965559532289, + "loss": 1.1761, + "step": 6989 + }, + { + "epoch": 0.6129655028780714, + "grad_norm": 0.06005859375, + "learning_rate": 0.0013315693631734665, + "loss": 1.2367, + "step": 6990 + }, + { + "epoch": 0.613053194652446, + "grad_norm": 0.04638671875, + "learning_rate": 0.0013311731958546957, + "loss": 1.15, + "step": 6991 + }, + { + "epoch": 0.6131408864268205, + "grad_norm": 0.0810546875, + "learning_rate": 0.0013307770576121056, + "loss": 1.1308, + "step": 6992 + }, + { + "epoch": 0.613228578201195, + "grad_norm": 0.08056640625, + "learning_rate": 0.0013303809484818243, + "loss": 1.1075, + "step": 6993 + }, + { + "epoch": 0.6133162699755697, + "grad_norm": 0.046142578125, + "learning_rate": 0.001329984868499975, + "loss": 1.1455, + "step": 6994 + }, + { + "epoch": 0.6134039617499442, + "grad_norm": 0.05126953125, + "learning_rate": 0.0013295888177026803, + "loss": 1.126, + "step": 6995 + }, + { + "epoch": 0.6134916535243187, + "grad_norm": 0.060791015625, + "learning_rate": 0.0013291927961260577, + "loss": 1.1293, + "step": 6996 + }, + { + "epoch": 0.6135793452986933, + "grad_norm": 0.0478515625, + "learning_rate": 0.0013287968038062242, + "loss": 1.1212, + "step": 6997 + }, + { + "epoch": 0.6136670370730678, + "grad_norm": 0.049072265625, + "learning_rate": 0.0013284008407792927, + "loss": 1.1112, + "step": 6998 + }, + { + "epoch": 0.6137547288474423, + "grad_norm": 0.07080078125, + "learning_rate": 0.0013280049070813745, + "loss": 1.1704, + "step": 6999 + }, + { + "epoch": 0.6138424206218169, + "grad_norm": 0.045166015625, + "learning_rate": 0.001327609002748578, + "loss": 1.1397, + "step": 7000 + }, + { + "epoch": 0.6138424206218169, + "eval_loss": 1.160142183303833, + "eval_runtime": 429.0326, + "eval_samples_per_second": 33.673, + "eval_steps_per_second": 8.419, + "step": 7000 + }, + { + "epoch": 0.6139301123961914, + "grad_norm": 0.04833984375, + "learning_rate": 0.0013272131278170075, + "loss": 1.1611, + "step": 7001 + }, + { + "epoch": 0.614017804170566, + "grad_norm": 0.04541015625, + "learning_rate": 0.001326817282322767, + "loss": 1.1525, + "step": 7002 + }, + { + "epoch": 0.6141054959449406, + "grad_norm": 0.0498046875, + "learning_rate": 0.0013264214663019558, + "loss": 1.1286, + "step": 7003 + }, + { + "epoch": 0.6141931877193151, + "grad_norm": 0.051513671875, + "learning_rate": 0.0013260256797906714, + "loss": 1.1521, + "step": 7004 + }, + { + "epoch": 0.6142808794936896, + "grad_norm": 0.06298828125, + "learning_rate": 0.001325629922825009, + "loss": 1.1163, + "step": 7005 + }, + { + "epoch": 0.6143685712680642, + "grad_norm": 0.048095703125, + "learning_rate": 0.00132523419544106, + "loss": 1.1486, + "step": 7006 + }, + { + "epoch": 0.6144562630424387, + "grad_norm": 0.04541015625, + "learning_rate": 0.0013248384976749146, + "loss": 1.1185, + "step": 7007 + }, + { + "epoch": 0.6145439548168132, + "grad_norm": 0.0498046875, + "learning_rate": 0.0013244428295626585, + "loss": 1.1294, + "step": 7008 + }, + { + "epoch": 0.6146316465911879, + "grad_norm": 0.05029296875, + "learning_rate": 0.001324047191140376, + "loss": 1.1348, + "step": 7009 + }, + { + "epoch": 0.6147193383655624, + "grad_norm": 0.046630859375, + "learning_rate": 0.0013236515824441475, + "loss": 1.1134, + "step": 7010 + }, + { + "epoch": 0.6148070301399369, + "grad_norm": 0.0625, + "learning_rate": 0.001323256003510053, + "loss": 1.1252, + "step": 7011 + }, + { + "epoch": 0.6148947219143114, + "grad_norm": 0.04931640625, + "learning_rate": 0.001322860454374167, + "loss": 1.22, + "step": 7012 + }, + { + "epoch": 0.614982413688686, + "grad_norm": 0.0458984375, + "learning_rate": 0.0013224649350725632, + "loss": 1.1248, + "step": 7013 + }, + { + "epoch": 0.6150701054630605, + "grad_norm": 0.05224609375, + "learning_rate": 0.0013220694456413118, + "loss": 1.161, + "step": 7014 + }, + { + "epoch": 0.615157797237435, + "grad_norm": 0.048583984375, + "learning_rate": 0.0013216739861164808, + "loss": 1.165, + "step": 7015 + }, + { + "epoch": 0.6152454890118096, + "grad_norm": 0.0478515625, + "learning_rate": 0.001321278556534134, + "loss": 1.1638, + "step": 7016 + }, + { + "epoch": 0.6153331807861842, + "grad_norm": 0.05224609375, + "learning_rate": 0.0013208831569303347, + "loss": 1.175, + "step": 7017 + }, + { + "epoch": 0.6154208725605587, + "grad_norm": 0.049560546875, + "learning_rate": 0.001320487787341142, + "loss": 1.2447, + "step": 7018 + }, + { + "epoch": 0.6155085643349333, + "grad_norm": 0.05224609375, + "learning_rate": 0.001320092447802612, + "loss": 1.1454, + "step": 7019 + }, + { + "epoch": 0.6155962561093078, + "grad_norm": 0.05224609375, + "learning_rate": 0.0013196971383507992, + "loss": 1.1323, + "step": 7020 + }, + { + "epoch": 0.6156839478836823, + "grad_norm": 0.048583984375, + "learning_rate": 0.0013193018590217551, + "loss": 1.1776, + "step": 7021 + }, + { + "epoch": 0.6157716396580569, + "grad_norm": 0.04931640625, + "learning_rate": 0.0013189066098515275, + "loss": 1.2058, + "step": 7022 + }, + { + "epoch": 0.6158593314324314, + "grad_norm": 0.0498046875, + "learning_rate": 0.0013185113908761627, + "loss": 1.1369, + "step": 7023 + }, + { + "epoch": 0.6159470232068059, + "grad_norm": 0.053955078125, + "learning_rate": 0.0013181162021317033, + "loss": 1.1576, + "step": 7024 + }, + { + "epoch": 0.6160347149811806, + "grad_norm": 0.048828125, + "learning_rate": 0.001317721043654189, + "loss": 1.1829, + "step": 7025 + }, + { + "epoch": 0.6161224067555551, + "grad_norm": 0.0517578125, + "learning_rate": 0.001317325915479658, + "loss": 1.1451, + "step": 7026 + }, + { + "epoch": 0.6162100985299296, + "grad_norm": 0.050048828125, + "learning_rate": 0.001316930817644145, + "loss": 1.1137, + "step": 7027 + }, + { + "epoch": 0.6162977903043042, + "grad_norm": 0.05908203125, + "learning_rate": 0.001316535750183681, + "loss": 1.1314, + "step": 7028 + }, + { + "epoch": 0.6163854820786787, + "grad_norm": 0.04736328125, + "learning_rate": 0.0013161407131342961, + "loss": 1.0983, + "step": 7029 + }, + { + "epoch": 0.6164731738530532, + "grad_norm": 0.05322265625, + "learning_rate": 0.0013157457065320162, + "loss": 1.1452, + "step": 7030 + }, + { + "epoch": 0.6165608656274278, + "grad_norm": 0.06640625, + "learning_rate": 0.0013153507304128652, + "loss": 1.0899, + "step": 7031 + }, + { + "epoch": 0.6166485574018024, + "grad_norm": 0.0712890625, + "learning_rate": 0.0013149557848128634, + "loss": 1.2623, + "step": 7032 + }, + { + "epoch": 0.6167362491761769, + "grad_norm": 0.057373046875, + "learning_rate": 0.0013145608697680295, + "loss": 1.1314, + "step": 7033 + }, + { + "epoch": 0.6168239409505515, + "grad_norm": 0.09814453125, + "learning_rate": 0.001314165985314378, + "loss": 1.1606, + "step": 7034 + }, + { + "epoch": 0.616911632724926, + "grad_norm": 0.06591796875, + "learning_rate": 0.0013137711314879217, + "loss": 1.12, + "step": 7035 + }, + { + "epoch": 0.6169993244993005, + "grad_norm": 0.060302734375, + "learning_rate": 0.0013133763083246703, + "loss": 1.2337, + "step": 7036 + }, + { + "epoch": 0.617087016273675, + "grad_norm": 0.0517578125, + "learning_rate": 0.0013129815158606309, + "loss": 1.2394, + "step": 7037 + }, + { + "epoch": 0.6171747080480496, + "grad_norm": 0.059326171875, + "learning_rate": 0.0013125867541318068, + "loss": 1.1556, + "step": 7038 + }, + { + "epoch": 0.6172623998224241, + "grad_norm": 0.04931640625, + "learning_rate": 0.0013121920231741998, + "loss": 1.1269, + "step": 7039 + }, + { + "epoch": 0.6173500915967987, + "grad_norm": 0.061279296875, + "learning_rate": 0.0013117973230238084, + "loss": 1.2225, + "step": 7040 + }, + { + "epoch": 0.6174377833711733, + "grad_norm": 0.052490234375, + "learning_rate": 0.0013114026537166283, + "loss": 1.2224, + "step": 7041 + }, + { + "epoch": 0.6175254751455478, + "grad_norm": 0.09326171875, + "learning_rate": 0.001311008015288652, + "loss": 1.1639, + "step": 7042 + }, + { + "epoch": 0.6176131669199223, + "grad_norm": 0.0986328125, + "learning_rate": 0.001310613407775869, + "loss": 1.1485, + "step": 7043 + }, + { + "epoch": 0.6177008586942969, + "grad_norm": 0.0546875, + "learning_rate": 0.001310218831214268, + "loss": 1.1462, + "step": 7044 + }, + { + "epoch": 0.6177885504686714, + "grad_norm": 0.0751953125, + "learning_rate": 0.0013098242856398315, + "loss": 1.1115, + "step": 7045 + }, + { + "epoch": 0.6178762422430459, + "grad_norm": 0.060791015625, + "learning_rate": 0.0013094297710885424, + "loss": 1.1209, + "step": 7046 + }, + { + "epoch": 0.6179639340174206, + "grad_norm": 0.05078125, + "learning_rate": 0.001309035287596379, + "loss": 1.218, + "step": 7047 + }, + { + "epoch": 0.6180516257917951, + "grad_norm": 0.04638671875, + "learning_rate": 0.0013086408351993175, + "loss": 1.1006, + "step": 7048 + }, + { + "epoch": 0.6181393175661696, + "grad_norm": 0.0625, + "learning_rate": 0.0013082464139333298, + "loss": 1.1715, + "step": 7049 + }, + { + "epoch": 0.6182270093405442, + "grad_norm": 0.07275390625, + "learning_rate": 0.001307852023834388, + "loss": 1.146, + "step": 7050 + }, + { + "epoch": 0.6183147011149187, + "grad_norm": 0.05224609375, + "learning_rate": 0.0013074576649384575, + "loss": 1.1791, + "step": 7051 + }, + { + "epoch": 0.6184023928892932, + "grad_norm": 0.056884765625, + "learning_rate": 0.0013070633372815037, + "loss": 1.1356, + "step": 7052 + }, + { + "epoch": 0.6184900846636678, + "grad_norm": 0.0634765625, + "learning_rate": 0.0013066690408994888, + "loss": 1.1568, + "step": 7053 + }, + { + "epoch": 0.6185777764380423, + "grad_norm": 0.05859375, + "learning_rate": 0.0013062747758283706, + "loss": 1.1428, + "step": 7054 + }, + { + "epoch": 0.6186654682124169, + "grad_norm": 0.046630859375, + "learning_rate": 0.001305880542104106, + "loss": 1.1859, + "step": 7055 + }, + { + "epoch": 0.6187531599867915, + "grad_norm": 0.054931640625, + "learning_rate": 0.0013054863397626473, + "loss": 1.225, + "step": 7056 + }, + { + "epoch": 0.618840851761166, + "grad_norm": 0.053466796875, + "learning_rate": 0.0013050921688399452, + "loss": 1.1398, + "step": 7057 + }, + { + "epoch": 0.6189285435355405, + "grad_norm": 0.051025390625, + "learning_rate": 0.0013046980293719471, + "loss": 1.1559, + "step": 7058 + }, + { + "epoch": 0.619016235309915, + "grad_norm": 0.0458984375, + "learning_rate": 0.001304303921394597, + "loss": 1.1, + "step": 7059 + }, + { + "epoch": 0.6191039270842896, + "grad_norm": 0.046142578125, + "learning_rate": 0.0013039098449438371, + "loss": 1.1826, + "step": 7060 + }, + { + "epoch": 0.6191916188586641, + "grad_norm": 0.054931640625, + "learning_rate": 0.0013035158000556056, + "loss": 1.1743, + "step": 7061 + }, + { + "epoch": 0.6192793106330386, + "grad_norm": 0.04638671875, + "learning_rate": 0.0013031217867658389, + "loss": 1.1808, + "step": 7062 + }, + { + "epoch": 0.6193670024074133, + "grad_norm": 0.06640625, + "learning_rate": 0.0013027278051104694, + "loss": 1.1274, + "step": 7063 + }, + { + "epoch": 0.6194546941817878, + "grad_norm": 0.052978515625, + "learning_rate": 0.0013023338551254283, + "loss": 1.1416, + "step": 7064 + }, + { + "epoch": 0.6195423859561623, + "grad_norm": 0.07373046875, + "learning_rate": 0.0013019399368466415, + "loss": 1.1597, + "step": 7065 + }, + { + "epoch": 0.6196300777305369, + "grad_norm": 0.07177734375, + "learning_rate": 0.0013015460503100346, + "loss": 1.1999, + "step": 7066 + }, + { + "epoch": 0.6197177695049114, + "grad_norm": 0.06591796875, + "learning_rate": 0.0013011521955515275, + "loss": 1.1818, + "step": 7067 + }, + { + "epoch": 0.6198054612792859, + "grad_norm": 0.09423828125, + "learning_rate": 0.0013007583726070402, + "loss": 1.2153, + "step": 7068 + }, + { + "epoch": 0.6198931530536606, + "grad_norm": 0.059814453125, + "learning_rate": 0.0013003645815124875, + "loss": 1.2075, + "step": 7069 + }, + { + "epoch": 0.6199808448280351, + "grad_norm": 0.07666015625, + "learning_rate": 0.0012999708223037827, + "loss": 1.1377, + "step": 7070 + }, + { + "epoch": 0.6200685366024096, + "grad_norm": 0.111328125, + "learning_rate": 0.001299577095016835, + "loss": 1.1461, + "step": 7071 + }, + { + "epoch": 0.6201562283767842, + "grad_norm": 0.10595703125, + "learning_rate": 0.001299183399687552, + "loss": 1.2033, + "step": 7072 + }, + { + "epoch": 0.6202439201511587, + "grad_norm": 0.05859375, + "learning_rate": 0.0012987897363518368, + "loss": 1.0833, + "step": 7073 + }, + { + "epoch": 0.6203316119255332, + "grad_norm": 0.06396484375, + "learning_rate": 0.0012983961050455916, + "loss": 1.1631, + "step": 7074 + }, + { + "epoch": 0.6204193036999078, + "grad_norm": 0.0810546875, + "learning_rate": 0.0012980025058047138, + "loss": 1.1793, + "step": 7075 + }, + { + "epoch": 0.6205069954742823, + "grad_norm": 0.04833984375, + "learning_rate": 0.0012976089386650983, + "loss": 1.1512, + "step": 7076 + }, + { + "epoch": 0.6205946872486569, + "grad_norm": 0.064453125, + "learning_rate": 0.0012972154036626385, + "loss": 1.1151, + "step": 7077 + }, + { + "epoch": 0.6206823790230315, + "grad_norm": 0.050048828125, + "learning_rate": 0.0012968219008332228, + "loss": 1.2146, + "step": 7078 + }, + { + "epoch": 0.620770070797406, + "grad_norm": 0.060546875, + "learning_rate": 0.0012964284302127381, + "loss": 1.1997, + "step": 7079 + }, + { + "epoch": 0.6208577625717805, + "grad_norm": 0.048095703125, + "learning_rate": 0.0012960349918370677, + "loss": 1.1165, + "step": 7080 + }, + { + "epoch": 0.6209454543461551, + "grad_norm": 0.048583984375, + "learning_rate": 0.0012956415857420924, + "loss": 1.1553, + "step": 7081 + }, + { + "epoch": 0.6210331461205296, + "grad_norm": 0.046875, + "learning_rate": 0.00129524821196369, + "loss": 1.1352, + "step": 7082 + }, + { + "epoch": 0.6211208378949041, + "grad_norm": 0.047607421875, + "learning_rate": 0.0012948548705377344, + "loss": 1.0914, + "step": 7083 + }, + { + "epoch": 0.6212085296692786, + "grad_norm": 0.0498046875, + "learning_rate": 0.0012944615615000984, + "loss": 1.1849, + "step": 7084 + }, + { + "epoch": 0.6212962214436533, + "grad_norm": 0.06494140625, + "learning_rate": 0.0012940682848866494, + "loss": 1.1317, + "step": 7085 + }, + { + "epoch": 0.6213839132180278, + "grad_norm": 0.060546875, + "learning_rate": 0.0012936750407332544, + "loss": 1.163, + "step": 7086 + }, + { + "epoch": 0.6214716049924023, + "grad_norm": 0.06201171875, + "learning_rate": 0.0012932818290757758, + "loss": 1.1418, + "step": 7087 + }, + { + "epoch": 0.6215592967667769, + "grad_norm": 0.05859375, + "learning_rate": 0.0012928886499500736, + "loss": 1.1355, + "step": 7088 + }, + { + "epoch": 0.6216469885411514, + "grad_norm": 0.050537109375, + "learning_rate": 0.0012924955033920046, + "loss": 1.1441, + "step": 7089 + }, + { + "epoch": 0.6217346803155259, + "grad_norm": 0.048828125, + "learning_rate": 0.001292102389437423, + "loss": 1.1688, + "step": 7090 + }, + { + "epoch": 0.6218223720899005, + "grad_norm": 0.053955078125, + "learning_rate": 0.001291709308122179, + "loss": 1.1275, + "step": 7091 + }, + { + "epoch": 0.621910063864275, + "grad_norm": 0.068359375, + "learning_rate": 0.001291316259482122, + "loss": 1.1624, + "step": 7092 + }, + { + "epoch": 0.6219977556386496, + "grad_norm": 0.0439453125, + "learning_rate": 0.001290923243553096, + "loss": 1.1486, + "step": 7093 + }, + { + "epoch": 0.6220854474130242, + "grad_norm": 0.06298828125, + "learning_rate": 0.0012905302603709427, + "loss": 1.1929, + "step": 7094 + }, + { + "epoch": 0.6221731391873987, + "grad_norm": 0.050048828125, + "learning_rate": 0.001290137309971502, + "loss": 1.2062, + "step": 7095 + }, + { + "epoch": 0.6222608309617732, + "grad_norm": 0.044921875, + "learning_rate": 0.0012897443923906096, + "loss": 1.1259, + "step": 7096 + }, + { + "epoch": 0.6223485227361478, + "grad_norm": 0.05419921875, + "learning_rate": 0.001289351507664099, + "loss": 1.1629, + "step": 7097 + }, + { + "epoch": 0.6224362145105223, + "grad_norm": 0.050048828125, + "learning_rate": 0.0012889586558277992, + "loss": 1.1184, + "step": 7098 + }, + { + "epoch": 0.6225239062848968, + "grad_norm": 0.0498046875, + "learning_rate": 0.0012885658369175383, + "loss": 1.1373, + "step": 7099 + }, + { + "epoch": 0.6226115980592715, + "grad_norm": 0.0908203125, + "learning_rate": 0.0012881730509691402, + "loss": 1.2196, + "step": 7100 + }, + { + "epoch": 0.622699289833646, + "grad_norm": 0.06640625, + "learning_rate": 0.0012877802980184256, + "loss": 1.1609, + "step": 7101 + }, + { + "epoch": 0.6227869816080205, + "grad_norm": 0.0888671875, + "learning_rate": 0.0012873875781012126, + "loss": 1.1309, + "step": 7102 + }, + { + "epoch": 0.6228746733823951, + "grad_norm": 0.05615234375, + "learning_rate": 0.0012869948912533169, + "loss": 1.2125, + "step": 7103 + }, + { + "epoch": 0.6229623651567696, + "grad_norm": 0.046875, + "learning_rate": 0.0012866022375105494, + "loss": 1.1824, + "step": 7104 + }, + { + "epoch": 0.6230500569311441, + "grad_norm": 0.049072265625, + "learning_rate": 0.0012862096169087203, + "loss": 1.1788, + "step": 7105 + }, + { + "epoch": 0.6231377487055186, + "grad_norm": 0.044189453125, + "learning_rate": 0.0012858170294836346, + "loss": 1.1006, + "step": 7106 + }, + { + "epoch": 0.6232254404798933, + "grad_norm": 0.05859375, + "learning_rate": 0.0012854244752710956, + "loss": 1.1616, + "step": 7107 + }, + { + "epoch": 0.6233131322542678, + "grad_norm": 0.059814453125, + "learning_rate": 0.0012850319543069042, + "loss": 1.1207, + "step": 7108 + }, + { + "epoch": 0.6234008240286423, + "grad_norm": 0.057861328125, + "learning_rate": 0.0012846394666268556, + "loss": 1.17, + "step": 7109 + }, + { + "epoch": 0.6234885158030169, + "grad_norm": 0.051513671875, + "learning_rate": 0.0012842470122667443, + "loss": 1.1747, + "step": 7110 + }, + { + "epoch": 0.6235762075773914, + "grad_norm": 0.04833984375, + "learning_rate": 0.0012838545912623618, + "loss": 1.1349, + "step": 7111 + }, + { + "epoch": 0.6236638993517659, + "grad_norm": 0.0634765625, + "learning_rate": 0.0012834622036494952, + "loss": 1.195, + "step": 7112 + }, + { + "epoch": 0.6237515911261405, + "grad_norm": 0.05419921875, + "learning_rate": 0.0012830698494639295, + "loss": 1.0944, + "step": 7113 + }, + { + "epoch": 0.623839282900515, + "grad_norm": 0.046875, + "learning_rate": 0.001282677528741446, + "loss": 1.125, + "step": 7114 + }, + { + "epoch": 0.6239269746748896, + "grad_norm": 0.06884765625, + "learning_rate": 0.0012822852415178244, + "loss": 1.1442, + "step": 7115 + }, + { + "epoch": 0.6240146664492642, + "grad_norm": 0.0576171875, + "learning_rate": 0.0012818929878288384, + "loss": 1.1898, + "step": 7116 + }, + { + "epoch": 0.6241023582236387, + "grad_norm": 0.057861328125, + "learning_rate": 0.0012815007677102624, + "loss": 1.1723, + "step": 7117 + }, + { + "epoch": 0.6241900499980132, + "grad_norm": 0.048095703125, + "learning_rate": 0.0012811085811978648, + "loss": 1.2259, + "step": 7118 + }, + { + "epoch": 0.6242777417723878, + "grad_norm": 0.053955078125, + "learning_rate": 0.0012807164283274127, + "loss": 1.108, + "step": 7119 + }, + { + "epoch": 0.6243654335467623, + "grad_norm": 0.060302734375, + "learning_rate": 0.0012803243091346684, + "loss": 1.1421, + "step": 7120 + }, + { + "epoch": 0.6244531253211368, + "grad_norm": 0.052978515625, + "learning_rate": 0.0012799322236553936, + "loss": 1.153, + "step": 7121 + }, + { + "epoch": 0.6245408170955115, + "grad_norm": 0.053955078125, + "learning_rate": 0.001279540171925344, + "loss": 1.1382, + "step": 7122 + }, + { + "epoch": 0.624628508869886, + "grad_norm": 0.055419921875, + "learning_rate": 0.0012791481539802748, + "loss": 1.1608, + "step": 7123 + }, + { + "epoch": 0.6247162006442605, + "grad_norm": 0.0693359375, + "learning_rate": 0.0012787561698559362, + "loss": 1.1136, + "step": 7124 + }, + { + "epoch": 0.6248038924186351, + "grad_norm": 0.04541015625, + "learning_rate": 0.001278364219588077, + "loss": 1.2437, + "step": 7125 + }, + { + "epoch": 0.6248915841930096, + "grad_norm": 0.0712890625, + "learning_rate": 0.0012779723032124417, + "loss": 1.1347, + "step": 7126 + }, + { + "epoch": 0.6249792759673841, + "grad_norm": 0.08837890625, + "learning_rate": 0.0012775804207647717, + "loss": 1.1173, + "step": 7127 + }, + { + "epoch": 0.6250669677417587, + "grad_norm": 0.05322265625, + "learning_rate": 0.001277188572280806, + "loss": 1.1661, + "step": 7128 + }, + { + "epoch": 0.6251546595161332, + "grad_norm": 0.060546875, + "learning_rate": 0.00127679675779628, + "loss": 1.1156, + "step": 7129 + }, + { + "epoch": 0.6252423512905078, + "grad_norm": 0.060302734375, + "learning_rate": 0.001276404977346926, + "loss": 1.087, + "step": 7130 + }, + { + "epoch": 0.6253300430648823, + "grad_norm": 0.050537109375, + "learning_rate": 0.0012760132309684738, + "loss": 1.1409, + "step": 7131 + }, + { + "epoch": 0.6254177348392569, + "grad_norm": 0.060302734375, + "learning_rate": 0.0012756215186966497, + "loss": 1.1705, + "step": 7132 + }, + { + "epoch": 0.6255054266136314, + "grad_norm": 0.052001953125, + "learning_rate": 0.0012752298405671763, + "loss": 1.1707, + "step": 7133 + }, + { + "epoch": 0.6255931183880059, + "grad_norm": 0.05078125, + "learning_rate": 0.0012748381966157742, + "loss": 1.1785, + "step": 7134 + }, + { + "epoch": 0.6256808101623805, + "grad_norm": 0.049560546875, + "learning_rate": 0.0012744465868781605, + "loss": 1.1098, + "step": 7135 + }, + { + "epoch": 0.625768501936755, + "grad_norm": 0.0458984375, + "learning_rate": 0.0012740550113900478, + "loss": 1.0921, + "step": 7136 + }, + { + "epoch": 0.6258561937111295, + "grad_norm": 0.06591796875, + "learning_rate": 0.001273663470187148, + "loss": 1.145, + "step": 7137 + }, + { + "epoch": 0.6259438854855042, + "grad_norm": 0.046630859375, + "learning_rate": 0.001273271963305168, + "loss": 1.1525, + "step": 7138 + }, + { + "epoch": 0.6260315772598787, + "grad_norm": 0.0595703125, + "learning_rate": 0.0012728804907798127, + "loss": 1.1481, + "step": 7139 + }, + { + "epoch": 0.6261192690342532, + "grad_norm": 0.046142578125, + "learning_rate": 0.001272489052646783, + "loss": 1.1546, + "step": 7140 + }, + { + "epoch": 0.6262069608086278, + "grad_norm": 0.0849609375, + "learning_rate": 0.001272097648941777, + "loss": 1.1193, + "step": 7141 + }, + { + "epoch": 0.6262946525830023, + "grad_norm": 0.0986328125, + "learning_rate": 0.0012717062797004905, + "loss": 1.254, + "step": 7142 + }, + { + "epoch": 0.6263823443573768, + "grad_norm": 0.05126953125, + "learning_rate": 0.0012713149449586141, + "loss": 1.1184, + "step": 7143 + }, + { + "epoch": 0.6264700361317515, + "grad_norm": 0.060546875, + "learning_rate": 0.0012709236447518374, + "loss": 1.1392, + "step": 7144 + }, + { + "epoch": 0.626557727906126, + "grad_norm": 0.0537109375, + "learning_rate": 0.0012705323791158455, + "loss": 1.1812, + "step": 7145 + }, + { + "epoch": 0.6266454196805005, + "grad_norm": 0.055908203125, + "learning_rate": 0.0012701411480863212, + "loss": 1.1901, + "step": 7146 + }, + { + "epoch": 0.6267331114548751, + "grad_norm": 0.049072265625, + "learning_rate": 0.0012697499516989435, + "loss": 1.1907, + "step": 7147 + }, + { + "epoch": 0.6268208032292496, + "grad_norm": 0.04638671875, + "learning_rate": 0.0012693587899893887, + "loss": 1.1438, + "step": 7148 + }, + { + "epoch": 0.6269084950036241, + "grad_norm": 0.05712890625, + "learning_rate": 0.0012689676629933295, + "loss": 1.1743, + "step": 7149 + }, + { + "epoch": 0.6269961867779987, + "grad_norm": 0.048583984375, + "learning_rate": 0.001268576570746436, + "loss": 1.1376, + "step": 7150 + }, + { + "epoch": 0.6270838785523732, + "grad_norm": 0.05810546875, + "learning_rate": 0.0012681855132843744, + "loss": 1.1764, + "step": 7151 + }, + { + "epoch": 0.6271715703267478, + "grad_norm": 0.06982421875, + "learning_rate": 0.0012677944906428085, + "loss": 1.1481, + "step": 7152 + }, + { + "epoch": 0.6272592621011224, + "grad_norm": 0.047607421875, + "learning_rate": 0.0012674035028573981, + "loss": 1.1515, + "step": 7153 + }, + { + "epoch": 0.6273469538754969, + "grad_norm": 0.052978515625, + "learning_rate": 0.0012670125499638011, + "loss": 1.138, + "step": 7154 + }, + { + "epoch": 0.6274346456498714, + "grad_norm": 0.08251953125, + "learning_rate": 0.0012666216319976702, + "loss": 1.1233, + "step": 7155 + }, + { + "epoch": 0.6275223374242459, + "grad_norm": 0.08203125, + "learning_rate": 0.0012662307489946572, + "loss": 1.1358, + "step": 7156 + }, + { + "epoch": 0.6276100291986205, + "grad_norm": 0.062255859375, + "learning_rate": 0.0012658399009904092, + "loss": 1.1568, + "step": 7157 + }, + { + "epoch": 0.627697720972995, + "grad_norm": 0.0625, + "learning_rate": 0.0012654490880205705, + "loss": 1.1594, + "step": 7158 + }, + { + "epoch": 0.6277854127473695, + "grad_norm": 0.09716796875, + "learning_rate": 0.0012650583101207827, + "loss": 1.1967, + "step": 7159 + }, + { + "epoch": 0.6278731045217442, + "grad_norm": 0.0625, + "learning_rate": 0.0012646675673266827, + "loss": 1.1724, + "step": 7160 + }, + { + "epoch": 0.6279607962961187, + "grad_norm": 0.060791015625, + "learning_rate": 0.001264276859673906, + "loss": 1.0737, + "step": 7161 + }, + { + "epoch": 0.6280484880704932, + "grad_norm": 0.048828125, + "learning_rate": 0.0012638861871980837, + "loss": 1.189, + "step": 7162 + }, + { + "epoch": 0.6281361798448678, + "grad_norm": 0.0859375, + "learning_rate": 0.0012634955499348447, + "loss": 1.1351, + "step": 7163 + }, + { + "epoch": 0.6282238716192423, + "grad_norm": 0.052001953125, + "learning_rate": 0.0012631049479198137, + "loss": 1.1214, + "step": 7164 + }, + { + "epoch": 0.6283115633936168, + "grad_norm": 0.052490234375, + "learning_rate": 0.0012627143811886124, + "loss": 1.1078, + "step": 7165 + }, + { + "epoch": 0.6283992551679914, + "grad_norm": 0.044677734375, + "learning_rate": 0.0012623238497768603, + "loss": 1.1501, + "step": 7166 + }, + { + "epoch": 0.628486946942366, + "grad_norm": 0.046875, + "learning_rate": 0.0012619333537201717, + "loss": 1.1204, + "step": 7167 + }, + { + "epoch": 0.6285746387167405, + "grad_norm": 0.051025390625, + "learning_rate": 0.0012615428930541602, + "loss": 1.1181, + "step": 7168 + }, + { + "epoch": 0.6286623304911151, + "grad_norm": 0.0546875, + "learning_rate": 0.0012611524678144332, + "loss": 1.1198, + "step": 7169 + }, + { + "epoch": 0.6287500222654896, + "grad_norm": 0.048583984375, + "learning_rate": 0.001260762078036598, + "loss": 1.1633, + "step": 7170 + }, + { + "epoch": 0.6288377140398641, + "grad_norm": 0.045654296875, + "learning_rate": 0.0012603717237562562, + "loss": 1.1155, + "step": 7171 + }, + { + "epoch": 0.6289254058142387, + "grad_norm": 0.049072265625, + "learning_rate": 0.0012599814050090075, + "loss": 1.1685, + "step": 7172 + }, + { + "epoch": 0.6290130975886132, + "grad_norm": 0.0712890625, + "learning_rate": 0.0012595911218304476, + "loss": 1.1616, + "step": 7173 + }, + { + "epoch": 0.6291007893629877, + "grad_norm": 0.044677734375, + "learning_rate": 0.00125920087425617, + "loss": 1.1229, + "step": 7174 + }, + { + "epoch": 0.6291884811373624, + "grad_norm": 0.048828125, + "learning_rate": 0.0012588106623217632, + "loss": 1.1745, + "step": 7175 + }, + { + "epoch": 0.6292761729117369, + "grad_norm": 0.051513671875, + "learning_rate": 0.001258420486062815, + "loss": 1.1212, + "step": 7176 + }, + { + "epoch": 0.6293638646861114, + "grad_norm": 0.040771484375, + "learning_rate": 0.0012580303455149073, + "loss": 1.1196, + "step": 7177 + }, + { + "epoch": 0.6294515564604859, + "grad_norm": 0.046875, + "learning_rate": 0.0012576402407136198, + "loss": 1.0892, + "step": 7178 + }, + { + "epoch": 0.6295392482348605, + "grad_norm": 0.050048828125, + "learning_rate": 0.0012572501716945298, + "loss": 1.1233, + "step": 7179 + }, + { + "epoch": 0.629626940009235, + "grad_norm": 0.050537109375, + "learning_rate": 0.0012568601384932102, + "loss": 1.1242, + "step": 7180 + }, + { + "epoch": 0.6297146317836095, + "grad_norm": 0.047119140625, + "learning_rate": 0.0012564701411452311, + "loss": 1.2031, + "step": 7181 + }, + { + "epoch": 0.6298023235579842, + "grad_norm": 0.048095703125, + "learning_rate": 0.0012560801796861587, + "loss": 1.1606, + "step": 7182 + }, + { + "epoch": 0.6298900153323587, + "grad_norm": 0.054931640625, + "learning_rate": 0.0012556902541515577, + "loss": 1.1174, + "step": 7183 + }, + { + "epoch": 0.6299777071067332, + "grad_norm": 0.04736328125, + "learning_rate": 0.0012553003645769873, + "loss": 1.1626, + "step": 7184 + }, + { + "epoch": 0.6300653988811078, + "grad_norm": 0.044189453125, + "learning_rate": 0.0012549105109980051, + "loss": 1.0986, + "step": 7185 + }, + { + "epoch": 0.6301530906554823, + "grad_norm": 0.046630859375, + "learning_rate": 0.001254520693450164, + "loss": 1.1397, + "step": 7186 + }, + { + "epoch": 0.6302407824298568, + "grad_norm": 0.046630859375, + "learning_rate": 0.0012541309119690146, + "loss": 1.0868, + "step": 7187 + }, + { + "epoch": 0.6303284742042314, + "grad_norm": 0.048828125, + "learning_rate": 0.0012537411665901046, + "loss": 1.1596, + "step": 7188 + }, + { + "epoch": 0.630416165978606, + "grad_norm": 0.050537109375, + "learning_rate": 0.0012533514573489766, + "loss": 1.0838, + "step": 7189 + }, + { + "epoch": 0.6305038577529805, + "grad_norm": 0.049560546875, + "learning_rate": 0.0012529617842811725, + "loss": 1.115, + "step": 7190 + }, + { + "epoch": 0.6305915495273551, + "grad_norm": 0.050537109375, + "learning_rate": 0.001252572147422228, + "loss": 1.1466, + "step": 7191 + }, + { + "epoch": 0.6306792413017296, + "grad_norm": 0.050537109375, + "learning_rate": 0.0012521825468076783, + "loss": 1.1557, + "step": 7192 + }, + { + "epoch": 0.6307669330761041, + "grad_norm": 0.041748046875, + "learning_rate": 0.001251792982473054, + "loss": 1.114, + "step": 7193 + }, + { + "epoch": 0.6308546248504787, + "grad_norm": 0.046630859375, + "learning_rate": 0.0012514034544538805, + "loss": 1.0619, + "step": 7194 + }, + { + "epoch": 0.6309423166248532, + "grad_norm": 0.050048828125, + "learning_rate": 0.0012510139627856835, + "loss": 1.1903, + "step": 7195 + }, + { + "epoch": 0.6310300083992277, + "grad_norm": 0.04736328125, + "learning_rate": 0.0012506245075039827, + "loss": 1.1645, + "step": 7196 + }, + { + "epoch": 0.6311177001736024, + "grad_norm": 0.043701171875, + "learning_rate": 0.0012502350886442963, + "loss": 1.1891, + "step": 7197 + }, + { + "epoch": 0.6312053919479769, + "grad_norm": 0.05078125, + "learning_rate": 0.0012498457062421378, + "loss": 1.1088, + "step": 7198 + }, + { + "epoch": 0.6312930837223514, + "grad_norm": 0.054931640625, + "learning_rate": 0.0012494563603330175, + "loss": 1.2219, + "step": 7199 + }, + { + "epoch": 0.631380775496726, + "grad_norm": 0.04833984375, + "learning_rate": 0.0012490670509524434, + "loss": 1.1092, + "step": 7200 + }, + { + "epoch": 0.6314684672711005, + "grad_norm": 0.045654296875, + "learning_rate": 0.0012486777781359195, + "loss": 1.2014, + "step": 7201 + }, + { + "epoch": 0.631556159045475, + "grad_norm": 0.0478515625, + "learning_rate": 0.0012482885419189459, + "loss": 1.1901, + "step": 7202 + }, + { + "epoch": 0.6316438508198495, + "grad_norm": 0.050537109375, + "learning_rate": 0.0012478993423370205, + "loss": 1.1738, + "step": 7203 + }, + { + "epoch": 0.6317315425942241, + "grad_norm": 0.04931640625, + "learning_rate": 0.0012475101794256365, + "loss": 1.1432, + "step": 7204 + }, + { + "epoch": 0.6318192343685987, + "grad_norm": 0.05322265625, + "learning_rate": 0.0012471210532202857, + "loss": 1.1686, + "step": 7205 + }, + { + "epoch": 0.6319069261429732, + "grad_norm": 0.04833984375, + "learning_rate": 0.0012467319637564543, + "loss": 1.1431, + "step": 7206 + }, + { + "epoch": 0.6319946179173478, + "grad_norm": 0.059814453125, + "learning_rate": 0.0012463429110696272, + "loss": 1.1414, + "step": 7207 + }, + { + "epoch": 0.6320823096917223, + "grad_norm": 0.047119140625, + "learning_rate": 0.0012459538951952844, + "loss": 1.1469, + "step": 7208 + }, + { + "epoch": 0.6321700014660968, + "grad_norm": 0.0458984375, + "learning_rate": 0.0012455649161689033, + "loss": 1.1107, + "step": 7209 + }, + { + "epoch": 0.6322576932404714, + "grad_norm": 0.052490234375, + "learning_rate": 0.001245175974025959, + "loss": 1.1063, + "step": 7210 + }, + { + "epoch": 0.6323453850148459, + "grad_norm": 0.091796875, + "learning_rate": 0.0012447870688019193, + "loss": 1.1451, + "step": 7211 + }, + { + "epoch": 0.6324330767892204, + "grad_norm": 0.05224609375, + "learning_rate": 0.0012443982005322534, + "loss": 1.1692, + "step": 7212 + }, + { + "epoch": 0.6325207685635951, + "grad_norm": 0.0673828125, + "learning_rate": 0.0012440093692524241, + "loss": 1.1065, + "step": 7213 + }, + { + "epoch": 0.6326084603379696, + "grad_norm": 0.0546875, + "learning_rate": 0.0012436205749978928, + "loss": 1.0812, + "step": 7214 + }, + { + "epoch": 0.6326961521123441, + "grad_norm": 0.044677734375, + "learning_rate": 0.0012432318178041157, + "loss": 1.1128, + "step": 7215 + }, + { + "epoch": 0.6327838438867187, + "grad_norm": 0.056640625, + "learning_rate": 0.0012428430977065466, + "loss": 1.1139, + "step": 7216 + }, + { + "epoch": 0.6328715356610932, + "grad_norm": 0.052001953125, + "learning_rate": 0.0012424544147406364, + "loss": 1.0918, + "step": 7217 + }, + { + "epoch": 0.6329592274354677, + "grad_norm": 0.06591796875, + "learning_rate": 0.0012420657689418315, + "loss": 1.142, + "step": 7218 + }, + { + "epoch": 0.6330469192098424, + "grad_norm": 0.05517578125, + "learning_rate": 0.0012416771603455752, + "loss": 1.1275, + "step": 7219 + }, + { + "epoch": 0.6331346109842169, + "grad_norm": 0.05859375, + "learning_rate": 0.0012412885889873074, + "loss": 1.1352, + "step": 7220 + }, + { + "epoch": 0.6332223027585914, + "grad_norm": 0.06298828125, + "learning_rate": 0.001240900054902466, + "loss": 1.1584, + "step": 7221 + }, + { + "epoch": 0.633309994532966, + "grad_norm": 0.047119140625, + "learning_rate": 0.0012405115581264829, + "loss": 1.1348, + "step": 7222 + }, + { + "epoch": 0.6333976863073405, + "grad_norm": 0.04638671875, + "learning_rate": 0.0012401230986947885, + "loss": 1.1354, + "step": 7223 + }, + { + "epoch": 0.633485378081715, + "grad_norm": 0.052978515625, + "learning_rate": 0.0012397346766428099, + "loss": 1.1097, + "step": 7224 + }, + { + "epoch": 0.6335730698560896, + "grad_norm": 0.05517578125, + "learning_rate": 0.0012393462920059696, + "loss": 1.168, + "step": 7225 + }, + { + "epoch": 0.6336607616304641, + "grad_norm": 0.0703125, + "learning_rate": 0.0012389579448196868, + "loss": 1.1622, + "step": 7226 + }, + { + "epoch": 0.6337484534048387, + "grad_norm": 0.0859375, + "learning_rate": 0.0012385696351193795, + "loss": 1.1357, + "step": 7227 + }, + { + "epoch": 0.6338361451792132, + "grad_norm": 0.0673828125, + "learning_rate": 0.0012381813629404586, + "loss": 1.0921, + "step": 7228 + }, + { + "epoch": 0.6339238369535878, + "grad_norm": 0.048583984375, + "learning_rate": 0.0012377931283183345, + "loss": 1.194, + "step": 7229 + }, + { + "epoch": 0.6340115287279623, + "grad_norm": 0.055908203125, + "learning_rate": 0.0012374049312884128, + "loss": 1.1458, + "step": 7230 + }, + { + "epoch": 0.6340992205023368, + "grad_norm": 0.06201171875, + "learning_rate": 0.0012370167718860963, + "loss": 1.1036, + "step": 7231 + }, + { + "epoch": 0.6341869122767114, + "grad_norm": 0.052490234375, + "learning_rate": 0.0012366286501467843, + "loss": 1.0911, + "step": 7232 + }, + { + "epoch": 0.6342746040510859, + "grad_norm": 0.05419921875, + "learning_rate": 0.0012362405661058718, + "loss": 1.1664, + "step": 7233 + }, + { + "epoch": 0.6343622958254604, + "grad_norm": 0.056884765625, + "learning_rate": 0.0012358525197987522, + "loss": 1.1619, + "step": 7234 + }, + { + "epoch": 0.6344499875998351, + "grad_norm": 0.08154296875, + "learning_rate": 0.0012354645112608131, + "loss": 1.0955, + "step": 7235 + }, + { + "epoch": 0.6345376793742096, + "grad_norm": 0.0478515625, + "learning_rate": 0.0012350765405274406, + "loss": 1.1524, + "step": 7236 + }, + { + "epoch": 0.6346253711485841, + "grad_norm": 0.07666015625, + "learning_rate": 0.0012346886076340167, + "loss": 1.2004, + "step": 7237 + }, + { + "epoch": 0.6347130629229587, + "grad_norm": 0.0703125, + "learning_rate": 0.0012343007126159196, + "loss": 1.1758, + "step": 7238 + }, + { + "epoch": 0.6348007546973332, + "grad_norm": 0.06494140625, + "learning_rate": 0.0012339128555085245, + "loss": 1.122, + "step": 7239 + }, + { + "epoch": 0.6348884464717077, + "grad_norm": 0.054443359375, + "learning_rate": 0.0012335250363472024, + "loss": 1.1258, + "step": 7240 + }, + { + "epoch": 0.6349761382460823, + "grad_norm": 0.046875, + "learning_rate": 0.001233137255167322, + "loss": 1.1516, + "step": 7241 + }, + { + "epoch": 0.6350638300204569, + "grad_norm": 0.053466796875, + "learning_rate": 0.001232749512004248, + "loss": 1.1319, + "step": 7242 + }, + { + "epoch": 0.6351515217948314, + "grad_norm": 0.051025390625, + "learning_rate": 0.0012323618068933412, + "loss": 1.1533, + "step": 7243 + }, + { + "epoch": 0.635239213569206, + "grad_norm": 0.04638671875, + "learning_rate": 0.0012319741398699603, + "loss": 1.1871, + "step": 7244 + }, + { + "epoch": 0.6353269053435805, + "grad_norm": 0.04833984375, + "learning_rate": 0.0012315865109694577, + "loss": 1.1659, + "step": 7245 + }, + { + "epoch": 0.635414597117955, + "grad_norm": 0.0546875, + "learning_rate": 0.0012311989202271859, + "loss": 1.1952, + "step": 7246 + }, + { + "epoch": 0.6355022888923296, + "grad_norm": 0.04931640625, + "learning_rate": 0.0012308113676784906, + "loss": 1.1106, + "step": 7247 + }, + { + "epoch": 0.6355899806667041, + "grad_norm": 0.052734375, + "learning_rate": 0.001230423853358717, + "loss": 1.1539, + "step": 7248 + }, + { + "epoch": 0.6356776724410786, + "grad_norm": 0.047119140625, + "learning_rate": 0.0012300363773032048, + "loss": 1.1665, + "step": 7249 + }, + { + "epoch": 0.6357653642154532, + "grad_norm": 0.05029296875, + "learning_rate": 0.001229648939547291, + "loss": 1.2366, + "step": 7250 + }, + { + "epoch": 0.6358530559898278, + "grad_norm": 0.051513671875, + "learning_rate": 0.001229261540126308, + "loss": 1.1753, + "step": 7251 + }, + { + "epoch": 0.6359407477642023, + "grad_norm": 0.047607421875, + "learning_rate": 0.0012288741790755873, + "loss": 1.1539, + "step": 7252 + }, + { + "epoch": 0.6360284395385768, + "grad_norm": 0.05078125, + "learning_rate": 0.0012284868564304539, + "loss": 1.1168, + "step": 7253 + }, + { + "epoch": 0.6361161313129514, + "grad_norm": 0.045654296875, + "learning_rate": 0.0012280995722262313, + "loss": 1.1243, + "step": 7254 + }, + { + "epoch": 0.6362038230873259, + "grad_norm": 0.052978515625, + "learning_rate": 0.0012277123264982385, + "loss": 1.18, + "step": 7255 + }, + { + "epoch": 0.6362915148617004, + "grad_norm": 0.0498046875, + "learning_rate": 0.0012273251192817913, + "loss": 1.179, + "step": 7256 + }, + { + "epoch": 0.6363792066360751, + "grad_norm": 0.05712890625, + "learning_rate": 0.001226937950612202, + "loss": 1.0844, + "step": 7257 + }, + { + "epoch": 0.6364668984104496, + "grad_norm": 0.058837890625, + "learning_rate": 0.0012265508205247797, + "loss": 1.2039, + "step": 7258 + }, + { + "epoch": 0.6365545901848241, + "grad_norm": 0.0439453125, + "learning_rate": 0.0012261637290548293, + "loss": 1.1832, + "step": 7259 + }, + { + "epoch": 0.6366422819591987, + "grad_norm": 0.048583984375, + "learning_rate": 0.0012257766762376527, + "loss": 1.1512, + "step": 7260 + }, + { + "epoch": 0.6367299737335732, + "grad_norm": 0.0673828125, + "learning_rate": 0.0012253896621085487, + "loss": 1.1915, + "step": 7261 + }, + { + "epoch": 0.6368176655079477, + "grad_norm": 0.04541015625, + "learning_rate": 0.0012250026867028105, + "loss": 1.1925, + "step": 7262 + }, + { + "epoch": 0.6369053572823223, + "grad_norm": 0.046630859375, + "learning_rate": 0.0012246157500557307, + "loss": 1.1498, + "step": 7263 + }, + { + "epoch": 0.6369930490566968, + "grad_norm": 0.048583984375, + "learning_rate": 0.0012242288522025962, + "loss": 1.1403, + "step": 7264 + }, + { + "epoch": 0.6370807408310714, + "grad_norm": 0.05419921875, + "learning_rate": 0.0012238419931786912, + "loss": 1.1598, + "step": 7265 + }, + { + "epoch": 0.637168432605446, + "grad_norm": 0.059814453125, + "learning_rate": 0.0012234551730192962, + "loss": 1.1859, + "step": 7266 + }, + { + "epoch": 0.6372561243798205, + "grad_norm": 0.047119140625, + "learning_rate": 0.0012230683917596884, + "loss": 1.1658, + "step": 7267 + }, + { + "epoch": 0.637343816154195, + "grad_norm": 0.08056640625, + "learning_rate": 0.0012226816494351413, + "loss": 1.164, + "step": 7268 + }, + { + "epoch": 0.6374315079285696, + "grad_norm": 0.05078125, + "learning_rate": 0.001222294946080924, + "loss": 1.1382, + "step": 7269 + }, + { + "epoch": 0.6375191997029441, + "grad_norm": 0.06201171875, + "learning_rate": 0.0012219082817323042, + "loss": 1.1505, + "step": 7270 + }, + { + "epoch": 0.6376068914773186, + "grad_norm": 0.07275390625, + "learning_rate": 0.0012215216564245438, + "loss": 1.1261, + "step": 7271 + }, + { + "epoch": 0.6376945832516933, + "grad_norm": 0.0732421875, + "learning_rate": 0.0012211350701929024, + "loss": 1.2135, + "step": 7272 + }, + { + "epoch": 0.6377822750260678, + "grad_norm": 0.07763671875, + "learning_rate": 0.0012207485230726351, + "loss": 1.1571, + "step": 7273 + }, + { + "epoch": 0.6378699668004423, + "grad_norm": 0.07275390625, + "learning_rate": 0.001220362015098995, + "loss": 1.11, + "step": 7274 + }, + { + "epoch": 0.6379576585748168, + "grad_norm": 0.072265625, + "learning_rate": 0.0012199755463072292, + "loss": 1.1104, + "step": 7275 + }, + { + "epoch": 0.6380453503491914, + "grad_norm": 0.048095703125, + "learning_rate": 0.0012195891167325844, + "loss": 1.1428, + "step": 7276 + }, + { + "epoch": 0.6381330421235659, + "grad_norm": 0.083984375, + "learning_rate": 0.0012192027264103001, + "loss": 1.1711, + "step": 7277 + }, + { + "epoch": 0.6382207338979404, + "grad_norm": 0.07177734375, + "learning_rate": 0.0012188163753756157, + "loss": 1.145, + "step": 7278 + }, + { + "epoch": 0.638308425672315, + "grad_norm": 0.07080078125, + "learning_rate": 0.001218430063663765, + "loss": 1.18, + "step": 7279 + }, + { + "epoch": 0.6383961174466896, + "grad_norm": 0.07568359375, + "learning_rate": 0.001218043791309978, + "loss": 1.1248, + "step": 7280 + }, + { + "epoch": 0.6384838092210641, + "grad_norm": 0.09521484375, + "learning_rate": 0.0012176575583494821, + "loss": 1.1487, + "step": 7281 + }, + { + "epoch": 0.6385715009954387, + "grad_norm": 0.076171875, + "learning_rate": 0.0012172713648175007, + "loss": 1.1715, + "step": 7282 + }, + { + "epoch": 0.6386591927698132, + "grad_norm": 0.06640625, + "learning_rate": 0.0012168852107492538, + "loss": 1.1345, + "step": 7283 + }, + { + "epoch": 0.6387468845441877, + "grad_norm": 0.08349609375, + "learning_rate": 0.0012164990961799579, + "loss": 1.1468, + "step": 7284 + }, + { + "epoch": 0.6388345763185623, + "grad_norm": 0.046142578125, + "learning_rate": 0.001216113021144825, + "loss": 1.2466, + "step": 7285 + }, + { + "epoch": 0.6389222680929368, + "grad_norm": 0.064453125, + "learning_rate": 0.0012157269856790647, + "loss": 1.1641, + "step": 7286 + }, + { + "epoch": 0.6390099598673114, + "grad_norm": 0.051513671875, + "learning_rate": 0.0012153409898178823, + "loss": 1.1354, + "step": 7287 + }, + { + "epoch": 0.639097651641686, + "grad_norm": 0.0517578125, + "learning_rate": 0.001214955033596479, + "loss": 1.1205, + "step": 7288 + }, + { + "epoch": 0.6391853434160605, + "grad_norm": 0.048583984375, + "learning_rate": 0.001214569117050054, + "loss": 1.1036, + "step": 7289 + }, + { + "epoch": 0.639273035190435, + "grad_norm": 0.04736328125, + "learning_rate": 0.0012141832402138017, + "loss": 1.133, + "step": 7290 + }, + { + "epoch": 0.6393607269648096, + "grad_norm": 0.04931640625, + "learning_rate": 0.0012137974031229126, + "loss": 1.1065, + "step": 7291 + }, + { + "epoch": 0.6394484187391841, + "grad_norm": 0.078125, + "learning_rate": 0.001213411605812574, + "loss": 1.119, + "step": 7292 + }, + { + "epoch": 0.6395361105135586, + "grad_norm": 0.054931640625, + "learning_rate": 0.0012130258483179703, + "loss": 1.1227, + "step": 7293 + }, + { + "epoch": 0.6396238022879333, + "grad_norm": 0.04443359375, + "learning_rate": 0.001212640130674281, + "loss": 1.1007, + "step": 7294 + }, + { + "epoch": 0.6397114940623078, + "grad_norm": 0.046875, + "learning_rate": 0.0012122544529166832, + "loss": 1.159, + "step": 7295 + }, + { + "epoch": 0.6397991858366823, + "grad_norm": 0.04736328125, + "learning_rate": 0.0012118688150803492, + "loss": 1.1491, + "step": 7296 + }, + { + "epoch": 0.6398868776110568, + "grad_norm": 0.060302734375, + "learning_rate": 0.0012114832172004477, + "loss": 1.1251, + "step": 7297 + }, + { + "epoch": 0.6399745693854314, + "grad_norm": 0.05126953125, + "learning_rate": 0.0012110976593121448, + "loss": 1.1067, + "step": 7298 + }, + { + "epoch": 0.6400622611598059, + "grad_norm": 0.04638671875, + "learning_rate": 0.0012107121414506024, + "loss": 1.1571, + "step": 7299 + }, + { + "epoch": 0.6401499529341804, + "grad_norm": 0.0478515625, + "learning_rate": 0.0012103266636509789, + "loss": 1.0881, + "step": 7300 + }, + { + "epoch": 0.640237644708555, + "grad_norm": 0.047119140625, + "learning_rate": 0.0012099412259484288, + "loss": 1.0955, + "step": 7301 + }, + { + "epoch": 0.6403253364829296, + "grad_norm": 0.05078125, + "learning_rate": 0.0012095558283781022, + "loss": 1.1745, + "step": 7302 + }, + { + "epoch": 0.6404130282573041, + "grad_norm": 0.046875, + "learning_rate": 0.0012091704709751479, + "loss": 1.1186, + "step": 7303 + }, + { + "epoch": 0.6405007200316787, + "grad_norm": 0.0634765625, + "learning_rate": 0.001208785153774708, + "loss": 1.0849, + "step": 7304 + }, + { + "epoch": 0.6405884118060532, + "grad_norm": 0.06298828125, + "learning_rate": 0.0012083998768119234, + "loss": 1.1483, + "step": 7305 + }, + { + "epoch": 0.6406761035804277, + "grad_norm": 0.0625, + "learning_rate": 0.00120801464012193, + "loss": 1.1379, + "step": 7306 + }, + { + "epoch": 0.6407637953548023, + "grad_norm": 0.078125, + "learning_rate": 0.0012076294437398606, + "loss": 1.1658, + "step": 7307 + }, + { + "epoch": 0.6408514871291768, + "grad_norm": 0.05029296875, + "learning_rate": 0.0012072442877008435, + "loss": 1.1676, + "step": 7308 + }, + { + "epoch": 0.6409391789035513, + "grad_norm": 0.06689453125, + "learning_rate": 0.0012068591720400049, + "loss": 1.173, + "step": 7309 + }, + { + "epoch": 0.641026870677926, + "grad_norm": 0.0595703125, + "learning_rate": 0.0012064740967924655, + "loss": 1.1585, + "step": 7310 + }, + { + "epoch": 0.6411145624523005, + "grad_norm": 0.05712890625, + "learning_rate": 0.0012060890619933437, + "loss": 1.1952, + "step": 7311 + }, + { + "epoch": 0.641202254226675, + "grad_norm": 0.06982421875, + "learning_rate": 0.0012057040676777537, + "loss": 1.1012, + "step": 7312 + }, + { + "epoch": 0.6412899460010496, + "grad_norm": 0.051025390625, + "learning_rate": 0.0012053191138808057, + "loss": 1.1457, + "step": 7313 + }, + { + "epoch": 0.6413776377754241, + "grad_norm": 0.04638671875, + "learning_rate": 0.0012049342006376065, + "loss": 1.1382, + "step": 7314 + }, + { + "epoch": 0.6414653295497986, + "grad_norm": 0.051513671875, + "learning_rate": 0.001204549327983259, + "loss": 1.1487, + "step": 7315 + }, + { + "epoch": 0.6415530213241732, + "grad_norm": 0.052978515625, + "learning_rate": 0.001204164495952863, + "loss": 1.1955, + "step": 7316 + }, + { + "epoch": 0.6416407130985478, + "grad_norm": 0.051025390625, + "learning_rate": 0.0012037797045815139, + "loss": 1.1475, + "step": 7317 + }, + { + "epoch": 0.6417284048729223, + "grad_norm": 0.0859375, + "learning_rate": 0.0012033949539043045, + "loss": 1.1787, + "step": 7318 + }, + { + "epoch": 0.6418160966472969, + "grad_norm": 0.0634765625, + "learning_rate": 0.0012030102439563217, + "loss": 1.1177, + "step": 7319 + }, + { + "epoch": 0.6419037884216714, + "grad_norm": 0.08740234375, + "learning_rate": 0.001202625574772651, + "loss": 1.1268, + "step": 7320 + }, + { + "epoch": 0.6419914801960459, + "grad_norm": 0.091796875, + "learning_rate": 0.0012022409463883729, + "loss": 1.1627, + "step": 7321 + }, + { + "epoch": 0.6420791719704204, + "grad_norm": 0.05810546875, + "learning_rate": 0.0012018563588385648, + "loss": 1.136, + "step": 7322 + }, + { + "epoch": 0.642166863744795, + "grad_norm": 0.07275390625, + "learning_rate": 0.0012014718121582996, + "loss": 1.1379, + "step": 7323 + }, + { + "epoch": 0.6422545555191695, + "grad_norm": 0.08056640625, + "learning_rate": 0.001201087306382647, + "loss": 1.1718, + "step": 7324 + }, + { + "epoch": 0.642342247293544, + "grad_norm": 0.061767578125, + "learning_rate": 0.001200702841546674, + "loss": 1.0806, + "step": 7325 + }, + { + "epoch": 0.6424299390679187, + "grad_norm": 0.05419921875, + "learning_rate": 0.0012003184176854415, + "loss": 1.1306, + "step": 7326 + }, + { + "epoch": 0.6425176308422932, + "grad_norm": 0.0986328125, + "learning_rate": 0.0011999340348340084, + "loss": 1.1772, + "step": 7327 + }, + { + "epoch": 0.6426053226166677, + "grad_norm": 0.056396484375, + "learning_rate": 0.0011995496930274298, + "loss": 1.0925, + "step": 7328 + }, + { + "epoch": 0.6426930143910423, + "grad_norm": 0.0498046875, + "learning_rate": 0.0011991653923007564, + "loss": 1.1413, + "step": 7329 + }, + { + "epoch": 0.6427807061654168, + "grad_norm": 0.0458984375, + "learning_rate": 0.0011987811326890356, + "loss": 1.1961, + "step": 7330 + }, + { + "epoch": 0.6428683979397913, + "grad_norm": 0.056640625, + "learning_rate": 0.0011983969142273102, + "loss": 1.1505, + "step": 7331 + }, + { + "epoch": 0.642956089714166, + "grad_norm": 0.046875, + "learning_rate": 0.0011980127369506206, + "loss": 1.1326, + "step": 7332 + }, + { + "epoch": 0.6430437814885405, + "grad_norm": 0.05078125, + "learning_rate": 0.0011976286008940022, + "loss": 1.129, + "step": 7333 + }, + { + "epoch": 0.643131473262915, + "grad_norm": 0.052001953125, + "learning_rate": 0.0011972445060924885, + "loss": 1.147, + "step": 7334 + }, + { + "epoch": 0.6432191650372896, + "grad_norm": 0.049072265625, + "learning_rate": 0.0011968604525811062, + "loss": 1.1447, + "step": 7335 + }, + { + "epoch": 0.6433068568116641, + "grad_norm": 0.048828125, + "learning_rate": 0.001196476440394881, + "loss": 1.1126, + "step": 7336 + }, + { + "epoch": 0.6433945485860386, + "grad_norm": 0.047607421875, + "learning_rate": 0.0011960924695688338, + "loss": 1.1944, + "step": 7337 + }, + { + "epoch": 0.6434822403604132, + "grad_norm": 0.044921875, + "learning_rate": 0.0011957085401379816, + "loss": 1.1263, + "step": 7338 + }, + { + "epoch": 0.6435699321347877, + "grad_norm": 0.048095703125, + "learning_rate": 0.001195324652137338, + "loss": 1.1204, + "step": 7339 + }, + { + "epoch": 0.6436576239091623, + "grad_norm": 0.059814453125, + "learning_rate": 0.0011949408056019125, + "loss": 1.0916, + "step": 7340 + }, + { + "epoch": 0.6437453156835369, + "grad_norm": 0.050537109375, + "learning_rate": 0.00119455700056671, + "loss": 1.0883, + "step": 7341 + }, + { + "epoch": 0.6438330074579114, + "grad_norm": 0.0458984375, + "learning_rate": 0.0011941732370667338, + "loss": 1.1117, + "step": 7342 + }, + { + "epoch": 0.6439206992322859, + "grad_norm": 0.056396484375, + "learning_rate": 0.0011937895151369817, + "loss": 1.197, + "step": 7343 + }, + { + "epoch": 0.6440083910066605, + "grad_norm": 0.054443359375, + "learning_rate": 0.0011934058348124484, + "loss": 1.1885, + "step": 7344 + }, + { + "epoch": 0.644096082781035, + "grad_norm": 0.07080078125, + "learning_rate": 0.0011930221961281241, + "loss": 1.1365, + "step": 7345 + }, + { + "epoch": 0.6441837745554095, + "grad_norm": 0.04541015625, + "learning_rate": 0.001192638599118996, + "loss": 1.1672, + "step": 7346 + }, + { + "epoch": 0.644271466329784, + "grad_norm": 0.06396484375, + "learning_rate": 0.0011922550438200465, + "loss": 1.1877, + "step": 7347 + }, + { + "epoch": 0.6443591581041587, + "grad_norm": 0.045654296875, + "learning_rate": 0.0011918715302662555, + "loss": 1.179, + "step": 7348 + }, + { + "epoch": 0.6444468498785332, + "grad_norm": 0.059814453125, + "learning_rate": 0.0011914880584925984, + "loss": 1.1783, + "step": 7349 + }, + { + "epoch": 0.6445345416529077, + "grad_norm": 0.048583984375, + "learning_rate": 0.0011911046285340468, + "loss": 1.1803, + "step": 7350 + }, + { + "epoch": 0.6446222334272823, + "grad_norm": 0.08203125, + "learning_rate": 0.0011907212404255682, + "loss": 1.1334, + "step": 7351 + }, + { + "epoch": 0.6447099252016568, + "grad_norm": 0.048583984375, + "learning_rate": 0.001190337894202127, + "loss": 1.0958, + "step": 7352 + }, + { + "epoch": 0.6447976169760313, + "grad_norm": 0.05126953125, + "learning_rate": 0.001189954589898683, + "loss": 1.1119, + "step": 7353 + }, + { + "epoch": 0.644885308750406, + "grad_norm": 0.047119140625, + "learning_rate": 0.001189571327550193, + "loss": 1.177, + "step": 7354 + }, + { + "epoch": 0.6449730005247805, + "grad_norm": 0.0771484375, + "learning_rate": 0.0011891881071916092, + "loss": 1.1989, + "step": 7355 + }, + { + "epoch": 0.645060692299155, + "grad_norm": 0.0654296875, + "learning_rate": 0.001188804928857881, + "loss": 1.1495, + "step": 7356 + }, + { + "epoch": 0.6451483840735296, + "grad_norm": 0.050048828125, + "learning_rate": 0.0011884217925839519, + "loss": 1.1861, + "step": 7357 + }, + { + "epoch": 0.6452360758479041, + "grad_norm": 0.04833984375, + "learning_rate": 0.0011880386984047643, + "loss": 1.1457, + "step": 7358 + }, + { + "epoch": 0.6453237676222786, + "grad_norm": 0.047119140625, + "learning_rate": 0.0011876556463552544, + "loss": 1.1559, + "step": 7359 + }, + { + "epoch": 0.6454114593966532, + "grad_norm": 0.052734375, + "learning_rate": 0.0011872726364703567, + "loss": 1.1569, + "step": 7360 + }, + { + "epoch": 0.6454991511710277, + "grad_norm": 0.06591796875, + "learning_rate": 0.0011868896687849994, + "loss": 1.1296, + "step": 7361 + }, + { + "epoch": 0.6455868429454023, + "grad_norm": 0.046142578125, + "learning_rate": 0.0011865067433341093, + "loss": 1.1266, + "step": 7362 + }, + { + "epoch": 0.6456745347197769, + "grad_norm": 0.048828125, + "learning_rate": 0.001186123860152608, + "loss": 1.176, + "step": 7363 + }, + { + "epoch": 0.6457622264941514, + "grad_norm": 0.04736328125, + "learning_rate": 0.0011857410192754124, + "loss": 1.1469, + "step": 7364 + }, + { + "epoch": 0.6458499182685259, + "grad_norm": 0.04736328125, + "learning_rate": 0.0011853582207374375, + "loss": 1.1658, + "step": 7365 + }, + { + "epoch": 0.6459376100429005, + "grad_norm": 0.04638671875, + "learning_rate": 0.0011849754645735936, + "loss": 1.1004, + "step": 7366 + }, + { + "epoch": 0.646025301817275, + "grad_norm": 0.06591796875, + "learning_rate": 0.0011845927508187869, + "loss": 1.1537, + "step": 7367 + }, + { + "epoch": 0.6461129935916495, + "grad_norm": 0.057373046875, + "learning_rate": 0.0011842100795079196, + "loss": 1.1099, + "step": 7368 + }, + { + "epoch": 0.646200685366024, + "grad_norm": 0.050048828125, + "learning_rate": 0.0011838274506758906, + "loss": 1.1873, + "step": 7369 + }, + { + "epoch": 0.6462883771403987, + "grad_norm": 0.055419921875, + "learning_rate": 0.001183444864357595, + "loss": 1.1034, + "step": 7370 + }, + { + "epoch": 0.6463760689147732, + "grad_norm": 0.07861328125, + "learning_rate": 0.001183062320587923, + "loss": 1.0943, + "step": 7371 + }, + { + "epoch": 0.6464637606891477, + "grad_norm": 0.09521484375, + "learning_rate": 0.0011826798194017622, + "loss": 1.1303, + "step": 7372 + }, + { + "epoch": 0.6465514524635223, + "grad_norm": 0.0478515625, + "learning_rate": 0.0011822973608339956, + "loss": 1.1401, + "step": 7373 + }, + { + "epoch": 0.6466391442378968, + "grad_norm": 0.08349609375, + "learning_rate": 0.001181914944919502, + "loss": 1.1241, + "step": 7374 + }, + { + "epoch": 0.6467268360122713, + "grad_norm": 0.0966796875, + "learning_rate": 0.0011815325716931573, + "loss": 1.0903, + "step": 7375 + }, + { + "epoch": 0.6468145277866459, + "grad_norm": 0.07373046875, + "learning_rate": 0.001181150241189833, + "loss": 1.1764, + "step": 7376 + }, + { + "epoch": 0.6469022195610205, + "grad_norm": 0.080078125, + "learning_rate": 0.0011807679534443956, + "loss": 1.1862, + "step": 7377 + }, + { + "epoch": 0.646989911335395, + "grad_norm": 0.047607421875, + "learning_rate": 0.0011803857084917101, + "loss": 1.1054, + "step": 7378 + }, + { + "epoch": 0.6470776031097696, + "grad_norm": 0.06591796875, + "learning_rate": 0.0011800035063666353, + "loss": 1.1412, + "step": 7379 + }, + { + "epoch": 0.6471652948841441, + "grad_norm": 0.048095703125, + "learning_rate": 0.001179621347104028, + "loss": 1.1472, + "step": 7380 + }, + { + "epoch": 0.6472529866585186, + "grad_norm": 0.04931640625, + "learning_rate": 0.0011792392307387396, + "loss": 1.1687, + "step": 7381 + }, + { + "epoch": 0.6473406784328932, + "grad_norm": 0.0458984375, + "learning_rate": 0.001178857157305618, + "loss": 1.1452, + "step": 7382 + }, + { + "epoch": 0.6474283702072677, + "grad_norm": 0.04931640625, + "learning_rate": 0.0011784751268395068, + "loss": 1.1099, + "step": 7383 + }, + { + "epoch": 0.6475160619816422, + "grad_norm": 0.057861328125, + "learning_rate": 0.0011780931393752473, + "loss": 1.1205, + "step": 7384 + }, + { + "epoch": 0.6476037537560169, + "grad_norm": 0.04345703125, + "learning_rate": 0.001177711194947675, + "loss": 1.0971, + "step": 7385 + }, + { + "epoch": 0.6476914455303914, + "grad_norm": 0.055908203125, + "learning_rate": 0.0011773292935916228, + "loss": 1.1128, + "step": 7386 + }, + { + "epoch": 0.6477791373047659, + "grad_norm": 0.06640625, + "learning_rate": 0.0011769474353419188, + "loss": 1.1382, + "step": 7387 + }, + { + "epoch": 0.6478668290791405, + "grad_norm": 0.083984375, + "learning_rate": 0.001176565620233387, + "loss": 1.1079, + "step": 7388 + }, + { + "epoch": 0.647954520853515, + "grad_norm": 0.08984375, + "learning_rate": 0.001176183848300849, + "loss": 1.1177, + "step": 7389 + }, + { + "epoch": 0.6480422126278895, + "grad_norm": 0.06494140625, + "learning_rate": 0.0011758021195791206, + "loss": 1.1391, + "step": 7390 + }, + { + "epoch": 0.6481299044022641, + "grad_norm": 0.04443359375, + "learning_rate": 0.0011754204341030152, + "loss": 1.1471, + "step": 7391 + }, + { + "epoch": 0.6482175961766387, + "grad_norm": 0.1103515625, + "learning_rate": 0.0011750387919073405, + "loss": 1.1572, + "step": 7392 + }, + { + "epoch": 0.6483052879510132, + "grad_norm": 0.0859375, + "learning_rate": 0.0011746571930269024, + "loss": 1.1009, + "step": 7393 + }, + { + "epoch": 0.6483929797253877, + "grad_norm": 0.044921875, + "learning_rate": 0.0011742756374965007, + "loss": 1.1104, + "step": 7394 + }, + { + "epoch": 0.6484806714997623, + "grad_norm": 0.049072265625, + "learning_rate": 0.0011738941253509334, + "loss": 1.1098, + "step": 7395 + }, + { + "epoch": 0.6485683632741368, + "grad_norm": 0.06787109375, + "learning_rate": 0.0011735126566249927, + "loss": 1.1253, + "step": 7396 + }, + { + "epoch": 0.6486560550485113, + "grad_norm": 0.07666015625, + "learning_rate": 0.0011731312313534674, + "loss": 1.1893, + "step": 7397 + }, + { + "epoch": 0.6487437468228859, + "grad_norm": 0.046630859375, + "learning_rate": 0.0011727498495711427, + "loss": 1.1991, + "step": 7398 + }, + { + "epoch": 0.6488314385972604, + "grad_norm": 0.055908203125, + "learning_rate": 0.0011723685113127998, + "loss": 1.1673, + "step": 7399 + }, + { + "epoch": 0.648919130371635, + "grad_norm": 0.05322265625, + "learning_rate": 0.0011719872166132156, + "loss": 1.1671, + "step": 7400 + }, + { + "epoch": 0.6490068221460096, + "grad_norm": 0.048095703125, + "learning_rate": 0.0011716059655071634, + "loss": 1.1135, + "step": 7401 + }, + { + "epoch": 0.6490945139203841, + "grad_norm": 0.045654296875, + "learning_rate": 0.0011712247580294122, + "loss": 1.1561, + "step": 7402 + }, + { + "epoch": 0.6491822056947586, + "grad_norm": 0.055419921875, + "learning_rate": 0.001170843594214727, + "loss": 1.1035, + "step": 7403 + }, + { + "epoch": 0.6492698974691332, + "grad_norm": 0.052490234375, + "learning_rate": 0.0011704624740978694, + "loss": 1.1649, + "step": 7404 + }, + { + "epoch": 0.6493575892435077, + "grad_norm": 0.049560546875, + "learning_rate": 0.0011700813977135955, + "loss": 1.1874, + "step": 7405 + }, + { + "epoch": 0.6494452810178822, + "grad_norm": 0.04541015625, + "learning_rate": 0.0011697003650966596, + "loss": 1.1339, + "step": 7406 + }, + { + "epoch": 0.6495329727922569, + "grad_norm": 0.050537109375, + "learning_rate": 0.0011693193762818109, + "loss": 1.0972, + "step": 7407 + }, + { + "epoch": 0.6496206645666314, + "grad_norm": 0.047119140625, + "learning_rate": 0.001168938431303794, + "loss": 1.1548, + "step": 7408 + }, + { + "epoch": 0.6497083563410059, + "grad_norm": 0.053466796875, + "learning_rate": 0.00116855753019735, + "loss": 1.1354, + "step": 7409 + }, + { + "epoch": 0.6497960481153805, + "grad_norm": 0.04931640625, + "learning_rate": 0.0011681766729972165, + "loss": 1.1527, + "step": 7410 + }, + { + "epoch": 0.649883739889755, + "grad_norm": 0.052490234375, + "learning_rate": 0.001167795859738127, + "loss": 1.1623, + "step": 7411 + }, + { + "epoch": 0.6499714316641295, + "grad_norm": 0.051513671875, + "learning_rate": 0.0011674150904548099, + "loss": 1.1526, + "step": 7412 + }, + { + "epoch": 0.6500591234385041, + "grad_norm": 0.058837890625, + "learning_rate": 0.0011670343651819912, + "loss": 1.1589, + "step": 7413 + }, + { + "epoch": 0.6501468152128786, + "grad_norm": 0.052001953125, + "learning_rate": 0.0011666536839543913, + "loss": 1.1982, + "step": 7414 + }, + { + "epoch": 0.6502345069872532, + "grad_norm": 0.050537109375, + "learning_rate": 0.0011662730468067271, + "loss": 1.1712, + "step": 7415 + }, + { + "epoch": 0.6503221987616278, + "grad_norm": 0.050537109375, + "learning_rate": 0.0011658924537737132, + "loss": 1.1435, + "step": 7416 + }, + { + "epoch": 0.6504098905360023, + "grad_norm": 0.04931640625, + "learning_rate": 0.0011655119048900574, + "loss": 1.1254, + "step": 7417 + }, + { + "epoch": 0.6504975823103768, + "grad_norm": 0.0693359375, + "learning_rate": 0.0011651314001904654, + "loss": 1.1376, + "step": 7418 + }, + { + "epoch": 0.6505852740847513, + "grad_norm": 0.05615234375, + "learning_rate": 0.0011647509397096378, + "loss": 1.1826, + "step": 7419 + }, + { + "epoch": 0.6506729658591259, + "grad_norm": 0.068359375, + "learning_rate": 0.001164370523482272, + "loss": 1.1737, + "step": 7420 + }, + { + "epoch": 0.6507606576335004, + "grad_norm": 0.04931640625, + "learning_rate": 0.0011639901515430604, + "loss": 1.1658, + "step": 7421 + }, + { + "epoch": 0.650848349407875, + "grad_norm": 0.05126953125, + "learning_rate": 0.0011636098239266928, + "loss": 1.1536, + "step": 7422 + }, + { + "epoch": 0.6509360411822496, + "grad_norm": 0.0517578125, + "learning_rate": 0.0011632295406678534, + "loss": 1.11, + "step": 7423 + }, + { + "epoch": 0.6510237329566241, + "grad_norm": 0.046875, + "learning_rate": 0.0011628493018012236, + "loss": 1.1376, + "step": 7424 + }, + { + "epoch": 0.6511114247309986, + "grad_norm": 0.048583984375, + "learning_rate": 0.0011624691073614797, + "loss": 1.1156, + "step": 7425 + }, + { + "epoch": 0.6511991165053732, + "grad_norm": 0.054931640625, + "learning_rate": 0.001162088957383295, + "loss": 1.168, + "step": 7426 + }, + { + "epoch": 0.6512868082797477, + "grad_norm": 0.04541015625, + "learning_rate": 0.0011617088519013368, + "loss": 1.1781, + "step": 7427 + }, + { + "epoch": 0.6513745000541222, + "grad_norm": 0.0556640625, + "learning_rate": 0.001161328790950272, + "loss": 1.117, + "step": 7428 + }, + { + "epoch": 0.6514621918284969, + "grad_norm": 0.07958984375, + "learning_rate": 0.001160948774564759, + "loss": 1.1634, + "step": 7429 + }, + { + "epoch": 0.6515498836028714, + "grad_norm": 0.046142578125, + "learning_rate": 0.0011605688027794563, + "loss": 1.1364, + "step": 7430 + }, + { + "epoch": 0.6516375753772459, + "grad_norm": 0.04736328125, + "learning_rate": 0.0011601888756290152, + "loss": 1.1498, + "step": 7431 + }, + { + "epoch": 0.6517252671516205, + "grad_norm": 0.0771484375, + "learning_rate": 0.0011598089931480833, + "loss": 1.2154, + "step": 7432 + }, + { + "epoch": 0.651812958925995, + "grad_norm": 0.08837890625, + "learning_rate": 0.0011594291553713065, + "loss": 1.1365, + "step": 7433 + }, + { + "epoch": 0.6519006507003695, + "grad_norm": 0.04541015625, + "learning_rate": 0.0011590493623333238, + "loss": 1.086, + "step": 7434 + }, + { + "epoch": 0.6519883424747441, + "grad_norm": 0.05029296875, + "learning_rate": 0.0011586696140687722, + "loss": 1.1691, + "step": 7435 + }, + { + "epoch": 0.6520760342491186, + "grad_norm": 0.0673828125, + "learning_rate": 0.0011582899106122833, + "loss": 1.1705, + "step": 7436 + }, + { + "epoch": 0.6521637260234932, + "grad_norm": 0.05224609375, + "learning_rate": 0.0011579102519984855, + "loss": 1.1443, + "step": 7437 + }, + { + "epoch": 0.6522514177978678, + "grad_norm": 0.048095703125, + "learning_rate": 0.0011575306382620022, + "loss": 1.1842, + "step": 7438 + }, + { + "epoch": 0.6523391095722423, + "grad_norm": 0.055908203125, + "learning_rate": 0.0011571510694374533, + "loss": 1.1573, + "step": 7439 + }, + { + "epoch": 0.6524268013466168, + "grad_norm": 0.08251953125, + "learning_rate": 0.0011567715455594548, + "loss": 1.1478, + "step": 7440 + }, + { + "epoch": 0.6525144931209913, + "grad_norm": 0.052734375, + "learning_rate": 0.001156392066662618, + "loss": 1.1261, + "step": 7441 + }, + { + "epoch": 0.6526021848953659, + "grad_norm": 0.07763671875, + "learning_rate": 0.0011560126327815503, + "loss": 1.1636, + "step": 7442 + }, + { + "epoch": 0.6526898766697404, + "grad_norm": 0.072265625, + "learning_rate": 0.0011556332439508555, + "loss": 1.1806, + "step": 7443 + }, + { + "epoch": 0.6527775684441149, + "grad_norm": 0.0732421875, + "learning_rate": 0.0011552539002051326, + "loss": 1.0957, + "step": 7444 + }, + { + "epoch": 0.6528652602184896, + "grad_norm": 0.04443359375, + "learning_rate": 0.0011548746015789767, + "loss": 1.1387, + "step": 7445 + }, + { + "epoch": 0.6529529519928641, + "grad_norm": 0.06982421875, + "learning_rate": 0.0011544953481069794, + "loss": 1.1024, + "step": 7446 + }, + { + "epoch": 0.6530406437672386, + "grad_norm": 0.0537109375, + "learning_rate": 0.001154116139823727, + "loss": 1.1576, + "step": 7447 + }, + { + "epoch": 0.6531283355416132, + "grad_norm": 0.0439453125, + "learning_rate": 0.0011537369767638025, + "loss": 1.1527, + "step": 7448 + }, + { + "epoch": 0.6532160273159877, + "grad_norm": 0.05224609375, + "learning_rate": 0.0011533578589617845, + "loss": 1.1353, + "step": 7449 + }, + { + "epoch": 0.6533037190903622, + "grad_norm": 0.052001953125, + "learning_rate": 0.0011529787864522475, + "loss": 1.1612, + "step": 7450 + }, + { + "epoch": 0.6533914108647368, + "grad_norm": 0.055908203125, + "learning_rate": 0.0011525997592697625, + "loss": 1.1254, + "step": 7451 + }, + { + "epoch": 0.6534791026391114, + "grad_norm": 0.042724609375, + "learning_rate": 0.001152220777448895, + "loss": 1.1283, + "step": 7452 + }, + { + "epoch": 0.6535667944134859, + "grad_norm": 0.0439453125, + "learning_rate": 0.001151841841024208, + "loss": 1.1263, + "step": 7453 + }, + { + "epoch": 0.6536544861878605, + "grad_norm": 0.046875, + "learning_rate": 0.001151462950030259, + "loss": 1.1161, + "step": 7454 + }, + { + "epoch": 0.653742177962235, + "grad_norm": 0.048583984375, + "learning_rate": 0.001151084104501602, + "loss": 1.1439, + "step": 7455 + }, + { + "epoch": 0.6538298697366095, + "grad_norm": 0.0458984375, + "learning_rate": 0.0011507053044727865, + "loss": 1.1595, + "step": 7456 + }, + { + "epoch": 0.6539175615109841, + "grad_norm": 0.05712890625, + "learning_rate": 0.0011503265499783588, + "loss": 1.2589, + "step": 7457 + }, + { + "epoch": 0.6540052532853586, + "grad_norm": 0.04345703125, + "learning_rate": 0.0011499478410528592, + "loss": 1.1445, + "step": 7458 + }, + { + "epoch": 0.6540929450597331, + "grad_norm": 0.04736328125, + "learning_rate": 0.001149569177730826, + "loss": 1.1383, + "step": 7459 + }, + { + "epoch": 0.6541806368341078, + "grad_norm": 0.057861328125, + "learning_rate": 0.001149190560046792, + "loss": 1.1329, + "step": 7460 + }, + { + "epoch": 0.6542683286084823, + "grad_norm": 0.0478515625, + "learning_rate": 0.0011488119880352858, + "loss": 1.1326, + "step": 7461 + }, + { + "epoch": 0.6543560203828568, + "grad_norm": 0.0517578125, + "learning_rate": 0.0011484334617308325, + "loss": 1.0822, + "step": 7462 + }, + { + "epoch": 0.6544437121572314, + "grad_norm": 0.08349609375, + "learning_rate": 0.001148054981167953, + "loss": 1.1518, + "step": 7463 + }, + { + "epoch": 0.6545314039316059, + "grad_norm": 0.04736328125, + "learning_rate": 0.0011476765463811634, + "loss": 1.0807, + "step": 7464 + }, + { + "epoch": 0.6546190957059804, + "grad_norm": 0.0595703125, + "learning_rate": 0.0011472981574049764, + "loss": 1.2032, + "step": 7465 + }, + { + "epoch": 0.6547067874803549, + "grad_norm": 0.06884765625, + "learning_rate": 0.001146919814273899, + "loss": 1.1183, + "step": 7466 + }, + { + "epoch": 0.6547944792547296, + "grad_norm": 0.049072265625, + "learning_rate": 0.0011465415170224362, + "loss": 1.1071, + "step": 7467 + }, + { + "epoch": 0.6548821710291041, + "grad_norm": 0.047607421875, + "learning_rate": 0.0011461632656850872, + "loss": 1.1514, + "step": 7468 + }, + { + "epoch": 0.6549698628034786, + "grad_norm": 0.06689453125, + "learning_rate": 0.0011457850602963479, + "loss": 1.1467, + "step": 7469 + }, + { + "epoch": 0.6550575545778532, + "grad_norm": 0.05859375, + "learning_rate": 0.0011454069008907093, + "loss": 1.0881, + "step": 7470 + }, + { + "epoch": 0.6551452463522277, + "grad_norm": 0.047119140625, + "learning_rate": 0.0011450287875026592, + "loss": 1.1201, + "step": 7471 + }, + { + "epoch": 0.6552329381266022, + "grad_norm": 0.0859375, + "learning_rate": 0.0011446507201666792, + "loss": 1.1515, + "step": 7472 + }, + { + "epoch": 0.6553206299009768, + "grad_norm": 0.052001953125, + "learning_rate": 0.0011442726989172498, + "loss": 1.1147, + "step": 7473 + }, + { + "epoch": 0.6554083216753513, + "grad_norm": 0.043701171875, + "learning_rate": 0.0011438947237888443, + "loss": 1.0955, + "step": 7474 + }, + { + "epoch": 0.6554960134497259, + "grad_norm": 0.053466796875, + "learning_rate": 0.0011435167948159336, + "loss": 1.1173, + "step": 7475 + }, + { + "epoch": 0.6555837052241005, + "grad_norm": 0.061767578125, + "learning_rate": 0.0011431389120329838, + "loss": 1.1149, + "step": 7476 + }, + { + "epoch": 0.655671396998475, + "grad_norm": 0.051513671875, + "learning_rate": 0.0011427610754744567, + "loss": 1.1912, + "step": 7477 + }, + { + "epoch": 0.6557590887728495, + "grad_norm": 0.045166015625, + "learning_rate": 0.0011423832851748098, + "loss": 1.0817, + "step": 7478 + }, + { + "epoch": 0.6558467805472241, + "grad_norm": 0.045166015625, + "learning_rate": 0.0011420055411684976, + "loss": 1.1004, + "step": 7479 + }, + { + "epoch": 0.6559344723215986, + "grad_norm": 0.055419921875, + "learning_rate": 0.0011416278434899679, + "loss": 1.0793, + "step": 7480 + }, + { + "epoch": 0.6560221640959731, + "grad_norm": 0.05224609375, + "learning_rate": 0.0011412501921736673, + "loss": 1.1459, + "step": 7481 + }, + { + "epoch": 0.6561098558703478, + "grad_norm": 0.048583984375, + "learning_rate": 0.0011408725872540354, + "loss": 1.1544, + "step": 7482 + }, + { + "epoch": 0.6561975476447223, + "grad_norm": 0.051025390625, + "learning_rate": 0.0011404950287655093, + "loss": 1.1389, + "step": 7483 + }, + { + "epoch": 0.6562852394190968, + "grad_norm": 0.0673828125, + "learning_rate": 0.0011401175167425216, + "loss": 1.1658, + "step": 7484 + }, + { + "epoch": 0.6563729311934714, + "grad_norm": 0.04833984375, + "learning_rate": 0.0011397400512194993, + "loss": 1.093, + "step": 7485 + }, + { + "epoch": 0.6564606229678459, + "grad_norm": 0.051513671875, + "learning_rate": 0.0011393626322308674, + "loss": 1.1317, + "step": 7486 + }, + { + "epoch": 0.6565483147422204, + "grad_norm": 0.0478515625, + "learning_rate": 0.0011389852598110454, + "loss": 1.0885, + "step": 7487 + }, + { + "epoch": 0.6566360065165949, + "grad_norm": 0.053955078125, + "learning_rate": 0.0011386079339944487, + "loss": 1.1275, + "step": 7488 + }, + { + "epoch": 0.6567236982909695, + "grad_norm": 0.051025390625, + "learning_rate": 0.0011382306548154883, + "loss": 1.053, + "step": 7489 + }, + { + "epoch": 0.6568113900653441, + "grad_norm": 0.08447265625, + "learning_rate": 0.0011378534223085706, + "loss": 1.1561, + "step": 7490 + }, + { + "epoch": 0.6568990818397186, + "grad_norm": 0.054443359375, + "learning_rate": 0.001137476236508099, + "loss": 1.147, + "step": 7491 + }, + { + "epoch": 0.6569867736140932, + "grad_norm": 0.0693359375, + "learning_rate": 0.001137099097448471, + "loss": 1.1732, + "step": 7492 + }, + { + "epoch": 0.6570744653884677, + "grad_norm": 0.0595703125, + "learning_rate": 0.0011367220051640822, + "loss": 1.1763, + "step": 7493 + }, + { + "epoch": 0.6571621571628422, + "grad_norm": 0.05126953125, + "learning_rate": 0.001136344959689321, + "loss": 1.1622, + "step": 7494 + }, + { + "epoch": 0.6572498489372168, + "grad_norm": 0.0537109375, + "learning_rate": 0.0011359679610585735, + "loss": 1.1408, + "step": 7495 + }, + { + "epoch": 0.6573375407115913, + "grad_norm": 0.050048828125, + "learning_rate": 0.001135591009306221, + "loss": 1.1771, + "step": 7496 + }, + { + "epoch": 0.6574252324859658, + "grad_norm": 0.0595703125, + "learning_rate": 0.0011352141044666405, + "loss": 1.0935, + "step": 7497 + }, + { + "epoch": 0.6575129242603405, + "grad_norm": 0.048583984375, + "learning_rate": 0.0011348372465742051, + "loss": 1.1131, + "step": 7498 + }, + { + "epoch": 0.657600616034715, + "grad_norm": 0.054443359375, + "learning_rate": 0.001134460435663283, + "loss": 1.1851, + "step": 7499 + }, + { + "epoch": 0.6576883078090895, + "grad_norm": 0.0859375, + "learning_rate": 0.0011340836717682377, + "loss": 1.1702, + "step": 7500 + }, + { + "epoch": 0.6576883078090895, + "eval_loss": 1.1537786722183228, + "eval_runtime": 428.7397, + "eval_samples_per_second": 33.696, + "eval_steps_per_second": 8.425, + "step": 7500 + }, + { + "epoch": 0.6577759995834641, + "grad_norm": 0.04541015625, + "learning_rate": 0.00113370695492343, + "loss": 1.1107, + "step": 7501 + }, + { + "epoch": 0.6578636913578386, + "grad_norm": 0.083984375, + "learning_rate": 0.001133330285163215, + "loss": 1.1474, + "step": 7502 + }, + { + "epoch": 0.6579513831322131, + "grad_norm": 0.046630859375, + "learning_rate": 0.0011329536625219441, + "loss": 1.1292, + "step": 7503 + }, + { + "epoch": 0.6580390749065878, + "grad_norm": 0.050048828125, + "learning_rate": 0.001132577087033965, + "loss": 1.1498, + "step": 7504 + }, + { + "epoch": 0.6581267666809623, + "grad_norm": 0.05078125, + "learning_rate": 0.0011322005587336196, + "loss": 1.1462, + "step": 7505 + }, + { + "epoch": 0.6582144584553368, + "grad_norm": 0.06201171875, + "learning_rate": 0.0011318240776552466, + "loss": 1.1104, + "step": 7506 + }, + { + "epoch": 0.6583021502297114, + "grad_norm": 0.05908203125, + "learning_rate": 0.0011314476438331797, + "loss": 1.1768, + "step": 7507 + }, + { + "epoch": 0.6583898420040859, + "grad_norm": 0.05859375, + "learning_rate": 0.0011310712573017496, + "loss": 1.1191, + "step": 7508 + }, + { + "epoch": 0.6584775337784604, + "grad_norm": 0.0625, + "learning_rate": 0.0011306949180952806, + "loss": 1.1077, + "step": 7509 + }, + { + "epoch": 0.658565225552835, + "grad_norm": 0.04736328125, + "learning_rate": 0.001130318626248095, + "loss": 1.1859, + "step": 7510 + }, + { + "epoch": 0.6586529173272095, + "grad_norm": 0.0478515625, + "learning_rate": 0.0011299423817945087, + "loss": 1.1059, + "step": 7511 + }, + { + "epoch": 0.658740609101584, + "grad_norm": 0.051513671875, + "learning_rate": 0.0011295661847688346, + "loss": 1.1432, + "step": 7512 + }, + { + "epoch": 0.6588283008759586, + "grad_norm": 0.04638671875, + "learning_rate": 0.0011291900352053815, + "loss": 1.1918, + "step": 7513 + }, + { + "epoch": 0.6589159926503332, + "grad_norm": 0.052490234375, + "learning_rate": 0.0011288139331384526, + "loss": 1.1305, + "step": 7514 + }, + { + "epoch": 0.6590036844247077, + "grad_norm": 0.052001953125, + "learning_rate": 0.0011284378786023477, + "loss": 1.1144, + "step": 7515 + }, + { + "epoch": 0.6590913761990822, + "grad_norm": 0.044189453125, + "learning_rate": 0.0011280618716313616, + "loss": 1.1225, + "step": 7516 + }, + { + "epoch": 0.6591790679734568, + "grad_norm": 0.0625, + "learning_rate": 0.0011276859122597853, + "loss": 1.1641, + "step": 7517 + }, + { + "epoch": 0.6592667597478313, + "grad_norm": 0.055419921875, + "learning_rate": 0.0011273100005219057, + "loss": 1.1089, + "step": 7518 + }, + { + "epoch": 0.6593544515222058, + "grad_norm": 0.045654296875, + "learning_rate": 0.0011269341364520047, + "loss": 1.1493, + "step": 7519 + }, + { + "epoch": 0.6594421432965805, + "grad_norm": 0.051513671875, + "learning_rate": 0.00112655832008436, + "loss": 1.1403, + "step": 7520 + }, + { + "epoch": 0.659529835070955, + "grad_norm": 0.061767578125, + "learning_rate": 0.0011261825514532454, + "loss": 1.1257, + "step": 7521 + }, + { + "epoch": 0.6596175268453295, + "grad_norm": 0.044921875, + "learning_rate": 0.00112580683059293, + "loss": 1.1204, + "step": 7522 + }, + { + "epoch": 0.6597052186197041, + "grad_norm": 0.04541015625, + "learning_rate": 0.0011254311575376779, + "loss": 1.1292, + "step": 7523 + }, + { + "epoch": 0.6597929103940786, + "grad_norm": 0.056396484375, + "learning_rate": 0.0011250555323217507, + "loss": 1.1125, + "step": 7524 + }, + { + "epoch": 0.6598806021684531, + "grad_norm": 0.07861328125, + "learning_rate": 0.0011246799549794032, + "loss": 1.156, + "step": 7525 + }, + { + "epoch": 0.6599682939428277, + "grad_norm": 0.052490234375, + "learning_rate": 0.0011243044255448884, + "loss": 1.1471, + "step": 7526 + }, + { + "epoch": 0.6600559857172023, + "grad_norm": 0.055419921875, + "learning_rate": 0.0011239289440524523, + "loss": 1.1821, + "step": 7527 + }, + { + "epoch": 0.6601436774915768, + "grad_norm": 0.072265625, + "learning_rate": 0.001123553510536339, + "loss": 1.1382, + "step": 7528 + }, + { + "epoch": 0.6602313692659514, + "grad_norm": 0.06298828125, + "learning_rate": 0.001123178125030786, + "loss": 1.1097, + "step": 7529 + }, + { + "epoch": 0.6603190610403259, + "grad_norm": 0.050048828125, + "learning_rate": 0.0011228027875700285, + "loss": 1.1387, + "step": 7530 + }, + { + "epoch": 0.6604067528147004, + "grad_norm": 0.06982421875, + "learning_rate": 0.0011224274981882954, + "loss": 1.1625, + "step": 7531 + }, + { + "epoch": 0.660494444589075, + "grad_norm": 0.09375, + "learning_rate": 0.0011220522569198135, + "loss": 1.1831, + "step": 7532 + }, + { + "epoch": 0.6605821363634495, + "grad_norm": 0.08544921875, + "learning_rate": 0.0011216770637988017, + "loss": 1.1244, + "step": 7533 + }, + { + "epoch": 0.660669828137824, + "grad_norm": 0.046875, + "learning_rate": 0.001121301918859478, + "loss": 1.1305, + "step": 7534 + }, + { + "epoch": 0.6607575199121987, + "grad_norm": 0.142578125, + "learning_rate": 0.001120926822136055, + "loss": 1.1568, + "step": 7535 + }, + { + "epoch": 0.6608452116865732, + "grad_norm": 0.1005859375, + "learning_rate": 0.0011205517736627398, + "loss": 1.1298, + "step": 7536 + }, + { + "epoch": 0.6609329034609477, + "grad_norm": 0.0546875, + "learning_rate": 0.0011201767734737359, + "loss": 1.1092, + "step": 7537 + }, + { + "epoch": 0.6610205952353222, + "grad_norm": 0.0869140625, + "learning_rate": 0.0011198018216032428, + "loss": 1.118, + "step": 7538 + }, + { + "epoch": 0.6611082870096968, + "grad_norm": 0.1171875, + "learning_rate": 0.0011194269180854546, + "loss": 1.0887, + "step": 7539 + }, + { + "epoch": 0.6611959787840713, + "grad_norm": 0.099609375, + "learning_rate": 0.0011190520629545619, + "loss": 1.1448, + "step": 7540 + }, + { + "epoch": 0.6612836705584458, + "grad_norm": 0.048583984375, + "learning_rate": 0.0011186772562447502, + "loss": 1.1428, + "step": 7541 + }, + { + "epoch": 0.6613713623328205, + "grad_norm": 0.09130859375, + "learning_rate": 0.0011183024979902014, + "loss": 1.1733, + "step": 7542 + }, + { + "epoch": 0.661459054107195, + "grad_norm": 0.10791015625, + "learning_rate": 0.001117927788225092, + "loss": 1.1467, + "step": 7543 + }, + { + "epoch": 0.6615467458815695, + "grad_norm": 0.109375, + "learning_rate": 0.0011175531269835952, + "loss": 1.2025, + "step": 7544 + }, + { + "epoch": 0.6616344376559441, + "grad_norm": 0.045166015625, + "learning_rate": 0.0011171785142998784, + "loss": 1.1294, + "step": 7545 + }, + { + "epoch": 0.6617221294303186, + "grad_norm": 0.08984375, + "learning_rate": 0.0011168039502081055, + "loss": 1.1198, + "step": 7546 + }, + { + "epoch": 0.6618098212046931, + "grad_norm": 0.130859375, + "learning_rate": 0.001116429434742436, + "loss": 1.1813, + "step": 7547 + }, + { + "epoch": 0.6618975129790677, + "grad_norm": 0.0869140625, + "learning_rate": 0.001116054967937025, + "loss": 1.1119, + "step": 7548 + }, + { + "epoch": 0.6619852047534422, + "grad_norm": 0.05419921875, + "learning_rate": 0.0011156805498260226, + "loss": 1.1233, + "step": 7549 + }, + { + "epoch": 0.6620728965278168, + "grad_norm": 0.053466796875, + "learning_rate": 0.0011153061804435743, + "loss": 1.1417, + "step": 7550 + }, + { + "epoch": 0.6621605883021914, + "grad_norm": 0.068359375, + "learning_rate": 0.0011149318598238226, + "loss": 1.1121, + "step": 7551 + }, + { + "epoch": 0.6622482800765659, + "grad_norm": 0.05615234375, + "learning_rate": 0.0011145575880009034, + "loss": 1.2055, + "step": 7552 + }, + { + "epoch": 0.6623359718509404, + "grad_norm": 0.048828125, + "learning_rate": 0.0011141833650089499, + "loss": 1.1156, + "step": 7553 + }, + { + "epoch": 0.662423663625315, + "grad_norm": 0.0498046875, + "learning_rate": 0.0011138091908820906, + "loss": 1.1669, + "step": 7554 + }, + { + "epoch": 0.6625113553996895, + "grad_norm": 0.0458984375, + "learning_rate": 0.001113435065654449, + "loss": 1.1295, + "step": 7555 + }, + { + "epoch": 0.662599047174064, + "grad_norm": 0.061279296875, + "learning_rate": 0.0011130609893601439, + "loss": 1.1014, + "step": 7556 + }, + { + "epoch": 0.6626867389484387, + "grad_norm": 0.050537109375, + "learning_rate": 0.0011126869620332908, + "loss": 1.1427, + "step": 7557 + }, + { + "epoch": 0.6627744307228132, + "grad_norm": 0.043212890625, + "learning_rate": 0.0011123129837079992, + "loss": 1.1361, + "step": 7558 + }, + { + "epoch": 0.6628621224971877, + "grad_norm": 0.05126953125, + "learning_rate": 0.001111939054418376, + "loss": 1.1818, + "step": 7559 + }, + { + "epoch": 0.6629498142715622, + "grad_norm": 0.06640625, + "learning_rate": 0.0011115651741985218, + "loss": 1.1468, + "step": 7560 + }, + { + "epoch": 0.6630375060459368, + "grad_norm": 0.0517578125, + "learning_rate": 0.001111191343082534, + "loss": 1.1611, + "step": 7561 + }, + { + "epoch": 0.6631251978203113, + "grad_norm": 0.05029296875, + "learning_rate": 0.0011108175611045045, + "loss": 1.1583, + "step": 7562 + }, + { + "epoch": 0.6632128895946858, + "grad_norm": 0.056884765625, + "learning_rate": 0.0011104438282985218, + "loss": 1.1376, + "step": 7563 + }, + { + "epoch": 0.6633005813690604, + "grad_norm": 0.062255859375, + "learning_rate": 0.001110070144698669, + "loss": 1.1165, + "step": 7564 + }, + { + "epoch": 0.663388273143435, + "grad_norm": 0.08251953125, + "learning_rate": 0.0011096965103390246, + "loss": 1.1535, + "step": 7565 + }, + { + "epoch": 0.6634759649178095, + "grad_norm": 0.044189453125, + "learning_rate": 0.0011093229252536647, + "loss": 1.1352, + "step": 7566 + }, + { + "epoch": 0.6635636566921841, + "grad_norm": 0.054443359375, + "learning_rate": 0.0011089493894766579, + "loss": 1.1082, + "step": 7567 + }, + { + "epoch": 0.6636513484665586, + "grad_norm": 0.06103515625, + "learning_rate": 0.0011085759030420698, + "loss": 1.1556, + "step": 7568 + }, + { + "epoch": 0.6637390402409331, + "grad_norm": 0.0615234375, + "learning_rate": 0.001108202465983962, + "loss": 1.1511, + "step": 7569 + }, + { + "epoch": 0.6638267320153077, + "grad_norm": 0.046142578125, + "learning_rate": 0.0011078290783363904, + "loss": 1.1481, + "step": 7570 + }, + { + "epoch": 0.6639144237896822, + "grad_norm": 0.048095703125, + "learning_rate": 0.0011074557401334073, + "loss": 1.152, + "step": 7571 + }, + { + "epoch": 0.6640021155640567, + "grad_norm": 0.0546875, + "learning_rate": 0.00110708245140906, + "loss": 1.1514, + "step": 7572 + }, + { + "epoch": 0.6640898073384314, + "grad_norm": 0.04541015625, + "learning_rate": 0.0011067092121973916, + "loss": 1.1632, + "step": 7573 + }, + { + "epoch": 0.6641774991128059, + "grad_norm": 0.042724609375, + "learning_rate": 0.0011063360225324406, + "loss": 1.0995, + "step": 7574 + }, + { + "epoch": 0.6642651908871804, + "grad_norm": 0.044921875, + "learning_rate": 0.001105962882448241, + "loss": 1.1576, + "step": 7575 + }, + { + "epoch": 0.664352882661555, + "grad_norm": 0.047119140625, + "learning_rate": 0.0011055897919788219, + "loss": 1.1013, + "step": 7576 + }, + { + "epoch": 0.6644405744359295, + "grad_norm": 0.049072265625, + "learning_rate": 0.0011052167511582088, + "loss": 1.2086, + "step": 7577 + }, + { + "epoch": 0.664528266210304, + "grad_norm": 0.04150390625, + "learning_rate": 0.0011048437600204213, + "loss": 1.0804, + "step": 7578 + }, + { + "epoch": 0.6646159579846787, + "grad_norm": 0.0517578125, + "learning_rate": 0.0011044708185994757, + "loss": 1.157, + "step": 7579 + }, + { + "epoch": 0.6647036497590532, + "grad_norm": 0.056884765625, + "learning_rate": 0.001104097926929383, + "loss": 1.1547, + "step": 7580 + }, + { + "epoch": 0.6647913415334277, + "grad_norm": 0.056640625, + "learning_rate": 0.0011037250850441502, + "loss": 1.1102, + "step": 7581 + }, + { + "epoch": 0.6648790333078023, + "grad_norm": 0.044921875, + "learning_rate": 0.0011033522929777793, + "loss": 1.1337, + "step": 7582 + }, + { + "epoch": 0.6649667250821768, + "grad_norm": 0.045654296875, + "learning_rate": 0.0011029795507642686, + "loss": 1.1331, + "step": 7583 + }, + { + "epoch": 0.6650544168565513, + "grad_norm": 0.046630859375, + "learning_rate": 0.0011026068584376106, + "loss": 1.0685, + "step": 7584 + }, + { + "epoch": 0.6651421086309258, + "grad_norm": 0.051513671875, + "learning_rate": 0.0011022342160317941, + "loss": 1.1808, + "step": 7585 + }, + { + "epoch": 0.6652298004053004, + "grad_norm": 0.06298828125, + "learning_rate": 0.0011018616235808029, + "loss": 1.147, + "step": 7586 + }, + { + "epoch": 0.665317492179675, + "grad_norm": 0.0458984375, + "learning_rate": 0.0011014890811186161, + "loss": 1.1395, + "step": 7587 + }, + { + "epoch": 0.6654051839540495, + "grad_norm": 0.05224609375, + "learning_rate": 0.0011011165886792098, + "loss": 1.1371, + "step": 7588 + }, + { + "epoch": 0.6654928757284241, + "grad_norm": 0.0732421875, + "learning_rate": 0.0011007441462965535, + "loss": 1.1849, + "step": 7589 + }, + { + "epoch": 0.6655805675027986, + "grad_norm": 0.060791015625, + "learning_rate": 0.001100371754004613, + "loss": 1.0766, + "step": 7590 + }, + { + "epoch": 0.6656682592771731, + "grad_norm": 0.048095703125, + "learning_rate": 0.0010999994118373495, + "loss": 1.1364, + "step": 7591 + }, + { + "epoch": 0.6657559510515477, + "grad_norm": 0.045654296875, + "learning_rate": 0.0010996271198287202, + "loss": 1.1228, + "step": 7592 + }, + { + "epoch": 0.6658436428259222, + "grad_norm": 0.07421875, + "learning_rate": 0.0010992548780126767, + "loss": 1.0905, + "step": 7593 + }, + { + "epoch": 0.6659313346002967, + "grad_norm": 0.05078125, + "learning_rate": 0.0010988826864231664, + "loss": 1.1159, + "step": 7594 + }, + { + "epoch": 0.6660190263746714, + "grad_norm": 0.057373046875, + "learning_rate": 0.0010985105450941328, + "loss": 1.1433, + "step": 7595 + }, + { + "epoch": 0.6661067181490459, + "grad_norm": 0.04736328125, + "learning_rate": 0.0010981384540595133, + "loss": 1.1362, + "step": 7596 + }, + { + "epoch": 0.6661944099234204, + "grad_norm": 0.049560546875, + "learning_rate": 0.0010977664133532425, + "loss": 1.1315, + "step": 7597 + }, + { + "epoch": 0.666282101697795, + "grad_norm": 0.052490234375, + "learning_rate": 0.0010973944230092492, + "loss": 1.1408, + "step": 7598 + }, + { + "epoch": 0.6663697934721695, + "grad_norm": 0.060791015625, + "learning_rate": 0.001097022483061458, + "loss": 1.1242, + "step": 7599 + }, + { + "epoch": 0.666457485246544, + "grad_norm": 0.05224609375, + "learning_rate": 0.0010966505935437892, + "loss": 1.1186, + "step": 7600 + }, + { + "epoch": 0.6665451770209186, + "grad_norm": 0.046875, + "learning_rate": 0.0010962787544901574, + "loss": 1.141, + "step": 7601 + }, + { + "epoch": 0.6666328687952932, + "grad_norm": 0.0849609375, + "learning_rate": 0.001095906965934474, + "loss": 1.1332, + "step": 7602 + }, + { + "epoch": 0.6667205605696677, + "grad_norm": 0.05126953125, + "learning_rate": 0.0010955352279106443, + "loss": 1.147, + "step": 7603 + }, + { + "epoch": 0.6668082523440423, + "grad_norm": 0.052734375, + "learning_rate": 0.0010951635404525708, + "loss": 1.1097, + "step": 7604 + }, + { + "epoch": 0.6668959441184168, + "grad_norm": 0.0673828125, + "learning_rate": 0.0010947919035941504, + "loss": 1.1853, + "step": 7605 + }, + { + "epoch": 0.6669836358927913, + "grad_norm": 0.08544921875, + "learning_rate": 0.0010944203173692748, + "loss": 1.1445, + "step": 7606 + }, + { + "epoch": 0.6670713276671659, + "grad_norm": 0.054931640625, + "learning_rate": 0.001094048781811832, + "loss": 1.1052, + "step": 7607 + }, + { + "epoch": 0.6671590194415404, + "grad_norm": 0.042724609375, + "learning_rate": 0.0010936772969557052, + "loss": 1.1009, + "step": 7608 + }, + { + "epoch": 0.667246711215915, + "grad_norm": 0.05224609375, + "learning_rate": 0.0010933058628347727, + "loss": 1.1267, + "step": 7609 + }, + { + "epoch": 0.6673344029902895, + "grad_norm": 0.09228515625, + "learning_rate": 0.001092934479482909, + "loss": 1.1422, + "step": 7610 + }, + { + "epoch": 0.6674220947646641, + "grad_norm": 0.044189453125, + "learning_rate": 0.001092563146933982, + "loss": 1.1047, + "step": 7611 + }, + { + "epoch": 0.6675097865390386, + "grad_norm": 0.0458984375, + "learning_rate": 0.0010921918652218574, + "loss": 1.1516, + "step": 7612 + }, + { + "epoch": 0.6675974783134131, + "grad_norm": 0.050048828125, + "learning_rate": 0.0010918206343803944, + "loss": 1.1182, + "step": 7613 + }, + { + "epoch": 0.6676851700877877, + "grad_norm": 0.04638671875, + "learning_rate": 0.0010914494544434488, + "loss": 1.1549, + "step": 7614 + }, + { + "epoch": 0.6677728618621622, + "grad_norm": 0.045166015625, + "learning_rate": 0.001091078325444871, + "loss": 1.1092, + "step": 7615 + }, + { + "epoch": 0.6678605536365367, + "grad_norm": 0.0458984375, + "learning_rate": 0.0010907072474185071, + "loss": 1.0999, + "step": 7616 + }, + { + "epoch": 0.6679482454109114, + "grad_norm": 0.056640625, + "learning_rate": 0.001090336220398199, + "loss": 1.1294, + "step": 7617 + }, + { + "epoch": 0.6680359371852859, + "grad_norm": 0.045166015625, + "learning_rate": 0.001089965244417782, + "loss": 1.1129, + "step": 7618 + }, + { + "epoch": 0.6681236289596604, + "grad_norm": 0.060791015625, + "learning_rate": 0.0010895943195110894, + "loss": 1.1492, + "step": 7619 + }, + { + "epoch": 0.668211320734035, + "grad_norm": 0.053466796875, + "learning_rate": 0.0010892234457119479, + "loss": 1.1355, + "step": 7620 + }, + { + "epoch": 0.6682990125084095, + "grad_norm": 0.0439453125, + "learning_rate": 0.0010888526230541805, + "loss": 1.096, + "step": 7621 + }, + { + "epoch": 0.668386704282784, + "grad_norm": 0.04541015625, + "learning_rate": 0.0010884818515716056, + "loss": 1.1133, + "step": 7622 + }, + { + "epoch": 0.6684743960571586, + "grad_norm": 0.046630859375, + "learning_rate": 0.0010881111312980361, + "loss": 1.1504, + "step": 7623 + }, + { + "epoch": 0.6685620878315331, + "grad_norm": 0.04736328125, + "learning_rate": 0.001087740462267281, + "loss": 1.1186, + "step": 7624 + }, + { + "epoch": 0.6686497796059077, + "grad_norm": 0.0498046875, + "learning_rate": 0.001087369844513144, + "loss": 1.2002, + "step": 7625 + }, + { + "epoch": 0.6687374713802823, + "grad_norm": 0.0439453125, + "learning_rate": 0.0010869992780694246, + "loss": 1.1043, + "step": 7626 + }, + { + "epoch": 0.6688251631546568, + "grad_norm": 0.0810546875, + "learning_rate": 0.0010866287629699187, + "loss": 1.1646, + "step": 7627 + }, + { + "epoch": 0.6689128549290313, + "grad_norm": 0.054931640625, + "learning_rate": 0.0010862582992484146, + "loss": 1.1227, + "step": 7628 + }, + { + "epoch": 0.6690005467034059, + "grad_norm": 0.04736328125, + "learning_rate": 0.0010858878869386976, + "loss": 1.0949, + "step": 7629 + }, + { + "epoch": 0.6690882384777804, + "grad_norm": 0.050537109375, + "learning_rate": 0.00108551752607455, + "loss": 1.1251, + "step": 7630 + }, + { + "epoch": 0.6691759302521549, + "grad_norm": 0.0546875, + "learning_rate": 0.0010851472166897467, + "loss": 1.1154, + "step": 7631 + }, + { + "epoch": 0.6692636220265294, + "grad_norm": 0.06494140625, + "learning_rate": 0.001084776958818059, + "loss": 1.1771, + "step": 7632 + }, + { + "epoch": 0.6693513138009041, + "grad_norm": 0.08349609375, + "learning_rate": 0.0010844067524932535, + "loss": 1.1586, + "step": 7633 + }, + { + "epoch": 0.6694390055752786, + "grad_norm": 0.064453125, + "learning_rate": 0.0010840365977490924, + "loss": 1.1525, + "step": 7634 + }, + { + "epoch": 0.6695266973496531, + "grad_norm": 0.048583984375, + "learning_rate": 0.0010836664946193323, + "loss": 1.1427, + "step": 7635 + }, + { + "epoch": 0.6696143891240277, + "grad_norm": 0.06298828125, + "learning_rate": 0.0010832964431377263, + "loss": 1.1019, + "step": 7636 + }, + { + "epoch": 0.6697020808984022, + "grad_norm": 0.053466796875, + "learning_rate": 0.001082926443338021, + "loss": 1.0758, + "step": 7637 + }, + { + "epoch": 0.6697897726727767, + "grad_norm": 0.05810546875, + "learning_rate": 0.0010825564952539604, + "loss": 1.1533, + "step": 7638 + }, + { + "epoch": 0.6698774644471513, + "grad_norm": 0.0625, + "learning_rate": 0.0010821865989192826, + "loss": 1.1247, + "step": 7639 + }, + { + "epoch": 0.6699651562215259, + "grad_norm": 0.049072265625, + "learning_rate": 0.0010818167543677217, + "loss": 1.1344, + "step": 7640 + }, + { + "epoch": 0.6700528479959004, + "grad_norm": 0.04443359375, + "learning_rate": 0.0010814469616330054, + "loss": 1.1003, + "step": 7641 + }, + { + "epoch": 0.670140539770275, + "grad_norm": 0.05322265625, + "learning_rate": 0.0010810772207488586, + "loss": 1.1638, + "step": 7642 + }, + { + "epoch": 0.6702282315446495, + "grad_norm": 0.050048828125, + "learning_rate": 0.0010807075317490003, + "loss": 1.1342, + "step": 7643 + }, + { + "epoch": 0.670315923319024, + "grad_norm": 0.048583984375, + "learning_rate": 0.0010803378946671466, + "loss": 1.1401, + "step": 7644 + }, + { + "epoch": 0.6704036150933986, + "grad_norm": 0.046875, + "learning_rate": 0.0010799683095370051, + "loss": 1.1781, + "step": 7645 + }, + { + "epoch": 0.6704913068677731, + "grad_norm": 0.046142578125, + "learning_rate": 0.001079598776392283, + "loss": 1.1657, + "step": 7646 + }, + { + "epoch": 0.6705789986421477, + "grad_norm": 0.04833984375, + "learning_rate": 0.0010792292952666795, + "loss": 1.1179, + "step": 7647 + }, + { + "epoch": 0.6706666904165223, + "grad_norm": 0.056640625, + "learning_rate": 0.001078859866193891, + "loss": 1.1252, + "step": 7648 + }, + { + "epoch": 0.6707543821908968, + "grad_norm": 0.0439453125, + "learning_rate": 0.0010784904892076088, + "loss": 1.1337, + "step": 7649 + }, + { + "epoch": 0.6708420739652713, + "grad_norm": 0.041748046875, + "learning_rate": 0.0010781211643415183, + "loss": 1.1129, + "step": 7650 + }, + { + "epoch": 0.6709297657396459, + "grad_norm": 0.045654296875, + "learning_rate": 0.0010777518916293015, + "loss": 1.1351, + "step": 7651 + }, + { + "epoch": 0.6710174575140204, + "grad_norm": 0.072265625, + "learning_rate": 0.0010773826711046345, + "loss": 1.1485, + "step": 7652 + }, + { + "epoch": 0.6711051492883949, + "grad_norm": 0.044189453125, + "learning_rate": 0.0010770135028011897, + "loss": 1.1596, + "step": 7653 + }, + { + "epoch": 0.6711928410627696, + "grad_norm": 0.044189453125, + "learning_rate": 0.001076644386752635, + "loss": 1.114, + "step": 7654 + }, + { + "epoch": 0.6712805328371441, + "grad_norm": 0.05712890625, + "learning_rate": 0.0010762753229926316, + "loss": 1.0893, + "step": 7655 + }, + { + "epoch": 0.6713682246115186, + "grad_norm": 0.048583984375, + "learning_rate": 0.0010759063115548374, + "loss": 1.1654, + "step": 7656 + }, + { + "epoch": 0.6714559163858931, + "grad_norm": 0.05029296875, + "learning_rate": 0.0010755373524729056, + "loss": 1.1385, + "step": 7657 + }, + { + "epoch": 0.6715436081602677, + "grad_norm": 0.049560546875, + "learning_rate": 0.0010751684457804848, + "loss": 1.1198, + "step": 7658 + }, + { + "epoch": 0.6716312999346422, + "grad_norm": 0.044921875, + "learning_rate": 0.0010747995915112174, + "loss": 1.1009, + "step": 7659 + }, + { + "epoch": 0.6717189917090167, + "grad_norm": 0.0498046875, + "learning_rate": 0.0010744307896987425, + "loss": 1.1458, + "step": 7660 + }, + { + "epoch": 0.6718066834833913, + "grad_norm": 0.06201171875, + "learning_rate": 0.0010740620403766937, + "loss": 1.1854, + "step": 7661 + }, + { + "epoch": 0.6718943752577659, + "grad_norm": 0.044921875, + "learning_rate": 0.0010736933435787006, + "loss": 1.1357, + "step": 7662 + }, + { + "epoch": 0.6719820670321404, + "grad_norm": 0.045166015625, + "learning_rate": 0.0010733246993383863, + "loss": 1.1288, + "step": 7663 + }, + { + "epoch": 0.672069758806515, + "grad_norm": 0.048828125, + "learning_rate": 0.0010729561076893708, + "loss": 1.1204, + "step": 7664 + }, + { + "epoch": 0.6721574505808895, + "grad_norm": 0.0546875, + "learning_rate": 0.0010725875686652681, + "loss": 1.1841, + "step": 7665 + }, + { + "epoch": 0.672245142355264, + "grad_norm": 0.0556640625, + "learning_rate": 0.00107221908229969, + "loss": 1.1166, + "step": 7666 + }, + { + "epoch": 0.6723328341296386, + "grad_norm": 0.050048828125, + "learning_rate": 0.001071850648626239, + "loss": 1.1447, + "step": 7667 + }, + { + "epoch": 0.6724205259040131, + "grad_norm": 0.04296875, + "learning_rate": 0.0010714822676785168, + "loss": 1.131, + "step": 7668 + }, + { + "epoch": 0.6725082176783876, + "grad_norm": 0.047119140625, + "learning_rate": 0.001071113939490118, + "loss": 1.0941, + "step": 7669 + }, + { + "epoch": 0.6725959094527623, + "grad_norm": 0.060302734375, + "learning_rate": 0.0010707456640946334, + "loss": 1.1172, + "step": 7670 + }, + { + "epoch": 0.6726836012271368, + "grad_norm": 0.047607421875, + "learning_rate": 0.0010703774415256496, + "loss": 1.2103, + "step": 7671 + }, + { + "epoch": 0.6727712930015113, + "grad_norm": 0.05322265625, + "learning_rate": 0.0010700092718167462, + "loss": 1.1058, + "step": 7672 + }, + { + "epoch": 0.6728589847758859, + "grad_norm": 0.055908203125, + "learning_rate": 0.0010696411550014998, + "loss": 1.0723, + "step": 7673 + }, + { + "epoch": 0.6729466765502604, + "grad_norm": 0.046630859375, + "learning_rate": 0.0010692730911134821, + "loss": 1.1582, + "step": 7674 + }, + { + "epoch": 0.6730343683246349, + "grad_norm": 0.059814453125, + "learning_rate": 0.0010689050801862595, + "loss": 1.0875, + "step": 7675 + }, + { + "epoch": 0.6731220600990095, + "grad_norm": 0.0654296875, + "learning_rate": 0.0010685371222533939, + "loss": 1.1061, + "step": 7676 + }, + { + "epoch": 0.673209751873384, + "grad_norm": 0.04833984375, + "learning_rate": 0.0010681692173484412, + "loss": 1.1086, + "step": 7677 + }, + { + "epoch": 0.6732974436477586, + "grad_norm": 0.044921875, + "learning_rate": 0.0010678013655049537, + "loss": 1.1059, + "step": 7678 + }, + { + "epoch": 0.6733851354221332, + "grad_norm": 0.07958984375, + "learning_rate": 0.0010674335667564788, + "loss": 1.1736, + "step": 7679 + }, + { + "epoch": 0.6734728271965077, + "grad_norm": 0.056884765625, + "learning_rate": 0.0010670658211365592, + "loss": 1.1791, + "step": 7680 + }, + { + "epoch": 0.6735605189708822, + "grad_norm": 0.057373046875, + "learning_rate": 0.0010666981286787316, + "loss": 1.1354, + "step": 7681 + }, + { + "epoch": 0.6736482107452567, + "grad_norm": 0.055419921875, + "learning_rate": 0.0010663304894165287, + "loss": 1.1206, + "step": 7682 + }, + { + "epoch": 0.6737359025196313, + "grad_norm": 0.06884765625, + "learning_rate": 0.0010659629033834786, + "loss": 1.1854, + "step": 7683 + }, + { + "epoch": 0.6738235942940058, + "grad_norm": 0.06298828125, + "learning_rate": 0.0010655953706131044, + "loss": 1.0993, + "step": 7684 + }, + { + "epoch": 0.6739112860683804, + "grad_norm": 0.045166015625, + "learning_rate": 0.0010652278911389237, + "loss": 1.1392, + "step": 7685 + }, + { + "epoch": 0.673998977842755, + "grad_norm": 0.0458984375, + "learning_rate": 0.0010648604649944494, + "loss": 1.1346, + "step": 7686 + }, + { + "epoch": 0.6740866696171295, + "grad_norm": 0.049560546875, + "learning_rate": 0.00106449309221319, + "loss": 1.0955, + "step": 7687 + }, + { + "epoch": 0.674174361391504, + "grad_norm": 0.046875, + "learning_rate": 0.001064125772828649, + "loss": 1.1305, + "step": 7688 + }, + { + "epoch": 0.6742620531658786, + "grad_norm": 0.054931640625, + "learning_rate": 0.0010637585068743256, + "loss": 1.0967, + "step": 7689 + }, + { + "epoch": 0.6743497449402531, + "grad_norm": 0.046630859375, + "learning_rate": 0.001063391294383713, + "loss": 1.1476, + "step": 7690 + }, + { + "epoch": 0.6744374367146276, + "grad_norm": 0.06005859375, + "learning_rate": 0.0010630241353902995, + "loss": 1.1854, + "step": 7691 + }, + { + "epoch": 0.6745251284890023, + "grad_norm": 0.049560546875, + "learning_rate": 0.00106265702992757, + "loss": 1.1513, + "step": 7692 + }, + { + "epoch": 0.6746128202633768, + "grad_norm": 0.04638671875, + "learning_rate": 0.0010622899780290036, + "loss": 1.1292, + "step": 7693 + }, + { + "epoch": 0.6747005120377513, + "grad_norm": 0.052001953125, + "learning_rate": 0.0010619229797280735, + "loss": 1.1361, + "step": 7694 + }, + { + "epoch": 0.6747882038121259, + "grad_norm": 0.046630859375, + "learning_rate": 0.0010615560350582492, + "loss": 1.1391, + "step": 7695 + }, + { + "epoch": 0.6748758955865004, + "grad_norm": 0.04638671875, + "learning_rate": 0.0010611891440529954, + "loss": 1.1188, + "step": 7696 + }, + { + "epoch": 0.6749635873608749, + "grad_norm": 0.043212890625, + "learning_rate": 0.0010608223067457726, + "loss": 1.1187, + "step": 7697 + }, + { + "epoch": 0.6750512791352495, + "grad_norm": 0.044189453125, + "learning_rate": 0.0010604555231700338, + "loss": 1.1863, + "step": 7698 + }, + { + "epoch": 0.675138970909624, + "grad_norm": 0.04443359375, + "learning_rate": 0.001060088793359229, + "loss": 1.1254, + "step": 7699 + }, + { + "epoch": 0.6752266626839986, + "grad_norm": 0.05712890625, + "learning_rate": 0.0010597221173468039, + "loss": 1.1119, + "step": 7700 + }, + { + "epoch": 0.6753143544583732, + "grad_norm": 0.050048828125, + "learning_rate": 0.0010593554951661983, + "loss": 1.1324, + "step": 7701 + }, + { + "epoch": 0.6754020462327477, + "grad_norm": 0.045654296875, + "learning_rate": 0.001058988926850847, + "loss": 1.1005, + "step": 7702 + }, + { + "epoch": 0.6754897380071222, + "grad_norm": 0.04736328125, + "learning_rate": 0.0010586224124341792, + "loss": 1.1295, + "step": 7703 + }, + { + "epoch": 0.6755774297814967, + "grad_norm": 0.0498046875, + "learning_rate": 0.0010582559519496206, + "loss": 1.1078, + "step": 7704 + }, + { + "epoch": 0.6756651215558713, + "grad_norm": 0.050537109375, + "learning_rate": 0.0010578895454305915, + "loss": 1.1968, + "step": 7705 + }, + { + "epoch": 0.6757528133302458, + "grad_norm": 0.05859375, + "learning_rate": 0.001057523192910508, + "loss": 1.1119, + "step": 7706 + }, + { + "epoch": 0.6758405051046203, + "grad_norm": 0.056396484375, + "learning_rate": 0.0010571568944227796, + "loss": 1.0514, + "step": 7707 + }, + { + "epoch": 0.675928196878995, + "grad_norm": 0.04736328125, + "learning_rate": 0.0010567906500008116, + "loss": 1.1403, + "step": 7708 + }, + { + "epoch": 0.6760158886533695, + "grad_norm": 0.060791015625, + "learning_rate": 0.0010564244596780053, + "loss": 1.1622, + "step": 7709 + }, + { + "epoch": 0.676103580427744, + "grad_norm": 0.053466796875, + "learning_rate": 0.001056058323487756, + "loss": 1.1068, + "step": 7710 + }, + { + "epoch": 0.6761912722021186, + "grad_norm": 0.046875, + "learning_rate": 0.0010556922414634545, + "loss": 1.1646, + "step": 7711 + }, + { + "epoch": 0.6762789639764931, + "grad_norm": 0.044921875, + "learning_rate": 0.0010553262136384863, + "loss": 1.1194, + "step": 7712 + }, + { + "epoch": 0.6763666557508676, + "grad_norm": 0.05078125, + "learning_rate": 0.0010549602400462318, + "loss": 1.1536, + "step": 7713 + }, + { + "epoch": 0.6764543475252422, + "grad_norm": 0.045166015625, + "learning_rate": 0.0010545943207200675, + "loss": 1.1052, + "step": 7714 + }, + { + "epoch": 0.6765420392996168, + "grad_norm": 0.047607421875, + "learning_rate": 0.0010542284556933647, + "loss": 1.1866, + "step": 7715 + }, + { + "epoch": 0.6766297310739913, + "grad_norm": 0.04296875, + "learning_rate": 0.0010538626449994883, + "loss": 1.1243, + "step": 7716 + }, + { + "epoch": 0.6767174228483659, + "grad_norm": 0.046630859375, + "learning_rate": 0.0010534968886717995, + "loss": 1.1034, + "step": 7717 + }, + { + "epoch": 0.6768051146227404, + "grad_norm": 0.045166015625, + "learning_rate": 0.0010531311867436556, + "loss": 1.1111, + "step": 7718 + }, + { + "epoch": 0.6768928063971149, + "grad_norm": 0.05078125, + "learning_rate": 0.0010527655392484053, + "loss": 1.1518, + "step": 7719 + }, + { + "epoch": 0.6769804981714895, + "grad_norm": 0.08642578125, + "learning_rate": 0.0010523999462193974, + "loss": 1.1437, + "step": 7720 + }, + { + "epoch": 0.677068189945864, + "grad_norm": 0.053466796875, + "learning_rate": 0.0010520344076899708, + "loss": 1.1442, + "step": 7721 + }, + { + "epoch": 0.6771558817202386, + "grad_norm": 0.042724609375, + "learning_rate": 0.0010516689236934626, + "loss": 1.0603, + "step": 7722 + }, + { + "epoch": 0.6772435734946132, + "grad_norm": 0.056884765625, + "learning_rate": 0.0010513034942632043, + "loss": 1.1319, + "step": 7723 + }, + { + "epoch": 0.6773312652689877, + "grad_norm": 0.044921875, + "learning_rate": 0.0010509381194325218, + "loss": 1.0905, + "step": 7724 + }, + { + "epoch": 0.6774189570433622, + "grad_norm": 0.05029296875, + "learning_rate": 0.001050572799234736, + "loss": 1.1628, + "step": 7725 + }, + { + "epoch": 0.6775066488177368, + "grad_norm": 0.04345703125, + "learning_rate": 0.0010502075337031635, + "loss": 1.1714, + "step": 7726 + }, + { + "epoch": 0.6775943405921113, + "grad_norm": 0.04345703125, + "learning_rate": 0.001049842322871116, + "loss": 1.1309, + "step": 7727 + }, + { + "epoch": 0.6776820323664858, + "grad_norm": 0.046630859375, + "learning_rate": 0.0010494771667718989, + "loss": 1.223, + "step": 7728 + }, + { + "epoch": 0.6777697241408603, + "grad_norm": 0.04833984375, + "learning_rate": 0.0010491120654388146, + "loss": 1.1736, + "step": 7729 + }, + { + "epoch": 0.677857415915235, + "grad_norm": 0.054931640625, + "learning_rate": 0.001048747018905158, + "loss": 1.1612, + "step": 7730 + }, + { + "epoch": 0.6779451076896095, + "grad_norm": 0.043212890625, + "learning_rate": 0.0010483820272042216, + "loss": 1.1541, + "step": 7731 + }, + { + "epoch": 0.678032799463984, + "grad_norm": 0.045166015625, + "learning_rate": 0.0010480170903692909, + "loss": 1.1749, + "step": 7732 + }, + { + "epoch": 0.6781204912383586, + "grad_norm": 0.06298828125, + "learning_rate": 0.0010476522084336482, + "loss": 1.1399, + "step": 7733 + }, + { + "epoch": 0.6782081830127331, + "grad_norm": 0.046142578125, + "learning_rate": 0.0010472873814305684, + "loss": 1.1426, + "step": 7734 + }, + { + "epoch": 0.6782958747871076, + "grad_norm": 0.05419921875, + "learning_rate": 0.0010469226093933245, + "loss": 1.1757, + "step": 7735 + }, + { + "epoch": 0.6783835665614822, + "grad_norm": 0.047119140625, + "learning_rate": 0.001046557892355181, + "loss": 1.0712, + "step": 7736 + }, + { + "epoch": 0.6784712583358568, + "grad_norm": 0.051025390625, + "learning_rate": 0.0010461932303493999, + "loss": 1.1702, + "step": 7737 + }, + { + "epoch": 0.6785589501102313, + "grad_norm": 0.0498046875, + "learning_rate": 0.0010458286234092378, + "loss": 1.1565, + "step": 7738 + }, + { + "epoch": 0.6786466418846059, + "grad_norm": 0.04833984375, + "learning_rate": 0.0010454640715679452, + "loss": 1.0842, + "step": 7739 + }, + { + "epoch": 0.6787343336589804, + "grad_norm": 0.047119140625, + "learning_rate": 0.001045099574858769, + "loss": 1.1267, + "step": 7740 + }, + { + "epoch": 0.6788220254333549, + "grad_norm": 0.0537109375, + "learning_rate": 0.0010447351333149498, + "loss": 1.1461, + "step": 7741 + }, + { + "epoch": 0.6789097172077295, + "grad_norm": 0.07470703125, + "learning_rate": 0.0010443707469697245, + "loss": 1.1502, + "step": 7742 + }, + { + "epoch": 0.678997408982104, + "grad_norm": 0.053466796875, + "learning_rate": 0.001044006415856323, + "loss": 1.1999, + "step": 7743 + }, + { + "epoch": 0.6790851007564785, + "grad_norm": 0.046875, + "learning_rate": 0.001043642140007972, + "loss": 1.1392, + "step": 7744 + }, + { + "epoch": 0.6791727925308532, + "grad_norm": 0.06787109375, + "learning_rate": 0.0010432779194578926, + "loss": 1.1665, + "step": 7745 + }, + { + "epoch": 0.6792604843052277, + "grad_norm": 0.0517578125, + "learning_rate": 0.0010429137542393014, + "loss": 1.1236, + "step": 7746 + }, + { + "epoch": 0.6793481760796022, + "grad_norm": 0.044921875, + "learning_rate": 0.001042549644385408, + "loss": 1.1717, + "step": 7747 + }, + { + "epoch": 0.6794358678539768, + "grad_norm": 0.045166015625, + "learning_rate": 0.0010421855899294188, + "loss": 1.1073, + "step": 7748 + }, + { + "epoch": 0.6795235596283513, + "grad_norm": 0.04931640625, + "learning_rate": 0.0010418215909045348, + "loss": 1.1137, + "step": 7749 + }, + { + "epoch": 0.6796112514027258, + "grad_norm": 0.0498046875, + "learning_rate": 0.0010414576473439523, + "loss": 1.1321, + "step": 7750 + }, + { + "epoch": 0.6796989431771003, + "grad_norm": 0.054443359375, + "learning_rate": 0.0010410937592808607, + "loss": 1.0864, + "step": 7751 + }, + { + "epoch": 0.679786634951475, + "grad_norm": 0.0498046875, + "learning_rate": 0.001040729926748447, + "loss": 1.1572, + "step": 7752 + }, + { + "epoch": 0.6798743267258495, + "grad_norm": 0.05224609375, + "learning_rate": 0.0010403661497798908, + "loss": 1.1088, + "step": 7753 + }, + { + "epoch": 0.679962018500224, + "grad_norm": 0.048095703125, + "learning_rate": 0.0010400024284083677, + "loss": 1.1618, + "step": 7754 + }, + { + "epoch": 0.6800497102745986, + "grad_norm": 0.0576171875, + "learning_rate": 0.0010396387626670494, + "loss": 1.1616, + "step": 7755 + }, + { + "epoch": 0.6801374020489731, + "grad_norm": 0.044921875, + "learning_rate": 0.0010392751525890992, + "loss": 1.1111, + "step": 7756 + }, + { + "epoch": 0.6802250938233476, + "grad_norm": 0.043701171875, + "learning_rate": 0.0010389115982076787, + "loss": 1.1503, + "step": 7757 + }, + { + "epoch": 0.6803127855977222, + "grad_norm": 0.058349609375, + "learning_rate": 0.0010385480995559432, + "loss": 1.1506, + "step": 7758 + }, + { + "epoch": 0.6804004773720967, + "grad_norm": 0.051513671875, + "learning_rate": 0.0010381846566670425, + "loss": 1.0544, + "step": 7759 + }, + { + "epoch": 0.6804881691464713, + "grad_norm": 0.04638671875, + "learning_rate": 0.001037821269574122, + "loss": 1.1589, + "step": 7760 + }, + { + "epoch": 0.6805758609208459, + "grad_norm": 0.05810546875, + "learning_rate": 0.0010374579383103208, + "loss": 1.1284, + "step": 7761 + }, + { + "epoch": 0.6806635526952204, + "grad_norm": 0.05078125, + "learning_rate": 0.0010370946629087747, + "loss": 1.0949, + "step": 7762 + }, + { + "epoch": 0.6807512444695949, + "grad_norm": 0.045166015625, + "learning_rate": 0.0010367314434026127, + "loss": 1.0876, + "step": 7763 + }, + { + "epoch": 0.6808389362439695, + "grad_norm": 0.048828125, + "learning_rate": 0.001036368279824961, + "loss": 1.1807, + "step": 7764 + }, + { + "epoch": 0.680926628018344, + "grad_norm": 0.048828125, + "learning_rate": 0.001036005172208937, + "loss": 1.1351, + "step": 7765 + }, + { + "epoch": 0.6810143197927185, + "grad_norm": 0.045654296875, + "learning_rate": 0.0010356421205876566, + "loss": 1.182, + "step": 7766 + }, + { + "epoch": 0.6811020115670932, + "grad_norm": 0.044189453125, + "learning_rate": 0.0010352791249942288, + "loss": 1.1423, + "step": 7767 + }, + { + "epoch": 0.6811897033414677, + "grad_norm": 0.045166015625, + "learning_rate": 0.0010349161854617582, + "loss": 1.2074, + "step": 7768 + }, + { + "epoch": 0.6812773951158422, + "grad_norm": 0.04638671875, + "learning_rate": 0.0010345533020233442, + "loss": 1.1345, + "step": 7769 + }, + { + "epoch": 0.6813650868902168, + "grad_norm": 0.0478515625, + "learning_rate": 0.0010341904747120793, + "loss": 1.1493, + "step": 7770 + }, + { + "epoch": 0.6814527786645913, + "grad_norm": 0.05859375, + "learning_rate": 0.0010338277035610536, + "loss": 1.1132, + "step": 7771 + }, + { + "epoch": 0.6815404704389658, + "grad_norm": 0.043701171875, + "learning_rate": 0.0010334649886033507, + "loss": 1.0994, + "step": 7772 + }, + { + "epoch": 0.6816281622133404, + "grad_norm": 0.060791015625, + "learning_rate": 0.0010331023298720497, + "loss": 1.2056, + "step": 7773 + }, + { + "epoch": 0.681715853987715, + "grad_norm": 0.04248046875, + "learning_rate": 0.0010327397274002234, + "loss": 1.1879, + "step": 7774 + }, + { + "epoch": 0.6818035457620895, + "grad_norm": 0.0595703125, + "learning_rate": 0.00103237718122094, + "loss": 1.1233, + "step": 7775 + }, + { + "epoch": 0.681891237536464, + "grad_norm": 0.060546875, + "learning_rate": 0.001032014691367264, + "loss": 1.1102, + "step": 7776 + }, + { + "epoch": 0.6819789293108386, + "grad_norm": 0.07080078125, + "learning_rate": 0.0010316522578722527, + "loss": 1.1536, + "step": 7777 + }, + { + "epoch": 0.6820666210852131, + "grad_norm": 0.0419921875, + "learning_rate": 0.001031289880768959, + "loss": 1.1407, + "step": 7778 + }, + { + "epoch": 0.6821543128595876, + "grad_norm": 0.0498046875, + "learning_rate": 0.001030927560090431, + "loss": 1.1882, + "step": 7779 + }, + { + "epoch": 0.6822420046339622, + "grad_norm": 0.049072265625, + "learning_rate": 0.001030565295869711, + "loss": 1.1005, + "step": 7780 + }, + { + "epoch": 0.6823296964083367, + "grad_norm": 0.0458984375, + "learning_rate": 0.001030203088139838, + "loss": 1.1542, + "step": 7781 + }, + { + "epoch": 0.6824173881827112, + "grad_norm": 0.05712890625, + "learning_rate": 0.0010298409369338425, + "loss": 1.2198, + "step": 7782 + }, + { + "epoch": 0.6825050799570859, + "grad_norm": 0.04931640625, + "learning_rate": 0.001029478842284753, + "loss": 1.1803, + "step": 7783 + }, + { + "epoch": 0.6825927717314604, + "grad_norm": 0.046142578125, + "learning_rate": 0.0010291168042255907, + "loss": 1.1074, + "step": 7784 + }, + { + "epoch": 0.6826804635058349, + "grad_norm": 0.04345703125, + "learning_rate": 0.0010287548227893735, + "loss": 1.1227, + "step": 7785 + }, + { + "epoch": 0.6827681552802095, + "grad_norm": 0.05517578125, + "learning_rate": 0.0010283928980091132, + "loss": 1.1154, + "step": 7786 + }, + { + "epoch": 0.682855847054584, + "grad_norm": 0.054931640625, + "learning_rate": 0.0010280310299178158, + "loss": 1.0879, + "step": 7787 + }, + { + "epoch": 0.6829435388289585, + "grad_norm": 0.049560546875, + "learning_rate": 0.0010276692185484822, + "loss": 1.1283, + "step": 7788 + }, + { + "epoch": 0.6830312306033332, + "grad_norm": 0.0576171875, + "learning_rate": 0.0010273074639341095, + "loss": 1.1631, + "step": 7789 + }, + { + "epoch": 0.6831189223777077, + "grad_norm": 0.06494140625, + "learning_rate": 0.0010269457661076888, + "loss": 1.1593, + "step": 7790 + }, + { + "epoch": 0.6832066141520822, + "grad_norm": 0.060791015625, + "learning_rate": 0.0010265841251022058, + "loss": 1.1204, + "step": 7791 + }, + { + "epoch": 0.6832943059264568, + "grad_norm": 0.06982421875, + "learning_rate": 0.0010262225409506413, + "loss": 1.1523, + "step": 7792 + }, + { + "epoch": 0.6833819977008313, + "grad_norm": 0.046875, + "learning_rate": 0.0010258610136859704, + "loss": 1.1178, + "step": 7793 + }, + { + "epoch": 0.6834696894752058, + "grad_norm": 0.054931640625, + "learning_rate": 0.0010254995433411642, + "loss": 1.0623, + "step": 7794 + }, + { + "epoch": 0.6835573812495804, + "grad_norm": 0.056640625, + "learning_rate": 0.001025138129949188, + "loss": 1.1637, + "step": 7795 + }, + { + "epoch": 0.6836450730239549, + "grad_norm": 0.047119140625, + "learning_rate": 0.0010247767735430003, + "loss": 1.1403, + "step": 7796 + }, + { + "epoch": 0.6837327647983295, + "grad_norm": 0.048828125, + "learning_rate": 0.0010244154741555574, + "loss": 1.1479, + "step": 7797 + }, + { + "epoch": 0.6838204565727041, + "grad_norm": 0.044189453125, + "learning_rate": 0.0010240542318198079, + "loss": 1.1136, + "step": 7798 + }, + { + "epoch": 0.6839081483470786, + "grad_norm": 0.047119140625, + "learning_rate": 0.0010236930465686972, + "loss": 1.1899, + "step": 7799 + }, + { + "epoch": 0.6839958401214531, + "grad_norm": 0.0458984375, + "learning_rate": 0.0010233319184351636, + "loss": 1.1164, + "step": 7800 + }, + { + "epoch": 0.6840835318958276, + "grad_norm": 0.049072265625, + "learning_rate": 0.001022970847452141, + "loss": 1.1056, + "step": 7801 + }, + { + "epoch": 0.6841712236702022, + "grad_norm": 0.047607421875, + "learning_rate": 0.0010226098336525587, + "loss": 1.1916, + "step": 7802 + }, + { + "epoch": 0.6842589154445767, + "grad_norm": 0.044189453125, + "learning_rate": 0.0010222488770693402, + "loss": 1.1306, + "step": 7803 + }, + { + "epoch": 0.6843466072189512, + "grad_norm": 0.04443359375, + "learning_rate": 0.001021887977735404, + "loss": 1.0922, + "step": 7804 + }, + { + "epoch": 0.6844342989933259, + "grad_norm": 0.06103515625, + "learning_rate": 0.0010215271356836617, + "loss": 1.1178, + "step": 7805 + }, + { + "epoch": 0.6845219907677004, + "grad_norm": 0.05419921875, + "learning_rate": 0.0010211663509470227, + "loss": 1.1672, + "step": 7806 + }, + { + "epoch": 0.6846096825420749, + "grad_norm": 0.05419921875, + "learning_rate": 0.0010208056235583888, + "loss": 1.193, + "step": 7807 + }, + { + "epoch": 0.6846973743164495, + "grad_norm": 0.0556640625, + "learning_rate": 0.0010204449535506585, + "loss": 1.1681, + "step": 7808 + }, + { + "epoch": 0.684785066090824, + "grad_norm": 0.050048828125, + "learning_rate": 0.0010200843409567228, + "loss": 1.1995, + "step": 7809 + }, + { + "epoch": 0.6848727578651985, + "grad_norm": 0.04736328125, + "learning_rate": 0.001019723785809469, + "loss": 1.142, + "step": 7810 + }, + { + "epoch": 0.6849604496395731, + "grad_norm": 0.0478515625, + "learning_rate": 0.001019363288141779, + "loss": 1.1184, + "step": 7811 + }, + { + "epoch": 0.6850481414139477, + "grad_norm": 0.0498046875, + "learning_rate": 0.0010190028479865286, + "loss": 1.1465, + "step": 7812 + }, + { + "epoch": 0.6851358331883222, + "grad_norm": 0.0546875, + "learning_rate": 0.0010186424653765905, + "loss": 1.139, + "step": 7813 + }, + { + "epoch": 0.6852235249626968, + "grad_norm": 0.0517578125, + "learning_rate": 0.0010182821403448294, + "loss": 1.1413, + "step": 7814 + }, + { + "epoch": 0.6853112167370713, + "grad_norm": 0.058349609375, + "learning_rate": 0.001017921872924106, + "loss": 1.129, + "step": 7815 + }, + { + "epoch": 0.6853989085114458, + "grad_norm": 0.04248046875, + "learning_rate": 0.0010175616631472762, + "loss": 1.1217, + "step": 7816 + }, + { + "epoch": 0.6854866002858204, + "grad_norm": 0.051025390625, + "learning_rate": 0.0010172015110471906, + "loss": 1.1116, + "step": 7817 + }, + { + "epoch": 0.6855742920601949, + "grad_norm": 0.048095703125, + "learning_rate": 0.0010168414166566932, + "loss": 1.0954, + "step": 7818 + }, + { + "epoch": 0.6856619838345694, + "grad_norm": 0.058837890625, + "learning_rate": 0.0010164813800086238, + "loss": 1.1899, + "step": 7819 + }, + { + "epoch": 0.6857496756089441, + "grad_norm": 0.053466796875, + "learning_rate": 0.0010161214011358182, + "loss": 1.1614, + "step": 7820 + }, + { + "epoch": 0.6858373673833186, + "grad_norm": 0.0537109375, + "learning_rate": 0.0010157614800711035, + "loss": 1.1321, + "step": 7821 + }, + { + "epoch": 0.6859250591576931, + "grad_norm": 0.0439453125, + "learning_rate": 0.0010154016168473054, + "loss": 1.1145, + "step": 7822 + }, + { + "epoch": 0.6860127509320676, + "grad_norm": 0.046142578125, + "learning_rate": 0.0010150418114972409, + "loss": 1.1202, + "step": 7823 + }, + { + "epoch": 0.6861004427064422, + "grad_norm": 0.048095703125, + "learning_rate": 0.0010146820640537237, + "loss": 1.1988, + "step": 7824 + }, + { + "epoch": 0.6861881344808167, + "grad_norm": 0.045166015625, + "learning_rate": 0.0010143223745495626, + "loss": 1.1131, + "step": 7825 + }, + { + "epoch": 0.6862758262551912, + "grad_norm": 0.06494140625, + "learning_rate": 0.0010139627430175604, + "loss": 1.1964, + "step": 7826 + }, + { + "epoch": 0.6863635180295659, + "grad_norm": 0.0498046875, + "learning_rate": 0.0010136031694905137, + "loss": 1.1726, + "step": 7827 + }, + { + "epoch": 0.6864512098039404, + "grad_norm": 0.047607421875, + "learning_rate": 0.001013243654001215, + "loss": 1.1455, + "step": 7828 + }, + { + "epoch": 0.6865389015783149, + "grad_norm": 0.050048828125, + "learning_rate": 0.0010128841965824512, + "loss": 1.1605, + "step": 7829 + }, + { + "epoch": 0.6866265933526895, + "grad_norm": 0.0478515625, + "learning_rate": 0.0010125247972670045, + "loss": 1.1724, + "step": 7830 + }, + { + "epoch": 0.686714285127064, + "grad_norm": 0.05224609375, + "learning_rate": 0.0010121654560876503, + "loss": 1.1608, + "step": 7831 + }, + { + "epoch": 0.6868019769014385, + "grad_norm": 0.0439453125, + "learning_rate": 0.0010118061730771599, + "loss": 1.1072, + "step": 7832 + }, + { + "epoch": 0.6868896686758131, + "grad_norm": 0.04736328125, + "learning_rate": 0.001011446948268299, + "loss": 1.1588, + "step": 7833 + }, + { + "epoch": 0.6869773604501876, + "grad_norm": 0.05029296875, + "learning_rate": 0.0010110877816938285, + "loss": 1.1428, + "step": 7834 + }, + { + "epoch": 0.6870650522245622, + "grad_norm": 0.04931640625, + "learning_rate": 0.0010107286733865024, + "loss": 1.1657, + "step": 7835 + }, + { + "epoch": 0.6871527439989368, + "grad_norm": 0.043212890625, + "learning_rate": 0.001010369623379071, + "loss": 1.1331, + "step": 7836 + }, + { + "epoch": 0.6872404357733113, + "grad_norm": 0.044677734375, + "learning_rate": 0.0010100106317042794, + "loss": 1.1471, + "step": 7837 + }, + { + "epoch": 0.6873281275476858, + "grad_norm": 0.052978515625, + "learning_rate": 0.0010096516983948652, + "loss": 1.1486, + "step": 7838 + }, + { + "epoch": 0.6874158193220604, + "grad_norm": 0.07080078125, + "learning_rate": 0.0010092928234835634, + "loss": 1.1833, + "step": 7839 + }, + { + "epoch": 0.6875035110964349, + "grad_norm": 0.061767578125, + "learning_rate": 0.0010089340070031027, + "loss": 1.1089, + "step": 7840 + }, + { + "epoch": 0.6875912028708094, + "grad_norm": 0.06884765625, + "learning_rate": 0.001008575248986205, + "loss": 1.1568, + "step": 7841 + }, + { + "epoch": 0.6876788946451841, + "grad_norm": 0.0771484375, + "learning_rate": 0.0010082165494655886, + "loss": 1.0997, + "step": 7842 + }, + { + "epoch": 0.6877665864195586, + "grad_norm": 0.047119140625, + "learning_rate": 0.0010078579084739664, + "loss": 1.1167, + "step": 7843 + }, + { + "epoch": 0.6878542781939331, + "grad_norm": 0.044677734375, + "learning_rate": 0.0010074993260440455, + "loss": 1.1459, + "step": 7844 + }, + { + "epoch": 0.6879419699683077, + "grad_norm": 0.051025390625, + "learning_rate": 0.0010071408022085272, + "loss": 1.2438, + "step": 7845 + }, + { + "epoch": 0.6880296617426822, + "grad_norm": 0.06005859375, + "learning_rate": 0.0010067823370001083, + "loss": 1.1847, + "step": 7846 + }, + { + "epoch": 0.6881173535170567, + "grad_norm": 0.04345703125, + "learning_rate": 0.0010064239304514795, + "loss": 1.0766, + "step": 7847 + }, + { + "epoch": 0.6882050452914312, + "grad_norm": 0.0517578125, + "learning_rate": 0.0010060655825953274, + "loss": 1.1505, + "step": 7848 + }, + { + "epoch": 0.6882927370658058, + "grad_norm": 0.045166015625, + "learning_rate": 0.0010057072934643316, + "loss": 1.1462, + "step": 7849 + }, + { + "epoch": 0.6883804288401804, + "grad_norm": 0.05615234375, + "learning_rate": 0.0010053490630911673, + "loss": 1.183, + "step": 7850 + }, + { + "epoch": 0.6884681206145549, + "grad_norm": 0.051513671875, + "learning_rate": 0.0010049908915085043, + "loss": 1.0956, + "step": 7851 + }, + { + "epoch": 0.6885558123889295, + "grad_norm": 0.052734375, + "learning_rate": 0.0010046327787490074, + "loss": 1.1504, + "step": 7852 + }, + { + "epoch": 0.688643504163304, + "grad_norm": 0.055419921875, + "learning_rate": 0.0010042747248453345, + "loss": 1.1546, + "step": 7853 + }, + { + "epoch": 0.6887311959376785, + "grad_norm": 0.047119140625, + "learning_rate": 0.0010039167298301402, + "loss": 1.0873, + "step": 7854 + }, + { + "epoch": 0.6888188877120531, + "grad_norm": 0.06884765625, + "learning_rate": 0.001003558793736072, + "loss": 1.1588, + "step": 7855 + }, + { + "epoch": 0.6889065794864276, + "grad_norm": 0.060302734375, + "learning_rate": 0.0010032009165957727, + "loss": 1.1666, + "step": 7856 + }, + { + "epoch": 0.6889942712608021, + "grad_norm": 0.0693359375, + "learning_rate": 0.001002843098441881, + "loss": 1.153, + "step": 7857 + }, + { + "epoch": 0.6890819630351768, + "grad_norm": 0.10302734375, + "learning_rate": 0.0010024853393070274, + "loss": 1.1272, + "step": 7858 + }, + { + "epoch": 0.6891696548095513, + "grad_norm": 0.046630859375, + "learning_rate": 0.0010021276392238392, + "loss": 1.1569, + "step": 7859 + }, + { + "epoch": 0.6892573465839258, + "grad_norm": 0.1298828125, + "learning_rate": 0.001001769998224938, + "loss": 1.149, + "step": 7860 + }, + { + "epoch": 0.6893450383583004, + "grad_norm": 0.05419921875, + "learning_rate": 0.00100141241634294, + "loss": 1.1102, + "step": 7861 + }, + { + "epoch": 0.6894327301326749, + "grad_norm": 0.09033203125, + "learning_rate": 0.0010010548936104549, + "loss": 1.1673, + "step": 7862 + }, + { + "epoch": 0.6895204219070494, + "grad_norm": 0.04443359375, + "learning_rate": 0.001000697430060088, + "loss": 1.113, + "step": 7863 + }, + { + "epoch": 0.689608113681424, + "grad_norm": 0.07421875, + "learning_rate": 0.0010003400257244395, + "loss": 1.1013, + "step": 7864 + }, + { + "epoch": 0.6896958054557986, + "grad_norm": 0.1171875, + "learning_rate": 0.0009999826806361036, + "loss": 1.1037, + "step": 7865 + }, + { + "epoch": 0.6897834972301731, + "grad_norm": 0.09814453125, + "learning_rate": 0.0009996253948276698, + "loss": 1.1803, + "step": 7866 + }, + { + "epoch": 0.6898711890045477, + "grad_norm": 0.048828125, + "learning_rate": 0.0009992681683317203, + "loss": 1.1183, + "step": 7867 + }, + { + "epoch": 0.6899588807789222, + "grad_norm": 0.076171875, + "learning_rate": 0.0009989110011808342, + "loss": 1.1339, + "step": 7868 + }, + { + "epoch": 0.6900465725532967, + "grad_norm": 0.05712890625, + "learning_rate": 0.0009985538934075837, + "loss": 1.0644, + "step": 7869 + }, + { + "epoch": 0.6901342643276713, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009981968450445372, + "loss": 1.1196, + "step": 7870 + }, + { + "epoch": 0.6902219561020458, + "grad_norm": 0.0908203125, + "learning_rate": 0.000997839856124256, + "loss": 1.1922, + "step": 7871 + }, + { + "epoch": 0.6903096478764204, + "grad_norm": 0.048828125, + "learning_rate": 0.0009974829266792954, + "loss": 1.1288, + "step": 7872 + }, + { + "epoch": 0.6903973396507949, + "grad_norm": 0.05908203125, + "learning_rate": 0.0009971260567422078, + "loss": 1.1156, + "step": 7873 + }, + { + "epoch": 0.6904850314251695, + "grad_norm": 0.0830078125, + "learning_rate": 0.0009967692463455384, + "loss": 1.1175, + "step": 7874 + }, + { + "epoch": 0.690572723199544, + "grad_norm": 0.0546875, + "learning_rate": 0.0009964124955218277, + "loss": 1.1035, + "step": 7875 + }, + { + "epoch": 0.6906604149739185, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009960558043036099, + "loss": 1.0896, + "step": 7876 + }, + { + "epoch": 0.6907481067482931, + "grad_norm": 0.044189453125, + "learning_rate": 0.0009956991727234144, + "loss": 1.0858, + "step": 7877 + }, + { + "epoch": 0.6908357985226676, + "grad_norm": 0.046630859375, + "learning_rate": 0.0009953426008137655, + "loss": 1.1113, + "step": 7878 + }, + { + "epoch": 0.6909234902970421, + "grad_norm": 0.048583984375, + "learning_rate": 0.0009949860886071817, + "loss": 1.1667, + "step": 7879 + }, + { + "epoch": 0.6910111820714168, + "grad_norm": 0.0673828125, + "learning_rate": 0.0009946296361361751, + "loss": 1.1316, + "step": 7880 + }, + { + "epoch": 0.6910988738457913, + "grad_norm": 0.05078125, + "learning_rate": 0.0009942732434332544, + "loss": 1.1224, + "step": 7881 + }, + { + "epoch": 0.6911865656201658, + "grad_norm": 0.06787109375, + "learning_rate": 0.000993916910530921, + "loss": 1.16, + "step": 7882 + }, + { + "epoch": 0.6912742573945404, + "grad_norm": 0.046142578125, + "learning_rate": 0.0009935606374616717, + "loss": 1.1298, + "step": 7883 + }, + { + "epoch": 0.6913619491689149, + "grad_norm": 0.048828125, + "learning_rate": 0.0009932044242579976, + "loss": 1.2187, + "step": 7884 + }, + { + "epoch": 0.6914496409432894, + "grad_norm": 0.047607421875, + "learning_rate": 0.0009928482709523844, + "loss": 1.1766, + "step": 7885 + }, + { + "epoch": 0.691537332717664, + "grad_norm": 0.060791015625, + "learning_rate": 0.0009924921775773124, + "loss": 1.1219, + "step": 7886 + }, + { + "epoch": 0.6916250244920386, + "grad_norm": 0.047119140625, + "learning_rate": 0.000992136144165257, + "loss": 1.1552, + "step": 7887 + }, + { + "epoch": 0.6917127162664131, + "grad_norm": 0.043212890625, + "learning_rate": 0.0009917801707486872, + "loss": 1.1048, + "step": 7888 + }, + { + "epoch": 0.6918004080407877, + "grad_norm": 0.045166015625, + "learning_rate": 0.0009914242573600656, + "loss": 1.1212, + "step": 7889 + }, + { + "epoch": 0.6918880998151622, + "grad_norm": 0.0625, + "learning_rate": 0.0009910684040318524, + "loss": 1.1564, + "step": 7890 + }, + { + "epoch": 0.6919757915895367, + "grad_norm": 0.049560546875, + "learning_rate": 0.0009907126107964988, + "loss": 1.1832, + "step": 7891 + }, + { + "epoch": 0.6920634833639113, + "grad_norm": 0.044921875, + "learning_rate": 0.0009903568776864539, + "loss": 1.2012, + "step": 7892 + }, + { + "epoch": 0.6921511751382858, + "grad_norm": 0.042236328125, + "learning_rate": 0.000990001204734159, + "loss": 1.1097, + "step": 7893 + }, + { + "epoch": 0.6922388669126603, + "grad_norm": 0.0498046875, + "learning_rate": 0.00098964559197205, + "loss": 1.1303, + "step": 7894 + }, + { + "epoch": 0.6923265586870349, + "grad_norm": 0.053955078125, + "learning_rate": 0.0009892900394325584, + "loss": 1.1133, + "step": 7895 + }, + { + "epoch": 0.6924142504614095, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009889345471481095, + "loss": 1.1557, + "step": 7896 + }, + { + "epoch": 0.692501942235784, + "grad_norm": 0.045654296875, + "learning_rate": 0.0009885791151511242, + "loss": 1.1435, + "step": 7897 + }, + { + "epoch": 0.6925896340101585, + "grad_norm": 0.046875, + "learning_rate": 0.0009882237434740153, + "loss": 1.1548, + "step": 7898 + }, + { + "epoch": 0.6926773257845331, + "grad_norm": 0.0537109375, + "learning_rate": 0.0009878684321491928, + "loss": 1.2064, + "step": 7899 + }, + { + "epoch": 0.6927650175589076, + "grad_norm": 0.042236328125, + "learning_rate": 0.0009875131812090597, + "loss": 1.0812, + "step": 7900 + }, + { + "epoch": 0.6928527093332821, + "grad_norm": 0.0478515625, + "learning_rate": 0.0009871579906860152, + "loss": 1.1228, + "step": 7901 + }, + { + "epoch": 0.6929404011076568, + "grad_norm": 0.046875, + "learning_rate": 0.0009868028606124502, + "loss": 1.1644, + "step": 7902 + }, + { + "epoch": 0.6930280928820313, + "grad_norm": 0.048828125, + "learning_rate": 0.000986447791020752, + "loss": 1.1241, + "step": 7903 + }, + { + "epoch": 0.6931157846564058, + "grad_norm": 0.0458984375, + "learning_rate": 0.0009860927819433027, + "loss": 1.1308, + "step": 7904 + }, + { + "epoch": 0.6932034764307804, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009857378334124781, + "loss": 1.1352, + "step": 7905 + }, + { + "epoch": 0.6932911682051549, + "grad_norm": 0.04638671875, + "learning_rate": 0.0009853829454606483, + "loss": 1.1337, + "step": 7906 + }, + { + "epoch": 0.6933788599795294, + "grad_norm": 0.048583984375, + "learning_rate": 0.0009850281181201777, + "loss": 1.1306, + "step": 7907 + }, + { + "epoch": 0.693466551753904, + "grad_norm": 0.045166015625, + "learning_rate": 0.0009846733514234263, + "loss": 1.1354, + "step": 7908 + }, + { + "epoch": 0.6935542435282785, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009843186454027476, + "loss": 1.1131, + "step": 7909 + }, + { + "epoch": 0.6936419353026531, + "grad_norm": 0.04638671875, + "learning_rate": 0.0009839640000904905, + "loss": 1.1682, + "step": 7910 + }, + { + "epoch": 0.6937296270770277, + "grad_norm": 0.046142578125, + "learning_rate": 0.0009836094155189965, + "loss": 1.1239, + "step": 7911 + }, + { + "epoch": 0.6938173188514022, + "grad_norm": 0.044921875, + "learning_rate": 0.0009832548917206039, + "loss": 1.1814, + "step": 7912 + }, + { + "epoch": 0.6939050106257767, + "grad_norm": 0.04296875, + "learning_rate": 0.0009829004287276438, + "loss": 1.0868, + "step": 7913 + }, + { + "epoch": 0.6939927024001513, + "grad_norm": 0.045654296875, + "learning_rate": 0.000982546026572443, + "loss": 1.1181, + "step": 7914 + }, + { + "epoch": 0.6940803941745258, + "grad_norm": 0.04638671875, + "learning_rate": 0.000982191685287321, + "loss": 1.1627, + "step": 7915 + }, + { + "epoch": 0.6941680859489003, + "grad_norm": 0.042724609375, + "learning_rate": 0.0009818374049045933, + "loss": 1.0883, + "step": 7916 + }, + { + "epoch": 0.694255777723275, + "grad_norm": 0.047119140625, + "learning_rate": 0.0009814831854565696, + "loss": 1.2146, + "step": 7917 + }, + { + "epoch": 0.6943434694976495, + "grad_norm": 0.04150390625, + "learning_rate": 0.0009811290269755534, + "loss": 1.0834, + "step": 7918 + }, + { + "epoch": 0.694431161272024, + "grad_norm": 0.04345703125, + "learning_rate": 0.0009807749294938439, + "loss": 1.0952, + "step": 7919 + }, + { + "epoch": 0.6945188530463985, + "grad_norm": 0.053466796875, + "learning_rate": 0.0009804208930437328, + "loss": 1.1411, + "step": 7920 + }, + { + "epoch": 0.6946065448207731, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009800669176575076, + "loss": 1.1176, + "step": 7921 + }, + { + "epoch": 0.6946942365951476, + "grad_norm": 0.04736328125, + "learning_rate": 0.0009797130033674505, + "loss": 1.1095, + "step": 7922 + }, + { + "epoch": 0.6947819283695221, + "grad_norm": 0.046142578125, + "learning_rate": 0.0009793591502058369, + "loss": 1.173, + "step": 7923 + }, + { + "epoch": 0.6948696201438967, + "grad_norm": 0.06103515625, + "learning_rate": 0.0009790053582049382, + "loss": 1.1187, + "step": 7924 + }, + { + "epoch": 0.6949573119182713, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009786516273970177, + "loss": 1.1486, + "step": 7925 + }, + { + "epoch": 0.6950450036926458, + "grad_norm": 0.050537109375, + "learning_rate": 0.000978297957814336, + "loss": 1.1093, + "step": 7926 + }, + { + "epoch": 0.6951326954670204, + "grad_norm": 0.052734375, + "learning_rate": 0.0009779443494891463, + "loss": 1.191, + "step": 7927 + }, + { + "epoch": 0.6952203872413949, + "grad_norm": 0.044921875, + "learning_rate": 0.000977590802453698, + "loss": 1.152, + "step": 7928 + }, + { + "epoch": 0.6953080790157694, + "grad_norm": 0.052978515625, + "learning_rate": 0.0009772373167402319, + "loss": 1.1854, + "step": 7929 + }, + { + "epoch": 0.695395770790144, + "grad_norm": 0.0478515625, + "learning_rate": 0.0009768838923809857, + "loss": 1.152, + "step": 7930 + }, + { + "epoch": 0.6954834625645185, + "grad_norm": 0.04736328125, + "learning_rate": 0.0009765305294081909, + "loss": 1.101, + "step": 7931 + }, + { + "epoch": 0.695571154338893, + "grad_norm": 0.0419921875, + "learning_rate": 0.000976177227854074, + "loss": 1.1091, + "step": 7932 + }, + { + "epoch": 0.6956588461132677, + "grad_norm": 0.059814453125, + "learning_rate": 0.0009758239877508537, + "loss": 1.0926, + "step": 7933 + }, + { + "epoch": 0.6957465378876422, + "grad_norm": 0.048095703125, + "learning_rate": 0.0009754708091307454, + "loss": 1.1195, + "step": 7934 + }, + { + "epoch": 0.6958342296620167, + "grad_norm": 0.047119140625, + "learning_rate": 0.0009751176920259581, + "loss": 1.1517, + "step": 7935 + }, + { + "epoch": 0.6959219214363913, + "grad_norm": 0.05908203125, + "learning_rate": 0.0009747646364686956, + "loss": 1.0778, + "step": 7936 + }, + { + "epoch": 0.6960096132107658, + "grad_norm": 0.05810546875, + "learning_rate": 0.0009744116424911546, + "loss": 1.1607, + "step": 7937 + }, + { + "epoch": 0.6960973049851403, + "grad_norm": 0.043701171875, + "learning_rate": 0.0009740587101255278, + "loss": 1.1132, + "step": 7938 + }, + { + "epoch": 0.696184996759515, + "grad_norm": 0.044189453125, + "learning_rate": 0.000973705839404002, + "loss": 1.1237, + "step": 7939 + }, + { + "epoch": 0.6962726885338895, + "grad_norm": 0.045166015625, + "learning_rate": 0.0009733530303587575, + "loss": 1.1683, + "step": 7940 + }, + { + "epoch": 0.696360380308264, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009730002830219703, + "loss": 1.1732, + "step": 7941 + }, + { + "epoch": 0.6964480720826385, + "grad_norm": 0.052734375, + "learning_rate": 0.0009726475974258092, + "loss": 1.1425, + "step": 7942 + }, + { + "epoch": 0.6965357638570131, + "grad_norm": 0.042724609375, + "learning_rate": 0.0009722949736024386, + "loss": 1.1575, + "step": 7943 + }, + { + "epoch": 0.6966234556313876, + "grad_norm": 0.042724609375, + "learning_rate": 0.0009719424115840169, + "loss": 1.1174, + "step": 7944 + }, + { + "epoch": 0.6967111474057621, + "grad_norm": 0.04443359375, + "learning_rate": 0.000971589911402697, + "loss": 1.0691, + "step": 7945 + }, + { + "epoch": 0.6967988391801367, + "grad_norm": 0.043212890625, + "learning_rate": 0.0009712374730906264, + "loss": 1.1336, + "step": 7946 + }, + { + "epoch": 0.6968865309545113, + "grad_norm": 0.056884765625, + "learning_rate": 0.0009708850966799453, + "loss": 1.1688, + "step": 7947 + }, + { + "epoch": 0.6969742227288858, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009705327822027904, + "loss": 1.0877, + "step": 7948 + }, + { + "epoch": 0.6970619145032604, + "grad_norm": 0.0546875, + "learning_rate": 0.0009701805296912918, + "loss": 1.1268, + "step": 7949 + }, + { + "epoch": 0.6971496062776349, + "grad_norm": 0.059326171875, + "learning_rate": 0.0009698283391775742, + "loss": 1.1476, + "step": 7950 + }, + { + "epoch": 0.6972372980520094, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009694762106937559, + "loss": 1.099, + "step": 7951 + }, + { + "epoch": 0.697324989826384, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009691241442719503, + "loss": 1.1352, + "step": 7952 + }, + { + "epoch": 0.6974126816007585, + "grad_norm": 0.044189453125, + "learning_rate": 0.0009687721399442652, + "loss": 1.1027, + "step": 7953 + }, + { + "epoch": 0.697500373375133, + "grad_norm": 0.046142578125, + "learning_rate": 0.0009684201977428027, + "loss": 1.136, + "step": 7954 + }, + { + "epoch": 0.6975880651495077, + "grad_norm": 0.0703125, + "learning_rate": 0.0009680683176996585, + "loss": 1.1475, + "step": 7955 + }, + { + "epoch": 0.6976757569238822, + "grad_norm": 0.05126953125, + "learning_rate": 0.0009677164998469235, + "loss": 1.1675, + "step": 7956 + }, + { + "epoch": 0.6977634486982567, + "grad_norm": 0.046875, + "learning_rate": 0.000967364744216682, + "loss": 1.1738, + "step": 7957 + }, + { + "epoch": 0.6978511404726313, + "grad_norm": 0.06640625, + "learning_rate": 0.0009670130508410137, + "loss": 1.134, + "step": 7958 + }, + { + "epoch": 0.6979388322470058, + "grad_norm": 0.09765625, + "learning_rate": 0.0009666614197519926, + "loss": 1.0882, + "step": 7959 + }, + { + "epoch": 0.6980265240213803, + "grad_norm": 0.115234375, + "learning_rate": 0.0009663098509816852, + "loss": 1.1686, + "step": 7960 + }, + { + "epoch": 0.6981142157957549, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009659583445621545, + "loss": 1.1708, + "step": 7961 + }, + { + "epoch": 0.6982019075701295, + "grad_norm": 0.138671875, + "learning_rate": 0.0009656069005254571, + "loss": 1.1807, + "step": 7962 + }, + { + "epoch": 0.698289599344504, + "grad_norm": 0.083984375, + "learning_rate": 0.000965255518903644, + "loss": 1.0855, + "step": 7963 + }, + { + "epoch": 0.6983772911188786, + "grad_norm": 0.07421875, + "learning_rate": 0.0009649041997287594, + "loss": 1.1701, + "step": 7964 + }, + { + "epoch": 0.6984649828932531, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009645529430328435, + "loss": 1.11, + "step": 7965 + }, + { + "epoch": 0.6985526746676276, + "grad_norm": 0.06201171875, + "learning_rate": 0.0009642017488479296, + "loss": 1.1784, + "step": 7966 + }, + { + "epoch": 0.6986403664420021, + "grad_norm": 0.08447265625, + "learning_rate": 0.0009638506172060461, + "loss": 1.1526, + "step": 7967 + }, + { + "epoch": 0.6987280582163767, + "grad_norm": 0.064453125, + "learning_rate": 0.0009634995481392147, + "loss": 1.138, + "step": 7968 + }, + { + "epoch": 0.6988157499907512, + "grad_norm": 0.05126953125, + "learning_rate": 0.0009631485416794522, + "loss": 1.1574, + "step": 7969 + }, + { + "epoch": 0.6989034417651258, + "grad_norm": 0.04638671875, + "learning_rate": 0.0009627975978587698, + "loss": 1.1471, + "step": 7970 + }, + { + "epoch": 0.6989911335395004, + "grad_norm": 0.055419921875, + "learning_rate": 0.0009624467167091724, + "loss": 1.1316, + "step": 7971 + }, + { + "epoch": 0.6990788253138749, + "grad_norm": 0.06494140625, + "learning_rate": 0.0009620958982626598, + "loss": 1.1147, + "step": 7972 + }, + { + "epoch": 0.6991665170882494, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009617451425512258, + "loss": 1.1545, + "step": 7973 + }, + { + "epoch": 0.699254208862624, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009613944496068575, + "loss": 1.1029, + "step": 7974 + }, + { + "epoch": 0.6993419006369985, + "grad_norm": 0.045166015625, + "learning_rate": 0.0009610438194615375, + "loss": 1.1226, + "step": 7975 + }, + { + "epoch": 0.699429592411373, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009606932521472428, + "loss": 1.1437, + "step": 7976 + }, + { + "epoch": 0.6995172841857477, + "grad_norm": 0.05126953125, + "learning_rate": 0.0009603427476959449, + "loss": 1.0961, + "step": 7977 + }, + { + "epoch": 0.6996049759601222, + "grad_norm": 0.04931640625, + "learning_rate": 0.0009599923061396073, + "loss": 1.1154, + "step": 7978 + }, + { + "epoch": 0.6996926677344967, + "grad_norm": 0.054931640625, + "learning_rate": 0.00095964192751019, + "loss": 1.1959, + "step": 7979 + }, + { + "epoch": 0.6997803595088713, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009592916118396468, + "loss": 1.1459, + "step": 7980 + }, + { + "epoch": 0.6998680512832458, + "grad_norm": 0.04296875, + "learning_rate": 0.0009589413591599262, + "loss": 1.1298, + "step": 7981 + }, + { + "epoch": 0.6999557430576203, + "grad_norm": 0.07177734375, + "learning_rate": 0.0009585911695029691, + "loss": 1.1605, + "step": 7982 + }, + { + "epoch": 0.7000434348319949, + "grad_norm": 0.044189453125, + "learning_rate": 0.0009582410429007125, + "loss": 1.0921, + "step": 7983 + }, + { + "epoch": 0.7001311266063694, + "grad_norm": 0.042236328125, + "learning_rate": 0.0009578909793850871, + "loss": 1.1072, + "step": 7984 + }, + { + "epoch": 0.700218818380744, + "grad_norm": 0.048095703125, + "learning_rate": 0.0009575409789880179, + "loss": 1.1983, + "step": 7985 + }, + { + "epoch": 0.7003065101551186, + "grad_norm": 0.06396484375, + "learning_rate": 0.0009571910417414236, + "loss": 1.1749, + "step": 7986 + }, + { + "epoch": 0.7003942019294931, + "grad_norm": 0.04931640625, + "learning_rate": 0.0009568411676772178, + "loss": 1.1587, + "step": 7987 + }, + { + "epoch": 0.7004818937038676, + "grad_norm": 0.04443359375, + "learning_rate": 0.000956491356827308, + "loss": 1.1172, + "step": 7988 + }, + { + "epoch": 0.7005695854782422, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009561416092235969, + "loss": 1.1097, + "step": 7989 + }, + { + "epoch": 0.7006572772526167, + "grad_norm": 0.049560546875, + "learning_rate": 0.0009557919248979796, + "loss": 1.0974, + "step": 7990 + }, + { + "epoch": 0.7007449690269912, + "grad_norm": 0.054931640625, + "learning_rate": 0.0009554423038823464, + "loss": 1.1613, + "step": 7991 + }, + { + "epoch": 0.7008326608013657, + "grad_norm": 0.047119140625, + "learning_rate": 0.0009550927462085818, + "loss": 1.1511, + "step": 7992 + }, + { + "epoch": 0.7009203525757404, + "grad_norm": 0.0478515625, + "learning_rate": 0.0009547432519085652, + "loss": 1.1124, + "step": 7993 + }, + { + "epoch": 0.7010080443501149, + "grad_norm": 0.048583984375, + "learning_rate": 0.0009543938210141695, + "loss": 1.0719, + "step": 7994 + }, + { + "epoch": 0.7010957361244894, + "grad_norm": 0.046875, + "learning_rate": 0.0009540444535572611, + "loss": 1.1304, + "step": 7995 + }, + { + "epoch": 0.701183427898864, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009536951495697022, + "loss": 1.2113, + "step": 7996 + }, + { + "epoch": 0.7012711196732385, + "grad_norm": 0.05126953125, + "learning_rate": 0.0009533459090833479, + "loss": 1.1697, + "step": 7997 + }, + { + "epoch": 0.701358811447613, + "grad_norm": 0.045654296875, + "learning_rate": 0.0009529967321300489, + "loss": 1.0975, + "step": 7998 + }, + { + "epoch": 0.7014465032219876, + "grad_norm": 0.08154296875, + "learning_rate": 0.0009526476187416481, + "loss": 1.1787, + "step": 7999 + }, + { + "epoch": 0.7015341949963622, + "grad_norm": 0.0908203125, + "learning_rate": 0.0009522985689499842, + "loss": 1.1633, + "step": 8000 + }, + { + "epoch": 0.7015341949963622, + "eval_loss": 1.148275375366211, + "eval_runtime": 429.1936, + "eval_samples_per_second": 33.661, + "eval_steps_per_second": 8.416, + "step": 8000 + }, + { + "epoch": 0.7016218867707367, + "grad_norm": 0.0810546875, + "learning_rate": 0.0009519495827868897, + "loss": 1.113, + "step": 8001 + }, + { + "epoch": 0.7017095785451113, + "grad_norm": 0.049072265625, + "learning_rate": 0.0009516006602841915, + "loss": 1.0907, + "step": 8002 + }, + { + "epoch": 0.7017972703194858, + "grad_norm": 0.05029296875, + "learning_rate": 0.0009512518014737104, + "loss": 1.1518, + "step": 8003 + }, + { + "epoch": 0.7018849620938603, + "grad_norm": 0.10888671875, + "learning_rate": 0.0009509030063872606, + "loss": 1.1587, + "step": 8004 + }, + { + "epoch": 0.7019726538682349, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009505542750566523, + "loss": 1.1565, + "step": 8005 + }, + { + "epoch": 0.7020603456426094, + "grad_norm": 0.0478515625, + "learning_rate": 0.0009502056075136886, + "loss": 1.1193, + "step": 8006 + }, + { + "epoch": 0.702148037416984, + "grad_norm": 0.044189453125, + "learning_rate": 0.0009498570037901668, + "loss": 1.1086, + "step": 8007 + }, + { + "epoch": 0.7022357291913586, + "grad_norm": 0.0498046875, + "learning_rate": 0.0009495084639178792, + "loss": 1.1417, + "step": 8008 + }, + { + "epoch": 0.7023234209657331, + "grad_norm": 0.0458984375, + "learning_rate": 0.0009491599879286109, + "loss": 1.1422, + "step": 8009 + }, + { + "epoch": 0.7024111127401076, + "grad_norm": 0.047119140625, + "learning_rate": 0.0009488115758541424, + "loss": 1.1405, + "step": 8010 + }, + { + "epoch": 0.7024988045144822, + "grad_norm": 0.058349609375, + "learning_rate": 0.0009484632277262482, + "loss": 1.1527, + "step": 8011 + }, + { + "epoch": 0.7025864962888567, + "grad_norm": 0.047119140625, + "learning_rate": 0.000948114943576697, + "loss": 1.1399, + "step": 8012 + }, + { + "epoch": 0.7026741880632312, + "grad_norm": 0.060791015625, + "learning_rate": 0.0009477667234372506, + "loss": 1.1465, + "step": 8013 + }, + { + "epoch": 0.7027618798376057, + "grad_norm": 0.0732421875, + "learning_rate": 0.0009474185673396664, + "loss": 1.1091, + "step": 8014 + }, + { + "epoch": 0.7028495716119804, + "grad_norm": 0.046875, + "learning_rate": 0.000947070475315695, + "loss": 1.0887, + "step": 8015 + }, + { + "epoch": 0.7029372633863549, + "grad_norm": 0.045166015625, + "learning_rate": 0.0009467224473970822, + "loss": 1.1258, + "step": 8016 + }, + { + "epoch": 0.7030249551607294, + "grad_norm": 0.052734375, + "learning_rate": 0.000946374483615566, + "loss": 1.1599, + "step": 8017 + }, + { + "epoch": 0.703112646935104, + "grad_norm": 0.060302734375, + "learning_rate": 0.0009460265840028807, + "loss": 1.1551, + "step": 8018 + }, + { + "epoch": 0.7032003387094785, + "grad_norm": 0.07080078125, + "learning_rate": 0.0009456787485907534, + "loss": 1.1649, + "step": 8019 + }, + { + "epoch": 0.703288030483853, + "grad_norm": 0.062255859375, + "learning_rate": 0.0009453309774109069, + "loss": 1.1577, + "step": 8020 + }, + { + "epoch": 0.7033757222582276, + "grad_norm": 0.048583984375, + "learning_rate": 0.0009449832704950555, + "loss": 1.2459, + "step": 8021 + }, + { + "epoch": 0.7034634140326022, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009446356278749097, + "loss": 1.1152, + "step": 8022 + }, + { + "epoch": 0.7035511058069767, + "grad_norm": 0.049560546875, + "learning_rate": 0.0009442880495821742, + "loss": 1.146, + "step": 8023 + }, + { + "epoch": 0.7036387975813513, + "grad_norm": 0.049072265625, + "learning_rate": 0.0009439405356485468, + "loss": 1.1322, + "step": 8024 + }, + { + "epoch": 0.7037264893557258, + "grad_norm": 0.04345703125, + "learning_rate": 0.0009435930861057199, + "loss": 1.1139, + "step": 8025 + }, + { + "epoch": 0.7038141811301003, + "grad_norm": 0.048095703125, + "learning_rate": 0.0009432457009853794, + "loss": 1.2094, + "step": 8026 + }, + { + "epoch": 0.7039018729044749, + "grad_norm": 0.044677734375, + "learning_rate": 0.000942898380319207, + "loss": 1.1713, + "step": 8027 + }, + { + "epoch": 0.7039895646788494, + "grad_norm": 0.0419921875, + "learning_rate": 0.0009425511241388765, + "loss": 1.1217, + "step": 8028 + }, + { + "epoch": 0.7040772564532239, + "grad_norm": 0.04150390625, + "learning_rate": 0.0009422039324760573, + "loss": 1.1332, + "step": 8029 + }, + { + "epoch": 0.7041649482275986, + "grad_norm": 0.049560546875, + "learning_rate": 0.0009418568053624129, + "loss": 1.1754, + "step": 8030 + }, + { + "epoch": 0.7042526400019731, + "grad_norm": 0.0478515625, + "learning_rate": 0.000941509742829599, + "loss": 1.1638, + "step": 8031 + }, + { + "epoch": 0.7043403317763476, + "grad_norm": 0.048095703125, + "learning_rate": 0.0009411627449092679, + "loss": 1.2097, + "step": 8032 + }, + { + "epoch": 0.7044280235507222, + "grad_norm": 0.056884765625, + "learning_rate": 0.0009408158116330645, + "loss": 1.2393, + "step": 8033 + }, + { + "epoch": 0.7045157153250967, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009404689430326288, + "loss": 1.1448, + "step": 8034 + }, + { + "epoch": 0.7046034070994712, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009401221391395935, + "loss": 1.1948, + "step": 8035 + }, + { + "epoch": 0.7046910988738458, + "grad_norm": 0.044921875, + "learning_rate": 0.0009397753999855864, + "loss": 1.1226, + "step": 8036 + }, + { + "epoch": 0.7047787906482204, + "grad_norm": 0.04296875, + "learning_rate": 0.0009394287256022294, + "loss": 1.1006, + "step": 8037 + }, + { + "epoch": 0.7048664824225949, + "grad_norm": 0.0517578125, + "learning_rate": 0.0009390821160211388, + "loss": 1.176, + "step": 8038 + }, + { + "epoch": 0.7049541741969694, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009387355712739236, + "loss": 1.0653, + "step": 8039 + }, + { + "epoch": 0.705041865971344, + "grad_norm": 0.04248046875, + "learning_rate": 0.0009383890913921885, + "loss": 1.1019, + "step": 8040 + }, + { + "epoch": 0.7051295577457185, + "grad_norm": 0.04931640625, + "learning_rate": 0.0009380426764075312, + "loss": 1.1791, + "step": 8041 + }, + { + "epoch": 0.705217249520093, + "grad_norm": 0.05126953125, + "learning_rate": 0.0009376963263515432, + "loss": 1.1534, + "step": 8042 + }, + { + "epoch": 0.7053049412944676, + "grad_norm": 0.046630859375, + "learning_rate": 0.0009373500412558124, + "loss": 1.0992, + "step": 8043 + }, + { + "epoch": 0.7053926330688421, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009370038211519177, + "loss": 1.1575, + "step": 8044 + }, + { + "epoch": 0.7054803248432167, + "grad_norm": 0.043701171875, + "learning_rate": 0.0009366576660714337, + "loss": 1.1553, + "step": 8045 + }, + { + "epoch": 0.7055680166175913, + "grad_norm": 0.053466796875, + "learning_rate": 0.0009363115760459292, + "loss": 1.2077, + "step": 8046 + }, + { + "epoch": 0.7056557083919658, + "grad_norm": 0.04296875, + "learning_rate": 0.000935965551106967, + "loss": 1.1095, + "step": 8047 + }, + { + "epoch": 0.7057434001663403, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009356195912861026, + "loss": 1.161, + "step": 8048 + }, + { + "epoch": 0.7058310919407149, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009352736966148875, + "loss": 1.1469, + "step": 8049 + }, + { + "epoch": 0.7059187837150894, + "grad_norm": 0.046875, + "learning_rate": 0.0009349278671248664, + "loss": 1.1284, + "step": 8050 + }, + { + "epoch": 0.7060064754894639, + "grad_norm": 0.04638671875, + "learning_rate": 0.0009345821028475783, + "loss": 1.1485, + "step": 8051 + }, + { + "epoch": 0.7060941672638386, + "grad_norm": 0.05029296875, + "learning_rate": 0.0009342364038145549, + "loss": 1.111, + "step": 8052 + }, + { + "epoch": 0.7061818590382131, + "grad_norm": 0.053955078125, + "learning_rate": 0.0009338907700573236, + "loss": 1.1661, + "step": 8053 + }, + { + "epoch": 0.7062695508125876, + "grad_norm": 0.0546875, + "learning_rate": 0.0009335452016074058, + "loss": 1.1296, + "step": 8054 + }, + { + "epoch": 0.7063572425869622, + "grad_norm": 0.060546875, + "learning_rate": 0.0009331996984963158, + "loss": 1.1191, + "step": 8055 + }, + { + "epoch": 0.7064449343613367, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009328542607555637, + "loss": 1.169, + "step": 8056 + }, + { + "epoch": 0.7065326261357112, + "grad_norm": 0.046875, + "learning_rate": 0.0009325088884166514, + "loss": 1.1024, + "step": 8057 + }, + { + "epoch": 0.7066203179100858, + "grad_norm": 0.05908203125, + "learning_rate": 0.000932163581511076, + "loss": 1.1052, + "step": 8058 + }, + { + "epoch": 0.7067080096844603, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009318183400703287, + "loss": 1.1811, + "step": 8059 + }, + { + "epoch": 0.7067957014588349, + "grad_norm": 0.053955078125, + "learning_rate": 0.0009314731641258951, + "loss": 1.1666, + "step": 8060 + }, + { + "epoch": 0.7068833932332095, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009311280537092543, + "loss": 1.1674, + "step": 8061 + }, + { + "epoch": 0.706971085007584, + "grad_norm": 0.05126953125, + "learning_rate": 0.000930783008851879, + "loss": 1.1915, + "step": 8062 + }, + { + "epoch": 0.7070587767819585, + "grad_norm": 0.07568359375, + "learning_rate": 0.0009304380295852364, + "loss": 1.1089, + "step": 8063 + }, + { + "epoch": 0.707146468556333, + "grad_norm": 0.0732421875, + "learning_rate": 0.000930093115940788, + "loss": 1.1882, + "step": 8064 + }, + { + "epoch": 0.7072341603307076, + "grad_norm": 0.04541015625, + "learning_rate": 0.0009297482679499894, + "loss": 1.157, + "step": 8065 + }, + { + "epoch": 0.7073218521050821, + "grad_norm": 0.047607421875, + "learning_rate": 0.0009294034856442891, + "loss": 1.0451, + "step": 8066 + }, + { + "epoch": 0.7074095438794566, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009290587690551309, + "loss": 1.2029, + "step": 8067 + }, + { + "epoch": 0.7074972356538313, + "grad_norm": 0.046630859375, + "learning_rate": 0.0009287141182139516, + "loss": 1.122, + "step": 8068 + }, + { + "epoch": 0.7075849274282058, + "grad_norm": 0.049560546875, + "learning_rate": 0.0009283695331521831, + "loss": 1.1352, + "step": 8069 + }, + { + "epoch": 0.7076726192025803, + "grad_norm": 0.046142578125, + "learning_rate": 0.00092802501390125, + "loss": 1.1998, + "step": 8070 + }, + { + "epoch": 0.7077603109769549, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009276805604925718, + "loss": 1.0744, + "step": 8071 + }, + { + "epoch": 0.7078480027513294, + "grad_norm": 0.043212890625, + "learning_rate": 0.0009273361729575618, + "loss": 1.1178, + "step": 8072 + }, + { + "epoch": 0.7079356945257039, + "grad_norm": 0.043701171875, + "learning_rate": 0.0009269918513276278, + "loss": 1.1662, + "step": 8073 + }, + { + "epoch": 0.7080233863000785, + "grad_norm": 0.04736328125, + "learning_rate": 0.0009266475956341707, + "loss": 1.132, + "step": 8074 + }, + { + "epoch": 0.7081110780744531, + "grad_norm": 0.0458984375, + "learning_rate": 0.0009263034059085848, + "loss": 1.144, + "step": 8075 + }, + { + "epoch": 0.7081987698488276, + "grad_norm": 0.047119140625, + "learning_rate": 0.0009259592821822603, + "loss": 1.1731, + "step": 8076 + }, + { + "epoch": 0.7082864616232022, + "grad_norm": 0.05029296875, + "learning_rate": 0.0009256152244865801, + "loss": 1.1816, + "step": 8077 + }, + { + "epoch": 0.7083741533975767, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009252712328529216, + "loss": 1.1394, + "step": 8078 + }, + { + "epoch": 0.7084618451719512, + "grad_norm": 0.047607421875, + "learning_rate": 0.0009249273073126556, + "loss": 1.1457, + "step": 8079 + }, + { + "epoch": 0.7085495369463258, + "grad_norm": 0.0751953125, + "learning_rate": 0.0009245834478971474, + "loss": 1.1595, + "step": 8080 + }, + { + "epoch": 0.7086372287207003, + "grad_norm": 0.04541015625, + "learning_rate": 0.0009242396546377561, + "loss": 1.1558, + "step": 8081 + }, + { + "epoch": 0.7087249204950749, + "grad_norm": 0.04736328125, + "learning_rate": 0.0009238959275658348, + "loss": 1.1925, + "step": 8082 + }, + { + "epoch": 0.7088126122694495, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009235522667127307, + "loss": 1.16, + "step": 8083 + }, + { + "epoch": 0.708900304043824, + "grad_norm": 0.04443359375, + "learning_rate": 0.0009232086721097844, + "loss": 1.1077, + "step": 8084 + }, + { + "epoch": 0.7089879958181985, + "grad_norm": 0.04248046875, + "learning_rate": 0.0009228651437883308, + "loss": 1.1019, + "step": 8085 + }, + { + "epoch": 0.709075687592573, + "grad_norm": 0.04638671875, + "learning_rate": 0.0009225216817796989, + "loss": 1.1505, + "step": 8086 + }, + { + "epoch": 0.7091633793669476, + "grad_norm": 0.04931640625, + "learning_rate": 0.0009221782861152125, + "loss": 1.0998, + "step": 8087 + }, + { + "epoch": 0.7092510711413221, + "grad_norm": 0.049072265625, + "learning_rate": 0.0009218349568261868, + "loss": 1.1861, + "step": 8088 + }, + { + "epoch": 0.7093387629156966, + "grad_norm": 0.044677734375, + "learning_rate": 0.0009214916939439335, + "loss": 1.1069, + "step": 8089 + }, + { + "epoch": 0.7094264546900713, + "grad_norm": 0.04638671875, + "learning_rate": 0.0009211484974997568, + "loss": 1.1707, + "step": 8090 + }, + { + "epoch": 0.7095141464644458, + "grad_norm": 0.049560546875, + "learning_rate": 0.000920805367524956, + "loss": 1.0944, + "step": 8091 + }, + { + "epoch": 0.7096018382388203, + "grad_norm": 0.046875, + "learning_rate": 0.0009204623040508237, + "loss": 1.1431, + "step": 8092 + }, + { + "epoch": 0.7096895300131949, + "grad_norm": 0.048095703125, + "learning_rate": 0.0009201193071086453, + "loss": 1.1128, + "step": 8093 + }, + { + "epoch": 0.7097772217875694, + "grad_norm": 0.043212890625, + "learning_rate": 0.0009197763767297018, + "loss": 1.1169, + "step": 8094 + }, + { + "epoch": 0.7098649135619439, + "grad_norm": 0.045166015625, + "learning_rate": 0.0009194335129452678, + "loss": 1.1213, + "step": 8095 + }, + { + "epoch": 0.7099526053363185, + "grad_norm": 0.048095703125, + "learning_rate": 0.000919090715786612, + "loss": 1.1245, + "step": 8096 + }, + { + "epoch": 0.710040297110693, + "grad_norm": 0.04736328125, + "learning_rate": 0.0009187479852849954, + "loss": 1.1705, + "step": 8097 + }, + { + "epoch": 0.7101279888850676, + "grad_norm": 0.049560546875, + "learning_rate": 0.0009184053214716751, + "loss": 1.1951, + "step": 8098 + }, + { + "epoch": 0.7102156806594422, + "grad_norm": 0.04248046875, + "learning_rate": 0.0009180627243779006, + "loss": 1.1532, + "step": 8099 + }, + { + "epoch": 0.7103033724338167, + "grad_norm": 0.057373046875, + "learning_rate": 0.0009177201940349168, + "loss": 1.1359, + "step": 8100 + }, + { + "epoch": 0.7103910642081912, + "grad_norm": 0.05517578125, + "learning_rate": 0.0009173777304739603, + "loss": 1.1392, + "step": 8101 + }, + { + "epoch": 0.7104787559825658, + "grad_norm": 0.04345703125, + "learning_rate": 0.0009170353337262638, + "loss": 1.1034, + "step": 8102 + }, + { + "epoch": 0.7105664477569403, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009166930038230526, + "loss": 1.1511, + "step": 8103 + }, + { + "epoch": 0.7106541395313148, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009163507407955469, + "loss": 1.1505, + "step": 8104 + }, + { + "epoch": 0.7107418313056895, + "grad_norm": 0.046875, + "learning_rate": 0.0009160085446749592, + "loss": 1.2235, + "step": 8105 + }, + { + "epoch": 0.710829523080064, + "grad_norm": 0.048095703125, + "learning_rate": 0.0009156664154924974, + "loss": 1.1778, + "step": 8106 + }, + { + "epoch": 0.7109172148544385, + "grad_norm": 0.05810546875, + "learning_rate": 0.0009153243532793629, + "loss": 1.1308, + "step": 8107 + }, + { + "epoch": 0.7110049066288131, + "grad_norm": 0.04541015625, + "learning_rate": 0.0009149823580667515, + "loss": 1.1633, + "step": 8108 + }, + { + "epoch": 0.7110925984031876, + "grad_norm": 0.047607421875, + "learning_rate": 0.0009146404298858508, + "loss": 1.1336, + "step": 8109 + }, + { + "epoch": 0.7111802901775621, + "grad_norm": 0.052734375, + "learning_rate": 0.0009142985687678451, + "loss": 1.0819, + "step": 8110 + }, + { + "epoch": 0.7112679819519366, + "grad_norm": 0.045654296875, + "learning_rate": 0.0009139567747439105, + "loss": 1.101, + "step": 8111 + }, + { + "epoch": 0.7113556737263113, + "grad_norm": 0.0498046875, + "learning_rate": 0.0009136150478452177, + "loss": 1.1218, + "step": 8112 + }, + { + "epoch": 0.7114433655006858, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009132733881029317, + "loss": 1.1529, + "step": 8113 + }, + { + "epoch": 0.7115310572750603, + "grad_norm": 0.07275390625, + "learning_rate": 0.0009129317955482114, + "loss": 1.1269, + "step": 8114 + }, + { + "epoch": 0.7116187490494349, + "grad_norm": 0.04638671875, + "learning_rate": 0.000912590270212208, + "loss": 1.1572, + "step": 8115 + }, + { + "epoch": 0.7117064408238094, + "grad_norm": 0.043701171875, + "learning_rate": 0.0009122488121260683, + "loss": 1.1743, + "step": 8116 + }, + { + "epoch": 0.7117941325981839, + "grad_norm": 0.041015625, + "learning_rate": 0.0009119074213209327, + "loss": 1.0585, + "step": 8117 + }, + { + "epoch": 0.7118818243725585, + "grad_norm": 0.048583984375, + "learning_rate": 0.000911566097827935, + "loss": 1.1368, + "step": 8118 + }, + { + "epoch": 0.711969516146933, + "grad_norm": 0.05615234375, + "learning_rate": 0.0009112248416782027, + "loss": 1.1222, + "step": 8119 + }, + { + "epoch": 0.7120572079213076, + "grad_norm": 0.05517578125, + "learning_rate": 0.0009108836529028577, + "loss": 1.1348, + "step": 8120 + }, + { + "epoch": 0.7121448996956822, + "grad_norm": 0.043212890625, + "learning_rate": 0.0009105425315330153, + "loss": 1.1329, + "step": 8121 + }, + { + "epoch": 0.7122325914700567, + "grad_norm": 0.0625, + "learning_rate": 0.0009102014775997859, + "loss": 1.1363, + "step": 8122 + }, + { + "epoch": 0.7123202832444312, + "grad_norm": 0.051025390625, + "learning_rate": 0.0009098604911342713, + "loss": 1.174, + "step": 8123 + }, + { + "epoch": 0.7124079750188058, + "grad_norm": 0.046630859375, + "learning_rate": 0.0009095195721675692, + "loss": 1.1272, + "step": 8124 + }, + { + "epoch": 0.7124956667931803, + "grad_norm": 0.045166015625, + "learning_rate": 0.0009091787207307712, + "loss": 1.1392, + "step": 8125 + }, + { + "epoch": 0.7125833585675548, + "grad_norm": 0.0546875, + "learning_rate": 0.0009088379368549609, + "loss": 1.1088, + "step": 8126 + }, + { + "epoch": 0.7126710503419295, + "grad_norm": 0.04541015625, + "learning_rate": 0.0009084972205712181, + "loss": 1.1153, + "step": 8127 + }, + { + "epoch": 0.712758742116304, + "grad_norm": 0.044677734375, + "learning_rate": 0.0009081565719106139, + "loss": 1.1241, + "step": 8128 + }, + { + "epoch": 0.7128464338906785, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009078159909042154, + "loss": 1.1815, + "step": 8129 + }, + { + "epoch": 0.7129341256650531, + "grad_norm": 0.049560546875, + "learning_rate": 0.0009074754775830826, + "loss": 1.1789, + "step": 8130 + }, + { + "epoch": 0.7130218174394276, + "grad_norm": 0.047119140625, + "learning_rate": 0.00090713503197827, + "loss": 1.1215, + "step": 8131 + }, + { + "epoch": 0.7131095092138021, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009067946541208241, + "loss": 1.1278, + "step": 8132 + }, + { + "epoch": 0.7131972009881766, + "grad_norm": 0.064453125, + "learning_rate": 0.0009064543440417873, + "loss": 1.1864, + "step": 8133 + }, + { + "epoch": 0.7132848927625512, + "grad_norm": 0.043701171875, + "learning_rate": 0.0009061141017721948, + "loss": 1.1689, + "step": 8134 + }, + { + "epoch": 0.7133725845369258, + "grad_norm": 0.0498046875, + "learning_rate": 0.0009057739273430759, + "loss": 1.1247, + "step": 8135 + }, + { + "epoch": 0.7134602763113003, + "grad_norm": 0.05029296875, + "learning_rate": 0.0009054338207854541, + "loss": 1.0898, + "step": 8136 + }, + { + "epoch": 0.7135479680856749, + "grad_norm": 0.048095703125, + "learning_rate": 0.0009050937821303452, + "loss": 1.0819, + "step": 8137 + }, + { + "epoch": 0.7136356598600494, + "grad_norm": 0.0478515625, + "learning_rate": 0.0009047538114087606, + "loss": 1.1521, + "step": 8138 + }, + { + "epoch": 0.7137233516344239, + "grad_norm": 0.047607421875, + "learning_rate": 0.0009044139086517044, + "loss": 1.1245, + "step": 8139 + }, + { + "epoch": 0.7138110434087985, + "grad_norm": 0.048828125, + "learning_rate": 0.0009040740738901755, + "loss": 1.1293, + "step": 8140 + }, + { + "epoch": 0.713898735183173, + "grad_norm": 0.04541015625, + "learning_rate": 0.000903734307155165, + "loss": 1.0882, + "step": 8141 + }, + { + "epoch": 0.7139864269575475, + "grad_norm": 0.043701171875, + "learning_rate": 0.0009033946084776598, + "loss": 1.059, + "step": 8142 + }, + { + "epoch": 0.7140741187319222, + "grad_norm": 0.06982421875, + "learning_rate": 0.0009030549778886385, + "loss": 1.1671, + "step": 8143 + }, + { + "epoch": 0.7141618105062967, + "grad_norm": 0.0498046875, + "learning_rate": 0.0009027154154190748, + "loss": 1.0945, + "step": 8144 + }, + { + "epoch": 0.7142495022806712, + "grad_norm": 0.044677734375, + "learning_rate": 0.0009023759210999367, + "loss": 1.1182, + "step": 8145 + }, + { + "epoch": 0.7143371940550458, + "grad_norm": 0.048583984375, + "learning_rate": 0.0009020364949621839, + "loss": 1.1655, + "step": 8146 + }, + { + "epoch": 0.7144248858294203, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009016971370367721, + "loss": 1.1338, + "step": 8147 + }, + { + "epoch": 0.7145125776037948, + "grad_norm": 0.046142578125, + "learning_rate": 0.0009013578473546498, + "loss": 1.1451, + "step": 8148 + }, + { + "epoch": 0.7146002693781695, + "grad_norm": 0.044189453125, + "learning_rate": 0.0009010186259467594, + "loss": 1.1313, + "step": 8149 + }, + { + "epoch": 0.714687961152544, + "grad_norm": 0.044921875, + "learning_rate": 0.0009006794728440365, + "loss": 1.0902, + "step": 8150 + }, + { + "epoch": 0.7147756529269185, + "grad_norm": 0.045654296875, + "learning_rate": 0.0009003403880774114, + "loss": 1.1349, + "step": 8151 + }, + { + "epoch": 0.7148633447012931, + "grad_norm": 0.0517578125, + "learning_rate": 0.0009000013716778074, + "loss": 1.061, + "step": 8152 + }, + { + "epoch": 0.7149510364756676, + "grad_norm": 0.04443359375, + "learning_rate": 0.0008996624236761431, + "loss": 1.1497, + "step": 8153 + }, + { + "epoch": 0.7150387282500421, + "grad_norm": 0.059814453125, + "learning_rate": 0.0008993235441033282, + "loss": 1.1541, + "step": 8154 + }, + { + "epoch": 0.7151264200244167, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008989847329902678, + "loss": 1.1061, + "step": 8155 + }, + { + "epoch": 0.7152141117987912, + "grad_norm": 0.05517578125, + "learning_rate": 0.0008986459903678615, + "loss": 1.1606, + "step": 8156 + }, + { + "epoch": 0.7153018035731658, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008983073162670016, + "loss": 1.1395, + "step": 8157 + }, + { + "epoch": 0.7153894953475403, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008979687107185733, + "loss": 1.1943, + "step": 8158 + }, + { + "epoch": 0.7154771871219149, + "grad_norm": 0.06689453125, + "learning_rate": 0.0008976301737534579, + "loss": 1.1356, + "step": 8159 + }, + { + "epoch": 0.7155648788962894, + "grad_norm": 0.0771484375, + "learning_rate": 0.0008972917054025278, + "loss": 1.1445, + "step": 8160 + }, + { + "epoch": 0.7156525706706639, + "grad_norm": 0.050537109375, + "learning_rate": 0.0008969533056966512, + "loss": 1.1333, + "step": 8161 + }, + { + "epoch": 0.7157402624450385, + "grad_norm": 0.04150390625, + "learning_rate": 0.000896614974666689, + "loss": 1.0955, + "step": 8162 + }, + { + "epoch": 0.715827954219413, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008962767123434967, + "loss": 1.1486, + "step": 8163 + }, + { + "epoch": 0.7159156459937875, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008959385187579222, + "loss": 1.0904, + "step": 8164 + }, + { + "epoch": 0.7160033377681622, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008956003939408082, + "loss": 1.1531, + "step": 8165 + }, + { + "epoch": 0.7160910295425367, + "grad_norm": 0.04833984375, + "learning_rate": 0.0008952623379229907, + "loss": 1.1658, + "step": 8166 + }, + { + "epoch": 0.7161787213169112, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008949243507353002, + "loss": 1.1237, + "step": 8167 + }, + { + "epoch": 0.7162664130912858, + "grad_norm": 0.047607421875, + "learning_rate": 0.0008945864324085591, + "loss": 1.1002, + "step": 8168 + }, + { + "epoch": 0.7163541048656603, + "grad_norm": 0.0498046875, + "learning_rate": 0.0008942485829735853, + "loss": 1.1475, + "step": 8169 + }, + { + "epoch": 0.7164417966400348, + "grad_norm": 0.0498046875, + "learning_rate": 0.0008939108024611899, + "loss": 1.1785, + "step": 8170 + }, + { + "epoch": 0.7165294884144094, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008935730909021779, + "loss": 1.0951, + "step": 8171 + }, + { + "epoch": 0.716617180188784, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008932354483273467, + "loss": 1.1179, + "step": 8172 + }, + { + "epoch": 0.7167048719631585, + "grad_norm": 0.08740234375, + "learning_rate": 0.0008928978747674892, + "loss": 1.1436, + "step": 8173 + }, + { + "epoch": 0.7167925637375331, + "grad_norm": 0.06298828125, + "learning_rate": 0.000892560370253391, + "loss": 1.1567, + "step": 8174 + }, + { + "epoch": 0.7168802555119076, + "grad_norm": 0.04833984375, + "learning_rate": 0.0008922229348158322, + "loss": 1.124, + "step": 8175 + }, + { + "epoch": 0.7169679472862821, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008918855684855856, + "loss": 1.111, + "step": 8176 + }, + { + "epoch": 0.7170556390606567, + "grad_norm": 0.051513671875, + "learning_rate": 0.0008915482712934176, + "loss": 1.1377, + "step": 8177 + }, + { + "epoch": 0.7171433308350312, + "grad_norm": 0.058837890625, + "learning_rate": 0.0008912110432700892, + "loss": 1.1149, + "step": 8178 + }, + { + "epoch": 0.7172310226094057, + "grad_norm": 0.057861328125, + "learning_rate": 0.0008908738844463548, + "loss": 1.1796, + "step": 8179 + }, + { + "epoch": 0.7173187143837804, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008905367948529632, + "loss": 1.1376, + "step": 8180 + }, + { + "epoch": 0.7174064061581549, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008901997745206547, + "loss": 1.1642, + "step": 8181 + }, + { + "epoch": 0.7174940979325294, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008898628234801653, + "loss": 1.1191, + "step": 8182 + }, + { + "epoch": 0.7175817897069039, + "grad_norm": 0.07373046875, + "learning_rate": 0.0008895259417622244, + "loss": 1.1662, + "step": 8183 + }, + { + "epoch": 0.7176694814812785, + "grad_norm": 0.08447265625, + "learning_rate": 0.0008891891293975548, + "loss": 1.1617, + "step": 8184 + }, + { + "epoch": 0.717757173255653, + "grad_norm": 0.08251953125, + "learning_rate": 0.0008888523864168718, + "loss": 1.1331, + "step": 8185 + }, + { + "epoch": 0.7178448650300275, + "grad_norm": 0.05078125, + "learning_rate": 0.0008885157128508867, + "loss": 1.2114, + "step": 8186 + }, + { + "epoch": 0.7179325568044022, + "grad_norm": 0.07958984375, + "learning_rate": 0.0008881791087303027, + "loss": 1.1043, + "step": 8187 + }, + { + "epoch": 0.7180202485787767, + "grad_norm": 0.091796875, + "learning_rate": 0.0008878425740858172, + "loss": 1.1973, + "step": 8188 + }, + { + "epoch": 0.7181079403531512, + "grad_norm": 0.07177734375, + "learning_rate": 0.000887506108948122, + "loss": 1.1264, + "step": 8189 + }, + { + "epoch": 0.7181956321275258, + "grad_norm": 0.04736328125, + "learning_rate": 0.0008871697133479008, + "loss": 1.1412, + "step": 8190 + }, + { + "epoch": 0.7182833239019003, + "grad_norm": 0.05078125, + "learning_rate": 0.0008868333873158326, + "loss": 1.1072, + "step": 8191 + }, + { + "epoch": 0.7183710156762748, + "grad_norm": 0.057861328125, + "learning_rate": 0.0008864971308825894, + "loss": 1.1245, + "step": 8192 + }, + { + "epoch": 0.7184587074506494, + "grad_norm": 0.078125, + "learning_rate": 0.0008861609440788372, + "loss": 1.1238, + "step": 8193 + }, + { + "epoch": 0.718546399225024, + "grad_norm": 0.052734375, + "learning_rate": 0.0008858248269352352, + "loss": 1.1278, + "step": 8194 + }, + { + "epoch": 0.7186340909993985, + "grad_norm": 0.048583984375, + "learning_rate": 0.0008854887794824357, + "loss": 1.1412, + "step": 8195 + }, + { + "epoch": 0.7187217827737731, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008851528017510859, + "loss": 1.137, + "step": 8196 + }, + { + "epoch": 0.7188094745481476, + "grad_norm": 0.05810546875, + "learning_rate": 0.0008848168937718263, + "loss": 1.1153, + "step": 8197 + }, + { + "epoch": 0.7188971663225221, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008844810555752912, + "loss": 1.1245, + "step": 8198 + }, + { + "epoch": 0.7189848580968967, + "grad_norm": 0.06201171875, + "learning_rate": 0.0008841452871921069, + "loss": 1.1581, + "step": 8199 + }, + { + "epoch": 0.7190725498712712, + "grad_norm": 0.04833984375, + "learning_rate": 0.0008838095886528956, + "loss": 1.1094, + "step": 8200 + }, + { + "epoch": 0.7191602416456457, + "grad_norm": 0.047119140625, + "learning_rate": 0.0008834739599882718, + "loss": 1.1283, + "step": 8201 + }, + { + "epoch": 0.7192479334200204, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008831384012288445, + "loss": 1.1577, + "step": 8202 + }, + { + "epoch": 0.7193356251943949, + "grad_norm": 0.042236328125, + "learning_rate": 0.000882802912405215, + "loss": 1.1222, + "step": 8203 + }, + { + "epoch": 0.7194233169687694, + "grad_norm": 0.05810546875, + "learning_rate": 0.0008824674935479792, + "loss": 1.1365, + "step": 8204 + }, + { + "epoch": 0.7195110087431439, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008821321446877269, + "loss": 1.119, + "step": 8205 + }, + { + "epoch": 0.7195987005175185, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008817968658550411, + "loss": 1.1557, + "step": 8206 + }, + { + "epoch": 0.719686392291893, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008814616570804974, + "loss": 1.0611, + "step": 8207 + }, + { + "epoch": 0.7197740840662675, + "grad_norm": 0.046875, + "learning_rate": 0.0008811265183946669, + "loss": 1.1068, + "step": 8208 + }, + { + "epoch": 0.7198617758406421, + "grad_norm": 0.05029296875, + "learning_rate": 0.0008807914498281131, + "loss": 1.1046, + "step": 8209 + }, + { + "epoch": 0.7199494676150167, + "grad_norm": 0.04443359375, + "learning_rate": 0.0008804564514113935, + "loss": 1.1496, + "step": 8210 + }, + { + "epoch": 0.7200371593893912, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008801215231750592, + "loss": 1.0697, + "step": 8211 + }, + { + "epoch": 0.7201248511637658, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008797866651496541, + "loss": 1.1424, + "step": 8212 + }, + { + "epoch": 0.7202125429381403, + "grad_norm": 0.04541015625, + "learning_rate": 0.0008794518773657166, + "loss": 1.1677, + "step": 8213 + }, + { + "epoch": 0.7203002347125148, + "grad_norm": 0.06494140625, + "learning_rate": 0.0008791171598537791, + "loss": 1.1895, + "step": 8214 + }, + { + "epoch": 0.7203879264868894, + "grad_norm": 0.044921875, + "learning_rate": 0.0008787825126443666, + "loss": 1.1127, + "step": 8215 + }, + { + "epoch": 0.7204756182612639, + "grad_norm": 0.049072265625, + "learning_rate": 0.000878447935767998, + "loss": 1.1392, + "step": 8216 + }, + { + "epoch": 0.7205633100356384, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008781134292551858, + "loss": 1.0924, + "step": 8217 + }, + { + "epoch": 0.7206510018100131, + "grad_norm": 0.07373046875, + "learning_rate": 0.0008777789931364363, + "loss": 1.1449, + "step": 8218 + }, + { + "epoch": 0.7207386935843876, + "grad_norm": 0.040771484375, + "learning_rate": 0.000877444627442249, + "loss": 1.1213, + "step": 8219 + }, + { + "epoch": 0.7208263853587621, + "grad_norm": 0.060302734375, + "learning_rate": 0.0008771103322031177, + "loss": 1.1579, + "step": 8220 + }, + { + "epoch": 0.7209140771331367, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008767761074495287, + "loss": 1.1059, + "step": 8221 + }, + { + "epoch": 0.7210017689075112, + "grad_norm": 0.043701171875, + "learning_rate": 0.0008764419532119625, + "loss": 1.1148, + "step": 8222 + }, + { + "epoch": 0.7210894606818857, + "grad_norm": 0.04443359375, + "learning_rate": 0.0008761078695208931, + "loss": 1.0967, + "step": 8223 + }, + { + "epoch": 0.7211771524562604, + "grad_norm": 0.056640625, + "learning_rate": 0.0008757738564067888, + "loss": 1.1421, + "step": 8224 + }, + { + "epoch": 0.7212648442306349, + "grad_norm": 0.052490234375, + "learning_rate": 0.0008754399139001098, + "loss": 1.1592, + "step": 8225 + }, + { + "epoch": 0.7213525360050094, + "grad_norm": 0.05126953125, + "learning_rate": 0.000875106042031311, + "loss": 1.1389, + "step": 8226 + }, + { + "epoch": 0.721440227779384, + "grad_norm": 0.043701171875, + "learning_rate": 0.0008747722408308413, + "loss": 1.1119, + "step": 8227 + }, + { + "epoch": 0.7215279195537585, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008744385103291415, + "loss": 1.0484, + "step": 8228 + }, + { + "epoch": 0.721615611328133, + "grad_norm": 0.047607421875, + "learning_rate": 0.000874104850556648, + "loss": 1.1597, + "step": 8229 + }, + { + "epoch": 0.7217033031025075, + "grad_norm": 0.04443359375, + "learning_rate": 0.0008737712615437887, + "loss": 1.2046, + "step": 8230 + }, + { + "epoch": 0.7217909948768821, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008734377433209864, + "loss": 1.109, + "step": 8231 + }, + { + "epoch": 0.7218786866512567, + "grad_norm": 0.068359375, + "learning_rate": 0.0008731042959186573, + "loss": 1.1816, + "step": 8232 + }, + { + "epoch": 0.7219663784256312, + "grad_norm": 0.051513671875, + "learning_rate": 0.0008727709193672113, + "loss": 1.1684, + "step": 8233 + }, + { + "epoch": 0.7220540702000058, + "grad_norm": 0.04248046875, + "learning_rate": 0.0008724376136970505, + "loss": 1.1199, + "step": 8234 + }, + { + "epoch": 0.7221417619743803, + "grad_norm": 0.048583984375, + "learning_rate": 0.0008721043789385723, + "loss": 1.1543, + "step": 8235 + }, + { + "epoch": 0.7222294537487548, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008717712151221663, + "loss": 1.1432, + "step": 8236 + }, + { + "epoch": 0.7223171455231294, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008714381222782172, + "loss": 1.1117, + "step": 8237 + }, + { + "epoch": 0.7224048372975039, + "grad_norm": 0.052490234375, + "learning_rate": 0.000871105100437101, + "loss": 1.1142, + "step": 8238 + }, + { + "epoch": 0.7224925290718784, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008707721496291888, + "loss": 1.1357, + "step": 8239 + }, + { + "epoch": 0.7225802208462531, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008704392698848451, + "loss": 1.1223, + "step": 8240 + }, + { + "epoch": 0.7226679126206276, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008701064612344277, + "loss": 1.107, + "step": 8241 + }, + { + "epoch": 0.7227556043950021, + "grad_norm": 0.048095703125, + "learning_rate": 0.000869773723708288, + "loss": 1.157, + "step": 8242 + }, + { + "epoch": 0.7228432961693767, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008694410573367703, + "loss": 1.1855, + "step": 8243 + }, + { + "epoch": 0.7229309879437512, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008691084621502134, + "loss": 1.141, + "step": 8244 + }, + { + "epoch": 0.7230186797181257, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008687759381789485, + "loss": 1.105, + "step": 8245 + }, + { + "epoch": 0.7231063714925003, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008684434854533014, + "loss": 1.1051, + "step": 8246 + }, + { + "epoch": 0.7231940632668749, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008681111040035913, + "loss": 1.114, + "step": 8247 + }, + { + "epoch": 0.7232817550412494, + "grad_norm": 0.04833984375, + "learning_rate": 0.0008677787938601296, + "loss": 1.1324, + "step": 8248 + }, + { + "epoch": 0.723369446815624, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008674465550532224, + "loss": 1.1451, + "step": 8249 + }, + { + "epoch": 0.7234571385899985, + "grad_norm": 0.052490234375, + "learning_rate": 0.0008671143876131694, + "loss": 1.089, + "step": 8250 + }, + { + "epoch": 0.723544830364373, + "grad_norm": 0.048828125, + "learning_rate": 0.0008667822915702639, + "loss": 1.132, + "step": 8251 + }, + { + "epoch": 0.7236325221387476, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008664502669547908, + "loss": 1.1319, + "step": 8252 + }, + { + "epoch": 0.7237202139131221, + "grad_norm": 0.0673828125, + "learning_rate": 0.0008661183137970306, + "loss": 1.0913, + "step": 8253 + }, + { + "epoch": 0.7238079056874966, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008657864321272567, + "loss": 1.1528, + "step": 8254 + }, + { + "epoch": 0.7238955974618712, + "grad_norm": 0.04638671875, + "learning_rate": 0.0008654546219757363, + "loss": 1.1565, + "step": 8255 + }, + { + "epoch": 0.7239832892362458, + "grad_norm": 0.05859375, + "learning_rate": 0.0008651228833727283, + "loss": 1.1178, + "step": 8256 + }, + { + "epoch": 0.7240709810106203, + "grad_norm": 0.04638671875, + "learning_rate": 0.0008647912163484876, + "loss": 1.068, + "step": 8257 + }, + { + "epoch": 0.7241586727849948, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008644596209332608, + "loss": 1.2285, + "step": 8258 + }, + { + "epoch": 0.7242463645593694, + "grad_norm": 0.053466796875, + "learning_rate": 0.0008641280971572892, + "loss": 1.137, + "step": 8259 + }, + { + "epoch": 0.7243340563337439, + "grad_norm": 0.052978515625, + "learning_rate": 0.0008637966450508061, + "loss": 1.1765, + "step": 8260 + }, + { + "epoch": 0.7244217481081184, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008634652646440396, + "loss": 1.1462, + "step": 8261 + }, + { + "epoch": 0.7245094398824931, + "grad_norm": 0.05419921875, + "learning_rate": 0.0008631339559672101, + "loss": 1.1458, + "step": 8262 + }, + { + "epoch": 0.7245971316568676, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008628027190505326, + "loss": 1.128, + "step": 8263 + }, + { + "epoch": 0.7246848234312421, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008624715539242154, + "loss": 1.1438, + "step": 8264 + }, + { + "epoch": 0.7247725152056167, + "grad_norm": 0.04736328125, + "learning_rate": 0.0008621404606184591, + "loss": 1.1505, + "step": 8265 + }, + { + "epoch": 0.7248602069799912, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008618094391634589, + "loss": 1.1551, + "step": 8266 + }, + { + "epoch": 0.7249478987543657, + "grad_norm": 0.050537109375, + "learning_rate": 0.000861478489589403, + "loss": 1.1217, + "step": 8267 + }, + { + "epoch": 0.7250355905287403, + "grad_norm": 0.058349609375, + "learning_rate": 0.000861147611926474, + "loss": 1.1507, + "step": 8268 + }, + { + "epoch": 0.7251232823031148, + "grad_norm": 0.050537109375, + "learning_rate": 0.000860816806204846, + "loss": 1.1517, + "step": 8269 + }, + { + "epoch": 0.7252109740774894, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008604860724546878, + "loss": 1.2101, + "step": 8270 + }, + { + "epoch": 0.725298665851864, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008601554107061616, + "loss": 1.0934, + "step": 8271 + }, + { + "epoch": 0.7253863576262385, + "grad_norm": 0.05712890625, + "learning_rate": 0.0008598248209894231, + "loss": 1.1272, + "step": 8272 + }, + { + "epoch": 0.725474049400613, + "grad_norm": 0.0693359375, + "learning_rate": 0.0008594943033346216, + "loss": 1.133, + "step": 8273 + }, + { + "epoch": 0.7255617411749876, + "grad_norm": 0.0498046875, + "learning_rate": 0.0008591638577718984, + "loss": 1.1075, + "step": 8274 + }, + { + "epoch": 0.7256494329493621, + "grad_norm": 0.04736328125, + "learning_rate": 0.0008588334843313899, + "loss": 1.1525, + "step": 8275 + }, + { + "epoch": 0.7257371247237366, + "grad_norm": 0.058349609375, + "learning_rate": 0.000858503183043225, + "loss": 1.1431, + "step": 8276 + }, + { + "epoch": 0.7258248164981111, + "grad_norm": 0.050537109375, + "learning_rate": 0.0008581729539375273, + "loss": 1.1179, + "step": 8277 + }, + { + "epoch": 0.7259125082724858, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008578427970444122, + "loss": 1.0911, + "step": 8278 + }, + { + "epoch": 0.7260002000468603, + "grad_norm": 0.04833984375, + "learning_rate": 0.0008575127123939884, + "loss": 1.12, + "step": 8279 + }, + { + "epoch": 0.7260878918212348, + "grad_norm": 0.044921875, + "learning_rate": 0.0008571827000163596, + "loss": 1.0798, + "step": 8280 + }, + { + "epoch": 0.7261755835956094, + "grad_norm": 0.052734375, + "learning_rate": 0.0008568527599416216, + "loss": 1.1519, + "step": 8281 + }, + { + "epoch": 0.7262632753699839, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008565228921998652, + "loss": 1.1446, + "step": 8282 + }, + { + "epoch": 0.7263509671443584, + "grad_norm": 0.041015625, + "learning_rate": 0.0008561930968211721, + "loss": 1.0922, + "step": 8283 + }, + { + "epoch": 0.726438658918733, + "grad_norm": 0.05859375, + "learning_rate": 0.0008558633738356193, + "loss": 1.0825, + "step": 8284 + }, + { + "epoch": 0.7265263506931076, + "grad_norm": 0.048583984375, + "learning_rate": 0.0008555337232732768, + "loss": 1.15, + "step": 8285 + }, + { + "epoch": 0.7266140424674821, + "grad_norm": 0.045654296875, + "learning_rate": 0.0008552041451642083, + "loss": 1.2071, + "step": 8286 + }, + { + "epoch": 0.7267017342418567, + "grad_norm": 0.048095703125, + "learning_rate": 0.0008548746395384695, + "loss": 1.1457, + "step": 8287 + }, + { + "epoch": 0.7267894260162312, + "grad_norm": 0.04345703125, + "learning_rate": 0.000854545206426111, + "loss": 1.1105, + "step": 8288 + }, + { + "epoch": 0.7268771177906057, + "grad_norm": 0.05029296875, + "learning_rate": 0.0008542158458571763, + "loss": 1.1139, + "step": 8289 + }, + { + "epoch": 0.7269648095649803, + "grad_norm": 0.044921875, + "learning_rate": 0.0008538865578617025, + "loss": 1.1922, + "step": 8290 + }, + { + "epoch": 0.7270525013393548, + "grad_norm": 0.061767578125, + "learning_rate": 0.000853557342469719, + "loss": 1.1537, + "step": 8291 + }, + { + "epoch": 0.7271401931137293, + "grad_norm": 0.044921875, + "learning_rate": 0.0008532281997112497, + "loss": 1.1147, + "step": 8292 + }, + { + "epoch": 0.727227884888104, + "grad_norm": 0.054931640625, + "learning_rate": 0.0008528991296163122, + "loss": 1.164, + "step": 8293 + }, + { + "epoch": 0.7273155766624785, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008525701322149165, + "loss": 1.1176, + "step": 8294 + }, + { + "epoch": 0.727403268436853, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008522412075370663, + "loss": 1.1592, + "step": 8295 + }, + { + "epoch": 0.7274909602112276, + "grad_norm": 0.043212890625, + "learning_rate": 0.000851912355612758, + "loss": 1.1747, + "step": 8296 + }, + { + "epoch": 0.7275786519856021, + "grad_norm": 0.050048828125, + "learning_rate": 0.0008515835764719826, + "loss": 1.129, + "step": 8297 + }, + { + "epoch": 0.7276663437599766, + "grad_norm": 0.0419921875, + "learning_rate": 0.000851254870144724, + "loss": 1.0943, + "step": 8298 + }, + { + "epoch": 0.7277540355343513, + "grad_norm": 0.050048828125, + "learning_rate": 0.0008509262366609594, + "loss": 1.1115, + "step": 8299 + }, + { + "epoch": 0.7278417273087258, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008505976760506596, + "loss": 1.1402, + "step": 8300 + }, + { + "epoch": 0.7279294190831003, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008502691883437876, + "loss": 1.1216, + "step": 8301 + }, + { + "epoch": 0.7280171108574748, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008499407735703011, + "loss": 1.1382, + "step": 8302 + }, + { + "epoch": 0.7281048026318494, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008496124317601507, + "loss": 1.1686, + "step": 8303 + }, + { + "epoch": 0.7281924944062239, + "grad_norm": 0.04541015625, + "learning_rate": 0.000849284162943281, + "loss": 1.1039, + "step": 8304 + }, + { + "epoch": 0.7282801861805984, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008489559671496278, + "loss": 1.1901, + "step": 8305 + }, + { + "epoch": 0.728367877954973, + "grad_norm": 0.045654296875, + "learning_rate": 0.0008486278444091227, + "loss": 1.1143, + "step": 8306 + }, + { + "epoch": 0.7284555697293476, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008482997947516895, + "loss": 1.1289, + "step": 8307 + }, + { + "epoch": 0.7285432615037221, + "grad_norm": 0.05224609375, + "learning_rate": 0.0008479718182072458, + "loss": 1.168, + "step": 8308 + }, + { + "epoch": 0.7286309532780967, + "grad_norm": 0.052978515625, + "learning_rate": 0.0008476439148057015, + "loss": 1.098, + "step": 8309 + }, + { + "epoch": 0.7287186450524712, + "grad_norm": 0.05126953125, + "learning_rate": 0.000847316084576961, + "loss": 1.1238, + "step": 8310 + }, + { + "epoch": 0.7288063368268457, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008469883275509217, + "loss": 1.1249, + "step": 8311 + }, + { + "epoch": 0.7288940286012203, + "grad_norm": 0.05078125, + "learning_rate": 0.0008466606437574734, + "loss": 1.1589, + "step": 8312 + }, + { + "epoch": 0.7289817203755948, + "grad_norm": 0.04443359375, + "learning_rate": 0.0008463330332265012, + "loss": 1.1232, + "step": 8313 + }, + { + "epoch": 0.7290694121499693, + "grad_norm": 0.048095703125, + "learning_rate": 0.0008460054959878815, + "loss": 1.0965, + "step": 8314 + }, + { + "epoch": 0.729157103924344, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008456780320714849, + "loss": 1.0877, + "step": 8315 + }, + { + "epoch": 0.7292447956987185, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008453506415071752, + "loss": 1.1332, + "step": 8316 + }, + { + "epoch": 0.729332487473093, + "grad_norm": 0.05078125, + "learning_rate": 0.0008450233243248105, + "loss": 1.1446, + "step": 8317 + }, + { + "epoch": 0.7294201792474676, + "grad_norm": 0.052734375, + "learning_rate": 0.00084469608055424, + "loss": 1.1769, + "step": 8318 + }, + { + "epoch": 0.7295078710218421, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008443689102253083, + "loss": 1.0746, + "step": 8319 + }, + { + "epoch": 0.7295955627962166, + "grad_norm": 0.046875, + "learning_rate": 0.0008440418133678519, + "loss": 1.129, + "step": 8320 + }, + { + "epoch": 0.7296832545705912, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008437147900117022, + "loss": 1.1121, + "step": 8321 + }, + { + "epoch": 0.7297709463449658, + "grad_norm": 0.0498046875, + "learning_rate": 0.0008433878401866818, + "loss": 1.1072, + "step": 8322 + }, + { + "epoch": 0.7298586381193403, + "grad_norm": 0.054931640625, + "learning_rate": 0.0008430609639226079, + "loss": 1.1073, + "step": 8323 + }, + { + "epoch": 0.7299463298937148, + "grad_norm": 0.054443359375, + "learning_rate": 0.0008427341612492915, + "loss": 1.1259, + "step": 8324 + }, + { + "epoch": 0.7300340216680894, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008424074321965351, + "loss": 1.0869, + "step": 8325 + }, + { + "epoch": 0.7301217134424639, + "grad_norm": 0.05029296875, + "learning_rate": 0.0008420807767941369, + "loss": 1.14, + "step": 8326 + }, + { + "epoch": 0.7302094052168384, + "grad_norm": 0.04541015625, + "learning_rate": 0.0008417541950718856, + "loss": 1.1342, + "step": 8327 + }, + { + "epoch": 0.730297096991213, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008414276870595657, + "loss": 1.1242, + "step": 8328 + }, + { + "epoch": 0.7303847887655875, + "grad_norm": 0.039306640625, + "learning_rate": 0.000841101252786953, + "loss": 1.08, + "step": 8329 + }, + { + "epoch": 0.730472480539962, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008407748922838179, + "loss": 1.0879, + "step": 8330 + }, + { + "epoch": 0.7305601723143367, + "grad_norm": 0.045654296875, + "learning_rate": 0.0008404486055799239, + "loss": 1.1088, + "step": 8331 + }, + { + "epoch": 0.7306478640887112, + "grad_norm": 0.05224609375, + "learning_rate": 0.000840122392705027, + "loss": 1.1781, + "step": 8332 + }, + { + "epoch": 0.7307355558630857, + "grad_norm": 0.04296875, + "learning_rate": 0.0008397962536888772, + "loss": 1.0915, + "step": 8333 + }, + { + "epoch": 0.7308232476374603, + "grad_norm": 0.053466796875, + "learning_rate": 0.0008394701885612175, + "loss": 1.1008, + "step": 8334 + }, + { + "epoch": 0.7309109394118348, + "grad_norm": 0.047119140625, + "learning_rate": 0.0008391441973517845, + "loss": 1.1234, + "step": 8335 + }, + { + "epoch": 0.7309986311862093, + "grad_norm": 0.05224609375, + "learning_rate": 0.0008388182800903074, + "loss": 1.1562, + "step": 8336 + }, + { + "epoch": 0.731086322960584, + "grad_norm": 0.04443359375, + "learning_rate": 0.0008384924368065088, + "loss": 1.2029, + "step": 8337 + }, + { + "epoch": 0.7311740147349585, + "grad_norm": 0.051513671875, + "learning_rate": 0.0008381666675301051, + "loss": 1.1526, + "step": 8338 + }, + { + "epoch": 0.731261706509333, + "grad_norm": 0.05712890625, + "learning_rate": 0.0008378409722908061, + "loss": 1.1425, + "step": 8339 + }, + { + "epoch": 0.7313493982837076, + "grad_norm": 0.05517578125, + "learning_rate": 0.0008375153511183134, + "loss": 1.1728, + "step": 8340 + }, + { + "epoch": 0.7314370900580821, + "grad_norm": 0.05078125, + "learning_rate": 0.0008371898040423233, + "loss": 1.1045, + "step": 8341 + }, + { + "epoch": 0.7315247818324566, + "grad_norm": 0.064453125, + "learning_rate": 0.0008368643310925246, + "loss": 1.2176, + "step": 8342 + }, + { + "epoch": 0.7316124736068312, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008365389322986005, + "loss": 1.0988, + "step": 8343 + }, + { + "epoch": 0.7317001653812057, + "grad_norm": 0.05859375, + "learning_rate": 0.0008362136076902251, + "loss": 1.14, + "step": 8344 + }, + { + "epoch": 0.7317878571555803, + "grad_norm": 0.04541015625, + "learning_rate": 0.0008358883572970687, + "loss": 1.1626, + "step": 8345 + }, + { + "epoch": 0.7318755489299549, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008355631811487919, + "loss": 1.1319, + "step": 8346 + }, + { + "epoch": 0.7319632407043294, + "grad_norm": 0.05908203125, + "learning_rate": 0.0008352380792750505, + "loss": 1.1266, + "step": 8347 + }, + { + "epoch": 0.7320509324787039, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008349130517054934, + "loss": 1.1362, + "step": 8348 + }, + { + "epoch": 0.7321386242530784, + "grad_norm": 0.04296875, + "learning_rate": 0.0008345880984697614, + "loss": 1.1265, + "step": 8349 + }, + { + "epoch": 0.732226316027453, + "grad_norm": 0.047119140625, + "learning_rate": 0.0008342632195974901, + "loss": 1.1361, + "step": 8350 + }, + { + "epoch": 0.7323140078018275, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008339384151183068, + "loss": 1.1975, + "step": 8351 + }, + { + "epoch": 0.732401699576202, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008336136850618338, + "loss": 1.1185, + "step": 8352 + }, + { + "epoch": 0.7324893913505767, + "grad_norm": 0.04736328125, + "learning_rate": 0.0008332890294576857, + "loss": 1.1775, + "step": 8353 + }, + { + "epoch": 0.7325770831249512, + "grad_norm": 0.048828125, + "learning_rate": 0.0008329644483354693, + "loss": 1.1219, + "step": 8354 + }, + { + "epoch": 0.7326647748993257, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008326399417247859, + "loss": 1.0847, + "step": 8355 + }, + { + "epoch": 0.7327524666737003, + "grad_norm": 0.061767578125, + "learning_rate": 0.0008323155096552299, + "loss": 1.1591, + "step": 8356 + }, + { + "epoch": 0.7328401584480748, + "grad_norm": 0.04833984375, + "learning_rate": 0.0008319911521563891, + "loss": 1.1414, + "step": 8357 + }, + { + "epoch": 0.7329278502224493, + "grad_norm": 0.05908203125, + "learning_rate": 0.000831666869257843, + "loss": 1.1184, + "step": 8358 + }, + { + "epoch": 0.733015541996824, + "grad_norm": 0.049560546875, + "learning_rate": 0.000831342660989166, + "loss": 1.137, + "step": 8359 + }, + { + "epoch": 0.7331032337711985, + "grad_norm": 0.047119140625, + "learning_rate": 0.0008310185273799247, + "loss": 1.1031, + "step": 8360 + }, + { + "epoch": 0.733190925545573, + "grad_norm": 0.0673828125, + "learning_rate": 0.0008306944684596803, + "loss": 1.1634, + "step": 8361 + }, + { + "epoch": 0.7332786173199476, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008303704842579851, + "loss": 1.176, + "step": 8362 + }, + { + "epoch": 0.7333663090943221, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008300465748043857, + "loss": 1.1661, + "step": 8363 + }, + { + "epoch": 0.7334540008686966, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008297227401284215, + "loss": 1.1129, + "step": 8364 + }, + { + "epoch": 0.7335416926430712, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008293989802596265, + "loss": 1.1117, + "step": 8365 + }, + { + "epoch": 0.7336293844174457, + "grad_norm": 0.041259765625, + "learning_rate": 0.0008290752952275261, + "loss": 1.1694, + "step": 8366 + }, + { + "epoch": 0.7337170761918202, + "grad_norm": 0.041259765625, + "learning_rate": 0.0008287516850616392, + "loss": 1.128, + "step": 8367 + }, + { + "epoch": 0.7338047679661949, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008284281497914789, + "loss": 1.1343, + "step": 8368 + }, + { + "epoch": 0.7338924597405694, + "grad_norm": 0.06005859375, + "learning_rate": 0.0008281046894465502, + "loss": 1.1606, + "step": 8369 + }, + { + "epoch": 0.7339801515149439, + "grad_norm": 0.056396484375, + "learning_rate": 0.0008277813040563523, + "loss": 1.1728, + "step": 8370 + }, + { + "epoch": 0.7340678432893185, + "grad_norm": 0.047607421875, + "learning_rate": 0.0008274579936503768, + "loss": 1.0915, + "step": 8371 + }, + { + "epoch": 0.734155535063693, + "grad_norm": 0.047607421875, + "learning_rate": 0.000827134758258109, + "loss": 1.222, + "step": 8372 + }, + { + "epoch": 0.7342432268380675, + "grad_norm": 0.048095703125, + "learning_rate": 0.0008268115979090268, + "loss": 1.1566, + "step": 8373 + }, + { + "epoch": 0.734330918612442, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008264885126326023, + "loss": 1.0803, + "step": 8374 + }, + { + "epoch": 0.7344186103868167, + "grad_norm": 0.072265625, + "learning_rate": 0.0008261655024582992, + "loss": 1.1495, + "step": 8375 + }, + { + "epoch": 0.7345063021611912, + "grad_norm": 0.048095703125, + "learning_rate": 0.0008258425674155756, + "loss": 1.1461, + "step": 8376 + }, + { + "epoch": 0.7345939939355657, + "grad_norm": 0.046875, + "learning_rate": 0.0008255197075338822, + "loss": 1.0912, + "step": 8377 + }, + { + "epoch": 0.7346816857099403, + "grad_norm": 0.05029296875, + "learning_rate": 0.0008251969228426631, + "loss": 1.1583, + "step": 8378 + }, + { + "epoch": 0.7347693774843148, + "grad_norm": 0.064453125, + "learning_rate": 0.0008248742133713559, + "loss": 1.1411, + "step": 8379 + }, + { + "epoch": 0.7348570692586893, + "grad_norm": 0.04736328125, + "learning_rate": 0.0008245515791493904, + "loss": 1.1343, + "step": 8380 + }, + { + "epoch": 0.7349447610330639, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008242290202061893, + "loss": 1.0828, + "step": 8381 + }, + { + "epoch": 0.7350324528074385, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008239065365711702, + "loss": 1.1374, + "step": 8382 + }, + { + "epoch": 0.735120144581813, + "grad_norm": 0.045654296875, + "learning_rate": 0.0008235841282737422, + "loss": 1.1263, + "step": 8383 + }, + { + "epoch": 0.7352078363561876, + "grad_norm": 0.04296875, + "learning_rate": 0.0008232617953433088, + "loss": 1.0992, + "step": 8384 + }, + { + "epoch": 0.7352955281305621, + "grad_norm": 0.04638671875, + "learning_rate": 0.0008229395378092651, + "loss": 1.1467, + "step": 8385 + }, + { + "epoch": 0.7353832199049366, + "grad_norm": 0.04541015625, + "learning_rate": 0.0008226173557010004, + "loss": 1.1255, + "step": 8386 + }, + { + "epoch": 0.7354709116793112, + "grad_norm": 0.048095703125, + "learning_rate": 0.000822295249047897, + "loss": 1.1335, + "step": 8387 + }, + { + "epoch": 0.7355586034536857, + "grad_norm": 0.04248046875, + "learning_rate": 0.0008219732178793305, + "loss": 1.128, + "step": 8388 + }, + { + "epoch": 0.7356462952280602, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008216512622246689, + "loss": 1.1116, + "step": 8389 + }, + { + "epoch": 0.7357339870024349, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008213293821132736, + "loss": 1.1942, + "step": 8390 + }, + { + "epoch": 0.7358216787768094, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008210075775744994, + "loss": 1.1393, + "step": 8391 + }, + { + "epoch": 0.7359093705511839, + "grad_norm": 0.05126953125, + "learning_rate": 0.0008206858486376945, + "loss": 1.1297, + "step": 8392 + }, + { + "epoch": 0.7359970623255585, + "grad_norm": 0.04248046875, + "learning_rate": 0.000820364195332199, + "loss": 1.1, + "step": 8393 + }, + { + "epoch": 0.736084754099933, + "grad_norm": 0.05712890625, + "learning_rate": 0.0008200426176873472, + "loss": 1.1379, + "step": 8394 + }, + { + "epoch": 0.7361724458743075, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008197211157324662, + "loss": 1.1564, + "step": 8395 + }, + { + "epoch": 0.736260137648682, + "grad_norm": 0.047607421875, + "learning_rate": 0.0008193996894968765, + "loss": 1.1245, + "step": 8396 + }, + { + "epoch": 0.7363478294230567, + "grad_norm": 0.054443359375, + "learning_rate": 0.0008190783390098908, + "loss": 1.1679, + "step": 8397 + }, + { + "epoch": 0.7364355211974312, + "grad_norm": 0.052001953125, + "learning_rate": 0.0008187570643008152, + "loss": 1.1779, + "step": 8398 + }, + { + "epoch": 0.7365232129718057, + "grad_norm": 0.06494140625, + "learning_rate": 0.0008184358653989495, + "loss": 1.1787, + "step": 8399 + }, + { + "epoch": 0.7366109047461803, + "grad_norm": 0.064453125, + "learning_rate": 0.0008181147423335864, + "loss": 1.1855, + "step": 8400 + }, + { + "epoch": 0.7366985965205548, + "grad_norm": 0.04541015625, + "learning_rate": 0.0008177936951340116, + "loss": 1.1093, + "step": 8401 + }, + { + "epoch": 0.7367862882949293, + "grad_norm": 0.044189453125, + "learning_rate": 0.000817472723829503, + "loss": 1.0963, + "step": 8402 + }, + { + "epoch": 0.7368739800693039, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008171518284493327, + "loss": 1.1415, + "step": 8403 + }, + { + "epoch": 0.7369616718436784, + "grad_norm": 0.053466796875, + "learning_rate": 0.0008168310090227657, + "loss": 1.1224, + "step": 8404 + }, + { + "epoch": 0.737049363618053, + "grad_norm": 0.048095703125, + "learning_rate": 0.00081651026557906, + "loss": 1.2081, + "step": 8405 + }, + { + "epoch": 0.7371370553924276, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008161895981474669, + "loss": 1.0961, + "step": 8406 + }, + { + "epoch": 0.7372247471668021, + "grad_norm": 0.052978515625, + "learning_rate": 0.0008158690067572293, + "loss": 1.1095, + "step": 8407 + }, + { + "epoch": 0.7373124389411766, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008155484914375853, + "loss": 1.139, + "step": 8408 + }, + { + "epoch": 0.7374001307155512, + "grad_norm": 0.04345703125, + "learning_rate": 0.0008152280522177645, + "loss": 1.1328, + "step": 8409 + }, + { + "epoch": 0.7374878224899257, + "grad_norm": 0.059326171875, + "learning_rate": 0.000814907689126991, + "loss": 1.1528, + "step": 8410 + }, + { + "epoch": 0.7375755142643002, + "grad_norm": 0.059326171875, + "learning_rate": 0.0008145874021944798, + "loss": 1.1254, + "step": 8411 + }, + { + "epoch": 0.7376632060386749, + "grad_norm": 0.052490234375, + "learning_rate": 0.0008142671914494413, + "loss": 1.1362, + "step": 8412 + }, + { + "epoch": 0.7377508978130494, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008139470569210777, + "loss": 1.1209, + "step": 8413 + }, + { + "epoch": 0.7378385895874239, + "grad_norm": 0.046142578125, + "learning_rate": 0.0008136269986385838, + "loss": 1.161, + "step": 8414 + }, + { + "epoch": 0.7379262813617985, + "grad_norm": 0.05078125, + "learning_rate": 0.000813307016631149, + "loss": 1.1072, + "step": 8415 + }, + { + "epoch": 0.738013973136173, + "grad_norm": 0.04443359375, + "learning_rate": 0.0008129871109279537, + "loss": 1.1165, + "step": 8416 + }, + { + "epoch": 0.7381016649105475, + "grad_norm": 0.0439453125, + "learning_rate": 0.0008126672815581736, + "loss": 1.1038, + "step": 8417 + }, + { + "epoch": 0.7381893566849221, + "grad_norm": 0.044189453125, + "learning_rate": 0.0008123475285509755, + "loss": 1.1735, + "step": 8418 + }, + { + "epoch": 0.7382770484592966, + "grad_norm": 0.04736328125, + "learning_rate": 0.000812027851935521, + "loss": 1.0937, + "step": 8419 + }, + { + "epoch": 0.7383647402336712, + "grad_norm": 0.043701171875, + "learning_rate": 0.0008117082517409628, + "loss": 1.149, + "step": 8420 + }, + { + "epoch": 0.7384524320080457, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008113887279964481, + "loss": 1.1326, + "step": 8421 + }, + { + "epoch": 0.7385401237824203, + "grad_norm": 0.04443359375, + "learning_rate": 0.0008110692807311162, + "loss": 1.121, + "step": 8422 + }, + { + "epoch": 0.7386278155567948, + "grad_norm": 0.044677734375, + "learning_rate": 0.0008107499099741008, + "loss": 1.1381, + "step": 8423 + }, + { + "epoch": 0.7387155073311693, + "grad_norm": 0.045166015625, + "learning_rate": 0.0008104306157545266, + "loss": 1.1566, + "step": 8424 + }, + { + "epoch": 0.7388031991055439, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008101113981015132, + "loss": 1.1154, + "step": 8425 + }, + { + "epoch": 0.7388908908799184, + "grad_norm": 0.043701171875, + "learning_rate": 0.0008097922570441718, + "loss": 1.1304, + "step": 8426 + }, + { + "epoch": 0.738978582654293, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008094731926116082, + "loss": 1.2247, + "step": 8427 + }, + { + "epoch": 0.7390662744286676, + "grad_norm": 0.046630859375, + "learning_rate": 0.000809154204832919, + "loss": 1.1973, + "step": 8428 + }, + { + "epoch": 0.7391539662030421, + "grad_norm": 0.047119140625, + "learning_rate": 0.0008088352937371959, + "loss": 1.1957, + "step": 8429 + }, + { + "epoch": 0.7392416579774166, + "grad_norm": 0.0419921875, + "learning_rate": 0.000808516459353523, + "loss": 1.1166, + "step": 8430 + }, + { + "epoch": 0.7393293497517912, + "grad_norm": 0.04443359375, + "learning_rate": 0.0008081977017109762, + "loss": 1.1123, + "step": 8431 + }, + { + "epoch": 0.7394170415261657, + "grad_norm": 0.04248046875, + "learning_rate": 0.0008078790208386261, + "loss": 1.1656, + "step": 8432 + }, + { + "epoch": 0.7395047333005402, + "grad_norm": 0.047119140625, + "learning_rate": 0.0008075604167655353, + "loss": 1.1343, + "step": 8433 + }, + { + "epoch": 0.7395924250749148, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008072418895207595, + "loss": 1.1157, + "step": 8434 + }, + { + "epoch": 0.7396801168492894, + "grad_norm": 0.0498046875, + "learning_rate": 0.0008069234391333479, + "loss": 1.1455, + "step": 8435 + }, + { + "epoch": 0.7397678086236639, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008066050656323424, + "loss": 1.1309, + "step": 8436 + }, + { + "epoch": 0.7398555003980385, + "grad_norm": 0.045654296875, + "learning_rate": 0.0008062867690467781, + "loss": 1.131, + "step": 8437 + }, + { + "epoch": 0.739943192172413, + "grad_norm": 0.048583984375, + "learning_rate": 0.0008059685494056819, + "loss": 1.0857, + "step": 8438 + }, + { + "epoch": 0.7400308839467875, + "grad_norm": 0.06640625, + "learning_rate": 0.0008056504067380751, + "loss": 1.1586, + "step": 8439 + }, + { + "epoch": 0.7401185757211621, + "grad_norm": 0.044921875, + "learning_rate": 0.0008053323410729716, + "loss": 1.1498, + "step": 8440 + }, + { + "epoch": 0.7402062674955366, + "grad_norm": 0.0908203125, + "learning_rate": 0.0008050143524393784, + "loss": 1.0999, + "step": 8441 + }, + { + "epoch": 0.7402939592699112, + "grad_norm": 0.07177734375, + "learning_rate": 0.0008046964408662946, + "loss": 1.1921, + "step": 8442 + }, + { + "epoch": 0.7403816510442858, + "grad_norm": 0.0498046875, + "learning_rate": 0.0008043786063827134, + "loss": 1.1246, + "step": 8443 + }, + { + "epoch": 0.7404693428186603, + "grad_norm": 0.0439453125, + "learning_rate": 0.00080406084901762, + "loss": 1.115, + "step": 8444 + }, + { + "epoch": 0.7405570345930348, + "grad_norm": 0.0703125, + "learning_rate": 0.000803743168799994, + "loss": 1.1294, + "step": 8445 + }, + { + "epoch": 0.7406447263674093, + "grad_norm": 0.05810546875, + "learning_rate": 0.0008034255657588061, + "loss": 1.1278, + "step": 8446 + }, + { + "epoch": 0.7407324181417839, + "grad_norm": 0.05322265625, + "learning_rate": 0.0008031080399230215, + "loss": 1.1057, + "step": 8447 + }, + { + "epoch": 0.7408201099161584, + "grad_norm": 0.04541015625, + "learning_rate": 0.0008027905913215971, + "loss": 1.0668, + "step": 8448 + }, + { + "epoch": 0.7409078016905329, + "grad_norm": 0.053466796875, + "learning_rate": 0.0008024732199834835, + "loss": 1.124, + "step": 8449 + }, + { + "epoch": 0.7409954934649076, + "grad_norm": 0.06201171875, + "learning_rate": 0.0008021559259376249, + "loss": 1.1174, + "step": 8450 + }, + { + "epoch": 0.7410831852392821, + "grad_norm": 0.04931640625, + "learning_rate": 0.0008018387092129568, + "loss": 1.1505, + "step": 8451 + }, + { + "epoch": 0.7411708770136566, + "grad_norm": 0.047607421875, + "learning_rate": 0.0008015215698384087, + "loss": 1.1067, + "step": 8452 + }, + { + "epoch": 0.7412585687880312, + "grad_norm": 0.046630859375, + "learning_rate": 0.0008012045078429033, + "loss": 1.0593, + "step": 8453 + }, + { + "epoch": 0.7413462605624057, + "grad_norm": 0.052001953125, + "learning_rate": 0.000800887523255356, + "loss": 1.1849, + "step": 8454 + }, + { + "epoch": 0.7414339523367802, + "grad_norm": 0.09619140625, + "learning_rate": 0.0008005706161046742, + "loss": 1.1211, + "step": 8455 + }, + { + "epoch": 0.7415216441111548, + "grad_norm": 0.047119140625, + "learning_rate": 0.0008002537864197594, + "loss": 1.1241, + "step": 8456 + }, + { + "epoch": 0.7416093358855294, + "grad_norm": 0.0419921875, + "learning_rate": 0.0007999370342295055, + "loss": 1.1306, + "step": 8457 + }, + { + "epoch": 0.7416970276599039, + "grad_norm": 0.052978515625, + "learning_rate": 0.0007996203595627999, + "loss": 1.1567, + "step": 8458 + }, + { + "epoch": 0.7417847194342785, + "grad_norm": 0.052734375, + "learning_rate": 0.0007993037624485226, + "loss": 1.1228, + "step": 8459 + }, + { + "epoch": 0.741872411208653, + "grad_norm": 0.042724609375, + "learning_rate": 0.0007989872429155458, + "loss": 1.1281, + "step": 8460 + }, + { + "epoch": 0.7419601029830275, + "grad_norm": 0.042236328125, + "learning_rate": 0.0007986708009927353, + "loss": 1.138, + "step": 8461 + }, + { + "epoch": 0.7420477947574021, + "grad_norm": 0.0703125, + "learning_rate": 0.0007983544367089503, + "loss": 1.134, + "step": 8462 + }, + { + "epoch": 0.7421354865317766, + "grad_norm": 0.045654296875, + "learning_rate": 0.0007980381500930423, + "loss": 1.117, + "step": 8463 + }, + { + "epoch": 0.7422231783061511, + "grad_norm": 0.052978515625, + "learning_rate": 0.0007977219411738559, + "loss": 1.1518, + "step": 8464 + }, + { + "epoch": 0.7423108700805258, + "grad_norm": 0.04443359375, + "learning_rate": 0.000797405809980228, + "loss": 1.0851, + "step": 8465 + }, + { + "epoch": 0.7423985618549003, + "grad_norm": 0.05224609375, + "learning_rate": 0.000797089756540989, + "loss": 1.1438, + "step": 8466 + }, + { + "epoch": 0.7424862536292748, + "grad_norm": 0.044189453125, + "learning_rate": 0.0007967737808849625, + "loss": 1.1092, + "step": 8467 + }, + { + "epoch": 0.7425739454036493, + "grad_norm": 0.06396484375, + "learning_rate": 0.0007964578830409652, + "loss": 1.1362, + "step": 8468 + }, + { + "epoch": 0.7426616371780239, + "grad_norm": 0.051025390625, + "learning_rate": 0.0007961420630378048, + "loss": 1.1725, + "step": 8469 + }, + { + "epoch": 0.7427493289523984, + "grad_norm": 0.047607421875, + "learning_rate": 0.0007958263209042844, + "loss": 1.0755, + "step": 8470 + }, + { + "epoch": 0.7428370207267729, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007955106566691984, + "loss": 1.2014, + "step": 8471 + }, + { + "epoch": 0.7429247125011476, + "grad_norm": 0.0546875, + "learning_rate": 0.0007951950703613351, + "loss": 1.1219, + "step": 8472 + }, + { + "epoch": 0.7430124042755221, + "grad_norm": 0.044189453125, + "learning_rate": 0.0007948795620094741, + "loss": 1.1353, + "step": 8473 + }, + { + "epoch": 0.7431000960498966, + "grad_norm": 0.04345703125, + "learning_rate": 0.0007945641316423896, + "loss": 1.1229, + "step": 8474 + }, + { + "epoch": 0.7431877878242712, + "grad_norm": 0.052734375, + "learning_rate": 0.000794248779288848, + "loss": 1.2321, + "step": 8475 + }, + { + "epoch": 0.7432754795986457, + "grad_norm": 0.047607421875, + "learning_rate": 0.0007939335049776092, + "loss": 1.1559, + "step": 8476 + }, + { + "epoch": 0.7433631713730202, + "grad_norm": 0.042236328125, + "learning_rate": 0.0007936183087374244, + "loss": 1.087, + "step": 8477 + }, + { + "epoch": 0.7434508631473948, + "grad_norm": 0.044677734375, + "learning_rate": 0.0007933031905970388, + "loss": 1.1271, + "step": 8478 + }, + { + "epoch": 0.7435385549217693, + "grad_norm": 0.044189453125, + "learning_rate": 0.0007929881505851909, + "loss": 1.1422, + "step": 8479 + }, + { + "epoch": 0.7436262466961439, + "grad_norm": 0.056640625, + "learning_rate": 0.0007926731887306117, + "loss": 1.1394, + "step": 8480 + }, + { + "epoch": 0.7437139384705185, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007923583050620244, + "loss": 1.1279, + "step": 8481 + }, + { + "epoch": 0.743801630244893, + "grad_norm": 0.046630859375, + "learning_rate": 0.0007920434996081456, + "loss": 1.1016, + "step": 8482 + }, + { + "epoch": 0.7438893220192675, + "grad_norm": 0.04931640625, + "learning_rate": 0.0007917287723976844, + "loss": 1.1227, + "step": 8483 + }, + { + "epoch": 0.7439770137936421, + "grad_norm": 0.04296875, + "learning_rate": 0.0007914141234593438, + "loss": 1.1154, + "step": 8484 + }, + { + "epoch": 0.7440647055680166, + "grad_norm": 0.044921875, + "learning_rate": 0.0007910995528218192, + "loss": 1.1578, + "step": 8485 + }, + { + "epoch": 0.7441523973423911, + "grad_norm": 0.044677734375, + "learning_rate": 0.0007907850605137974, + "loss": 1.1411, + "step": 8486 + }, + { + "epoch": 0.7442400891167658, + "grad_norm": 0.052490234375, + "learning_rate": 0.0007904706465639604, + "loss": 1.1278, + "step": 8487 + }, + { + "epoch": 0.7443277808911403, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007901563110009813, + "loss": 1.1096, + "step": 8488 + }, + { + "epoch": 0.7444154726655148, + "grad_norm": 0.042236328125, + "learning_rate": 0.0007898420538535271, + "loss": 1.0725, + "step": 8489 + }, + { + "epoch": 0.7445031644398894, + "grad_norm": 0.045654296875, + "learning_rate": 0.0007895278751502575, + "loss": 1.1455, + "step": 8490 + }, + { + "epoch": 0.7445908562142639, + "grad_norm": 0.05810546875, + "learning_rate": 0.000789213774919824, + "loss": 1.1188, + "step": 8491 + }, + { + "epoch": 0.7446785479886384, + "grad_norm": 0.045654296875, + "learning_rate": 0.0007888997531908719, + "loss": 1.1308, + "step": 8492 + }, + { + "epoch": 0.7447662397630129, + "grad_norm": 0.043212890625, + "learning_rate": 0.0007885858099920397, + "loss": 1.1698, + "step": 8493 + }, + { + "epoch": 0.7448539315373875, + "grad_norm": 0.051513671875, + "learning_rate": 0.0007882719453519581, + "loss": 1.1693, + "step": 8494 + }, + { + "epoch": 0.7449416233117621, + "grad_norm": 0.05810546875, + "learning_rate": 0.0007879581592992503, + "loss": 1.1556, + "step": 8495 + }, + { + "epoch": 0.7450293150861366, + "grad_norm": 0.05615234375, + "learning_rate": 0.000787644451862533, + "loss": 1.103, + "step": 8496 + }, + { + "epoch": 0.7451170068605112, + "grad_norm": 0.0498046875, + "learning_rate": 0.0007873308230704152, + "loss": 1.0997, + "step": 8497 + }, + { + "epoch": 0.7452046986348857, + "grad_norm": 0.04345703125, + "learning_rate": 0.0007870172729515003, + "loss": 1.1354, + "step": 8498 + }, + { + "epoch": 0.7452923904092602, + "grad_norm": 0.048583984375, + "learning_rate": 0.000786703801534382, + "loss": 1.2128, + "step": 8499 + }, + { + "epoch": 0.7453800821836348, + "grad_norm": 0.04638671875, + "learning_rate": 0.000786390408847648, + "loss": 1.1248, + "step": 8500 + }, + { + "epoch": 0.7453800821836348, + "eval_loss": 1.1438930034637451, + "eval_runtime": 429.1939, + "eval_samples_per_second": 33.661, + "eval_steps_per_second": 8.416, + "step": 8500 + }, + { + "epoch": 0.7454677739580093, + "grad_norm": 0.051513671875, + "learning_rate": 0.0007860770949198794, + "loss": 1.0943, + "step": 8501 + }, + { + "epoch": 0.7455554657323838, + "grad_norm": 0.058349609375, + "learning_rate": 0.0007857638597796496, + "loss": 1.2627, + "step": 8502 + }, + { + "epoch": 0.7456431575067585, + "grad_norm": 0.055908203125, + "learning_rate": 0.0007854507034555251, + "loss": 1.1442, + "step": 8503 + }, + { + "epoch": 0.745730849281133, + "grad_norm": 0.0478515625, + "learning_rate": 0.0007851376259760642, + "loss": 1.1325, + "step": 8504 + }, + { + "epoch": 0.7458185410555075, + "grad_norm": 0.04833984375, + "learning_rate": 0.0007848246273698191, + "loss": 1.1798, + "step": 8505 + }, + { + "epoch": 0.7459062328298821, + "grad_norm": 0.0517578125, + "learning_rate": 0.0007845117076653347, + "loss": 1.1366, + "step": 8506 + }, + { + "epoch": 0.7459939246042566, + "grad_norm": 0.044189453125, + "learning_rate": 0.0007841988668911487, + "loss": 1.0675, + "step": 8507 + }, + { + "epoch": 0.7460816163786311, + "grad_norm": 0.04248046875, + "learning_rate": 0.0007838861050757905, + "loss": 1.0947, + "step": 8508 + }, + { + "epoch": 0.7461693081530058, + "grad_norm": 0.0546875, + "learning_rate": 0.0007835734222477838, + "loss": 1.1269, + "step": 8509 + }, + { + "epoch": 0.7462569999273803, + "grad_norm": 0.04296875, + "learning_rate": 0.000783260818435644, + "loss": 1.1438, + "step": 8510 + }, + { + "epoch": 0.7463446917017548, + "grad_norm": 0.048828125, + "learning_rate": 0.0007829482936678805, + "loss": 1.1764, + "step": 8511 + }, + { + "epoch": 0.7464323834761294, + "grad_norm": 0.0458984375, + "learning_rate": 0.0007826358479729945, + "loss": 1.0977, + "step": 8512 + }, + { + "epoch": 0.7465200752505039, + "grad_norm": 0.0439453125, + "learning_rate": 0.0007823234813794796, + "loss": 1.12, + "step": 8513 + }, + { + "epoch": 0.7466077670248784, + "grad_norm": 0.048583984375, + "learning_rate": 0.0007820111939158236, + "loss": 1.1032, + "step": 8514 + }, + { + "epoch": 0.7466954587992529, + "grad_norm": 0.05322265625, + "learning_rate": 0.0007816989856105061, + "loss": 1.1272, + "step": 8515 + }, + { + "epoch": 0.7467831505736275, + "grad_norm": 0.057861328125, + "learning_rate": 0.0007813868564919994, + "loss": 1.1701, + "step": 8516 + }, + { + "epoch": 0.746870842348002, + "grad_norm": 0.06591796875, + "learning_rate": 0.0007810748065887694, + "loss": 1.0965, + "step": 8517 + }, + { + "epoch": 0.7469585341223766, + "grad_norm": 0.0576171875, + "learning_rate": 0.0007807628359292736, + "loss": 1.1156, + "step": 8518 + }, + { + "epoch": 0.7470462258967512, + "grad_norm": 0.08056640625, + "learning_rate": 0.0007804509445419631, + "loss": 1.0932, + "step": 8519 + }, + { + "epoch": 0.7471339176711257, + "grad_norm": 0.06787109375, + "learning_rate": 0.0007801391324552819, + "loss": 1.1866, + "step": 8520 + }, + { + "epoch": 0.7472216094455002, + "grad_norm": 0.046875, + "learning_rate": 0.0007798273996976664, + "loss": 1.0996, + "step": 8521 + }, + { + "epoch": 0.7473093012198748, + "grad_norm": 0.0771484375, + "learning_rate": 0.0007795157462975456, + "loss": 1.1634, + "step": 8522 + }, + { + "epoch": 0.7473969929942493, + "grad_norm": 0.05078125, + "learning_rate": 0.0007792041722833415, + "loss": 1.0959, + "step": 8523 + }, + { + "epoch": 0.7474846847686238, + "grad_norm": 0.04345703125, + "learning_rate": 0.0007788926776834689, + "loss": 1.0954, + "step": 8524 + }, + { + "epoch": 0.7475723765429985, + "grad_norm": 0.042724609375, + "learning_rate": 0.0007785812625263358, + "loss": 1.0653, + "step": 8525 + }, + { + "epoch": 0.747660068317373, + "grad_norm": 0.044677734375, + "learning_rate": 0.0007782699268403415, + "loss": 1.1369, + "step": 8526 + }, + { + "epoch": 0.7477477600917475, + "grad_norm": 0.042236328125, + "learning_rate": 0.0007779586706538797, + "loss": 1.1252, + "step": 8527 + }, + { + "epoch": 0.7478354518661221, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007776474939953359, + "loss": 1.0726, + "step": 8528 + }, + { + "epoch": 0.7479231436404966, + "grad_norm": 0.0546875, + "learning_rate": 0.0007773363968930889, + "loss": 1.1477, + "step": 8529 + }, + { + "epoch": 0.7480108354148711, + "grad_norm": 0.044677734375, + "learning_rate": 0.0007770253793755095, + "loss": 1.1386, + "step": 8530 + }, + { + "epoch": 0.7480985271892457, + "grad_norm": 0.047607421875, + "learning_rate": 0.0007767144414709619, + "loss": 1.1553, + "step": 8531 + }, + { + "epoch": 0.7481862189636203, + "grad_norm": 0.048828125, + "learning_rate": 0.0007764035832078033, + "loss": 1.1119, + "step": 8532 + }, + { + "epoch": 0.7482739107379948, + "grad_norm": 0.059326171875, + "learning_rate": 0.0007760928046143824, + "loss": 1.1262, + "step": 8533 + }, + { + "epoch": 0.7483616025123694, + "grad_norm": 0.043701171875, + "learning_rate": 0.0007757821057190418, + "loss": 1.2086, + "step": 8534 + }, + { + "epoch": 0.7484492942867439, + "grad_norm": 0.054443359375, + "learning_rate": 0.0007754714865501163, + "loss": 1.2241, + "step": 8535 + }, + { + "epoch": 0.7485369860611184, + "grad_norm": 0.052001953125, + "learning_rate": 0.0007751609471359337, + "loss": 1.0654, + "step": 8536 + }, + { + "epoch": 0.748624677835493, + "grad_norm": 0.04736328125, + "learning_rate": 0.0007748504875048141, + "loss": 1.1249, + "step": 8537 + }, + { + "epoch": 0.7487123696098675, + "grad_norm": 0.049560546875, + "learning_rate": 0.0007745401076850715, + "loss": 1.1814, + "step": 8538 + }, + { + "epoch": 0.748800061384242, + "grad_norm": 0.043212890625, + "learning_rate": 0.0007742298077050105, + "loss": 1.0929, + "step": 8539 + }, + { + "epoch": 0.7488877531586166, + "grad_norm": 0.046875, + "learning_rate": 0.0007739195875929303, + "loss": 1.1113, + "step": 8540 + }, + { + "epoch": 0.7489754449329912, + "grad_norm": 0.056640625, + "learning_rate": 0.000773609447377122, + "loss": 1.0623, + "step": 8541 + }, + { + "epoch": 0.7490631367073657, + "grad_norm": 0.047119140625, + "learning_rate": 0.0007732993870858698, + "loss": 1.146, + "step": 8542 + }, + { + "epoch": 0.7491508284817402, + "grad_norm": 0.0478515625, + "learning_rate": 0.0007729894067474508, + "loss": 1.1399, + "step": 8543 + }, + { + "epoch": 0.7492385202561148, + "grad_norm": 0.054931640625, + "learning_rate": 0.0007726795063901334, + "loss": 1.1926, + "step": 8544 + }, + { + "epoch": 0.7493262120304893, + "grad_norm": 0.048095703125, + "learning_rate": 0.0007723696860421803, + "loss": 1.1333, + "step": 8545 + }, + { + "epoch": 0.7494139038048638, + "grad_norm": 0.0498046875, + "learning_rate": 0.000772059945731846, + "loss": 1.0736, + "step": 8546 + }, + { + "epoch": 0.7495015955792385, + "grad_norm": 0.06201171875, + "learning_rate": 0.000771750285487379, + "loss": 1.1819, + "step": 8547 + }, + { + "epoch": 0.749589287353613, + "grad_norm": 0.044677734375, + "learning_rate": 0.0007714407053370184, + "loss": 1.1212, + "step": 8548 + }, + { + "epoch": 0.7496769791279875, + "grad_norm": 0.052001953125, + "learning_rate": 0.0007711312053089975, + "loss": 1.1662, + "step": 8549 + }, + { + "epoch": 0.7497646709023621, + "grad_norm": 0.04248046875, + "learning_rate": 0.0007708217854315418, + "loss": 1.0975, + "step": 8550 + }, + { + "epoch": 0.7498523626767366, + "grad_norm": 0.056396484375, + "learning_rate": 0.0007705124457328697, + "loss": 1.1772, + "step": 8551 + }, + { + "epoch": 0.7499400544511111, + "grad_norm": 0.044677734375, + "learning_rate": 0.0007702031862411923, + "loss": 1.119, + "step": 8552 + }, + { + "epoch": 0.7500277462254857, + "grad_norm": 0.052734375, + "learning_rate": 0.000769894006984713, + "loss": 1.126, + "step": 8553 + }, + { + "epoch": 0.7501154379998602, + "grad_norm": 0.047607421875, + "learning_rate": 0.0007695849079916282, + "loss": 1.146, + "step": 8554 + }, + { + "epoch": 0.7502031297742348, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007692758892901269, + "loss": 1.1341, + "step": 8555 + }, + { + "epoch": 0.7502908215486094, + "grad_norm": 0.052734375, + "learning_rate": 0.0007689669509083916, + "loss": 1.1656, + "step": 8556 + }, + { + "epoch": 0.7503785133229839, + "grad_norm": 0.048095703125, + "learning_rate": 0.0007686580928745952, + "loss": 1.1114, + "step": 8557 + }, + { + "epoch": 0.7504662050973584, + "grad_norm": 0.056396484375, + "learning_rate": 0.0007683493152169057, + "loss": 1.1849, + "step": 8558 + }, + { + "epoch": 0.750553896871733, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007680406179634828, + "loss": 1.1453, + "step": 8559 + }, + { + "epoch": 0.7506415886461075, + "grad_norm": 0.0458984375, + "learning_rate": 0.0007677320011424789, + "loss": 1.1095, + "step": 8560 + }, + { + "epoch": 0.750729280420482, + "grad_norm": 0.045654296875, + "learning_rate": 0.0007674234647820389, + "loss": 1.1356, + "step": 8561 + }, + { + "epoch": 0.7508169721948567, + "grad_norm": 0.0458984375, + "learning_rate": 0.0007671150089103004, + "loss": 1.0642, + "step": 8562 + }, + { + "epoch": 0.7509046639692312, + "grad_norm": 0.04833984375, + "learning_rate": 0.0007668066335553938, + "loss": 1.1231, + "step": 8563 + }, + { + "epoch": 0.7509923557436057, + "grad_norm": 0.0439453125, + "learning_rate": 0.0007664983387454429, + "loss": 1.1691, + "step": 8564 + }, + { + "epoch": 0.7510800475179802, + "grad_norm": 0.042724609375, + "learning_rate": 0.0007661901245085622, + "loss": 1.1368, + "step": 8565 + }, + { + "epoch": 0.7511677392923548, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007658819908728612, + "loss": 1.1001, + "step": 8566 + }, + { + "epoch": 0.7512554310667293, + "grad_norm": 0.050537109375, + "learning_rate": 0.0007655739378664396, + "loss": 1.1745, + "step": 8567 + }, + { + "epoch": 0.7513431228411038, + "grad_norm": 0.043701171875, + "learning_rate": 0.0007652659655173922, + "loss": 1.147, + "step": 8568 + }, + { + "epoch": 0.7514308146154784, + "grad_norm": 0.048095703125, + "learning_rate": 0.0007649580738538042, + "loss": 1.1246, + "step": 8569 + }, + { + "epoch": 0.751518506389853, + "grad_norm": 0.048828125, + "learning_rate": 0.000764650262903756, + "loss": 1.1035, + "step": 8570 + }, + { + "epoch": 0.7516061981642275, + "grad_norm": 0.053466796875, + "learning_rate": 0.0007643425326953176, + "loss": 1.0869, + "step": 8571 + }, + { + "epoch": 0.7516938899386021, + "grad_norm": 0.053466796875, + "learning_rate": 0.0007640348832565541, + "loss": 1.1414, + "step": 8572 + }, + { + "epoch": 0.7517815817129766, + "grad_norm": 0.05126953125, + "learning_rate": 0.0007637273146155221, + "loss": 1.1834, + "step": 8573 + }, + { + "epoch": 0.7518692734873511, + "grad_norm": 0.068359375, + "learning_rate": 0.0007634198268002715, + "loss": 1.223, + "step": 8574 + }, + { + "epoch": 0.7519569652617257, + "grad_norm": 0.054931640625, + "learning_rate": 0.0007631124198388436, + "loss": 1.1055, + "step": 8575 + }, + { + "epoch": 0.7520446570361002, + "grad_norm": 0.0595703125, + "learning_rate": 0.0007628050937592736, + "loss": 1.1015, + "step": 8576 + }, + { + "epoch": 0.7521323488104747, + "grad_norm": 0.04931640625, + "learning_rate": 0.0007624978485895887, + "loss": 1.0945, + "step": 8577 + }, + { + "epoch": 0.7522200405848494, + "grad_norm": 0.09912109375, + "learning_rate": 0.0007621906843578094, + "loss": 1.1515, + "step": 8578 + }, + { + "epoch": 0.7523077323592239, + "grad_norm": 0.058349609375, + "learning_rate": 0.0007618836010919472, + "loss": 1.0851, + "step": 8579 + }, + { + "epoch": 0.7523954241335984, + "grad_norm": 0.087890625, + "learning_rate": 0.0007615765988200079, + "loss": 1.0855, + "step": 8580 + }, + { + "epoch": 0.752483115907973, + "grad_norm": 0.058349609375, + "learning_rate": 0.0007612696775699896, + "loss": 1.1743, + "step": 8581 + }, + { + "epoch": 0.7525708076823475, + "grad_norm": 0.045166015625, + "learning_rate": 0.000760962837369883, + "loss": 1.1705, + "step": 8582 + }, + { + "epoch": 0.752658499456722, + "grad_norm": 0.05810546875, + "learning_rate": 0.0007606560782476702, + "loss": 1.1679, + "step": 8583 + }, + { + "epoch": 0.7527461912310967, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007603494002313268, + "loss": 1.1942, + "step": 8584 + }, + { + "epoch": 0.7528338830054712, + "grad_norm": 0.04833984375, + "learning_rate": 0.0007600428033488219, + "loss": 1.1375, + "step": 8585 + }, + { + "epoch": 0.7529215747798457, + "grad_norm": 0.053955078125, + "learning_rate": 0.0007597362876281157, + "loss": 1.0941, + "step": 8586 + }, + { + "epoch": 0.7530092665542202, + "grad_norm": 0.052978515625, + "learning_rate": 0.0007594298530971621, + "loss": 1.1157, + "step": 8587 + }, + { + "epoch": 0.7530969583285948, + "grad_norm": 0.057373046875, + "learning_rate": 0.0007591234997839066, + "loss": 1.0968, + "step": 8588 + }, + { + "epoch": 0.7531846501029693, + "grad_norm": 0.060302734375, + "learning_rate": 0.0007588172277162883, + "loss": 1.0576, + "step": 8589 + }, + { + "epoch": 0.7532723418773438, + "grad_norm": 0.060791015625, + "learning_rate": 0.0007585110369222381, + "loss": 1.1316, + "step": 8590 + }, + { + "epoch": 0.7533600336517184, + "grad_norm": 0.044189453125, + "learning_rate": 0.0007582049274296804, + "loss": 1.1364, + "step": 8591 + }, + { + "epoch": 0.753447725426093, + "grad_norm": 0.044921875, + "learning_rate": 0.0007578988992665307, + "loss": 1.1809, + "step": 8592 + }, + { + "epoch": 0.7535354172004675, + "grad_norm": 0.04150390625, + "learning_rate": 0.0007575929524606984, + "loss": 1.0927, + "step": 8593 + }, + { + "epoch": 0.7536231089748421, + "grad_norm": 0.04833984375, + "learning_rate": 0.0007572870870400853, + "loss": 1.0951, + "step": 8594 + }, + { + "epoch": 0.7537108007492166, + "grad_norm": 0.0419921875, + "learning_rate": 0.0007569813030325851, + "loss": 1.1027, + "step": 8595 + }, + { + "epoch": 0.7537984925235911, + "grad_norm": 0.046875, + "learning_rate": 0.0007566756004660853, + "loss": 1.1168, + "step": 8596 + }, + { + "epoch": 0.7538861842979657, + "grad_norm": 0.044921875, + "learning_rate": 0.0007563699793684643, + "loss": 1.085, + "step": 8597 + }, + { + "epoch": 0.7539738760723402, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007560644397675943, + "loss": 1.1056, + "step": 8598 + }, + { + "epoch": 0.7540615678467147, + "grad_norm": 0.045166015625, + "learning_rate": 0.00075575898169134, + "loss": 1.1167, + "step": 8599 + }, + { + "epoch": 0.7541492596210894, + "grad_norm": 0.04443359375, + "learning_rate": 0.0007554536051675578, + "loss": 1.1951, + "step": 8600 + }, + { + "epoch": 0.7542369513954639, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007551483102240978, + "loss": 1.136, + "step": 8601 + }, + { + "epoch": 0.7543246431698384, + "grad_norm": 0.06884765625, + "learning_rate": 0.0007548430968888015, + "loss": 1.0868, + "step": 8602 + }, + { + "epoch": 0.754412334944213, + "grad_norm": 0.052490234375, + "learning_rate": 0.0007545379651895038, + "loss": 1.1547, + "step": 8603 + }, + { + "epoch": 0.7545000267185875, + "grad_norm": 0.0869140625, + "learning_rate": 0.0007542329151540323, + "loss": 1.1059, + "step": 8604 + }, + { + "epoch": 0.754587718492962, + "grad_norm": 0.046142578125, + "learning_rate": 0.0007539279468102069, + "loss": 1.1105, + "step": 8605 + }, + { + "epoch": 0.7546754102673366, + "grad_norm": 0.0498046875, + "learning_rate": 0.0007536230601858392, + "loss": 1.2443, + "step": 8606 + }, + { + "epoch": 0.7547631020417112, + "grad_norm": 0.044677734375, + "learning_rate": 0.000753318255308734, + "loss": 1.1553, + "step": 8607 + }, + { + "epoch": 0.7548507938160857, + "grad_norm": 0.057861328125, + "learning_rate": 0.0007530135322066896, + "loss": 1.1524, + "step": 8608 + }, + { + "epoch": 0.7549384855904603, + "grad_norm": 0.047607421875, + "learning_rate": 0.0007527088909074958, + "loss": 1.1766, + "step": 8609 + }, + { + "epoch": 0.7550261773648348, + "grad_norm": 0.052978515625, + "learning_rate": 0.0007524043314389343, + "loss": 1.1565, + "step": 8610 + }, + { + "epoch": 0.7551138691392093, + "grad_norm": 0.056640625, + "learning_rate": 0.0007520998538287807, + "loss": 1.1142, + "step": 8611 + }, + { + "epoch": 0.7552015609135838, + "grad_norm": 0.0576171875, + "learning_rate": 0.0007517954581048026, + "loss": 1.0841, + "step": 8612 + }, + { + "epoch": 0.7552892526879584, + "grad_norm": 0.050048828125, + "learning_rate": 0.0007514911442947602, + "loss": 1.1583, + "step": 8613 + }, + { + "epoch": 0.7553769444623329, + "grad_norm": 0.05078125, + "learning_rate": 0.0007511869124264057, + "loss": 1.0915, + "step": 8614 + }, + { + "epoch": 0.7554646362367075, + "grad_norm": 0.0830078125, + "learning_rate": 0.0007508827625274845, + "loss": 1.1135, + "step": 8615 + }, + { + "epoch": 0.7555523280110821, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007505786946257348, + "loss": 1.1438, + "step": 8616 + }, + { + "epoch": 0.7556400197854566, + "grad_norm": 0.0439453125, + "learning_rate": 0.0007502747087488856, + "loss": 1.1148, + "step": 8617 + }, + { + "epoch": 0.7557277115598311, + "grad_norm": 0.041015625, + "learning_rate": 0.0007499708049246611, + "loss": 1.1461, + "step": 8618 + }, + { + "epoch": 0.7558154033342057, + "grad_norm": 0.046142578125, + "learning_rate": 0.0007496669831807751, + "loss": 1.1236, + "step": 8619 + }, + { + "epoch": 0.7559030951085802, + "grad_norm": 0.053466796875, + "learning_rate": 0.0007493632435449362, + "loss": 1.1408, + "step": 8620 + }, + { + "epoch": 0.7559907868829547, + "grad_norm": 0.048828125, + "learning_rate": 0.0007490595860448444, + "loss": 1.1919, + "step": 8621 + }, + { + "epoch": 0.7560784786573294, + "grad_norm": 0.047119140625, + "learning_rate": 0.0007487560107081926, + "loss": 1.1325, + "step": 8622 + }, + { + "epoch": 0.7561661704317039, + "grad_norm": 0.048828125, + "learning_rate": 0.0007484525175626666, + "loss": 1.1023, + "step": 8623 + }, + { + "epoch": 0.7562538622060784, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007481491066359432, + "loss": 1.1197, + "step": 8624 + }, + { + "epoch": 0.756341553980453, + "grad_norm": 0.057373046875, + "learning_rate": 0.000747845777955693, + "loss": 1.1252, + "step": 8625 + }, + { + "epoch": 0.7564292457548275, + "grad_norm": 0.052978515625, + "learning_rate": 0.0007475425315495791, + "loss": 1.057, + "step": 8626 + }, + { + "epoch": 0.756516937529202, + "grad_norm": 0.052734375, + "learning_rate": 0.0007472393674452572, + "loss": 1.1083, + "step": 8627 + }, + { + "epoch": 0.7566046293035766, + "grad_norm": 0.047119140625, + "learning_rate": 0.0007469362856703739, + "loss": 1.158, + "step": 8628 + }, + { + "epoch": 0.7566923210779511, + "grad_norm": 0.044921875, + "learning_rate": 0.0007466332862525701, + "loss": 1.1221, + "step": 8629 + }, + { + "epoch": 0.7567800128523257, + "grad_norm": 0.044921875, + "learning_rate": 0.0007463303692194786, + "loss": 1.1119, + "step": 8630 + }, + { + "epoch": 0.7568677046267003, + "grad_norm": 0.0498046875, + "learning_rate": 0.0007460275345987251, + "loss": 1.166, + "step": 8631 + }, + { + "epoch": 0.7569553964010748, + "grad_norm": 0.04833984375, + "learning_rate": 0.0007457247824179264, + "loss": 1.1657, + "step": 8632 + }, + { + "epoch": 0.7570430881754493, + "grad_norm": 0.051025390625, + "learning_rate": 0.0007454221127046934, + "loss": 1.0879, + "step": 8633 + }, + { + "epoch": 0.7571307799498239, + "grad_norm": 0.049560546875, + "learning_rate": 0.0007451195254866285, + "loss": 1.1409, + "step": 8634 + }, + { + "epoch": 0.7572184717241984, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007448170207913267, + "loss": 1.0777, + "step": 8635 + }, + { + "epoch": 0.7573061634985729, + "grad_norm": 0.046142578125, + "learning_rate": 0.0007445145986463763, + "loss": 1.1177, + "step": 8636 + }, + { + "epoch": 0.7573938552729474, + "grad_norm": 0.04345703125, + "learning_rate": 0.0007442122590793567, + "loss": 1.1204, + "step": 8637 + }, + { + "epoch": 0.7574815470473221, + "grad_norm": 0.04345703125, + "learning_rate": 0.0007439100021178407, + "loss": 1.1544, + "step": 8638 + }, + { + "epoch": 0.7575692388216966, + "grad_norm": 0.045654296875, + "learning_rate": 0.0007436078277893935, + "loss": 1.1013, + "step": 8639 + }, + { + "epoch": 0.7576569305960711, + "grad_norm": 0.043701171875, + "learning_rate": 0.0007433057361215731, + "loss": 1.0742, + "step": 8640 + }, + { + "epoch": 0.7577446223704457, + "grad_norm": 0.045166015625, + "learning_rate": 0.0007430037271419282, + "loss": 1.1808, + "step": 8641 + }, + { + "epoch": 0.7578323141448202, + "grad_norm": 0.046630859375, + "learning_rate": 0.0007427018008780022, + "loss": 1.1043, + "step": 8642 + }, + { + "epoch": 0.7579200059191947, + "grad_norm": 0.044921875, + "learning_rate": 0.0007423999573573298, + "loss": 1.1825, + "step": 8643 + }, + { + "epoch": 0.7580076976935693, + "grad_norm": 0.044677734375, + "learning_rate": 0.0007420981966074388, + "loss": 1.1525, + "step": 8644 + }, + { + "epoch": 0.7580953894679439, + "grad_norm": 0.041748046875, + "learning_rate": 0.000741796518655848, + "loss": 1.0803, + "step": 8645 + }, + { + "epoch": 0.7581830812423184, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007414949235300701, + "loss": 1.0897, + "step": 8646 + }, + { + "epoch": 0.758270773016693, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007411934112576101, + "loss": 1.1099, + "step": 8647 + }, + { + "epoch": 0.7583584647910675, + "grad_norm": 0.04736328125, + "learning_rate": 0.0007408919818659648, + "loss": 1.1116, + "step": 8648 + }, + { + "epoch": 0.758446156565442, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007405906353826243, + "loss": 1.1428, + "step": 8649 + }, + { + "epoch": 0.7585338483398166, + "grad_norm": 0.057861328125, + "learning_rate": 0.0007402893718350703, + "loss": 1.1617, + "step": 8650 + }, + { + "epoch": 0.7586215401141911, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007399881912507767, + "loss": 1.0983, + "step": 8651 + }, + { + "epoch": 0.7587092318885656, + "grad_norm": 0.046142578125, + "learning_rate": 0.0007396870936572109, + "loss": 1.1031, + "step": 8652 + }, + { + "epoch": 0.7587969236629403, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007393860790818322, + "loss": 1.1795, + "step": 8653 + }, + { + "epoch": 0.7588846154373148, + "grad_norm": 0.06396484375, + "learning_rate": 0.0007390851475520927, + "loss": 1.1228, + "step": 8654 + }, + { + "epoch": 0.7589723072116893, + "grad_norm": 0.04736328125, + "learning_rate": 0.000738784299095436, + "loss": 1.1171, + "step": 8655 + }, + { + "epoch": 0.7590599989860639, + "grad_norm": 0.0458984375, + "learning_rate": 0.0007384835337392987, + "loss": 1.1796, + "step": 8656 + }, + { + "epoch": 0.7591476907604384, + "grad_norm": 0.044189453125, + "learning_rate": 0.0007381828515111102, + "loss": 1.1423, + "step": 8657 + }, + { + "epoch": 0.7592353825348129, + "grad_norm": 0.04931640625, + "learning_rate": 0.0007378822524382922, + "loss": 1.1962, + "step": 8658 + }, + { + "epoch": 0.7593230743091874, + "grad_norm": 0.04931640625, + "learning_rate": 0.0007375817365482577, + "loss": 1.1874, + "step": 8659 + }, + { + "epoch": 0.7594107660835621, + "grad_norm": 0.048583984375, + "learning_rate": 0.0007372813038684134, + "loss": 1.185, + "step": 8660 + }, + { + "epoch": 0.7594984578579366, + "grad_norm": 0.043701171875, + "learning_rate": 0.000736980954426158, + "loss": 1.1391, + "step": 8661 + }, + { + "epoch": 0.7595861496323111, + "grad_norm": 0.045166015625, + "learning_rate": 0.0007366806882488832, + "loss": 1.0968, + "step": 8662 + }, + { + "epoch": 0.7596738414066857, + "grad_norm": 0.07080078125, + "learning_rate": 0.0007363805053639714, + "loss": 1.0923, + "step": 8663 + }, + { + "epoch": 0.7597615331810602, + "grad_norm": 0.04931640625, + "learning_rate": 0.000736080405798799, + "loss": 1.165, + "step": 8664 + }, + { + "epoch": 0.7598492249554347, + "grad_norm": 0.050537109375, + "learning_rate": 0.0007357803895807343, + "loss": 1.1829, + "step": 8665 + }, + { + "epoch": 0.7599369167298093, + "grad_norm": 0.04736328125, + "learning_rate": 0.0007354804567371385, + "loss": 1.1658, + "step": 8666 + }, + { + "epoch": 0.7600246085041839, + "grad_norm": 0.049072265625, + "learning_rate": 0.0007351806072953643, + "loss": 1.1244, + "step": 8667 + }, + { + "epoch": 0.7601123002785584, + "grad_norm": 0.042724609375, + "learning_rate": 0.0007348808412827565, + "loss": 1.0674, + "step": 8668 + }, + { + "epoch": 0.760199992052933, + "grad_norm": 0.047119140625, + "learning_rate": 0.0007345811587266537, + "loss": 1.1089, + "step": 8669 + }, + { + "epoch": 0.7602876838273075, + "grad_norm": 0.05224609375, + "learning_rate": 0.000734281559654386, + "loss": 1.1791, + "step": 8670 + }, + { + "epoch": 0.760375375601682, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007339820440932767, + "loss": 1.1221, + "step": 8671 + }, + { + "epoch": 0.7604630673760566, + "grad_norm": 0.053955078125, + "learning_rate": 0.0007336826120706398, + "loss": 1.1652, + "step": 8672 + }, + { + "epoch": 0.7605507591504311, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007333832636137832, + "loss": 1.1084, + "step": 8673 + }, + { + "epoch": 0.7606384509248056, + "grad_norm": 0.047119140625, + "learning_rate": 0.0007330839987500069, + "loss": 1.1681, + "step": 8674 + }, + { + "epoch": 0.7607261426991803, + "grad_norm": 0.059814453125, + "learning_rate": 0.0007327848175066026, + "loss": 1.1883, + "step": 8675 + }, + { + "epoch": 0.7608138344735548, + "grad_norm": 0.051025390625, + "learning_rate": 0.0007324857199108558, + "loss": 1.1365, + "step": 8676 + }, + { + "epoch": 0.7609015262479293, + "grad_norm": 0.061767578125, + "learning_rate": 0.0007321867059900424, + "loss": 1.175, + "step": 8677 + }, + { + "epoch": 0.7609892180223039, + "grad_norm": 0.045166015625, + "learning_rate": 0.000731887775771432, + "loss": 1.1831, + "step": 8678 + }, + { + "epoch": 0.7610769097966784, + "grad_norm": 0.05322265625, + "learning_rate": 0.0007315889292822862, + "loss": 1.0899, + "step": 8679 + }, + { + "epoch": 0.7611646015710529, + "grad_norm": 0.04345703125, + "learning_rate": 0.0007312901665498598, + "loss": 1.1483, + "step": 8680 + }, + { + "epoch": 0.7612522933454275, + "grad_norm": 0.07421875, + "learning_rate": 0.0007309914876013982, + "loss": 1.1419, + "step": 8681 + }, + { + "epoch": 0.761339985119802, + "grad_norm": 0.058349609375, + "learning_rate": 0.0007306928924641404, + "loss": 1.0812, + "step": 8682 + }, + { + "epoch": 0.7614276768941766, + "grad_norm": 0.05126953125, + "learning_rate": 0.0007303943811653176, + "loss": 1.1408, + "step": 8683 + }, + { + "epoch": 0.7615153686685511, + "grad_norm": 0.044677734375, + "learning_rate": 0.000730095953732154, + "loss": 1.1484, + "step": 8684 + }, + { + "epoch": 0.7616030604429257, + "grad_norm": 0.05126953125, + "learning_rate": 0.0007297976101918644, + "loss": 1.103, + "step": 8685 + }, + { + "epoch": 0.7616907522173002, + "grad_norm": 0.048583984375, + "learning_rate": 0.0007294993505716569, + "loss": 1.1436, + "step": 8686 + }, + { + "epoch": 0.7617784439916747, + "grad_norm": 0.0458984375, + "learning_rate": 0.0007292011748987324, + "loss": 1.12, + "step": 8687 + }, + { + "epoch": 0.7618661357660493, + "grad_norm": 0.04736328125, + "learning_rate": 0.0007289030832002837, + "loss": 1.2152, + "step": 8688 + }, + { + "epoch": 0.7619538275404238, + "grad_norm": 0.040771484375, + "learning_rate": 0.0007286050755034965, + "loss": 1.1363, + "step": 8689 + }, + { + "epoch": 0.7620415193147984, + "grad_norm": 0.045654296875, + "learning_rate": 0.0007283071518355473, + "loss": 1.0707, + "step": 8690 + }, + { + "epoch": 0.762129211089173, + "grad_norm": 0.04833984375, + "learning_rate": 0.0007280093122236065, + "loss": 1.147, + "step": 8691 + }, + { + "epoch": 0.7622169028635475, + "grad_norm": 0.050048828125, + "learning_rate": 0.0007277115566948364, + "loss": 1.165, + "step": 8692 + }, + { + "epoch": 0.762304594637922, + "grad_norm": 0.042236328125, + "learning_rate": 0.0007274138852763919, + "loss": 1.0908, + "step": 8693 + }, + { + "epoch": 0.7623922864122966, + "grad_norm": 0.05126953125, + "learning_rate": 0.000727116297995419, + "loss": 1.1214, + "step": 8694 + }, + { + "epoch": 0.7624799781866711, + "grad_norm": 0.042724609375, + "learning_rate": 0.0007268187948790573, + "loss": 1.1461, + "step": 8695 + }, + { + "epoch": 0.7625676699610456, + "grad_norm": 0.07470703125, + "learning_rate": 0.0007265213759544382, + "loss": 1.1707, + "step": 8696 + }, + { + "epoch": 0.7626553617354203, + "grad_norm": 0.056884765625, + "learning_rate": 0.0007262240412486862, + "loss": 1.1722, + "step": 8697 + }, + { + "epoch": 0.7627430535097948, + "grad_norm": 0.044677734375, + "learning_rate": 0.0007259267907889167, + "loss": 1.1185, + "step": 8698 + }, + { + "epoch": 0.7628307452841693, + "grad_norm": 0.051513671875, + "learning_rate": 0.000725629624602238, + "loss": 1.1577, + "step": 8699 + }, + { + "epoch": 0.7629184370585439, + "grad_norm": 0.0634765625, + "learning_rate": 0.0007253325427157514, + "loss": 1.1558, + "step": 8700 + }, + { + "epoch": 0.7630061288329184, + "grad_norm": 0.041748046875, + "learning_rate": 0.0007250355451565506, + "loss": 1.1316, + "step": 8701 + }, + { + "epoch": 0.7630938206072929, + "grad_norm": 0.048095703125, + "learning_rate": 0.0007247386319517201, + "loss": 1.0985, + "step": 8702 + }, + { + "epoch": 0.7631815123816675, + "grad_norm": 0.057861328125, + "learning_rate": 0.0007244418031283373, + "loss": 1.1352, + "step": 8703 + }, + { + "epoch": 0.763269204156042, + "grad_norm": 0.0537109375, + "learning_rate": 0.0007241450587134728, + "loss": 1.1275, + "step": 8704 + }, + { + "epoch": 0.7633568959304166, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007238483987341889, + "loss": 1.0956, + "step": 8705 + }, + { + "epoch": 0.7634445877047912, + "grad_norm": 0.048828125, + "learning_rate": 0.0007235518232175402, + "loss": 1.1325, + "step": 8706 + }, + { + "epoch": 0.7635322794791657, + "grad_norm": 0.044921875, + "learning_rate": 0.000723255332190574, + "loss": 1.1158, + "step": 8707 + }, + { + "epoch": 0.7636199712535402, + "grad_norm": 0.0625, + "learning_rate": 0.0007229589256803287, + "loss": 1.1521, + "step": 8708 + }, + { + "epoch": 0.7637076630279147, + "grad_norm": 0.064453125, + "learning_rate": 0.0007226626037138363, + "loss": 1.1167, + "step": 8709 + }, + { + "epoch": 0.7637953548022893, + "grad_norm": 0.050537109375, + "learning_rate": 0.0007223663663181205, + "loss": 1.1655, + "step": 8710 + }, + { + "epoch": 0.7638830465766638, + "grad_norm": 0.0439453125, + "learning_rate": 0.0007220702135201978, + "loss": 1.1254, + "step": 8711 + }, + { + "epoch": 0.7639707383510383, + "grad_norm": 0.048583984375, + "learning_rate": 0.0007217741453470757, + "loss": 1.121, + "step": 8712 + }, + { + "epoch": 0.764058430125413, + "grad_norm": 0.064453125, + "learning_rate": 0.0007214781618257555, + "loss": 1.158, + "step": 8713 + }, + { + "epoch": 0.7641461218997875, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007211822629832298, + "loss": 1.1561, + "step": 8714 + }, + { + "epoch": 0.764233813674162, + "grad_norm": 0.044189453125, + "learning_rate": 0.0007208864488464846, + "loss": 1.1309, + "step": 8715 + }, + { + "epoch": 0.7643215054485366, + "grad_norm": 0.05078125, + "learning_rate": 0.0007205907194424961, + "loss": 1.1532, + "step": 8716 + }, + { + "epoch": 0.7644091972229111, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007202950747982349, + "loss": 1.1659, + "step": 8717 + }, + { + "epoch": 0.7644968889972856, + "grad_norm": 0.044677734375, + "learning_rate": 0.0007199995149406633, + "loss": 1.2157, + "step": 8718 + }, + { + "epoch": 0.7645845807716602, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007197040398967347, + "loss": 1.1732, + "step": 8719 + }, + { + "epoch": 0.7646722725460348, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007194086496933966, + "loss": 1.0747, + "step": 8720 + }, + { + "epoch": 0.7647599643204093, + "grad_norm": 0.045166015625, + "learning_rate": 0.0007191133443575868, + "loss": 1.1001, + "step": 8721 + }, + { + "epoch": 0.7648476560947839, + "grad_norm": 0.0458984375, + "learning_rate": 0.000718818123916237, + "loss": 1.1789, + "step": 8722 + }, + { + "epoch": 0.7649353478691584, + "grad_norm": 0.049560546875, + "learning_rate": 0.0007185229883962705, + "loss": 1.1394, + "step": 8723 + }, + { + "epoch": 0.7650230396435329, + "grad_norm": 0.047607421875, + "learning_rate": 0.000718227937824603, + "loss": 1.1502, + "step": 8724 + }, + { + "epoch": 0.7651107314179075, + "grad_norm": 0.04541015625, + "learning_rate": 0.000717932972228142, + "loss": 1.0985, + "step": 8725 + }, + { + "epoch": 0.765198423192282, + "grad_norm": 0.0439453125, + "learning_rate": 0.0007176380916337878, + "loss": 1.1711, + "step": 8726 + }, + { + "epoch": 0.7652861149666565, + "grad_norm": 0.058349609375, + "learning_rate": 0.000717343296068433, + "loss": 1.1504, + "step": 8727 + }, + { + "epoch": 0.7653738067410312, + "grad_norm": 0.048583984375, + "learning_rate": 0.0007170485855589616, + "loss": 1.093, + "step": 8728 + }, + { + "epoch": 0.7654614985154057, + "grad_norm": 0.0498046875, + "learning_rate": 0.0007167539601322514, + "loss": 1.0963, + "step": 8729 + }, + { + "epoch": 0.7655491902897802, + "grad_norm": 0.043701171875, + "learning_rate": 0.0007164594198151704, + "loss": 1.194, + "step": 8730 + }, + { + "epoch": 0.7656368820641547, + "grad_norm": 0.04052734375, + "learning_rate": 0.0007161649646345804, + "loss": 1.0734, + "step": 8731 + }, + { + "epoch": 0.7657245738385293, + "grad_norm": 0.0439453125, + "learning_rate": 0.000715870594617335, + "loss": 1.0952, + "step": 8732 + }, + { + "epoch": 0.7658122656129038, + "grad_norm": 0.052978515625, + "learning_rate": 0.0007155763097902803, + "loss": 1.1707, + "step": 8733 + }, + { + "epoch": 0.7658999573872783, + "grad_norm": 0.0458984375, + "learning_rate": 0.0007152821101802534, + "loss": 1.1272, + "step": 8734 + }, + { + "epoch": 0.765987649161653, + "grad_norm": 0.046875, + "learning_rate": 0.0007149879958140857, + "loss": 1.1936, + "step": 8735 + }, + { + "epoch": 0.7660753409360275, + "grad_norm": 0.0439453125, + "learning_rate": 0.0007146939667185986, + "loss": 1.1045, + "step": 8736 + }, + { + "epoch": 0.766163032710402, + "grad_norm": 0.048828125, + "learning_rate": 0.0007144000229206072, + "loss": 1.1193, + "step": 8737 + }, + { + "epoch": 0.7662507244847766, + "grad_norm": 0.058349609375, + "learning_rate": 0.000714106164446919, + "loss": 1.1672, + "step": 8738 + }, + { + "epoch": 0.7663384162591511, + "grad_norm": 0.046875, + "learning_rate": 0.000713812391324332, + "loss": 1.1292, + "step": 8739 + }, + { + "epoch": 0.7664261080335256, + "grad_norm": 0.041748046875, + "learning_rate": 0.0007135187035796382, + "loss": 1.0682, + "step": 8740 + }, + { + "epoch": 0.7665137998079002, + "grad_norm": 0.051513671875, + "learning_rate": 0.0007132251012396212, + "loss": 1.1377, + "step": 8741 + }, + { + "epoch": 0.7666014915822748, + "grad_norm": 0.04345703125, + "learning_rate": 0.000712931584331057, + "loss": 1.1428, + "step": 8742 + }, + { + "epoch": 0.7666891833566493, + "grad_norm": 0.045166015625, + "learning_rate": 0.0007126381528807131, + "loss": 1.0837, + "step": 8743 + }, + { + "epoch": 0.7667768751310239, + "grad_norm": 0.044677734375, + "learning_rate": 0.0007123448069153497, + "loss": 1.1044, + "step": 8744 + }, + { + "epoch": 0.7668645669053984, + "grad_norm": 0.04248046875, + "learning_rate": 0.0007120515464617194, + "loss": 1.1089, + "step": 8745 + }, + { + "epoch": 0.7669522586797729, + "grad_norm": 0.048583984375, + "learning_rate": 0.0007117583715465672, + "loss": 1.1258, + "step": 8746 + }, + { + "epoch": 0.7670399504541475, + "grad_norm": 0.06005859375, + "learning_rate": 0.0007114652821966289, + "loss": 1.1643, + "step": 8747 + }, + { + "epoch": 0.767127642228522, + "grad_norm": 0.06005859375, + "learning_rate": 0.0007111722784386342, + "loss": 1.1111, + "step": 8748 + }, + { + "epoch": 0.7672153340028965, + "grad_norm": 0.04931640625, + "learning_rate": 0.000710879360299304, + "loss": 1.1626, + "step": 8749 + }, + { + "epoch": 0.7673030257772712, + "grad_norm": 0.05029296875, + "learning_rate": 0.0007105865278053524, + "loss": 1.1738, + "step": 8750 + }, + { + "epoch": 0.7673907175516457, + "grad_norm": 0.050537109375, + "learning_rate": 0.0007102937809834839, + "loss": 1.0734, + "step": 8751 + }, + { + "epoch": 0.7674784093260202, + "grad_norm": 0.054443359375, + "learning_rate": 0.0007100011198603971, + "loss": 1.1216, + "step": 8752 + }, + { + "epoch": 0.7675661011003948, + "grad_norm": 0.051513671875, + "learning_rate": 0.0007097085444627812, + "loss": 1.1679, + "step": 8753 + }, + { + "epoch": 0.7676537928747693, + "grad_norm": 0.060546875, + "learning_rate": 0.0007094160548173187, + "loss": 1.1109, + "step": 8754 + }, + { + "epoch": 0.7677414846491438, + "grad_norm": 0.06201171875, + "learning_rate": 0.0007091236509506843, + "loss": 1.1382, + "step": 8755 + }, + { + "epoch": 0.7678291764235183, + "grad_norm": 0.0703125, + "learning_rate": 0.0007088313328895436, + "loss": 1.1719, + "step": 8756 + }, + { + "epoch": 0.767916868197893, + "grad_norm": 0.07958984375, + "learning_rate": 0.0007085391006605557, + "loss": 1.1093, + "step": 8757 + }, + { + "epoch": 0.7680045599722675, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007082469542903716, + "loss": 1.1367, + "step": 8758 + }, + { + "epoch": 0.768092251746642, + "grad_norm": 0.04443359375, + "learning_rate": 0.000707954893805634, + "loss": 1.095, + "step": 8759 + }, + { + "epoch": 0.7681799435210166, + "grad_norm": 0.053955078125, + "learning_rate": 0.0007076629192329788, + "loss": 1.1202, + "step": 8760 + }, + { + "epoch": 0.7682676352953911, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007073710305990322, + "loss": 1.1109, + "step": 8761 + }, + { + "epoch": 0.7683553270697656, + "grad_norm": 0.06982421875, + "learning_rate": 0.0007070792279304142, + "loss": 1.1824, + "step": 8762 + }, + { + "epoch": 0.7684430188441402, + "grad_norm": 0.0859375, + "learning_rate": 0.0007067875112537366, + "loss": 1.1529, + "step": 8763 + }, + { + "epoch": 0.7685307106185147, + "grad_norm": 0.047119140625, + "learning_rate": 0.0007064958805956035, + "loss": 1.0925, + "step": 8764 + }, + { + "epoch": 0.7686184023928893, + "grad_norm": 0.045654296875, + "learning_rate": 0.0007062043359826098, + "loss": 1.1174, + "step": 8765 + }, + { + "epoch": 0.7687060941672639, + "grad_norm": 0.047119140625, + "learning_rate": 0.0007059128774413444, + "loss": 1.1572, + "step": 8766 + }, + { + "epoch": 0.7687937859416384, + "grad_norm": 0.0859375, + "learning_rate": 0.0007056215049983875, + "loss": 1.1326, + "step": 8767 + }, + { + "epoch": 0.7688814777160129, + "grad_norm": 0.04736328125, + "learning_rate": 0.0007053302186803117, + "loss": 1.1491, + "step": 8768 + }, + { + "epoch": 0.7689691694903875, + "grad_norm": 0.046142578125, + "learning_rate": 0.0007050390185136814, + "loss": 1.1296, + "step": 8769 + }, + { + "epoch": 0.769056861264762, + "grad_norm": 0.044921875, + "learning_rate": 0.000704747904525053, + "loss": 1.1264, + "step": 8770 + }, + { + "epoch": 0.7691445530391365, + "grad_norm": 0.0439453125, + "learning_rate": 0.0007044568767409753, + "loss": 1.0936, + "step": 8771 + }, + { + "epoch": 0.7692322448135112, + "grad_norm": 0.0712890625, + "learning_rate": 0.0007041659351879894, + "loss": 1.1458, + "step": 8772 + }, + { + "epoch": 0.7693199365878857, + "grad_norm": 0.045166015625, + "learning_rate": 0.0007038750798926293, + "loss": 1.1524, + "step": 8773 + }, + { + "epoch": 0.7694076283622602, + "grad_norm": 0.0458984375, + "learning_rate": 0.0007035843108814189, + "loss": 1.0978, + "step": 8774 + }, + { + "epoch": 0.7694953201366348, + "grad_norm": 0.046875, + "learning_rate": 0.0007032936281808761, + "loss": 1.1188, + "step": 8775 + }, + { + "epoch": 0.7695830119110093, + "grad_norm": 0.05078125, + "learning_rate": 0.0007030030318175109, + "loss": 1.0903, + "step": 8776 + }, + { + "epoch": 0.7696707036853838, + "grad_norm": 0.0712890625, + "learning_rate": 0.0007027125218178244, + "loss": 1.2298, + "step": 8777 + }, + { + "epoch": 0.7697583954597583, + "grad_norm": 0.048095703125, + "learning_rate": 0.0007024220982083104, + "loss": 1.1756, + "step": 8778 + }, + { + "epoch": 0.769846087234133, + "grad_norm": 0.043701171875, + "learning_rate": 0.0007021317610154549, + "loss": 1.082, + "step": 8779 + }, + { + "epoch": 0.7699337790085075, + "grad_norm": 0.0439453125, + "learning_rate": 0.0007018415102657358, + "loss": 1.1471, + "step": 8780 + }, + { + "epoch": 0.770021470782882, + "grad_norm": 0.04296875, + "learning_rate": 0.0007015513459856239, + "loss": 1.1169, + "step": 8781 + }, + { + "epoch": 0.7701091625572566, + "grad_norm": 0.046630859375, + "learning_rate": 0.0007012612682015802, + "loss": 1.0852, + "step": 8782 + }, + { + "epoch": 0.7701968543316311, + "grad_norm": 0.056396484375, + "learning_rate": 0.0007009712769400597, + "loss": 1.105, + "step": 8783 + }, + { + "epoch": 0.7702845461060056, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007006813722275089, + "loss": 1.1304, + "step": 8784 + }, + { + "epoch": 0.7703722378803802, + "grad_norm": 0.044677734375, + "learning_rate": 0.0007003915540903663, + "loss": 1.1833, + "step": 8785 + }, + { + "epoch": 0.7704599296547547, + "grad_norm": 0.04345703125, + "learning_rate": 0.0007001018225550631, + "loss": 1.1155, + "step": 8786 + }, + { + "epoch": 0.7705476214291292, + "grad_norm": 0.06005859375, + "learning_rate": 0.0006998121776480213, + "loss": 1.1064, + "step": 8787 + }, + { + "epoch": 0.7706353132035039, + "grad_norm": 0.048583984375, + "learning_rate": 0.0006995226193956558, + "loss": 1.1066, + "step": 8788 + }, + { + "epoch": 0.7707230049778784, + "grad_norm": 0.048095703125, + "learning_rate": 0.0006992331478243736, + "loss": 1.0869, + "step": 8789 + }, + { + "epoch": 0.7708106967522529, + "grad_norm": 0.07080078125, + "learning_rate": 0.0006989437629605739, + "loss": 1.1609, + "step": 8790 + }, + { + "epoch": 0.7708983885266275, + "grad_norm": 0.0732421875, + "learning_rate": 0.0006986544648306485, + "loss": 1.1664, + "step": 8791 + }, + { + "epoch": 0.770986080301002, + "grad_norm": 0.05322265625, + "learning_rate": 0.0006983652534609795, + "loss": 1.1653, + "step": 8792 + }, + { + "epoch": 0.7710737720753765, + "grad_norm": 0.04638671875, + "learning_rate": 0.0006980761288779427, + "loss": 1.0771, + "step": 8793 + }, + { + "epoch": 0.7711614638497511, + "grad_norm": 0.051025390625, + "learning_rate": 0.0006977870911079055, + "loss": 1.1566, + "step": 8794 + }, + { + "epoch": 0.7712491556241257, + "grad_norm": 0.044189453125, + "learning_rate": 0.0006974981401772282, + "loss": 1.1304, + "step": 8795 + }, + { + "epoch": 0.7713368473985002, + "grad_norm": 0.044189453125, + "learning_rate": 0.0006972092761122609, + "loss": 1.1605, + "step": 8796 + }, + { + "epoch": 0.7714245391728748, + "grad_norm": 0.07470703125, + "learning_rate": 0.0006969204989393481, + "loss": 1.1734, + "step": 8797 + }, + { + "epoch": 0.7715122309472493, + "grad_norm": 0.0478515625, + "learning_rate": 0.0006966318086848254, + "loss": 1.1063, + "step": 8798 + }, + { + "epoch": 0.7715999227216238, + "grad_norm": 0.043701171875, + "learning_rate": 0.0006963432053750212, + "loss": 1.1034, + "step": 8799 + }, + { + "epoch": 0.7716876144959984, + "grad_norm": 0.04296875, + "learning_rate": 0.0006960546890362544, + "loss": 1.1835, + "step": 8800 + }, + { + "epoch": 0.7717753062703729, + "grad_norm": 0.0576171875, + "learning_rate": 0.0006957662596948371, + "loss": 1.1541, + "step": 8801 + }, + { + "epoch": 0.7718629980447475, + "grad_norm": 0.04638671875, + "learning_rate": 0.0006954779173770738, + "loss": 1.1353, + "step": 8802 + }, + { + "epoch": 0.771950689819122, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006951896621092607, + "loss": 1.0771, + "step": 8803 + }, + { + "epoch": 0.7720383815934966, + "grad_norm": 0.045166015625, + "learning_rate": 0.0006949014939176858, + "loss": 1.1346, + "step": 8804 + }, + { + "epoch": 0.7721260733678711, + "grad_norm": 0.0498046875, + "learning_rate": 0.0006946134128286283, + "loss": 1.0905, + "step": 8805 + }, + { + "epoch": 0.7722137651422456, + "grad_norm": 0.050537109375, + "learning_rate": 0.0006943254188683614, + "loss": 1.2018, + "step": 8806 + }, + { + "epoch": 0.7723014569166202, + "grad_norm": 0.0810546875, + "learning_rate": 0.0006940375120631494, + "loss": 1.1055, + "step": 8807 + }, + { + "epoch": 0.7723891486909947, + "grad_norm": 0.059814453125, + "learning_rate": 0.0006937496924392486, + "loss": 1.1228, + "step": 8808 + }, + { + "epoch": 0.7724768404653692, + "grad_norm": 0.048095703125, + "learning_rate": 0.0006934619600229071, + "loss": 1.1385, + "step": 8809 + }, + { + "epoch": 0.7725645322397439, + "grad_norm": 0.072265625, + "learning_rate": 0.0006931743148403655, + "loss": 1.2018, + "step": 8810 + }, + { + "epoch": 0.7726522240141184, + "grad_norm": 0.09326171875, + "learning_rate": 0.0006928867569178563, + "loss": 1.1125, + "step": 8811 + }, + { + "epoch": 0.7727399157884929, + "grad_norm": 0.07275390625, + "learning_rate": 0.0006925992862816041, + "loss": 1.0908, + "step": 8812 + }, + { + "epoch": 0.7728276075628675, + "grad_norm": 0.055419921875, + "learning_rate": 0.000692311902957826, + "loss": 1.1161, + "step": 8813 + }, + { + "epoch": 0.772915299337242, + "grad_norm": 0.06494140625, + "learning_rate": 0.0006920246069727296, + "loss": 1.1903, + "step": 8814 + }, + { + "epoch": 0.7730029911116165, + "grad_norm": 0.07080078125, + "learning_rate": 0.000691737398352516, + "loss": 1.1246, + "step": 8815 + }, + { + "epoch": 0.7730906828859911, + "grad_norm": 0.07958984375, + "learning_rate": 0.0006914502771233779, + "loss": 1.1663, + "step": 8816 + }, + { + "epoch": 0.7731783746603657, + "grad_norm": 0.0751953125, + "learning_rate": 0.0006911632433115006, + "loss": 1.1501, + "step": 8817 + }, + { + "epoch": 0.7732660664347402, + "grad_norm": 0.061767578125, + "learning_rate": 0.0006908762969430599, + "loss": 1.1549, + "step": 8818 + }, + { + "epoch": 0.7733537582091148, + "grad_norm": 0.04345703125, + "learning_rate": 0.0006905894380442247, + "loss": 1.1227, + "step": 8819 + }, + { + "epoch": 0.7734414499834893, + "grad_norm": 0.046630859375, + "learning_rate": 0.0006903026666411566, + "loss": 1.1738, + "step": 8820 + }, + { + "epoch": 0.7735291417578638, + "grad_norm": 0.061767578125, + "learning_rate": 0.0006900159827600075, + "loss": 1.154, + "step": 8821 + }, + { + "epoch": 0.7736168335322384, + "grad_norm": 0.09814453125, + "learning_rate": 0.000689729386426923, + "loss": 1.0917, + "step": 8822 + }, + { + "epoch": 0.7737045253066129, + "grad_norm": 0.0908203125, + "learning_rate": 0.000689442877668039, + "loss": 1.115, + "step": 8823 + }, + { + "epoch": 0.7737922170809874, + "grad_norm": 0.050048828125, + "learning_rate": 0.0006891564565094851, + "loss": 1.1118, + "step": 8824 + }, + { + "epoch": 0.7738799088553621, + "grad_norm": 0.04443359375, + "learning_rate": 0.0006888701229773818, + "loss": 1.1249, + "step": 8825 + }, + { + "epoch": 0.7739676006297366, + "grad_norm": 0.046142578125, + "learning_rate": 0.0006885838770978425, + "loss": 1.0973, + "step": 8826 + }, + { + "epoch": 0.7740552924041111, + "grad_norm": 0.052490234375, + "learning_rate": 0.0006882977188969714, + "loss": 1.1404, + "step": 8827 + }, + { + "epoch": 0.7741429841784856, + "grad_norm": 0.047607421875, + "learning_rate": 0.0006880116484008658, + "loss": 1.1479, + "step": 8828 + }, + { + "epoch": 0.7742306759528602, + "grad_norm": 0.06591796875, + "learning_rate": 0.0006877256656356144, + "loss": 1.1088, + "step": 8829 + }, + { + "epoch": 0.7743183677272347, + "grad_norm": 0.044189453125, + "learning_rate": 0.0006874397706272987, + "loss": 1.0518, + "step": 8830 + }, + { + "epoch": 0.7744060595016092, + "grad_norm": 0.046142578125, + "learning_rate": 0.0006871539634019905, + "loss": 1.1634, + "step": 8831 + }, + { + "epoch": 0.7744937512759839, + "grad_norm": 0.044189453125, + "learning_rate": 0.0006868682439857553, + "loss": 1.1367, + "step": 8832 + }, + { + "epoch": 0.7745814430503584, + "grad_norm": 0.044921875, + "learning_rate": 0.00068658261240465, + "loss": 1.0738, + "step": 8833 + }, + { + "epoch": 0.7746691348247329, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006862970686847238, + "loss": 1.1291, + "step": 8834 + }, + { + "epoch": 0.7747568265991075, + "grad_norm": 0.05029296875, + "learning_rate": 0.0006860116128520168, + "loss": 1.1778, + "step": 8835 + }, + { + "epoch": 0.774844518373482, + "grad_norm": 0.044677734375, + "learning_rate": 0.0006857262449325622, + "loss": 1.1855, + "step": 8836 + }, + { + "epoch": 0.7749322101478565, + "grad_norm": 0.04638671875, + "learning_rate": 0.0006854409649523852, + "loss": 1.1499, + "step": 8837 + }, + { + "epoch": 0.7750199019222311, + "grad_norm": 0.053955078125, + "learning_rate": 0.0006851557729375016, + "loss": 1.1886, + "step": 8838 + }, + { + "epoch": 0.7751075936966056, + "grad_norm": 0.0419921875, + "learning_rate": 0.000684870668913921, + "loss": 1.0963, + "step": 8839 + }, + { + "epoch": 0.7751952854709802, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006845856529076441, + "loss": 1.0804, + "step": 8840 + }, + { + "epoch": 0.7752829772453548, + "grad_norm": 0.04345703125, + "learning_rate": 0.0006843007249446632, + "loss": 1.1898, + "step": 8841 + }, + { + "epoch": 0.7753706690197293, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006840158850509633, + "loss": 1.1674, + "step": 8842 + }, + { + "epoch": 0.7754583607941038, + "grad_norm": 0.048583984375, + "learning_rate": 0.0006837311332525209, + "loss": 1.1466, + "step": 8843 + }, + { + "epoch": 0.7755460525684784, + "grad_norm": 0.0693359375, + "learning_rate": 0.0006834464695753051, + "loss": 1.0985, + "step": 8844 + }, + { + "epoch": 0.7756337443428529, + "grad_norm": 0.043701171875, + "learning_rate": 0.0006831618940452759, + "loss": 1.1517, + "step": 8845 + }, + { + "epoch": 0.7757214361172274, + "grad_norm": 0.046630859375, + "learning_rate": 0.000682877406688386, + "loss": 1.1631, + "step": 8846 + }, + { + "epoch": 0.7758091278916021, + "grad_norm": 0.055419921875, + "learning_rate": 0.00068259300753058, + "loss": 1.1483, + "step": 8847 + }, + { + "epoch": 0.7758968196659766, + "grad_norm": 0.04833984375, + "learning_rate": 0.0006823086965977947, + "loss": 1.126, + "step": 8848 + }, + { + "epoch": 0.7759845114403511, + "grad_norm": 0.046875, + "learning_rate": 0.0006820244739159577, + "loss": 1.1596, + "step": 8849 + }, + { + "epoch": 0.7760722032147256, + "grad_norm": 0.04296875, + "learning_rate": 0.0006817403395109901, + "loss": 1.1319, + "step": 8850 + }, + { + "epoch": 0.7761598949891002, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006814562934088039, + "loss": 1.1175, + "step": 8851 + }, + { + "epoch": 0.7762475867634747, + "grad_norm": 0.05078125, + "learning_rate": 0.0006811723356353039, + "loss": 1.1591, + "step": 8852 + }, + { + "epoch": 0.7763352785378492, + "grad_norm": 0.043212890625, + "learning_rate": 0.0006808884662163853, + "loss": 1.1414, + "step": 8853 + }, + { + "epoch": 0.7764229703122238, + "grad_norm": 0.046630859375, + "learning_rate": 0.0006806046851779374, + "loss": 1.1277, + "step": 8854 + }, + { + "epoch": 0.7765106620865984, + "grad_norm": 0.043212890625, + "learning_rate": 0.0006803209925458393, + "loss": 1.119, + "step": 8855 + }, + { + "epoch": 0.7765983538609729, + "grad_norm": 0.0537109375, + "learning_rate": 0.0006800373883459634, + "loss": 1.1687, + "step": 8856 + }, + { + "epoch": 0.7766860456353475, + "grad_norm": 0.052734375, + "learning_rate": 0.0006797538726041743, + "loss": 1.1359, + "step": 8857 + }, + { + "epoch": 0.776773737409722, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006794704453463266, + "loss": 1.0774, + "step": 8858 + }, + { + "epoch": 0.7768614291840965, + "grad_norm": 0.0732421875, + "learning_rate": 0.0006791871065982692, + "loss": 1.1939, + "step": 8859 + }, + { + "epoch": 0.7769491209584711, + "grad_norm": 0.046630859375, + "learning_rate": 0.0006789038563858415, + "loss": 1.1459, + "step": 8860 + }, + { + "epoch": 0.7770368127328456, + "grad_norm": 0.041015625, + "learning_rate": 0.0006786206947348756, + "loss": 1.099, + "step": 8861 + }, + { + "epoch": 0.7771245045072201, + "grad_norm": 0.06298828125, + "learning_rate": 0.0006783376216711943, + "loss": 1.1047, + "step": 8862 + }, + { + "epoch": 0.7772121962815948, + "grad_norm": 0.055419921875, + "learning_rate": 0.0006780546372206135, + "loss": 1.1139, + "step": 8863 + }, + { + "epoch": 0.7772998880559693, + "grad_norm": 0.05712890625, + "learning_rate": 0.0006777717414089407, + "loss": 1.0861, + "step": 8864 + }, + { + "epoch": 0.7773875798303438, + "grad_norm": 0.04638671875, + "learning_rate": 0.0006774889342619756, + "loss": 1.1896, + "step": 8865 + }, + { + "epoch": 0.7774752716047184, + "grad_norm": 0.046875, + "learning_rate": 0.0006772062158055093, + "loss": 1.1577, + "step": 8866 + }, + { + "epoch": 0.7775629633790929, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006769235860653245, + "loss": 1.1313, + "step": 8867 + }, + { + "epoch": 0.7776506551534674, + "grad_norm": 0.052490234375, + "learning_rate": 0.0006766410450671967, + "loss": 1.1603, + "step": 8868 + }, + { + "epoch": 0.777738346927842, + "grad_norm": 0.050048828125, + "learning_rate": 0.0006763585928368928, + "loss": 1.1262, + "step": 8869 + }, + { + "epoch": 0.7778260387022166, + "grad_norm": 0.04443359375, + "learning_rate": 0.0006760762294001722, + "loss": 1.1301, + "step": 8870 + }, + { + "epoch": 0.7779137304765911, + "grad_norm": 0.041015625, + "learning_rate": 0.0006757939547827853, + "loss": 1.0957, + "step": 8871 + }, + { + "epoch": 0.7780014222509657, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006755117690104746, + "loss": 1.0357, + "step": 8872 + }, + { + "epoch": 0.7780891140253402, + "grad_norm": 0.06494140625, + "learning_rate": 0.0006752296721089749, + "loss": 1.1092, + "step": 8873 + }, + { + "epoch": 0.7781768057997147, + "grad_norm": 0.045166015625, + "learning_rate": 0.0006749476641040125, + "loss": 1.1159, + "step": 8874 + }, + { + "epoch": 0.7782644975740892, + "grad_norm": 0.047607421875, + "learning_rate": 0.0006746657450213067, + "loss": 1.1468, + "step": 8875 + }, + { + "epoch": 0.7783521893484638, + "grad_norm": 0.0615234375, + "learning_rate": 0.0006743839148865666, + "loss": 1.1189, + "step": 8876 + }, + { + "epoch": 0.7784398811228384, + "grad_norm": 0.0439453125, + "learning_rate": 0.000674102173725495, + "loss": 1.1277, + "step": 8877 + }, + { + "epoch": 0.7785275728972129, + "grad_norm": 0.048095703125, + "learning_rate": 0.0006738205215637859, + "loss": 1.0777, + "step": 8878 + }, + { + "epoch": 0.7786152646715875, + "grad_norm": 0.046630859375, + "learning_rate": 0.0006735389584271256, + "loss": 1.16, + "step": 8879 + }, + { + "epoch": 0.778702956445962, + "grad_norm": 0.04052734375, + "learning_rate": 0.0006732574843411912, + "loss": 1.1116, + "step": 8880 + }, + { + "epoch": 0.7787906482203365, + "grad_norm": 0.043701171875, + "learning_rate": 0.0006729760993316529, + "loss": 1.1335, + "step": 8881 + }, + { + "epoch": 0.7788783399947111, + "grad_norm": 0.04638671875, + "learning_rate": 0.0006726948034241719, + "loss": 1.1627, + "step": 8882 + }, + { + "epoch": 0.7789660317690856, + "grad_norm": 0.0439453125, + "learning_rate": 0.0006724135966444025, + "loss": 1.1483, + "step": 8883 + }, + { + "epoch": 0.7790537235434601, + "grad_norm": 0.0498046875, + "learning_rate": 0.000672132479017989, + "loss": 1.1891, + "step": 8884 + }, + { + "epoch": 0.7791414153178348, + "grad_norm": 0.04296875, + "learning_rate": 0.0006718514505705689, + "loss": 1.1026, + "step": 8885 + }, + { + "epoch": 0.7792291070922093, + "grad_norm": 0.044677734375, + "learning_rate": 0.0006715705113277717, + "loss": 1.0936, + "step": 8886 + }, + { + "epoch": 0.7793167988665838, + "grad_norm": 0.04931640625, + "learning_rate": 0.0006712896613152185, + "loss": 1.1819, + "step": 8887 + }, + { + "epoch": 0.7794044906409584, + "grad_norm": 0.046142578125, + "learning_rate": 0.0006710089005585215, + "loss": 1.1403, + "step": 8888 + }, + { + "epoch": 0.7794921824153329, + "grad_norm": 0.0419921875, + "learning_rate": 0.0006707282290832851, + "loss": 1.1289, + "step": 8889 + }, + { + "epoch": 0.7795798741897074, + "grad_norm": 0.044189453125, + "learning_rate": 0.0006704476469151061, + "loss": 1.1788, + "step": 8890 + }, + { + "epoch": 0.779667565964082, + "grad_norm": 0.046630859375, + "learning_rate": 0.0006701671540795733, + "loss": 1.1299, + "step": 8891 + }, + { + "epoch": 0.7797552577384566, + "grad_norm": 0.045166015625, + "learning_rate": 0.0006698867506022666, + "loss": 1.1352, + "step": 8892 + }, + { + "epoch": 0.7798429495128311, + "grad_norm": 0.04833984375, + "learning_rate": 0.0006696064365087584, + "loss": 1.1155, + "step": 8893 + }, + { + "epoch": 0.7799306412872057, + "grad_norm": 0.05029296875, + "learning_rate": 0.0006693262118246119, + "loss": 1.1214, + "step": 8894 + }, + { + "epoch": 0.7800183330615802, + "grad_norm": 0.058349609375, + "learning_rate": 0.0006690460765753833, + "loss": 1.1451, + "step": 8895 + }, + { + "epoch": 0.7801060248359547, + "grad_norm": 0.0517578125, + "learning_rate": 0.0006687660307866201, + "loss": 1.1226, + "step": 8896 + }, + { + "epoch": 0.7801937166103293, + "grad_norm": 0.046630859375, + "learning_rate": 0.0006684860744838624, + "loss": 1.1296, + "step": 8897 + }, + { + "epoch": 0.7802814083847038, + "grad_norm": 0.0576171875, + "learning_rate": 0.0006682062076926405, + "loss": 1.1234, + "step": 8898 + }, + { + "epoch": 0.7803691001590783, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006679264304384779, + "loss": 1.1472, + "step": 8899 + }, + { + "epoch": 0.7804567919334529, + "grad_norm": 0.04052734375, + "learning_rate": 0.0006676467427468896, + "loss": 1.018, + "step": 8900 + }, + { + "epoch": 0.7805444837078275, + "grad_norm": 0.0458984375, + "learning_rate": 0.000667367144643383, + "loss": 1.1223, + "step": 8901 + }, + { + "epoch": 0.780632175482202, + "grad_norm": 0.041259765625, + "learning_rate": 0.0006670876361534558, + "loss": 1.1032, + "step": 8902 + }, + { + "epoch": 0.7807198672565765, + "grad_norm": 0.050537109375, + "learning_rate": 0.0006668082173025985, + "loss": 1.1337, + "step": 8903 + }, + { + "epoch": 0.7808075590309511, + "grad_norm": 0.0458984375, + "learning_rate": 0.0006665288881162945, + "loss": 1.1529, + "step": 8904 + }, + { + "epoch": 0.7808952508053256, + "grad_norm": 0.059814453125, + "learning_rate": 0.0006662496486200163, + "loss": 1.1562, + "step": 8905 + }, + { + "epoch": 0.7809829425797001, + "grad_norm": 0.051513671875, + "learning_rate": 0.0006659704988392312, + "loss": 1.0984, + "step": 8906 + }, + { + "epoch": 0.7810706343540748, + "grad_norm": 0.043701171875, + "learning_rate": 0.000665691438799396, + "loss": 1.0548, + "step": 8907 + }, + { + "epoch": 0.7811583261284493, + "grad_norm": 0.04443359375, + "learning_rate": 0.0006654124685259604, + "loss": 1.1327, + "step": 8908 + }, + { + "epoch": 0.7812460179028238, + "grad_norm": 0.07275390625, + "learning_rate": 0.0006651335880443661, + "loss": 1.1161, + "step": 8909 + }, + { + "epoch": 0.7813337096771984, + "grad_norm": 0.046630859375, + "learning_rate": 0.0006648547973800464, + "loss": 1.1093, + "step": 8910 + }, + { + "epoch": 0.7814214014515729, + "grad_norm": 0.044677734375, + "learning_rate": 0.0006645760965584258, + "loss": 1.0665, + "step": 8911 + }, + { + "epoch": 0.7815090932259474, + "grad_norm": 0.04541015625, + "learning_rate": 0.0006642974856049214, + "loss": 1.1322, + "step": 8912 + }, + { + "epoch": 0.781596785000322, + "grad_norm": 0.04296875, + "learning_rate": 0.0006640189645449417, + "loss": 1.166, + "step": 8913 + }, + { + "epoch": 0.7816844767746965, + "grad_norm": 0.050537109375, + "learning_rate": 0.0006637405334038874, + "loss": 1.0959, + "step": 8914 + }, + { + "epoch": 0.7817721685490711, + "grad_norm": 0.05908203125, + "learning_rate": 0.0006634621922071501, + "loss": 1.1888, + "step": 8915 + }, + { + "epoch": 0.7818598603234457, + "grad_norm": 0.0478515625, + "learning_rate": 0.0006631839409801142, + "loss": 1.209, + "step": 8916 + }, + { + "epoch": 0.7819475520978202, + "grad_norm": 0.05712890625, + "learning_rate": 0.0006629057797481553, + "loss": 1.1704, + "step": 8917 + }, + { + "epoch": 0.7820352438721947, + "grad_norm": 0.041259765625, + "learning_rate": 0.0006626277085366413, + "loss": 1.0844, + "step": 8918 + }, + { + "epoch": 0.7821229356465693, + "grad_norm": 0.05029296875, + "learning_rate": 0.0006623497273709317, + "loss": 1.163, + "step": 8919 + }, + { + "epoch": 0.7822106274209438, + "grad_norm": 0.042724609375, + "learning_rate": 0.000662071836276377, + "loss": 1.2092, + "step": 8920 + }, + { + "epoch": 0.7822983191953183, + "grad_norm": 0.055908203125, + "learning_rate": 0.0006617940352783207, + "loss": 1.2079, + "step": 8921 + }, + { + "epoch": 0.7823860109696928, + "grad_norm": 0.0439453125, + "learning_rate": 0.0006615163244020972, + "loss": 1.1588, + "step": 8922 + }, + { + "epoch": 0.7824737027440675, + "grad_norm": 0.044921875, + "learning_rate": 0.000661238703673033, + "loss": 1.0975, + "step": 8923 + }, + { + "epoch": 0.782561394518442, + "grad_norm": 0.04736328125, + "learning_rate": 0.000660961173116447, + "loss": 1.1209, + "step": 8924 + }, + { + "epoch": 0.7826490862928165, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006606837327576484, + "loss": 1.0952, + "step": 8925 + }, + { + "epoch": 0.7827367780671911, + "grad_norm": 0.043212890625, + "learning_rate": 0.0006604063826219395, + "loss": 1.1019, + "step": 8926 + }, + { + "epoch": 0.7828244698415656, + "grad_norm": 0.048828125, + "learning_rate": 0.000660129122734614, + "loss": 1.0934, + "step": 8927 + }, + { + "epoch": 0.7829121616159401, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006598519531209573, + "loss": 1.1145, + "step": 8928 + }, + { + "epoch": 0.7829998533903147, + "grad_norm": 0.043701171875, + "learning_rate": 0.0006595748738062462, + "loss": 1.1256, + "step": 8929 + }, + { + "epoch": 0.7830875451646893, + "grad_norm": 0.04638671875, + "learning_rate": 0.0006592978848157497, + "loss": 1.1401, + "step": 8930 + }, + { + "epoch": 0.7831752369390638, + "grad_norm": 0.048095703125, + "learning_rate": 0.0006590209861747288, + "loss": 1.1675, + "step": 8931 + }, + { + "epoch": 0.7832629287134384, + "grad_norm": 0.04345703125, + "learning_rate": 0.0006587441779084362, + "loss": 1.1613, + "step": 8932 + }, + { + "epoch": 0.7833506204878129, + "grad_norm": 0.04345703125, + "learning_rate": 0.000658467460042115, + "loss": 1.1007, + "step": 8933 + }, + { + "epoch": 0.7834383122621874, + "grad_norm": 0.05029296875, + "learning_rate": 0.000658190832601002, + "loss": 1.129, + "step": 8934 + }, + { + "epoch": 0.783526004036562, + "grad_norm": 0.052001953125, + "learning_rate": 0.0006579142956103247, + "loss": 1.1367, + "step": 8935 + }, + { + "epoch": 0.7836136958109365, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006576378490953029, + "loss": 1.1572, + "step": 8936 + }, + { + "epoch": 0.783701387585311, + "grad_norm": 0.04296875, + "learning_rate": 0.0006573614930811473, + "loss": 1.12, + "step": 8937 + }, + { + "epoch": 0.7837890793596857, + "grad_norm": 0.04296875, + "learning_rate": 0.0006570852275930613, + "loss": 1.1414, + "step": 8938 + }, + { + "epoch": 0.7838767711340602, + "grad_norm": 0.04541015625, + "learning_rate": 0.0006568090526562389, + "loss": 1.1604, + "step": 8939 + }, + { + "epoch": 0.7839644629084347, + "grad_norm": 0.04736328125, + "learning_rate": 0.0006565329682958669, + "loss": 1.2276, + "step": 8940 + }, + { + "epoch": 0.7840521546828093, + "grad_norm": 0.04736328125, + "learning_rate": 0.0006562569745371243, + "loss": 1.1197, + "step": 8941 + }, + { + "epoch": 0.7841398464571838, + "grad_norm": 0.050048828125, + "learning_rate": 0.0006559810714051797, + "loss": 1.1756, + "step": 8942 + }, + { + "epoch": 0.7842275382315583, + "grad_norm": 0.044921875, + "learning_rate": 0.0006557052589251952, + "loss": 1.1851, + "step": 8943 + }, + { + "epoch": 0.784315230005933, + "grad_norm": 0.043701171875, + "learning_rate": 0.0006554295371223246, + "loss": 1.1202, + "step": 8944 + }, + { + "epoch": 0.7844029217803075, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006551539060217125, + "loss": 1.1427, + "step": 8945 + }, + { + "epoch": 0.784490613554682, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006548783656484966, + "loss": 1.116, + "step": 8946 + }, + { + "epoch": 0.7845783053290565, + "grad_norm": 0.044189453125, + "learning_rate": 0.0006546029160278043, + "loss": 1.1071, + "step": 8947 + }, + { + "epoch": 0.7846659971034311, + "grad_norm": 0.049072265625, + "learning_rate": 0.0006543275571847567, + "loss": 1.1113, + "step": 8948 + }, + { + "epoch": 0.7847536888778056, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006540522891444652, + "loss": 1.0965, + "step": 8949 + }, + { + "epoch": 0.7848413806521801, + "grad_norm": 0.0498046875, + "learning_rate": 0.0006537771119320345, + "loss": 1.1677, + "step": 8950 + }, + { + "epoch": 0.7849290724265547, + "grad_norm": 0.045166015625, + "learning_rate": 0.0006535020255725592, + "loss": 1.1401, + "step": 8951 + }, + { + "epoch": 0.7850167642009293, + "grad_norm": 0.064453125, + "learning_rate": 0.0006532270300911266, + "loss": 1.1321, + "step": 8952 + }, + { + "epoch": 0.7851044559753038, + "grad_norm": 0.044677734375, + "learning_rate": 0.0006529521255128155, + "loss": 1.1679, + "step": 8953 + }, + { + "epoch": 0.7851921477496784, + "grad_norm": 0.048583984375, + "learning_rate": 0.0006526773118626974, + "loss": 1.07, + "step": 8954 + }, + { + "epoch": 0.7852798395240529, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006524025891658337, + "loss": 1.1298, + "step": 8955 + }, + { + "epoch": 0.7853675312984274, + "grad_norm": 0.047607421875, + "learning_rate": 0.0006521279574472782, + "loss": 1.1264, + "step": 8956 + }, + { + "epoch": 0.785455223072802, + "grad_norm": 0.059326171875, + "learning_rate": 0.0006518534167320769, + "loss": 1.1025, + "step": 8957 + }, + { + "epoch": 0.7855429148471765, + "grad_norm": 0.05224609375, + "learning_rate": 0.0006515789670452675, + "loss": 1.1231, + "step": 8958 + }, + { + "epoch": 0.785630606621551, + "grad_norm": 0.04443359375, + "learning_rate": 0.0006513046084118792, + "loss": 1.1028, + "step": 8959 + }, + { + "epoch": 0.7857182983959257, + "grad_norm": 0.04833984375, + "learning_rate": 0.0006510303408569321, + "loss": 1.1045, + "step": 8960 + }, + { + "epoch": 0.7858059901703002, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006507561644054391, + "loss": 1.1486, + "step": 8961 + }, + { + "epoch": 0.7858936819446747, + "grad_norm": 0.056884765625, + "learning_rate": 0.0006504820790824042, + "loss": 1.1269, + "step": 8962 + }, + { + "epoch": 0.7859813737190493, + "grad_norm": 0.041748046875, + "learning_rate": 0.000650208084912824, + "loss": 1.1354, + "step": 8963 + }, + { + "epoch": 0.7860690654934238, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006499341819216853, + "loss": 1.1562, + "step": 8964 + }, + { + "epoch": 0.7861567572677983, + "grad_norm": 0.04443359375, + "learning_rate": 0.0006496603701339672, + "loss": 1.1993, + "step": 8965 + }, + { + "epoch": 0.7862444490421729, + "grad_norm": 0.044677734375, + "learning_rate": 0.0006493866495746412, + "loss": 1.0762, + "step": 8966 + }, + { + "epoch": 0.7863321408165475, + "grad_norm": 0.0478515625, + "learning_rate": 0.00064911302026867, + "loss": 1.1173, + "step": 8967 + }, + { + "epoch": 0.786419832590922, + "grad_norm": 0.047607421875, + "learning_rate": 0.0006488394822410072, + "loss": 1.1532, + "step": 8968 + }, + { + "epoch": 0.7865075243652965, + "grad_norm": 0.045166015625, + "learning_rate": 0.000648566035516599, + "loss": 1.0801, + "step": 8969 + }, + { + "epoch": 0.7865952161396711, + "grad_norm": 0.04443359375, + "learning_rate": 0.0006482926801203835, + "loss": 1.0817, + "step": 8970 + }, + { + "epoch": 0.7866829079140456, + "grad_norm": 0.044677734375, + "learning_rate": 0.0006480194160772894, + "loss": 1.1365, + "step": 8971 + }, + { + "epoch": 0.7867705996884201, + "grad_norm": 0.061767578125, + "learning_rate": 0.0006477462434122389, + "loss": 1.1035, + "step": 8972 + }, + { + "epoch": 0.7868582914627947, + "grad_norm": 0.0439453125, + "learning_rate": 0.0006474731621501429, + "loss": 1.0941, + "step": 8973 + }, + { + "epoch": 0.7869459832371692, + "grad_norm": 0.059326171875, + "learning_rate": 0.0006472001723159063, + "loss": 1.0893, + "step": 8974 + }, + { + "epoch": 0.7870336750115438, + "grad_norm": 0.04443359375, + "learning_rate": 0.0006469272739344254, + "loss": 1.1202, + "step": 8975 + }, + { + "epoch": 0.7871213667859184, + "grad_norm": 0.04638671875, + "learning_rate": 0.0006466544670305877, + "loss": 1.1172, + "step": 8976 + }, + { + "epoch": 0.7872090585602929, + "grad_norm": 0.0517578125, + "learning_rate": 0.0006463817516292729, + "loss": 1.1236, + "step": 8977 + }, + { + "epoch": 0.7872967503346674, + "grad_norm": 0.057861328125, + "learning_rate": 0.0006461091277553512, + "loss": 1.1162, + "step": 8978 + }, + { + "epoch": 0.787384442109042, + "grad_norm": 0.07421875, + "learning_rate": 0.0006458365954336854, + "loss": 1.1215, + "step": 8979 + }, + { + "epoch": 0.7874721338834165, + "grad_norm": 0.054443359375, + "learning_rate": 0.0006455641546891298, + "loss": 1.1223, + "step": 8980 + }, + { + "epoch": 0.787559825657791, + "grad_norm": 0.051025390625, + "learning_rate": 0.0006452918055465309, + "loss": 1.1076, + "step": 8981 + }, + { + "epoch": 0.7876475174321657, + "grad_norm": 0.053955078125, + "learning_rate": 0.0006450195480307253, + "loss": 1.1247, + "step": 8982 + }, + { + "epoch": 0.7877352092065402, + "grad_norm": 0.06201171875, + "learning_rate": 0.0006447473821665426, + "loss": 1.1358, + "step": 8983 + }, + { + "epoch": 0.7878229009809147, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006444753079788034, + "loss": 1.1315, + "step": 8984 + }, + { + "epoch": 0.7879105927552893, + "grad_norm": 0.06396484375, + "learning_rate": 0.0006442033254923209, + "loss": 1.1581, + "step": 8985 + }, + { + "epoch": 0.7879982845296638, + "grad_norm": 0.05419921875, + "learning_rate": 0.0006439314347318983, + "loss": 1.1199, + "step": 8986 + }, + { + "epoch": 0.7880859763040383, + "grad_norm": 0.044677734375, + "learning_rate": 0.0006436596357223316, + "loss": 1.1397, + "step": 8987 + }, + { + "epoch": 0.7881736680784129, + "grad_norm": 0.046142578125, + "learning_rate": 0.0006433879284884083, + "loss": 1.1308, + "step": 8988 + }, + { + "epoch": 0.7882613598527874, + "grad_norm": 0.0654296875, + "learning_rate": 0.0006431163130549077, + "loss": 1.1227, + "step": 8989 + }, + { + "epoch": 0.788349051627162, + "grad_norm": 0.056884765625, + "learning_rate": 0.0006428447894466003, + "loss": 1.1527, + "step": 8990 + }, + { + "epoch": 0.7884367434015366, + "grad_norm": 0.04345703125, + "learning_rate": 0.0006425733576882475, + "loss": 1.1154, + "step": 8991 + }, + { + "epoch": 0.7885244351759111, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006423020178046039, + "loss": 1.102, + "step": 8992 + }, + { + "epoch": 0.7886121269502856, + "grad_norm": 0.0498046875, + "learning_rate": 0.000642030769820415, + "loss": 1.1666, + "step": 8993 + }, + { + "epoch": 0.7886998187246601, + "grad_norm": 0.047119140625, + "learning_rate": 0.0006417596137604182, + "loss": 1.2063, + "step": 8994 + }, + { + "epoch": 0.7887875104990347, + "grad_norm": 0.044921875, + "learning_rate": 0.0006414885496493414, + "loss": 1.1439, + "step": 8995 + }, + { + "epoch": 0.7888752022734092, + "grad_norm": 0.048095703125, + "learning_rate": 0.0006412175775119055, + "loss": 1.1385, + "step": 8996 + }, + { + "epoch": 0.7889628940477837, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006409466973728223, + "loss": 1.0893, + "step": 8997 + }, + { + "epoch": 0.7890505858221584, + "grad_norm": 0.045166015625, + "learning_rate": 0.0006406759092567956, + "loss": 1.1224, + "step": 8998 + }, + { + "epoch": 0.7891382775965329, + "grad_norm": 0.050048828125, + "learning_rate": 0.0006404052131885206, + "loss": 1.164, + "step": 8999 + }, + { + "epoch": 0.7892259693709074, + "grad_norm": 0.043212890625, + "learning_rate": 0.0006401346091926837, + "loss": 1.1306, + "step": 9000 + }, + { + "epoch": 0.7892259693709074, + "eval_loss": 1.1407471895217896, + "eval_runtime": 428.9889, + "eval_samples_per_second": 33.677, + "eval_steps_per_second": 8.42, + "step": 9000 + }, + { + "epoch": 0.789313661145282, + "grad_norm": 0.04638671875, + "learning_rate": 0.0006398640972939633, + "loss": 1.1118, + "step": 9001 + }, + { + "epoch": 0.7894013529196565, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006395936775170298, + "loss": 1.117, + "step": 9002 + }, + { + "epoch": 0.789489044694031, + "grad_norm": 0.04296875, + "learning_rate": 0.0006393233498865447, + "loss": 1.1224, + "step": 9003 + }, + { + "epoch": 0.7895767364684056, + "grad_norm": 0.041259765625, + "learning_rate": 0.0006390531144271609, + "loss": 1.1093, + "step": 9004 + }, + { + "epoch": 0.7896644282427802, + "grad_norm": 0.060791015625, + "learning_rate": 0.0006387829711635231, + "loss": 1.187, + "step": 9005 + }, + { + "epoch": 0.7897521200171547, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006385129201202683, + "loss": 1.1548, + "step": 9006 + }, + { + "epoch": 0.7898398117915293, + "grad_norm": 0.044677734375, + "learning_rate": 0.0006382429613220237, + "loss": 1.095, + "step": 9007 + }, + { + "epoch": 0.7899275035659038, + "grad_norm": 0.0458984375, + "learning_rate": 0.0006379730947934097, + "loss": 1.1489, + "step": 9008 + }, + { + "epoch": 0.7900151953402783, + "grad_norm": 0.043212890625, + "learning_rate": 0.0006377033205590364, + "loss": 1.1344, + "step": 9009 + }, + { + "epoch": 0.7901028871146529, + "grad_norm": 0.04443359375, + "learning_rate": 0.0006374336386435071, + "loss": 1.1541, + "step": 9010 + }, + { + "epoch": 0.7901905788890274, + "grad_norm": 0.0458984375, + "learning_rate": 0.0006371640490714158, + "loss": 1.148, + "step": 9011 + }, + { + "epoch": 0.790278270663402, + "grad_norm": 0.054931640625, + "learning_rate": 0.0006368945518673493, + "loss": 1.1606, + "step": 9012 + }, + { + "epoch": 0.7903659624377766, + "grad_norm": 0.044921875, + "learning_rate": 0.0006366251470558838, + "loss": 1.1188, + "step": 9013 + }, + { + "epoch": 0.7904536542121511, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006363558346615891, + "loss": 1.1534, + "step": 9014 + }, + { + "epoch": 0.7905413459865256, + "grad_norm": 0.0517578125, + "learning_rate": 0.0006360866147090253, + "loss": 1.1252, + "step": 9015 + }, + { + "epoch": 0.7906290377609002, + "grad_norm": 0.048583984375, + "learning_rate": 0.0006358174872227453, + "loss": 1.1249, + "step": 9016 + }, + { + "epoch": 0.7907167295352747, + "grad_norm": 0.047607421875, + "learning_rate": 0.000635548452227292, + "loss": 1.1153, + "step": 9017 + }, + { + "epoch": 0.7908044213096492, + "grad_norm": 0.043212890625, + "learning_rate": 0.000635279509747201, + "loss": 1.0609, + "step": 9018 + }, + { + "epoch": 0.7908921130840237, + "grad_norm": 0.04345703125, + "learning_rate": 0.0006350106598069993, + "loss": 1.1268, + "step": 9019 + }, + { + "epoch": 0.7909798048583984, + "grad_norm": 0.04736328125, + "learning_rate": 0.0006347419024312056, + "loss": 1.1, + "step": 9020 + }, + { + "epoch": 0.7910674966327729, + "grad_norm": 0.047607421875, + "learning_rate": 0.0006344732376443292, + "loss": 1.2022, + "step": 9021 + }, + { + "epoch": 0.7911551884071474, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006342046654708717, + "loss": 1.0776, + "step": 9022 + }, + { + "epoch": 0.791242880181522, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006339361859353271, + "loss": 1.1075, + "step": 9023 + }, + { + "epoch": 0.7913305719558965, + "grad_norm": 0.04638671875, + "learning_rate": 0.0006336677990621791, + "loss": 1.0695, + "step": 9024 + }, + { + "epoch": 0.791418263730271, + "grad_norm": 0.043212890625, + "learning_rate": 0.0006333995048759043, + "loss": 1.1209, + "step": 9025 + }, + { + "epoch": 0.7915059555046456, + "grad_norm": 0.044677734375, + "learning_rate": 0.00063313130340097, + "loss": 1.1308, + "step": 9026 + }, + { + "epoch": 0.7915936472790202, + "grad_norm": 0.05517578125, + "learning_rate": 0.0006328631946618359, + "loss": 1.166, + "step": 9027 + }, + { + "epoch": 0.7916813390533947, + "grad_norm": 0.04638671875, + "learning_rate": 0.0006325951786829526, + "loss": 1.1079, + "step": 9028 + }, + { + "epoch": 0.7917690308277693, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006323272554887625, + "loss": 1.1442, + "step": 9029 + }, + { + "epoch": 0.7918567226021438, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006320594251037003, + "loss": 1.1177, + "step": 9030 + }, + { + "epoch": 0.7919444143765183, + "grad_norm": 0.049072265625, + "learning_rate": 0.0006317916875521901, + "loss": 1.1549, + "step": 9031 + }, + { + "epoch": 0.7920321061508929, + "grad_norm": 0.044921875, + "learning_rate": 0.0006315240428586495, + "loss": 1.1457, + "step": 9032 + }, + { + "epoch": 0.7921197979252674, + "grad_norm": 0.04345703125, + "learning_rate": 0.0006312564910474871, + "loss": 1.1217, + "step": 9033 + }, + { + "epoch": 0.7922074896996419, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006309890321431033, + "loss": 1.143, + "step": 9034 + }, + { + "epoch": 0.7922951814740166, + "grad_norm": 0.045654296875, + "learning_rate": 0.0006307216661698887, + "loss": 1.1722, + "step": 9035 + }, + { + "epoch": 0.7923828732483911, + "grad_norm": 0.05517578125, + "learning_rate": 0.0006304543931522268, + "loss": 1.1168, + "step": 9036 + }, + { + "epoch": 0.7924705650227656, + "grad_norm": 0.04296875, + "learning_rate": 0.0006301872131144926, + "loss": 1.1261, + "step": 9037 + }, + { + "epoch": 0.7925582567971402, + "grad_norm": 0.048583984375, + "learning_rate": 0.0006299201260810521, + "loss": 1.0978, + "step": 9038 + }, + { + "epoch": 0.7926459485715147, + "grad_norm": 0.0478515625, + "learning_rate": 0.0006296531320762623, + "loss": 1.1366, + "step": 9039 + }, + { + "epoch": 0.7927336403458892, + "grad_norm": 0.052978515625, + "learning_rate": 0.0006293862311244734, + "loss": 1.1998, + "step": 9040 + }, + { + "epoch": 0.7928213321202637, + "grad_norm": 0.043701171875, + "learning_rate": 0.0006291194232500251, + "loss": 1.1143, + "step": 9041 + }, + { + "epoch": 0.7929090238946384, + "grad_norm": 0.050048828125, + "learning_rate": 0.0006288527084772498, + "loss": 1.1374, + "step": 9042 + }, + { + "epoch": 0.7929967156690129, + "grad_norm": 0.04345703125, + "learning_rate": 0.0006285860868304723, + "loss": 1.1488, + "step": 9043 + }, + { + "epoch": 0.7930844074433874, + "grad_norm": 0.042724609375, + "learning_rate": 0.000628319558334006, + "loss": 1.09, + "step": 9044 + }, + { + "epoch": 0.793172099217762, + "grad_norm": 0.0419921875, + "learning_rate": 0.0006280531230121588, + "loss": 1.1228, + "step": 9045 + }, + { + "epoch": 0.7932597909921365, + "grad_norm": 0.0419921875, + "learning_rate": 0.0006277867808892285, + "loss": 1.1171, + "step": 9046 + }, + { + "epoch": 0.793347482766511, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006275205319895055, + "loss": 1.0605, + "step": 9047 + }, + { + "epoch": 0.7934351745408856, + "grad_norm": 0.044189453125, + "learning_rate": 0.00062725437633727, + "loss": 1.1171, + "step": 9048 + }, + { + "epoch": 0.7935228663152601, + "grad_norm": 0.0458984375, + "learning_rate": 0.0006269883139567951, + "loss": 1.1789, + "step": 9049 + }, + { + "epoch": 0.7936105580896347, + "grad_norm": 0.047119140625, + "learning_rate": 0.0006267223448723452, + "loss": 1.1001, + "step": 9050 + }, + { + "epoch": 0.7936982498640093, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006264564691081762, + "loss": 1.1068, + "step": 9051 + }, + { + "epoch": 0.7937859416383838, + "grad_norm": 0.044189453125, + "learning_rate": 0.0006261906866885346, + "loss": 1.0892, + "step": 9052 + }, + { + "epoch": 0.7938736334127583, + "grad_norm": 0.047607421875, + "learning_rate": 0.0006259249976376594, + "loss": 1.0549, + "step": 9053 + }, + { + "epoch": 0.7939613251871329, + "grad_norm": 0.05029296875, + "learning_rate": 0.0006256594019797808, + "loss": 1.1802, + "step": 9054 + }, + { + "epoch": 0.7940490169615074, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006253938997391204, + "loss": 1.1563, + "step": 9055 + }, + { + "epoch": 0.7941367087358819, + "grad_norm": 0.04931640625, + "learning_rate": 0.0006251284909398918, + "loss": 1.1438, + "step": 9056 + }, + { + "epoch": 0.7942244005102566, + "grad_norm": 0.0537109375, + "learning_rate": 0.000624863175606299, + "loss": 1.0949, + "step": 9057 + }, + { + "epoch": 0.7943120922846311, + "grad_norm": 0.0439453125, + "learning_rate": 0.0006245979537625377, + "loss": 1.1091, + "step": 9058 + }, + { + "epoch": 0.7943997840590056, + "grad_norm": 0.0556640625, + "learning_rate": 0.0006243328254327962, + "loss": 1.1325, + "step": 9059 + }, + { + "epoch": 0.7944874758333802, + "grad_norm": 0.05419921875, + "learning_rate": 0.0006240677906412533, + "loss": 1.1239, + "step": 9060 + }, + { + "epoch": 0.7945751676077547, + "grad_norm": 0.046630859375, + "learning_rate": 0.0006238028494120798, + "loss": 1.1056, + "step": 9061 + }, + { + "epoch": 0.7946628593821292, + "grad_norm": 0.041259765625, + "learning_rate": 0.0006235380017694367, + "loss": 1.0702, + "step": 9062 + }, + { + "epoch": 0.7947505511565038, + "grad_norm": 0.04736328125, + "learning_rate": 0.0006232732477374782, + "loss": 1.1675, + "step": 9063 + }, + { + "epoch": 0.7948382429308783, + "grad_norm": 0.044189453125, + "learning_rate": 0.0006230085873403491, + "loss": 1.1573, + "step": 9064 + }, + { + "epoch": 0.7949259347052529, + "grad_norm": 0.055908203125, + "learning_rate": 0.000622744020602186, + "loss": 1.0541, + "step": 9065 + }, + { + "epoch": 0.7950136264796274, + "grad_norm": 0.061279296875, + "learning_rate": 0.0006224795475471159, + "loss": 1.1356, + "step": 9066 + }, + { + "epoch": 0.795101318254002, + "grad_norm": 0.051513671875, + "learning_rate": 0.0006222151681992587, + "loss": 1.0819, + "step": 9067 + }, + { + "epoch": 0.7951890100283765, + "grad_norm": 0.050537109375, + "learning_rate": 0.0006219508825827249, + "loss": 1.116, + "step": 9068 + }, + { + "epoch": 0.795276701802751, + "grad_norm": 0.04248046875, + "learning_rate": 0.0006216866907216171, + "loss": 1.1154, + "step": 9069 + }, + { + "epoch": 0.7953643935771256, + "grad_norm": 0.0654296875, + "learning_rate": 0.0006214225926400281, + "loss": 1.1331, + "step": 9070 + }, + { + "epoch": 0.7954520853515001, + "grad_norm": 0.08056640625, + "learning_rate": 0.0006211585883620435, + "loss": 1.1331, + "step": 9071 + }, + { + "epoch": 0.7955397771258746, + "grad_norm": 0.046630859375, + "learning_rate": 0.0006208946779117395, + "loss": 1.1622, + "step": 9072 + }, + { + "epoch": 0.7956274689002493, + "grad_norm": 0.052978515625, + "learning_rate": 0.0006206308613131848, + "loss": 1.2091, + "step": 9073 + }, + { + "epoch": 0.7957151606746238, + "grad_norm": 0.07275390625, + "learning_rate": 0.0006203671385904381, + "loss": 1.1037, + "step": 9074 + }, + { + "epoch": 0.7958028524489983, + "grad_norm": 0.0712890625, + "learning_rate": 0.0006201035097675501, + "loss": 1.1571, + "step": 9075 + }, + { + "epoch": 0.7958905442233729, + "grad_norm": 0.07080078125, + "learning_rate": 0.0006198399748685633, + "loss": 1.1815, + "step": 9076 + }, + { + "epoch": 0.7959782359977474, + "grad_norm": 0.04638671875, + "learning_rate": 0.0006195765339175115, + "loss": 1.1223, + "step": 9077 + }, + { + "epoch": 0.7960659277721219, + "grad_norm": 0.046875, + "learning_rate": 0.0006193131869384202, + "loss": 1.2083, + "step": 9078 + }, + { + "epoch": 0.7961536195464965, + "grad_norm": 0.0439453125, + "learning_rate": 0.0006190499339553051, + "loss": 1.1192, + "step": 9079 + }, + { + "epoch": 0.7962413113208711, + "grad_norm": 0.07275390625, + "learning_rate": 0.0006187867749921746, + "loss": 1.1091, + "step": 9080 + }, + { + "epoch": 0.7963290030952456, + "grad_norm": 0.05126953125, + "learning_rate": 0.0006185237100730281, + "loss": 1.1815, + "step": 9081 + }, + { + "epoch": 0.7964166948696202, + "grad_norm": 0.05810546875, + "learning_rate": 0.0006182607392218566, + "loss": 1.1268, + "step": 9082 + }, + { + "epoch": 0.7965043866439947, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006179978624626427, + "loss": 1.0693, + "step": 9083 + }, + { + "epoch": 0.7965920784183692, + "grad_norm": 0.0458984375, + "learning_rate": 0.0006177350798193591, + "loss": 1.1332, + "step": 9084 + }, + { + "epoch": 0.7966797701927438, + "grad_norm": 0.052734375, + "learning_rate": 0.0006174723913159717, + "loss": 1.1883, + "step": 9085 + }, + { + "epoch": 0.7967674619671183, + "grad_norm": 0.045166015625, + "learning_rate": 0.0006172097969764366, + "loss": 1.1484, + "step": 9086 + }, + { + "epoch": 0.7968551537414928, + "grad_norm": 0.041259765625, + "learning_rate": 0.0006169472968247023, + "loss": 1.1208, + "step": 9087 + }, + { + "epoch": 0.7969428455158675, + "grad_norm": 0.055908203125, + "learning_rate": 0.0006166848908847073, + "loss": 1.1419, + "step": 9088 + }, + { + "epoch": 0.797030537290242, + "grad_norm": 0.044677734375, + "learning_rate": 0.0006164225791803828, + "loss": 1.1159, + "step": 9089 + }, + { + "epoch": 0.7971182290646165, + "grad_norm": 0.043701171875, + "learning_rate": 0.0006161603617356508, + "loss": 1.0835, + "step": 9090 + }, + { + "epoch": 0.797205920838991, + "grad_norm": 0.04296875, + "learning_rate": 0.0006158982385744254, + "loss": 1.0899, + "step": 9091 + }, + { + "epoch": 0.7972936126133656, + "grad_norm": 0.0498046875, + "learning_rate": 0.000615636209720611, + "loss": 1.2, + "step": 9092 + }, + { + "epoch": 0.7973813043877401, + "grad_norm": 0.04296875, + "learning_rate": 0.000615374275198104, + "loss": 1.1592, + "step": 9093 + }, + { + "epoch": 0.7974689961621146, + "grad_norm": 0.051025390625, + "learning_rate": 0.0006151124350307921, + "loss": 1.1291, + "step": 9094 + }, + { + "epoch": 0.7975566879364893, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006148506892425545, + "loss": 1.0863, + "step": 9095 + }, + { + "epoch": 0.7976443797108638, + "grad_norm": 0.0439453125, + "learning_rate": 0.0006145890378572623, + "loss": 1.0934, + "step": 9096 + }, + { + "epoch": 0.7977320714852383, + "grad_norm": 0.046630859375, + "learning_rate": 0.0006143274808987765, + "loss": 1.1264, + "step": 9097 + }, + { + "epoch": 0.7978197632596129, + "grad_norm": 0.044189453125, + "learning_rate": 0.0006140660183909508, + "loss": 1.1944, + "step": 9098 + }, + { + "epoch": 0.7979074550339874, + "grad_norm": 0.055908203125, + "learning_rate": 0.0006138046503576302, + "loss": 1.0747, + "step": 9099 + }, + { + "epoch": 0.7979951468083619, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006135433768226507, + "loss": 1.098, + "step": 9100 + }, + { + "epoch": 0.7980828385827365, + "grad_norm": 0.04296875, + "learning_rate": 0.0006132821978098393, + "loss": 1.1361, + "step": 9101 + }, + { + "epoch": 0.798170530357111, + "grad_norm": 0.043212890625, + "learning_rate": 0.0006130211133430153, + "loss": 1.1153, + "step": 9102 + }, + { + "epoch": 0.7982582221314856, + "grad_norm": 0.045166015625, + "learning_rate": 0.0006127601234459888, + "loss": 1.1259, + "step": 9103 + }, + { + "epoch": 0.7983459139058602, + "grad_norm": 0.044677734375, + "learning_rate": 0.0006124992281425617, + "loss": 1.162, + "step": 9104 + }, + { + "epoch": 0.7984336056802347, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006122384274565263, + "loss": 1.1153, + "step": 9105 + }, + { + "epoch": 0.7985212974546092, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006119777214116676, + "loss": 1.0784, + "step": 9106 + }, + { + "epoch": 0.7986089892289838, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006117171100317608, + "loss": 1.1493, + "step": 9107 + }, + { + "epoch": 0.7986966810033583, + "grad_norm": 0.04736328125, + "learning_rate": 0.0006114565933405736, + "loss": 1.1809, + "step": 9108 + }, + { + "epoch": 0.7987843727777328, + "grad_norm": 0.04541015625, + "learning_rate": 0.000611196171361864, + "loss": 1.1241, + "step": 9109 + }, + { + "epoch": 0.7988720645521075, + "grad_norm": 0.048095703125, + "learning_rate": 0.0006109358441193821, + "loss": 1.0945, + "step": 9110 + }, + { + "epoch": 0.798959756326482, + "grad_norm": 0.041259765625, + "learning_rate": 0.0006106756116368685, + "loss": 1.1021, + "step": 9111 + }, + { + "epoch": 0.7990474481008565, + "grad_norm": 0.049072265625, + "learning_rate": 0.0006104154739380562, + "loss": 1.12, + "step": 9112 + }, + { + "epoch": 0.799135139875231, + "grad_norm": 0.046875, + "learning_rate": 0.0006101554310466689, + "loss": 1.1035, + "step": 9113 + }, + { + "epoch": 0.7992228316496056, + "grad_norm": 0.04541015625, + "learning_rate": 0.0006098954829864224, + "loss": 1.1626, + "step": 9114 + }, + { + "epoch": 0.7993105234239801, + "grad_norm": 0.046875, + "learning_rate": 0.0006096356297810224, + "loss": 1.0878, + "step": 9115 + }, + { + "epoch": 0.7993982151983546, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006093758714541674, + "loss": 1.149, + "step": 9116 + }, + { + "epoch": 0.7994859069727293, + "grad_norm": 0.0458984375, + "learning_rate": 0.0006091162080295463, + "loss": 1.2016, + "step": 9117 + }, + { + "epoch": 0.7995735987471038, + "grad_norm": 0.04443359375, + "learning_rate": 0.0006088566395308407, + "loss": 1.1447, + "step": 9118 + }, + { + "epoch": 0.7996612905214783, + "grad_norm": 0.048828125, + "learning_rate": 0.0006085971659817213, + "loss": 1.1134, + "step": 9119 + }, + { + "epoch": 0.7997489822958529, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006083377874058521, + "loss": 1.1025, + "step": 9120 + }, + { + "epoch": 0.7998366740702274, + "grad_norm": 0.045166015625, + "learning_rate": 0.0006080785038268876, + "loss": 1.1129, + "step": 9121 + }, + { + "epoch": 0.7999243658446019, + "grad_norm": 0.052734375, + "learning_rate": 0.0006078193152684742, + "loss": 1.1528, + "step": 9122 + }, + { + "epoch": 0.8000120576189765, + "grad_norm": 0.05126953125, + "learning_rate": 0.0006075602217542485, + "loss": 1.1656, + "step": 9123 + }, + { + "epoch": 0.800099749393351, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006073012233078396, + "loss": 1.1705, + "step": 9124 + }, + { + "epoch": 0.8001874411677256, + "grad_norm": 0.044189453125, + "learning_rate": 0.0006070423199528677, + "loss": 1.0995, + "step": 9125 + }, + { + "epoch": 0.8002751329421002, + "grad_norm": 0.059814453125, + "learning_rate": 0.0006067835117129436, + "loss": 1.1841, + "step": 9126 + }, + { + "epoch": 0.8003628247164747, + "grad_norm": 0.058349609375, + "learning_rate": 0.0006065247986116705, + "loss": 1.1183, + "step": 9127 + }, + { + "epoch": 0.8004505164908492, + "grad_norm": 0.04638671875, + "learning_rate": 0.0006062661806726416, + "loss": 1.1949, + "step": 9128 + }, + { + "epoch": 0.8005382082652238, + "grad_norm": 0.05908203125, + "learning_rate": 0.0006060076579194428, + "loss": 1.1408, + "step": 9129 + }, + { + "epoch": 0.8006259000395983, + "grad_norm": 0.059814453125, + "learning_rate": 0.0006057492303756503, + "loss": 1.1182, + "step": 9130 + }, + { + "epoch": 0.8007135918139728, + "grad_norm": 0.048095703125, + "learning_rate": 0.0006054908980648327, + "loss": 1.1501, + "step": 9131 + }, + { + "epoch": 0.8008012835883475, + "grad_norm": 0.043212890625, + "learning_rate": 0.0006052326610105486, + "loss": 1.1055, + "step": 9132 + }, + { + "epoch": 0.800888975362722, + "grad_norm": 0.050537109375, + "learning_rate": 0.0006049745192363486, + "loss": 1.1328, + "step": 9133 + }, + { + "epoch": 0.8009766671370965, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006047164727657749, + "loss": 1.0939, + "step": 9134 + }, + { + "epoch": 0.8010643589114711, + "grad_norm": 0.049072265625, + "learning_rate": 0.0006044585216223603, + "loss": 1.1942, + "step": 9135 + }, + { + "epoch": 0.8011520506858456, + "grad_norm": 0.04443359375, + "learning_rate": 0.00060420066582963, + "loss": 1.1072, + "step": 9136 + }, + { + "epoch": 0.8012397424602201, + "grad_norm": 0.06640625, + "learning_rate": 0.0006039429054110987, + "loss": 1.1332, + "step": 9137 + }, + { + "epoch": 0.8013274342345946, + "grad_norm": 0.04345703125, + "learning_rate": 0.0006036852403902743, + "loss": 1.0691, + "step": 9138 + }, + { + "epoch": 0.8014151260089692, + "grad_norm": 0.043212890625, + "learning_rate": 0.0006034276707906548, + "loss": 1.1077, + "step": 9139 + }, + { + "epoch": 0.8015028177833438, + "grad_norm": 0.061279296875, + "learning_rate": 0.0006031701966357304, + "loss": 1.1056, + "step": 9140 + }, + { + "epoch": 0.8015905095577183, + "grad_norm": 0.04736328125, + "learning_rate": 0.0006029128179489813, + "loss": 1.1231, + "step": 9141 + }, + { + "epoch": 0.8016782013320929, + "grad_norm": 0.051513671875, + "learning_rate": 0.0006026555347538806, + "loss": 1.1495, + "step": 9142 + }, + { + "epoch": 0.8017658931064674, + "grad_norm": 0.0458984375, + "learning_rate": 0.000602398347073891, + "loss": 1.1651, + "step": 9143 + }, + { + "epoch": 0.8018535848808419, + "grad_norm": 0.051513671875, + "learning_rate": 0.0006021412549324677, + "loss": 1.0998, + "step": 9144 + }, + { + "epoch": 0.8019412766552165, + "grad_norm": 0.04443359375, + "learning_rate": 0.0006018842583530574, + "loss": 1.1035, + "step": 9145 + }, + { + "epoch": 0.802028968429591, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006016273573590966, + "loss": 1.1187, + "step": 9146 + }, + { + "epoch": 0.8021166602039655, + "grad_norm": 0.047607421875, + "learning_rate": 0.0006013705519740145, + "loss": 1.1263, + "step": 9147 + }, + { + "epoch": 0.8022043519783402, + "grad_norm": 0.0458984375, + "learning_rate": 0.0006011138422212309, + "loss": 1.1447, + "step": 9148 + }, + { + "epoch": 0.8022920437527147, + "grad_norm": 0.043212890625, + "learning_rate": 0.0006008572281241578, + "loss": 1.1089, + "step": 9149 + }, + { + "epoch": 0.8023797355270892, + "grad_norm": 0.044677734375, + "learning_rate": 0.0006006007097061967, + "loss": 1.1459, + "step": 9150 + }, + { + "epoch": 0.8024674273014638, + "grad_norm": 0.0654296875, + "learning_rate": 0.0006003442869907417, + "loss": 1.1257, + "step": 9151 + }, + { + "epoch": 0.8025551190758383, + "grad_norm": 0.044189453125, + "learning_rate": 0.0006000879600011782, + "loss": 1.1322, + "step": 9152 + }, + { + "epoch": 0.8026428108502128, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005998317287608828, + "loss": 1.1317, + "step": 9153 + }, + { + "epoch": 0.8027305026245874, + "grad_norm": 0.048828125, + "learning_rate": 0.0005995755932932224, + "loss": 1.0847, + "step": 9154 + }, + { + "epoch": 0.802818194398962, + "grad_norm": 0.04443359375, + "learning_rate": 0.000599319553621556, + "loss": 1.1207, + "step": 9155 + }, + { + "epoch": 0.8029058861733365, + "grad_norm": 0.058349609375, + "learning_rate": 0.0005990636097692343, + "loss": 1.154, + "step": 9156 + }, + { + "epoch": 0.8029935779477111, + "grad_norm": 0.046142578125, + "learning_rate": 0.0005988077617595986, + "loss": 1.1267, + "step": 9157 + }, + { + "epoch": 0.8030812697220856, + "grad_norm": 0.056640625, + "learning_rate": 0.0005985520096159812, + "loss": 1.1921, + "step": 9158 + }, + { + "epoch": 0.8031689614964601, + "grad_norm": 0.06005859375, + "learning_rate": 0.0005982963533617064, + "loss": 1.1132, + "step": 9159 + }, + { + "epoch": 0.8032566532708346, + "grad_norm": 0.04736328125, + "learning_rate": 0.0005980407930200888, + "loss": 1.0891, + "step": 9160 + }, + { + "epoch": 0.8033443450452092, + "grad_norm": 0.04345703125, + "learning_rate": 0.0005977853286144353, + "loss": 1.0859, + "step": 9161 + }, + { + "epoch": 0.8034320368195838, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005975299601680437, + "loss": 1.1116, + "step": 9162 + }, + { + "epoch": 0.8035197285939583, + "grad_norm": 0.0498046875, + "learning_rate": 0.000597274687704203, + "loss": 1.1149, + "step": 9163 + }, + { + "epoch": 0.8036074203683329, + "grad_norm": 0.06982421875, + "learning_rate": 0.0005970195112461929, + "loss": 1.0943, + "step": 9164 + }, + { + "epoch": 0.8036951121427074, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005967644308172852, + "loss": 1.1193, + "step": 9165 + }, + { + "epoch": 0.8037828039170819, + "grad_norm": 0.04296875, + "learning_rate": 0.0005965094464407424, + "loss": 1.1208, + "step": 9166 + }, + { + "epoch": 0.8038704956914565, + "grad_norm": 0.040283203125, + "learning_rate": 0.0005962545581398189, + "loss": 1.0602, + "step": 9167 + }, + { + "epoch": 0.803958187465831, + "grad_norm": 0.043212890625, + "learning_rate": 0.000595999765937759, + "loss": 1.1482, + "step": 9168 + }, + { + "epoch": 0.8040458792402055, + "grad_norm": 0.0458984375, + "learning_rate": 0.0005957450698577997, + "loss": 1.1856, + "step": 9169 + }, + { + "epoch": 0.8041335710145802, + "grad_norm": 0.0419921875, + "learning_rate": 0.0005954904699231684, + "loss": 1.0747, + "step": 9170 + }, + { + "epoch": 0.8042212627889547, + "grad_norm": 0.052734375, + "learning_rate": 0.0005952359661570845, + "loss": 1.1007, + "step": 9171 + }, + { + "epoch": 0.8043089545633292, + "grad_norm": 0.04443359375, + "learning_rate": 0.0005949815585827572, + "loss": 1.1138, + "step": 9172 + }, + { + "epoch": 0.8043966463377038, + "grad_norm": 0.049072265625, + "learning_rate": 0.0005947272472233882, + "loss": 1.1271, + "step": 9173 + }, + { + "epoch": 0.8044843381120783, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005944730321021701, + "loss": 1.1044, + "step": 9174 + }, + { + "epoch": 0.8045720298864528, + "grad_norm": 0.0458984375, + "learning_rate": 0.0005942189132422871, + "loss": 1.1568, + "step": 9175 + }, + { + "epoch": 0.8046597216608274, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005939648906669137, + "loss": 1.0934, + "step": 9176 + }, + { + "epoch": 0.804747413435202, + "grad_norm": 0.0517578125, + "learning_rate": 0.0005937109643992157, + "loss": 1.1795, + "step": 9177 + }, + { + "epoch": 0.8048351052095765, + "grad_norm": 0.04638671875, + "learning_rate": 0.0005934571344623509, + "loss": 1.0788, + "step": 9178 + }, + { + "epoch": 0.8049227969839511, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005932034008794681, + "loss": 1.109, + "step": 9179 + }, + { + "epoch": 0.8050104887583256, + "grad_norm": 0.046875, + "learning_rate": 0.0005929497636737073, + "loss": 1.1257, + "step": 9180 + }, + { + "epoch": 0.8050981805327001, + "grad_norm": 0.0458984375, + "learning_rate": 0.000592696222868199, + "loss": 1.1558, + "step": 9181 + }, + { + "epoch": 0.8051858723070747, + "grad_norm": 0.04638671875, + "learning_rate": 0.0005924427784860657, + "loss": 1.1571, + "step": 9182 + }, + { + "epoch": 0.8052735640814492, + "grad_norm": 0.05615234375, + "learning_rate": 0.000592189430550421, + "loss": 1.1009, + "step": 9183 + }, + { + "epoch": 0.8053612558558237, + "grad_norm": 0.050048828125, + "learning_rate": 0.0005919361790843698, + "loss": 1.1374, + "step": 9184 + }, + { + "epoch": 0.8054489476301983, + "grad_norm": 0.052734375, + "learning_rate": 0.0005916830241110074, + "loss": 1.1207, + "step": 9185 + }, + { + "epoch": 0.8055366394045729, + "grad_norm": 0.04736328125, + "learning_rate": 0.0005914299656534211, + "loss": 1.1193, + "step": 9186 + }, + { + "epoch": 0.8056243311789474, + "grad_norm": 0.0419921875, + "learning_rate": 0.0005911770037346893, + "loss": 1.1663, + "step": 9187 + }, + { + "epoch": 0.8057120229533219, + "grad_norm": 0.048095703125, + "learning_rate": 0.0005909241383778814, + "loss": 1.1316, + "step": 9188 + }, + { + "epoch": 0.8057997147276965, + "grad_norm": 0.06298828125, + "learning_rate": 0.0005906713696060585, + "loss": 1.1531, + "step": 9189 + }, + { + "epoch": 0.805887406502071, + "grad_norm": 0.04296875, + "learning_rate": 0.0005904186974422717, + "loss": 1.0923, + "step": 9190 + }, + { + "epoch": 0.8059750982764455, + "grad_norm": 0.062255859375, + "learning_rate": 0.0005901661219095644, + "loss": 1.1188, + "step": 9191 + }, + { + "epoch": 0.8060627900508202, + "grad_norm": 0.072265625, + "learning_rate": 0.0005899136430309711, + "loss": 1.124, + "step": 9192 + }, + { + "epoch": 0.8061504818251947, + "grad_norm": 0.052978515625, + "learning_rate": 0.0005896612608295167, + "loss": 1.1534, + "step": 9193 + }, + { + "epoch": 0.8062381735995692, + "grad_norm": 0.0712890625, + "learning_rate": 0.0005894089753282184, + "loss": 1.0778, + "step": 9194 + }, + { + "epoch": 0.8063258653739438, + "grad_norm": 0.0595703125, + "learning_rate": 0.0005891567865500832, + "loss": 1.0945, + "step": 9195 + }, + { + "epoch": 0.8064135571483183, + "grad_norm": 0.045654296875, + "learning_rate": 0.0005889046945181106, + "loss": 1.1654, + "step": 9196 + }, + { + "epoch": 0.8065012489226928, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005886526992552904, + "loss": 1.1498, + "step": 9197 + }, + { + "epoch": 0.8065889406970674, + "grad_norm": 0.044189453125, + "learning_rate": 0.000588400800784605, + "loss": 1.1059, + "step": 9198 + }, + { + "epoch": 0.806676632471442, + "grad_norm": 0.0458984375, + "learning_rate": 0.0005881489991290252, + "loss": 1.1426, + "step": 9199 + }, + { + "epoch": 0.8067643242458165, + "grad_norm": 0.041748046875, + "learning_rate": 0.0005878972943115159, + "loss": 1.1368, + "step": 9200 + }, + { + "epoch": 0.8068520160201911, + "grad_norm": 0.046630859375, + "learning_rate": 0.0005876456863550315, + "loss": 1.1687, + "step": 9201 + }, + { + "epoch": 0.8069397077945656, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005873941752825183, + "loss": 1.1419, + "step": 9202 + }, + { + "epoch": 0.8070273995689401, + "grad_norm": 0.042724609375, + "learning_rate": 0.000587142761116913, + "loss": 1.0956, + "step": 9203 + }, + { + "epoch": 0.8071150913433147, + "grad_norm": 0.049560546875, + "learning_rate": 0.000586891443881144, + "loss": 1.0782, + "step": 9204 + }, + { + "epoch": 0.8072027831176892, + "grad_norm": 0.051025390625, + "learning_rate": 0.0005866402235981309, + "loss": 1.1099, + "step": 9205 + }, + { + "epoch": 0.8072904748920637, + "grad_norm": 0.048828125, + "learning_rate": 0.0005863891002907849, + "loss": 1.1006, + "step": 9206 + }, + { + "epoch": 0.8073781666664384, + "grad_norm": 0.051513671875, + "learning_rate": 0.0005861380739820065, + "loss": 1.1861, + "step": 9207 + }, + { + "epoch": 0.8074658584408129, + "grad_norm": 0.05322265625, + "learning_rate": 0.0005858871446946897, + "loss": 1.0721, + "step": 9208 + }, + { + "epoch": 0.8075535502151874, + "grad_norm": 0.044189453125, + "learning_rate": 0.0005856363124517189, + "loss": 1.1253, + "step": 9209 + }, + { + "epoch": 0.8076412419895619, + "grad_norm": 0.05517578125, + "learning_rate": 0.0005853855772759681, + "loss": 1.124, + "step": 9210 + }, + { + "epoch": 0.8077289337639365, + "grad_norm": 0.049072265625, + "learning_rate": 0.0005851349391903046, + "loss": 1.0994, + "step": 9211 + }, + { + "epoch": 0.807816625538311, + "grad_norm": 0.044921875, + "learning_rate": 0.0005848843982175857, + "loss": 1.1379, + "step": 9212 + }, + { + "epoch": 0.8079043173126855, + "grad_norm": 0.03955078125, + "learning_rate": 0.0005846339543806599, + "loss": 1.1196, + "step": 9213 + }, + { + "epoch": 0.8079920090870601, + "grad_norm": 0.0419921875, + "learning_rate": 0.0005843836077023671, + "loss": 1.0609, + "step": 9214 + }, + { + "epoch": 0.8080797008614347, + "grad_norm": 0.049560546875, + "learning_rate": 0.0005841333582055387, + "loss": 1.0578, + "step": 9215 + }, + { + "epoch": 0.8081673926358092, + "grad_norm": 0.0458984375, + "learning_rate": 0.0005838832059129967, + "loss": 1.0787, + "step": 9216 + }, + { + "epoch": 0.8082550844101838, + "grad_norm": 0.0498046875, + "learning_rate": 0.0005836331508475537, + "loss": 1.0827, + "step": 9217 + }, + { + "epoch": 0.8083427761845583, + "grad_norm": 0.043701171875, + "learning_rate": 0.0005833831930320147, + "loss": 1.0956, + "step": 9218 + }, + { + "epoch": 0.8084304679589328, + "grad_norm": 0.044189453125, + "learning_rate": 0.0005831333324891751, + "loss": 1.1561, + "step": 9219 + }, + { + "epoch": 0.8085181597333074, + "grad_norm": 0.05615234375, + "learning_rate": 0.0005828835692418217, + "loss": 1.1686, + "step": 9220 + }, + { + "epoch": 0.8086058515076819, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005826339033127317, + "loss": 1.0974, + "step": 9221 + }, + { + "epoch": 0.8086935432820564, + "grad_norm": 0.04296875, + "learning_rate": 0.0005823843347246743, + "loss": 1.159, + "step": 9222 + }, + { + "epoch": 0.8087812350564311, + "grad_norm": 0.043701171875, + "learning_rate": 0.0005821348635004096, + "loss": 1.1683, + "step": 9223 + }, + { + "epoch": 0.8088689268308056, + "grad_norm": 0.040283203125, + "learning_rate": 0.0005818854896626891, + "loss": 1.1019, + "step": 9224 + }, + { + "epoch": 0.8089566186051801, + "grad_norm": 0.04296875, + "learning_rate": 0.000581636213234254, + "loss": 1.1119, + "step": 9225 + }, + { + "epoch": 0.8090443103795547, + "grad_norm": 0.04052734375, + "learning_rate": 0.000581387034237839, + "loss": 1.1439, + "step": 9226 + }, + { + "epoch": 0.8091320021539292, + "grad_norm": 0.041015625, + "learning_rate": 0.0005811379526961673, + "loss": 1.1232, + "step": 9227 + }, + { + "epoch": 0.8092196939283037, + "grad_norm": 0.050537109375, + "learning_rate": 0.0005808889686319549, + "loss": 1.1207, + "step": 9228 + }, + { + "epoch": 0.8093073857026784, + "grad_norm": 0.049560546875, + "learning_rate": 0.0005806400820679094, + "loss": 1.1605, + "step": 9229 + }, + { + "epoch": 0.8093950774770529, + "grad_norm": 0.043701171875, + "learning_rate": 0.0005803912930267272, + "loss": 1.1491, + "step": 9230 + }, + { + "epoch": 0.8094827692514274, + "grad_norm": 0.042236328125, + "learning_rate": 0.000580142601531098, + "loss": 1.0774, + "step": 9231 + }, + { + "epoch": 0.8095704610258019, + "grad_norm": 0.04345703125, + "learning_rate": 0.0005798940076037018, + "loss": 1.1035, + "step": 9232 + }, + { + "epoch": 0.8096581528001765, + "grad_norm": 0.04345703125, + "learning_rate": 0.00057964551126721, + "loss": 1.0883, + "step": 9233 + }, + { + "epoch": 0.809745844574551, + "grad_norm": 0.044921875, + "learning_rate": 0.000579397112544284, + "loss": 1.1322, + "step": 9234 + }, + { + "epoch": 0.8098335363489255, + "grad_norm": 0.049560546875, + "learning_rate": 0.0005791488114575775, + "loss": 1.1261, + "step": 9235 + }, + { + "epoch": 0.8099212281233001, + "grad_norm": 0.046875, + "learning_rate": 0.0005789006080297353, + "loss": 1.1373, + "step": 9236 + }, + { + "epoch": 0.8100089198976747, + "grad_norm": 0.046875, + "learning_rate": 0.0005786525022833928, + "loss": 1.1358, + "step": 9237 + }, + { + "epoch": 0.8100966116720492, + "grad_norm": 0.046875, + "learning_rate": 0.000578404494241176, + "loss": 1.1212, + "step": 9238 + }, + { + "epoch": 0.8101843034464238, + "grad_norm": 0.055419921875, + "learning_rate": 0.0005781565839257031, + "loss": 1.1261, + "step": 9239 + }, + { + "epoch": 0.8102719952207983, + "grad_norm": 0.0419921875, + "learning_rate": 0.0005779087713595828, + "loss": 1.1094, + "step": 9240 + }, + { + "epoch": 0.8103596869951728, + "grad_norm": 0.04443359375, + "learning_rate": 0.0005776610565654149, + "loss": 1.1065, + "step": 9241 + }, + { + "epoch": 0.8104473787695474, + "grad_norm": 0.04638671875, + "learning_rate": 0.0005774134395657908, + "loss": 1.127, + "step": 9242 + }, + { + "epoch": 0.8105350705439219, + "grad_norm": 0.05029296875, + "learning_rate": 0.0005771659203832921, + "loss": 1.1426, + "step": 9243 + }, + { + "epoch": 0.8106227623182964, + "grad_norm": 0.053466796875, + "learning_rate": 0.0005769184990404917, + "loss": 1.141, + "step": 9244 + }, + { + "epoch": 0.8107104540926711, + "grad_norm": 0.04443359375, + "learning_rate": 0.000576671175559954, + "loss": 1.1165, + "step": 9245 + }, + { + "epoch": 0.8107981458670456, + "grad_norm": 0.047119140625, + "learning_rate": 0.0005764239499642345, + "loss": 1.1643, + "step": 9246 + }, + { + "epoch": 0.8108858376414201, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005761768222758794, + "loss": 1.1296, + "step": 9247 + }, + { + "epoch": 0.8109735294157947, + "grad_norm": 0.046142578125, + "learning_rate": 0.0005759297925174258, + "loss": 1.1311, + "step": 9248 + }, + { + "epoch": 0.8110612211901692, + "grad_norm": 0.043212890625, + "learning_rate": 0.0005756828607114023, + "loss": 1.0759, + "step": 9249 + }, + { + "epoch": 0.8111489129645437, + "grad_norm": 0.043701171875, + "learning_rate": 0.0005754360268803287, + "loss": 1.2053, + "step": 9250 + }, + { + "epoch": 0.8112366047389183, + "grad_norm": 0.0537109375, + "learning_rate": 0.0005751892910467158, + "loss": 1.1632, + "step": 9251 + }, + { + "epoch": 0.8113242965132929, + "grad_norm": 0.04443359375, + "learning_rate": 0.0005749426532330647, + "loss": 1.1217, + "step": 9252 + }, + { + "epoch": 0.8114119882876674, + "grad_norm": 0.047607421875, + "learning_rate": 0.0005746961134618681, + "loss": 1.1207, + "step": 9253 + }, + { + "epoch": 0.811499680062042, + "grad_norm": 0.04443359375, + "learning_rate": 0.0005744496717556105, + "loss": 1.1801, + "step": 9254 + }, + { + "epoch": 0.8115873718364165, + "grad_norm": 0.0556640625, + "learning_rate": 0.0005742033281367664, + "loss": 1.0634, + "step": 9255 + }, + { + "epoch": 0.811675063610791, + "grad_norm": 0.04833984375, + "learning_rate": 0.0005739570826278014, + "loss": 1.103, + "step": 9256 + }, + { + "epoch": 0.8117627553851655, + "grad_norm": 0.0556640625, + "learning_rate": 0.0005737109352511724, + "loss": 1.1117, + "step": 9257 + }, + { + "epoch": 0.8118504471595401, + "grad_norm": 0.04345703125, + "learning_rate": 0.0005734648860293278, + "loss": 1.1345, + "step": 9258 + }, + { + "epoch": 0.8119381389339146, + "grad_norm": 0.04833984375, + "learning_rate": 0.0005732189349847071, + "loss": 1.0801, + "step": 9259 + }, + { + "epoch": 0.8120258307082892, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005729730821397396, + "loss": 1.1172, + "step": 9260 + }, + { + "epoch": 0.8121135224826638, + "grad_norm": 0.047119140625, + "learning_rate": 0.0005727273275168463, + "loss": 1.0871, + "step": 9261 + }, + { + "epoch": 0.8122012142570383, + "grad_norm": 0.044189453125, + "learning_rate": 0.0005724816711384401, + "loss": 1.1154, + "step": 9262 + }, + { + "epoch": 0.8122889060314128, + "grad_norm": 0.045654296875, + "learning_rate": 0.0005722361130269236, + "loss": 1.1348, + "step": 9263 + }, + { + "epoch": 0.8123765978057874, + "grad_norm": 0.044189453125, + "learning_rate": 0.0005719906532046916, + "loss": 1.1395, + "step": 9264 + }, + { + "epoch": 0.8124642895801619, + "grad_norm": 0.044921875, + "learning_rate": 0.000571745291694129, + "loss": 1.1329, + "step": 9265 + }, + { + "epoch": 0.8125519813545364, + "grad_norm": 0.04736328125, + "learning_rate": 0.0005715000285176123, + "loss": 1.1948, + "step": 9266 + }, + { + "epoch": 0.812639673128911, + "grad_norm": 0.0615234375, + "learning_rate": 0.0005712548636975088, + "loss": 1.1354, + "step": 9267 + }, + { + "epoch": 0.8127273649032856, + "grad_norm": 0.0439453125, + "learning_rate": 0.0005710097972561773, + "loss": 1.1652, + "step": 9268 + }, + { + "epoch": 0.8128150566776601, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005707648292159668, + "loss": 1.0781, + "step": 9269 + }, + { + "epoch": 0.8129027484520347, + "grad_norm": 0.0498046875, + "learning_rate": 0.0005705199595992176, + "loss": 1.1954, + "step": 9270 + }, + { + "epoch": 0.8129904402264092, + "grad_norm": 0.05078125, + "learning_rate": 0.0005702751884282614, + "loss": 1.0935, + "step": 9271 + }, + { + "epoch": 0.8130781320007837, + "grad_norm": 0.048583984375, + "learning_rate": 0.0005700305157254207, + "loss": 1.1218, + "step": 9272 + }, + { + "epoch": 0.8131658237751583, + "grad_norm": 0.04541015625, + "learning_rate": 0.0005697859415130095, + "loss": 1.1236, + "step": 9273 + }, + { + "epoch": 0.8132535155495328, + "grad_norm": 0.045654296875, + "learning_rate": 0.0005695414658133315, + "loss": 1.1597, + "step": 9274 + }, + { + "epoch": 0.8133412073239074, + "grad_norm": 0.044189453125, + "learning_rate": 0.0005692970886486825, + "loss": 1.1672, + "step": 9275 + }, + { + "epoch": 0.813428899098282, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005690528100413494, + "loss": 1.0898, + "step": 9276 + }, + { + "epoch": 0.8135165908726565, + "grad_norm": 0.045654296875, + "learning_rate": 0.0005688086300136099, + "loss": 1.0864, + "step": 9277 + }, + { + "epoch": 0.813604282647031, + "grad_norm": 0.043701171875, + "learning_rate": 0.0005685645485877322, + "loss": 1.083, + "step": 9278 + }, + { + "epoch": 0.8136919744214056, + "grad_norm": 0.044189453125, + "learning_rate": 0.0005683205657859756, + "loss": 1.1697, + "step": 9279 + }, + { + "epoch": 0.8137796661957801, + "grad_norm": 0.04248046875, + "learning_rate": 0.000568076681630591, + "loss": 1.1222, + "step": 9280 + }, + { + "epoch": 0.8138673579701546, + "grad_norm": 0.04638671875, + "learning_rate": 0.0005678328961438201, + "loss": 1.1486, + "step": 9281 + }, + { + "epoch": 0.8139550497445291, + "grad_norm": 0.0732421875, + "learning_rate": 0.000567589209347896, + "loss": 1.169, + "step": 9282 + }, + { + "epoch": 0.8140427415189038, + "grad_norm": 0.05322265625, + "learning_rate": 0.0005673456212650412, + "loss": 1.1248, + "step": 9283 + }, + { + "epoch": 0.8141304332932783, + "grad_norm": 0.043212890625, + "learning_rate": 0.0005671021319174709, + "loss": 1.1091, + "step": 9284 + }, + { + "epoch": 0.8142181250676528, + "grad_norm": 0.048095703125, + "learning_rate": 0.0005668587413273908, + "loss": 1.1363, + "step": 9285 + }, + { + "epoch": 0.8143058168420274, + "grad_norm": 0.044921875, + "learning_rate": 0.0005666154495169978, + "loss": 1.1442, + "step": 9286 + }, + { + "epoch": 0.8143935086164019, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005663722565084784, + "loss": 1.1048, + "step": 9287 + }, + { + "epoch": 0.8144812003907764, + "grad_norm": 0.0419921875, + "learning_rate": 0.000566129162324012, + "loss": 1.101, + "step": 9288 + }, + { + "epoch": 0.814568892165151, + "grad_norm": 0.0419921875, + "learning_rate": 0.0005658861669857682, + "loss": 1.157, + "step": 9289 + }, + { + "epoch": 0.8146565839395256, + "grad_norm": 0.046630859375, + "learning_rate": 0.0005656432705159075, + "loss": 1.1482, + "step": 9290 + }, + { + "epoch": 0.8147442757139001, + "grad_norm": 0.052978515625, + "learning_rate": 0.0005654004729365809, + "loss": 1.1618, + "step": 9291 + }, + { + "epoch": 0.8148319674882747, + "grad_norm": 0.0439453125, + "learning_rate": 0.0005651577742699313, + "loss": 1.1374, + "step": 9292 + }, + { + "epoch": 0.8149196592626492, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005649151745380921, + "loss": 1.085, + "step": 9293 + }, + { + "epoch": 0.8150073510370237, + "grad_norm": 0.05078125, + "learning_rate": 0.0005646726737631885, + "loss": 1.1414, + "step": 9294 + }, + { + "epoch": 0.8150950428113983, + "grad_norm": 0.045654296875, + "learning_rate": 0.000564430271967335, + "loss": 1.1432, + "step": 9295 + }, + { + "epoch": 0.8151827345857728, + "grad_norm": 0.052978515625, + "learning_rate": 0.0005641879691726379, + "loss": 1.1774, + "step": 9296 + }, + { + "epoch": 0.8152704263601473, + "grad_norm": 0.052001953125, + "learning_rate": 0.0005639457654011954, + "loss": 1.1015, + "step": 9297 + }, + { + "epoch": 0.815358118134522, + "grad_norm": 0.043212890625, + "learning_rate": 0.0005637036606750953, + "loss": 1.1158, + "step": 9298 + }, + { + "epoch": 0.8154458099088965, + "grad_norm": 0.052978515625, + "learning_rate": 0.0005634616550164169, + "loss": 1.1498, + "step": 9299 + }, + { + "epoch": 0.815533501683271, + "grad_norm": 0.046142578125, + "learning_rate": 0.0005632197484472312, + "loss": 1.1196, + "step": 9300 + }, + { + "epoch": 0.8156211934576456, + "grad_norm": 0.04931640625, + "learning_rate": 0.0005629779409895983, + "loss": 1.1293, + "step": 9301 + }, + { + "epoch": 0.8157088852320201, + "grad_norm": 0.044921875, + "learning_rate": 0.0005627362326655715, + "loss": 1.1439, + "step": 9302 + }, + { + "epoch": 0.8157965770063946, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005624946234971932, + "loss": 1.1481, + "step": 9303 + }, + { + "epoch": 0.8158842687807691, + "grad_norm": 0.04443359375, + "learning_rate": 0.0005622531135064983, + "loss": 1.1299, + "step": 9304 + }, + { + "epoch": 0.8159719605551438, + "grad_norm": 0.04345703125, + "learning_rate": 0.0005620117027155112, + "loss": 1.0746, + "step": 9305 + }, + { + "epoch": 0.8160596523295183, + "grad_norm": 0.05224609375, + "learning_rate": 0.0005617703911462482, + "loss": 1.1264, + "step": 9306 + }, + { + "epoch": 0.8161473441038928, + "grad_norm": 0.043701171875, + "learning_rate": 0.0005615291788207161, + "loss": 1.1398, + "step": 9307 + }, + { + "epoch": 0.8162350358782674, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005612880657609133, + "loss": 1.1568, + "step": 9308 + }, + { + "epoch": 0.8163227276526419, + "grad_norm": 0.056884765625, + "learning_rate": 0.0005610470519888281, + "loss": 1.1489, + "step": 9309 + }, + { + "epoch": 0.8164104194270164, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005608061375264408, + "loss": 1.123, + "step": 9310 + }, + { + "epoch": 0.816498111201391, + "grad_norm": 0.04736328125, + "learning_rate": 0.0005605653223957223, + "loss": 1.203, + "step": 9311 + }, + { + "epoch": 0.8165858029757656, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005603246066186336, + "loss": 1.2096, + "step": 9312 + }, + { + "epoch": 0.8166734947501401, + "grad_norm": 0.044921875, + "learning_rate": 0.0005600839902171284, + "loss": 1.1389, + "step": 9313 + }, + { + "epoch": 0.8167611865245147, + "grad_norm": 0.0498046875, + "learning_rate": 0.000559843473213149, + "loss": 1.1223, + "step": 9314 + }, + { + "epoch": 0.8168488782988892, + "grad_norm": 0.046875, + "learning_rate": 0.0005596030556286309, + "loss": 1.1524, + "step": 9315 + }, + { + "epoch": 0.8169365700732637, + "grad_norm": 0.056640625, + "learning_rate": 0.0005593627374854991, + "loss": 1.1613, + "step": 9316 + }, + { + "epoch": 0.8170242618476383, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005591225188056705, + "loss": 1.1139, + "step": 9317 + }, + { + "epoch": 0.8171119536220128, + "grad_norm": 0.046875, + "learning_rate": 0.000558882399611052, + "loss": 1.1525, + "step": 9318 + }, + { + "epoch": 0.8171996453963873, + "grad_norm": 0.047607421875, + "learning_rate": 0.0005586423799235417, + "loss": 1.2005, + "step": 9319 + }, + { + "epoch": 0.817287337170762, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005584024597650292, + "loss": 1.1648, + "step": 9320 + }, + { + "epoch": 0.8173750289451365, + "grad_norm": 0.05224609375, + "learning_rate": 0.0005581626391573947, + "loss": 1.1723, + "step": 9321 + }, + { + "epoch": 0.817462720719511, + "grad_norm": 0.056396484375, + "learning_rate": 0.0005579229181225084, + "loss": 1.1581, + "step": 9322 + }, + { + "epoch": 0.8175504124938856, + "grad_norm": 0.04638671875, + "learning_rate": 0.0005576832966822331, + "loss": 1.0623, + "step": 9323 + }, + { + "epoch": 0.8176381042682601, + "grad_norm": 0.045654296875, + "learning_rate": 0.0005574437748584212, + "loss": 1.1253, + "step": 9324 + }, + { + "epoch": 0.8177257960426346, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005572043526729165, + "loss": 1.1632, + "step": 9325 + }, + { + "epoch": 0.8178134878170092, + "grad_norm": 0.0439453125, + "learning_rate": 0.0005569650301475544, + "loss": 1.108, + "step": 9326 + }, + { + "epoch": 0.8179011795913838, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005567258073041594, + "loss": 1.1252, + "step": 9327 + }, + { + "epoch": 0.8179888713657583, + "grad_norm": 0.04296875, + "learning_rate": 0.0005564866841645489, + "loss": 1.1345, + "step": 9328 + }, + { + "epoch": 0.8180765631401328, + "grad_norm": 0.05029296875, + "learning_rate": 0.0005562476607505294, + "loss": 1.1327, + "step": 9329 + }, + { + "epoch": 0.8181642549145074, + "grad_norm": 0.048095703125, + "learning_rate": 0.0005560087370839, + "loss": 1.1051, + "step": 9330 + }, + { + "epoch": 0.8182519466888819, + "grad_norm": 0.04638671875, + "learning_rate": 0.0005557699131864498, + "loss": 1.0976, + "step": 9331 + }, + { + "epoch": 0.8183396384632564, + "grad_norm": 0.05029296875, + "learning_rate": 0.0005555311890799587, + "loss": 1.1253, + "step": 9332 + }, + { + "epoch": 0.818427330237631, + "grad_norm": 0.05029296875, + "learning_rate": 0.0005552925647861973, + "loss": 1.1574, + "step": 9333 + }, + { + "epoch": 0.8185150220120055, + "grad_norm": 0.041748046875, + "learning_rate": 0.0005550540403269286, + "loss": 1.0956, + "step": 9334 + }, + { + "epoch": 0.81860271378638, + "grad_norm": 0.0498046875, + "learning_rate": 0.0005548156157239046, + "loss": 1.1553, + "step": 9335 + }, + { + "epoch": 0.8186904055607547, + "grad_norm": 0.06005859375, + "learning_rate": 0.0005545772909988693, + "loss": 1.1972, + "step": 9336 + }, + { + "epoch": 0.8187780973351292, + "grad_norm": 0.046142578125, + "learning_rate": 0.0005543390661735571, + "loss": 1.1036, + "step": 9337 + }, + { + "epoch": 0.8188657891095037, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005541009412696937, + "loss": 1.1856, + "step": 9338 + }, + { + "epoch": 0.8189534808838783, + "grad_norm": 0.048828125, + "learning_rate": 0.0005538629163089956, + "loss": 1.1341, + "step": 9339 + }, + { + "epoch": 0.8190411726582528, + "grad_norm": 0.04345703125, + "learning_rate": 0.0005536249913131697, + "loss": 1.1511, + "step": 9340 + }, + { + "epoch": 0.8191288644326273, + "grad_norm": 0.047607421875, + "learning_rate": 0.0005533871663039143, + "loss": 1.1279, + "step": 9341 + }, + { + "epoch": 0.819216556207002, + "grad_norm": 0.045654296875, + "learning_rate": 0.0005531494413029184, + "loss": 1.1228, + "step": 9342 + }, + { + "epoch": 0.8193042479813765, + "grad_norm": 0.05615234375, + "learning_rate": 0.0005529118163318624, + "loss": 1.1713, + "step": 9343 + }, + { + "epoch": 0.819391939755751, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005526742914124162, + "loss": 1.1058, + "step": 9344 + }, + { + "epoch": 0.8194796315301256, + "grad_norm": 0.043212890625, + "learning_rate": 0.0005524368665662421, + "loss": 1.1127, + "step": 9345 + }, + { + "epoch": 0.8195673233045001, + "grad_norm": 0.044189453125, + "learning_rate": 0.0005521995418149922, + "loss": 1.1265, + "step": 9346 + }, + { + "epoch": 0.8196550150788746, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005519623171803103, + "loss": 1.0708, + "step": 9347 + }, + { + "epoch": 0.8197427068532492, + "grad_norm": 0.044921875, + "learning_rate": 0.0005517251926838306, + "loss": 1.1559, + "step": 9348 + }, + { + "epoch": 0.8198303986276237, + "grad_norm": 0.048583984375, + "learning_rate": 0.0005514881683471777, + "loss": 1.1404, + "step": 9349 + }, + { + "epoch": 0.8199180904019983, + "grad_norm": 0.041259765625, + "learning_rate": 0.0005512512441919682, + "loss": 1.0509, + "step": 9350 + }, + { + "epoch": 0.8200057821763728, + "grad_norm": 0.04150390625, + "learning_rate": 0.0005510144202398088, + "loss": 1.0908, + "step": 9351 + }, + { + "epoch": 0.8200934739507474, + "grad_norm": 0.04296875, + "learning_rate": 0.0005507776965122974, + "loss": 1.1447, + "step": 9352 + }, + { + "epoch": 0.8201811657251219, + "grad_norm": 0.04296875, + "learning_rate": 0.0005505410730310227, + "loss": 1.1395, + "step": 9353 + }, + { + "epoch": 0.8202688574994964, + "grad_norm": 0.04541015625, + "learning_rate": 0.0005503045498175636, + "loss": 1.0863, + "step": 9354 + }, + { + "epoch": 0.820356549273871, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005500681268934906, + "loss": 1.1353, + "step": 9355 + }, + { + "epoch": 0.8204442410482455, + "grad_norm": 0.046630859375, + "learning_rate": 0.000549831804280365, + "loss": 1.1011, + "step": 9356 + }, + { + "epoch": 0.82053193282262, + "grad_norm": 0.052001953125, + "learning_rate": 0.0005495955819997393, + "loss": 1.0913, + "step": 9357 + }, + { + "epoch": 0.8206196245969947, + "grad_norm": 0.047119140625, + "learning_rate": 0.0005493594600731552, + "loss": 1.1645, + "step": 9358 + }, + { + "epoch": 0.8207073163713692, + "grad_norm": 0.04296875, + "learning_rate": 0.0005491234385221473, + "loss": 1.1237, + "step": 9359 + }, + { + "epoch": 0.8207950081457437, + "grad_norm": 0.0439453125, + "learning_rate": 0.0005488875173682398, + "loss": 1.1263, + "step": 9360 + }, + { + "epoch": 0.8208826999201183, + "grad_norm": 0.0478515625, + "learning_rate": 0.0005486516966329485, + "loss": 1.1365, + "step": 9361 + }, + { + "epoch": 0.8209703916944928, + "grad_norm": 0.040771484375, + "learning_rate": 0.0005484159763377794, + "loss": 1.0689, + "step": 9362 + }, + { + "epoch": 0.8210580834688673, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005481803565042292, + "loss": 1.1867, + "step": 9363 + }, + { + "epoch": 0.821145775243242, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005479448371537863, + "loss": 1.1531, + "step": 9364 + }, + { + "epoch": 0.8212334670176165, + "grad_norm": 0.046142578125, + "learning_rate": 0.0005477094183079292, + "loss": 1.0918, + "step": 9365 + }, + { + "epoch": 0.821321158791991, + "grad_norm": 0.046630859375, + "learning_rate": 0.0005474740999881279, + "loss": 1.1724, + "step": 9366 + }, + { + "epoch": 0.8214088505663656, + "grad_norm": 0.04345703125, + "learning_rate": 0.0005472388822158422, + "loss": 1.1108, + "step": 9367 + }, + { + "epoch": 0.8214965423407401, + "grad_norm": 0.051513671875, + "learning_rate": 0.0005470037650125235, + "loss": 1.1216, + "step": 9368 + }, + { + "epoch": 0.8215842341151146, + "grad_norm": 0.04296875, + "learning_rate": 0.0005467687483996143, + "loss": 1.1639, + "step": 9369 + }, + { + "epoch": 0.8216719258894892, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005465338323985476, + "loss": 1.1134, + "step": 9370 + }, + { + "epoch": 0.8217596176638637, + "grad_norm": 0.049560546875, + "learning_rate": 0.0005462990170307465, + "loss": 1.0993, + "step": 9371 + }, + { + "epoch": 0.8218473094382382, + "grad_norm": 0.050048828125, + "learning_rate": 0.0005460643023176258, + "loss": 1.1033, + "step": 9372 + }, + { + "epoch": 0.8219350012126129, + "grad_norm": 0.04443359375, + "learning_rate": 0.0005458296882805908, + "loss": 1.1338, + "step": 9373 + }, + { + "epoch": 0.8220226929869874, + "grad_norm": 0.0439453125, + "learning_rate": 0.0005455951749410382, + "loss": 1.0877, + "step": 9374 + }, + { + "epoch": 0.8221103847613619, + "grad_norm": 0.043212890625, + "learning_rate": 0.0005453607623203545, + "loss": 1.1239, + "step": 9375 + }, + { + "epoch": 0.8221980765357364, + "grad_norm": 0.050048828125, + "learning_rate": 0.0005451264504399174, + "loss": 1.1959, + "step": 9376 + }, + { + "epoch": 0.822285768310111, + "grad_norm": 0.04296875, + "learning_rate": 0.000544892239321096, + "loss": 1.1579, + "step": 9377 + }, + { + "epoch": 0.8223734600844855, + "grad_norm": 0.0458984375, + "learning_rate": 0.0005446581289852494, + "loss": 1.1079, + "step": 9378 + }, + { + "epoch": 0.82246115185886, + "grad_norm": 0.0576171875, + "learning_rate": 0.0005444241194537287, + "loss": 1.2099, + "step": 9379 + }, + { + "epoch": 0.8225488436332347, + "grad_norm": 0.04248046875, + "learning_rate": 0.0005441902107478737, + "loss": 1.0921, + "step": 9380 + }, + { + "epoch": 0.8226365354076092, + "grad_norm": 0.0576171875, + "learning_rate": 0.0005439564028890169, + "loss": 1.0934, + "step": 9381 + }, + { + "epoch": 0.8227242271819837, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005437226958984808, + "loss": 1.1486, + "step": 9382 + }, + { + "epoch": 0.8228119189563583, + "grad_norm": 0.048583984375, + "learning_rate": 0.0005434890897975791, + "loss": 1.1206, + "step": 9383 + }, + { + "epoch": 0.8228996107307328, + "grad_norm": 0.047607421875, + "learning_rate": 0.0005432555846076162, + "loss": 1.1473, + "step": 9384 + }, + { + "epoch": 0.8229873025051073, + "grad_norm": 0.04638671875, + "learning_rate": 0.0005430221803498867, + "loss": 1.0677, + "step": 9385 + }, + { + "epoch": 0.8230749942794819, + "grad_norm": 0.0439453125, + "learning_rate": 0.0005427888770456766, + "loss": 1.1731, + "step": 9386 + }, + { + "epoch": 0.8231626860538565, + "grad_norm": 0.04345703125, + "learning_rate": 0.0005425556747162627, + "loss": 1.1009, + "step": 9387 + }, + { + "epoch": 0.823250377828231, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005423225733829128, + "loss": 1.1717, + "step": 9388 + }, + { + "epoch": 0.8233380696026056, + "grad_norm": 0.0458984375, + "learning_rate": 0.0005420895730668844, + "loss": 1.1596, + "step": 9389 + }, + { + "epoch": 0.8234257613769801, + "grad_norm": 0.0439453125, + "learning_rate": 0.0005418566737894268, + "loss": 1.1548, + "step": 9390 + }, + { + "epoch": 0.8235134531513546, + "grad_norm": 0.046875, + "learning_rate": 0.00054162387557178, + "loss": 1.1612, + "step": 9391 + }, + { + "epoch": 0.8236011449257292, + "grad_norm": 0.044189453125, + "learning_rate": 0.0005413911784351747, + "loss": 1.1693, + "step": 9392 + }, + { + "epoch": 0.8236888367001037, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005411585824008318, + "loss": 1.1328, + "step": 9393 + }, + { + "epoch": 0.8237765284744782, + "grad_norm": 0.043701171875, + "learning_rate": 0.0005409260874899638, + "loss": 1.1164, + "step": 9394 + }, + { + "epoch": 0.8238642202488529, + "grad_norm": 0.054443359375, + "learning_rate": 0.0005406936937237735, + "loss": 1.1757, + "step": 9395 + }, + { + "epoch": 0.8239519120232274, + "grad_norm": 0.046142578125, + "learning_rate": 0.000540461401123455, + "loss": 1.1175, + "step": 9396 + }, + { + "epoch": 0.8240396037976019, + "grad_norm": 0.06494140625, + "learning_rate": 0.0005402292097101924, + "loss": 1.1401, + "step": 9397 + }, + { + "epoch": 0.8241272955719765, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005399971195051607, + "loss": 1.1603, + "step": 9398 + }, + { + "epoch": 0.824214987346351, + "grad_norm": 0.04345703125, + "learning_rate": 0.0005397651305295263, + "loss": 1.1254, + "step": 9399 + }, + { + "epoch": 0.8243026791207255, + "grad_norm": 0.044921875, + "learning_rate": 0.0005395332428044458, + "loss": 1.1166, + "step": 9400 + }, + { + "epoch": 0.8243903708951, + "grad_norm": 0.05029296875, + "learning_rate": 0.0005393014563510675, + "loss": 1.0996, + "step": 9401 + }, + { + "epoch": 0.8244780626694747, + "grad_norm": 0.044189453125, + "learning_rate": 0.0005390697711905287, + "loss": 1.1909, + "step": 9402 + }, + { + "epoch": 0.8245657544438492, + "grad_norm": 0.043212890625, + "learning_rate": 0.0005388381873439587, + "loss": 1.0715, + "step": 9403 + }, + { + "epoch": 0.8246534462182237, + "grad_norm": 0.04052734375, + "learning_rate": 0.0005386067048324778, + "loss": 1.0782, + "step": 9404 + }, + { + "epoch": 0.8247411379925983, + "grad_norm": 0.045654296875, + "learning_rate": 0.0005383753236771964, + "loss": 1.1912, + "step": 9405 + }, + { + "epoch": 0.8248288297669728, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005381440438992161, + "loss": 1.0922, + "step": 9406 + }, + { + "epoch": 0.8249165215413473, + "grad_norm": 0.04736328125, + "learning_rate": 0.0005379128655196284, + "loss": 1.1881, + "step": 9407 + }, + { + "epoch": 0.8250042133157219, + "grad_norm": 0.04638671875, + "learning_rate": 0.0005376817885595164, + "loss": 1.1163, + "step": 9408 + }, + { + "epoch": 0.8250919050900964, + "grad_norm": 0.0439453125, + "learning_rate": 0.000537450813039954, + "loss": 1.164, + "step": 9409 + }, + { + "epoch": 0.825179596864471, + "grad_norm": 0.048095703125, + "learning_rate": 0.0005372199389820058, + "loss": 1.1099, + "step": 9410 + }, + { + "epoch": 0.8252672886388456, + "grad_norm": 0.05810546875, + "learning_rate": 0.000536989166406726, + "loss": 1.0769, + "step": 9411 + }, + { + "epoch": 0.8253549804132201, + "grad_norm": 0.061767578125, + "learning_rate": 0.0005367584953351612, + "loss": 1.2007, + "step": 9412 + }, + { + "epoch": 0.8254426721875946, + "grad_norm": 0.0458984375, + "learning_rate": 0.000536527925788348, + "loss": 1.1401, + "step": 9413 + }, + { + "epoch": 0.8255303639619692, + "grad_norm": 0.044189453125, + "learning_rate": 0.0005362974577873134, + "loss": 1.181, + "step": 9414 + }, + { + "epoch": 0.8256180557363437, + "grad_norm": 0.049072265625, + "learning_rate": 0.0005360670913530759, + "loss": 1.1372, + "step": 9415 + }, + { + "epoch": 0.8257057475107182, + "grad_norm": 0.047607421875, + "learning_rate": 0.0005358368265066437, + "loss": 1.1081, + "step": 9416 + }, + { + "epoch": 0.8257934392850929, + "grad_norm": 0.0546875, + "learning_rate": 0.0005356066632690165, + "loss": 1.1504, + "step": 9417 + }, + { + "epoch": 0.8258811310594674, + "grad_norm": 0.0439453125, + "learning_rate": 0.0005353766016611852, + "loss": 1.162, + "step": 9418 + }, + { + "epoch": 0.8259688228338419, + "grad_norm": 0.043701171875, + "learning_rate": 0.0005351466417041305, + "loss": 1.1629, + "step": 9419 + }, + { + "epoch": 0.8260565146082165, + "grad_norm": 0.044921875, + "learning_rate": 0.0005349167834188238, + "loss": 1.1, + "step": 9420 + }, + { + "epoch": 0.826144206382591, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005346870268262281, + "loss": 1.1229, + "step": 9421 + }, + { + "epoch": 0.8262318981569655, + "grad_norm": 0.0537109375, + "learning_rate": 0.0005344573719472962, + "loss": 1.1627, + "step": 9422 + }, + { + "epoch": 0.82631958993134, + "grad_norm": 0.0615234375, + "learning_rate": 0.0005342278188029727, + "loss": 1.1363, + "step": 9423 + }, + { + "epoch": 0.8264072817057146, + "grad_norm": 0.046142578125, + "learning_rate": 0.0005339983674141912, + "loss": 1.1401, + "step": 9424 + }, + { + "epoch": 0.8264949734800892, + "grad_norm": 0.048583984375, + "learning_rate": 0.0005337690178018778, + "loss": 1.1354, + "step": 9425 + }, + { + "epoch": 0.8265826652544637, + "grad_norm": 0.053466796875, + "learning_rate": 0.0005335397699869485, + "loss": 1.1857, + "step": 9426 + }, + { + "epoch": 0.8266703570288383, + "grad_norm": 0.04541015625, + "learning_rate": 0.0005333106239903104, + "loss": 1.1318, + "step": 9427 + }, + { + "epoch": 0.8267580488032128, + "grad_norm": 0.043212890625, + "learning_rate": 0.0005330815798328601, + "loss": 1.1494, + "step": 9428 + }, + { + "epoch": 0.8268457405775873, + "grad_norm": 0.060302734375, + "learning_rate": 0.0005328526375354866, + "loss": 1.1129, + "step": 9429 + }, + { + "epoch": 0.8269334323519619, + "grad_norm": 0.057373046875, + "learning_rate": 0.000532623797119069, + "loss": 1.1157, + "step": 9430 + }, + { + "epoch": 0.8270211241263364, + "grad_norm": 0.04248046875, + "learning_rate": 0.0005323950586044763, + "loss": 1.1061, + "step": 9431 + }, + { + "epoch": 0.827108815900711, + "grad_norm": 0.05078125, + "learning_rate": 0.000532166422012569, + "loss": 1.111, + "step": 9432 + }, + { + "epoch": 0.8271965076750856, + "grad_norm": 0.04150390625, + "learning_rate": 0.0005319378873641986, + "loss": 1.0943, + "step": 9433 + }, + { + "epoch": 0.8272841994494601, + "grad_norm": 0.04638671875, + "learning_rate": 0.0005317094546802066, + "loss": 1.0969, + "step": 9434 + }, + { + "epoch": 0.8273718912238346, + "grad_norm": 0.0439453125, + "learning_rate": 0.0005314811239814253, + "loss": 1.1013, + "step": 9435 + }, + { + "epoch": 0.8274595829982092, + "grad_norm": 0.04443359375, + "learning_rate": 0.0005312528952886781, + "loss": 1.1361, + "step": 9436 + }, + { + "epoch": 0.8275472747725837, + "grad_norm": 0.047119140625, + "learning_rate": 0.0005310247686227791, + "loss": 1.1385, + "step": 9437 + }, + { + "epoch": 0.8276349665469582, + "grad_norm": 0.04541015625, + "learning_rate": 0.0005307967440045324, + "loss": 1.1161, + "step": 9438 + }, + { + "epoch": 0.8277226583213328, + "grad_norm": 0.0419921875, + "learning_rate": 0.0005305688214547333, + "loss": 1.1054, + "step": 9439 + }, + { + "epoch": 0.8278103500957074, + "grad_norm": 0.0419921875, + "learning_rate": 0.0005303410009941679, + "loss": 1.1476, + "step": 9440 + }, + { + "epoch": 0.8278980418700819, + "grad_norm": 0.04150390625, + "learning_rate": 0.0005301132826436131, + "loss": 1.096, + "step": 9441 + }, + { + "epoch": 0.8279857336444565, + "grad_norm": 0.0634765625, + "learning_rate": 0.0005298856664238358, + "loss": 1.1903, + "step": 9442 + }, + { + "epoch": 0.828073425418831, + "grad_norm": 0.048583984375, + "learning_rate": 0.0005296581523555939, + "loss": 1.1288, + "step": 9443 + }, + { + "epoch": 0.8281611171932055, + "grad_norm": 0.04443359375, + "learning_rate": 0.0005294307404596366, + "loss": 1.1086, + "step": 9444 + }, + { + "epoch": 0.8282488089675801, + "grad_norm": 0.059326171875, + "learning_rate": 0.0005292034307567032, + "loss": 1.091, + "step": 9445 + }, + { + "epoch": 0.8283365007419546, + "grad_norm": 0.04443359375, + "learning_rate": 0.0005289762232675233, + "loss": 1.1515, + "step": 9446 + }, + { + "epoch": 0.8284241925163291, + "grad_norm": 0.053466796875, + "learning_rate": 0.0005287491180128183, + "loss": 1.2079, + "step": 9447 + }, + { + "epoch": 0.8285118842907037, + "grad_norm": 0.064453125, + "learning_rate": 0.0005285221150132989, + "loss": 1.1706, + "step": 9448 + }, + { + "epoch": 0.8285995760650783, + "grad_norm": 0.052001953125, + "learning_rate": 0.0005282952142896676, + "loss": 1.1141, + "step": 9449 + }, + { + "epoch": 0.8286872678394528, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005280684158626174, + "loss": 1.0987, + "step": 9450 + }, + { + "epoch": 0.8287749596138273, + "grad_norm": 0.045654296875, + "learning_rate": 0.000527841719752831, + "loss": 1.1808, + "step": 9451 + }, + { + "epoch": 0.8288626513882019, + "grad_norm": 0.048828125, + "learning_rate": 0.000527615125980983, + "loss": 1.1396, + "step": 9452 + }, + { + "epoch": 0.8289503431625764, + "grad_norm": 0.043701171875, + "learning_rate": 0.000527388634567738, + "loss": 1.1436, + "step": 9453 + }, + { + "epoch": 0.8290380349369509, + "grad_norm": 0.06640625, + "learning_rate": 0.0005271622455337519, + "loss": 1.1633, + "step": 9454 + }, + { + "epoch": 0.8291257267113256, + "grad_norm": 0.06689453125, + "learning_rate": 0.00052693595889967, + "loss": 1.1088, + "step": 9455 + }, + { + "epoch": 0.8292134184857001, + "grad_norm": 0.046630859375, + "learning_rate": 0.0005267097746861296, + "loss": 1.1379, + "step": 9456 + }, + { + "epoch": 0.8293011102600746, + "grad_norm": 0.05712890625, + "learning_rate": 0.0005264836929137582, + "loss": 1.1337, + "step": 9457 + }, + { + "epoch": 0.8293888020344492, + "grad_norm": 0.050537109375, + "learning_rate": 0.0005262577136031732, + "loss": 1.146, + "step": 9458 + }, + { + "epoch": 0.8294764938088237, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005260318367749843, + "loss": 1.1384, + "step": 9459 + }, + { + "epoch": 0.8295641855831982, + "grad_norm": 0.044189453125, + "learning_rate": 0.0005258060624497902, + "loss": 1.1017, + "step": 9460 + }, + { + "epoch": 0.8296518773575728, + "grad_norm": 0.0537109375, + "learning_rate": 0.0005255803906481811, + "loss": 1.1231, + "step": 9461 + }, + { + "epoch": 0.8297395691319474, + "grad_norm": 0.04638671875, + "learning_rate": 0.0005253548213907376, + "loss": 1.1503, + "step": 9462 + }, + { + "epoch": 0.8298272609063219, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005251293546980314, + "loss": 1.1401, + "step": 9463 + }, + { + "epoch": 0.8299149526806965, + "grad_norm": 0.04345703125, + "learning_rate": 0.0005249039905906242, + "loss": 1.1423, + "step": 9464 + }, + { + "epoch": 0.830002644455071, + "grad_norm": 0.041015625, + "learning_rate": 0.0005246787290890685, + "loss": 1.1809, + "step": 9465 + }, + { + "epoch": 0.8300903362294455, + "grad_norm": 0.060791015625, + "learning_rate": 0.0005244535702139076, + "loss": 1.147, + "step": 9466 + }, + { + "epoch": 0.8301780280038201, + "grad_norm": 0.04345703125, + "learning_rate": 0.0005242285139856755, + "loss": 1.1633, + "step": 9467 + }, + { + "epoch": 0.8302657197781946, + "grad_norm": 0.043701171875, + "learning_rate": 0.0005240035604248971, + "loss": 1.0903, + "step": 9468 + }, + { + "epoch": 0.8303534115525691, + "grad_norm": 0.05029296875, + "learning_rate": 0.0005237787095520868, + "loss": 1.1272, + "step": 9469 + }, + { + "epoch": 0.8304411033269438, + "grad_norm": 0.04638671875, + "learning_rate": 0.0005235539613877512, + "loss": 1.0585, + "step": 9470 + }, + { + "epoch": 0.8305287951013183, + "grad_norm": 0.045654296875, + "learning_rate": 0.000523329315952386, + "loss": 1.0976, + "step": 9471 + }, + { + "epoch": 0.8306164868756928, + "grad_norm": 0.04248046875, + "learning_rate": 0.0005231047732664793, + "loss": 1.1317, + "step": 9472 + }, + { + "epoch": 0.8307041786500673, + "grad_norm": 0.050537109375, + "learning_rate": 0.0005228803333505078, + "loss": 1.1194, + "step": 9473 + }, + { + "epoch": 0.8307918704244419, + "grad_norm": 0.045654296875, + "learning_rate": 0.0005226559962249403, + "loss": 1.2008, + "step": 9474 + }, + { + "epoch": 0.8308795621988164, + "grad_norm": 0.0439453125, + "learning_rate": 0.0005224317619102357, + "loss": 1.1224, + "step": 9475 + }, + { + "epoch": 0.8309672539731909, + "grad_norm": 0.05126953125, + "learning_rate": 0.0005222076304268439, + "loss": 1.1111, + "step": 9476 + }, + { + "epoch": 0.8310549457475656, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005219836017952044, + "loss": 1.1378, + "step": 9477 + }, + { + "epoch": 0.8311426375219401, + "grad_norm": 0.043701171875, + "learning_rate": 0.0005217596760357485, + "loss": 1.1888, + "step": 9478 + }, + { + "epoch": 0.8312303292963146, + "grad_norm": 0.04638671875, + "learning_rate": 0.0005215358531688978, + "loss": 1.1202, + "step": 9479 + }, + { + "epoch": 0.8313180210706892, + "grad_norm": 0.052001953125, + "learning_rate": 0.0005213121332150644, + "loss": 1.1081, + "step": 9480 + }, + { + "epoch": 0.8314057128450637, + "grad_norm": 0.05029296875, + "learning_rate": 0.0005210885161946508, + "loss": 1.106, + "step": 9481 + }, + { + "epoch": 0.8314934046194382, + "grad_norm": 0.040771484375, + "learning_rate": 0.0005208650021280498, + "loss": 1.0938, + "step": 9482 + }, + { + "epoch": 0.8315810963938128, + "grad_norm": 0.046142578125, + "learning_rate": 0.0005206415910356457, + "loss": 1.122, + "step": 9483 + }, + { + "epoch": 0.8316687881681873, + "grad_norm": 0.054931640625, + "learning_rate": 0.0005204182829378132, + "loss": 1.1575, + "step": 9484 + }, + { + "epoch": 0.8317564799425619, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005201950778549177, + "loss": 1.0674, + "step": 9485 + }, + { + "epoch": 0.8318441717169365, + "grad_norm": 0.07177734375, + "learning_rate": 0.0005199719758073141, + "loss": 1.1363, + "step": 9486 + }, + { + "epoch": 0.831931863491311, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005197489768153492, + "loss": 1.112, + "step": 9487 + }, + { + "epoch": 0.8320195552656855, + "grad_norm": 0.044921875, + "learning_rate": 0.0005195260808993599, + "loss": 1.1528, + "step": 9488 + }, + { + "epoch": 0.8321072470400601, + "grad_norm": 0.047119140625, + "learning_rate": 0.0005193032880796737, + "loss": 1.177, + "step": 9489 + }, + { + "epoch": 0.8321949388144346, + "grad_norm": 0.04296875, + "learning_rate": 0.0005190805983766091, + "loss": 1.1562, + "step": 9490 + }, + { + "epoch": 0.8322826305888091, + "grad_norm": 0.040771484375, + "learning_rate": 0.0005188580118104739, + "loss": 1.1115, + "step": 9491 + }, + { + "epoch": 0.8323703223631838, + "grad_norm": 0.05419921875, + "learning_rate": 0.0005186355284015682, + "loss": 1.1875, + "step": 9492 + }, + { + "epoch": 0.8324580141375583, + "grad_norm": 0.043212890625, + "learning_rate": 0.0005184131481701819, + "loss": 1.0686, + "step": 9493 + }, + { + "epoch": 0.8325457059119328, + "grad_norm": 0.041748046875, + "learning_rate": 0.0005181908711365953, + "loss": 1.1052, + "step": 9494 + }, + { + "epoch": 0.8326333976863073, + "grad_norm": 0.041015625, + "learning_rate": 0.0005179686973210793, + "loss": 1.1258, + "step": 9495 + }, + { + "epoch": 0.8327210894606819, + "grad_norm": 0.04345703125, + "learning_rate": 0.0005177466267438958, + "loss": 1.1104, + "step": 9496 + }, + { + "epoch": 0.8328087812350564, + "grad_norm": 0.044189453125, + "learning_rate": 0.0005175246594252974, + "loss": 1.1036, + "step": 9497 + }, + { + "epoch": 0.8328964730094309, + "grad_norm": 0.045654296875, + "learning_rate": 0.0005173027953855262, + "loss": 1.1065, + "step": 9498 + }, + { + "epoch": 0.8329841647838055, + "grad_norm": 0.052490234375, + "learning_rate": 0.0005170810346448164, + "loss": 1.1399, + "step": 9499 + }, + { + "epoch": 0.8330718565581801, + "grad_norm": 0.04052734375, + "learning_rate": 0.0005168593772233912, + "loss": 1.0726, + "step": 9500 + }, + { + "epoch": 0.8330718565581801, + "eval_loss": 1.138482689857483, + "eval_runtime": 428.9247, + "eval_samples_per_second": 33.682, + "eval_steps_per_second": 8.421, + "step": 9500 + }, + { + "epoch": 0.8331595483325546, + "grad_norm": 0.04248046875, + "learning_rate": 0.0005166378231414657, + "loss": 1.1238, + "step": 9501 + }, + { + "epoch": 0.8332472401069292, + "grad_norm": 0.043212890625, + "learning_rate": 0.000516416372419245, + "loss": 1.237, + "step": 9502 + }, + { + "epoch": 0.8333349318813037, + "grad_norm": 0.0517578125, + "learning_rate": 0.000516195025076925, + "loss": 1.1987, + "step": 9503 + }, + { + "epoch": 0.8334226236556782, + "grad_norm": 0.044189453125, + "learning_rate": 0.0005159737811346916, + "loss": 1.1143, + "step": 9504 + }, + { + "epoch": 0.8335103154300528, + "grad_norm": 0.04443359375, + "learning_rate": 0.0005157526406127218, + "loss": 1.1351, + "step": 9505 + }, + { + "epoch": 0.8335980072044273, + "grad_norm": 0.045654296875, + "learning_rate": 0.0005155316035311834, + "loss": 1.075, + "step": 9506 + }, + { + "epoch": 0.8336856989788018, + "grad_norm": 0.050537109375, + "learning_rate": 0.0005153106699102341, + "loss": 1.1192, + "step": 9507 + }, + { + "epoch": 0.8337733907531765, + "grad_norm": 0.043701171875, + "learning_rate": 0.0005150898397700223, + "loss": 1.0909, + "step": 9508 + }, + { + "epoch": 0.833861082527551, + "grad_norm": 0.046142578125, + "learning_rate": 0.0005148691131306874, + "loss": 1.137, + "step": 9509 + }, + { + "epoch": 0.8339487743019255, + "grad_norm": 0.041259765625, + "learning_rate": 0.0005146484900123591, + "loss": 1.1156, + "step": 9510 + }, + { + "epoch": 0.8340364660763001, + "grad_norm": 0.04541015625, + "learning_rate": 0.0005144279704351576, + "loss": 1.169, + "step": 9511 + }, + { + "epoch": 0.8341241578506746, + "grad_norm": 0.04638671875, + "learning_rate": 0.0005142075544191941, + "loss": 1.1143, + "step": 9512 + }, + { + "epoch": 0.8342118496250491, + "grad_norm": 0.0546875, + "learning_rate": 0.0005139872419845691, + "loss": 1.1044, + "step": 9513 + }, + { + "epoch": 0.8342995413994237, + "grad_norm": 0.043212890625, + "learning_rate": 0.0005137670331513755, + "loss": 1.1542, + "step": 9514 + }, + { + "epoch": 0.8343872331737983, + "grad_norm": 0.047607421875, + "learning_rate": 0.000513546927939695, + "loss": 1.1188, + "step": 9515 + }, + { + "epoch": 0.8344749249481728, + "grad_norm": 0.048095703125, + "learning_rate": 0.000513326926369601, + "loss": 1.1767, + "step": 9516 + }, + { + "epoch": 0.8345626167225474, + "grad_norm": 0.042724609375, + "learning_rate": 0.000513107028461157, + "loss": 1.1572, + "step": 9517 + }, + { + "epoch": 0.8346503084969219, + "grad_norm": 0.046630859375, + "learning_rate": 0.000512887234234417, + "loss": 1.1116, + "step": 9518 + }, + { + "epoch": 0.8347380002712964, + "grad_norm": 0.04541015625, + "learning_rate": 0.000512667543709426, + "loss": 1.146, + "step": 9519 + }, + { + "epoch": 0.8348256920456709, + "grad_norm": 0.045654296875, + "learning_rate": 0.0005124479569062187, + "loss": 1.1614, + "step": 9520 + }, + { + "epoch": 0.8349133838200455, + "grad_norm": 0.047607421875, + "learning_rate": 0.0005122284738448217, + "loss": 1.1036, + "step": 9521 + }, + { + "epoch": 0.83500107559442, + "grad_norm": 0.048828125, + "learning_rate": 0.0005120090945452501, + "loss": 1.1164, + "step": 9522 + }, + { + "epoch": 0.8350887673687946, + "grad_norm": 0.052978515625, + "learning_rate": 0.0005117898190275117, + "loss": 1.162, + "step": 9523 + }, + { + "epoch": 0.8351764591431692, + "grad_norm": 0.0478515625, + "learning_rate": 0.0005115706473116034, + "loss": 1.1027, + "step": 9524 + }, + { + "epoch": 0.8352641509175437, + "grad_norm": 0.059326171875, + "learning_rate": 0.0005113515794175137, + "loss": 1.0984, + "step": 9525 + }, + { + "epoch": 0.8353518426919182, + "grad_norm": 0.043212890625, + "learning_rate": 0.0005111326153652199, + "loss": 1.1134, + "step": 9526 + }, + { + "epoch": 0.8354395344662928, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005109137551746918, + "loss": 1.101, + "step": 9527 + }, + { + "epoch": 0.8355272262406673, + "grad_norm": 0.041015625, + "learning_rate": 0.0005106949988658888, + "loss": 1.1233, + "step": 9528 + }, + { + "epoch": 0.8356149180150418, + "grad_norm": 0.044189453125, + "learning_rate": 0.0005104763464587609, + "loss": 1.1347, + "step": 9529 + }, + { + "epoch": 0.8357026097894165, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005102577979732485, + "loss": 1.0752, + "step": 9530 + }, + { + "epoch": 0.835790301563791, + "grad_norm": 0.05029296875, + "learning_rate": 0.0005100393534292828, + "loss": 1.1701, + "step": 9531 + }, + { + "epoch": 0.8358779933381655, + "grad_norm": 0.053955078125, + "learning_rate": 0.0005098210128467851, + "loss": 1.156, + "step": 9532 + }, + { + "epoch": 0.8359656851125401, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005096027762456675, + "loss": 1.209, + "step": 9533 + }, + { + "epoch": 0.8360533768869146, + "grad_norm": 0.06298828125, + "learning_rate": 0.0005093846436458331, + "loss": 1.1422, + "step": 9534 + }, + { + "epoch": 0.8361410686612891, + "grad_norm": 0.04443359375, + "learning_rate": 0.0005091666150671745, + "loss": 1.1057, + "step": 9535 + }, + { + "epoch": 0.8362287604356637, + "grad_norm": 0.046630859375, + "learning_rate": 0.0005089486905295756, + "loss": 1.1676, + "step": 9536 + }, + { + "epoch": 0.8363164522100383, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005087308700529103, + "loss": 1.0809, + "step": 9537 + }, + { + "epoch": 0.8364041439844128, + "grad_norm": 0.04638671875, + "learning_rate": 0.000508513153657044, + "loss": 1.0877, + "step": 9538 + }, + { + "epoch": 0.8364918357587874, + "grad_norm": 0.04248046875, + "learning_rate": 0.000508295541361831, + "loss": 1.1148, + "step": 9539 + }, + { + "epoch": 0.8365795275331619, + "grad_norm": 0.04150390625, + "learning_rate": 0.0005080780331871175, + "loss": 1.1498, + "step": 9540 + }, + { + "epoch": 0.8366672193075364, + "grad_norm": 0.048095703125, + "learning_rate": 0.0005078606291527392, + "loss": 1.1405, + "step": 9541 + }, + { + "epoch": 0.836754911081911, + "grad_norm": 0.0439453125, + "learning_rate": 0.0005076433292785232, + "loss": 1.1299, + "step": 9542 + }, + { + "epoch": 0.8368426028562855, + "grad_norm": 0.046875, + "learning_rate": 0.0005074261335842869, + "loss": 1.1186, + "step": 9543 + }, + { + "epoch": 0.83693029463066, + "grad_norm": 0.046875, + "learning_rate": 0.0005072090420898375, + "loss": 1.1578, + "step": 9544 + }, + { + "epoch": 0.8370179864050346, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005069920548149734, + "loss": 1.1367, + "step": 9545 + }, + { + "epoch": 0.8371056781794092, + "grad_norm": 0.048828125, + "learning_rate": 0.000506775171779483, + "loss": 1.1157, + "step": 9546 + }, + { + "epoch": 0.8371933699537837, + "grad_norm": 0.044921875, + "learning_rate": 0.0005065583930031461, + "loss": 1.0922, + "step": 9547 + }, + { + "epoch": 0.8372810617281582, + "grad_norm": 0.04345703125, + "learning_rate": 0.000506341718505732, + "loss": 1.0878, + "step": 9548 + }, + { + "epoch": 0.8373687535025328, + "grad_norm": 0.04296875, + "learning_rate": 0.0005061251483070005, + "loss": 1.1328, + "step": 9549 + }, + { + "epoch": 0.8374564452769073, + "grad_norm": 0.046142578125, + "learning_rate": 0.0005059086824267027, + "loss": 1.0989, + "step": 9550 + }, + { + "epoch": 0.8375441370512818, + "grad_norm": 0.046875, + "learning_rate": 0.0005056923208845795, + "loss": 1.0961, + "step": 9551 + }, + { + "epoch": 0.8376318288256565, + "grad_norm": 0.047607421875, + "learning_rate": 0.000505476063700363, + "loss": 1.1205, + "step": 9552 + }, + { + "epoch": 0.837719520600031, + "grad_norm": 0.041748046875, + "learning_rate": 0.0005052599108937746, + "loss": 1.1181, + "step": 9553 + }, + { + "epoch": 0.8378072123744055, + "grad_norm": 0.044189453125, + "learning_rate": 0.0005050438624845271, + "loss": 1.1301, + "step": 9554 + }, + { + "epoch": 0.8378949041487801, + "grad_norm": 0.042236328125, + "learning_rate": 0.0005048279184923238, + "loss": 1.0896, + "step": 9555 + }, + { + "epoch": 0.8379825959231546, + "grad_norm": 0.044189453125, + "learning_rate": 0.0005046120789368583, + "loss": 1.1376, + "step": 9556 + }, + { + "epoch": 0.8380702876975291, + "grad_norm": 0.048095703125, + "learning_rate": 0.0005043963438378141, + "loss": 1.1014, + "step": 9557 + }, + { + "epoch": 0.8381579794719037, + "grad_norm": 0.042236328125, + "learning_rate": 0.000504180713214866, + "loss": 1.1675, + "step": 9558 + }, + { + "epoch": 0.8382456712462782, + "grad_norm": 0.043212890625, + "learning_rate": 0.0005039651870876788, + "loss": 1.1099, + "step": 9559 + }, + { + "epoch": 0.8383333630206528, + "grad_norm": 0.045654296875, + "learning_rate": 0.0005037497654759085, + "loss": 1.1203, + "step": 9560 + }, + { + "epoch": 0.8384210547950274, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005035344483992002, + "loss": 1.1286, + "step": 9561 + }, + { + "epoch": 0.8385087465694019, + "grad_norm": 0.04248046875, + "learning_rate": 0.0005033192358771907, + "loss": 1.1347, + "step": 9562 + }, + { + "epoch": 0.8385964383437764, + "grad_norm": 0.040283203125, + "learning_rate": 0.0005031041279295066, + "loss": 1.0812, + "step": 9563 + }, + { + "epoch": 0.838684130118151, + "grad_norm": 0.0439453125, + "learning_rate": 0.0005028891245757655, + "loss": 1.1524, + "step": 9564 + }, + { + "epoch": 0.8387718218925255, + "grad_norm": 0.05126953125, + "learning_rate": 0.0005026742258355755, + "loss": 1.1591, + "step": 9565 + }, + { + "epoch": 0.8388595136669, + "grad_norm": 0.054443359375, + "learning_rate": 0.0005024594317285337, + "loss": 1.1186, + "step": 9566 + }, + { + "epoch": 0.8389472054412745, + "grad_norm": 0.041259765625, + "learning_rate": 0.0005022447422742295, + "loss": 1.0775, + "step": 9567 + }, + { + "epoch": 0.8390348972156492, + "grad_norm": 0.048828125, + "learning_rate": 0.0005020301574922418, + "loss": 1.1206, + "step": 9568 + }, + { + "epoch": 0.8391225889900237, + "grad_norm": 0.04345703125, + "learning_rate": 0.0005018156774021402, + "loss": 1.1334, + "step": 9569 + }, + { + "epoch": 0.8392102807643982, + "grad_norm": 0.050048828125, + "learning_rate": 0.0005016013020234851, + "loss": 1.0894, + "step": 9570 + }, + { + "epoch": 0.8392979725387728, + "grad_norm": 0.059326171875, + "learning_rate": 0.0005013870313758265, + "loss": 1.12, + "step": 9571 + }, + { + "epoch": 0.8393856643131473, + "grad_norm": 0.0556640625, + "learning_rate": 0.0005011728654787053, + "loss": 1.1436, + "step": 9572 + }, + { + "epoch": 0.8394733560875218, + "grad_norm": 0.046142578125, + "learning_rate": 0.0005009588043516533, + "loss": 1.1481, + "step": 9573 + }, + { + "epoch": 0.8395610478618964, + "grad_norm": 0.04443359375, + "learning_rate": 0.0005007448480141923, + "loss": 1.0871, + "step": 9574 + }, + { + "epoch": 0.839648739636271, + "grad_norm": 0.057861328125, + "learning_rate": 0.0005005309964858342, + "loss": 1.095, + "step": 9575 + }, + { + "epoch": 0.8397364314106455, + "grad_norm": 0.0498046875, + "learning_rate": 0.0005003172497860817, + "loss": 1.1169, + "step": 9576 + }, + { + "epoch": 0.8398241231850201, + "grad_norm": 0.0498046875, + "learning_rate": 0.0005001036079344283, + "loss": 1.1642, + "step": 9577 + }, + { + "epoch": 0.8399118149593946, + "grad_norm": 0.04443359375, + "learning_rate": 0.0004998900709503576, + "loss": 1.0762, + "step": 9578 + }, + { + "epoch": 0.8399995067337691, + "grad_norm": 0.0419921875, + "learning_rate": 0.0004996766388533432, + "loss": 1.1409, + "step": 9579 + }, + { + "epoch": 0.8400871985081437, + "grad_norm": 0.054443359375, + "learning_rate": 0.0004994633116628496, + "loss": 1.1026, + "step": 9580 + }, + { + "epoch": 0.8401748902825182, + "grad_norm": 0.05322265625, + "learning_rate": 0.0004992500893983321, + "loss": 1.0856, + "step": 9581 + }, + { + "epoch": 0.8402625820568927, + "grad_norm": 0.044677734375, + "learning_rate": 0.000499036972079236, + "loss": 1.0976, + "step": 9582 + }, + { + "epoch": 0.8403502738312674, + "grad_norm": 0.044677734375, + "learning_rate": 0.0004988239597249969, + "loss": 1.1184, + "step": 9583 + }, + { + "epoch": 0.8404379656056419, + "grad_norm": 0.042236328125, + "learning_rate": 0.0004986110523550407, + "loss": 1.12, + "step": 9584 + }, + { + "epoch": 0.8405256573800164, + "grad_norm": 0.047119140625, + "learning_rate": 0.0004983982499887841, + "loss": 1.15, + "step": 9585 + }, + { + "epoch": 0.840613349154391, + "grad_norm": 0.046630859375, + "learning_rate": 0.0004981855526456343, + "loss": 1.1946, + "step": 9586 + }, + { + "epoch": 0.8407010409287655, + "grad_norm": 0.051025390625, + "learning_rate": 0.000497972960344989, + "loss": 1.1289, + "step": 9587 + }, + { + "epoch": 0.84078873270314, + "grad_norm": 0.07373046875, + "learning_rate": 0.0004977604731062356, + "loss": 1.1008, + "step": 9588 + }, + { + "epoch": 0.8408764244775147, + "grad_norm": 0.06396484375, + "learning_rate": 0.0004975480909487522, + "loss": 1.1392, + "step": 9589 + }, + { + "epoch": 0.8409641162518892, + "grad_norm": 0.049560546875, + "learning_rate": 0.0004973358138919083, + "loss": 1.086, + "step": 9590 + }, + { + "epoch": 0.8410518080262637, + "grad_norm": 0.046142578125, + "learning_rate": 0.0004971236419550626, + "loss": 1.1561, + "step": 9591 + }, + { + "epoch": 0.8411394998006382, + "grad_norm": 0.05712890625, + "learning_rate": 0.0004969115751575643, + "loss": 1.176, + "step": 9592 + }, + { + "epoch": 0.8412271915750128, + "grad_norm": 0.0439453125, + "learning_rate": 0.0004966996135187539, + "loss": 1.1336, + "step": 9593 + }, + { + "epoch": 0.8413148833493873, + "grad_norm": 0.04833984375, + "learning_rate": 0.0004964877570579615, + "loss": 1.1391, + "step": 9594 + }, + { + "epoch": 0.8414025751237618, + "grad_norm": 0.043212890625, + "learning_rate": 0.0004962760057945076, + "loss": 1.0273, + "step": 9595 + }, + { + "epoch": 0.8414902668981364, + "grad_norm": 0.048828125, + "learning_rate": 0.0004960643597477042, + "loss": 1.1679, + "step": 9596 + }, + { + "epoch": 0.841577958672511, + "grad_norm": 0.050048828125, + "learning_rate": 0.000495852818936852, + "loss": 1.1532, + "step": 9597 + }, + { + "epoch": 0.8416656504468855, + "grad_norm": 0.050048828125, + "learning_rate": 0.0004956413833812432, + "loss": 1.1456, + "step": 9598 + }, + { + "epoch": 0.8417533422212601, + "grad_norm": 0.0419921875, + "learning_rate": 0.0004954300531001606, + "loss": 1.0768, + "step": 9599 + }, + { + "epoch": 0.8418410339956346, + "grad_norm": 0.040771484375, + "learning_rate": 0.0004952188281128764, + "loss": 1.1052, + "step": 9600 + }, + { + "epoch": 0.8419287257700091, + "grad_norm": 0.04296875, + "learning_rate": 0.0004950077084386542, + "loss": 1.0954, + "step": 9601 + }, + { + "epoch": 0.8420164175443837, + "grad_norm": 0.0419921875, + "learning_rate": 0.0004947966940967471, + "loss": 1.0906, + "step": 9602 + }, + { + "epoch": 0.8421041093187582, + "grad_norm": 0.042724609375, + "learning_rate": 0.0004945857851063997, + "loss": 1.0763, + "step": 9603 + }, + { + "epoch": 0.8421918010931327, + "grad_norm": 0.043212890625, + "learning_rate": 0.0004943749814868457, + "loss": 1.1233, + "step": 9604 + }, + { + "epoch": 0.8422794928675074, + "grad_norm": 0.048095703125, + "learning_rate": 0.0004941642832573105, + "loss": 1.1657, + "step": 9605 + }, + { + "epoch": 0.8423671846418819, + "grad_norm": 0.044189453125, + "learning_rate": 0.0004939536904370086, + "loss": 1.1243, + "step": 9606 + }, + { + "epoch": 0.8424548764162564, + "grad_norm": 0.0419921875, + "learning_rate": 0.0004937432030451456, + "loss": 1.0712, + "step": 9607 + }, + { + "epoch": 0.842542568190631, + "grad_norm": 0.044189453125, + "learning_rate": 0.000493532821100918, + "loss": 1.1154, + "step": 9608 + }, + { + "epoch": 0.8426302599650055, + "grad_norm": 0.044189453125, + "learning_rate": 0.0004933225446235118, + "loss": 1.1331, + "step": 9609 + }, + { + "epoch": 0.84271795173938, + "grad_norm": 0.04443359375, + "learning_rate": 0.0004931123736321033, + "loss": 1.0916, + "step": 9610 + }, + { + "epoch": 0.8428056435137546, + "grad_norm": 0.04296875, + "learning_rate": 0.0004929023081458599, + "loss": 1.1041, + "step": 9611 + }, + { + "epoch": 0.8428933352881292, + "grad_norm": 0.045654296875, + "learning_rate": 0.0004926923481839388, + "loss": 1.1601, + "step": 9612 + }, + { + "epoch": 0.8429810270625037, + "grad_norm": 0.049072265625, + "learning_rate": 0.0004924824937654885, + "loss": 1.106, + "step": 9613 + }, + { + "epoch": 0.8430687188368782, + "grad_norm": 0.044921875, + "learning_rate": 0.0004922727449096462, + "loss": 1.1532, + "step": 9614 + }, + { + "epoch": 0.8431564106112528, + "grad_norm": 0.04443359375, + "learning_rate": 0.000492063101635541, + "loss": 1.1599, + "step": 9615 + }, + { + "epoch": 0.8432441023856273, + "grad_norm": 0.041748046875, + "learning_rate": 0.0004918535639622921, + "loss": 1.043, + "step": 9616 + }, + { + "epoch": 0.8433317941600018, + "grad_norm": 0.041259765625, + "learning_rate": 0.0004916441319090079, + "loss": 1.1345, + "step": 9617 + }, + { + "epoch": 0.8434194859343764, + "grad_norm": 0.040771484375, + "learning_rate": 0.0004914348054947889, + "loss": 1.149, + "step": 9618 + }, + { + "epoch": 0.8435071777087509, + "grad_norm": 0.043212890625, + "learning_rate": 0.0004912255847387248, + "loss": 1.1142, + "step": 9619 + }, + { + "epoch": 0.8435948694831255, + "grad_norm": 0.043701171875, + "learning_rate": 0.0004910164696598958, + "loss": 1.1558, + "step": 9620 + }, + { + "epoch": 0.8436825612575001, + "grad_norm": 0.042236328125, + "learning_rate": 0.0004908074602773728, + "loss": 1.0948, + "step": 9621 + }, + { + "epoch": 0.8437702530318746, + "grad_norm": 0.040771484375, + "learning_rate": 0.0004905985566102171, + "loss": 1.1124, + "step": 9622 + }, + { + "epoch": 0.8438579448062491, + "grad_norm": 0.062255859375, + "learning_rate": 0.0004903897586774804, + "loss": 1.1841, + "step": 9623 + }, + { + "epoch": 0.8439456365806237, + "grad_norm": 0.041748046875, + "learning_rate": 0.0004901810664982038, + "loss": 1.1179, + "step": 9624 + }, + { + "epoch": 0.8440333283549982, + "grad_norm": 0.041748046875, + "learning_rate": 0.0004899724800914199, + "loss": 1.0956, + "step": 9625 + }, + { + "epoch": 0.8441210201293727, + "grad_norm": 0.041259765625, + "learning_rate": 0.0004897639994761513, + "loss": 1.1108, + "step": 9626 + }, + { + "epoch": 0.8442087119037474, + "grad_norm": 0.039794921875, + "learning_rate": 0.000489555624671411, + "loss": 1.1081, + "step": 9627 + }, + { + "epoch": 0.8442964036781219, + "grad_norm": 0.04541015625, + "learning_rate": 0.0004893473556962018, + "loss": 1.118, + "step": 9628 + }, + { + "epoch": 0.8443840954524964, + "grad_norm": 0.06591796875, + "learning_rate": 0.0004891391925695176, + "loss": 1.1352, + "step": 9629 + }, + { + "epoch": 0.844471787226871, + "grad_norm": 0.048583984375, + "learning_rate": 0.0004889311353103422, + "loss": 1.1591, + "step": 9630 + }, + { + "epoch": 0.8445594790012455, + "grad_norm": 0.043701171875, + "learning_rate": 0.0004887231839376503, + "loss": 1.1755, + "step": 9631 + }, + { + "epoch": 0.84464717077562, + "grad_norm": 0.049072265625, + "learning_rate": 0.0004885153384704059, + "loss": 1.1208, + "step": 9632 + }, + { + "epoch": 0.8447348625499946, + "grad_norm": 0.041015625, + "learning_rate": 0.0004883075989275646, + "loss": 1.0899, + "step": 9633 + }, + { + "epoch": 0.8448225543243691, + "grad_norm": 0.04638671875, + "learning_rate": 0.0004880999653280712, + "loss": 1.1314, + "step": 9634 + }, + { + "epoch": 0.8449102460987437, + "grad_norm": 0.0751953125, + "learning_rate": 0.00048789243769086133, + "loss": 1.1664, + "step": 9635 + }, + { + "epoch": 0.8449979378731183, + "grad_norm": 0.04345703125, + "learning_rate": 0.0004876850160348616, + "loss": 1.1229, + "step": 9636 + }, + { + "epoch": 0.8450856296474928, + "grad_norm": 0.04931640625, + "learning_rate": 0.00048747770037898744, + "loss": 1.1354, + "step": 9637 + }, + { + "epoch": 0.8451733214218673, + "grad_norm": 0.072265625, + "learning_rate": 0.0004872704907421462, + "loss": 1.1488, + "step": 9638 + }, + { + "epoch": 0.8452610131962418, + "grad_norm": 0.04345703125, + "learning_rate": 0.00048706338714323444, + "loss": 1.0833, + "step": 9639 + }, + { + "epoch": 0.8453487049706164, + "grad_norm": 0.045166015625, + "learning_rate": 0.0004868563896011401, + "loss": 1.1524, + "step": 9640 + }, + { + "epoch": 0.8454363967449909, + "grad_norm": 0.0478515625, + "learning_rate": 0.00048664949813474, + "loss": 1.0742, + "step": 9641 + }, + { + "epoch": 0.8455240885193654, + "grad_norm": 0.049560546875, + "learning_rate": 0.00048644271276290253, + "loss": 1.1666, + "step": 9642 + }, + { + "epoch": 0.8456117802937401, + "grad_norm": 0.044189453125, + "learning_rate": 0.00048623603350448605, + "loss": 1.1208, + "step": 9643 + }, + { + "epoch": 0.8456994720681146, + "grad_norm": 0.04736328125, + "learning_rate": 0.00048602946037833926, + "loss": 1.1022, + "step": 9644 + }, + { + "epoch": 0.8457871638424891, + "grad_norm": 0.06787109375, + "learning_rate": 0.00048582299340330066, + "loss": 1.1217, + "step": 9645 + }, + { + "epoch": 0.8458748556168637, + "grad_norm": 0.046142578125, + "learning_rate": 0.0004856166325981997, + "loss": 1.1677, + "step": 9646 + }, + { + "epoch": 0.8459625473912382, + "grad_norm": 0.04248046875, + "learning_rate": 0.00048541037798185597, + "loss": 1.1098, + "step": 9647 + }, + { + "epoch": 0.8460502391656127, + "grad_norm": 0.041015625, + "learning_rate": 0.00048520422957307944, + "loss": 1.0784, + "step": 9648 + }, + { + "epoch": 0.8461379309399873, + "grad_norm": 0.05517578125, + "learning_rate": 0.0004849981873906707, + "loss": 1.1096, + "step": 9649 + }, + { + "epoch": 0.8462256227143619, + "grad_norm": 0.05126953125, + "learning_rate": 0.0004847922514534197, + "loss": 1.1904, + "step": 9650 + }, + { + "epoch": 0.8463133144887364, + "grad_norm": 0.048828125, + "learning_rate": 0.00048458642178010705, + "loss": 1.102, + "step": 9651 + }, + { + "epoch": 0.846401006263111, + "grad_norm": 0.0439453125, + "learning_rate": 0.0004843806983895044, + "loss": 1.1791, + "step": 9652 + }, + { + "epoch": 0.8464886980374855, + "grad_norm": 0.0439453125, + "learning_rate": 0.0004841750813003729, + "loss": 1.1488, + "step": 9653 + }, + { + "epoch": 0.84657638981186, + "grad_norm": 0.04296875, + "learning_rate": 0.00048396957053146474, + "loss": 1.1663, + "step": 9654 + }, + { + "epoch": 0.8466640815862346, + "grad_norm": 0.04638671875, + "learning_rate": 0.00048376416610152134, + "loss": 1.0483, + "step": 9655 + }, + { + "epoch": 0.8467517733606091, + "grad_norm": 0.0625, + "learning_rate": 0.00048355886802927545, + "loss": 1.1396, + "step": 9656 + }, + { + "epoch": 0.8468394651349836, + "grad_norm": 0.05126953125, + "learning_rate": 0.00048335367633344965, + "loss": 1.0971, + "step": 9657 + }, + { + "epoch": 0.8469271569093583, + "grad_norm": 0.046875, + "learning_rate": 0.00048314859103275705, + "loss": 1.1176, + "step": 9658 + }, + { + "epoch": 0.8470148486837328, + "grad_norm": 0.0458984375, + "learning_rate": 0.0004829436121459006, + "loss": 1.1512, + "step": 9659 + }, + { + "epoch": 0.8471025404581073, + "grad_norm": 0.046142578125, + "learning_rate": 0.0004827387396915738, + "loss": 1.1656, + "step": 9660 + }, + { + "epoch": 0.8471902322324819, + "grad_norm": 0.05029296875, + "learning_rate": 0.0004825339736884608, + "loss": 1.1271, + "step": 9661 + }, + { + "epoch": 0.8472779240068564, + "grad_norm": 0.04052734375, + "learning_rate": 0.0004823293141552358, + "loss": 1.0985, + "step": 9662 + }, + { + "epoch": 0.8473656157812309, + "grad_norm": 0.04296875, + "learning_rate": 0.0004821247611105628, + "loss": 1.1543, + "step": 9663 + }, + { + "epoch": 0.8474533075556054, + "grad_norm": 0.044189453125, + "learning_rate": 0.0004819203145730968, + "loss": 1.0924, + "step": 9664 + }, + { + "epoch": 0.8475409993299801, + "grad_norm": 0.0478515625, + "learning_rate": 0.0004817159745614826, + "loss": 1.1021, + "step": 9665 + }, + { + "epoch": 0.8476286911043546, + "grad_norm": 0.0419921875, + "learning_rate": 0.000481511741094356, + "loss": 1.0923, + "step": 9666 + }, + { + "epoch": 0.8477163828787291, + "grad_norm": 0.04443359375, + "learning_rate": 0.000481307614190342, + "loss": 1.1214, + "step": 9667 + }, + { + "epoch": 0.8478040746531037, + "grad_norm": 0.060546875, + "learning_rate": 0.0004811035938680565, + "loss": 1.1401, + "step": 9668 + }, + { + "epoch": 0.8478917664274782, + "grad_norm": 0.043701171875, + "learning_rate": 0.0004808996801461058, + "loss": 1.111, + "step": 9669 + }, + { + "epoch": 0.8479794582018527, + "grad_norm": 0.041748046875, + "learning_rate": 0.00048069587304308607, + "loss": 1.1538, + "step": 9670 + }, + { + "epoch": 0.8480671499762273, + "grad_norm": 0.04443359375, + "learning_rate": 0.00048049217257758466, + "loss": 1.1297, + "step": 9671 + }, + { + "epoch": 0.8481548417506019, + "grad_norm": 0.041748046875, + "learning_rate": 0.0004802885787681776, + "loss": 1.1236, + "step": 9672 + }, + { + "epoch": 0.8482425335249764, + "grad_norm": 0.043701171875, + "learning_rate": 0.0004800850916334326, + "loss": 1.1562, + "step": 9673 + }, + { + "epoch": 0.848330225299351, + "grad_norm": 0.0546875, + "learning_rate": 0.00047988171119190716, + "loss": 1.146, + "step": 9674 + }, + { + "epoch": 0.8484179170737255, + "grad_norm": 0.0439453125, + "learning_rate": 0.00047967843746214906, + "loss": 1.1751, + "step": 9675 + }, + { + "epoch": 0.8485056088481, + "grad_norm": 0.04443359375, + "learning_rate": 0.0004794752704626964, + "loss": 1.1753, + "step": 9676 + }, + { + "epoch": 0.8485933006224746, + "grad_norm": 0.042236328125, + "learning_rate": 0.00047927221021207717, + "loss": 1.0936, + "step": 9677 + }, + { + "epoch": 0.8486809923968491, + "grad_norm": 0.047607421875, + "learning_rate": 0.00047906925672881024, + "loss": 1.2118, + "step": 9678 + }, + { + "epoch": 0.8487686841712236, + "grad_norm": 0.048095703125, + "learning_rate": 0.0004788664100314043, + "loss": 1.1683, + "step": 9679 + }, + { + "epoch": 0.8488563759455983, + "grad_norm": 0.0615234375, + "learning_rate": 0.0004786636701383587, + "loss": 1.1196, + "step": 9680 + }, + { + "epoch": 0.8489440677199728, + "grad_norm": 0.042724609375, + "learning_rate": 0.0004784610370681625, + "loss": 1.1122, + "step": 9681 + }, + { + "epoch": 0.8490317594943473, + "grad_norm": 0.04638671875, + "learning_rate": 0.0004782585108392953, + "loss": 1.1635, + "step": 9682 + }, + { + "epoch": 0.8491194512687219, + "grad_norm": 0.046142578125, + "learning_rate": 0.000478056091470227, + "loss": 1.0639, + "step": 9683 + }, + { + "epoch": 0.8492071430430964, + "grad_norm": 0.04248046875, + "learning_rate": 0.0004778537789794182, + "loss": 1.0846, + "step": 9684 + }, + { + "epoch": 0.8492948348174709, + "grad_norm": 0.045654296875, + "learning_rate": 0.0004776515733853189, + "loss": 1.1454, + "step": 9685 + }, + { + "epoch": 0.8493825265918454, + "grad_norm": 0.05126953125, + "learning_rate": 0.0004774494747063694, + "loss": 1.1479, + "step": 9686 + }, + { + "epoch": 0.84947021836622, + "grad_norm": 0.046630859375, + "learning_rate": 0.00047724748296100076, + "loss": 1.1241, + "step": 9687 + }, + { + "epoch": 0.8495579101405946, + "grad_norm": 0.0380859375, + "learning_rate": 0.00047704559816763435, + "loss": 1.0953, + "step": 9688 + }, + { + "epoch": 0.8496456019149691, + "grad_norm": 0.042236328125, + "learning_rate": 0.00047684382034468163, + "loss": 1.0851, + "step": 9689 + }, + { + "epoch": 0.8497332936893437, + "grad_norm": 0.044189453125, + "learning_rate": 0.0004766421495105437, + "loss": 1.1509, + "step": 9690 + }, + { + "epoch": 0.8498209854637182, + "grad_norm": 0.0478515625, + "learning_rate": 0.0004764405856836128, + "loss": 1.1505, + "step": 9691 + }, + { + "epoch": 0.8499086772380927, + "grad_norm": 0.0439453125, + "learning_rate": 0.0004762391288822709, + "loss": 1.1223, + "step": 9692 + }, + { + "epoch": 0.8499963690124673, + "grad_norm": 0.048583984375, + "learning_rate": 0.0004760377791248908, + "loss": 1.1802, + "step": 9693 + }, + { + "epoch": 0.8500840607868418, + "grad_norm": 0.0458984375, + "learning_rate": 0.0004758365364298344, + "loss": 1.1412, + "step": 9694 + }, + { + "epoch": 0.8501717525612164, + "grad_norm": 0.049560546875, + "learning_rate": 0.00047563540081545463, + "loss": 1.1682, + "step": 9695 + }, + { + "epoch": 0.850259444335591, + "grad_norm": 0.04345703125, + "learning_rate": 0.0004754343723000947, + "loss": 1.1476, + "step": 9696 + }, + { + "epoch": 0.8503471361099655, + "grad_norm": 0.0576171875, + "learning_rate": 0.0004752334509020883, + "loss": 1.0911, + "step": 9697 + }, + { + "epoch": 0.85043482788434, + "grad_norm": 0.048828125, + "learning_rate": 0.0004750326366397583, + "loss": 1.1304, + "step": 9698 + }, + { + "epoch": 0.8505225196587146, + "grad_norm": 0.0478515625, + "learning_rate": 0.0004748319295314185, + "loss": 1.0771, + "step": 9699 + }, + { + "epoch": 0.8506102114330891, + "grad_norm": 0.044189453125, + "learning_rate": 0.0004746313295953731, + "loss": 1.1032, + "step": 9700 + }, + { + "epoch": 0.8506979032074636, + "grad_norm": 0.04248046875, + "learning_rate": 0.0004744308368499166, + "loss": 1.085, + "step": 9701 + }, + { + "epoch": 0.8507855949818383, + "grad_norm": 0.04833984375, + "learning_rate": 0.00047423045131333297, + "loss": 1.1626, + "step": 9702 + }, + { + "epoch": 0.8508732867562128, + "grad_norm": 0.044189453125, + "learning_rate": 0.0004740301730038968, + "loss": 1.1094, + "step": 9703 + }, + { + "epoch": 0.8509609785305873, + "grad_norm": 0.0439453125, + "learning_rate": 0.0004738300019398729, + "loss": 1.2059, + "step": 9704 + }, + { + "epoch": 0.8510486703049619, + "grad_norm": 0.044921875, + "learning_rate": 0.0004736299381395167, + "loss": 1.1351, + "step": 9705 + }, + { + "epoch": 0.8511363620793364, + "grad_norm": 0.048828125, + "learning_rate": 0.0004734299816210734, + "loss": 1.0825, + "step": 9706 + }, + { + "epoch": 0.8512240538537109, + "grad_norm": 0.045654296875, + "learning_rate": 0.0004732301324027786, + "loss": 1.1388, + "step": 9707 + }, + { + "epoch": 0.8513117456280855, + "grad_norm": 0.044921875, + "learning_rate": 0.0004730303905028578, + "loss": 1.1918, + "step": 9708 + }, + { + "epoch": 0.85139943740246, + "grad_norm": 0.04296875, + "learning_rate": 0.00047283075593952683, + "loss": 1.1522, + "step": 9709 + }, + { + "epoch": 0.8514871291768346, + "grad_norm": 0.04443359375, + "learning_rate": 0.00047263122873099226, + "loss": 1.1861, + "step": 9710 + }, + { + "epoch": 0.8515748209512091, + "grad_norm": 0.04443359375, + "learning_rate": 0.0004724318088954505, + "loss": 1.0792, + "step": 9711 + }, + { + "epoch": 0.8516625127255837, + "grad_norm": 0.0478515625, + "learning_rate": 0.0004722324964510876, + "loss": 1.0793, + "step": 9712 + }, + { + "epoch": 0.8517502044999582, + "grad_norm": 0.046875, + "learning_rate": 0.0004720332914160805, + "loss": 1.1432, + "step": 9713 + }, + { + "epoch": 0.8518378962743327, + "grad_norm": 0.045654296875, + "learning_rate": 0.00047183419380859653, + "loss": 1.103, + "step": 9714 + }, + { + "epoch": 0.8519255880487073, + "grad_norm": 0.045654296875, + "learning_rate": 0.0004716352036467929, + "loss": 1.2073, + "step": 9715 + }, + { + "epoch": 0.8520132798230818, + "grad_norm": 0.039794921875, + "learning_rate": 0.00047143632094881634, + "loss": 1.1112, + "step": 9716 + }, + { + "epoch": 0.8521009715974563, + "grad_norm": 0.04296875, + "learning_rate": 0.00047123754573280503, + "loss": 1.1106, + "step": 9717 + }, + { + "epoch": 0.852188663371831, + "grad_norm": 0.042236328125, + "learning_rate": 0.00047103887801688686, + "loss": 1.156, + "step": 9718 + }, + { + "epoch": 0.8522763551462055, + "grad_norm": 0.058837890625, + "learning_rate": 0.00047084031781917927, + "loss": 1.099, + "step": 9719 + }, + { + "epoch": 0.85236404692058, + "grad_norm": 0.045654296875, + "learning_rate": 0.0004706418651577909, + "loss": 1.0966, + "step": 9720 + }, + { + "epoch": 0.8524517386949546, + "grad_norm": 0.05517578125, + "learning_rate": 0.0004704435200508196, + "loss": 1.1417, + "step": 9721 + }, + { + "epoch": 0.8525394304693291, + "grad_norm": 0.041259765625, + "learning_rate": 0.0004702452825163544, + "loss": 1.0499, + "step": 9722 + }, + { + "epoch": 0.8526271222437036, + "grad_norm": 0.0458984375, + "learning_rate": 0.0004700471525724738, + "loss": 1.1095, + "step": 9723 + }, + { + "epoch": 0.8527148140180782, + "grad_norm": 0.04443359375, + "learning_rate": 0.0004698491302372473, + "loss": 1.1664, + "step": 9724 + }, + { + "epoch": 0.8528025057924528, + "grad_norm": 0.046875, + "learning_rate": 0.0004696512155287332, + "loss": 1.1363, + "step": 9725 + }, + { + "epoch": 0.8528901975668273, + "grad_norm": 0.042236328125, + "learning_rate": 0.0004694534084649812, + "loss": 1.153, + "step": 9726 + }, + { + "epoch": 0.8529778893412019, + "grad_norm": 0.048583984375, + "learning_rate": 0.00046925570906403065, + "loss": 1.1675, + "step": 9727 + }, + { + "epoch": 0.8530655811155764, + "grad_norm": 0.044677734375, + "learning_rate": 0.0004690581173439115, + "loss": 1.1347, + "step": 9728 + }, + { + "epoch": 0.8531532728899509, + "grad_norm": 0.049560546875, + "learning_rate": 0.0004688606333226437, + "loss": 1.1007, + "step": 9729 + }, + { + "epoch": 0.8532409646643255, + "grad_norm": 0.04345703125, + "learning_rate": 0.00046866325701823685, + "loss": 1.1525, + "step": 9730 + }, + { + "epoch": 0.8533286564387, + "grad_norm": 0.047607421875, + "learning_rate": 0.0004684659884486912, + "loss": 1.1732, + "step": 9731 + }, + { + "epoch": 0.8534163482130745, + "grad_norm": 0.046630859375, + "learning_rate": 0.0004682688276319974, + "loss": 1.1699, + "step": 9732 + }, + { + "epoch": 0.8535040399874492, + "grad_norm": 0.0517578125, + "learning_rate": 0.0004680717745861361, + "loss": 1.1483, + "step": 9733 + }, + { + "epoch": 0.8535917317618237, + "grad_norm": 0.048095703125, + "learning_rate": 0.00046787482932907765, + "loss": 1.0927, + "step": 9734 + }, + { + "epoch": 0.8536794235361982, + "grad_norm": 0.042236328125, + "learning_rate": 0.00046767799187878343, + "loss": 1.1192, + "step": 9735 + }, + { + "epoch": 0.8537671153105727, + "grad_norm": 0.046142578125, + "learning_rate": 0.0004674812622532038, + "loss": 1.1248, + "step": 9736 + }, + { + "epoch": 0.8538548070849473, + "grad_norm": 0.040283203125, + "learning_rate": 0.00046728464047028043, + "loss": 1.1261, + "step": 9737 + }, + { + "epoch": 0.8539424988593218, + "grad_norm": 0.04931640625, + "learning_rate": 0.00046708812654794486, + "loss": 1.1831, + "step": 9738 + }, + { + "epoch": 0.8540301906336963, + "grad_norm": 0.0419921875, + "learning_rate": 0.0004668917205041183, + "loss": 1.082, + "step": 9739 + }, + { + "epoch": 0.854117882408071, + "grad_norm": 0.04248046875, + "learning_rate": 0.0004666954223567128, + "loss": 1.0797, + "step": 9740 + }, + { + "epoch": 0.8542055741824455, + "grad_norm": 0.04345703125, + "learning_rate": 0.00046649923212362994, + "loss": 1.1097, + "step": 9741 + }, + { + "epoch": 0.85429326595682, + "grad_norm": 0.042724609375, + "learning_rate": 0.00046630314982276216, + "loss": 1.0978, + "step": 9742 + }, + { + "epoch": 0.8543809577311946, + "grad_norm": 0.05126953125, + "learning_rate": 0.0004661071754719912, + "loss": 1.1392, + "step": 9743 + }, + { + "epoch": 0.8544686495055691, + "grad_norm": 0.042236328125, + "learning_rate": 0.0004659113090891897, + "loss": 1.0998, + "step": 9744 + }, + { + "epoch": 0.8545563412799436, + "grad_norm": 0.044189453125, + "learning_rate": 0.0004657155506922199, + "loss": 1.1715, + "step": 9745 + }, + { + "epoch": 0.8546440330543182, + "grad_norm": 0.042724609375, + "learning_rate": 0.0004655199002989351, + "loss": 1.1198, + "step": 9746 + }, + { + "epoch": 0.8547317248286928, + "grad_norm": 0.04248046875, + "learning_rate": 0.00046532435792717746, + "loss": 1.1117, + "step": 9747 + }, + { + "epoch": 0.8548194166030673, + "grad_norm": 0.04296875, + "learning_rate": 0.00046512892359478, + "loss": 1.1627, + "step": 9748 + }, + { + "epoch": 0.8549071083774419, + "grad_norm": 0.042724609375, + "learning_rate": 0.0004649335973195661, + "loss": 1.0906, + "step": 9749 + }, + { + "epoch": 0.8549948001518164, + "grad_norm": 0.041748046875, + "learning_rate": 0.0004647383791193491, + "loss": 1.1372, + "step": 9750 + }, + { + "epoch": 0.8550824919261909, + "grad_norm": 0.044921875, + "learning_rate": 0.0004645432690119319, + "loss": 1.1458, + "step": 9751 + }, + { + "epoch": 0.8551701837005655, + "grad_norm": 0.04638671875, + "learning_rate": 0.0004643482670151086, + "loss": 1.1657, + "step": 9752 + }, + { + "epoch": 0.85525787547494, + "grad_norm": 0.04638671875, + "learning_rate": 0.00046415337314666236, + "loss": 1.095, + "step": 9753 + }, + { + "epoch": 0.8553455672493145, + "grad_norm": 0.048828125, + "learning_rate": 0.0004639585874243672, + "loss": 1.0849, + "step": 9754 + }, + { + "epoch": 0.8554332590236892, + "grad_norm": 0.0439453125, + "learning_rate": 0.0004637639098659875, + "loss": 1.1733, + "step": 9755 + }, + { + "epoch": 0.8555209507980637, + "grad_norm": 0.046142578125, + "learning_rate": 0.0004635693404892765, + "loss": 1.0902, + "step": 9756 + }, + { + "epoch": 0.8556086425724382, + "grad_norm": 0.04443359375, + "learning_rate": 0.0004633748793119791, + "loss": 1.1505, + "step": 9757 + }, + { + "epoch": 0.8556963343468127, + "grad_norm": 0.0576171875, + "learning_rate": 0.00046318052635182936, + "loss": 1.1375, + "step": 9758 + }, + { + "epoch": 0.8557840261211873, + "grad_norm": 0.044677734375, + "learning_rate": 0.00046298628162655186, + "loss": 1.151, + "step": 9759 + }, + { + "epoch": 0.8558717178955618, + "grad_norm": 0.049072265625, + "learning_rate": 0.0004627921451538613, + "loss": 1.1007, + "step": 9760 + }, + { + "epoch": 0.8559594096699363, + "grad_norm": 0.041259765625, + "learning_rate": 0.00046259811695146234, + "loss": 1.1091, + "step": 9761 + }, + { + "epoch": 0.856047101444311, + "grad_norm": 0.05224609375, + "learning_rate": 0.0004624041970370497, + "loss": 1.1137, + "step": 9762 + }, + { + "epoch": 0.8561347932186855, + "grad_norm": 0.048583984375, + "learning_rate": 0.00046221038542830863, + "loss": 1.1804, + "step": 9763 + }, + { + "epoch": 0.85622248499306, + "grad_norm": 0.045166015625, + "learning_rate": 0.0004620166821429143, + "loss": 1.0772, + "step": 9764 + }, + { + "epoch": 0.8563101767674346, + "grad_norm": 0.045654296875, + "learning_rate": 0.00046182308719853175, + "loss": 1.1436, + "step": 9765 + }, + { + "epoch": 0.8563978685418091, + "grad_norm": 0.046875, + "learning_rate": 0.0004616296006128162, + "loss": 1.1759, + "step": 9766 + }, + { + "epoch": 0.8564855603161836, + "grad_norm": 0.045654296875, + "learning_rate": 0.0004614362224034133, + "loss": 1.1921, + "step": 9767 + }, + { + "epoch": 0.8565732520905582, + "grad_norm": 0.04296875, + "learning_rate": 0.0004612429525879591, + "loss": 1.1232, + "step": 9768 + }, + { + "epoch": 0.8566609438649327, + "grad_norm": 0.042724609375, + "learning_rate": 0.00046104979118407864, + "loss": 1.1529, + "step": 9769 + }, + { + "epoch": 0.8567486356393073, + "grad_norm": 0.05029296875, + "learning_rate": 0.0004608567382093878, + "loss": 1.1025, + "step": 9770 + }, + { + "epoch": 0.8568363274136819, + "grad_norm": 0.0439453125, + "learning_rate": 0.00046066379368149264, + "loss": 1.1908, + "step": 9771 + }, + { + "epoch": 0.8569240191880564, + "grad_norm": 0.04150390625, + "learning_rate": 0.0004604709576179894, + "loss": 1.1124, + "step": 9772 + }, + { + "epoch": 0.8570117109624309, + "grad_norm": 0.051025390625, + "learning_rate": 0.00046027823003646416, + "loss": 1.1596, + "step": 9773 + }, + { + "epoch": 0.8570994027368055, + "grad_norm": 0.043212890625, + "learning_rate": 0.0004600856109544928, + "loss": 1.0672, + "step": 9774 + }, + { + "epoch": 0.85718709451118, + "grad_norm": 0.043701171875, + "learning_rate": 0.00045989310038964203, + "loss": 1.1121, + "step": 9775 + }, + { + "epoch": 0.8572747862855545, + "grad_norm": 0.051025390625, + "learning_rate": 0.00045970069835946824, + "loss": 1.1125, + "step": 9776 + }, + { + "epoch": 0.8573624780599292, + "grad_norm": 0.064453125, + "learning_rate": 0.00045950840488151836, + "loss": 1.1799, + "step": 9777 + }, + { + "epoch": 0.8574501698343037, + "grad_norm": 0.05517578125, + "learning_rate": 0.00045931621997332843, + "loss": 1.0867, + "step": 9778 + }, + { + "epoch": 0.8575378616086782, + "grad_norm": 0.0458984375, + "learning_rate": 0.00045912414365242545, + "loss": 1.1374, + "step": 9779 + }, + { + "epoch": 0.8576255533830528, + "grad_norm": 0.04248046875, + "learning_rate": 0.00045893217593632633, + "loss": 1.0963, + "step": 9780 + }, + { + "epoch": 0.8577132451574273, + "grad_norm": 0.047607421875, + "learning_rate": 0.00045874031684253817, + "loss": 1.0821, + "step": 9781 + }, + { + "epoch": 0.8578009369318018, + "grad_norm": 0.04248046875, + "learning_rate": 0.000458548566388558, + "loss": 1.0814, + "step": 9782 + }, + { + "epoch": 0.8578886287061763, + "grad_norm": 0.052001953125, + "learning_rate": 0.00045835692459187267, + "loss": 1.1456, + "step": 9783 + }, + { + "epoch": 0.857976320480551, + "grad_norm": 0.0439453125, + "learning_rate": 0.00045816539146995965, + "loss": 1.1459, + "step": 9784 + }, + { + "epoch": 0.8580640122549255, + "grad_norm": 0.043212890625, + "learning_rate": 0.0004579739670402864, + "loss": 1.1463, + "step": 9785 + }, + { + "epoch": 0.8581517040293, + "grad_norm": 0.041748046875, + "learning_rate": 0.00045778265132030986, + "loss": 1.1535, + "step": 9786 + }, + { + "epoch": 0.8582393958036746, + "grad_norm": 0.04052734375, + "learning_rate": 0.0004575914443274782, + "loss": 1.084, + "step": 9787 + }, + { + "epoch": 0.8583270875780491, + "grad_norm": 0.0419921875, + "learning_rate": 0.00045740034607922827, + "loss": 1.1102, + "step": 9788 + }, + { + "epoch": 0.8584147793524236, + "grad_norm": 0.046142578125, + "learning_rate": 0.00045720935659298824, + "loss": 1.102, + "step": 9789 + }, + { + "epoch": 0.8585024711267982, + "grad_norm": 0.04150390625, + "learning_rate": 0.0004570184758861755, + "loss": 1.1132, + "step": 9790 + }, + { + "epoch": 0.8585901629011727, + "grad_norm": 0.04736328125, + "learning_rate": 0.0004568277039761985, + "loss": 1.1168, + "step": 9791 + }, + { + "epoch": 0.8586778546755472, + "grad_norm": 0.05078125, + "learning_rate": 0.0004566370408804545, + "loss": 1.0892, + "step": 9792 + }, + { + "epoch": 0.8587655464499219, + "grad_norm": 0.045654296875, + "learning_rate": 0.0004564464866163317, + "loss": 1.093, + "step": 9793 + }, + { + "epoch": 0.8588532382242964, + "grad_norm": 0.0439453125, + "learning_rate": 0.0004562560412012082, + "loss": 1.1019, + "step": 9794 + }, + { + "epoch": 0.8589409299986709, + "grad_norm": 0.041748046875, + "learning_rate": 0.0004560657046524523, + "loss": 1.1642, + "step": 9795 + }, + { + "epoch": 0.8590286217730455, + "grad_norm": 0.041748046875, + "learning_rate": 0.00045587547698742174, + "loss": 1.1228, + "step": 9796 + }, + { + "epoch": 0.85911631354742, + "grad_norm": 0.04296875, + "learning_rate": 0.00045568535822346505, + "loss": 1.148, + "step": 9797 + }, + { + "epoch": 0.8592040053217945, + "grad_norm": 0.0400390625, + "learning_rate": 0.0004554953483779208, + "loss": 1.1241, + "step": 9798 + }, + { + "epoch": 0.8592916970961691, + "grad_norm": 0.046875, + "learning_rate": 0.00045530544746811717, + "loss": 1.1176, + "step": 9799 + }, + { + "epoch": 0.8593793888705437, + "grad_norm": 0.0458984375, + "learning_rate": 0.0004551156555113725, + "loss": 1.1434, + "step": 9800 + }, + { + "epoch": 0.8594670806449182, + "grad_norm": 0.043701171875, + "learning_rate": 0.0004549259725249955, + "loss": 1.101, + "step": 9801 + }, + { + "epoch": 0.8595547724192928, + "grad_norm": 0.044677734375, + "learning_rate": 0.0004547363985262849, + "loss": 1.1217, + "step": 9802 + }, + { + "epoch": 0.8596424641936673, + "grad_norm": 0.04248046875, + "learning_rate": 0.00045454693353252907, + "loss": 1.1048, + "step": 9803 + }, + { + "epoch": 0.8597301559680418, + "grad_norm": 0.0419921875, + "learning_rate": 0.00045435757756100696, + "loss": 1.1397, + "step": 9804 + }, + { + "epoch": 0.8598178477424163, + "grad_norm": 0.0478515625, + "learning_rate": 0.000454168330628987, + "loss": 1.112, + "step": 9805 + }, + { + "epoch": 0.8599055395167909, + "grad_norm": 0.052001953125, + "learning_rate": 0.00045397919275372826, + "loss": 1.1584, + "step": 9806 + }, + { + "epoch": 0.8599932312911654, + "grad_norm": 0.048583984375, + "learning_rate": 0.00045379016395247977, + "loss": 1.1453, + "step": 9807 + }, + { + "epoch": 0.86008092306554, + "grad_norm": 0.04345703125, + "learning_rate": 0.0004536012442424805, + "loss": 1.1543, + "step": 9808 + }, + { + "epoch": 0.8601686148399146, + "grad_norm": 0.043701171875, + "learning_rate": 0.00045341243364095896, + "loss": 1.1633, + "step": 9809 + }, + { + "epoch": 0.8602563066142891, + "grad_norm": 0.042724609375, + "learning_rate": 0.00045322373216513456, + "loss": 1.1127, + "step": 9810 + }, + { + "epoch": 0.8603439983886636, + "grad_norm": 0.039794921875, + "learning_rate": 0.00045303513983221635, + "loss": 1.0812, + "step": 9811 + }, + { + "epoch": 0.8604316901630382, + "grad_norm": 0.044677734375, + "learning_rate": 0.0004528466566594034, + "loss": 1.1023, + "step": 9812 + }, + { + "epoch": 0.8605193819374127, + "grad_norm": 0.047607421875, + "learning_rate": 0.0004526582826638852, + "loss": 1.1241, + "step": 9813 + }, + { + "epoch": 0.8606070737117872, + "grad_norm": 0.0546875, + "learning_rate": 0.0004524700178628405, + "loss": 1.1551, + "step": 9814 + }, + { + "epoch": 0.8606947654861619, + "grad_norm": 0.06591796875, + "learning_rate": 0.00045228186227343865, + "loss": 1.132, + "step": 9815 + }, + { + "epoch": 0.8607824572605364, + "grad_norm": 0.0478515625, + "learning_rate": 0.0004520938159128392, + "loss": 1.137, + "step": 9816 + }, + { + "epoch": 0.8608701490349109, + "grad_norm": 0.0703125, + "learning_rate": 0.00045190587879819145, + "loss": 1.1322, + "step": 9817 + }, + { + "epoch": 0.8609578408092855, + "grad_norm": 0.0478515625, + "learning_rate": 0.0004517180509466344, + "loss": 1.112, + "step": 9818 + }, + { + "epoch": 0.86104553258366, + "grad_norm": 0.062255859375, + "learning_rate": 0.00045153033237529814, + "loss": 1.1188, + "step": 9819 + }, + { + "epoch": 0.8611332243580345, + "grad_norm": 0.068359375, + "learning_rate": 0.00045134272310130147, + "loss": 1.137, + "step": 9820 + }, + { + "epoch": 0.8612209161324091, + "grad_norm": 0.044921875, + "learning_rate": 0.00045115522314175406, + "loss": 1.1229, + "step": 9821 + }, + { + "epoch": 0.8613086079067837, + "grad_norm": 0.0478515625, + "learning_rate": 0.00045096783251375594, + "loss": 1.1675, + "step": 9822 + }, + { + "epoch": 0.8613962996811582, + "grad_norm": 0.04736328125, + "learning_rate": 0.0004507805512343958, + "loss": 1.096, + "step": 9823 + }, + { + "epoch": 0.8614839914555328, + "grad_norm": 0.043212890625, + "learning_rate": 0.0004505933793207536, + "loss": 1.1094, + "step": 9824 + }, + { + "epoch": 0.8615716832299073, + "grad_norm": 0.045166015625, + "learning_rate": 0.00045040631678989904, + "loss": 1.0766, + "step": 9825 + }, + { + "epoch": 0.8616593750042818, + "grad_norm": 0.046630859375, + "learning_rate": 0.0004502193636588918, + "loss": 1.1525, + "step": 9826 + }, + { + "epoch": 0.8617470667786564, + "grad_norm": 0.0546875, + "learning_rate": 0.0004500325199447812, + "loss": 1.1101, + "step": 9827 + }, + { + "epoch": 0.8618347585530309, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004498457856646071, + "loss": 1.0606, + "step": 9828 + }, + { + "epoch": 0.8619224503274054, + "grad_norm": 0.0458984375, + "learning_rate": 0.000449659160835399, + "loss": 1.1601, + "step": 9829 + }, + { + "epoch": 0.86201014210178, + "grad_norm": 0.04736328125, + "learning_rate": 0.00044947264547417733, + "loss": 1.1422, + "step": 9830 + }, + { + "epoch": 0.8620978338761546, + "grad_norm": 0.043212890625, + "learning_rate": 0.0004492862395979508, + "loss": 1.0959, + "step": 9831 + }, + { + "epoch": 0.8621855256505291, + "grad_norm": 0.044921875, + "learning_rate": 0.00044909994322371957, + "loss": 1.1758, + "step": 9832 + }, + { + "epoch": 0.8622732174249036, + "grad_norm": 0.05029296875, + "learning_rate": 0.00044891375636847344, + "loss": 1.142, + "step": 9833 + }, + { + "epoch": 0.8623609091992782, + "grad_norm": 0.047119140625, + "learning_rate": 0.0004487276790491925, + "loss": 1.1204, + "step": 9834 + }, + { + "epoch": 0.8624486009736527, + "grad_norm": 0.06005859375, + "learning_rate": 0.00044854171128284567, + "loss": 1.1317, + "step": 9835 + }, + { + "epoch": 0.8625362927480272, + "grad_norm": 0.04541015625, + "learning_rate": 0.0004483558530863936, + "loss": 1.1388, + "step": 9836 + }, + { + "epoch": 0.8626239845224019, + "grad_norm": 0.055908203125, + "learning_rate": 0.0004481701044767855, + "loss": 1.0911, + "step": 9837 + }, + { + "epoch": 0.8627116762967764, + "grad_norm": 0.05224609375, + "learning_rate": 0.00044798446547096143, + "loss": 1.1727, + "step": 9838 + }, + { + "epoch": 0.8627993680711509, + "grad_norm": 0.045166015625, + "learning_rate": 0.00044779893608585104, + "loss": 1.0929, + "step": 9839 + }, + { + "epoch": 0.8628870598455255, + "grad_norm": 0.04296875, + "learning_rate": 0.0004476135163383745, + "loss": 1.1326, + "step": 9840 + }, + { + "epoch": 0.8629747516199, + "grad_norm": 0.0478515625, + "learning_rate": 0.000447428206245441, + "loss": 1.1008, + "step": 9841 + }, + { + "epoch": 0.8630624433942745, + "grad_norm": 0.044921875, + "learning_rate": 0.00044724300582395053, + "loss": 1.1546, + "step": 9842 + }, + { + "epoch": 0.8631501351686491, + "grad_norm": 0.04296875, + "learning_rate": 0.00044705791509079334, + "loss": 1.15, + "step": 9843 + }, + { + "epoch": 0.8632378269430236, + "grad_norm": 0.048583984375, + "learning_rate": 0.000446872934062849, + "loss": 1.1296, + "step": 9844 + }, + { + "epoch": 0.8633255187173982, + "grad_norm": 0.05078125, + "learning_rate": 0.0004466880627569869, + "loss": 1.1438, + "step": 9845 + }, + { + "epoch": 0.8634132104917728, + "grad_norm": 0.05126953125, + "learning_rate": 0.00044650330119006725, + "loss": 1.1049, + "step": 9846 + }, + { + "epoch": 0.8635009022661473, + "grad_norm": 0.046630859375, + "learning_rate": 0.0004463186493789398, + "loss": 1.1744, + "step": 9847 + }, + { + "epoch": 0.8635885940405218, + "grad_norm": 0.05078125, + "learning_rate": 0.00044613410734044436, + "loss": 1.105, + "step": 9848 + }, + { + "epoch": 0.8636762858148964, + "grad_norm": 0.0439453125, + "learning_rate": 0.00044594967509141025, + "loss": 1.118, + "step": 9849 + }, + { + "epoch": 0.8637639775892709, + "grad_norm": 0.0478515625, + "learning_rate": 0.00044576535264865776, + "loss": 1.1352, + "step": 9850 + }, + { + "epoch": 0.8638516693636454, + "grad_norm": 0.045166015625, + "learning_rate": 0.00044558114002899606, + "loss": 1.0946, + "step": 9851 + }, + { + "epoch": 0.8639393611380201, + "grad_norm": 0.043701171875, + "learning_rate": 0.0004453970372492257, + "loss": 1.1595, + "step": 9852 + }, + { + "epoch": 0.8640270529123946, + "grad_norm": 0.054931640625, + "learning_rate": 0.00044521304432613594, + "loss": 1.0893, + "step": 9853 + }, + { + "epoch": 0.8641147446867691, + "grad_norm": 0.04736328125, + "learning_rate": 0.00044502916127650593, + "loss": 1.1185, + "step": 9854 + }, + { + "epoch": 0.8642024364611436, + "grad_norm": 0.0419921875, + "learning_rate": 0.0004448453881171059, + "loss": 1.124, + "step": 9855 + }, + { + "epoch": 0.8642901282355182, + "grad_norm": 0.047607421875, + "learning_rate": 0.0004446617248646954, + "loss": 1.0964, + "step": 9856 + }, + { + "epoch": 0.8643778200098927, + "grad_norm": 0.0654296875, + "learning_rate": 0.0004444781715360243, + "loss": 1.1098, + "step": 9857 + }, + { + "epoch": 0.8644655117842672, + "grad_norm": 0.042724609375, + "learning_rate": 0.00044429472814783155, + "loss": 1.1513, + "step": 9858 + }, + { + "epoch": 0.8645532035586418, + "grad_norm": 0.043212890625, + "learning_rate": 0.0004441113947168471, + "loss": 1.0964, + "step": 9859 + }, + { + "epoch": 0.8646408953330164, + "grad_norm": 0.045166015625, + "learning_rate": 0.0004439281712597906, + "loss": 1.1486, + "step": 9860 + }, + { + "epoch": 0.8647285871073909, + "grad_norm": 0.05078125, + "learning_rate": 0.00044374505779337154, + "loss": 1.1763, + "step": 9861 + }, + { + "epoch": 0.8648162788817655, + "grad_norm": 0.06298828125, + "learning_rate": 0.000443562054334289, + "loss": 1.1796, + "step": 9862 + }, + { + "epoch": 0.86490397065614, + "grad_norm": 0.044921875, + "learning_rate": 0.0004433791608992328, + "loss": 1.1487, + "step": 9863 + }, + { + "epoch": 0.8649916624305145, + "grad_norm": 0.042236328125, + "learning_rate": 0.0004431963775048823, + "loss": 1.0923, + "step": 9864 + }, + { + "epoch": 0.8650793542048891, + "grad_norm": 0.04345703125, + "learning_rate": 0.00044301370416790684, + "loss": 1.1753, + "step": 9865 + }, + { + "epoch": 0.8651670459792636, + "grad_norm": 0.0478515625, + "learning_rate": 0.00044283114090496584, + "loss": 1.1221, + "step": 9866 + }, + { + "epoch": 0.8652547377536381, + "grad_norm": 0.046630859375, + "learning_rate": 0.0004426486877327082, + "loss": 1.0957, + "step": 9867 + }, + { + "epoch": 0.8653424295280128, + "grad_norm": 0.04248046875, + "learning_rate": 0.0004424663446677736, + "loss": 1.1225, + "step": 9868 + }, + { + "epoch": 0.8654301213023873, + "grad_norm": 0.050537109375, + "learning_rate": 0.0004422841117267911, + "loss": 1.0965, + "step": 9869 + }, + { + "epoch": 0.8655178130767618, + "grad_norm": 0.0439453125, + "learning_rate": 0.00044210198892638027, + "loss": 1.171, + "step": 9870 + }, + { + "epoch": 0.8656055048511364, + "grad_norm": 0.04931640625, + "learning_rate": 0.0004419199762831498, + "loss": 1.1254, + "step": 9871 + }, + { + "epoch": 0.8656931966255109, + "grad_norm": 0.0498046875, + "learning_rate": 0.0004417380738136985, + "loss": 1.1511, + "step": 9872 + }, + { + "epoch": 0.8657808883998854, + "grad_norm": 0.043701171875, + "learning_rate": 0.0004415562815346159, + "loss": 1.1471, + "step": 9873 + }, + { + "epoch": 0.86586858017426, + "grad_norm": 0.046142578125, + "learning_rate": 0.0004413745994624807, + "loss": 1.1009, + "step": 9874 + }, + { + "epoch": 0.8659562719486346, + "grad_norm": 0.042724609375, + "learning_rate": 0.0004411930276138624, + "loss": 1.1512, + "step": 9875 + }, + { + "epoch": 0.8660439637230091, + "grad_norm": 0.041259765625, + "learning_rate": 0.00044101156600531916, + "loss": 1.0685, + "step": 9876 + }, + { + "epoch": 0.8661316554973836, + "grad_norm": 0.046142578125, + "learning_rate": 0.0004408302146534001, + "loss": 1.1239, + "step": 9877 + }, + { + "epoch": 0.8662193472717582, + "grad_norm": 0.04248046875, + "learning_rate": 0.0004406489735746439, + "loss": 1.1116, + "step": 9878 + }, + { + "epoch": 0.8663070390461327, + "grad_norm": 0.044921875, + "learning_rate": 0.00044046784278557976, + "loss": 1.1166, + "step": 9879 + }, + { + "epoch": 0.8663947308205072, + "grad_norm": 0.044921875, + "learning_rate": 0.00044028682230272577, + "loss": 1.1129, + "step": 9880 + }, + { + "epoch": 0.8664824225948818, + "grad_norm": 0.05029296875, + "learning_rate": 0.0004401059121425905, + "loss": 1.101, + "step": 9881 + }, + { + "epoch": 0.8665701143692564, + "grad_norm": 0.056884765625, + "learning_rate": 0.0004399251123216728, + "loss": 1.1271, + "step": 9882 + }, + { + "epoch": 0.8666578061436309, + "grad_norm": 0.046630859375, + "learning_rate": 0.0004397444228564615, + "loss": 1.1343, + "step": 9883 + }, + { + "epoch": 0.8667454979180055, + "grad_norm": 0.042236328125, + "learning_rate": 0.0004395638437634341, + "loss": 1.0843, + "step": 9884 + }, + { + "epoch": 0.86683318969238, + "grad_norm": 0.045166015625, + "learning_rate": 0.0004393833750590596, + "loss": 1.113, + "step": 9885 + }, + { + "epoch": 0.8669208814667545, + "grad_norm": 0.046142578125, + "learning_rate": 0.00043920301675979596, + "loss": 1.1092, + "step": 9886 + }, + { + "epoch": 0.8670085732411291, + "grad_norm": 0.04296875, + "learning_rate": 0.00043902276888209196, + "loss": 1.127, + "step": 9887 + }, + { + "epoch": 0.8670962650155036, + "grad_norm": 0.049072265625, + "learning_rate": 0.00043884263144238514, + "loss": 1.1439, + "step": 9888 + }, + { + "epoch": 0.8671839567898781, + "grad_norm": 0.0517578125, + "learning_rate": 0.00043866260445710355, + "loss": 1.1047, + "step": 9889 + }, + { + "epoch": 0.8672716485642528, + "grad_norm": 0.04541015625, + "learning_rate": 0.0004384826879426653, + "loss": 1.1111, + "step": 9890 + }, + { + "epoch": 0.8673593403386273, + "grad_norm": 0.04833984375, + "learning_rate": 0.00043830288191547864, + "loss": 1.0754, + "step": 9891 + }, + { + "epoch": 0.8674470321130018, + "grad_norm": 0.044677734375, + "learning_rate": 0.00043812318639194084, + "loss": 1.1161, + "step": 9892 + }, + { + "epoch": 0.8675347238873764, + "grad_norm": 0.0478515625, + "learning_rate": 0.00043794360138844045, + "loss": 1.1981, + "step": 9893 + }, + { + "epoch": 0.8676224156617509, + "grad_norm": 0.043212890625, + "learning_rate": 0.0004377641269213545, + "loss": 1.1418, + "step": 9894 + }, + { + "epoch": 0.8677101074361254, + "grad_norm": 0.04296875, + "learning_rate": 0.00043758476300705064, + "loss": 1.1135, + "step": 9895 + }, + { + "epoch": 0.8677977992105, + "grad_norm": 0.043212890625, + "learning_rate": 0.0004374055096618868, + "loss": 1.0697, + "step": 9896 + }, + { + "epoch": 0.8678854909848746, + "grad_norm": 0.047607421875, + "learning_rate": 0.00043722636690221025, + "loss": 1.1285, + "step": 9897 + }, + { + "epoch": 0.8679731827592491, + "grad_norm": 0.051025390625, + "learning_rate": 0.00043704733474435797, + "loss": 1.1349, + "step": 9898 + }, + { + "epoch": 0.8680608745336237, + "grad_norm": 0.054931640625, + "learning_rate": 0.00043686841320465764, + "loss": 1.1651, + "step": 9899 + }, + { + "epoch": 0.8681485663079982, + "grad_norm": 0.049072265625, + "learning_rate": 0.00043668960229942635, + "loss": 1.1073, + "step": 9900 + }, + { + "epoch": 0.8682362580823727, + "grad_norm": 0.042724609375, + "learning_rate": 0.0004365109020449715, + "loss": 1.1426, + "step": 9901 + }, + { + "epoch": 0.8683239498567472, + "grad_norm": 0.04248046875, + "learning_rate": 0.0004363323124575895, + "loss": 1.1544, + "step": 9902 + }, + { + "epoch": 0.8684116416311218, + "grad_norm": 0.048583984375, + "learning_rate": 0.00043615383355356737, + "loss": 1.0973, + "step": 9903 + }, + { + "epoch": 0.8684993334054963, + "grad_norm": 0.04052734375, + "learning_rate": 0.0004359754653491825, + "loss": 1.1119, + "step": 9904 + }, + { + "epoch": 0.8685870251798709, + "grad_norm": 0.04150390625, + "learning_rate": 0.00043579720786070096, + "loss": 1.1324, + "step": 9905 + }, + { + "epoch": 0.8686747169542455, + "grad_norm": 0.041015625, + "learning_rate": 0.0004356190611043798, + "loss": 1.1055, + "step": 9906 + }, + { + "epoch": 0.86876240872862, + "grad_norm": 0.045166015625, + "learning_rate": 0.00043544102509646506, + "loss": 1.1625, + "step": 9907 + }, + { + "epoch": 0.8688501005029945, + "grad_norm": 0.05810546875, + "learning_rate": 0.00043526309985319355, + "loss": 1.1062, + "step": 9908 + }, + { + "epoch": 0.8689377922773691, + "grad_norm": 0.04736328125, + "learning_rate": 0.00043508528539079144, + "loss": 1.1309, + "step": 9909 + }, + { + "epoch": 0.8690254840517436, + "grad_norm": 0.06298828125, + "learning_rate": 0.00043490758172547525, + "loss": 1.1133, + "step": 9910 + }, + { + "epoch": 0.8691131758261181, + "grad_norm": 0.044921875, + "learning_rate": 0.0004347299888734507, + "loss": 1.0884, + "step": 9911 + }, + { + "epoch": 0.8692008676004928, + "grad_norm": 0.04345703125, + "learning_rate": 0.000434552506850914, + "loss": 1.118, + "step": 9912 + }, + { + "epoch": 0.8692885593748673, + "grad_norm": 0.045166015625, + "learning_rate": 0.00043437513567405085, + "loss": 1.1172, + "step": 9913 + }, + { + "epoch": 0.8693762511492418, + "grad_norm": 0.044921875, + "learning_rate": 0.00043419787535903763, + "loss": 1.0849, + "step": 9914 + }, + { + "epoch": 0.8694639429236164, + "grad_norm": 0.042236328125, + "learning_rate": 0.0004340207259220393, + "loss": 1.1097, + "step": 9915 + }, + { + "epoch": 0.8695516346979909, + "grad_norm": 0.044189453125, + "learning_rate": 0.0004338436873792116, + "loss": 1.1156, + "step": 9916 + }, + { + "epoch": 0.8696393264723654, + "grad_norm": 0.049560546875, + "learning_rate": 0.00043366675974670026, + "loss": 1.1471, + "step": 9917 + }, + { + "epoch": 0.86972701824674, + "grad_norm": 0.041748046875, + "learning_rate": 0.00043348994304064054, + "loss": 1.1024, + "step": 9918 + }, + { + "epoch": 0.8698147100211145, + "grad_norm": 0.04443359375, + "learning_rate": 0.0004333132372771577, + "loss": 1.1382, + "step": 9919 + }, + { + "epoch": 0.869902401795489, + "grad_norm": 0.0419921875, + "learning_rate": 0.00043313664247236663, + "loss": 1.1343, + "step": 9920 + }, + { + "epoch": 0.8699900935698637, + "grad_norm": 0.047607421875, + "learning_rate": 0.00043296015864237265, + "loss": 1.1069, + "step": 9921 + }, + { + "epoch": 0.8700777853442382, + "grad_norm": 0.0556640625, + "learning_rate": 0.00043278378580327006, + "loss": 1.1137, + "step": 9922 + }, + { + "epoch": 0.8701654771186127, + "grad_norm": 0.053955078125, + "learning_rate": 0.0004326075239711441, + "loss": 1.1276, + "step": 9923 + }, + { + "epoch": 0.8702531688929873, + "grad_norm": 0.05859375, + "learning_rate": 0.0004324313731620695, + "loss": 1.162, + "step": 9924 + }, + { + "epoch": 0.8703408606673618, + "grad_norm": 0.047607421875, + "learning_rate": 0.0004322553333921103, + "loss": 1.1152, + "step": 9925 + }, + { + "epoch": 0.8704285524417363, + "grad_norm": 0.048583984375, + "learning_rate": 0.0004320794046773212, + "loss": 1.1247, + "step": 9926 + }, + { + "epoch": 0.8705162442161108, + "grad_norm": 0.04345703125, + "learning_rate": 0.0004319035870337462, + "loss": 1.1184, + "step": 9927 + }, + { + "epoch": 0.8706039359904855, + "grad_norm": 0.044189453125, + "learning_rate": 0.00043172788047741984, + "loss": 1.1429, + "step": 9928 + }, + { + "epoch": 0.87069162776486, + "grad_norm": 0.045654296875, + "learning_rate": 0.0004315522850243657, + "loss": 1.1238, + "step": 9929 + }, + { + "epoch": 0.8707793195392345, + "grad_norm": 0.046142578125, + "learning_rate": 0.0004313768006905975, + "loss": 1.1444, + "step": 9930 + }, + { + "epoch": 0.8708670113136091, + "grad_norm": 0.04150390625, + "learning_rate": 0.0004312014274921195, + "loss": 1.1181, + "step": 9931 + }, + { + "epoch": 0.8709547030879836, + "grad_norm": 0.0634765625, + "learning_rate": 0.000431026165444925, + "loss": 1.1409, + "step": 9932 + }, + { + "epoch": 0.8710423948623581, + "grad_norm": 0.0458984375, + "learning_rate": 0.0004308510145649974, + "loss": 1.1649, + "step": 9933 + }, + { + "epoch": 0.8711300866367327, + "grad_norm": 0.04248046875, + "learning_rate": 0.00043067597486830993, + "loss": 1.1273, + "step": 9934 + }, + { + "epoch": 0.8712177784111073, + "grad_norm": 0.0478515625, + "learning_rate": 0.000430501046370826, + "loss": 1.1104, + "step": 9935 + }, + { + "epoch": 0.8713054701854818, + "grad_norm": 0.04541015625, + "learning_rate": 0.00043032622908849874, + "loss": 1.1337, + "step": 9936 + }, + { + "epoch": 0.8713931619598564, + "grad_norm": 0.042724609375, + "learning_rate": 0.0004301515230372704, + "loss": 1.0986, + "step": 9937 + }, + { + "epoch": 0.8714808537342309, + "grad_norm": 0.043701171875, + "learning_rate": 0.0004299769282330745, + "loss": 1.152, + "step": 9938 + }, + { + "epoch": 0.8715685455086054, + "grad_norm": 0.046875, + "learning_rate": 0.0004298024446918331, + "loss": 1.154, + "step": 9939 + }, + { + "epoch": 0.87165623728298, + "grad_norm": 0.047607421875, + "learning_rate": 0.0004296280724294587, + "loss": 1.1612, + "step": 9940 + }, + { + "epoch": 0.8717439290573545, + "grad_norm": 0.0400390625, + "learning_rate": 0.00042945381146185414, + "loss": 1.1055, + "step": 9941 + }, + { + "epoch": 0.871831620831729, + "grad_norm": 0.0439453125, + "learning_rate": 0.0004292796618049107, + "loss": 1.1523, + "step": 9942 + }, + { + "epoch": 0.8719193126061037, + "grad_norm": 0.04345703125, + "learning_rate": 0.00042910562347451083, + "loss": 1.1656, + "step": 9943 + }, + { + "epoch": 0.8720070043804782, + "grad_norm": 0.046142578125, + "learning_rate": 0.0004289316964865265, + "loss": 1.1781, + "step": 9944 + }, + { + "epoch": 0.8720946961548527, + "grad_norm": 0.04443359375, + "learning_rate": 0.0004287578808568191, + "loss": 1.2174, + "step": 9945 + }, + { + "epoch": 0.8721823879292273, + "grad_norm": 0.04833984375, + "learning_rate": 0.0004285841766012408, + "loss": 1.1036, + "step": 9946 + }, + { + "epoch": 0.8722700797036018, + "grad_norm": 0.0419921875, + "learning_rate": 0.0004284105837356322, + "loss": 1.1524, + "step": 9947 + }, + { + "epoch": 0.8723577714779763, + "grad_norm": 0.04931640625, + "learning_rate": 0.00042823710227582495, + "loss": 1.0991, + "step": 9948 + }, + { + "epoch": 0.8724454632523508, + "grad_norm": 0.04248046875, + "learning_rate": 0.00042806373223763984, + "loss": 1.1533, + "step": 9949 + }, + { + "epoch": 0.8725331550267255, + "grad_norm": 0.045166015625, + "learning_rate": 0.00042789047363688833, + "loss": 1.1264, + "step": 9950 + }, + { + "epoch": 0.8726208468011, + "grad_norm": 0.044677734375, + "learning_rate": 0.00042771732648937035, + "loss": 1.0901, + "step": 9951 + }, + { + "epoch": 0.8727085385754745, + "grad_norm": 0.045654296875, + "learning_rate": 0.0004275442908108771, + "loss": 1.2054, + "step": 9952 + }, + { + "epoch": 0.8727962303498491, + "grad_norm": 0.04248046875, + "learning_rate": 0.00042737136661718855, + "loss": 1.09, + "step": 9953 + }, + { + "epoch": 0.8728839221242236, + "grad_norm": 0.054443359375, + "learning_rate": 0.0004271985539240755, + "loss": 1.1582, + "step": 9954 + }, + { + "epoch": 0.8729716138985981, + "grad_norm": 0.06103515625, + "learning_rate": 0.0004270258527472979, + "loss": 1.0873, + "step": 9955 + }, + { + "epoch": 0.8730593056729727, + "grad_norm": 0.053955078125, + "learning_rate": 0.0004268532631026049, + "loss": 1.12, + "step": 9956 + }, + { + "epoch": 0.8731469974473473, + "grad_norm": 0.044189453125, + "learning_rate": 0.00042668078500573677, + "loss": 1.1726, + "step": 9957 + }, + { + "epoch": 0.8732346892217218, + "grad_norm": 0.04833984375, + "learning_rate": 0.00042650841847242325, + "loss": 1.1801, + "step": 9958 + }, + { + "epoch": 0.8733223809960964, + "grad_norm": 0.04248046875, + "learning_rate": 0.00042633616351838365, + "loss": 1.1355, + "step": 9959 + }, + { + "epoch": 0.8734100727704709, + "grad_norm": 0.048828125, + "learning_rate": 0.0004261640201593269, + "loss": 1.1412, + "step": 9960 + }, + { + "epoch": 0.8734977645448454, + "grad_norm": 0.048095703125, + "learning_rate": 0.00042599198841095223, + "loss": 1.1516, + "step": 9961 + }, + { + "epoch": 0.87358545631922, + "grad_norm": 0.059814453125, + "learning_rate": 0.0004258200682889482, + "loss": 1.1193, + "step": 9962 + }, + { + "epoch": 0.8736731480935945, + "grad_norm": 0.049072265625, + "learning_rate": 0.0004256482598089943, + "loss": 1.1439, + "step": 9963 + }, + { + "epoch": 0.873760839867969, + "grad_norm": 0.04296875, + "learning_rate": 0.00042547656298675806, + "loss": 1.1135, + "step": 9964 + }, + { + "epoch": 0.8738485316423437, + "grad_norm": 0.04248046875, + "learning_rate": 0.00042530497783789826, + "loss": 1.0825, + "step": 9965 + }, + { + "epoch": 0.8739362234167182, + "grad_norm": 0.04150390625, + "learning_rate": 0.0004251335043780628, + "loss": 1.0954, + "step": 9966 + }, + { + "epoch": 0.8740239151910927, + "grad_norm": 0.04443359375, + "learning_rate": 0.0004249621426228902, + "loss": 1.0629, + "step": 9967 + }, + { + "epoch": 0.8741116069654673, + "grad_norm": 0.04296875, + "learning_rate": 0.0004247908925880073, + "loss": 1.1624, + "step": 9968 + }, + { + "epoch": 0.8741992987398418, + "grad_norm": 0.051025390625, + "learning_rate": 0.0004246197542890323, + "loss": 1.1488, + "step": 9969 + }, + { + "epoch": 0.8742869905142163, + "grad_norm": 0.0615234375, + "learning_rate": 0.00042444872774157226, + "loss": 1.1421, + "step": 9970 + }, + { + "epoch": 0.8743746822885909, + "grad_norm": 0.0478515625, + "learning_rate": 0.0004242778129612247, + "loss": 1.1597, + "step": 9971 + }, + { + "epoch": 0.8744623740629655, + "grad_norm": 0.04248046875, + "learning_rate": 0.0004241070099635768, + "loss": 1.1248, + "step": 9972 + }, + { + "epoch": 0.87455006583734, + "grad_norm": 0.044189453125, + "learning_rate": 0.0004239363187642045, + "loss": 1.0638, + "step": 9973 + }, + { + "epoch": 0.8746377576117145, + "grad_norm": 0.04541015625, + "learning_rate": 0.0004237657393786749, + "loss": 1.135, + "step": 9974 + }, + { + "epoch": 0.8747254493860891, + "grad_norm": 0.05078125, + "learning_rate": 0.00042359527182254426, + "loss": 1.1566, + "step": 9975 + }, + { + "epoch": 0.8748131411604636, + "grad_norm": 0.043212890625, + "learning_rate": 0.00042342491611135896, + "loss": 1.1713, + "step": 9976 + }, + { + "epoch": 0.8749008329348381, + "grad_norm": 0.044921875, + "learning_rate": 0.00042325467226065505, + "loss": 1.1344, + "step": 9977 + }, + { + "epoch": 0.8749885247092127, + "grad_norm": 0.042236328125, + "learning_rate": 0.0004230845402859581, + "loss": 1.1154, + "step": 9978 + }, + { + "epoch": 0.8750762164835872, + "grad_norm": 0.041259765625, + "learning_rate": 0.0004229145202027839, + "loss": 1.1036, + "step": 9979 + }, + { + "epoch": 0.8751639082579618, + "grad_norm": 0.043701171875, + "learning_rate": 0.00042274461202663745, + "loss": 1.1386, + "step": 9980 + }, + { + "epoch": 0.8752516000323364, + "grad_norm": 0.04052734375, + "learning_rate": 0.0004225748157730148, + "loss": 1.1012, + "step": 9981 + }, + { + "epoch": 0.8753392918067109, + "grad_norm": 0.0419921875, + "learning_rate": 0.00042240513145739986, + "loss": 1.1533, + "step": 9982 + }, + { + "epoch": 0.8754269835810854, + "grad_norm": 0.04248046875, + "learning_rate": 0.0004222355590952681, + "loss": 1.1253, + "step": 9983 + }, + { + "epoch": 0.87551467535546, + "grad_norm": 0.041748046875, + "learning_rate": 0.0004220660987020838, + "loss": 1.0936, + "step": 9984 + }, + { + "epoch": 0.8756023671298345, + "grad_norm": 0.04052734375, + "learning_rate": 0.0004218967502933016, + "loss": 1.0833, + "step": 9985 + }, + { + "epoch": 0.875690058904209, + "grad_norm": 0.0517578125, + "learning_rate": 0.00042172751388436534, + "loss": 1.1584, + "step": 9986 + }, + { + "epoch": 0.8757777506785837, + "grad_norm": 0.04638671875, + "learning_rate": 0.000421558389490709, + "loss": 1.1314, + "step": 9987 + }, + { + "epoch": 0.8758654424529582, + "grad_norm": 0.04345703125, + "learning_rate": 0.0004213893771277562, + "loss": 1.1364, + "step": 9988 + }, + { + "epoch": 0.8759531342273327, + "grad_norm": 0.045654296875, + "learning_rate": 0.00042122047681092097, + "loss": 1.1391, + "step": 9989 + }, + { + "epoch": 0.8760408260017073, + "grad_norm": 0.04443359375, + "learning_rate": 0.00042105168855560617, + "loss": 1.1301, + "step": 9990 + }, + { + "epoch": 0.8761285177760818, + "grad_norm": 0.04443359375, + "learning_rate": 0.00042088301237720456, + "loss": 1.1312, + "step": 9991 + }, + { + "epoch": 0.8762162095504563, + "grad_norm": 0.04345703125, + "learning_rate": 0.00042071444829109927, + "loss": 1.1921, + "step": 9992 + }, + { + "epoch": 0.8763039013248309, + "grad_norm": 0.04296875, + "learning_rate": 0.00042054599631266283, + "loss": 1.0855, + "step": 9993 + }, + { + "epoch": 0.8763915930992054, + "grad_norm": 0.05859375, + "learning_rate": 0.00042037765645725817, + "loss": 1.1294, + "step": 9994 + }, + { + "epoch": 0.87647928487358, + "grad_norm": 0.041259765625, + "learning_rate": 0.0004202094287402365, + "loss": 1.0818, + "step": 9995 + }, + { + "epoch": 0.8765669766479545, + "grad_norm": 0.047119140625, + "learning_rate": 0.00042004131317694034, + "loss": 1.1676, + "step": 9996 + }, + { + "epoch": 0.8766546684223291, + "grad_norm": 0.045166015625, + "learning_rate": 0.00041987330978270125, + "loss": 1.1169, + "step": 9997 + }, + { + "epoch": 0.8767423601967036, + "grad_norm": 0.04052734375, + "learning_rate": 0.0004197054185728409, + "loss": 1.09, + "step": 9998 + }, + { + "epoch": 0.8768300519710781, + "grad_norm": 0.0458984375, + "learning_rate": 0.00041953763956267045, + "loss": 1.0852, + "step": 9999 + }, + { + "epoch": 0.8769177437454527, + "grad_norm": 0.043212890625, + "learning_rate": 0.0004193699727674907, + "loss": 1.1104, + "step": 10000 + }, + { + "epoch": 0.8769177437454527, + "eval_loss": 1.1368882656097412, + "eval_runtime": 439.2361, + "eval_samples_per_second": 32.891, + "eval_steps_per_second": 8.223, + "step": 10000 + } + ], + "logging_steps": 1.0, + "max_steps": 11403, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.948783792128e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}