{ "best_metric": null, "best_model_checkpoint": null, "epoch": 62.512, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.064, "grad_norm": 10.007953573903052, "learning_rate": 1.666666666666667e-08, "loss": 1.1368, "mean_token_accuracy": 0.7181684672832489, "step": 1 }, { "epoch": 0.128, "grad_norm": 10.10576610929371, "learning_rate": 3.333333333333334e-08, "loss": 1.0941, "mean_token_accuracy": 0.7283257842063904, "step": 2 }, { "epoch": 0.192, "grad_norm": 9.916675620886746, "learning_rate": 5.0000000000000004e-08, "loss": 1.1692, "mean_token_accuracy": 0.7081598937511444, "step": 3 }, { "epoch": 0.256, "grad_norm": 10.013930071030025, "learning_rate": 6.666666666666668e-08, "loss": 1.1583, "mean_token_accuracy": 0.7120069265365601, "step": 4 }, { "epoch": 0.32, "grad_norm": 10.111473674490512, "learning_rate": 8.333333333333334e-08, "loss": 1.1276, "mean_token_accuracy": 0.7223769649863243, "step": 5 }, { "epoch": 0.384, "grad_norm": 9.721386316372486, "learning_rate": 1.0000000000000001e-07, "loss": 1.1733, "mean_token_accuracy": 0.7070580348372459, "step": 6 }, { "epoch": 0.448, "grad_norm": 9.945920148513494, "learning_rate": 1.1666666666666668e-07, "loss": 1.1103, "mean_token_accuracy": 0.7231727540493011, "step": 7 }, { "epoch": 0.512, "grad_norm": 10.05073073200519, "learning_rate": 1.3333333333333336e-07, "loss": 1.1569, "mean_token_accuracy": 0.7107461988925934, "step": 8 }, { "epoch": 0.576, "grad_norm": 9.982106775936531, "learning_rate": 1.5000000000000002e-07, "loss": 1.1421, "mean_token_accuracy": 0.7149700075387955, "step": 9 }, { "epoch": 0.64, "grad_norm": 10.003792635816648, "learning_rate": 1.6666666666666668e-07, "loss": 1.1023, "mean_token_accuracy": 0.723631851375103, "step": 10 }, { "epoch": 0.704, "grad_norm": 10.276626977604513, "learning_rate": 1.8333333333333336e-07, "loss": 1.1332, "mean_token_accuracy": 0.7172808423638344, "step": 11 }, { "epoch": 0.768, "grad_norm": 10.08481578876737, "learning_rate": 2.0000000000000002e-07, "loss": 1.1518, "mean_token_accuracy": 0.7127203717827797, "step": 12 }, { "epoch": 0.832, "grad_norm": 9.525669825984947, "learning_rate": 2.166666666666667e-07, "loss": 1.1224, "mean_token_accuracy": 0.7215505689382553, "step": 13 }, { "epoch": 0.896, "grad_norm": 9.884821814703367, "learning_rate": 2.3333333333333336e-07, "loss": 1.095, "mean_token_accuracy": 0.727457769215107, "step": 14 }, { "epoch": 0.96, "grad_norm": 9.908816622207203, "learning_rate": 2.5000000000000004e-07, "loss": 1.2031, "mean_token_accuracy": 0.7008141502737999, "step": 15 }, { "epoch": 1.0, "grad_norm": 9.908816622207203, "learning_rate": 2.666666666666667e-07, "loss": 1.1747, "mean_token_accuracy": 0.7078599452972412, "step": 16 }, { "epoch": 1.064, "grad_norm": 13.258080687546816, "learning_rate": 2.8333333333333336e-07, "loss": 1.1206, "mean_token_accuracy": 0.7200201973319054, "step": 17 }, { "epoch": 1.1280000000000001, "grad_norm": 8.912044234462355, "learning_rate": 3.0000000000000004e-07, "loss": 1.0693, "mean_token_accuracy": 0.7318499013781548, "step": 18 }, { "epoch": 1.192, "grad_norm": 8.94584739766406, "learning_rate": 3.166666666666667e-07, "loss": 1.1054, "mean_token_accuracy": 0.7246572002768517, "step": 19 }, { "epoch": 1.256, "grad_norm": 8.687101553022499, "learning_rate": 3.3333333333333335e-07, "loss": 1.1838, "mean_token_accuracy": 0.7047165706753731, "step": 20 }, { "epoch": 1.32, "grad_norm": 8.857875304939988, "learning_rate": 3.5000000000000004e-07, "loss": 1.1336, "mean_token_accuracy": 0.7145568132400513, "step": 21 }, { "epoch": 1.384, "grad_norm": 9.255461373829739, "learning_rate": 3.666666666666667e-07, "loss": 1.1114, "mean_token_accuracy": 0.7223157361149788, "step": 22 }, { "epoch": 1.448, "grad_norm": 8.418207023442692, "learning_rate": 3.8333333333333335e-07, "loss": 1.1268, "mean_token_accuracy": 0.7167146131396294, "step": 23 }, { "epoch": 1.512, "grad_norm": 7.705233880931532, "learning_rate": 4.0000000000000003e-07, "loss": 1.1378, "mean_token_accuracy": 0.7149853110313416, "step": 24 }, { "epoch": 1.576, "grad_norm": 7.568068610919968, "learning_rate": 4.1666666666666667e-07, "loss": 1.129, "mean_token_accuracy": 0.7133937403559685, "step": 25 }, { "epoch": 1.6400000000000001, "grad_norm": 7.724411766044574, "learning_rate": 4.333333333333334e-07, "loss": 1.0933, "mean_token_accuracy": 0.7226524278521538, "step": 26 }, { "epoch": 1.704, "grad_norm": 7.554413210448343, "learning_rate": 4.5000000000000003e-07, "loss": 1.1277, "mean_token_accuracy": 0.7138069346547127, "step": 27 }, { "epoch": 1.768, "grad_norm": 7.175909295623305, "learning_rate": 4.666666666666667e-07, "loss": 1.1096, "mean_token_accuracy": 0.7179848179221153, "step": 28 }, { "epoch": 1.8319999999999999, "grad_norm": 7.690911690268024, "learning_rate": 4.833333333333334e-07, "loss": 1.11, "mean_token_accuracy": 0.7183062061667442, "step": 29 }, { "epoch": 1.896, "grad_norm": 7.269537464563892, "learning_rate": 5.000000000000001e-07, "loss": 1.1048, "mean_token_accuracy": 0.7193468436598778, "step": 30 }, { "epoch": 1.96, "grad_norm": 7.349627492611058, "learning_rate": 5.166666666666667e-07, "loss": 1.0959, "mean_token_accuracy": 0.7227894812822342, "step": 31 }, { "epoch": 2.0, "grad_norm": 9.24557176053041, "learning_rate": 5.333333333333335e-07, "loss": 1.0826, "mean_token_accuracy": 0.7226983308792114, "step": 32 }, { "epoch": 2.064, "grad_norm": 7.945360176065237, "learning_rate": 5.5e-07, "loss": 1.0919, "mean_token_accuracy": 0.7189489528536797, "step": 33 }, { "epoch": 2.128, "grad_norm": 6.714634565442243, "learning_rate": 5.666666666666667e-07, "loss": 1.0784, "mean_token_accuracy": 0.7208925113081932, "step": 34 }, { "epoch": 2.192, "grad_norm": 7.046866334016109, "learning_rate": 5.833333333333334e-07, "loss": 1.0962, "mean_token_accuracy": 0.7166227996349335, "step": 35 }, { "epoch": 2.2560000000000002, "grad_norm": 6.840924401059772, "learning_rate": 6.000000000000001e-07, "loss": 1.0485, "mean_token_accuracy": 0.7256672382354736, "step": 36 }, { "epoch": 2.32, "grad_norm": 6.534813854731247, "learning_rate": 6.166666666666668e-07, "loss": 1.0731, "mean_token_accuracy": 0.7201885432004929, "step": 37 }, { "epoch": 2.384, "grad_norm": 6.758569834880835, "learning_rate": 6.333333333333334e-07, "loss": 1.0797, "mean_token_accuracy": 0.7189183384180069, "step": 38 }, { "epoch": 2.448, "grad_norm": 6.70566361809579, "learning_rate": 6.5e-07, "loss": 1.0626, "mean_token_accuracy": 0.7240603491663933, "step": 39 }, { "epoch": 2.512, "grad_norm": 6.246910550309576, "learning_rate": 6.666666666666667e-07, "loss": 1.0914, "mean_token_accuracy": 0.7172196358442307, "step": 40 }, { "epoch": 2.576, "grad_norm": 6.2507008375840245, "learning_rate": 6.833333333333334e-07, "loss": 1.0384, "mean_token_accuracy": 0.7299063429236412, "step": 41 }, { "epoch": 2.64, "grad_norm": 6.205970350852554, "learning_rate": 7.000000000000001e-07, "loss": 1.0632, "mean_token_accuracy": 0.7216444090008736, "step": 42 }, { "epoch": 2.7039999999999997, "grad_norm": 6.0718887817048985, "learning_rate": 7.166666666666668e-07, "loss": 0.9998, "mean_token_accuracy": 0.7378843873739243, "step": 43 }, { "epoch": 2.768, "grad_norm": 6.231952736573528, "learning_rate": 7.333333333333334e-07, "loss": 1.0507, "mean_token_accuracy": 0.7227595448493958, "step": 44 }, { "epoch": 2.832, "grad_norm": 5.663201161989997, "learning_rate": 7.5e-07, "loss": 0.9759, "mean_token_accuracy": 0.7404964491724968, "step": 45 }, { "epoch": 2.896, "grad_norm": 5.568193166481359, "learning_rate": 7.666666666666667e-07, "loss": 1.017, "mean_token_accuracy": 0.730059377849102, "step": 46 }, { "epoch": 2.96, "grad_norm": 5.465135258410058, "learning_rate": 7.833333333333335e-07, "loss": 0.9963, "mean_token_accuracy": 0.7316509559750557, "step": 47 }, { "epoch": 3.0, "grad_norm": 5.465135258410058, "learning_rate": 8.000000000000001e-07, "loss": 0.9867, "mean_token_accuracy": 0.739226245880127, "step": 48 }, { "epoch": 3.064, "grad_norm": 7.2451088135268575, "learning_rate": 8.166666666666668e-07, "loss": 1.0628, "mean_token_accuracy": 0.7171737253665924, "step": 49 }, { "epoch": 3.128, "grad_norm": 5.512381242128147, "learning_rate": 8.333333333333333e-07, "loss": 1.0087, "mean_token_accuracy": 0.7285749167203903, "step": 50 }, { "epoch": 3.192, "grad_norm": 5.374839758968374, "learning_rate": 8.500000000000001e-07, "loss": 0.9171, "mean_token_accuracy": 0.7510559484362602, "step": 51 }, { "epoch": 3.2560000000000002, "grad_norm": 5.267507624714914, "learning_rate": 8.666666666666668e-07, "loss": 0.9881, "mean_token_accuracy": 0.7333649694919586, "step": 52 }, { "epoch": 3.32, "grad_norm": 5.267774790003361, "learning_rate": 8.833333333333334e-07, "loss": 0.9893, "mean_token_accuracy": 0.7316507250070572, "step": 53 }, { "epoch": 3.384, "grad_norm": 5.170331530077155, "learning_rate": 9.000000000000001e-07, "loss": 1.0137, "mean_token_accuracy": 0.7252412438392639, "step": 54 }, { "epoch": 3.448, "grad_norm": 5.269625020765499, "learning_rate": 9.166666666666666e-07, "loss": 0.952, "mean_token_accuracy": 0.7447967752814293, "step": 55 }, { "epoch": 3.512, "grad_norm": 4.77276899732146, "learning_rate": 9.333333333333334e-07, "loss": 0.9766, "mean_token_accuracy": 0.7357982397079468, "step": 56 }, { "epoch": 3.576, "grad_norm": 4.871021680753307, "learning_rate": 9.500000000000001e-07, "loss": 0.9582, "mean_token_accuracy": 0.741552397608757, "step": 57 }, { "epoch": 3.64, "grad_norm": 4.666741537056278, "learning_rate": 9.666666666666668e-07, "loss": 0.9081, "mean_token_accuracy": 0.7523108571767807, "step": 58 }, { "epoch": 3.7039999999999997, "grad_norm": 4.674976781602286, "learning_rate": 9.833333333333334e-07, "loss": 0.9678, "mean_token_accuracy": 0.7383539527654648, "step": 59 }, { "epoch": 3.768, "grad_norm": 4.534249331585696, "learning_rate": 1.0000000000000002e-06, "loss": 0.9633, "mean_token_accuracy": 0.7388895601034164, "step": 60 }, { "epoch": 3.832, "grad_norm": 4.692860560778736, "learning_rate": 1.0166666666666667e-06, "loss": 0.9534, "mean_token_accuracy": 0.7402515858411789, "step": 61 }, { "epoch": 3.896, "grad_norm": 4.557274591328087, "learning_rate": 1.0333333333333333e-06, "loss": 0.9216, "mean_token_accuracy": 0.7471994459629059, "step": 62 }, { "epoch": 3.96, "grad_norm": 4.464140689962714, "learning_rate": 1.0500000000000001e-06, "loss": 0.931, "mean_token_accuracy": 0.7445672154426575, "step": 63 }, { "epoch": 4.0, "grad_norm": 4.53173989244256, "learning_rate": 1.066666666666667e-06, "loss": 0.9313, "mean_token_accuracy": 0.7451273202896118, "step": 64 }, { "epoch": 4.064, "grad_norm": 5.4364094997169925, "learning_rate": 1.0833333333333335e-06, "loss": 0.8936, "mean_token_accuracy": 0.7526322230696678, "step": 65 }, { "epoch": 4.128, "grad_norm": 3.8487651461075476, "learning_rate": 1.1e-06, "loss": 0.9123, "mean_token_accuracy": 0.7452558800578117, "step": 66 }, { "epoch": 4.192, "grad_norm": 3.891857321345981, "learning_rate": 1.1166666666666666e-06, "loss": 0.8775, "mean_token_accuracy": 0.7547747269272804, "step": 67 }, { "epoch": 4.256, "grad_norm": 3.7769786226924387, "learning_rate": 1.1333333333333334e-06, "loss": 0.9241, "mean_token_accuracy": 0.7429909482598305, "step": 68 }, { "epoch": 4.32, "grad_norm": 3.2728231439391946, "learning_rate": 1.1500000000000002e-06, "loss": 0.8587, "mean_token_accuracy": 0.7588519677519798, "step": 69 }, { "epoch": 4.384, "grad_norm": 3.0939253235282598, "learning_rate": 1.1666666666666668e-06, "loss": 0.8692, "mean_token_accuracy": 0.7554250583052635, "step": 70 }, { "epoch": 4.448, "grad_norm": 3.1377096215237827, "learning_rate": 1.1833333333333334e-06, "loss": 0.8664, "mean_token_accuracy": 0.7558918967843056, "step": 71 }, { "epoch": 4.5120000000000005, "grad_norm": 2.813907548242143, "learning_rate": 1.2000000000000002e-06, "loss": 0.8748, "mean_token_accuracy": 0.7552032396197319, "step": 72 }, { "epoch": 4.576, "grad_norm": 2.788977274276516, "learning_rate": 1.2166666666666667e-06, "loss": 0.8577, "mean_token_accuracy": 0.7556317299604416, "step": 73 }, { "epoch": 4.64, "grad_norm": 3.2305914499805093, "learning_rate": 1.2333333333333335e-06, "loss": 0.8237, "mean_token_accuracy": 0.7640793398022652, "step": 74 }, { "epoch": 4.704, "grad_norm": 2.192727706358339, "learning_rate": 1.25e-06, "loss": 0.9211, "mean_token_accuracy": 0.7392874658107758, "step": 75 }, { "epoch": 4.768, "grad_norm": 1.9150906923820799, "learning_rate": 1.2666666666666669e-06, "loss": 0.8731, "mean_token_accuracy": 0.7498163431882858, "step": 76 }, { "epoch": 4.832, "grad_norm": 1.950080454332065, "learning_rate": 1.2833333333333335e-06, "loss": 0.8334, "mean_token_accuracy": 0.7623041197657585, "step": 77 }, { "epoch": 4.896, "grad_norm": 1.7257221353172196, "learning_rate": 1.3e-06, "loss": 0.8787, "mean_token_accuracy": 0.7492654249072075, "step": 78 }, { "epoch": 4.96, "grad_norm": 1.9238035324562013, "learning_rate": 1.3166666666666666e-06, "loss": 0.8178, "mean_token_accuracy": 0.7650740817189217, "step": 79 }, { "epoch": 5.0, "grad_norm": 2.372697677503388, "learning_rate": 1.3333333333333334e-06, "loss": 0.8245, "mean_token_accuracy": 0.7643976449966431, "step": 80 }, { "epoch": 5.064, "grad_norm": 1.7328635942702375, "learning_rate": 1.3500000000000002e-06, "loss": 0.8231, "mean_token_accuracy": 0.7632376402616501, "step": 81 }, { "epoch": 5.128, "grad_norm": 1.6700259108213473, "learning_rate": 1.3666666666666668e-06, "loss": 0.807, "mean_token_accuracy": 0.7658545523881912, "step": 82 }, { "epoch": 5.192, "grad_norm": 1.6093012055434561, "learning_rate": 1.3833333333333336e-06, "loss": 0.8246, "mean_token_accuracy": 0.7631764262914658, "step": 83 }, { "epoch": 5.256, "grad_norm": 1.4205249331653929, "learning_rate": 1.4000000000000001e-06, "loss": 0.777, "mean_token_accuracy": 0.7751132473349571, "step": 84 }, { "epoch": 5.32, "grad_norm": 1.6616678293511726, "learning_rate": 1.4166666666666667e-06, "loss": 0.8292, "mean_token_accuracy": 0.7602840438485146, "step": 85 }, { "epoch": 5.384, "grad_norm": 1.7238409259516527, "learning_rate": 1.4333333333333335e-06, "loss": 0.8385, "mean_token_accuracy": 0.7604676708579063, "step": 86 }, { "epoch": 5.448, "grad_norm": 1.7541192640281142, "learning_rate": 1.45e-06, "loss": 0.8074, "mean_token_accuracy": 0.7647833153605461, "step": 87 }, { "epoch": 5.5120000000000005, "grad_norm": 1.6476959169162377, "learning_rate": 1.4666666666666669e-06, "loss": 0.7775, "mean_token_accuracy": 0.7735818848013878, "step": 88 }, { "epoch": 5.576, "grad_norm": 1.7753669380966037, "learning_rate": 1.4833333333333337e-06, "loss": 0.8216, "mean_token_accuracy": 0.7624265477061272, "step": 89 }, { "epoch": 5.64, "grad_norm": 1.5929133019322397, "learning_rate": 1.5e-06, "loss": 0.7701, "mean_token_accuracy": 0.7748836874961853, "step": 90 }, { "epoch": 5.704, "grad_norm": 1.8554277024821038, "learning_rate": 1.5166666666666668e-06, "loss": 0.8006, "mean_token_accuracy": 0.7662371471524239, "step": 91 }, { "epoch": 5.768, "grad_norm": 1.804900828514802, "learning_rate": 1.5333333333333334e-06, "loss": 0.7997, "mean_token_accuracy": 0.7669258117675781, "step": 92 }, { "epoch": 5.832, "grad_norm": 1.9595025511345283, "learning_rate": 1.5500000000000002e-06, "loss": 0.8114, "mean_token_accuracy": 0.7633245587348938, "step": 93 }, { "epoch": 5.896, "grad_norm": 1.521186213015913, "learning_rate": 1.566666666666667e-06, "loss": 0.7922, "mean_token_accuracy": 0.7670788392424583, "step": 94 }, { "epoch": 5.96, "grad_norm": 1.686587104564136, "learning_rate": 1.5833333333333333e-06, "loss": 0.8009, "mean_token_accuracy": 0.765701524913311, "step": 95 }, { "epoch": 6.0, "grad_norm": 1.686587104564136, "learning_rate": 1.6000000000000001e-06, "loss": 0.8391, "mean_token_accuracy": 0.7569049954414367, "step": 96 }, { "epoch": 6.064, "grad_norm": 2.765412694026545, "learning_rate": 1.6166666666666667e-06, "loss": 0.7353, "mean_token_accuracy": 0.7811388894915581, "step": 97 }, { "epoch": 6.128, "grad_norm": 1.4157659989050184, "learning_rate": 1.6333333333333335e-06, "loss": 0.7994, "mean_token_accuracy": 0.7665432170033455, "step": 98 }, { "epoch": 6.192, "grad_norm": 1.8430356595547281, "learning_rate": 1.6500000000000003e-06, "loss": 0.8021, "mean_token_accuracy": 0.7650740742683411, "step": 99 }, { "epoch": 6.256, "grad_norm": 1.2556775359910428, "learning_rate": 1.6666666666666667e-06, "loss": 0.7606, "mean_token_accuracy": 0.7758325189352036, "step": 100 }, { "epoch": 6.32, "grad_norm": 1.1706600467042594, "learning_rate": 1.6833333333333335e-06, "loss": 0.7704, "mean_token_accuracy": 0.7706139832735062, "step": 101 }, { "epoch": 6.384, "grad_norm": 1.6392949445239655, "learning_rate": 1.7000000000000002e-06, "loss": 0.7896, "mean_token_accuracy": 0.7682419195771217, "step": 102 }, { "epoch": 6.448, "grad_norm": 1.1765815462568665, "learning_rate": 1.7166666666666668e-06, "loss": 0.7677, "mean_token_accuracy": 0.7733074128627777, "step": 103 }, { "epoch": 6.5120000000000005, "grad_norm": 1.8069468775388329, "learning_rate": 1.7333333333333336e-06, "loss": 0.7354, "mean_token_accuracy": 0.7818532660603523, "step": 104 }, { "epoch": 6.576, "grad_norm": 1.162448601720104, "learning_rate": 1.75e-06, "loss": 0.7341, "mean_token_accuracy": 0.7821682095527649, "step": 105 }, { "epoch": 6.64, "grad_norm": 1.148015970459647, "learning_rate": 1.7666666666666668e-06, "loss": 0.7635, "mean_token_accuracy": 0.7734604626893997, "step": 106 }, { "epoch": 6.704, "grad_norm": 1.3300212461691385, "learning_rate": 1.7833333333333336e-06, "loss": 0.7025, "mean_token_accuracy": 0.7902485355734825, "step": 107 }, { "epoch": 6.768, "grad_norm": 1.230108534039048, "learning_rate": 1.8000000000000001e-06, "loss": 0.7578, "mean_token_accuracy": 0.7756029665470123, "step": 108 }, { "epoch": 6.832, "grad_norm": 1.1170051553451015, "learning_rate": 1.816666666666667e-06, "loss": 0.7473, "mean_token_accuracy": 0.7790003642439842, "step": 109 }, { "epoch": 6.896, "grad_norm": 0.9783171526060587, "learning_rate": 1.8333333333333333e-06, "loss": 0.7313, "mean_token_accuracy": 0.7833771929144859, "step": 110 }, { "epoch": 6.96, "grad_norm": 1.117670674739453, "learning_rate": 1.85e-06, "loss": 0.7481, "mean_token_accuracy": 0.7799951061606407, "step": 111 }, { "epoch": 7.0, "grad_norm": 1.0781269354596512, "learning_rate": 1.8666666666666669e-06, "loss": 0.7453, "mean_token_accuracy": 0.7788687705993652, "step": 112 }, { "epoch": 7.064, "grad_norm": 1.1528645235040904, "learning_rate": 1.8833333333333334e-06, "loss": 0.7217, "mean_token_accuracy": 0.7856268361210823, "step": 113 }, { "epoch": 7.128, "grad_norm": 0.9226474778459752, "learning_rate": 1.9000000000000002e-06, "loss": 0.7034, "mean_token_accuracy": 0.7901414036750793, "step": 114 }, { "epoch": 7.192, "grad_norm": 0.9349905034093241, "learning_rate": 1.916666666666667e-06, "loss": 0.7406, "mean_token_accuracy": 0.7780515402555466, "step": 115 }, { "epoch": 7.256, "grad_norm": 0.9544197217813454, "learning_rate": 1.9333333333333336e-06, "loss": 0.6973, "mean_token_accuracy": 0.7903403639793396, "step": 116 }, { "epoch": 7.32, "grad_norm": 0.9668111876902723, "learning_rate": 1.9500000000000004e-06, "loss": 0.7685, "mean_token_accuracy": 0.7709506526589394, "step": 117 }, { "epoch": 7.384, "grad_norm": 0.8660447768095622, "learning_rate": 1.9666666666666668e-06, "loss": 0.7199, "mean_token_accuracy": 0.7839893475174904, "step": 118 }, { "epoch": 7.448, "grad_norm": 0.998258542388104, "learning_rate": 1.9833333333333335e-06, "loss": 0.7432, "mean_token_accuracy": 0.7790921926498413, "step": 119 }, { "epoch": 7.5120000000000005, "grad_norm": 1.0502438317762517, "learning_rate": 2.0000000000000003e-06, "loss": 0.7167, "mean_token_accuracy": 0.787218414247036, "step": 120 }, { "epoch": 7.576, "grad_norm": 0.8430096084912502, "learning_rate": 2.0166666666666667e-06, "loss": 0.7135, "mean_token_accuracy": 0.7850858643651009, "step": 121 }, { "epoch": 7.64, "grad_norm": 0.8626304432979793, "learning_rate": 2.0333333333333335e-06, "loss": 0.6925, "mean_token_accuracy": 0.7908746749162674, "step": 122 }, { "epoch": 7.704, "grad_norm": 0.8812997644278306, "learning_rate": 2.05e-06, "loss": 0.7039, "mean_token_accuracy": 0.7904627844691277, "step": 123 }, { "epoch": 7.768, "grad_norm": 1.0343824932566608, "learning_rate": 2.0666666666666666e-06, "loss": 0.7149, "mean_token_accuracy": 0.784433163702488, "step": 124 }, { "epoch": 7.832, "grad_norm": 1.0449997051191895, "learning_rate": 2.0833333333333334e-06, "loss": 0.7129, "mean_token_accuracy": 0.7853360548615456, "step": 125 }, { "epoch": 7.896, "grad_norm": 0.8583721450188934, "learning_rate": 2.1000000000000002e-06, "loss": 0.736, "mean_token_accuracy": 0.7791840061545372, "step": 126 }, { "epoch": 7.96, "grad_norm": 0.8789545374419696, "learning_rate": 2.116666666666667e-06, "loss": 0.7275, "mean_token_accuracy": 0.7829946205019951, "step": 127 }, { "epoch": 8.0, "grad_norm": 1.1395142184360476, "learning_rate": 2.133333333333334e-06, "loss": 0.6545, "mean_token_accuracy": 0.8044809103012085, "step": 128 }, { "epoch": 8.064, "grad_norm": 0.8243734332948314, "learning_rate": 2.15e-06, "loss": 0.7189, "mean_token_accuracy": 0.7840658649802208, "step": 129 }, { "epoch": 8.128, "grad_norm": 0.8656469422963479, "learning_rate": 2.166666666666667e-06, "loss": 0.6645, "mean_token_accuracy": 0.7986868247389793, "step": 130 }, { "epoch": 8.192, "grad_norm": 0.9139539486160407, "learning_rate": 2.1833333333333333e-06, "loss": 0.7571, "mean_token_accuracy": 0.7743633687496185, "step": 131 }, { "epoch": 8.256, "grad_norm": 0.8317508129323331, "learning_rate": 2.2e-06, "loss": 0.7003, "mean_token_accuracy": 0.7902944311499596, "step": 132 }, { "epoch": 8.32, "grad_norm": 0.9447054352928229, "learning_rate": 2.216666666666667e-06, "loss": 0.6457, "mean_token_accuracy": 0.8054144233465195, "step": 133 }, { "epoch": 8.384, "grad_norm": 0.8525168442579962, "learning_rate": 2.2333333333333333e-06, "loss": 0.6817, "mean_token_accuracy": 0.7947171926498413, "step": 134 }, { "epoch": 8.448, "grad_norm": 0.905888207465798, "learning_rate": 2.25e-06, "loss": 0.69, "mean_token_accuracy": 0.7926358953118324, "step": 135 }, { "epoch": 8.512, "grad_norm": 0.9600957425648814, "learning_rate": 2.266666666666667e-06, "loss": 0.6683, "mean_token_accuracy": 0.7957328483462334, "step": 136 }, { "epoch": 8.576, "grad_norm": 0.8765612027502118, "learning_rate": 2.2833333333333336e-06, "loss": 0.6762, "mean_token_accuracy": 0.7966454327106476, "step": 137 }, { "epoch": 8.64, "grad_norm": 0.8851999560720238, "learning_rate": 2.3000000000000004e-06, "loss": 0.6934, "mean_token_accuracy": 0.7893915250897408, "step": 138 }, { "epoch": 8.704, "grad_norm": 0.9503541271986636, "learning_rate": 2.316666666666667e-06, "loss": 0.6755, "mean_token_accuracy": 0.7954823672771454, "step": 139 }, { "epoch": 8.768, "grad_norm": 0.9327525502568886, "learning_rate": 2.3333333333333336e-06, "loss": 0.6735, "mean_token_accuracy": 0.7949467450380325, "step": 140 }, { "epoch": 8.832, "grad_norm": 0.8315423613474694, "learning_rate": 2.35e-06, "loss": 0.7006, "mean_token_accuracy": 0.7896822914481163, "step": 141 }, { "epoch": 8.896, "grad_norm": 0.9718381967552873, "learning_rate": 2.3666666666666667e-06, "loss": 0.6929, "mean_token_accuracy": 0.7894068360328674, "step": 142 }, { "epoch": 8.96, "grad_norm": 0.8638614945974594, "learning_rate": 2.3833333333333335e-06, "loss": 0.675, "mean_token_accuracy": 0.7937071546912193, "step": 143 }, { "epoch": 9.0, "grad_norm": 0.8638614945974594, "learning_rate": 2.4000000000000003e-06, "loss": 0.6451, "mean_token_accuracy": 0.801811945438385, "step": 144 }, { "epoch": 9.064, "grad_norm": 1.1264367075932835, "learning_rate": 2.4166666666666667e-06, "loss": 0.6411, "mean_token_accuracy": 0.8030117377638817, "step": 145 }, { "epoch": 9.128, "grad_norm": 0.7650363015628794, "learning_rate": 2.4333333333333335e-06, "loss": 0.6317, "mean_token_accuracy": 0.8047716841101646, "step": 146 }, { "epoch": 9.192, "grad_norm": 0.8137166970455426, "learning_rate": 2.4500000000000003e-06, "loss": 0.6069, "mean_token_accuracy": 0.8138467147946358, "step": 147 }, { "epoch": 9.256, "grad_norm": 0.8680945045500856, "learning_rate": 2.466666666666667e-06, "loss": 0.6527, "mean_token_accuracy": 0.8022924736142159, "step": 148 }, { "epoch": 9.32, "grad_norm": 0.7939816431969379, "learning_rate": 2.4833333333333334e-06, "loss": 0.7115, "mean_token_accuracy": 0.7844637557864189, "step": 149 }, { "epoch": 9.384, "grad_norm": 0.8189337676803738, "learning_rate": 2.5e-06, "loss": 0.6776, "mean_token_accuracy": 0.7949752360582352, "step": 150 }, { "epoch": 9.448, "grad_norm": 0.7875373929470795, "learning_rate": 2.5166666666666666e-06, "loss": 0.6796, "mean_token_accuracy": 0.7922533079981804, "step": 151 }, { "epoch": 9.512, "grad_norm": 0.8207615998768927, "learning_rate": 2.5333333333333338e-06, "loss": 0.6569, "mean_token_accuracy": 0.7991858497262001, "step": 152 }, { "epoch": 9.576, "grad_norm": 0.8107085702208965, "learning_rate": 2.55e-06, "loss": 0.6741, "mean_token_accuracy": 0.7946043238043785, "step": 153 }, { "epoch": 9.64, "grad_norm": 0.833678180831081, "learning_rate": 2.566666666666667e-06, "loss": 0.6961, "mean_token_accuracy": 0.7878611758351326, "step": 154 }, { "epoch": 9.704, "grad_norm": 0.8112098888060073, "learning_rate": 2.5833333333333337e-06, "loss": 0.6065, "mean_token_accuracy": 0.8132345676422119, "step": 155 }, { "epoch": 9.768, "grad_norm": 0.8277778449712225, "learning_rate": 2.6e-06, "loss": 0.6169, "mean_token_accuracy": 0.8092862442135811, "step": 156 }, { "epoch": 9.832, "grad_norm": 0.7715722964551908, "learning_rate": 2.616666666666667e-06, "loss": 0.6511, "mean_token_accuracy": 0.8022006675601006, "step": 157 }, { "epoch": 9.896, "grad_norm": 0.8080199291903217, "learning_rate": 2.6333333333333332e-06, "loss": 0.6476, "mean_token_accuracy": 0.8036392033100128, "step": 158 }, { "epoch": 9.96, "grad_norm": 0.809658653186753, "learning_rate": 2.6500000000000005e-06, "loss": 0.6466, "mean_token_accuracy": 0.8011447116732597, "step": 159 }, { "epoch": 10.0, "grad_norm": 1.1152653790103664, "learning_rate": 2.666666666666667e-06, "loss": 0.6754, "mean_token_accuracy": 0.7960577845573426, "step": 160 }, { "epoch": 10.064, "grad_norm": 0.970883147182631, "learning_rate": 2.683333333333333e-06, "loss": 0.6278, "mean_token_accuracy": 0.8058429285883904, "step": 161 }, { "epoch": 10.128, "grad_norm": 0.7579220258849924, "learning_rate": 2.7000000000000004e-06, "loss": 0.6262, "mean_token_accuracy": 0.8091485053300858, "step": 162 }, { "epoch": 10.192, "grad_norm": 0.8916776619975644, "learning_rate": 2.7166666666666668e-06, "loss": 0.6415, "mean_token_accuracy": 0.8026444688439369, "step": 163 }, { "epoch": 10.256, "grad_norm": 0.8133394969829804, "learning_rate": 2.7333333333333336e-06, "loss": 0.6105, "mean_token_accuracy": 0.8141527846455574, "step": 164 }, { "epoch": 10.32, "grad_norm": 0.8189109529379089, "learning_rate": 2.7500000000000004e-06, "loss": 0.6456, "mean_token_accuracy": 0.7997061684727669, "step": 165 }, { "epoch": 10.384, "grad_norm": 0.9025659324817272, "learning_rate": 2.766666666666667e-06, "loss": 0.6216, "mean_token_accuracy": 0.8096382319927216, "step": 166 }, { "epoch": 10.448, "grad_norm": 0.8500776081532554, "learning_rate": 2.7833333333333335e-06, "loss": 0.618, "mean_token_accuracy": 0.8094465807080269, "step": 167 }, { "epoch": 10.512, "grad_norm": 0.7657352637365726, "learning_rate": 2.8000000000000003e-06, "loss": 0.6004, "mean_token_accuracy": 0.8145353868603706, "step": 168 }, { "epoch": 10.576, "grad_norm": 0.8806954208745533, "learning_rate": 2.816666666666667e-06, "loss": 0.6525, "mean_token_accuracy": 0.7989103868603706, "step": 169 }, { "epoch": 10.64, "grad_norm": 0.9111890607517068, "learning_rate": 2.8333333333333335e-06, "loss": 0.6161, "mean_token_accuracy": 0.8091638088226318, "step": 170 }, { "epoch": 10.704, "grad_norm": 0.9541948724358974, "learning_rate": 2.85e-06, "loss": 0.5958, "mean_token_accuracy": 0.814688429236412, "step": 171 }, { "epoch": 10.768, "grad_norm": 0.9428049713660556, "learning_rate": 2.866666666666667e-06, "loss": 0.6024, "mean_token_accuracy": 0.8117967024445534, "step": 172 }, { "epoch": 10.832, "grad_norm": 0.8811114765115681, "learning_rate": 2.8833333333333334e-06, "loss": 0.6126, "mean_token_accuracy": 0.80983716994524, "step": 173 }, { "epoch": 10.896, "grad_norm": 0.911818071058545, "learning_rate": 2.9e-06, "loss": 0.6025, "mean_token_accuracy": 0.8134488388895988, "step": 174 }, { "epoch": 10.96, "grad_norm": 0.9005191809608027, "learning_rate": 2.916666666666667e-06, "loss": 0.65, "mean_token_accuracy": 0.8019251972436905, "step": 175 }, { "epoch": 11.0, "grad_norm": 0.9005191809608027, "learning_rate": 2.9333333333333338e-06, "loss": 0.6372, "mean_token_accuracy": 0.8037218451499939, "step": 176 }, { "epoch": 11.064, "grad_norm": 1.2983799536958789, "learning_rate": 2.95e-06, "loss": 0.5818, "mean_token_accuracy": 0.8188203945755959, "step": 177 }, { "epoch": 11.128, "grad_norm": 0.8839580480521877, "learning_rate": 2.9666666666666673e-06, "loss": 0.5602, "mean_token_accuracy": 0.8259365782141685, "step": 178 }, { "epoch": 11.192, "grad_norm": 0.9457535169943188, "learning_rate": 2.9833333333333337e-06, "loss": 0.5862, "mean_token_accuracy": 0.8189581260085106, "step": 179 }, { "epoch": 11.256, "grad_norm": 0.8826207780210019, "learning_rate": 3e-06, "loss": 0.5967, "mean_token_accuracy": 0.8152546510100365, "step": 180 }, { "epoch": 11.32, "grad_norm": 0.8607172191230935, "learning_rate": 3.0166666666666673e-06, "loss": 0.5791, "mean_token_accuracy": 0.8194478452205658, "step": 181 }, { "epoch": 11.384, "grad_norm": 0.9248210261397865, "learning_rate": 3.0333333333333337e-06, "loss": 0.5956, "mean_token_accuracy": 0.8157596662640572, "step": 182 }, { "epoch": 11.448, "grad_norm": 0.8153908682538602, "learning_rate": 3.05e-06, "loss": 0.5839, "mean_token_accuracy": 0.8194325268268585, "step": 183 }, { "epoch": 11.512, "grad_norm": 0.7843254653482064, "learning_rate": 3.066666666666667e-06, "loss": 0.615, "mean_token_accuracy": 0.8090719804167747, "step": 184 }, { "epoch": 11.576, "grad_norm": 0.8343276532465587, "learning_rate": 3.0833333333333336e-06, "loss": 0.5934, "mean_token_accuracy": 0.8146578148007393, "step": 185 }, { "epoch": 11.64, "grad_norm": 0.8273990723303548, "learning_rate": 3.1000000000000004e-06, "loss": 0.5671, "mean_token_accuracy": 0.8210700303316116, "step": 186 }, { "epoch": 11.704, "grad_norm": 0.8267053778053666, "learning_rate": 3.1166666666666668e-06, "loss": 0.5895, "mean_token_accuracy": 0.8155913278460503, "step": 187 }, { "epoch": 11.768, "grad_norm": 0.8600310134259179, "learning_rate": 3.133333333333334e-06, "loss": 0.5765, "mean_token_accuracy": 0.8206415176391602, "step": 188 }, { "epoch": 11.832, "grad_norm": 0.8340954212671062, "learning_rate": 3.1500000000000003e-06, "loss": 0.6097, "mean_token_accuracy": 0.8105870485305786, "step": 189 }, { "epoch": 11.896, "grad_norm": 0.8308454280001522, "learning_rate": 3.1666666666666667e-06, "loss": 0.604, "mean_token_accuracy": 0.8139538392424583, "step": 190 }, { "epoch": 11.96, "grad_norm": 0.788639790061392, "learning_rate": 3.183333333333334e-06, "loss": 0.5658, "mean_token_accuracy": 0.8232458382844925, "step": 191 }, { "epoch": 12.0, "grad_norm": 0.9085472961793744, "learning_rate": 3.2000000000000003e-06, "loss": 0.6153, "mean_token_accuracy": 0.81141037940979, "step": 192 }, { "epoch": 12.064, "grad_norm": 1.1221740640596087, "learning_rate": 3.2166666666666666e-06, "loss": 0.5654, "mean_token_accuracy": 0.8234573975205421, "step": 193 }, { "epoch": 12.128, "grad_norm": 0.8419625502909994, "learning_rate": 3.2333333333333334e-06, "loss": 0.552, "mean_token_accuracy": 0.8269772231578827, "step": 194 }, { "epoch": 12.192, "grad_norm": 0.8512779742095804, "learning_rate": 3.2500000000000002e-06, "loss": 0.5438, "mean_token_accuracy": 0.8302522003650665, "step": 195 }, { "epoch": 12.256, "grad_norm": 0.8577429707322042, "learning_rate": 3.266666666666667e-06, "loss": 0.5811, "mean_token_accuracy": 0.8193713203072548, "step": 196 }, { "epoch": 12.32, "grad_norm": 0.8337696658356195, "learning_rate": 3.2833333333333334e-06, "loss": 0.5567, "mean_token_accuracy": 0.8266864567995071, "step": 197 }, { "epoch": 12.384, "grad_norm": 0.8002604999744183, "learning_rate": 3.3000000000000006e-06, "loss": 0.5292, "mean_token_accuracy": 0.831996813416481, "step": 198 }, { "epoch": 12.448, "grad_norm": 0.8143856724318314, "learning_rate": 3.316666666666667e-06, "loss": 0.5743, "mean_token_accuracy": 0.8208557814359665, "step": 199 }, { "epoch": 12.512, "grad_norm": 0.7950429868339589, "learning_rate": 3.3333333333333333e-06, "loss": 0.4984, "mean_token_accuracy": 0.8418523520231247, "step": 200 }, { "epoch": 12.576, "grad_norm": 0.8711726300641066, "learning_rate": 3.3500000000000005e-06, "loss": 0.5647, "mean_token_accuracy": 0.8243143931031227, "step": 201 }, { "epoch": 12.64, "grad_norm": 0.954905755552063, "learning_rate": 3.366666666666667e-06, "loss": 0.5312, "mean_token_accuracy": 0.8356362283229828, "step": 202 }, { "epoch": 12.704, "grad_norm": 0.8826163061608358, "learning_rate": 3.3833333333333333e-06, "loss": 0.5656, "mean_token_accuracy": 0.820687435567379, "step": 203 }, { "epoch": 12.768, "grad_norm": 0.8348505199484839, "learning_rate": 3.4000000000000005e-06, "loss": 0.5551, "mean_token_accuracy": 0.8269619345664978, "step": 204 }, { "epoch": 12.832, "grad_norm": 0.8889634392339238, "learning_rate": 3.416666666666667e-06, "loss": 0.5809, "mean_token_accuracy": 0.8178445473313332, "step": 205 }, { "epoch": 12.896, "grad_norm": 0.8664936824592053, "learning_rate": 3.4333333333333336e-06, "loss": 0.5737, "mean_token_accuracy": 0.8230748176574707, "step": 206 }, { "epoch": 12.96, "grad_norm": 0.8639909003146001, "learning_rate": 3.45e-06, "loss": 0.5243, "mean_token_accuracy": 0.8342158421874046, "step": 207 }, { "epoch": 13.0, "grad_norm": 1.1199807302148823, "learning_rate": 3.4666666666666672e-06, "loss": 0.5271, "mean_token_accuracy": 0.8343780517578125, "step": 208 }, { "epoch": 13.064, "grad_norm": 0.872840411950679, "learning_rate": 3.4833333333333336e-06, "loss": 0.4941, "mean_token_accuracy": 0.8443315401673317, "step": 209 }, { "epoch": 13.128, "grad_norm": 0.8604092546423662, "learning_rate": 3.5e-06, "loss": 0.5525, "mean_token_accuracy": 0.8285075947642326, "step": 210 }, { "epoch": 13.192, "grad_norm": 0.8197115114554855, "learning_rate": 3.516666666666667e-06, "loss": 0.511, "mean_token_accuracy": 0.8377662822604179, "step": 211 }, { "epoch": 13.256, "grad_norm": 0.8625732378617251, "learning_rate": 3.5333333333333335e-06, "loss": 0.5377, "mean_token_accuracy": 0.8302152454853058, "step": 212 }, { "epoch": 13.32, "grad_norm": 0.8677165301724644, "learning_rate": 3.5500000000000003e-06, "loss": 0.4982, "mean_token_accuracy": 0.8423267677426338, "step": 213 }, { "epoch": 13.384, "grad_norm": 0.8751508529632246, "learning_rate": 3.566666666666667e-06, "loss": 0.514, "mean_token_accuracy": 0.8371082320809364, "step": 214 }, { "epoch": 13.448, "grad_norm": 0.8674670807419815, "learning_rate": 3.5833333333333335e-06, "loss": 0.5101, "mean_token_accuracy": 0.8389905691146851, "step": 215 }, { "epoch": 13.512, "grad_norm": 0.9007445642033973, "learning_rate": 3.6000000000000003e-06, "loss": 0.4895, "mean_token_accuracy": 0.8430460318922997, "step": 216 }, { "epoch": 13.576, "grad_norm": 0.8582578450559122, "learning_rate": 3.616666666666667e-06, "loss": 0.5271, "mean_token_accuracy": 0.8336190059781075, "step": 217 }, { "epoch": 13.64, "grad_norm": 0.8866871125394813, "learning_rate": 3.633333333333334e-06, "loss": 0.5177, "mean_token_accuracy": 0.8373224809765816, "step": 218 }, { "epoch": 13.704, "grad_norm": 0.8590554810630842, "learning_rate": 3.65e-06, "loss": 0.4842, "mean_token_accuracy": 0.8470861986279488, "step": 219 }, { "epoch": 13.768, "grad_norm": 0.8200732842385068, "learning_rate": 3.6666666666666666e-06, "loss": 0.5103, "mean_token_accuracy": 0.8375214263796806, "step": 220 }, { "epoch": 13.832, "grad_norm": 0.9182051153208441, "learning_rate": 3.6833333333333338e-06, "loss": 0.532, "mean_token_accuracy": 0.833159901201725, "step": 221 }, { "epoch": 13.896, "grad_norm": 0.9107019477434463, "learning_rate": 3.7e-06, "loss": 0.4845, "mean_token_accuracy": 0.8487083688378334, "step": 222 }, { "epoch": 13.96, "grad_norm": 0.8816980193671223, "learning_rate": 3.716666666666667e-06, "loss": 0.5297, "mean_token_accuracy": 0.8335271775722504, "step": 223 }, { "epoch": 14.0, "grad_norm": 0.8816980193671223, "learning_rate": 3.7333333333333337e-06, "loss": 0.5252, "mean_token_accuracy": 0.8347943067550659, "step": 224 }, { "epoch": 14.064, "grad_norm": 1.2290793381943403, "learning_rate": 3.7500000000000005e-06, "loss": 0.4719, "mean_token_accuracy": 0.849871464073658, "step": 225 }, { "epoch": 14.128, "grad_norm": 0.8833369727527698, "learning_rate": 3.766666666666667e-06, "loss": 0.4815, "mean_token_accuracy": 0.8489685356616974, "step": 226 }, { "epoch": 14.192, "grad_norm": 0.9355104543560427, "learning_rate": 3.7833333333333337e-06, "loss": 0.4754, "mean_token_accuracy": 0.8488614037632942, "step": 227 }, { "epoch": 14.256, "grad_norm": 0.9063277433622311, "learning_rate": 3.8000000000000005e-06, "loss": 0.4739, "mean_token_accuracy": 0.8489406183362007, "step": 228 }, { "epoch": 14.32, "grad_norm": 0.8329284497513757, "learning_rate": 3.816666666666667e-06, "loss": 0.4795, "mean_token_accuracy": 0.8481574431061745, "step": 229 }, { "epoch": 14.384, "grad_norm": 0.9854239992920599, "learning_rate": 3.833333333333334e-06, "loss": 0.4899, "mean_token_accuracy": 0.844392754137516, "step": 230 }, { "epoch": 14.448, "grad_norm": 0.9478046309710416, "learning_rate": 3.85e-06, "loss": 0.4596, "mean_token_accuracy": 0.8532656580209732, "step": 231 }, { "epoch": 14.512, "grad_norm": 1.0096196790776408, "learning_rate": 3.866666666666667e-06, "loss": 0.4838, "mean_token_accuracy": 0.8463822305202484, "step": 232 }, { "epoch": 14.576, "grad_norm": 0.9570160362093639, "learning_rate": 3.883333333333333e-06, "loss": 0.4807, "mean_token_accuracy": 0.8482798784971237, "step": 233 }, { "epoch": 14.64, "grad_norm": 0.9019552872491859, "learning_rate": 3.900000000000001e-06, "loss": 0.4693, "mean_token_accuracy": 0.8510039076209068, "step": 234 }, { "epoch": 14.704, "grad_norm": 0.8728718119317298, "learning_rate": 3.916666666666667e-06, "loss": 0.4627, "mean_token_accuracy": 0.8530546054244041, "step": 235 }, { "epoch": 14.768, "grad_norm": 0.8614825362928388, "learning_rate": 3.9333333333333335e-06, "loss": 0.4442, "mean_token_accuracy": 0.8577834144234657, "step": 236 }, { "epoch": 14.832, "grad_norm": 0.9106551658056996, "learning_rate": 3.95e-06, "loss": 0.4779, "mean_token_accuracy": 0.8493970334529877, "step": 237 }, { "epoch": 14.896, "grad_norm": 0.8683436027304747, "learning_rate": 3.966666666666667e-06, "loss": 0.4657, "mean_token_accuracy": 0.8519527465105057, "step": 238 }, { "epoch": 14.96, "grad_norm": 0.9527634520289008, "learning_rate": 3.983333333333334e-06, "loss": 0.463, "mean_token_accuracy": 0.8566356673836708, "step": 239 }, { "epoch": 15.0, "grad_norm": 1.014596804004236, "learning_rate": 4.000000000000001e-06, "loss": 0.4699, "mean_token_accuracy": 0.8511263489723205, "step": 240 }, { "epoch": 15.064, "grad_norm": 1.2095861559398775, "learning_rate": 4.0166666666666675e-06, "loss": 0.4483, "mean_token_accuracy": 0.8559928983449936, "step": 241 }, { "epoch": 15.128, "grad_norm": 0.9471435914464652, "learning_rate": 4.033333333333333e-06, "loss": 0.4238, "mean_token_accuracy": 0.8659779280424118, "step": 242 }, { "epoch": 15.192, "grad_norm": 0.9428748833394961, "learning_rate": 4.05e-06, "loss": 0.4585, "mean_token_accuracy": 0.8553385436534882, "step": 243 }, { "epoch": 15.256, "grad_norm": 0.9529539368578219, "learning_rate": 4.066666666666667e-06, "loss": 0.4337, "mean_token_accuracy": 0.8622214794158936, "step": 244 }, { "epoch": 15.32, "grad_norm": 0.9796207646593222, "learning_rate": 4.083333333333334e-06, "loss": 0.4455, "mean_token_accuracy": 0.858839362859726, "step": 245 }, { "epoch": 15.384, "grad_norm": 0.9701635187271879, "learning_rate": 4.1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8629101291298866, "step": 246 }, { "epoch": 15.448, "grad_norm": 1.024827992850235, "learning_rate": 4.116666666666667e-06, "loss": 0.3932, "mean_token_accuracy": 0.8745867982506752, "step": 247 }, { "epoch": 15.512, "grad_norm": 1.0497833878098355, "learning_rate": 4.133333333333333e-06, "loss": 0.4078, "mean_token_accuracy": 0.8725361078977585, "step": 248 }, { "epoch": 15.576, "grad_norm": 0.9472529032810553, "learning_rate": 4.15e-06, "loss": 0.4449, "mean_token_accuracy": 0.8592372685670853, "step": 249 }, { "epoch": 15.64, "grad_norm": 0.9478343790941723, "learning_rate": 4.166666666666667e-06, "loss": 0.4219, "mean_token_accuracy": 0.8652668967843056, "step": 250 }, { "epoch": 15.704, "grad_norm": 0.9321107206569655, "learning_rate": 4.183333333333334e-06, "loss": 0.4298, "mean_token_accuracy": 0.8654046431183815, "step": 251 }, { "epoch": 15.768, "grad_norm": 0.9732502303804135, "learning_rate": 4.2000000000000004e-06, "loss": 0.4414, "mean_token_accuracy": 0.859681062400341, "step": 252 }, { "epoch": 15.832, "grad_norm": 0.9945421297805389, "learning_rate": 4.216666666666667e-06, "loss": 0.4216, "mean_token_accuracy": 0.8655117452144623, "step": 253 }, { "epoch": 15.896, "grad_norm": 0.9326350812277701, "learning_rate": 4.233333333333334e-06, "loss": 0.4035, "mean_token_accuracy": 0.8712200075387955, "step": 254 }, { "epoch": 15.96, "grad_norm": 1.0680349777327294, "learning_rate": 4.25e-06, "loss": 0.4513, "mean_token_accuracy": 0.8564214035868645, "step": 255 }, { "epoch": 16.0, "grad_norm": 1.4459949949786204, "learning_rate": 4.266666666666668e-06, "loss": 0.4211, "mean_token_accuracy": 0.866209602355957, "step": 256 }, { "epoch": 16.064, "grad_norm": 0.9926835489862903, "learning_rate": 4.2833333333333335e-06, "loss": 0.4141, "mean_token_accuracy": 0.8701640516519547, "step": 257 }, { "epoch": 16.128, "grad_norm": 1.0502823434095907, "learning_rate": 4.3e-06, "loss": 0.3879, "mean_token_accuracy": 0.8767139986157417, "step": 258 }, { "epoch": 16.192, "grad_norm": 1.1061971897219878, "learning_rate": 4.316666666666667e-06, "loss": 0.4031, "mean_token_accuracy": 0.8708374053239822, "step": 259 }, { "epoch": 16.256, "grad_norm": 0.9828760122741771, "learning_rate": 4.333333333333334e-06, "loss": 0.3661, "mean_token_accuracy": 0.8833098635077477, "step": 260 }, { "epoch": 16.32, "grad_norm": 0.9971845178919754, "learning_rate": 4.350000000000001e-06, "loss": 0.3876, "mean_token_accuracy": 0.8770660012960434, "step": 261 }, { "epoch": 16.384, "grad_norm": 1.0631766300694614, "learning_rate": 4.366666666666667e-06, "loss": 0.3743, "mean_token_accuracy": 0.8821620866656303, "step": 262 }, { "epoch": 16.448, "grad_norm": 1.0368611853261243, "learning_rate": 4.383333333333334e-06, "loss": 0.3689, "mean_token_accuracy": 0.883187435567379, "step": 263 }, { "epoch": 16.512, "grad_norm": 1.137383338430845, "learning_rate": 4.4e-06, "loss": 0.3882, "mean_token_accuracy": 0.8767053857445717, "step": 264 }, { "epoch": 16.576, "grad_norm": 1.1151794385689966, "learning_rate": 4.416666666666667e-06, "loss": 0.3845, "mean_token_accuracy": 0.8766833990812302, "step": 265 }, { "epoch": 16.64, "grad_norm": 1.0034943029679426, "learning_rate": 4.433333333333334e-06, "loss": 0.4103, "mean_token_accuracy": 0.8700416311621666, "step": 266 }, { "epoch": 16.704, "grad_norm": 1.0998885757348322, "learning_rate": 4.450000000000001e-06, "loss": 0.3816, "mean_token_accuracy": 0.8783767446875572, "step": 267 }, { "epoch": 16.768, "grad_norm": 0.9860317005300909, "learning_rate": 4.4666666666666665e-06, "loss": 0.4046, "mean_token_accuracy": 0.8717862367630005, "step": 268 }, { "epoch": 16.832, "grad_norm": 0.9920115374986633, "learning_rate": 4.483333333333333e-06, "loss": 0.4099, "mean_token_accuracy": 0.8697355464100838, "step": 269 }, { "epoch": 16.896, "grad_norm": 0.9915860668246114, "learning_rate": 4.5e-06, "loss": 0.3645, "mean_token_accuracy": 0.8825600072741508, "step": 270 }, { "epoch": 16.96, "grad_norm": 1.029814094238486, "learning_rate": 4.516666666666667e-06, "loss": 0.3795, "mean_token_accuracy": 0.8797288164496422, "step": 271 }, { "epoch": 17.0, "grad_norm": 1.029814094238486, "learning_rate": 4.533333333333334e-06, "loss": 0.359, "mean_token_accuracy": 0.8838148832321167, "step": 272 }, { "epoch": 17.064, "grad_norm": 1.4779924524755126, "learning_rate": 4.5500000000000005e-06, "loss": 0.3518, "mean_token_accuracy": 0.8883753567934036, "step": 273 }, { "epoch": 17.128, "grad_norm": 1.1783228074510093, "learning_rate": 4.566666666666667e-06, "loss": 0.3758, "mean_token_accuracy": 0.8818101137876511, "step": 274 }, { "epoch": 17.192, "grad_norm": 0.9801069428669729, "learning_rate": 4.583333333333333e-06, "loss": 0.3302, "mean_token_accuracy": 0.8959659710526466, "step": 275 }, { "epoch": 17.256, "grad_norm": 1.3623200526384085, "learning_rate": 4.600000000000001e-06, "loss": 0.3614, "mean_token_accuracy": 0.8848096206784248, "step": 276 }, { "epoch": 17.32, "grad_norm": 1.2157214174041995, "learning_rate": 4.616666666666667e-06, "loss": 0.3574, "mean_token_accuracy": 0.8847025036811829, "step": 277 }, { "epoch": 17.384, "grad_norm": 1.1221218311870096, "learning_rate": 4.633333333333334e-06, "loss": 0.3345, "mean_token_accuracy": 0.893104188144207, "step": 278 }, { "epoch": 17.448, "grad_norm": 1.1233732818689055, "learning_rate": 4.65e-06, "loss": 0.3464, "mean_token_accuracy": 0.8902730196714401, "step": 279 }, { "epoch": 17.512, "grad_norm": 1.0978257972691658, "learning_rate": 4.666666666666667e-06, "loss": 0.3304, "mean_token_accuracy": 0.8955833688378334, "step": 280 }, { "epoch": 17.576, "grad_norm": 1.0198063150837045, "learning_rate": 4.683333333333334e-06, "loss": 0.3278, "mean_token_accuracy": 0.8956751897931099, "step": 281 }, { "epoch": 17.64, "grad_norm": 1.5018797212092934, "learning_rate": 4.7e-06, "loss": 0.3593, "mean_token_accuracy": 0.8848905637860298, "step": 282 }, { "epoch": 17.704, "grad_norm": 1.310736482184957, "learning_rate": 4.7166666666666675e-06, "loss": 0.3221, "mean_token_accuracy": 0.8983686342835426, "step": 283 }, { "epoch": 17.768, "grad_norm": 1.2198740325171735, "learning_rate": 4.7333333333333335e-06, "loss": 0.3387, "mean_token_accuracy": 0.8912524506449699, "step": 284 }, { "epoch": 17.832, "grad_norm": 1.5660842871160958, "learning_rate": 4.75e-06, "loss": 0.3387, "mean_token_accuracy": 0.8922471776604652, "step": 285 }, { "epoch": 17.896, "grad_norm": 1.1678670963189186, "learning_rate": 4.766666666666667e-06, "loss": 0.3469, "mean_token_accuracy": 0.8883600607514381, "step": 286 }, { "epoch": 17.96, "grad_norm": 1.2210352325154754, "learning_rate": 4.783333333333334e-06, "loss": 0.3498, "mean_token_accuracy": 0.8913748785853386, "step": 287 }, { "epoch": 18.0, "grad_norm": 1.363909232832047, "learning_rate": 4.800000000000001e-06, "loss": 0.3338, "mean_token_accuracy": 0.8948264241218566, "step": 288 }, { "epoch": 18.064, "grad_norm": 1.3217772798851801, "learning_rate": 4.816666666666667e-06, "loss": 0.3317, "mean_token_accuracy": 0.8955680802464485, "step": 289 }, { "epoch": 18.128, "grad_norm": 1.0050401550666879, "learning_rate": 4.833333333333333e-06, "loss": 0.3014, "mean_token_accuracy": 0.9032045751810074, "step": 290 }, { "epoch": 18.192, "grad_norm": 1.1818559596633615, "learning_rate": 4.85e-06, "loss": 0.3184, "mean_token_accuracy": 0.8992868512868881, "step": 291 }, { "epoch": 18.256, "grad_norm": 1.282282833620202, "learning_rate": 4.866666666666667e-06, "loss": 0.3053, "mean_token_accuracy": 0.9029444083571434, "step": 292 }, { "epoch": 18.32, "grad_norm": 1.138870361992162, "learning_rate": 4.883333333333334e-06, "loss": 0.3097, "mean_token_accuracy": 0.9020108804106712, "step": 293 }, { "epoch": 18.384, "grad_norm": 1.2833806115329116, "learning_rate": 4.9000000000000005e-06, "loss": 0.2918, "mean_token_accuracy": 0.9076120257377625, "step": 294 }, { "epoch": 18.448, "grad_norm": 1.2322148687034031, "learning_rate": 4.9166666666666665e-06, "loss": 0.3082, "mean_token_accuracy": 0.9028679057955742, "step": 295 }, { "epoch": 18.512, "grad_norm": 1.1237891364974113, "learning_rate": 4.933333333333334e-06, "loss": 0.3032, "mean_token_accuracy": 0.9043829515576363, "step": 296 }, { "epoch": 18.576, "grad_norm": 1.1274605217867506, "learning_rate": 4.95e-06, "loss": 0.2824, "mean_token_accuracy": 0.91067273914814, "step": 297 }, { "epoch": 18.64, "grad_norm": 1.1398356522244715, "learning_rate": 4.966666666666667e-06, "loss": 0.2972, "mean_token_accuracy": 0.9071223065257072, "step": 298 }, { "epoch": 18.704, "grad_norm": 1.1065858798757606, "learning_rate": 4.983333333333334e-06, "loss": 0.2938, "mean_token_accuracy": 0.9060357511043549, "step": 299 }, { "epoch": 18.768, "grad_norm": 1.1852503631762932, "learning_rate": 5e-06, "loss": 0.3009, "mean_token_accuracy": 0.9042651131749153, "step": 300 }, { "epoch": 18.832, "grad_norm": 1.086972654832599, "learning_rate": 4.9999773402320124e-06, "loss": 0.2966, "mean_token_accuracy": 0.9073365703225136, "step": 301 }, { "epoch": 18.896, "grad_norm": 1.105157142449476, "learning_rate": 4.99990936138446e-06, "loss": 0.2731, "mean_token_accuracy": 0.9143592342734337, "step": 302 }, { "epoch": 18.96, "grad_norm": 1.3014318164358583, "learning_rate": 4.999796064826576e-06, "loss": 0.2864, "mean_token_accuracy": 0.9086526706814766, "step": 303 }, { "epoch": 19.0, "grad_norm": 1.3014318164358583, "learning_rate": 4.9996374528403815e-06, "loss": 0.3116, "mean_token_accuracy": 0.9000489711761475, "step": 304 }, { "epoch": 19.064, "grad_norm": 1.675158213522968, "learning_rate": 4.99943352862064e-06, "loss": 0.2624, "mean_token_accuracy": 0.9192121773958206, "step": 305 }, { "epoch": 19.128, "grad_norm": 1.2185164261586823, "learning_rate": 4.999184296274798e-06, "loss": 0.2734, "mean_token_accuracy": 0.9161820486187935, "step": 306 }, { "epoch": 19.192, "grad_norm": 1.2409679304042336, "learning_rate": 4.9988897608228966e-06, "loss": 0.2656, "mean_token_accuracy": 0.9166105464100838, "step": 307 }, { "epoch": 19.256, "grad_norm": 1.2357072278822194, "learning_rate": 4.998549928197473e-06, "loss": 0.2727, "mean_token_accuracy": 0.9143303111195564, "step": 308 }, { "epoch": 19.32, "grad_norm": 1.5470655730679517, "learning_rate": 4.998164805243443e-06, "loss": 0.2559, "mean_token_accuracy": 0.9192274734377861, "step": 309 }, { "epoch": 19.384, "grad_norm": 1.2664747461391912, "learning_rate": 4.997734399717958e-06, "loss": 0.251, "mean_token_accuracy": 0.9215995296835899, "step": 310 }, { "epoch": 19.448, "grad_norm": 1.2621642330279086, "learning_rate": 4.997258720290254e-06, "loss": 0.2331, "mean_token_accuracy": 0.9276444613933563, "step": 311 }, { "epoch": 19.512, "grad_norm": 1.224211052571134, "learning_rate": 4.996737776541472e-06, "loss": 0.2389, "mean_token_accuracy": 0.9253183230757713, "step": 312 }, { "epoch": 19.576, "grad_norm": 1.1680324420308346, "learning_rate": 4.996171578964471e-06, "loss": 0.2629, "mean_token_accuracy": 0.916487567126751, "step": 313 }, { "epoch": 19.64, "grad_norm": 1.1787684249957533, "learning_rate": 4.995560138963612e-06, "loss": 0.2394, "mean_token_accuracy": 0.9244306981563568, "step": 314 }, { "epoch": 19.704, "grad_norm": 1.2181925408729082, "learning_rate": 4.994903468854526e-06, "loss": 0.2629, "mean_token_accuracy": 0.9163963124155998, "step": 315 }, { "epoch": 19.768, "grad_norm": 1.162919829354784, "learning_rate": 4.994201581863877e-06, "loss": 0.2855, "mean_token_accuracy": 0.9106115400791168, "step": 316 }, { "epoch": 19.832, "grad_norm": 1.2207989121050804, "learning_rate": 4.993454492129081e-06, "loss": 0.2568, "mean_token_accuracy": 0.9204517602920532, "step": 317 }, { "epoch": 19.896, "grad_norm": 1.3039461421456635, "learning_rate": 4.992662214698033e-06, "loss": 0.255, "mean_token_accuracy": 0.9197171851992607, "step": 318 }, { "epoch": 19.96, "grad_norm": 1.267722516744628, "learning_rate": 4.991824765528797e-06, "loss": 0.2443, "mean_token_accuracy": 0.9241828098893166, "step": 319 }, { "epoch": 20.0, "grad_norm": 1.3233353031879997, "learning_rate": 4.9909421614892885e-06, "loss": 0.2446, "mean_token_accuracy": 0.923996090888977, "step": 320 }, { "epoch": 20.064, "grad_norm": 1.7311584789938865, "learning_rate": 4.99001442035693e-06, "loss": 0.2122, "mean_token_accuracy": 0.9354952275753021, "step": 321 }, { "epoch": 20.128, "grad_norm": 1.1338684481890458, "learning_rate": 4.989041560818301e-06, "loss": 0.2194, "mean_token_accuracy": 0.9334598332643509, "step": 322 }, { "epoch": 20.192, "grad_norm": 1.292428697326829, "learning_rate": 4.988023602468752e-06, "loss": 0.2218, "mean_token_accuracy": 0.931684635579586, "step": 323 }, { "epoch": 20.256, "grad_norm": 1.304303135972976, "learning_rate": 4.9869605658120165e-06, "loss": 0.2422, "mean_token_accuracy": 0.9233594536781311, "step": 324 }, { "epoch": 20.32, "grad_norm": 1.3383850993194566, "learning_rate": 4.985852472259796e-06, "loss": 0.2037, "mean_token_accuracy": 0.9367236942052841, "step": 325 }, { "epoch": 20.384, "grad_norm": 1.186301912454717, "learning_rate": 4.98469934413133e-06, "loss": 0.2022, "mean_token_accuracy": 0.9370715022087097, "step": 326 }, { "epoch": 20.448, "grad_norm": 1.5403429968614817, "learning_rate": 4.983501204652941e-06, "loss": 0.2102, "mean_token_accuracy": 0.9366429969668388, "step": 327 }, { "epoch": 20.512, "grad_norm": 1.301124151975375, "learning_rate": 4.982258077957576e-06, "loss": 0.2366, "mean_token_accuracy": 0.9276444688439369, "step": 328 }, { "epoch": 20.576, "grad_norm": 1.2094334331524723, "learning_rate": 4.980969989084311e-06, "loss": 0.217, "mean_token_accuracy": 0.9335603788495064, "step": 329 }, { "epoch": 20.64, "grad_norm": 1.3605339304987623, "learning_rate": 4.979636963977854e-06, "loss": 0.2053, "mean_token_accuracy": 0.9358778148889542, "step": 330 }, { "epoch": 20.704, "grad_norm": 1.16862205555523, "learning_rate": 4.978259029488017e-06, "loss": 0.226, "mean_token_accuracy": 0.930521547794342, "step": 331 }, { "epoch": 20.768, "grad_norm": 1.1825476848250676, "learning_rate": 4.976836213369179e-06, "loss": 0.2075, "mean_token_accuracy": 0.9357247725129128, "step": 332 }, { "epoch": 20.832, "grad_norm": 1.1455615603245282, "learning_rate": 4.975368544279725e-06, "loss": 0.2166, "mean_token_accuracy": 0.9328170865774155, "step": 333 }, { "epoch": 20.896, "grad_norm": 1.224152736190998, "learning_rate": 4.97385605178147e-06, "loss": 0.214, "mean_token_accuracy": 0.9333221167325974, "step": 334 }, { "epoch": 20.96, "grad_norm": 1.2092868888592103, "learning_rate": 4.97229876633906e-06, "loss": 0.2165, "mean_token_accuracy": 0.9328017830848694, "step": 335 }, { "epoch": 21.0, "grad_norm": 1.6601916522799904, "learning_rate": 4.9706967193193655e-06, "loss": 0.2063, "mean_token_accuracy": 0.9350636601448059, "step": 336 }, { "epoch": 21.064, "grad_norm": 1.2815858297000546, "learning_rate": 4.9690499429908425e-06, "loss": 0.1827, "mean_token_accuracy": 0.9460088163614273, "step": 337 }, { "epoch": 21.128, "grad_norm": 1.097609174485078, "learning_rate": 4.967358470522886e-06, "loss": 0.2003, "mean_token_accuracy": 0.9375612065196037, "step": 338 }, { "epoch": 21.192, "grad_norm": 1.2081558747354215, "learning_rate": 4.96562233598516e-06, "loss": 0.1714, "mean_token_accuracy": 0.9477228224277496, "step": 339 }, { "epoch": 21.256, "grad_norm": 1.1354505023701997, "learning_rate": 4.963841574346917e-06, "loss": 0.1715, "mean_token_accuracy": 0.9477679803967476, "step": 340 }, { "epoch": 21.32, "grad_norm": 1.2902015926007226, "learning_rate": 4.9620162214762845e-06, "loss": 0.1987, "mean_token_accuracy": 0.9394741654396057, "step": 341 }, { "epoch": 21.384, "grad_norm": 1.2989055108557057, "learning_rate": 4.96014631413955e-06, "loss": 0.1874, "mean_token_accuracy": 0.9430093020200729, "step": 342 }, { "epoch": 21.448, "grad_norm": 1.2036617482027139, "learning_rate": 4.958231890000416e-06, "loss": 0.1619, "mean_token_accuracy": 0.9515793323516846, "step": 343 }, { "epoch": 21.512, "grad_norm": 1.3455552392994778, "learning_rate": 4.956272987619245e-06, "loss": 0.2018, "mean_token_accuracy": 0.9384794309735298, "step": 344 }, { "epoch": 21.576, "grad_norm": 1.325214554126647, "learning_rate": 4.95426964645228e-06, "loss": 0.1793, "mean_token_accuracy": 0.9449069574475288, "step": 345 }, { "epoch": 21.64, "grad_norm": 1.7733873688128772, "learning_rate": 4.952221906850852e-06, "loss": 0.1818, "mean_token_accuracy": 0.9450906068086624, "step": 346 }, { "epoch": 21.704, "grad_norm": 1.620059640871503, "learning_rate": 4.950129810060561e-06, "loss": 0.1712, "mean_token_accuracy": 0.9487634673714638, "step": 347 }, { "epoch": 21.768, "grad_norm": 1.2529249871087662, "learning_rate": 4.947993398220457e-06, "loss": 0.1691, "mean_token_accuracy": 0.9492072686553001, "step": 348 }, { "epoch": 21.832, "grad_norm": 1.2740719641742595, "learning_rate": 4.945812714362182e-06, "loss": 0.1667, "mean_token_accuracy": 0.9494521170854568, "step": 349 }, { "epoch": 21.896, "grad_norm": 1.383437254252314, "learning_rate": 4.943587802409103e-06, "loss": 0.1857, "mean_token_accuracy": 0.9432289302349091, "step": 350 }, { "epoch": 21.96, "grad_norm": 1.561602099072877, "learning_rate": 4.941318707175434e-06, "loss": 0.1663, "mean_token_accuracy": 0.9502173140645027, "step": 351 }, { "epoch": 22.0, "grad_norm": 1.561602099072877, "learning_rate": 4.939005474365327e-06, "loss": 0.1638, "mean_token_accuracy": 0.9499510288238525, "step": 352 }, { "epoch": 22.064, "grad_norm": 1.9350636323956207, "learning_rate": 4.936648150571952e-06, "loss": 0.1536, "mean_token_accuracy": 0.9548389986157417, "step": 353 }, { "epoch": 22.128, "grad_norm": 1.411575361903773, "learning_rate": 4.9342467832765665e-06, "loss": 0.1551, "mean_token_accuracy": 0.9558796510100365, "step": 354 }, { "epoch": 22.192, "grad_norm": 1.3387685223989483, "learning_rate": 4.931801420847546e-06, "loss": 0.156, "mean_token_accuracy": 0.953568808734417, "step": 355 }, { "epoch": 22.256, "grad_norm": 1.222872083987312, "learning_rate": 4.9293121125394205e-06, "loss": 0.1399, "mean_token_accuracy": 0.958391398191452, "step": 356 }, { "epoch": 22.32, "grad_norm": 1.7773947915387402, "learning_rate": 4.926778908491878e-06, "loss": 0.1399, "mean_token_accuracy": 0.958006851375103, "step": 357 }, { "epoch": 22.384, "grad_norm": 1.628233475054275, "learning_rate": 4.924201859728754e-06, "loss": 0.1535, "mean_token_accuracy": 0.9548083990812302, "step": 358 }, { "epoch": 22.448, "grad_norm": 1.882519723773846, "learning_rate": 4.921581018157008e-06, "loss": 0.1449, "mean_token_accuracy": 0.9579303339123726, "step": 359 }, { "epoch": 22.512, "grad_norm": 1.2943332078884695, "learning_rate": 4.91891643656567e-06, "loss": 0.1383, "mean_token_accuracy": 0.9605319648981094, "step": 360 }, { "epoch": 22.576, "grad_norm": 1.3679221930367507, "learning_rate": 4.916208168624787e-06, "loss": 0.1595, "mean_token_accuracy": 0.9521914795041084, "step": 361 }, { "epoch": 22.64, "grad_norm": 1.4471696090178732, "learning_rate": 4.913456268884336e-06, "loss": 0.1467, "mean_token_accuracy": 0.955711305141449, "step": 362 }, { "epoch": 22.704, "grad_norm": 1.604140634991468, "learning_rate": 4.910660792773122e-06, "loss": 0.1348, "mean_token_accuracy": 0.9596430063247681, "step": 363 }, { "epoch": 22.768, "grad_norm": 1.4540830134843588, "learning_rate": 4.907821796597673e-06, "loss": 0.1422, "mean_token_accuracy": 0.9575324505567551, "step": 364 }, { "epoch": 22.832, "grad_norm": 1.3887079384943788, "learning_rate": 4.904939337541093e-06, "loss": 0.1423, "mean_token_accuracy": 0.957716092467308, "step": 365 }, { "epoch": 22.896, "grad_norm": 1.2757987817832661, "learning_rate": 4.902013473661921e-06, "loss": 0.1398, "mean_token_accuracy": 0.9586189985275269, "step": 366 }, { "epoch": 22.96, "grad_norm": 1.298868948932969, "learning_rate": 4.899044263892952e-06, "loss": 0.143, "mean_token_accuracy": 0.9576701819896698, "step": 367 }, { "epoch": 23.0, "grad_norm": 1.961915531926537, "learning_rate": 4.896031768040057e-06, "loss": 0.1646, "mean_token_accuracy": 0.9496572136878967, "step": 368 }, { "epoch": 23.064, "grad_norm": 2.5411532400415346, "learning_rate": 4.892976046780976e-06, "loss": 0.1443, "mean_token_accuracy": 0.9583129212260246, "step": 369 }, { "epoch": 23.128, "grad_norm": 2.240402471949737, "learning_rate": 4.889877161664096e-06, "loss": 0.1435, "mean_token_accuracy": 0.9580068588256836, "step": 370 }, { "epoch": 23.192, "grad_norm": 1.8788771654270118, "learning_rate": 4.886735175107209e-06, "loss": 0.1253, "mean_token_accuracy": 0.963791623711586, "step": 371 }, { "epoch": 23.256, "grad_norm": 2.135954849216427, "learning_rate": 4.88355015039626e-06, "loss": 0.131, "mean_token_accuracy": 0.9618786722421646, "step": 372 }, { "epoch": 23.32, "grad_norm": 1.8331351807785414, "learning_rate": 4.880322151684066e-06, "loss": 0.1192, "mean_token_accuracy": 0.9654138013720512, "step": 373 }, { "epoch": 23.384, "grad_norm": 1.407812960168881, "learning_rate": 4.87705124398903e-06, "loss": 0.1089, "mean_token_accuracy": 0.9701120182871819, "step": 374 }, { "epoch": 23.448, "grad_norm": 1.1689059254996335, "learning_rate": 4.873737493193827e-06, "loss": 0.1202, "mean_token_accuracy": 0.9654891565442085, "step": 375 }, { "epoch": 23.512, "grad_norm": 1.2444852068725392, "learning_rate": 4.8703809660440785e-06, "loss": 0.1179, "mean_token_accuracy": 0.9655362367630005, "step": 376 }, { "epoch": 23.576, "grad_norm": 1.3767549284795062, "learning_rate": 4.866981730147008e-06, "loss": 0.1107, "mean_token_accuracy": 0.9675869196653366, "step": 377 }, { "epoch": 23.64, "grad_norm": 1.594294466491712, "learning_rate": 4.863539853970079e-06, "loss": 0.1139, "mean_token_accuracy": 0.9675126820802689, "step": 378 }, { "epoch": 23.704, "grad_norm": 1.2196620167162606, "learning_rate": 4.860055406839616e-06, "loss": 0.1133, "mean_token_accuracy": 0.9677246585488319, "step": 379 }, { "epoch": 23.768, "grad_norm": 1.2491254420542928, "learning_rate": 4.856528458939409e-06, "loss": 0.1186, "mean_token_accuracy": 0.9661483839154243, "step": 380 }, { "epoch": 23.832, "grad_norm": 1.371135678381332, "learning_rate": 4.8529590813093e-06, "loss": 0.1192, "mean_token_accuracy": 0.9656586796045303, "step": 381 }, { "epoch": 23.896, "grad_norm": 1.5459203204423382, "learning_rate": 4.84934734584375e-06, "loss": 0.1203, "mean_token_accuracy": 0.9651536494493484, "step": 382 }, { "epoch": 23.96, "grad_norm": 1.2535283055940967, "learning_rate": 4.845693325290391e-06, "loss": 0.1134, "mean_token_accuracy": 0.9670513048768044, "step": 383 }, { "epoch": 24.0, "grad_norm": 1.8961125063948017, "learning_rate": 4.841997093248566e-06, "loss": 0.1083, "mean_token_accuracy": 0.9685112595558166, "step": 384 }, { "epoch": 24.064, "grad_norm": 1.409943126366695, "learning_rate": 4.838258724167838e-06, "loss": 0.1144, "mean_token_accuracy": 0.9689030423760414, "step": 385 }, { "epoch": 24.128, "grad_norm": 1.1122463916545622, "learning_rate": 4.8344782933464974e-06, "loss": 0.0919, "mean_token_accuracy": 0.9742133989930153, "step": 386 }, { "epoch": 24.192, "grad_norm": 1.6760893804630144, "learning_rate": 4.830655876930042e-06, "loss": 0.0958, "mean_token_accuracy": 0.9723310545086861, "step": 387 }, { "epoch": 24.256, "grad_norm": 1.5060839423398382, "learning_rate": 4.826791551909644e-06, "loss": 0.0939, "mean_token_accuracy": 0.9739838391542435, "step": 388 }, { "epoch": 24.32, "grad_norm": 1.166334319696997, "learning_rate": 4.8228853961206005e-06, "loss": 0.0905, "mean_token_accuracy": 0.9740297570824623, "step": 389 }, { "epoch": 24.384, "grad_norm": 1.3250184993491816, "learning_rate": 4.818937488240765e-06, "loss": 0.0985, "mean_token_accuracy": 0.9734023064374924, "step": 390 }, { "epoch": 24.448, "grad_norm": 1.1848131881302477, "learning_rate": 4.8149479077889575e-06, "loss": 0.0982, "mean_token_accuracy": 0.973371684551239, "step": 391 }, { "epoch": 24.512, "grad_norm": 1.3384502647041565, "learning_rate": 4.810916735123375e-06, "loss": 0.091, "mean_token_accuracy": 0.9742210209369659, "step": 392 }, { "epoch": 24.576, "grad_norm": 1.4015813197834235, "learning_rate": 4.80684405143996e-06, "loss": 0.1056, "mean_token_accuracy": 0.9702650606632233, "step": 393 }, { "epoch": 24.64, "grad_norm": 1.4959678215541097, "learning_rate": 4.802729938770773e-06, "loss": 0.0914, "mean_token_accuracy": 0.9741062670946121, "step": 394 }, { "epoch": 24.704, "grad_norm": 1.2976007911004874, "learning_rate": 4.798574479982337e-06, "loss": 0.0938, "mean_token_accuracy": 0.9738461077213287, "step": 395 }, { "epoch": 24.768, "grad_norm": 1.2476048115003953, "learning_rate": 4.794377758773968e-06, "loss": 0.0905, "mean_token_accuracy": 0.9752234369516373, "step": 396 }, { "epoch": 24.832, "grad_norm": 1.3832446135540417, "learning_rate": 4.790139859676093e-06, "loss": 0.0993, "mean_token_accuracy": 0.9730276018381119, "step": 397 }, { "epoch": 24.896, "grad_norm": 1.390616772309962, "learning_rate": 4.7858608680485444e-06, "loss": 0.1025, "mean_token_accuracy": 0.9710914492607117, "step": 398 }, { "epoch": 24.96, "grad_norm": 1.124675138000985, "learning_rate": 4.781540870078838e-06, "loss": 0.0896, "mean_token_accuracy": 0.9746112897992134, "step": 399 }, { "epoch": 25.0, "grad_norm": 1.124675138000985, "learning_rate": 4.777179952780443e-06, "loss": 0.0998, "mean_token_accuracy": 0.9707639575004577, "step": 400 }, { "epoch": 25.064, "grad_norm": 1.818796054229727, "learning_rate": 4.772778203991026e-06, "loss": 0.0789, "mean_token_accuracy": 0.9792788997292519, "step": 401 }, { "epoch": 25.128, "grad_norm": 1.1915737380245375, "learning_rate": 4.76833571237068e-06, "loss": 0.0702, "mean_token_accuracy": 0.982002928853035, "step": 402 }, { "epoch": 25.192, "grad_norm": 1.0102535942250375, "learning_rate": 4.763852567400142e-06, "loss": 0.0745, "mean_token_accuracy": 0.980036549270153, "step": 403 }, { "epoch": 25.256, "grad_norm": 1.2426834668571503, "learning_rate": 4.759328859378992e-06, "loss": 0.0745, "mean_token_accuracy": 0.9791870713233948, "step": 404 }, { "epoch": 25.32, "grad_norm": 1.1758303173653355, "learning_rate": 4.754764679423827e-06, "loss": 0.0788, "mean_token_accuracy": 0.9787585809826851, "step": 405 }, { "epoch": 25.384, "grad_norm": 1.262305971116141, "learning_rate": 4.7501601194664345e-06, "loss": 0.0741, "mean_token_accuracy": 0.9805894047021866, "step": 406 }, { "epoch": 25.448, "grad_norm": 1.2538046328349908, "learning_rate": 4.745515272251934e-06, "loss": 0.083, "mean_token_accuracy": 0.9783606752753258, "step": 407 }, { "epoch": 25.512, "grad_norm": 1.2040444299176252, "learning_rate": 4.740830231336912e-06, "loss": 0.0747, "mean_token_accuracy": 0.9806409105658531, "step": 408 }, { "epoch": 25.576, "grad_norm": 1.1675550440789746, "learning_rate": 4.736105091087537e-06, "loss": 0.0716, "mean_token_accuracy": 0.9813448861241341, "step": 409 }, { "epoch": 25.64, "grad_norm": 1.158314577400772, "learning_rate": 4.731339946677661e-06, "loss": 0.0723, "mean_token_accuracy": 0.9802889227867126, "step": 410 }, { "epoch": 25.704, "grad_norm": 1.3746317328415838, "learning_rate": 4.726534894086898e-06, "loss": 0.0842, "mean_token_accuracy": 0.9767844006419182, "step": 411 }, { "epoch": 25.768, "grad_norm": 1.7286231547711393, "learning_rate": 4.721690030098693e-06, "loss": 0.0692, "mean_token_accuracy": 0.9816509559750557, "step": 412 }, { "epoch": 25.832, "grad_norm": 1.2639574684751778, "learning_rate": 4.716805452298376e-06, "loss": 0.0732, "mean_token_accuracy": 0.9806562140583992, "step": 413 }, { "epoch": 25.896, "grad_norm": 1.3840031790008775, "learning_rate": 4.71188125907119e-06, "loss": 0.0735, "mean_token_accuracy": 0.980702131986618, "step": 414 }, { "epoch": 25.96, "grad_norm": 1.2516197479884617, "learning_rate": 4.706917549600315e-06, "loss": 0.0764, "mean_token_accuracy": 0.9798757433891296, "step": 415 }, { "epoch": 26.0, "grad_norm": 1.6049999723704589, "learning_rate": 4.701914423864865e-06, "loss": 0.076, "mean_token_accuracy": 0.9789911866188049, "step": 416 }, { "epoch": 26.064, "grad_norm": 1.2709070981722457, "learning_rate": 4.696871982637879e-06, "loss": 0.0549, "mean_token_accuracy": 0.9865481033921242, "step": 417 }, { "epoch": 26.128, "grad_norm": 1.0709715023086532, "learning_rate": 4.691790327484288e-06, "loss": 0.0588, "mean_token_accuracy": 0.9843290969729424, "step": 418 }, { "epoch": 26.192, "grad_norm": 1.1600291771460849, "learning_rate": 4.686669560758874e-06, "loss": 0.0574, "mean_token_accuracy": 0.9850330501794815, "step": 419 }, { "epoch": 26.256, "grad_norm": 1.1766622538481495, "learning_rate": 4.681509785604199e-06, "loss": 0.0609, "mean_token_accuracy": 0.9838240742683411, "step": 420 }, { "epoch": 26.32, "grad_norm": 1.2022847073894367, "learning_rate": 4.676311105948539e-06, "loss": 0.0605, "mean_token_accuracy": 0.9845715165138245, "step": 421 }, { "epoch": 26.384, "grad_norm": 1.07705376381831, "learning_rate": 4.671073626503781e-06, "loss": 0.0558, "mean_token_accuracy": 0.9863338619470596, "step": 422 }, { "epoch": 26.448, "grad_norm": 1.1534904539833133, "learning_rate": 4.665797452763322e-06, "loss": 0.0585, "mean_token_accuracy": 0.9853850305080414, "step": 423 }, { "epoch": 26.512, "grad_norm": 0.9813940958436875, "learning_rate": 4.660482690999936e-06, "loss": 0.0542, "mean_token_accuracy": 0.9861808270215988, "step": 424 }, { "epoch": 26.576, "grad_norm": 1.312396612606, "learning_rate": 4.65512944826364e-06, "loss": 0.0605, "mean_token_accuracy": 0.9845839366316795, "step": 425 }, { "epoch": 26.64, "grad_norm": 1.209476180727321, "learning_rate": 4.649737832379535e-06, "loss": 0.0583, "mean_token_accuracy": 0.9849412366747856, "step": 426 }, { "epoch": 26.704, "grad_norm": 1.32672938403649, "learning_rate": 4.644307951945633e-06, "loss": 0.0602, "mean_token_accuracy": 0.9845433384180069, "step": 427 }, { "epoch": 26.768, "grad_norm": 1.2109768949772604, "learning_rate": 4.638839916330674e-06, "loss": 0.0632, "mean_token_accuracy": 0.983364962041378, "step": 428 }, { "epoch": 26.832, "grad_norm": 1.227382726166401, "learning_rate": 4.633333835671918e-06, "loss": 0.062, "mean_token_accuracy": 0.984114833176136, "step": 429 }, { "epoch": 26.896, "grad_norm": 1.2592375187276943, "learning_rate": 4.627789820872931e-06, "loss": 0.0629, "mean_token_accuracy": 0.9835332930088043, "step": 430 }, { "epoch": 26.96, "grad_norm": 1.185661978717221, "learning_rate": 4.622207983601347e-06, "loss": 0.0566, "mean_token_accuracy": 0.9855074658989906, "step": 431 }, { "epoch": 27.0, "grad_norm": 1.185661978717221, "learning_rate": 4.616588436286621e-06, "loss": 0.0612, "mean_token_accuracy": 0.9832272410392762, "step": 432 }, { "epoch": 27.064, "grad_norm": 1.6280113301164094, "learning_rate": 4.610931292117764e-06, "loss": 0.0423, "mean_token_accuracy": 0.991246335208416, "step": 433 }, { "epoch": 27.128, "grad_norm": 0.9956212697362047, "learning_rate": 4.6052366650410674e-06, "loss": 0.0464, "mean_token_accuracy": 0.9890426099300385, "step": 434 }, { "epoch": 27.192, "grad_norm": 1.0603360426043527, "learning_rate": 4.599504669757798e-06, "loss": 0.0427, "mean_token_accuracy": 0.9897312670946121, "step": 435 }, { "epoch": 27.256, "grad_norm": 1.346430735917998, "learning_rate": 4.593735421721897e-06, "loss": 0.0435, "mean_token_accuracy": 0.9889995083212852, "step": 436 }, { "epoch": 27.32, "grad_norm": 1.4481010275324788, "learning_rate": 4.587929037137652e-06, "loss": 0.0469, "mean_token_accuracy": 0.9881396815180779, "step": 437 }, { "epoch": 27.384, "grad_norm": 1.3196697235349921, "learning_rate": 4.582085632957352e-06, "loss": 0.0484, "mean_token_accuracy": 0.9881549924612045, "step": 438 }, { "epoch": 27.448, "grad_norm": 1.0179717012729854, "learning_rate": 4.576205326878939e-06, "loss": 0.0442, "mean_token_accuracy": 0.9894405156373978, "step": 439 }, { "epoch": 27.512, "grad_norm": 1.5657523776577251, "learning_rate": 4.570288237343632e-06, "loss": 0.0442, "mean_token_accuracy": 0.9897618815302849, "step": 440 }, { "epoch": 27.576, "grad_norm": 1.1117918454389126, "learning_rate": 4.564334483533542e-06, "loss": 0.0455, "mean_token_accuracy": 0.9884304478764534, "step": 441 }, { "epoch": 27.64, "grad_norm": 1.1173396896487664, "learning_rate": 4.558344185369275e-06, "loss": 0.0427, "mean_token_accuracy": 0.9893639832735062, "step": 442 }, { "epoch": 27.704, "grad_norm": 1.1621218738029238, "learning_rate": 4.552317463507512e-06, "loss": 0.0531, "mean_token_accuracy": 0.9868235811591148, "step": 443 }, { "epoch": 27.768, "grad_norm": 1.7743880401458227, "learning_rate": 4.546254439338579e-06, "loss": 0.0478, "mean_token_accuracy": 0.9880172535777092, "step": 444 }, { "epoch": 27.832, "grad_norm": 1.1453851956935277, "learning_rate": 4.540155234984008e-06, "loss": 0.046, "mean_token_accuracy": 0.9887014925479889, "step": 445 }, { "epoch": 27.896, "grad_norm": 1.2370405824386383, "learning_rate": 4.53401997329407e-06, "loss": 0.0483, "mean_token_accuracy": 0.9885528981685638, "step": 446 }, { "epoch": 27.96, "grad_norm": 1.112415684030047, "learning_rate": 4.527848777845304e-06, "loss": 0.0481, "mean_token_accuracy": 0.9882162138819695, "step": 447 }, { "epoch": 28.0, "grad_norm": 1.2116195606161426, "learning_rate": 4.5216417729380275e-06, "loss": 0.0429, "mean_token_accuracy": 0.9889079332351685, "step": 448 }, { "epoch": 28.064, "grad_norm": 1.680981597465113, "learning_rate": 4.515399083593832e-06, "loss": 0.0361, "mean_token_accuracy": 0.9921798557043076, "step": 449 }, { "epoch": 28.128, "grad_norm": 0.9057327474165457, "learning_rate": 4.509120835553068e-06, "loss": 0.0386, "mean_token_accuracy": 0.9912922456860542, "step": 450 }, { "epoch": 28.192, "grad_norm": 1.037719721867887, "learning_rate": 4.502807155272305e-06, "loss": 0.0342, "mean_token_accuracy": 0.9918584823608398, "step": 451 }, { "epoch": 28.256, "grad_norm": 1.0700393007690783, "learning_rate": 4.496458169921795e-06, "loss": 0.0315, "mean_token_accuracy": 0.9929297268390656, "step": 452 }, { "epoch": 28.32, "grad_norm": 0.902181470290813, "learning_rate": 4.490074007382901e-06, "loss": 0.0343, "mean_token_accuracy": 0.9919590875506401, "step": 453 }, { "epoch": 28.384, "grad_norm": 1.0208086332164772, "learning_rate": 4.483654796245526e-06, "loss": 0.0384, "mean_token_accuracy": 0.9910779967904091, "step": 454 }, { "epoch": 28.448, "grad_norm": 1.3060789083995143, "learning_rate": 4.477200665805525e-06, "loss": 0.0406, "mean_token_accuracy": 0.9907107129693031, "step": 455 }, { "epoch": 28.512, "grad_norm": 1.1326685033581283, "learning_rate": 4.470711746062097e-06, "loss": 0.0375, "mean_token_accuracy": 0.9911545068025589, "step": 456 }, { "epoch": 28.576, "grad_norm": 3.0371558523597457, "learning_rate": 4.4641881677151655e-06, "loss": 0.04, "mean_token_accuracy": 0.9904658421874046, "step": 457 }, { "epoch": 28.64, "grad_norm": 1.0208219296770973, "learning_rate": 4.457630062162751e-06, "loss": 0.0346, "mean_token_accuracy": 0.9913171902298927, "step": 458 }, { "epoch": 28.704, "grad_norm": 1.190975546736713, "learning_rate": 4.451037561498319e-06, "loss": 0.0353, "mean_token_accuracy": 0.9916289299726486, "step": 459 }, { "epoch": 28.768, "grad_norm": 1.0783479788711783, "learning_rate": 4.4444107985081255e-06, "loss": 0.0416, "mean_token_accuracy": 0.9900985509157181, "step": 460 }, { "epoch": 28.832, "grad_norm": 1.6087649280293042, "learning_rate": 4.437749906668535e-06, "loss": 0.0356, "mean_token_accuracy": 0.9913993701338768, "step": 461 }, { "epoch": 28.896, "grad_norm": 1.3241991173586936, "learning_rate": 4.431055020143337e-06, "loss": 0.0357, "mean_token_accuracy": 0.9914911836385727, "step": 462 }, { "epoch": 28.96, "grad_norm": 1.5375863343618428, "learning_rate": 4.424326273781044e-06, "loss": 0.0376, "mean_token_accuracy": 0.991368755698204, "step": 463 }, { "epoch": 29.0, "grad_norm": 1.709658962088631, "learning_rate": 4.417563803112173e-06, "loss": 0.0394, "mean_token_accuracy": 0.9906954050064087, "step": 464 }, { "epoch": 29.064, "grad_norm": 1.5696821157303265, "learning_rate": 4.410767744346517e-06, "loss": 0.03, "mean_token_accuracy": 0.9934194386005402, "step": 465 }, { "epoch": 29.128, "grad_norm": 2.4074195880303826, "learning_rate": 4.403938234370398e-06, "loss": 0.0317, "mean_token_accuracy": 0.9929297268390656, "step": 466 }, { "epoch": 29.192, "grad_norm": 1.9562891294404543, "learning_rate": 4.397075410743917e-06, "loss": 0.0292, "mean_token_accuracy": 0.9936795979738235, "step": 467 }, { "epoch": 29.256, "grad_norm": 0.9252062236698847, "learning_rate": 4.390179411698176e-06, "loss": 0.028, "mean_token_accuracy": 0.9941233992576599, "step": 468 }, { "epoch": 29.32, "grad_norm": 2.0110051812186645, "learning_rate": 4.383250376132499e-06, "loss": 0.0349, "mean_token_accuracy": 0.9917666539549828, "step": 469 }, { "epoch": 29.384, "grad_norm": 1.6024517533746272, "learning_rate": 4.376288443611632e-06, "loss": 0.0291, "mean_token_accuracy": 0.9932817071676254, "step": 470 }, { "epoch": 29.448, "grad_norm": 1.5870200339453013, "learning_rate": 4.369293754362929e-06, "loss": 0.0333, "mean_token_accuracy": 0.9921645447611809, "step": 471 }, { "epoch": 29.512, "grad_norm": 1.3785668547925878, "learning_rate": 4.362266449273533e-06, "loss": 0.0324, "mean_token_accuracy": 0.9927919954061508, "step": 472 }, { "epoch": 29.576, "grad_norm": 1.2396059870983502, "learning_rate": 4.355206669887537e-06, "loss": 0.0293, "mean_token_accuracy": 0.9936183840036392, "step": 473 }, { "epoch": 29.64, "grad_norm": 1.5235498003060588, "learning_rate": 4.348114558403129e-06, "loss": 0.0295, "mean_token_accuracy": 0.9933123141527176, "step": 474 }, { "epoch": 29.704, "grad_norm": 1.2674836008438006, "learning_rate": 4.340990257669732e-06, "loss": 0.0316, "mean_token_accuracy": 0.9927307814359665, "step": 475 }, { "epoch": 29.768, "grad_norm": 1.3142695432938407, "learning_rate": 4.3338339111851265e-06, "loss": 0.0312, "mean_token_accuracy": 0.9927307814359665, "step": 476 }, { "epoch": 29.832, "grad_norm": 1.4021083737326667, "learning_rate": 4.3266456630925565e-06, "loss": 0.0317, "mean_token_accuracy": 0.9921798557043076, "step": 477 }, { "epoch": 29.896, "grad_norm": 1.6251471215422262, "learning_rate": 4.319425658177831e-06, "loss": 0.0349, "mean_token_accuracy": 0.9914299696683884, "step": 478 }, { "epoch": 29.96, "grad_norm": 1.5275186055002292, "learning_rate": 4.312174041866403e-06, "loss": 0.0332, "mean_token_accuracy": 0.9925436675548553, "step": 479 }, { "epoch": 30.0, "grad_norm": 1.5275186055002292, "learning_rate": 4.304890960220446e-06, "loss": 0.0342, "mean_token_accuracy": 0.9919931530952454, "step": 480 }, { "epoch": 30.064, "grad_norm": 1.8550522951863249, "learning_rate": 4.297576559935906e-06, "loss": 0.0277, "mean_token_accuracy": 0.9941999167203903, "step": 481 }, { "epoch": 30.128, "grad_norm": 0.9468474377247835, "learning_rate": 4.29023098833955e-06, "loss": 0.0246, "mean_token_accuracy": 0.9950110018253326, "step": 482 }, { "epoch": 30.192, "grad_norm": 0.8908123340399178, "learning_rate": 4.282854393386e-06, "loss": 0.0264, "mean_token_accuracy": 0.9942611306905746, "step": 483 }, { "epoch": 30.256, "grad_norm": 1.0291648412314238, "learning_rate": 4.275446923654748e-06, "loss": 0.0234, "mean_token_accuracy": 0.9946590289473534, "step": 484 }, { "epoch": 30.32, "grad_norm": 1.110054152722114, "learning_rate": 4.268008728347168e-06, "loss": 0.0257, "mean_token_accuracy": 0.9941080957651138, "step": 485 }, { "epoch": 30.384, "grad_norm": 1.1203340386912817, "learning_rate": 4.26053995728351e-06, "loss": 0.0249, "mean_token_accuracy": 0.9941693097352982, "step": 486 }, { "epoch": 30.448, "grad_norm": 0.9757485698941055, "learning_rate": 4.253040760899878e-06, "loss": 0.0257, "mean_token_accuracy": 0.9943682551383972, "step": 487 }, { "epoch": 30.512, "grad_norm": 1.0467678006834251, "learning_rate": 4.245511290245209e-06, "loss": 0.0235, "mean_token_accuracy": 0.9945672005414963, "step": 488 }, { "epoch": 30.576, "grad_norm": 1.0470701480440627, "learning_rate": 4.237951696978217e-06, "loss": 0.0287, "mean_token_accuracy": 0.9936110079288483, "step": 489 }, { "epoch": 30.64, "grad_norm": 1.1063848677624253, "learning_rate": 4.230362133364354e-06, "loss": 0.0267, "mean_token_accuracy": 0.9940162748098373, "step": 490 }, { "epoch": 30.704, "grad_norm": 1.0292563622128144, "learning_rate": 4.22274275227273e-06, "loss": 0.024, "mean_token_accuracy": 0.994505986571312, "step": 491 }, { "epoch": 30.768, "grad_norm": 0.9993298222986402, "learning_rate": 4.215093707173041e-06, "loss": 0.0259, "mean_token_accuracy": 0.9941693097352982, "step": 492 }, { "epoch": 30.832, "grad_norm": 0.991144342968755, "learning_rate": 4.207415152132476e-06, "loss": 0.025, "mean_token_accuracy": 0.9940927922725677, "step": 493 }, { "epoch": 30.896, "grad_norm": 0.976853658225494, "learning_rate": 4.199707241812613e-06, "loss": 0.0265, "mean_token_accuracy": 0.9938053637742996, "step": 494 }, { "epoch": 30.96, "grad_norm": 1.0997159543493233, "learning_rate": 4.191970131466304e-06, "loss": 0.0266, "mean_token_accuracy": 0.9939397573471069, "step": 495 }, { "epoch": 31.0, "grad_norm": 1.216063443230978, "learning_rate": 4.184203976934552e-06, "loss": 0.0281, "mean_token_accuracy": 0.9934867739677429, "step": 496 }, { "epoch": 31.064, "grad_norm": 1.2655427816851061, "learning_rate": 4.176408934643364e-06, "loss": 0.0207, "mean_token_accuracy": 0.9955160170793533, "step": 497 }, { "epoch": 31.128, "grad_norm": 0.7859671981802873, "learning_rate": 4.1685851616006066e-06, "loss": 0.0222, "mean_token_accuracy": 0.9955160170793533, "step": 498 }, { "epoch": 31.192, "grad_norm": 0.8341844871090315, "learning_rate": 4.16073281539284e-06, "loss": 0.0205, "mean_token_accuracy": 0.9958527162671089, "step": 499 }, { "epoch": 31.256, "grad_norm": 0.9250772412805818, "learning_rate": 4.152852054182151e-06, "loss": 0.0211, "mean_token_accuracy": 0.9954088926315308, "step": 500 }, { "epoch": 31.32, "grad_norm": 0.8613922181527686, "learning_rate": 4.144943036702954e-06, "loss": 0.0214, "mean_token_accuracy": 0.9953476786613464, "step": 501 }, { "epoch": 31.384, "grad_norm": 0.7456698077209918, "learning_rate": 4.137005922258808e-06, "loss": 0.021, "mean_token_accuracy": 0.9953935891389847, "step": 502 }, { "epoch": 31.448, "grad_norm": 1.001595150899967, "learning_rate": 4.129040870719198e-06, "loss": 0.0201, "mean_token_accuracy": 0.9955925345420837, "step": 503 }, { "epoch": 31.512, "grad_norm": 0.9722771982399934, "learning_rate": 4.121048042516319e-06, "loss": 0.0231, "mean_token_accuracy": 0.9949477091431618, "step": 504 }, { "epoch": 31.576, "grad_norm": 1.06581093339698, "learning_rate": 4.113027598641845e-06, "loss": 0.0219, "mean_token_accuracy": 0.9950110092759132, "step": 505 }, { "epoch": 31.64, "grad_norm": 0.9864076392990991, "learning_rate": 4.104979700643685e-06, "loss": 0.0258, "mean_token_accuracy": 0.9941610842943192, "step": 506 }, { "epoch": 31.704, "grad_norm": 1.0289183677715343, "learning_rate": 4.0969045106227295e-06, "loss": 0.0243, "mean_token_accuracy": 0.9946896284818649, "step": 507 }, { "epoch": 31.768, "grad_norm": 1.2834427915966655, "learning_rate": 4.088802191229584e-06, "loss": 0.0226, "mean_token_accuracy": 0.9949191808700562, "step": 508 }, { "epoch": 31.832, "grad_norm": 0.872956532021559, "learning_rate": 4.080672905661296e-06, "loss": 0.0224, "mean_token_accuracy": 0.9951028227806091, "step": 509 }, { "epoch": 31.896, "grad_norm": 1.0697936039093632, "learning_rate": 4.072516817658065e-06, "loss": 0.0271, "mean_token_accuracy": 0.9942305237054825, "step": 510 }, { "epoch": 31.96, "grad_norm": 1.187723015667045, "learning_rate": 4.064334091499945e-06, "loss": 0.0245, "mean_token_accuracy": 0.994337648153305, "step": 511 }, { "epoch": 32.0, "grad_norm": 1.4882891947076426, "learning_rate": 4.056124892003541e-06, "loss": 0.027, "mean_token_accuracy": 0.994025456905365, "step": 512 }, { "epoch": 32.064, "grad_norm": 0.8104889104642802, "learning_rate": 4.047889384518676e-06, "loss": 0.0187, "mean_token_accuracy": 0.9962965101003647, "step": 513 }, { "epoch": 32.128, "grad_norm": 0.9266061488636069, "learning_rate": 4.0396277349250754e-06, "loss": 0.0187, "mean_token_accuracy": 0.9961128607392311, "step": 514 }, { "epoch": 32.192, "grad_norm": 0.7411759120560485, "learning_rate": 4.031340109629017e-06, "loss": 0.0179, "mean_token_accuracy": 0.9962046965956688, "step": 515 }, { "epoch": 32.256, "grad_norm": 0.7063843340368742, "learning_rate": 4.0230266755599825e-06, "loss": 0.0179, "mean_token_accuracy": 0.9958461299538612, "step": 516 }, { "epoch": 32.32, "grad_norm": 0.9008427861188805, "learning_rate": 4.014687600167294e-06, "loss": 0.0189, "mean_token_accuracy": 0.9959445297718048, "step": 517 }, { "epoch": 32.384, "grad_norm": 1.0027435934093918, "learning_rate": 4.006323051416741e-06, "loss": 0.0201, "mean_token_accuracy": 0.9953935891389847, "step": 518 }, { "epoch": 32.448, "grad_norm": 0.8114762373720891, "learning_rate": 3.997933197787198e-06, "loss": 0.0205, "mean_token_accuracy": 0.9958067834377289, "step": 519 }, { "epoch": 32.512, "grad_norm": 0.8166103536340461, "learning_rate": 3.989518208267231e-06, "loss": 0.0184, "mean_token_accuracy": 0.9961893856525421, "step": 520 }, { "epoch": 32.576, "grad_norm": 0.903261044310652, "learning_rate": 3.981078252351694e-06, "loss": 0.0206, "mean_token_accuracy": 0.9956537559628487, "step": 521 }, { "epoch": 32.64, "grad_norm": 0.9161871438125471, "learning_rate": 3.972613500038314e-06, "loss": 0.0186, "mean_token_accuracy": 0.9958373978734016, "step": 522 }, { "epoch": 32.704, "grad_norm": 0.9622734864137354, "learning_rate": 3.964124121824265e-06, "loss": 0.0195, "mean_token_accuracy": 0.9956954121589661, "step": 523 }, { "epoch": 32.768, "grad_norm": 1.0663548502851063, "learning_rate": 3.9556102887027425e-06, "loss": 0.0222, "mean_token_accuracy": 0.9948273673653603, "step": 524 }, { "epoch": 32.832, "grad_norm": 0.9772282493263409, "learning_rate": 3.9470721721595075e-06, "loss": 0.0209, "mean_token_accuracy": 0.995408907532692, "step": 525 }, { "epoch": 32.896, "grad_norm": 1.0278242585954591, "learning_rate": 3.938509944169441e-06, "loss": 0.0209, "mean_token_accuracy": 0.9950263053178787, "step": 526 }, { "epoch": 32.96, "grad_norm": 0.8949006192487979, "learning_rate": 3.929923777193073e-06, "loss": 0.02, "mean_token_accuracy": 0.995454803109169, "step": 527 }, { "epoch": 33.0, "grad_norm": 0.8949006192487979, "learning_rate": 3.92131384417312e-06, "loss": 0.0229, "mean_token_accuracy": 0.9950048804283143, "step": 528 }, { "epoch": 33.064, "grad_norm": 1.527726989347519, "learning_rate": 3.912680318530988e-06, "loss": 0.0151, "mean_token_accuracy": 0.9968321621417999, "step": 529 }, { "epoch": 33.128, "grad_norm": 0.6164697576358745, "learning_rate": 3.904023374163289e-06, "loss": 0.017, "mean_token_accuracy": 0.9960822612047195, "step": 530 }, { "epoch": 33.192, "grad_norm": 0.8570675467578164, "learning_rate": 3.895343185438335e-06, "loss": 0.0179, "mean_token_accuracy": 0.9957768023014069, "step": 531 }, { "epoch": 33.256, "grad_norm": 0.7527135437806243, "learning_rate": 3.886639927192628e-06, "loss": 0.0175, "mean_token_accuracy": 0.9962047040462494, "step": 532 }, { "epoch": 33.32, "grad_norm": 0.9063553845660416, "learning_rate": 3.877913774727332e-06, "loss": 0.0164, "mean_token_accuracy": 0.9962199926376343, "step": 533 }, { "epoch": 33.384, "grad_norm": 0.7531513051005051, "learning_rate": 3.8691649038047495e-06, "loss": 0.0174, "mean_token_accuracy": 0.995913915336132, "step": 534 }, { "epoch": 33.448, "grad_norm": 0.7590014466188122, "learning_rate": 3.860393490644781e-06, "loss": 0.017, "mean_token_accuracy": 0.9964801669120789, "step": 535 }, { "epoch": 33.512, "grad_norm": 0.7073361788610286, "learning_rate": 3.8515997119213686e-06, "loss": 0.0166, "mean_token_accuracy": 0.9962505996227264, "step": 536 }, { "epoch": 33.576, "grad_norm": 0.8118993811368365, "learning_rate": 3.842783744758944e-06, "loss": 0.0188, "mean_token_accuracy": 0.9961128681898117, "step": 537 }, { "epoch": 33.64, "grad_norm": 0.7860897765532269, "learning_rate": 3.833945766728859e-06, "loss": 0.0188, "mean_token_accuracy": 0.996051661670208, "step": 538 }, { "epoch": 33.704, "grad_norm": 0.9951256301747828, "learning_rate": 3.825085955845811e-06, "loss": 0.0194, "mean_token_accuracy": 0.9959598258137703, "step": 539 }, { "epoch": 33.768, "grad_norm": 0.7295678412453166, "learning_rate": 3.816204490564247e-06, "loss": 0.0182, "mean_token_accuracy": 0.9959245398640633, "step": 540 }, { "epoch": 33.832, "grad_norm": 0.9325030451817488, "learning_rate": 3.8073015497747873e-06, "loss": 0.0177, "mean_token_accuracy": 0.9960210546851158, "step": 541 }, { "epoch": 33.896, "grad_norm": 0.7197408366546506, "learning_rate": 3.7983773128006056e-06, "loss": 0.0189, "mean_token_accuracy": 0.995913915336132, "step": 542 }, { "epoch": 33.96, "grad_norm": 1.007837654503578, "learning_rate": 3.789431959393827e-06, "loss": 0.0179, "mean_token_accuracy": 0.9959598407149315, "step": 543 }, { "epoch": 34.0, "grad_norm": 1.1972908164812377, "learning_rate": 3.7804656697319025e-06, "loss": 0.0186, "mean_token_accuracy": 0.9956660032272339, "step": 544 }, { "epoch": 34.064, "grad_norm": 0.6750965146514032, "learning_rate": 3.7714786244139808e-06, "loss": 0.0148, "mean_token_accuracy": 0.9968168586492538, "step": 545 }, { "epoch": 34.128, "grad_norm": 0.6364362849963309, "learning_rate": 3.7624710044572727e-06, "loss": 0.0147, "mean_token_accuracy": 0.9968168586492538, "step": 546 }, { "epoch": 34.192, "grad_norm": 0.6784928303123082, "learning_rate": 3.7534429912934043e-06, "loss": 0.0151, "mean_token_accuracy": 0.9966791197657585, "step": 547 }, { "epoch": 34.256, "grad_norm": 0.7129998974640586, "learning_rate": 3.7443947667647606e-06, "loss": 0.0164, "mean_token_accuracy": 0.9962812215089798, "step": 548 }, { "epoch": 34.32, "grad_norm": 0.6627908244890321, "learning_rate": 3.7353265131208246e-06, "loss": 0.0138, "mean_token_accuracy": 0.9970923364162445, "step": 549 }, { "epoch": 34.384, "grad_norm": 0.7308868597318711, "learning_rate": 3.7262384130145058e-06, "loss": 0.0161, "mean_token_accuracy": 0.9966485053300858, "step": 550 }, { "epoch": 34.448, "grad_norm": 0.6991288888025083, "learning_rate": 3.717130649498463e-06, "loss": 0.0143, "mean_token_accuracy": 0.9968321546912193, "step": 551 }, { "epoch": 34.512, "grad_norm": 0.7827297733525048, "learning_rate": 3.7080034060214136e-06, "loss": 0.0162, "mean_token_accuracy": 0.9963883385062218, "step": 552 }, { "epoch": 34.576, "grad_norm": 0.9530782781603654, "learning_rate": 3.698856866424442e-06, "loss": 0.0178, "mean_token_accuracy": 0.9958220943808556, "step": 553 }, { "epoch": 34.64, "grad_norm": 0.8882675648323537, "learning_rate": 3.689691214937295e-06, "loss": 0.0169, "mean_token_accuracy": 0.9961434677243233, "step": 554 }, { "epoch": 34.704, "grad_norm": 0.7362987987409557, "learning_rate": 3.6805066361746714e-06, "loss": 0.015, "mean_token_accuracy": 0.9964770451188087, "step": 555 }, { "epoch": 34.768, "grad_norm": 0.6587155982627673, "learning_rate": 3.6713033151325045e-06, "loss": 0.0151, "mean_token_accuracy": 0.9967403411865234, "step": 556 }, { "epoch": 34.832, "grad_norm": 0.9271039098165166, "learning_rate": 3.6620814371842313e-06, "loss": 0.0174, "mean_token_accuracy": 0.9956384524703026, "step": 557 }, { "epoch": 34.896, "grad_norm": 1.1559399212401356, "learning_rate": 3.6528411880770676e-06, "loss": 0.0174, "mean_token_accuracy": 0.9960189610719681, "step": 558 }, { "epoch": 34.96, "grad_norm": 0.8768924628143494, "learning_rate": 3.6435827539282564e-06, "loss": 0.0177, "mean_token_accuracy": 0.9960057437419891, "step": 559 }, { "epoch": 35.0, "grad_norm": 0.8768924628143494, "learning_rate": 3.6343063212213285e-06, "loss": 0.0167, "mean_token_accuracy": 0.9963026285171509, "step": 560 }, { "epoch": 35.064, "grad_norm": 1.02071889836862, "learning_rate": 3.625012076802339e-06, "loss": 0.0135, "mean_token_accuracy": 0.9973218888044357, "step": 561 }, { "epoch": 35.128, "grad_norm": 0.7029979675597257, "learning_rate": 3.6157002078761065e-06, "loss": 0.0133, "mean_token_accuracy": 0.9970617219805717, "step": 562 }, { "epoch": 35.192, "grad_norm": 0.5953888204513831, "learning_rate": 3.6063709020024445e-06, "loss": 0.0144, "mean_token_accuracy": 0.9967097193002701, "step": 563 }, { "epoch": 35.256, "grad_norm": 0.6836849630588052, "learning_rate": 3.597024347092382e-06, "loss": 0.0146, "mean_token_accuracy": 0.9968474730849266, "step": 564 }, { "epoch": 35.32, "grad_norm": 0.7149779052517604, "learning_rate": 3.5876607314043773e-06, "loss": 0.0144, "mean_token_accuracy": 0.9968015626072884, "step": 565 }, { "epoch": 35.384, "grad_norm": 0.6716735768617133, "learning_rate": 3.578280243540526e-06, "loss": 0.0146, "mean_token_accuracy": 0.9966026023030281, "step": 566 }, { "epoch": 35.448, "grad_norm": 0.8063332714915842, "learning_rate": 3.568883072442765e-06, "loss": 0.0133, "mean_token_accuracy": 0.996847465634346, "step": 567 }, { "epoch": 35.512, "grad_norm": 0.6236698228698195, "learning_rate": 3.559469407389066e-06, "loss": 0.0141, "mean_token_accuracy": 0.9967556372284889, "step": 568 }, { "epoch": 35.576, "grad_norm": 0.6098941388192161, "learning_rate": 3.5500394379896196e-06, "loss": 0.0155, "mean_token_accuracy": 0.9965873062610626, "step": 569 }, { "epoch": 35.64, "grad_norm": 1.079712263205525, "learning_rate": 3.5405933541830217e-06, "loss": 0.0166, "mean_token_accuracy": 0.9961200952529907, "step": 570 }, { "epoch": 35.704, "grad_norm": 0.8563799931339328, "learning_rate": 3.531131346232442e-06, "loss": 0.0145, "mean_token_accuracy": 0.9968474730849266, "step": 571 }, { "epoch": 35.768, "grad_norm": 0.6833353775257224, "learning_rate": 3.5216536047217957e-06, "loss": 0.0157, "mean_token_accuracy": 0.9965812116861343, "step": 572 }, { "epoch": 35.832, "grad_norm": 0.8068432082858897, "learning_rate": 3.512160320551906e-06, "loss": 0.0151, "mean_token_accuracy": 0.9964036419987679, "step": 573 }, { "epoch": 35.896, "grad_norm": 0.8407840672712452, "learning_rate": 3.5026516849366524e-06, "loss": 0.0163, "mean_token_accuracy": 0.9964648559689522, "step": 574 }, { "epoch": 35.96, "grad_norm": 0.7808319687953745, "learning_rate": 3.4931278893991265e-06, "loss": 0.0148, "mean_token_accuracy": 0.9969545975327492, "step": 575 }, { "epoch": 36.0, "grad_norm": 0.7239459913181325, "learning_rate": 3.4835891257677695e-06, "loss": 0.015, "mean_token_accuracy": 0.9963271141052246, "step": 576 }, { "epoch": 36.064, "grad_norm": 0.9879486883006207, "learning_rate": 3.4740355861725116e-06, "loss": 0.0124, "mean_token_accuracy": 0.9972912818193436, "step": 577 }, { "epoch": 36.128, "grad_norm": 0.5178629660689309, "learning_rate": 3.4644674630409e-06, "loss": 0.0129, "mean_token_accuracy": 0.9974137097597122, "step": 578 }, { "epoch": 36.192, "grad_norm": 0.5349884866647774, "learning_rate": 3.4548849490942236e-06, "loss": 0.0127, "mean_token_accuracy": 0.9973371922969818, "step": 579 }, { "epoch": 36.256, "grad_norm": 0.45316419242817, "learning_rate": 3.445288237343632e-06, "loss": 0.0121, "mean_token_accuracy": 0.9974137097597122, "step": 580 }, { "epoch": 36.32, "grad_norm": 0.6850566560245732, "learning_rate": 3.4356775210862462e-06, "loss": 0.0131, "mean_token_accuracy": 0.99710763245821, "step": 581 }, { "epoch": 36.384, "grad_norm": 0.7766243678176012, "learning_rate": 3.426052993901267e-06, "loss": 0.0131, "mean_token_accuracy": 0.9970311149954796, "step": 582 }, { "epoch": 36.448, "grad_norm": 0.6727925112032404, "learning_rate": 3.4164148496460752e-06, "loss": 0.0136, "mean_token_accuracy": 0.9969951212406158, "step": 583 }, { "epoch": 36.512, "grad_norm": 0.7027768416169433, "learning_rate": 3.4067632824523255e-06, "loss": 0.0139, "mean_token_accuracy": 0.9966025948524475, "step": 584 }, { "epoch": 36.576, "grad_norm": 0.550685166145585, "learning_rate": 3.397098486722039e-06, "loss": 0.014, "mean_token_accuracy": 0.9969392940402031, "step": 585 }, { "epoch": 36.64, "grad_norm": 0.8097036886196215, "learning_rate": 3.387420657123686e-06, "loss": 0.0153, "mean_token_accuracy": 0.9964495822787285, "step": 586 }, { "epoch": 36.704, "grad_norm": 0.8076885632585951, "learning_rate": 3.3777299885882663e-06, "loss": 0.0146, "mean_token_accuracy": 0.996737077832222, "step": 587 }, { "epoch": 36.768, "grad_norm": 0.7647984818844884, "learning_rate": 3.3680266763053782e-06, "loss": 0.0136, "mean_token_accuracy": 0.9968168660998344, "step": 588 }, { "epoch": 36.832, "grad_norm": 0.5618449753048343, "learning_rate": 3.358310915719294e-06, "loss": 0.0137, "mean_token_accuracy": 0.9969698861241341, "step": 589 }, { "epoch": 36.896, "grad_norm": 0.5564032781081476, "learning_rate": 3.3485829025250194e-06, "loss": 0.014, "mean_token_accuracy": 0.9966944307088852, "step": 590 }, { "epoch": 36.96, "grad_norm": 0.8115550743614555, "learning_rate": 3.338842832664351e-06, "loss": 0.0146, "mean_token_accuracy": 0.9967556521296501, "step": 591 }, { "epoch": 37.0, "grad_norm": 0.9092318661373081, "learning_rate": 3.329090902321931e-06, "loss": 0.0131, "mean_token_accuracy": 0.9968413591384888, "step": 592 }, { "epoch": 37.064, "grad_norm": 0.5561723926711918, "learning_rate": 3.3193273079212968e-06, "loss": 0.0116, "mean_token_accuracy": 0.9974596202373505, "step": 593 }, { "epoch": 37.128, "grad_norm": 0.3938017941435899, "learning_rate": 3.309552246120924e-06, "loss": 0.0111, "mean_token_accuracy": 0.99739570915699, "step": 594 }, { "epoch": 37.192, "grad_norm": 0.6269457602793167, "learning_rate": 3.2997659138102616e-06, "loss": 0.0118, "mean_token_accuracy": 0.9972765892744064, "step": 595 }, { "epoch": 37.256, "grad_norm": 0.7194793808470971, "learning_rate": 3.2899685081057736e-06, "loss": 0.0121, "mean_token_accuracy": 0.9975361377000809, "step": 596 }, { "epoch": 37.32, "grad_norm": 0.6081988212641433, "learning_rate": 3.280160226346961e-06, "loss": 0.0124, "mean_token_accuracy": 0.9970923364162445, "step": 597 }, { "epoch": 37.384, "grad_norm": 0.5101593857163599, "learning_rate": 3.270341266092393e-06, "loss": 0.0129, "mean_token_accuracy": 0.9969392865896225, "step": 598 }, { "epoch": 37.448, "grad_norm": 0.7428299776346693, "learning_rate": 3.2605118251157225e-06, "loss": 0.0128, "mean_token_accuracy": 0.9970923364162445, "step": 599 }, { "epoch": 37.512, "grad_norm": 0.6572123544165882, "learning_rate": 3.2506721014017075e-06, "loss": 0.012, "mean_token_accuracy": 0.9970617219805717, "step": 600 }, { "epoch": 37.576, "grad_norm": 0.5701352106064574, "learning_rate": 3.2408222931422213e-06, "loss": 0.0129, "mean_token_accuracy": 0.9971076399087906, "step": 601 }, { "epoch": 37.64, "grad_norm": 0.6451975113041357, "learning_rate": 3.230962598732258e-06, "loss": 0.0135, "mean_token_accuracy": 0.9971229359507561, "step": 602 }, { "epoch": 37.704, "grad_norm": 0.627007739121674, "learning_rate": 3.2210932167659396e-06, "loss": 0.0135, "mean_token_accuracy": 0.9969392865896225, "step": 603 }, { "epoch": 37.768, "grad_norm": 0.802037351748065, "learning_rate": 3.211214346032519e-06, "loss": 0.0132, "mean_token_accuracy": 0.9968780651688576, "step": 604 }, { "epoch": 37.832, "grad_norm": 0.6179471709338933, "learning_rate": 3.201326185512364e-06, "loss": 0.0137, "mean_token_accuracy": 0.9969545975327492, "step": 605 }, { "epoch": 37.896, "grad_norm": 0.872564897107446, "learning_rate": 3.1914289343729645e-06, "loss": 0.013, "mean_token_accuracy": 0.9969086945056915, "step": 606 }, { "epoch": 37.96, "grad_norm": 0.624186774203636, "learning_rate": 3.18152279196491e-06, "loss": 0.0134, "mean_token_accuracy": 0.9967862516641617, "step": 607 }, { "epoch": 38.0, "grad_norm": 0.624186774203636, "learning_rate": 3.171607957817881e-06, "loss": 0.0139, "mean_token_accuracy": 0.9967434048652649, "step": 608 }, { "epoch": 38.064, "grad_norm": 1.1156731889285663, "learning_rate": 3.1616846316366228e-06, "loss": 0.0123, "mean_token_accuracy": 0.9970566481351852, "step": 609 }, { "epoch": 38.128, "grad_norm": 0.9234205425991477, "learning_rate": 3.1517530132969327e-06, "loss": 0.0107, "mean_token_accuracy": 0.9975055307149887, "step": 610 }, { "epoch": 38.192, "grad_norm": 0.41818822316568255, "learning_rate": 3.141813302841625e-06, "loss": 0.0107, "mean_token_accuracy": 0.9976891726255417, "step": 611 }, { "epoch": 38.256, "grad_norm": 0.3639074832769107, "learning_rate": 3.1318657004765053e-06, "loss": 0.0114, "mean_token_accuracy": 0.9972606673836708, "step": 612 }, { "epoch": 38.32, "grad_norm": 0.5988924018142379, "learning_rate": 3.121910406566342e-06, "loss": 0.012, "mean_token_accuracy": 0.9971382468938828, "step": 613 }, { "epoch": 38.384, "grad_norm": 0.5666499060518836, "learning_rate": 3.111947621630822e-06, "loss": 0.0109, "mean_token_accuracy": 0.9974902272224426, "step": 614 }, { "epoch": 38.448, "grad_norm": 0.46532989644167766, "learning_rate": 3.10197754634052e-06, "loss": 0.0117, "mean_token_accuracy": 0.9973831027746201, "step": 615 }, { "epoch": 38.512, "grad_norm": 0.5679197380383376, "learning_rate": 3.092000381512852e-06, "loss": 0.0117, "mean_token_accuracy": 0.997184157371521, "step": 616 }, { "epoch": 38.576, "grad_norm": 0.7179435729803193, "learning_rate": 3.08201632810803e-06, "loss": 0.0119, "mean_token_accuracy": 0.9972109347581863, "step": 617 }, { "epoch": 38.64, "grad_norm": 0.5923514709882038, "learning_rate": 3.072025587225019e-06, "loss": 0.0115, "mean_token_accuracy": 0.9972453713417053, "step": 618 }, { "epoch": 38.704, "grad_norm": 0.560264194513055, "learning_rate": 3.0620283600974817e-06, "loss": 0.0119, "mean_token_accuracy": 0.9972912818193436, "step": 619 }, { "epoch": 38.768, "grad_norm": 0.5905798589152937, "learning_rate": 3.052024848089725e-06, "loss": 0.013, "mean_token_accuracy": 0.9969545975327492, "step": 620 }, { "epoch": 38.832, "grad_norm": 0.7289094700202001, "learning_rate": 3.0420152526926494e-06, "loss": 0.0121, "mean_token_accuracy": 0.9971994608640671, "step": 621 }, { "epoch": 38.896, "grad_norm": 0.6005712022479127, "learning_rate": 3.031999775519685e-06, "loss": 0.0141, "mean_token_accuracy": 0.9969546049833298, "step": 622 }, { "epoch": 38.96, "grad_norm": 0.7188113692017786, "learning_rate": 3.0219786183027327e-06, "loss": 0.0123, "mean_token_accuracy": 0.9970770254731178, "step": 623 }, { "epoch": 39.0, "grad_norm": 0.9975624462721968, "learning_rate": 3.0119519828881016e-06, "loss": 0.0151, "mean_token_accuracy": 0.996645450592041, "step": 624 }, { "epoch": 39.064, "grad_norm": 0.7638831941827031, "learning_rate": 3.0019200712324433e-06, "loss": 0.011, "mean_token_accuracy": 0.9974443092942238, "step": 625 }, { "epoch": 39.128, "grad_norm": 0.9563739690173727, "learning_rate": 2.991883085398682e-06, "loss": 0.0107, "mean_token_accuracy": 0.9976585656404495, "step": 626 }, { "epoch": 39.192, "grad_norm": 0.4607188447559162, "learning_rate": 2.981841227551946e-06, "loss": 0.0105, "mean_token_accuracy": 0.9975031390786171, "step": 627 }, { "epoch": 39.256, "grad_norm": 0.4624563780058694, "learning_rate": 2.971794699955498e-06, "loss": 0.0111, "mean_token_accuracy": 0.9975208342075348, "step": 628 }, { "epoch": 39.32, "grad_norm": 0.6659333422553714, "learning_rate": 2.9617437049666582e-06, "loss": 0.0115, "mean_token_accuracy": 0.9974137097597122, "step": 629 }, { "epoch": 39.384, "grad_norm": 0.5730734341982656, "learning_rate": 2.951688445032726e-06, "loss": 0.0109, "mean_token_accuracy": 0.9974596202373505, "step": 630 }, { "epoch": 39.448, "grad_norm": 0.5284034646552216, "learning_rate": 2.9416291226869093e-06, "loss": 0.0111, "mean_token_accuracy": 0.9972759783267975, "step": 631 }, { "epoch": 39.512, "grad_norm": 0.6532922484844229, "learning_rate": 2.931565940544239e-06, "loss": 0.0126, "mean_token_accuracy": 0.9970158040523529, "step": 632 }, { "epoch": 39.576, "grad_norm": 0.677933711768172, "learning_rate": 2.921499101297492e-06, "loss": 0.0123, "mean_token_accuracy": 0.997119814157486, "step": 633 }, { "epoch": 39.64, "grad_norm": 0.5227923485149196, "learning_rate": 2.9114288077131037e-06, "loss": 0.011, "mean_token_accuracy": 0.9973371922969818, "step": 634 }, { "epoch": 39.704, "grad_norm": 0.6226210308194157, "learning_rate": 2.90135526262709e-06, "loss": 0.0115, "mean_token_accuracy": 0.997367799282074, "step": 635 }, { "epoch": 39.768, "grad_norm": 0.47626518634011616, "learning_rate": 2.8912786689409556e-06, "loss": 0.0114, "mean_token_accuracy": 0.997306577861309, "step": 636 }, { "epoch": 39.832, "grad_norm": 0.6425241603016666, "learning_rate": 2.881199229617613e-06, "loss": 0.0121, "mean_token_accuracy": 0.9971076399087906, "step": 637 }, { "epoch": 39.896, "grad_norm": 0.6386322575324032, "learning_rate": 2.8711171476772888e-06, "loss": 0.0113, "mean_token_accuracy": 0.997184157371521, "step": 638 }, { "epoch": 39.96, "grad_norm": 0.663520727634181, "learning_rate": 2.8610326261934405e-06, "loss": 0.0121, "mean_token_accuracy": 0.9971841499209404, "step": 639 }, { "epoch": 40.0, "grad_norm": 0.8534807532398108, "learning_rate": 2.850945868288659e-06, "loss": 0.0131, "mean_token_accuracy": 0.9970617175102234, "step": 640 }, { "epoch": 40.064, "grad_norm": 0.4165193803458853, "learning_rate": 2.8408570771305843e-06, "loss": 0.0104, "mean_token_accuracy": 0.9976738691329956, "step": 641 }, { "epoch": 40.128, "grad_norm": 0.5205781495841734, "learning_rate": 2.8307664559278107e-06, "loss": 0.0103, "mean_token_accuracy": 0.9974749237298965, "step": 642 }, { "epoch": 40.192, "grad_norm": 0.6673932626786626, "learning_rate": 2.820674207925789e-06, "loss": 0.0109, "mean_token_accuracy": 0.9972759708762169, "step": 643 }, { "epoch": 40.256, "grad_norm": 0.5134106384388535, "learning_rate": 2.8105805364027417e-06, "loss": 0.0109, "mean_token_accuracy": 0.9973183423280716, "step": 644 }, { "epoch": 40.32, "grad_norm": 0.8381774852877406, "learning_rate": 2.8004856446655614e-06, "loss": 0.0097, "mean_token_accuracy": 0.9978269040584564, "step": 645 }, { "epoch": 40.384, "grad_norm": 0.5475707119524557, "learning_rate": 2.7903897360457195e-06, "loss": 0.0108, "mean_token_accuracy": 0.9973984062671661, "step": 646 }, { "epoch": 40.448, "grad_norm": 0.6293161601765511, "learning_rate": 2.780293013895167e-06, "loss": 0.0114, "mean_token_accuracy": 0.9972300603985786, "step": 647 }, { "epoch": 40.512, "grad_norm": 0.7942973920056281, "learning_rate": 2.7701956815822454e-06, "loss": 0.0117, "mean_token_accuracy": 0.9972291812300682, "step": 648 }, { "epoch": 40.576, "grad_norm": 0.9474600904112908, "learning_rate": 2.760097942487583e-06, "loss": 0.0133, "mean_token_accuracy": 0.9971688464283943, "step": 649 }, { "epoch": 40.64, "grad_norm": 0.7076721358958215, "learning_rate": 2.7500000000000004e-06, "loss": 0.0114, "mean_token_accuracy": 0.9972912818193436, "step": 650 }, { "epoch": 40.704, "grad_norm": 0.6471355626469086, "learning_rate": 2.739902057512418e-06, "loss": 0.011, "mean_token_accuracy": 0.997367799282074, "step": 651 }, { "epoch": 40.768, "grad_norm": 0.4382435595877183, "learning_rate": 2.729804318417755e-06, "loss": 0.0108, "mean_token_accuracy": 0.9975055232644081, "step": 652 }, { "epoch": 40.832, "grad_norm": 0.5010873222387526, "learning_rate": 2.7197069861048327e-06, "loss": 0.0116, "mean_token_accuracy": 0.9970923289656639, "step": 653 }, { "epoch": 40.896, "grad_norm": 0.6187389148436806, "learning_rate": 2.709610263954282e-06, "loss": 0.0117, "mean_token_accuracy": 0.9971688538789749, "step": 654 }, { "epoch": 40.96, "grad_norm": 0.6898530358492299, "learning_rate": 2.6995143553344393e-06, "loss": 0.0117, "mean_token_accuracy": 0.9973983988165855, "step": 655 }, { "epoch": 41.0, "grad_norm": 0.6898530358492299, "learning_rate": 2.689419463597259e-06, "loss": 0.0116, "mean_token_accuracy": 0.9971596717834472, "step": 656 }, { "epoch": 41.064, "grad_norm": 0.7236757229820152, "learning_rate": 2.679325792074212e-06, "loss": 0.01, "mean_token_accuracy": 0.997551441192627, "step": 657 }, { "epoch": 41.128, "grad_norm": 0.46315648307241575, "learning_rate": 2.66923354407219e-06, "loss": 0.0099, "mean_token_accuracy": 0.9976738691329956, "step": 658 }, { "epoch": 41.192, "grad_norm": 0.39205603611160184, "learning_rate": 2.659142922869416e-06, "loss": 0.0102, "mean_token_accuracy": 0.9975820481777191, "step": 659 }, { "epoch": 41.256, "grad_norm": 0.4517989146543906, "learning_rate": 2.649054131711343e-06, "loss": 0.0098, "mean_token_accuracy": 0.997566744685173, "step": 660 }, { "epoch": 41.32, "grad_norm": 0.6389545858962598, "learning_rate": 2.6389673738065615e-06, "loss": 0.0106, "mean_token_accuracy": 0.9975973516702652, "step": 661 }, { "epoch": 41.384, "grad_norm": 0.44858889919354483, "learning_rate": 2.6288828523227115e-06, "loss": 0.0107, "mean_token_accuracy": 0.9972429797053337, "step": 662 }, { "epoch": 41.448, "grad_norm": 0.4944223413056835, "learning_rate": 2.6188007703823886e-06, "loss": 0.0103, "mean_token_accuracy": 0.9973831027746201, "step": 663 }, { "epoch": 41.512, "grad_norm": 0.531686175736076, "learning_rate": 2.6087213310590455e-06, "loss": 0.0112, "mean_token_accuracy": 0.9973371922969818, "step": 664 }, { "epoch": 41.576, "grad_norm": 0.9385393222342217, "learning_rate": 2.5986447373729113e-06, "loss": 0.0113, "mean_token_accuracy": 0.9971841499209404, "step": 665 }, { "epoch": 41.64, "grad_norm": 0.472206360683089, "learning_rate": 2.5885711922868966e-06, "loss": 0.0105, "mean_token_accuracy": 0.997456781566143, "step": 666 }, { "epoch": 41.704, "grad_norm": 0.5007381968152668, "learning_rate": 2.5785008987025097e-06, "loss": 0.0104, "mean_token_accuracy": 0.9975973516702652, "step": 667 }, { "epoch": 41.768, "grad_norm": 0.5396194525393733, "learning_rate": 2.5684340594557616e-06, "loss": 0.0111, "mean_token_accuracy": 0.9974290132522583, "step": 668 }, { "epoch": 41.832, "grad_norm": 0.5760242778960697, "learning_rate": 2.5583708773130922e-06, "loss": 0.0111, "mean_token_accuracy": 0.9971535503864288, "step": 669 }, { "epoch": 41.896, "grad_norm": 0.5892431612364954, "learning_rate": 2.5483115549672748e-06, "loss": 0.0124, "mean_token_accuracy": 0.9971229434013367, "step": 670 }, { "epoch": 41.96, "grad_norm": 0.5280434128744002, "learning_rate": 2.538256295033343e-06, "loss": 0.0118, "mean_token_accuracy": 0.9971076399087906, "step": 671 }, { "epoch": 42.0, "grad_norm": 1.0474016065594887, "learning_rate": 2.5282053000445022e-06, "loss": 0.0135, "mean_token_accuracy": 0.9967189192771911, "step": 672 }, { "epoch": 42.064, "grad_norm": 0.5656166822835598, "learning_rate": 2.5181587724480557e-06, "loss": 0.0097, "mean_token_accuracy": 0.9976891726255417, "step": 673 }, { "epoch": 42.128, "grad_norm": 0.4909215344916955, "learning_rate": 2.5081169146013196e-06, "loss": 0.0105, "mean_token_accuracy": 0.9975055307149887, "step": 674 }, { "epoch": 42.192, "grad_norm": 0.7052948258193316, "learning_rate": 2.498079928767558e-06, "loss": 0.0101, "mean_token_accuracy": 0.9974902272224426, "step": 675 }, { "epoch": 42.256, "grad_norm": 0.5722745698716226, "learning_rate": 2.488048017111899e-06, "loss": 0.0113, "mean_token_accuracy": 0.9973831027746201, "step": 676 }, { "epoch": 42.32, "grad_norm": 0.6843538414030581, "learning_rate": 2.478021381697268e-06, "loss": 0.0103, "mean_token_accuracy": 0.9975973516702652, "step": 677 }, { "epoch": 42.384, "grad_norm": 0.7867560052243934, "learning_rate": 2.4680002244803154e-06, "loss": 0.0108, "mean_token_accuracy": 0.997395284473896, "step": 678 }, { "epoch": 42.448, "grad_norm": 0.644452799517488, "learning_rate": 2.457984747307351e-06, "loss": 0.0108, "mean_token_accuracy": 0.9974443167448044, "step": 679 }, { "epoch": 42.512, "grad_norm": 0.5489215454084088, "learning_rate": 2.447975151910276e-06, "loss": 0.0105, "mean_token_accuracy": 0.9974137023091316, "step": 680 }, { "epoch": 42.576, "grad_norm": 0.6730293569309618, "learning_rate": 2.43797163990252e-06, "loss": 0.0111, "mean_token_accuracy": 0.9971382394433022, "step": 681 }, { "epoch": 42.64, "grad_norm": 0.5934605852042939, "learning_rate": 2.4279744127749816e-06, "loss": 0.011, "mean_token_accuracy": 0.9973371922969818, "step": 682 }, { "epoch": 42.704, "grad_norm": 0.46213644873616466, "learning_rate": 2.4179836718919707e-06, "loss": 0.0105, "mean_token_accuracy": 0.9972759783267975, "step": 683 }, { "epoch": 42.768, "grad_norm": 0.6193935281286815, "learning_rate": 2.407999618487149e-06, "loss": 0.0114, "mean_token_accuracy": 0.9973218888044357, "step": 684 }, { "epoch": 42.832, "grad_norm": 0.6979821967303077, "learning_rate": 2.3980224536594806e-06, "loss": 0.0127, "mean_token_accuracy": 0.9969086721539497, "step": 685 }, { "epoch": 42.896, "grad_norm": 0.789001343233974, "learning_rate": 2.3880523783691796e-06, "loss": 0.0123, "mean_token_accuracy": 0.997028723359108, "step": 686 }, { "epoch": 42.96, "grad_norm": 0.8792177333772107, "learning_rate": 2.3780895934336597e-06, "loss": 0.0113, "mean_token_accuracy": 0.9971382468938828, "step": 687 }, { "epoch": 43.0, "grad_norm": 0.8792177333772107, "learning_rate": 2.368134299523495e-06, "loss": 0.0115, "mean_token_accuracy": 0.9972576022148132, "step": 688 }, { "epoch": 43.064, "grad_norm": 1.1639832706566282, "learning_rate": 2.358186697158376e-06, "loss": 0.0097, "mean_token_accuracy": 0.9977044761180878, "step": 689 }, { "epoch": 43.128, "grad_norm": 0.4520443771688941, "learning_rate": 2.3482469867030676e-06, "loss": 0.0108, "mean_token_accuracy": 0.997306577861309, "step": 690 }, { "epoch": 43.192, "grad_norm": 0.5381457008485614, "learning_rate": 2.338315368363377e-06, "loss": 0.01, "mean_token_accuracy": 0.9976246953010559, "step": 691 }, { "epoch": 43.256, "grad_norm": 0.4252129735839273, "learning_rate": 2.3283920421821194e-06, "loss": 0.0112, "mean_token_accuracy": 0.9973831027746201, "step": 692 }, { "epoch": 43.32, "grad_norm": 0.6712697101774623, "learning_rate": 2.3184772080350903e-06, "loss": 0.0113, "mean_token_accuracy": 0.9971994608640671, "step": 693 }, { "epoch": 43.384, "grad_norm": 0.6769535009778792, "learning_rate": 2.3085710656270367e-06, "loss": 0.0111, "mean_token_accuracy": 0.9973371922969818, "step": 694 }, { "epoch": 43.448, "grad_norm": 0.5945695035935201, "learning_rate": 2.2986738144876362e-06, "loss": 0.0105, "mean_token_accuracy": 0.9973524883389473, "step": 695 }, { "epoch": 43.512, "grad_norm": 0.6254937086774944, "learning_rate": 2.288785653967482e-06, "loss": 0.01, "mean_token_accuracy": 0.9976279586553574, "step": 696 }, { "epoch": 43.576, "grad_norm": 0.6796085457642533, "learning_rate": 2.27890678323406e-06, "loss": 0.0104, "mean_token_accuracy": 0.9976126551628113, "step": 697 }, { "epoch": 43.64, "grad_norm": 0.5945555207049963, "learning_rate": 2.2690374012677423e-06, "loss": 0.011, "mean_token_accuracy": 0.9972759783267975, "step": 698 }, { "epoch": 43.704, "grad_norm": 0.5758907467379553, "learning_rate": 2.2591777068577802e-06, "loss": 0.0106, "mean_token_accuracy": 0.9972444847226143, "step": 699 }, { "epoch": 43.768, "grad_norm": 0.6080710552937597, "learning_rate": 2.2493278985982932e-06, "loss": 0.011, "mean_token_accuracy": 0.9972147569060326, "step": 700 }, { "epoch": 43.832, "grad_norm": 0.5497246394338794, "learning_rate": 2.2394881748842782e-06, "loss": 0.0101, "mean_token_accuracy": 0.9975208342075348, "step": 701 }, { "epoch": 43.896, "grad_norm": 0.445877691885476, "learning_rate": 2.2296587339076082e-06, "loss": 0.0109, "mean_token_accuracy": 0.9974749237298965, "step": 702 }, { "epoch": 43.96, "grad_norm": 0.559997751873104, "learning_rate": 2.2198397736530396e-06, "loss": 0.011, "mean_token_accuracy": 0.9973371922969818, "step": 703 }, { "epoch": 44.0, "grad_norm": 0.9455909941718332, "learning_rate": 2.2100314918942267e-06, "loss": 0.0109, "mean_token_accuracy": 0.9970862150192261, "step": 704 }, { "epoch": 44.064, "grad_norm": 0.8211865028629974, "learning_rate": 2.2002340861897382e-06, "loss": 0.0094, "mean_token_accuracy": 0.9976279586553574, "step": 705 }, { "epoch": 44.128, "grad_norm": 0.3496473820023393, "learning_rate": 2.1904477538790776e-06, "loss": 0.009, "mean_token_accuracy": 0.9978728145360947, "step": 706 }, { "epoch": 44.192, "grad_norm": 0.39419193227075044, "learning_rate": 2.180672692078704e-06, "loss": 0.0095, "mean_token_accuracy": 0.9976585656404495, "step": 707 }, { "epoch": 44.256, "grad_norm": 0.43561776419350157, "learning_rate": 2.1709090976780707e-06, "loss": 0.0093, "mean_token_accuracy": 0.997750386595726, "step": 708 }, { "epoch": 44.32, "grad_norm": 0.4087930781873581, "learning_rate": 2.1611571673356503e-06, "loss": 0.0103, "mean_token_accuracy": 0.9974479228258133, "step": 709 }, { "epoch": 44.384, "grad_norm": 0.5392256498495991, "learning_rate": 2.1514170974749817e-06, "loss": 0.0113, "mean_token_accuracy": 0.9974137023091316, "step": 710 }, { "epoch": 44.448, "grad_norm": 0.8872781206808652, "learning_rate": 2.141689084280706e-06, "loss": 0.0105, "mean_token_accuracy": 0.997272714972496, "step": 711 }, { "epoch": 44.512, "grad_norm": 0.45570792799208193, "learning_rate": 2.131973323694623e-06, "loss": 0.0118, "mean_token_accuracy": 0.9972300678491592, "step": 712 }, { "epoch": 44.576, "grad_norm": 0.7468669188621526, "learning_rate": 2.1222700114117344e-06, "loss": 0.0104, "mean_token_accuracy": 0.9973984062671661, "step": 713 }, { "epoch": 44.64, "grad_norm": 0.39692078650866014, "learning_rate": 2.1125793428763146e-06, "loss": 0.0103, "mean_token_accuracy": 0.9974749237298965, "step": 714 }, { "epoch": 44.704, "grad_norm": 0.5767455515325547, "learning_rate": 2.1029015132779616e-06, "loss": 0.01, "mean_token_accuracy": 0.9974290132522583, "step": 715 }, { "epoch": 44.768, "grad_norm": 0.5130351699263059, "learning_rate": 2.0932367175476752e-06, "loss": 0.0107, "mean_token_accuracy": 0.9973831027746201, "step": 716 }, { "epoch": 44.832, "grad_norm": 0.6465761953935526, "learning_rate": 2.0835851503539255e-06, "loss": 0.0108, "mean_token_accuracy": 0.9973984062671661, "step": 717 }, { "epoch": 44.896, "grad_norm": 0.4033731412492044, "learning_rate": 2.0739470060987333e-06, "loss": 0.0095, "mean_token_accuracy": 0.9976126551628113, "step": 718 }, { "epoch": 44.96, "grad_norm": 0.3586751351844863, "learning_rate": 2.064322478913755e-06, "loss": 0.0104, "mean_token_accuracy": 0.9976432621479034, "step": 719 }, { "epoch": 45.0, "grad_norm": 0.6555959700890318, "learning_rate": 2.054711762656369e-06, "loss": 0.01, "mean_token_accuracy": 0.9973800420761109, "step": 720 }, { "epoch": 45.064, "grad_norm": 0.40832188354167814, "learning_rate": 2.045115050905777e-06, "loss": 0.0091, "mean_token_accuracy": 0.9976585656404495, "step": 721 }, { "epoch": 45.128, "grad_norm": 0.41058465820409745, "learning_rate": 2.0355325369591006e-06, "loss": 0.0095, "mean_token_accuracy": 0.9976126551628113, "step": 722 }, { "epoch": 45.192, "grad_norm": 0.5652309703047484, "learning_rate": 2.0259644138274887e-06, "loss": 0.0104, "mean_token_accuracy": 0.9974596202373505, "step": 723 }, { "epoch": 45.256, "grad_norm": 0.31017365712695993, "learning_rate": 2.016410874232231e-06, "loss": 0.0089, "mean_token_accuracy": 0.9977656900882721, "step": 724 }, { "epoch": 45.32, "grad_norm": 0.4629502147033417, "learning_rate": 2.0068721106008746e-06, "loss": 0.0096, "mean_token_accuracy": 0.9975361302495003, "step": 725 }, { "epoch": 45.384, "grad_norm": 0.43823796735968357, "learning_rate": 1.9973483150633483e-06, "loss": 0.0092, "mean_token_accuracy": 0.9978881180286407, "step": 726 }, { "epoch": 45.448, "grad_norm": 0.2816645748435719, "learning_rate": 1.9878396794480947e-06, "loss": 0.0091, "mean_token_accuracy": 0.9976585656404495, "step": 727 }, { "epoch": 45.512, "grad_norm": 0.2908831632520279, "learning_rate": 1.9783463952782046e-06, "loss": 0.0091, "mean_token_accuracy": 0.9974596202373505, "step": 728 }, { "epoch": 45.576, "grad_norm": 0.40412815503664357, "learning_rate": 1.968868653767559e-06, "loss": 0.0092, "mean_token_accuracy": 0.9976585656404495, "step": 729 }, { "epoch": 45.64, "grad_norm": 0.5695184327648838, "learning_rate": 1.959406645816979e-06, "loss": 0.0102, "mean_token_accuracy": 0.9972147569060326, "step": 730 }, { "epoch": 45.704, "grad_norm": 0.3182964759850391, "learning_rate": 1.9499605620103807e-06, "loss": 0.0094, "mean_token_accuracy": 0.9977197796106339, "step": 731 }, { "epoch": 45.768, "grad_norm": 0.4066506305030847, "learning_rate": 1.940530592610935e-06, "loss": 0.0093, "mean_token_accuracy": 0.9976562932133675, "step": 732 }, { "epoch": 45.832, "grad_norm": 0.44634315418522186, "learning_rate": 1.9311169275572356e-06, "loss": 0.0096, "mean_token_accuracy": 0.9975856617093086, "step": 733 }, { "epoch": 45.896, "grad_norm": 0.39205164906449547, "learning_rate": 1.9217197564594746e-06, "loss": 0.0096, "mean_token_accuracy": 0.997566744685173, "step": 734 }, { "epoch": 45.96, "grad_norm": 0.3313544451499269, "learning_rate": 1.912339268595624e-06, "loss": 0.0096, "mean_token_accuracy": 0.9974596202373505, "step": 735 }, { "epoch": 46.0, "grad_norm": 0.3313544451499269, "learning_rate": 1.9029756529076183e-06, "loss": 0.0094, "mean_token_accuracy": 0.9977473258972168, "step": 736 }, { "epoch": 46.064, "grad_norm": 0.5284163935192331, "learning_rate": 1.8936290979975557e-06, "loss": 0.0086, "mean_token_accuracy": 0.9978116005659103, "step": 737 }, { "epoch": 46.128, "grad_norm": 0.2356875163765209, "learning_rate": 1.8842997921238942e-06, "loss": 0.0088, "mean_token_accuracy": 0.9976432621479034, "step": 738 }, { "epoch": 46.192, "grad_norm": 0.38731964204273156, "learning_rate": 1.8749879231976625e-06, "loss": 0.0088, "mean_token_accuracy": 0.9978137090802193, "step": 739 }, { "epoch": 46.256, "grad_norm": 0.3605993500165629, "learning_rate": 1.8656936787786722e-06, "loss": 0.0089, "mean_token_accuracy": 0.9975361377000809, "step": 740 }, { "epoch": 46.32, "grad_norm": 0.4205001794061004, "learning_rate": 1.856417246071744e-06, "loss": 0.0091, "mean_token_accuracy": 0.9978116005659103, "step": 741 }, { "epoch": 46.384, "grad_norm": 0.38574961664437213, "learning_rate": 1.8471588119229333e-06, "loss": 0.009, "mean_token_accuracy": 0.9976714551448822, "step": 742 }, { "epoch": 46.448, "grad_norm": 0.6461515159169602, "learning_rate": 1.8379185628157692e-06, "loss": 0.0091, "mean_token_accuracy": 0.9977044761180878, "step": 743 }, { "epoch": 46.512, "grad_norm": 0.3430837172100604, "learning_rate": 1.828696684867497e-06, "loss": 0.0088, "mean_token_accuracy": 0.9976126551628113, "step": 744 }, { "epoch": 46.576, "grad_norm": 0.3640191208329963, "learning_rate": 1.8194933638253293e-06, "loss": 0.0109, "mean_token_accuracy": 0.9976738691329956, "step": 745 }, { "epoch": 46.64, "grad_norm": 0.6641808943197925, "learning_rate": 1.8103087850627055e-06, "loss": 0.0094, "mean_token_accuracy": 0.9976585656404495, "step": 746 }, { "epoch": 46.704, "grad_norm": 0.34152584166283884, "learning_rate": 1.8011431335755585e-06, "loss": 0.0096, "mean_token_accuracy": 0.9974290058016777, "step": 747 }, { "epoch": 46.768, "grad_norm": 0.3138674568896022, "learning_rate": 1.7919965939785867e-06, "loss": 0.0095, "mean_token_accuracy": 0.9976585656404495, "step": 748 }, { "epoch": 46.832, "grad_norm": 0.3591591047523301, "learning_rate": 1.782869350501537e-06, "loss": 0.0096, "mean_token_accuracy": 0.9976891726255417, "step": 749 }, { "epoch": 46.896, "grad_norm": 0.40257498778333656, "learning_rate": 1.7737615869854945e-06, "loss": 0.0091, "mean_token_accuracy": 0.9974902272224426, "step": 750 }, { "epoch": 46.96, "grad_norm": 0.4412716103461209, "learning_rate": 1.7646734868791766e-06, "loss": 0.0096, "mean_token_accuracy": 0.9976585656404495, "step": 751 }, { "epoch": 47.0, "grad_norm": 0.4161981983020832, "learning_rate": 1.7556052332352407e-06, "loss": 0.0091, "mean_token_accuracy": 0.9976248979568482, "step": 752 }, { "epoch": 47.064, "grad_norm": 0.2832828810414189, "learning_rate": 1.7465570087065958e-06, "loss": 0.0084, "mean_token_accuracy": 0.9977656900882721, "step": 753 }, { "epoch": 47.128, "grad_norm": 0.21832612594080264, "learning_rate": 1.7375289955427276e-06, "loss": 0.0085, "mean_token_accuracy": 0.9978881180286407, "step": 754 }, { "epoch": 47.192, "grad_norm": 0.3204972605009617, "learning_rate": 1.72852137558602e-06, "loss": 0.0084, "mean_token_accuracy": 0.9977782964706421, "step": 755 }, { "epoch": 47.256, "grad_norm": 0.27341340204839315, "learning_rate": 1.7195343302680987e-06, "loss": 0.0089, "mean_token_accuracy": 0.9976279586553574, "step": 756 }, { "epoch": 47.32, "grad_norm": 0.3301760285983261, "learning_rate": 1.7105680406061737e-06, "loss": 0.0091, "mean_token_accuracy": 0.9977350831031799, "step": 757 }, { "epoch": 47.384, "grad_norm": 0.391122737310084, "learning_rate": 1.7016226871993951e-06, "loss": 0.0088, "mean_token_accuracy": 0.9976432621479034, "step": 758 }, { "epoch": 47.448, "grad_norm": 0.2851247761949009, "learning_rate": 1.6926984502252131e-06, "loss": 0.0087, "mean_token_accuracy": 0.9977248907089233, "step": 759 }, { "epoch": 47.512, "grad_norm": 0.30022412216882766, "learning_rate": 1.6837955094357533e-06, "loss": 0.0093, "mean_token_accuracy": 0.9977350831031799, "step": 760 }, { "epoch": 47.576, "grad_norm": 0.3900974688277349, "learning_rate": 1.674914044154191e-06, "loss": 0.0087, "mean_token_accuracy": 0.9975361377000809, "step": 761 }, { "epoch": 47.64, "grad_norm": 0.24104452472381788, "learning_rate": 1.6660542332711405e-06, "loss": 0.009, "mean_token_accuracy": 0.9976738691329956, "step": 762 }, { "epoch": 47.704, "grad_norm": 0.34273317391720776, "learning_rate": 1.6572162552410565e-06, "loss": 0.009, "mean_token_accuracy": 0.9978575110435486, "step": 763 }, { "epoch": 47.768, "grad_norm": 0.31192816934571316, "learning_rate": 1.6484002880786332e-06, "loss": 0.0088, "mean_token_accuracy": 0.9976432621479034, "step": 764 }, { "epoch": 47.832, "grad_norm": 0.38294183497031853, "learning_rate": 1.63960650935522e-06, "loss": 0.0091, "mean_token_accuracy": 0.9976891726255417, "step": 765 }, { "epoch": 47.896, "grad_norm": 0.3213590818377222, "learning_rate": 1.630835096195251e-06, "loss": 0.0088, "mean_token_accuracy": 0.997551441192627, "step": 766 }, { "epoch": 47.96, "grad_norm": 0.31375674558467337, "learning_rate": 1.6220862252726694e-06, "loss": 0.0089, "mean_token_accuracy": 0.9977044761180878, "step": 767 }, { "epoch": 48.0, "grad_norm": 0.3320825690642502, "learning_rate": 1.6133600728073728e-06, "loss": 0.0091, "mean_token_accuracy": 0.9974779844284057, "step": 768 }, { "epoch": 48.064, "grad_norm": 0.34059415297702106, "learning_rate": 1.6046568145616647e-06, "loss": 0.0083, "mean_token_accuracy": 0.9977627098560333, "step": 769 }, { "epoch": 48.128, "grad_norm": 0.3038296558736781, "learning_rate": 1.5959766258367115e-06, "loss": 0.0087, "mean_token_accuracy": 0.9977350831031799, "step": 770 }, { "epoch": 48.192, "grad_norm": 0.4008354211562346, "learning_rate": 1.5873196814690131e-06, "loss": 0.0084, "mean_token_accuracy": 0.9979646354913712, "step": 771 }, { "epoch": 48.256, "grad_norm": 0.36851928884792756, "learning_rate": 1.5786861558268815e-06, "loss": 0.0085, "mean_token_accuracy": 0.9978269040584564, "step": 772 }, { "epoch": 48.32, "grad_norm": 0.18445835327433446, "learning_rate": 1.5700762228069271e-06, "loss": 0.0084, "mean_token_accuracy": 0.9978881180286407, "step": 773 }, { "epoch": 48.384, "grad_norm": 0.18144517292595827, "learning_rate": 1.5614900558305603e-06, "loss": 0.0082, "mean_token_accuracy": 0.9979187250137329, "step": 774 }, { "epoch": 48.448, "grad_norm": 0.21932765256375614, "learning_rate": 1.552927827840493e-06, "loss": 0.0084, "mean_token_accuracy": 0.9977656900882721, "step": 775 }, { "epoch": 48.512, "grad_norm": 0.2142335192812177, "learning_rate": 1.5443897112972571e-06, "loss": 0.0084, "mean_token_accuracy": 0.9977809935808182, "step": 776 }, { "epoch": 48.576, "grad_norm": 0.21850798642766353, "learning_rate": 1.5358758781757356e-06, "loss": 0.0085, "mean_token_accuracy": 0.9978728145360947, "step": 777 }, { "epoch": 48.64, "grad_norm": 0.2507445542309199, "learning_rate": 1.527386499961688e-06, "loss": 0.009, "mean_token_accuracy": 0.9973831027746201, "step": 778 }, { "epoch": 48.704, "grad_norm": 0.2040594952397198, "learning_rate": 1.5189217476483067e-06, "loss": 0.0087, "mean_token_accuracy": 0.9975973516702652, "step": 779 }, { "epoch": 48.768, "grad_norm": 0.27687976808026465, "learning_rate": 1.5104817917327696e-06, "loss": 0.0091, "mean_token_accuracy": 0.9977197796106339, "step": 780 }, { "epoch": 48.832, "grad_norm": 0.2883239728320552, "learning_rate": 1.5020668022128029e-06, "loss": 0.0087, "mean_token_accuracy": 0.997750386595726, "step": 781 }, { "epoch": 48.896, "grad_norm": 0.41897703803928205, "learning_rate": 1.4936769485832593e-06, "loss": 0.009, "mean_token_accuracy": 0.9973831027746201, "step": 782 }, { "epoch": 48.96, "grad_norm": 0.3663075931183914, "learning_rate": 1.4853123998327068e-06, "loss": 0.0092, "mean_token_accuracy": 0.9975106418132782, "step": 783 }, { "epoch": 49.0, "grad_norm": 0.3663075931183914, "learning_rate": 1.4769733244400175e-06, "loss": 0.0091, "mean_token_accuracy": 0.9976248979568482, "step": 784 }, { "epoch": 49.064, "grad_norm": 0.488258719545098, "learning_rate": 1.468659890370983e-06, "loss": 0.0083, "mean_token_accuracy": 0.9978575110435486, "step": 785 }, { "epoch": 49.128, "grad_norm": 0.24704686604308318, "learning_rate": 1.4603722650749253e-06, "loss": 0.0086, "mean_token_accuracy": 0.9976738691329956, "step": 786 }, { "epoch": 49.192, "grad_norm": 0.1921189707644135, "learning_rate": 1.4521106154813243e-06, "loss": 0.0083, "mean_token_accuracy": 0.9977937415242195, "step": 787 }, { "epoch": 49.256, "grad_norm": 0.263735461832981, "learning_rate": 1.4438751079964596e-06, "loss": 0.0082, "mean_token_accuracy": 0.9977809935808182, "step": 788 }, { "epoch": 49.32, "grad_norm": 0.16773585523854326, "learning_rate": 1.4356659085000547e-06, "loss": 0.0082, "mean_token_accuracy": 0.9977809935808182, "step": 789 }, { "epoch": 49.384, "grad_norm": 0.19388384412231485, "learning_rate": 1.427483182341936e-06, "loss": 0.0081, "mean_token_accuracy": 0.9978269040584564, "step": 790 }, { "epoch": 49.448, "grad_norm": 0.19811291280204796, "learning_rate": 1.419327094338705e-06, "loss": 0.0083, "mean_token_accuracy": 0.9977386966347694, "step": 791 }, { "epoch": 49.512, "grad_norm": 0.4710968266343195, "learning_rate": 1.411197808770417e-06, "loss": 0.0083, "mean_token_accuracy": 0.9978881180286407, "step": 792 }, { "epoch": 49.576, "grad_norm": 0.2075362434024561, "learning_rate": 1.4030954893772708e-06, "loss": 0.0085, "mean_token_accuracy": 0.9977350831031799, "step": 793 }, { "epoch": 49.64, "grad_norm": 0.2113704124916447, "learning_rate": 1.3950202993563152e-06, "loss": 0.0085, "mean_token_accuracy": 0.9977962970733643, "step": 794 }, { "epoch": 49.704, "grad_norm": 0.2641275394977783, "learning_rate": 1.3869724013581556e-06, "loss": 0.0085, "mean_token_accuracy": 0.9978422075510025, "step": 795 }, { "epoch": 49.768, "grad_norm": 0.3290166272594496, "learning_rate": 1.378951957483682e-06, "loss": 0.0088, "mean_token_accuracy": 0.9977962970733643, "step": 796 }, { "epoch": 49.832, "grad_norm": 0.2488571222203342, "learning_rate": 1.3709591292808037e-06, "loss": 0.0085, "mean_token_accuracy": 0.997551441192627, "step": 797 }, { "epoch": 49.896, "grad_norm": 0.24102999379606427, "learning_rate": 1.3629940777411931e-06, "loss": 0.0095, "mean_token_accuracy": 0.9976738691329956, "step": 798 }, { "epoch": 49.96, "grad_norm": 0.38674374896600117, "learning_rate": 1.3550569632970462e-06, "loss": 0.0087, "mean_token_accuracy": 0.9976738691329956, "step": 799 }, { "epoch": 50.0, "grad_norm": 0.23348364737078411, "learning_rate": 1.3471479458178499e-06, "loss": 0.0088, "mean_token_accuracy": 0.9976983547210694, "step": 800 }, { "epoch": 50.064, "grad_norm": 0.23632095669865363, "learning_rate": 1.3392671846071598e-06, "loss": 0.0081, "mean_token_accuracy": 0.9979034215211868, "step": 801 }, { "epoch": 50.128, "grad_norm": 0.26316401122618605, "learning_rate": 1.331414838399394e-06, "loss": 0.0082, "mean_token_accuracy": 0.9978422075510025, "step": 802 }, { "epoch": 50.192, "grad_norm": 0.163050814225024, "learning_rate": 1.3235910653566369e-06, "loss": 0.008, "mean_token_accuracy": 0.9978422075510025, "step": 803 }, { "epoch": 50.256, "grad_norm": 0.13436863370460767, "learning_rate": 1.3157960230654482e-06, "loss": 0.008, "mean_token_accuracy": 0.9979034215211868, "step": 804 }, { "epoch": 50.32, "grad_norm": 0.18787063464267242, "learning_rate": 1.308029868533696e-06, "loss": 0.0082, "mean_token_accuracy": 0.9978575110435486, "step": 805 }, { "epoch": 50.384, "grad_norm": 0.19127962214373412, "learning_rate": 1.300292758187388e-06, "loss": 0.0081, "mean_token_accuracy": 0.9976126551628113, "step": 806 }, { "epoch": 50.448, "grad_norm": 0.29131396896300116, "learning_rate": 1.2925848478675245e-06, "loss": 0.0082, "mean_token_accuracy": 0.9976432621479034, "step": 807 }, { "epoch": 50.512, "grad_norm": 0.26381738871177507, "learning_rate": 1.2849062928269595e-06, "loss": 0.0083, "mean_token_accuracy": 0.9979187250137329, "step": 808 }, { "epoch": 50.576, "grad_norm": 0.7292060496015264, "learning_rate": 1.2772572477272717e-06, "loss": 0.0108, "mean_token_accuracy": 0.9973830953240395, "step": 809 }, { "epoch": 50.64, "grad_norm": 0.5357785067923548, "learning_rate": 1.2696378666356468e-06, "loss": 0.0084, "mean_token_accuracy": 0.997762568295002, "step": 810 }, { "epoch": 50.704, "grad_norm": 0.19374371653128392, "learning_rate": 1.262048303021784e-06, "loss": 0.0089, "mean_token_accuracy": 0.9977044761180878, "step": 811 }, { "epoch": 50.768, "grad_norm": 0.29372930267383707, "learning_rate": 1.254488709754793e-06, "loss": 0.0089, "mean_token_accuracy": 0.9974770322442055, "step": 812 }, { "epoch": 50.832, "grad_norm": 0.2656571476664123, "learning_rate": 1.2469592391001222e-06, "loss": 0.0086, "mean_token_accuracy": 0.9978575110435486, "step": 813 }, { "epoch": 50.896, "grad_norm": 0.27752205085870507, "learning_rate": 1.2394600427164912e-06, "loss": 0.0086, "mean_token_accuracy": 0.9976738691329956, "step": 814 }, { "epoch": 50.96, "grad_norm": 0.15467959920167934, "learning_rate": 1.2319912716528329e-06, "loss": 0.0087, "mean_token_accuracy": 0.9976126551628113, "step": 815 }, { "epoch": 51.0, "grad_norm": 0.15467959920167934, "learning_rate": 1.2245530763452537e-06, "loss": 0.0086, "mean_token_accuracy": 0.997551441192627, "step": 816 }, { "epoch": 51.064, "grad_norm": 0.4622429465142165, "learning_rate": 1.2171456066140017e-06, "loss": 0.0079, "mean_token_accuracy": 0.9980411529541016, "step": 817 }, { "epoch": 51.128, "grad_norm": 0.1354134578281581, "learning_rate": 1.2097690116604504e-06, "loss": 0.008, "mean_token_accuracy": 0.9977044761180878, "step": 818 }, { "epoch": 51.192, "grad_norm": 0.1584653147039429, "learning_rate": 1.2024234400640948e-06, "loss": 0.0081, "mean_token_accuracy": 0.9978084787726402, "step": 819 }, { "epoch": 51.256, "grad_norm": 0.3477545937883923, "learning_rate": 1.1951090397795546e-06, "loss": 0.0086, "mean_token_accuracy": 0.9977044761180878, "step": 820 }, { "epoch": 51.32, "grad_norm": 0.2842554387552033, "learning_rate": 1.1878259581335968e-06, "loss": 0.0082, "mean_token_accuracy": 0.9977350831031799, "step": 821 }, { "epoch": 51.384, "grad_norm": 0.29219583222461365, "learning_rate": 1.1805743418221704e-06, "loss": 0.0082, "mean_token_accuracy": 0.9978269040584564, "step": 822 }, { "epoch": 51.448, "grad_norm": 0.20986296174671532, "learning_rate": 1.1733543369074446e-06, "loss": 0.0083, "mean_token_accuracy": 0.9976285696029663, "step": 823 }, { "epoch": 51.512, "grad_norm": 0.3532574952377663, "learning_rate": 1.166166088814874e-06, "loss": 0.0083, "mean_token_accuracy": 0.9977656900882721, "step": 824 }, { "epoch": 51.576, "grad_norm": 0.19763286449849377, "learning_rate": 1.1590097423302683e-06, "loss": 0.0082, "mean_token_accuracy": 0.997750386595726, "step": 825 }, { "epoch": 51.64, "grad_norm": 0.25519257824958863, "learning_rate": 1.151885441596872e-06, "loss": 0.0083, "mean_token_accuracy": 0.9978575110435486, "step": 826 }, { "epoch": 51.704, "grad_norm": 0.2530228025265665, "learning_rate": 1.1447933301124637e-06, "loss": 0.0082, "mean_token_accuracy": 0.9978575110435486, "step": 827 }, { "epoch": 51.768, "grad_norm": 0.34122292114307995, "learning_rate": 1.1377335507264674e-06, "loss": 0.0087, "mean_token_accuracy": 0.9976891726255417, "step": 828 }, { "epoch": 51.832, "grad_norm": 0.28783964192228123, "learning_rate": 1.130706245637073e-06, "loss": 0.0092, "mean_token_accuracy": 0.9976432621479034, "step": 829 }, { "epoch": 51.896, "grad_norm": 0.4833595534128089, "learning_rate": 1.1237115563883694e-06, "loss": 0.0084, "mean_token_accuracy": 0.9977044761180878, "step": 830 }, { "epoch": 51.96, "grad_norm": 0.19581337054605016, "learning_rate": 1.1167496238675014e-06, "loss": 0.0085, "mean_token_accuracy": 0.9977350831031799, "step": 831 }, { "epoch": 52.0, "grad_norm": 0.3966310446734559, "learning_rate": 1.1098205883018246e-06, "loss": 0.0084, "mean_token_accuracy": 0.9977718114852905, "step": 832 }, { "epoch": 52.064, "grad_norm": 0.22079077321005453, "learning_rate": 1.1029245892560837e-06, "loss": 0.0078, "mean_token_accuracy": 0.9979187250137329, "step": 833 }, { "epoch": 52.128, "grad_norm": 0.14905600318128961, "learning_rate": 1.0960617656296024e-06, "loss": 0.008, "mean_token_accuracy": 0.9979187250137329, "step": 834 }, { "epoch": 52.192, "grad_norm": 0.18923127607463988, "learning_rate": 1.089232255653484e-06, "loss": 0.0079, "mean_token_accuracy": 0.9978116005659103, "step": 835 }, { "epoch": 52.256, "grad_norm": 0.15095758431658804, "learning_rate": 1.0824361968878271e-06, "loss": 0.0079, "mean_token_accuracy": 0.9977787211537361, "step": 836 }, { "epoch": 52.32, "grad_norm": 0.22084646302152827, "learning_rate": 1.0756737262189564e-06, "loss": 0.008, "mean_token_accuracy": 0.9976891726255417, "step": 837 }, { "epoch": 52.384, "grad_norm": 0.16650740845218587, "learning_rate": 1.0689449798566633e-06, "loss": 0.008, "mean_token_accuracy": 0.9979034215211868, "step": 838 }, { "epoch": 52.448, "grad_norm": 0.5367257842542196, "learning_rate": 1.0622500933314661e-06, "loss": 0.0087, "mean_token_accuracy": 0.9976738691329956, "step": 839 }, { "epoch": 52.512, "grad_norm": 0.15932230901724578, "learning_rate": 1.0555892014918756e-06, "loss": 0.0082, "mean_token_accuracy": 0.9977962970733643, "step": 840 }, { "epoch": 52.576, "grad_norm": 0.2659467113306451, "learning_rate": 1.048962438501681e-06, "loss": 0.0081, "mean_token_accuracy": 0.9976432621479034, "step": 841 }, { "epoch": 52.64, "grad_norm": 0.20271338804628306, "learning_rate": 1.0423699378372504e-06, "loss": 0.0081, "mean_token_accuracy": 0.997750386595726, "step": 842 }, { "epoch": 52.704, "grad_norm": 0.1762979387898389, "learning_rate": 1.0358118322848356e-06, "loss": 0.0082, "mean_token_accuracy": 0.9978422075510025, "step": 843 }, { "epoch": 52.768, "grad_norm": 0.2637024661552943, "learning_rate": 1.0292882539379038e-06, "loss": 0.0085, "mean_token_accuracy": 0.9978116005659103, "step": 844 }, { "epoch": 52.832, "grad_norm": 0.23391012596045194, "learning_rate": 1.022799334194475e-06, "loss": 0.0082, "mean_token_accuracy": 0.9978116005659103, "step": 845 }, { "epoch": 52.896, "grad_norm": 0.1516798624112433, "learning_rate": 1.0163452037544746e-06, "loss": 0.0083, "mean_token_accuracy": 0.9978044033050537, "step": 846 }, { "epoch": 52.96, "grad_norm": 0.35663830254194884, "learning_rate": 1.0099259926170998e-06, "loss": 0.0096, "mean_token_accuracy": 0.997291274368763, "step": 847 }, { "epoch": 53.0, "grad_norm": 0.40517239876601796, "learning_rate": 1.0035418300782063e-06, "loss": 0.0087, "mean_token_accuracy": 0.9974779844284057, "step": 848 }, { "epoch": 53.064, "grad_norm": 0.18199750980984034, "learning_rate": 9.971928447276954e-07, "loss": 0.0079, "mean_token_accuracy": 0.9979034215211868, "step": 849 }, { "epoch": 53.128, "grad_norm": 0.12177149552647823, "learning_rate": 9.908791644469333e-07, "loss": 0.0079, "mean_token_accuracy": 0.9978575110435486, "step": 850 }, { "epoch": 53.192, "grad_norm": 0.297338862443527, "learning_rate": 9.846009164061686e-07, "loss": 0.008, "mean_token_accuracy": 0.9977809935808182, "step": 851 }, { "epoch": 53.256, "grad_norm": 0.20330273559096518, "learning_rate": 9.78358227061973e-07, "loss": 0.0079, "mean_token_accuracy": 0.997934028506279, "step": 852 }, { "epoch": 53.32, "grad_norm": 0.34871862093616646, "learning_rate": 9.721512221546967e-07, "loss": 0.0079, "mean_token_accuracy": 0.9977044761180878, "step": 853 }, { "epoch": 53.384, "grad_norm": 0.1818545557181228, "learning_rate": 9.659800267059307e-07, "loss": 0.008, "mean_token_accuracy": 0.9977350831031799, "step": 854 }, { "epoch": 53.448, "grad_norm": 0.24133428352131955, "learning_rate": 9.59844765015993e-07, "loss": 0.0081, "mean_token_accuracy": 0.9978305175900459, "step": 855 }, { "epoch": 53.512, "grad_norm": 0.14502289396279333, "learning_rate": 9.537455606614222e-07, "loss": 0.008, "mean_token_accuracy": 0.9977962970733643, "step": 856 }, { "epoch": 53.576, "grad_norm": 0.1717480615528778, "learning_rate": 9.4768253649249e-07, "loss": 0.008, "mean_token_accuracy": 0.9979034215211868, "step": 857 }, { "epoch": 53.64, "grad_norm": 0.2141862898471825, "learning_rate": 9.416558146307254e-07, "loss": 0.008, "mean_token_accuracy": 0.9977656900882721, "step": 858 }, { "epoch": 53.704, "grad_norm": 0.21761866906037952, "learning_rate": 9.35665516466458e-07, "loss": 0.0083, "mean_token_accuracy": 0.9978116005659103, "step": 859 }, { "epoch": 53.768, "grad_norm": 0.17429975030438657, "learning_rate": 9.297117626563687e-07, "loss": 0.0081, "mean_token_accuracy": 0.9978395104408264, "step": 860 }, { "epoch": 53.832, "grad_norm": 0.22979599077621737, "learning_rate": 9.237946731210614e-07, "loss": 0.0083, "mean_token_accuracy": 0.9976891726255417, "step": 861 }, { "epoch": 53.896, "grad_norm": 0.15848246356911094, "learning_rate": 9.179143670426485e-07, "loss": 0.0083, "mean_token_accuracy": 0.997750386595726, "step": 862 }, { "epoch": 53.96, "grad_norm": 0.22865920679306917, "learning_rate": 9.120709628623495e-07, "loss": 0.0084, "mean_token_accuracy": 0.9977350831031799, "step": 863 }, { "epoch": 54.0, "grad_norm": 0.22865920679306917, "learning_rate": 9.062645782781033e-07, "loss": 0.0084, "mean_token_accuracy": 0.9976738691329956, "step": 864 }, { "epoch": 54.064, "grad_norm": 0.30826656474045566, "learning_rate": 9.00495330242203e-07, "loss": 0.0076, "mean_token_accuracy": 0.9978881180286407, "step": 865 }, { "epoch": 54.128, "grad_norm": 0.34759442822514836, "learning_rate": 8.947633349589338e-07, "loss": 0.0075, "mean_token_accuracy": 0.9978575110435486, "step": 866 }, { "epoch": 54.192, "grad_norm": 0.12242458570293652, "learning_rate": 8.890687078822357e-07, "loss": 0.0078, "mean_token_accuracy": 0.9979799389839172, "step": 867 }, { "epoch": 54.256, "grad_norm": 0.12733187955011038, "learning_rate": 8.834115637133806e-07, "loss": 0.0078, "mean_token_accuracy": 0.9979493319988251, "step": 868 }, { "epoch": 54.32, "grad_norm": 0.17625085196624593, "learning_rate": 8.777920163986539e-07, "loss": 0.0078, "mean_token_accuracy": 0.997934028506279, "step": 869 }, { "epoch": 54.384, "grad_norm": 0.13307040458058272, "learning_rate": 8.722101791270692e-07, "loss": 0.0079, "mean_token_accuracy": 0.9978881180286407, "step": 870 }, { "epoch": 54.448, "grad_norm": 0.13881932818428433, "learning_rate": 8.666661643280822e-07, "loss": 0.0079, "mean_token_accuracy": 0.9978728145360947, "step": 871 }, { "epoch": 54.512, "grad_norm": 0.15526585472091753, "learning_rate": 8.611600836693262e-07, "loss": 0.0079, "mean_token_accuracy": 0.9978269040584564, "step": 872 }, { "epoch": 54.576, "grad_norm": 0.17878347289110058, "learning_rate": 8.556920480543676e-07, "loss": 0.008, "mean_token_accuracy": 0.9976279586553574, "step": 873 }, { "epoch": 54.64, "grad_norm": 0.162144547796182, "learning_rate": 8.502621676204667e-07, "loss": 0.008, "mean_token_accuracy": 0.9978422075510025, "step": 874 }, { "epoch": 54.704, "grad_norm": 0.22168860069841154, "learning_rate": 8.448705517363608e-07, "loss": 0.008, "mean_token_accuracy": 0.9976891726255417, "step": 875 }, { "epoch": 54.768, "grad_norm": 0.15706467832442356, "learning_rate": 8.395173090000647e-07, "loss": 0.0081, "mean_token_accuracy": 0.9976432621479034, "step": 876 }, { "epoch": 54.832, "grad_norm": 0.18500500808436848, "learning_rate": 8.342025472366788e-07, "loss": 0.009, "mean_token_accuracy": 0.9976864755153656, "step": 877 }, { "epoch": 54.896, "grad_norm": 0.2729957481380432, "learning_rate": 8.289263734962187e-07, "loss": 0.0083, "mean_token_accuracy": 0.9977110847830772, "step": 878 }, { "epoch": 54.96, "grad_norm": 0.295960713747248, "learning_rate": 8.236888940514616e-07, "loss": 0.0085, "mean_token_accuracy": 0.9975208342075348, "step": 879 }, { "epoch": 55.0, "grad_norm": 0.3601162018663428, "learning_rate": 8.184902143958014e-07, "loss": 0.0082, "mean_token_accuracy": 0.9977718114852905, "step": 880 }, { "epoch": 55.064, "grad_norm": 0.16828959139773877, "learning_rate": 8.133304392411272e-07, "loss": 0.0078, "mean_token_accuracy": 0.9978116005659103, "step": 881 }, { "epoch": 55.128, "grad_norm": 0.13047369237040093, "learning_rate": 8.082096725157122e-07, "loss": 0.0079, "mean_token_accuracy": 0.9979646354913712, "step": 882 }, { "epoch": 55.192, "grad_norm": 0.13236287252844872, "learning_rate": 8.031280173621217e-07, "loss": 0.0077, "mean_token_accuracy": 0.9979952424764633, "step": 883 }, { "epoch": 55.256, "grad_norm": 0.15085704891782817, "learning_rate": 7.980855761351355e-07, "loss": 0.0077, "mean_token_accuracy": 0.9978728145360947, "step": 884 }, { "epoch": 55.32, "grad_norm": 0.12313688368558488, "learning_rate": 7.930824503996856e-07, "loss": 0.0078, "mean_token_accuracy": 0.9978269040584564, "step": 885 }, { "epoch": 55.384, "grad_norm": 0.12788028309114993, "learning_rate": 7.881187409288093e-07, "loss": 0.0079, "mean_token_accuracy": 0.9977656900882721, "step": 886 }, { "epoch": 55.448, "grad_norm": 0.15157729822602192, "learning_rate": 7.831945477016244e-07, "loss": 0.0077, "mean_token_accuracy": 0.9977350831031799, "step": 887 }, { "epoch": 55.512, "grad_norm": 0.11827374461170834, "learning_rate": 7.783099699013075e-07, "loss": 0.008, "mean_token_accuracy": 0.9978393688797951, "step": 888 }, { "epoch": 55.576, "grad_norm": 0.1307440646152956, "learning_rate": 7.734651059131029e-07, "loss": 0.0079, "mean_token_accuracy": 0.9975550547242165, "step": 889 }, { "epoch": 55.64, "grad_norm": 0.24897365032836144, "learning_rate": 7.686600533223395e-07, "loss": 0.0077, "mean_token_accuracy": 0.9978728145360947, "step": 890 }, { "epoch": 55.704, "grad_norm": 0.12654698796524802, "learning_rate": 7.638949089124633e-07, "loss": 0.0078, "mean_token_accuracy": 0.9977350831031799, "step": 891 }, { "epoch": 55.768, "grad_norm": 0.1836466663626651, "learning_rate": 7.591697686630885e-07, "loss": 0.0081, "mean_token_accuracy": 0.9977962970733643, "step": 892 }, { "epoch": 55.832, "grad_norm": 0.23068142592452065, "learning_rate": 7.544847277480662e-07, "loss": 0.0082, "mean_token_accuracy": 0.9977962970733643, "step": 893 }, { "epoch": 55.896, "grad_norm": 0.2837046352799425, "learning_rate": 7.498398805335655e-07, "loss": 0.0081, "mean_token_accuracy": 0.9977809935808182, "step": 894 }, { "epoch": 55.96, "grad_norm": 0.14608093209568704, "learning_rate": 7.452353205761725e-07, "loss": 0.0081, "mean_token_accuracy": 0.9976126551628113, "step": 895 }, { "epoch": 56.0, "grad_norm": 0.21416590462587234, "learning_rate": 7.406711406210082e-07, "loss": 0.008, "mean_token_accuracy": 0.9978452682495117, "step": 896 }, { "epoch": 56.064, "grad_norm": 0.17815315768709059, "learning_rate": 7.36147432599858e-07, "loss": 0.0076, "mean_token_accuracy": 0.9978728145360947, "step": 897 }, { "epoch": 56.128, "grad_norm": 0.11883619887123739, "learning_rate": 7.316642876293207e-07, "loss": 0.0077, "mean_token_accuracy": 0.9979034215211868, "step": 898 }, { "epoch": 56.192, "grad_norm": 0.17353554689617467, "learning_rate": 7.272217960089746e-07, "loss": 0.0076, "mean_token_accuracy": 0.9979493319988251, "step": 899 }, { "epoch": 56.256, "grad_norm": 0.1219550439977582, "learning_rate": 7.228200472195574e-07, "loss": 0.0076, "mean_token_accuracy": 0.9979034215211868, "step": 900 }, { "epoch": 56.32, "grad_norm": 0.17312087953043226, "learning_rate": 7.184591299211624e-07, "loss": 0.0077, "mean_token_accuracy": 0.9977197796106339, "step": 901 }, { "epoch": 56.384, "grad_norm": 0.15379809429906793, "learning_rate": 7.141391319514565e-07, "loss": 0.0078, "mean_token_accuracy": 0.9978728145360947, "step": 902 }, { "epoch": 56.448, "grad_norm": 0.1798850771956693, "learning_rate": 7.098601403239071e-07, "loss": 0.0079, "mean_token_accuracy": 0.9977197796106339, "step": 903 }, { "epoch": 56.512, "grad_norm": 0.13510483957152666, "learning_rate": 7.056222412260325e-07, "loss": 0.0079, "mean_token_accuracy": 0.9976432621479034, "step": 904 }, { "epoch": 56.576, "grad_norm": 0.17532243642256284, "learning_rate": 7.014255200176643e-07, "loss": 0.0079, "mean_token_accuracy": 0.9977809935808182, "step": 905 }, { "epoch": 56.64, "grad_norm": 0.12322179487131653, "learning_rate": 6.972700612292274e-07, "loss": 0.0077, "mean_token_accuracy": 0.9977940246462822, "step": 906 }, { "epoch": 56.704, "grad_norm": 0.15239927281855267, "learning_rate": 6.931559485600408e-07, "loss": 0.0079, "mean_token_accuracy": 0.9976891726255417, "step": 907 }, { "epoch": 56.768, "grad_norm": 0.14747163869229177, "learning_rate": 6.890832648766256e-07, "loss": 0.0079, "mean_token_accuracy": 0.9977044761180878, "step": 908 }, { "epoch": 56.832, "grad_norm": 0.7388945607516176, "learning_rate": 6.850520922110425e-07, "loss": 0.0084, "mean_token_accuracy": 0.9976957812905312, "step": 909 }, { "epoch": 56.896, "grad_norm": 0.5586582915578115, "learning_rate": 6.810625117592364e-07, "loss": 0.0082, "mean_token_accuracy": 0.9977197796106339, "step": 910 }, { "epoch": 56.96, "grad_norm": 0.17983868246990134, "learning_rate": 6.771146038793997e-07, "loss": 0.0083, "mean_token_accuracy": 0.9977962970733643, "step": 911 }, { "epoch": 57.0, "grad_norm": 0.17983868246990134, "learning_rate": 6.732084480903561e-07, "loss": 0.0082, "mean_token_accuracy": 0.9976248979568482, "step": 912 }, { "epoch": 57.064, "grad_norm": 0.19396688052255454, "learning_rate": 6.693441230699591e-07, "loss": 0.0076, "mean_token_accuracy": 0.9979034215211868, "step": 913 }, { "epoch": 57.128, "grad_norm": 0.2037178577961762, "learning_rate": 6.655217066535033e-07, "loss": 0.0076, "mean_token_accuracy": 0.9978269040584564, "step": 914 }, { "epoch": 57.192, "grad_norm": 0.14811317891483528, "learning_rate": 6.617412758321627e-07, "loss": 0.0076, "mean_token_accuracy": 0.9979187250137329, "step": 915 }, { "epoch": 57.256, "grad_norm": 0.10843517729197479, "learning_rate": 6.580029067514345e-07, "loss": 0.0076, "mean_token_accuracy": 0.9979187250137329, "step": 916 }, { "epoch": 57.32, "grad_norm": 0.11685446966416836, "learning_rate": 6.543066747096087e-07, "loss": 0.0076, "mean_token_accuracy": 0.9979799389839172, "step": 917 }, { "epoch": 57.384, "grad_norm": 0.11506671131310608, "learning_rate": 6.506526541562506e-07, "loss": 0.0076, "mean_token_accuracy": 0.9977809935808182, "step": 918 }, { "epoch": 57.448, "grad_norm": 0.12301390259737714, "learning_rate": 6.470409186907006e-07, "loss": 0.0079, "mean_token_accuracy": 0.9977656900882721, "step": 919 }, { "epoch": 57.512, "grad_norm": 0.13805698069087818, "learning_rate": 6.434715410605914e-07, "loss": 0.0078, "mean_token_accuracy": 0.9977197796106339, "step": 920 }, { "epoch": 57.576, "grad_norm": 0.13043919408493812, "learning_rate": 6.399445931603851e-07, "loss": 0.0078, "mean_token_accuracy": 0.9978116005659103, "step": 921 }, { "epoch": 57.64, "grad_norm": 0.26430408156378954, "learning_rate": 6.364601460299224e-07, "loss": 0.0078, "mean_token_accuracy": 0.9978422075510025, "step": 922 }, { "epoch": 57.704, "grad_norm": 0.13092761272439962, "learning_rate": 6.330182698529928e-07, "loss": 0.0078, "mean_token_accuracy": 0.9977962970733643, "step": 923 }, { "epoch": 57.768, "grad_norm": 0.19929641960692912, "learning_rate": 6.296190339559219e-07, "loss": 0.0079, "mean_token_accuracy": 0.9978116005659103, "step": 924 }, { "epoch": 57.832, "grad_norm": 0.14725421308628123, "learning_rate": 6.26262506806173e-07, "loss": 0.008, "mean_token_accuracy": 0.9976558685302734, "step": 925 }, { "epoch": 57.896, "grad_norm": 0.20303292246805132, "learning_rate": 6.229487560109702e-07, "loss": 0.0082, "mean_token_accuracy": 0.9978269040584564, "step": 926 }, { "epoch": 57.96, "grad_norm": 0.18640425324906262, "learning_rate": 6.196778483159346e-07, "loss": 0.0084, "mean_token_accuracy": 0.9977386966347694, "step": 927 }, { "epoch": 58.0, "grad_norm": 0.4015056226590585, "learning_rate": 6.164498496037407e-07, "loss": 0.0079, "mean_token_accuracy": 0.9976983547210694, "step": 928 }, { "epoch": 58.064, "grad_norm": 0.25763774461893446, "learning_rate": 6.132648248927915e-07, "loss": 0.0077, "mean_token_accuracy": 0.9979187250137329, "step": 929 }, { "epoch": 58.128, "grad_norm": 0.14149006743442843, "learning_rate": 6.101228383359046e-07, "loss": 0.0076, "mean_token_accuracy": 0.997934028506279, "step": 930 }, { "epoch": 58.192, "grad_norm": 0.1145910875253519, "learning_rate": 6.070239532190239e-07, "loss": 0.0075, "mean_token_accuracy": 0.9979187250137329, "step": 931 }, { "epoch": 58.256, "grad_norm": 0.20949184667372087, "learning_rate": 6.039682319599431e-07, "loss": 0.0078, "mean_token_accuracy": 0.9979493319988251, "step": 932 }, { "epoch": 58.32, "grad_norm": 0.19719078486923636, "learning_rate": 6.009557361070483e-07, "loss": 0.0077, "mean_token_accuracy": 0.9979493319988251, "step": 933 }, { "epoch": 58.384, "grad_norm": 0.15164744076886072, "learning_rate": 5.979865263380792e-07, "loss": 0.0076, "mean_token_accuracy": 0.9979005828499794, "step": 934 }, { "epoch": 58.448, "grad_norm": 0.14737720060817422, "learning_rate": 5.950606624589065e-07, "loss": 0.0077, "mean_token_accuracy": 0.9977962970733643, "step": 935 }, { "epoch": 58.512, "grad_norm": 0.2770307826533102, "learning_rate": 5.921782034023276e-07, "loss": 0.0078, "mean_token_accuracy": 0.9978881180286407, "step": 936 }, { "epoch": 58.576, "grad_norm": 0.13829870291568577, "learning_rate": 5.893392072268781e-07, "loss": 0.0077, "mean_token_accuracy": 0.9977197796106339, "step": 937 }, { "epoch": 58.64, "grad_norm": 0.21310180711307883, "learning_rate": 5.865437311156651e-07, "loss": 0.0081, "mean_token_accuracy": 0.9977656900882721, "step": 938 }, { "epoch": 58.704, "grad_norm": 0.13946601219625096, "learning_rate": 5.837918313752132e-07, "loss": 0.008, "mean_token_accuracy": 0.9977693036198616, "step": 939 }, { "epoch": 58.768, "grad_norm": 0.15713147827008037, "learning_rate": 5.810835634343302e-07, "loss": 0.0079, "mean_token_accuracy": 0.9976279586553574, "step": 940 }, { "epoch": 58.832, "grad_norm": 0.24148430682654637, "learning_rate": 5.784189818429931e-07, "loss": 0.0078, "mean_token_accuracy": 0.9978422075510025, "step": 941 }, { "epoch": 58.896, "grad_norm": 0.14488625889952955, "learning_rate": 5.757981402712463e-07, "loss": 0.0079, "mean_token_accuracy": 0.9976126551628113, "step": 942 }, { "epoch": 58.96, "grad_norm": 0.18461939181264783, "learning_rate": 5.732210915081225e-07, "loss": 0.0079, "mean_token_accuracy": 0.9977197796106339, "step": 943 }, { "epoch": 59.0, "grad_norm": 0.18461939181264783, "learning_rate": 5.706878874605801e-07, "loss": 0.0086, "mean_token_accuracy": 0.9977718114852905, "step": 944 }, { "epoch": 59.064, "grad_norm": 0.3869098504089327, "learning_rate": 5.681985791524547e-07, "loss": 0.0077, "mean_token_accuracy": 0.9979187250137329, "step": 945 }, { "epoch": 59.128, "grad_norm": 0.13025803477714734, "learning_rate": 5.657532167234343e-07, "loss": 0.0076, "mean_token_accuracy": 0.9979034215211868, "step": 946 }, { "epoch": 59.192, "grad_norm": 0.3449446687140992, "learning_rate": 5.633518494280479e-07, "loss": 0.0078, "mean_token_accuracy": 0.9979034215211868, "step": 947 }, { "epoch": 59.256, "grad_norm": 0.6851114636025829, "learning_rate": 5.60994525634674e-07, "loss": 0.0076, "mean_token_accuracy": 0.9978881180286407, "step": 948 }, { "epoch": 59.32, "grad_norm": 0.14167610175212855, "learning_rate": 5.586812928245662e-07, "loss": 0.0075, "mean_token_accuracy": 0.9980258494615555, "step": 949 }, { "epoch": 59.384, "grad_norm": 0.11394628370936732, "learning_rate": 5.564121975908969e-07, "loss": 0.0075, "mean_token_accuracy": 0.9978422075510025, "step": 950 }, { "epoch": 59.448, "grad_norm": 0.12189872680872615, "learning_rate": 5.541872856378183e-07, "loss": 0.0077, "mean_token_accuracy": 0.9978116005659103, "step": 951 }, { "epoch": 59.512, "grad_norm": 0.42816908761042094, "learning_rate": 5.52006601779543e-07, "loss": 0.0075, "mean_token_accuracy": 0.9977962970733643, "step": 952 }, { "epoch": 59.576, "grad_norm": 0.11451853853646499, "learning_rate": 5.498701899394395e-07, "loss": 0.0077, "mean_token_accuracy": 0.9977044761180878, "step": 953 }, { "epoch": 59.64, "grad_norm": 0.13269512266556796, "learning_rate": 5.477780931491494e-07, "loss": 0.0077, "mean_token_accuracy": 0.9977350831031799, "step": 954 }, { "epoch": 59.704, "grad_norm": 0.11790623333086475, "learning_rate": 5.457303535477203e-07, "loss": 0.0077, "mean_token_accuracy": 0.9978881180286407, "step": 955 }, { "epoch": 59.768, "grad_norm": 0.12824940598799198, "learning_rate": 5.43727012380755e-07, "loss": 0.0078, "mean_token_accuracy": 0.9976738691329956, "step": 956 }, { "epoch": 59.832, "grad_norm": 0.1386196827371057, "learning_rate": 5.417681099995841e-07, "loss": 0.0079, "mean_token_accuracy": 0.9977166578173637, "step": 957 }, { "epoch": 59.896, "grad_norm": 0.27135028793185323, "learning_rate": 5.398536858604507e-07, "loss": 0.0079, "mean_token_accuracy": 0.9977801069617271, "step": 958 }, { "epoch": 59.96, "grad_norm": 0.13815105359067129, "learning_rate": 5.379837785237157e-07, "loss": 0.0078, "mean_token_accuracy": 0.9976585656404495, "step": 959 }, { "epoch": 60.0, "grad_norm": 0.3310091628964995, "learning_rate": 5.361584256530833e-07, "loss": 0.0084, "mean_token_accuracy": 0.9976493835449218, "step": 960 }, { "epoch": 60.064, "grad_norm": 0.18069483610418383, "learning_rate": 5.343776640148399e-07, "loss": 0.0075, "mean_token_accuracy": 0.997750386595726, "step": 961 }, { "epoch": 60.128, "grad_norm": 0.21179256818559478, "learning_rate": 5.326415294771151e-07, "loss": 0.0078, "mean_token_accuracy": 0.9978881180286407, "step": 962 }, { "epoch": 60.192, "grad_norm": 0.12280690224875965, "learning_rate": 5.309500570091582e-07, "loss": 0.0075, "mean_token_accuracy": 0.9978728145360947, "step": 963 }, { "epoch": 60.256, "grad_norm": 0.11397056524393846, "learning_rate": 5.293032806806348e-07, "loss": 0.0077, "mean_token_accuracy": 0.997934028506279, "step": 964 }, { "epoch": 60.32, "grad_norm": 0.27181160189218634, "learning_rate": 5.277012336609404e-07, "loss": 0.0077, "mean_token_accuracy": 0.9979646354913712, "step": 965 }, { "epoch": 60.384, "grad_norm": 0.12630063679646805, "learning_rate": 5.26143948218531e-07, "loss": 0.0076, "mean_token_accuracy": 0.9978422075510025, "step": 966 }, { "epoch": 60.448, "grad_norm": 0.12790524426391564, "learning_rate": 5.246314557202754e-07, "loss": 0.0075, "mean_token_accuracy": 0.99788598716259, "step": 967 }, { "epoch": 60.512, "grad_norm": 0.14946512221245578, "learning_rate": 5.231637866308216e-07, "loss": 0.0078, "mean_token_accuracy": 0.9977656900882721, "step": 968 }, { "epoch": 60.576, "grad_norm": 0.14781019625529876, "learning_rate": 5.217409705119836e-07, "loss": 0.0079, "mean_token_accuracy": 0.9978290125727654, "step": 969 }, { "epoch": 60.64, "grad_norm": 0.33743884235299765, "learning_rate": 5.203630360221469e-07, "loss": 0.0076, "mean_token_accuracy": 0.9977809935808182, "step": 970 }, { "epoch": 60.704, "grad_norm": 0.36421678891360565, "learning_rate": 5.190300109156897e-07, "loss": 0.0084, "mean_token_accuracy": 0.9977962970733643, "step": 971 }, { "epoch": 60.768, "grad_norm": 0.13510413773311813, "learning_rate": 5.177419220424251e-07, "loss": 0.0077, "mean_token_accuracy": 0.9977350831031799, "step": 972 }, { "epoch": 60.832, "grad_norm": 0.1447084630986344, "learning_rate": 5.164987953470596e-07, "loss": 0.0079, "mean_token_accuracy": 0.9977197796106339, "step": 973 }, { "epoch": 60.896, "grad_norm": 0.1766088218845838, "learning_rate": 5.15300655868671e-07, "loss": 0.0076, "mean_token_accuracy": 0.9978575110435486, "step": 974 }, { "epoch": 60.96, "grad_norm": 0.33520794744198174, "learning_rate": 5.141475277402042e-07, "loss": 0.0079, "mean_token_accuracy": 0.9975820481777191, "step": 975 }, { "epoch": 61.0, "grad_norm": 0.29014238685920324, "learning_rate": 5.130394341879844e-07, "loss": 0.0078, "mean_token_accuracy": 0.997820782661438, "step": 976 }, { "epoch": 61.064, "grad_norm": 0.1329452257608293, "learning_rate": 5.119763975312489e-07, "loss": 0.0075, "mean_token_accuracy": 0.9979034215211868, "step": 977 }, { "epoch": 61.128, "grad_norm": 0.12136450011551236, "learning_rate": 5.109584391816999e-07, "loss": 0.0077, "mean_token_accuracy": 0.9979034215211868, "step": 978 }, { "epoch": 61.192, "grad_norm": 0.4515556414983452, "learning_rate": 5.0998557964307e-07, "loss": 0.0076, "mean_token_accuracy": 0.9977816045284271, "step": 979 }, { "epoch": 61.256, "grad_norm": 0.1111064353025462, "learning_rate": 5.090578385107121e-07, "loss": 0.0074, "mean_token_accuracy": 0.9978701174259186, "step": 980 }, { "epoch": 61.32, "grad_norm": 0.11140599857832552, "learning_rate": 5.081752344712032e-07, "loss": 0.0075, "mean_token_accuracy": 0.9980105459690094, "step": 981 }, { "epoch": 61.384, "grad_norm": 0.13072396404999803, "learning_rate": 5.073377853019672e-07, "loss": 0.0075, "mean_token_accuracy": 0.9978575110435486, "step": 982 }, { "epoch": 61.448, "grad_norm": 0.20894110900792462, "learning_rate": 5.06545507870919e-07, "loss": 0.0076, "mean_token_accuracy": 0.9977656900882721, "step": 983 }, { "epoch": 61.512, "grad_norm": 0.13009861709385875, "learning_rate": 5.057984181361235e-07, "loss": 0.0077, "mean_token_accuracy": 0.9977962970733643, "step": 984 }, { "epoch": 61.576, "grad_norm": 0.27833755847817326, "learning_rate": 5.050965311454739e-07, "loss": 0.0077, "mean_token_accuracy": 0.9977350831031799, "step": 985 }, { "epoch": 61.64, "grad_norm": 0.133741053297593, "learning_rate": 5.04439861036389e-07, "loss": 0.0077, "mean_token_accuracy": 0.9979187250137329, "step": 986 }, { "epoch": 61.704, "grad_norm": 0.20774118062384092, "learning_rate": 5.03828421035529e-07, "loss": 0.0077, "mean_token_accuracy": 0.9977962970733643, "step": 987 }, { "epoch": 61.768, "grad_norm": 0.12767183716460714, "learning_rate": 5.03262223458528e-07, "loss": 0.0078, "mean_token_accuracy": 0.9977197796106339, "step": 988 }, { "epoch": 61.832, "grad_norm": 0.14500994661161493, "learning_rate": 5.027412797097465e-07, "loss": 0.0077, "mean_token_accuracy": 0.9977197796106339, "step": 989 }, { "epoch": 61.896, "grad_norm": 0.1478329071840366, "learning_rate": 5.022656002820423e-07, "loss": 0.0078, "mean_token_accuracy": 0.9976738691329956, "step": 990 }, { "epoch": 61.96, "grad_norm": 0.1190688366260418, "learning_rate": 5.018351947565572e-07, "loss": 0.0077, "mean_token_accuracy": 0.9976891726255417, "step": 991 }, { "epoch": 62.0, "grad_norm": 0.1190688366260418, "learning_rate": 5.01450071802527e-07, "loss": 0.0078, "mean_token_accuracy": 0.9979187250137329, "step": 992 }, { "epoch": 62.064, "grad_norm": 0.17922878351021826, "learning_rate": 5.011102391771039e-07, "loss": 0.0074, "mean_token_accuracy": 0.9978269040584564, "step": 993 }, { "epoch": 62.128, "grad_norm": 0.11795337655620244, "learning_rate": 5.008157037252025e-07, "loss": 0.0074, "mean_token_accuracy": 0.9979646354913712, "step": 994 }, { "epoch": 62.192, "grad_norm": 0.11459627884119425, "learning_rate": 5.005664713793604e-07, "loss": 0.0074, "mean_token_accuracy": 0.9979799389839172, "step": 995 }, { "epoch": 62.256, "grad_norm": 0.11167547908113638, "learning_rate": 5.003625471596191e-07, "loss": 0.0075, "mean_token_accuracy": 0.9979493319988251, "step": 996 }, { "epoch": 62.32, "grad_norm": 0.12457183884507528, "learning_rate": 5.002039351734239e-07, "loss": 0.0075, "mean_token_accuracy": 0.9978269040584564, "step": 997 }, { "epoch": 62.384, "grad_norm": 0.3924160197285362, "learning_rate": 5.000906386155401e-07, "loss": 0.0075, "mean_token_accuracy": 0.997750386595726, "step": 998 }, { "epoch": 62.448, "grad_norm": 0.12755751595954642, "learning_rate": 5.000226597679883e-07, "loss": 0.0078, "mean_token_accuracy": 0.9977044761180878, "step": 999 }, { "epoch": 62.512, "grad_norm": 0.24625096388842677, "learning_rate": 5.000000000000001e-07, "loss": 0.0081, "mean_token_accuracy": 0.9977962970733643, "step": 1000 }, { "epoch": 62.512, "step": 1000, "total_flos": 55600396238848.0, "train_loss": 0.2335170691674575, "train_runtime": 13359.9683, "train_samples_per_second": 4.79, "train_steps_per_second": 0.075 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 67, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 55600396238848.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }