{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5884052641256665, "eval_steps": 200, "global_step": 8400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 7.004824572924602e-05, "grad_norm": 6.222772121429443, "learning_rate": 9.99930017513135e-05, "loss": 1.1076, "num_input_tokens_seen": 16384, "step": 1 }, { "epoch": 0.00014009649145849205, "grad_norm": 6.042057037353516, "learning_rate": 9.998600350262697e-05, "loss": 1.1086, "num_input_tokens_seen": 32768, "step": 2 }, { "epoch": 0.00021014473718773804, "grad_norm": 7.119229316711426, "learning_rate": 9.997900525394046e-05, "loss": 1.4047, "num_input_tokens_seen": 49152, "step": 3 }, { "epoch": 0.0002801929829169841, "grad_norm": 7.133191108703613, "learning_rate": 9.997200700525395e-05, "loss": 1.3921, "num_input_tokens_seen": 65536, "step": 4 }, { "epoch": 0.0003502412286462301, "grad_norm": 6.1078338623046875, "learning_rate": 9.996500875656743e-05, "loss": 1.3171, "num_input_tokens_seen": 81920, "step": 5 }, { "epoch": 0.0004202894743754761, "grad_norm": 6.466420650482178, "learning_rate": 9.995801050788092e-05, "loss": 1.0732, "num_input_tokens_seen": 97344, "step": 6 }, { "epoch": 0.0004903377201047221, "grad_norm": 5.578189849853516, "learning_rate": 9.99510122591944e-05, "loss": 0.9929, "num_input_tokens_seen": 113728, "step": 7 }, { "epoch": 0.0005603859658339682, "grad_norm": 7.197720527648926, "learning_rate": 9.994401401050789e-05, "loss": 1.2512, "num_input_tokens_seen": 129528, "step": 8 }, { "epoch": 0.0006304342115632141, "grad_norm": 6.618913650512695, "learning_rate": 9.993701576182136e-05, "loss": 1.3495, "num_input_tokens_seen": 145704, "step": 9 }, { "epoch": 0.0007004824572924602, "grad_norm": 6.955508232116699, "learning_rate": 9.993001751313485e-05, "loss": 1.1823, "num_input_tokens_seen": 161664, "step": 10 }, { "epoch": 0.0007705307030217062, "grad_norm": 6.6807074546813965, "learning_rate": 9.992301926444835e-05, "loss": 1.1693, "num_input_tokens_seen": 177960, "step": 11 }, { "epoch": 0.0008405789487509522, "grad_norm": 6.784447193145752, "learning_rate": 9.991602101576183e-05, "loss": 1.3744, "num_input_tokens_seen": 194344, "step": 12 }, { "epoch": 0.0009106271944801982, "grad_norm": 6.7418437004089355, "learning_rate": 9.990902276707532e-05, "loss": 1.22, "num_input_tokens_seen": 210728, "step": 13 }, { "epoch": 0.0009806754402094443, "grad_norm": 6.43395471572876, "learning_rate": 9.990202451838879e-05, "loss": 1.1772, "num_input_tokens_seen": 227112, "step": 14 }, { "epoch": 0.0010507236859386903, "grad_norm": 6.09422492980957, "learning_rate": 9.989502626970228e-05, "loss": 1.195, "num_input_tokens_seen": 243496, "step": 15 }, { "epoch": 0.0011207719316679364, "grad_norm": 6.238271236419678, "learning_rate": 9.988802802101577e-05, "loss": 1.2623, "num_input_tokens_seen": 259744, "step": 16 }, { "epoch": 0.0011908201773971822, "grad_norm": 6.56187629699707, "learning_rate": 9.988102977232926e-05, "loss": 1.2721, "num_input_tokens_seen": 276128, "step": 17 }, { "epoch": 0.0012608684231264283, "grad_norm": 6.818358898162842, "learning_rate": 9.987403152364275e-05, "loss": 1.2649, "num_input_tokens_seen": 292512, "step": 18 }, { "epoch": 0.0013309166688556743, "grad_norm": 5.950352191925049, "learning_rate": 9.986703327495622e-05, "loss": 1.0024, "num_input_tokens_seen": 308632, "step": 19 }, { "epoch": 0.0014009649145849204, "grad_norm": 6.387479305267334, "learning_rate": 9.986003502626971e-05, "loss": 1.2783, "num_input_tokens_seen": 325016, "step": 20 }, { "epoch": 0.0014710131603141664, "grad_norm": 6.187346458435059, "learning_rate": 9.985303677758318e-05, "loss": 1.1701, "num_input_tokens_seen": 341384, "step": 21 }, { "epoch": 0.0015410614060434125, "grad_norm": 5.371951103210449, "learning_rate": 9.984603852889667e-05, "loss": 1.0483, "num_input_tokens_seen": 357768, "step": 22 }, { "epoch": 0.0016111096517726585, "grad_norm": 6.2206807136535645, "learning_rate": 9.983904028021016e-05, "loss": 1.2516, "num_input_tokens_seen": 374152, "step": 23 }, { "epoch": 0.0016811578975019044, "grad_norm": 6.121264457702637, "learning_rate": 9.983204203152365e-05, "loss": 1.1506, "num_input_tokens_seen": 390536, "step": 24 }, { "epoch": 0.0017512061432311504, "grad_norm": 6.353756904602051, "learning_rate": 9.982504378283714e-05, "loss": 1.3118, "num_input_tokens_seen": 406920, "step": 25 }, { "epoch": 0.0018212543889603965, "grad_norm": 6.270686149597168, "learning_rate": 9.981804553415061e-05, "loss": 1.0883, "num_input_tokens_seen": 422728, "step": 26 }, { "epoch": 0.0018913026346896425, "grad_norm": 6.117632865905762, "learning_rate": 9.98110472854641e-05, "loss": 1.3346, "num_input_tokens_seen": 439112, "step": 27 }, { "epoch": 0.0019613508804188886, "grad_norm": 6.429015159606934, "learning_rate": 9.980404903677759e-05, "loss": 1.2494, "num_input_tokens_seen": 455144, "step": 28 }, { "epoch": 0.0020313991261481346, "grad_norm": 6.4467620849609375, "learning_rate": 9.979705078809107e-05, "loss": 1.3335, "num_input_tokens_seen": 470360, "step": 29 }, { "epoch": 0.0021014473718773807, "grad_norm": 6.57926082611084, "learning_rate": 9.979005253940455e-05, "loss": 1.2126, "num_input_tokens_seen": 486120, "step": 30 }, { "epoch": 0.0021714956176066267, "grad_norm": 5.650569915771484, "learning_rate": 9.978305429071804e-05, "loss": 1.1363, "num_input_tokens_seen": 501896, "step": 31 }, { "epoch": 0.0022415438633358728, "grad_norm": 6.380292892456055, "learning_rate": 9.977605604203153e-05, "loss": 1.2251, "num_input_tokens_seen": 517752, "step": 32 }, { "epoch": 0.002311592109065119, "grad_norm": 5.704173564910889, "learning_rate": 9.976905779334502e-05, "loss": 1.1685, "num_input_tokens_seen": 534136, "step": 33 }, { "epoch": 0.0023816403547943644, "grad_norm": 5.342978000640869, "learning_rate": 9.97620595446585e-05, "loss": 1.2012, "num_input_tokens_seen": 550216, "step": 34 }, { "epoch": 0.0024516886005236105, "grad_norm": 5.7014241218566895, "learning_rate": 9.975506129597198e-05, "loss": 1.2342, "num_input_tokens_seen": 566600, "step": 35 }, { "epoch": 0.0025217368462528565, "grad_norm": 6.26229190826416, "learning_rate": 9.974806304728546e-05, "loss": 1.2041, "num_input_tokens_seen": 582984, "step": 36 }, { "epoch": 0.0025917850919821026, "grad_norm": 6.583463191986084, "learning_rate": 9.974106479859896e-05, "loss": 1.3021, "num_input_tokens_seen": 598968, "step": 37 }, { "epoch": 0.0026618333377113486, "grad_norm": 5.58498477935791, "learning_rate": 9.973406654991245e-05, "loss": 1.1622, "num_input_tokens_seen": 614840, "step": 38 }, { "epoch": 0.0027318815834405947, "grad_norm": 5.906906604766846, "learning_rate": 9.972706830122592e-05, "loss": 1.1971, "num_input_tokens_seen": 631224, "step": 39 }, { "epoch": 0.0028019298291698407, "grad_norm": 5.962359428405762, "learning_rate": 9.972007005253941e-05, "loss": 1.1326, "num_input_tokens_seen": 647000, "step": 40 }, { "epoch": 0.002871978074899087, "grad_norm": 6.447500705718994, "learning_rate": 9.971307180385289e-05, "loss": 1.0905, "num_input_tokens_seen": 662480, "step": 41 }, { "epoch": 0.002942026320628333, "grad_norm": 5.7290520668029785, "learning_rate": 9.970607355516638e-05, "loss": 1.3585, "num_input_tokens_seen": 678480, "step": 42 }, { "epoch": 0.003012074566357579, "grad_norm": 6.063445568084717, "learning_rate": 9.969907530647987e-05, "loss": 1.2841, "num_input_tokens_seen": 694256, "step": 43 }, { "epoch": 0.003082122812086825, "grad_norm": 5.302809238433838, "learning_rate": 9.969207705779335e-05, "loss": 1.1168, "num_input_tokens_seen": 710152, "step": 44 }, { "epoch": 0.003152171057816071, "grad_norm": 5.634128093719482, "learning_rate": 9.968507880910684e-05, "loss": 1.0609, "num_input_tokens_seen": 726184, "step": 45 }, { "epoch": 0.003222219303545317, "grad_norm": 5.652642726898193, "learning_rate": 9.967808056042032e-05, "loss": 1.2228, "num_input_tokens_seen": 742520, "step": 46 }, { "epoch": 0.0032922675492745627, "grad_norm": 5.340751647949219, "learning_rate": 9.96710823117338e-05, "loss": 1.0595, "num_input_tokens_seen": 758904, "step": 47 }, { "epoch": 0.0033623157950038087, "grad_norm": 5.422239780426025, "learning_rate": 9.966408406304728e-05, "loss": 1.1161, "num_input_tokens_seen": 775040, "step": 48 }, { "epoch": 0.0034323640407330548, "grad_norm": 5.29241418838501, "learning_rate": 9.965708581436077e-05, "loss": 1.0255, "num_input_tokens_seen": 790856, "step": 49 }, { "epoch": 0.003502412286462301, "grad_norm": 5.146270275115967, "learning_rate": 9.965008756567426e-05, "loss": 0.9762, "num_input_tokens_seen": 807064, "step": 50 }, { "epoch": 0.003572460532191547, "grad_norm": 5.825758457183838, "learning_rate": 9.964308931698775e-05, "loss": 1.2108, "num_input_tokens_seen": 823448, "step": 51 }, { "epoch": 0.003642508777920793, "grad_norm": 6.179538726806641, "learning_rate": 9.963609106830124e-05, "loss": 1.322, "num_input_tokens_seen": 838888, "step": 52 }, { "epoch": 0.003712557023650039, "grad_norm": 6.464454174041748, "learning_rate": 9.962909281961471e-05, "loss": 1.5077, "num_input_tokens_seen": 855272, "step": 53 }, { "epoch": 0.003782605269379285, "grad_norm": 5.4227294921875, "learning_rate": 9.96220945709282e-05, "loss": 1.2679, "num_input_tokens_seen": 871656, "step": 54 }, { "epoch": 0.003852653515108531, "grad_norm": 5.949041366577148, "learning_rate": 9.961509632224169e-05, "loss": 1.3618, "num_input_tokens_seen": 888040, "step": 55 }, { "epoch": 0.003922701760837777, "grad_norm": 6.050904750823975, "learning_rate": 9.960809807355516e-05, "loss": 1.3155, "num_input_tokens_seen": 904400, "step": 56 }, { "epoch": 0.003992750006567023, "grad_norm": 6.048308849334717, "learning_rate": 9.960109982486866e-05, "loss": 1.3131, "num_input_tokens_seen": 919952, "step": 57 }, { "epoch": 0.004062798252296269, "grad_norm": 5.683863162994385, "learning_rate": 9.959410157618214e-05, "loss": 1.1692, "num_input_tokens_seen": 936336, "step": 58 }, { "epoch": 0.004132846498025515, "grad_norm": 5.449287414550781, "learning_rate": 9.958710332749563e-05, "loss": 1.0613, "num_input_tokens_seen": 952152, "step": 59 }, { "epoch": 0.004202894743754761, "grad_norm": 5.31496524810791, "learning_rate": 9.958010507880912e-05, "loss": 0.9605, "num_input_tokens_seen": 967824, "step": 60 }, { "epoch": 0.004272942989484007, "grad_norm": 5.57105016708374, "learning_rate": 9.957310683012259e-05, "loss": 1.1701, "num_input_tokens_seen": 983864, "step": 61 }, { "epoch": 0.004342991235213253, "grad_norm": 5.3456830978393555, "learning_rate": 9.956610858143608e-05, "loss": 1.0995, "num_input_tokens_seen": 1000248, "step": 62 }, { "epoch": 0.004413039480942499, "grad_norm": 5.453295707702637, "learning_rate": 9.955911033274957e-05, "loss": 1.2413, "num_input_tokens_seen": 1016632, "step": 63 }, { "epoch": 0.0044830877266717455, "grad_norm": 4.975449562072754, "learning_rate": 9.955211208406306e-05, "loss": 1.0961, "num_input_tokens_seen": 1033016, "step": 64 }, { "epoch": 0.004553135972400991, "grad_norm": 5.542137145996094, "learning_rate": 9.954511383537655e-05, "loss": 1.1171, "num_input_tokens_seen": 1049400, "step": 65 }, { "epoch": 0.004623184218130238, "grad_norm": 5.213950157165527, "learning_rate": 9.953811558669002e-05, "loss": 1.2228, "num_input_tokens_seen": 1065784, "step": 66 }, { "epoch": 0.004693232463859483, "grad_norm": 5.496099948883057, "learning_rate": 9.953111733800351e-05, "loss": 1.1529, "num_input_tokens_seen": 1082168, "step": 67 }, { "epoch": 0.004763280709588729, "grad_norm": 5.64145565032959, "learning_rate": 9.952411908931698e-05, "loss": 1.2301, "num_input_tokens_seen": 1098024, "step": 68 }, { "epoch": 0.004833328955317975, "grad_norm": 5.566709995269775, "learning_rate": 9.951712084063047e-05, "loss": 1.2679, "num_input_tokens_seen": 1114408, "step": 69 }, { "epoch": 0.004903377201047221, "grad_norm": 6.443673133850098, "learning_rate": 9.951012259194396e-05, "loss": 1.2313, "num_input_tokens_seen": 1130792, "step": 70 }, { "epoch": 0.0049734254467764675, "grad_norm": 5.882962226867676, "learning_rate": 9.950312434325745e-05, "loss": 1.4304, "num_input_tokens_seen": 1147176, "step": 71 }, { "epoch": 0.005043473692505713, "grad_norm": 6.0052666664123535, "learning_rate": 9.949612609457094e-05, "loss": 1.3027, "num_input_tokens_seen": 1160968, "step": 72 }, { "epoch": 0.0051135219382349596, "grad_norm": 5.260256767272949, "learning_rate": 9.948912784588441e-05, "loss": 1.1526, "num_input_tokens_seen": 1177352, "step": 73 }, { "epoch": 0.005183570183964205, "grad_norm": 5.641814708709717, "learning_rate": 9.94821295971979e-05, "loss": 1.0666, "num_input_tokens_seen": 1193032, "step": 74 }, { "epoch": 0.005253618429693452, "grad_norm": 5.121115207672119, "learning_rate": 9.947513134851138e-05, "loss": 1.2404, "num_input_tokens_seen": 1208952, "step": 75 }, { "epoch": 0.005323666675422697, "grad_norm": 5.63930082321167, "learning_rate": 9.946813309982487e-05, "loss": 1.5127, "num_input_tokens_seen": 1225000, "step": 76 }, { "epoch": 0.005393714921151944, "grad_norm": 4.880716800689697, "learning_rate": 9.946113485113837e-05, "loss": 1.1484, "num_input_tokens_seen": 1241384, "step": 77 }, { "epoch": 0.005463763166881189, "grad_norm": 5.59611177444458, "learning_rate": 9.945413660245184e-05, "loss": 1.1678, "num_input_tokens_seen": 1257680, "step": 78 }, { "epoch": 0.005533811412610436, "grad_norm": 5.052026271820068, "learning_rate": 9.944713835376533e-05, "loss": 1.2207, "num_input_tokens_seen": 1274064, "step": 79 }, { "epoch": 0.0056038596583396815, "grad_norm": 5.285096168518066, "learning_rate": 9.944014010507881e-05, "loss": 1.1457, "num_input_tokens_seen": 1290448, "step": 80 }, { "epoch": 0.005673907904068927, "grad_norm": 5.4286580085754395, "learning_rate": 9.94331418563923e-05, "loss": 1.3047, "num_input_tokens_seen": 1306832, "step": 81 }, { "epoch": 0.005743956149798174, "grad_norm": 5.937953472137451, "learning_rate": 9.942614360770578e-05, "loss": 1.4353, "num_input_tokens_seen": 1323216, "step": 82 }, { "epoch": 0.005814004395527419, "grad_norm": 5.129006385803223, "learning_rate": 9.941914535901927e-05, "loss": 1.1434, "num_input_tokens_seen": 1339408, "step": 83 }, { "epoch": 0.005884052641256666, "grad_norm": 5.179675102233887, "learning_rate": 9.941214711033276e-05, "loss": 1.2452, "num_input_tokens_seen": 1355792, "step": 84 }, { "epoch": 0.005954100886985911, "grad_norm": 4.912832736968994, "learning_rate": 9.940514886164624e-05, "loss": 1.1255, "num_input_tokens_seen": 1372176, "step": 85 }, { "epoch": 0.006024149132715158, "grad_norm": 5.190899848937988, "learning_rate": 9.939815061295973e-05, "loss": 1.2543, "num_input_tokens_seen": 1388560, "step": 86 }, { "epoch": 0.006094197378444403, "grad_norm": 5.1751275062561035, "learning_rate": 9.939115236427321e-05, "loss": 1.3145, "num_input_tokens_seen": 1404944, "step": 87 }, { "epoch": 0.00616424562417365, "grad_norm": 5.450705528259277, "learning_rate": 9.938415411558669e-05, "loss": 1.2844, "num_input_tokens_seen": 1421328, "step": 88 }, { "epoch": 0.0062342938699028955, "grad_norm": 5.593935012817383, "learning_rate": 9.937715586690018e-05, "loss": 1.3284, "num_input_tokens_seen": 1437464, "step": 89 }, { "epoch": 0.006304342115632142, "grad_norm": 5.156428813934326, "learning_rate": 9.937015761821367e-05, "loss": 1.1682, "num_input_tokens_seen": 1452952, "step": 90 }, { "epoch": 0.006374390361361388, "grad_norm": 4.673638820648193, "learning_rate": 9.936315936952715e-05, "loss": 1.004, "num_input_tokens_seen": 1469336, "step": 91 }, { "epoch": 0.006444438607090634, "grad_norm": 4.996700763702393, "learning_rate": 9.935616112084064e-05, "loss": 1.087, "num_input_tokens_seen": 1485448, "step": 92 }, { "epoch": 0.00651448685281988, "grad_norm": 4.817474365234375, "learning_rate": 9.934916287215412e-05, "loss": 1.151, "num_input_tokens_seen": 1501472, "step": 93 }, { "epoch": 0.006584535098549125, "grad_norm": 5.400479316711426, "learning_rate": 9.934216462346761e-05, "loss": 1.3144, "num_input_tokens_seen": 1516424, "step": 94 }, { "epoch": 0.006654583344278372, "grad_norm": 5.232216835021973, "learning_rate": 9.933516637478108e-05, "loss": 1.0019, "num_input_tokens_seen": 1532792, "step": 95 }, { "epoch": 0.006724631590007617, "grad_norm": 5.392521381378174, "learning_rate": 9.932816812609457e-05, "loss": 1.3195, "num_input_tokens_seen": 1548600, "step": 96 }, { "epoch": 0.006794679835736864, "grad_norm": 5.5280866622924805, "learning_rate": 9.932116987740806e-05, "loss": 1.283, "num_input_tokens_seen": 1564088, "step": 97 }, { "epoch": 0.0068647280814661095, "grad_norm": 4.963179588317871, "learning_rate": 9.931417162872155e-05, "loss": 1.2716, "num_input_tokens_seen": 1580040, "step": 98 }, { "epoch": 0.006934776327195356, "grad_norm": 4.920302391052246, "learning_rate": 9.930717338003504e-05, "loss": 1.088, "num_input_tokens_seen": 1595880, "step": 99 }, { "epoch": 0.007004824572924602, "grad_norm": 4.935486793518066, "learning_rate": 9.930017513134851e-05, "loss": 1.0122, "num_input_tokens_seen": 1611864, "step": 100 }, { "epoch": 0.007074872818653848, "grad_norm": 5.099087238311768, "learning_rate": 9.9293176882662e-05, "loss": 1.1605, "num_input_tokens_seen": 1627472, "step": 101 }, { "epoch": 0.007144921064383094, "grad_norm": 5.3764328956604, "learning_rate": 9.928617863397548e-05, "loss": 1.2225, "num_input_tokens_seen": 1643856, "step": 102 }, { "epoch": 0.00721496931011234, "grad_norm": 5.281564712524414, "learning_rate": 9.927918038528898e-05, "loss": 1.1483, "num_input_tokens_seen": 1660240, "step": 103 }, { "epoch": 0.007285017555841586, "grad_norm": 5.395167827606201, "learning_rate": 9.927218213660247e-05, "loss": 1.6014, "num_input_tokens_seen": 1676624, "step": 104 }, { "epoch": 0.007355065801570832, "grad_norm": 5.322319507598877, "learning_rate": 9.926518388791594e-05, "loss": 1.0933, "num_input_tokens_seen": 1693008, "step": 105 }, { "epoch": 0.007425114047300078, "grad_norm": 5.301229953765869, "learning_rate": 9.925818563922943e-05, "loss": 1.1998, "num_input_tokens_seen": 1708424, "step": 106 }, { "epoch": 0.0074951622930293236, "grad_norm": 4.958597183227539, "learning_rate": 9.92511873905429e-05, "loss": 1.3285, "num_input_tokens_seen": 1724808, "step": 107 }, { "epoch": 0.00756521053875857, "grad_norm": 4.3913960456848145, "learning_rate": 9.924418914185639e-05, "loss": 0.9017, "num_input_tokens_seen": 1740752, "step": 108 }, { "epoch": 0.007635258784487816, "grad_norm": 5.401021480560303, "learning_rate": 9.923719089316988e-05, "loss": 1.3646, "num_input_tokens_seen": 1755176, "step": 109 }, { "epoch": 0.007705307030217062, "grad_norm": 4.894444942474365, "learning_rate": 9.923019264448337e-05, "loss": 0.9955, "num_input_tokens_seen": 1771560, "step": 110 }, { "epoch": 0.007775355275946308, "grad_norm": 4.878688335418701, "learning_rate": 9.922319439579686e-05, "loss": 1.1766, "num_input_tokens_seen": 1787944, "step": 111 }, { "epoch": 0.007845403521675554, "grad_norm": 4.9379777908325195, "learning_rate": 9.921619614711033e-05, "loss": 1.1631, "num_input_tokens_seen": 1803568, "step": 112 }, { "epoch": 0.0079154517674048, "grad_norm": 5.101811408996582, "learning_rate": 9.920919789842382e-05, "loss": 1.2165, "num_input_tokens_seen": 1819952, "step": 113 }, { "epoch": 0.007985500013134045, "grad_norm": 5.32574987411499, "learning_rate": 9.920219964973731e-05, "loss": 1.3012, "num_input_tokens_seen": 1835296, "step": 114 }, { "epoch": 0.008055548258863293, "grad_norm": 5.2391180992126465, "learning_rate": 9.919520140105079e-05, "loss": 1.2451, "num_input_tokens_seen": 1851224, "step": 115 }, { "epoch": 0.008125596504592538, "grad_norm": 4.865017890930176, "learning_rate": 9.918820315236427e-05, "loss": 1.1683, "num_input_tokens_seen": 1867608, "step": 116 }, { "epoch": 0.008195644750321784, "grad_norm": 4.943136215209961, "learning_rate": 9.918120490367776e-05, "loss": 1.31, "num_input_tokens_seen": 1883696, "step": 117 }, { "epoch": 0.00826569299605103, "grad_norm": 4.769871711730957, "learning_rate": 9.917420665499125e-05, "loss": 1.1212, "num_input_tokens_seen": 1900080, "step": 118 }, { "epoch": 0.008335741241780275, "grad_norm": 4.785780429840088, "learning_rate": 9.916720840630474e-05, "loss": 1.2415, "num_input_tokens_seen": 1916464, "step": 119 }, { "epoch": 0.008405789487509523, "grad_norm": 4.802333831787109, "learning_rate": 9.916021015761822e-05, "loss": 1.0513, "num_input_tokens_seen": 1932848, "step": 120 }, { "epoch": 0.008475837733238768, "grad_norm": 5.22212553024292, "learning_rate": 9.91532119089317e-05, "loss": 1.2574, "num_input_tokens_seen": 1949232, "step": 121 }, { "epoch": 0.008545885978968014, "grad_norm": 5.104204177856445, "learning_rate": 9.914621366024518e-05, "loss": 1.0436, "num_input_tokens_seen": 1964184, "step": 122 }, { "epoch": 0.00861593422469726, "grad_norm": 5.11055326461792, "learning_rate": 9.913921541155868e-05, "loss": 1.1939, "num_input_tokens_seen": 1980568, "step": 123 }, { "epoch": 0.008685982470426507, "grad_norm": 4.784866809844971, "learning_rate": 9.913221716287216e-05, "loss": 1.2056, "num_input_tokens_seen": 1996952, "step": 124 }, { "epoch": 0.008756030716155752, "grad_norm": 4.763037204742432, "learning_rate": 9.912521891418564e-05, "loss": 1.1403, "num_input_tokens_seen": 2013336, "step": 125 }, { "epoch": 0.008826078961884998, "grad_norm": 4.813408851623535, "learning_rate": 9.911822066549913e-05, "loss": 1.1897, "num_input_tokens_seen": 2029720, "step": 126 }, { "epoch": 0.008896127207614244, "grad_norm": 4.79008674621582, "learning_rate": 9.911122241681261e-05, "loss": 1.2315, "num_input_tokens_seen": 2046104, "step": 127 }, { "epoch": 0.008966175453343491, "grad_norm": 4.843508720397949, "learning_rate": 9.91042241681261e-05, "loss": 1.0883, "num_input_tokens_seen": 2061592, "step": 128 }, { "epoch": 0.009036223699072737, "grad_norm": 4.917592525482178, "learning_rate": 9.909722591943959e-05, "loss": 1.2512, "num_input_tokens_seen": 2077792, "step": 129 }, { "epoch": 0.009106271944801982, "grad_norm": 4.9154133796691895, "learning_rate": 9.909022767075307e-05, "loss": 1.3284, "num_input_tokens_seen": 2094176, "step": 130 }, { "epoch": 0.009176320190531228, "grad_norm": 5.2125420570373535, "learning_rate": 9.908322942206656e-05, "loss": 1.3469, "num_input_tokens_seen": 2110480, "step": 131 }, { "epoch": 0.009246368436260475, "grad_norm": 4.715712547302246, "learning_rate": 9.907623117338004e-05, "loss": 1.0844, "num_input_tokens_seen": 2126864, "step": 132 }, { "epoch": 0.009316416681989721, "grad_norm": 4.805694580078125, "learning_rate": 9.906923292469353e-05, "loss": 1.069, "num_input_tokens_seen": 2142848, "step": 133 }, { "epoch": 0.009386464927718966, "grad_norm": 4.961355209350586, "learning_rate": 9.9062234676007e-05, "loss": 1.3387, "num_input_tokens_seen": 2159232, "step": 134 }, { "epoch": 0.009456513173448212, "grad_norm": 4.582219123840332, "learning_rate": 9.905523642732049e-05, "loss": 1.2013, "num_input_tokens_seen": 2175616, "step": 135 }, { "epoch": 0.009526561419177458, "grad_norm": 5.195998191833496, "learning_rate": 9.904823817863398e-05, "loss": 1.2552, "num_input_tokens_seen": 2191872, "step": 136 }, { "epoch": 0.009596609664906705, "grad_norm": 4.934189319610596, "learning_rate": 9.904123992994747e-05, "loss": 1.2961, "num_input_tokens_seen": 2208208, "step": 137 }, { "epoch": 0.00966665791063595, "grad_norm": 4.981037616729736, "learning_rate": 9.903424168126096e-05, "loss": 1.1546, "num_input_tokens_seen": 2224592, "step": 138 }, { "epoch": 0.009736706156365196, "grad_norm": 5.469496250152588, "learning_rate": 9.902724343257443e-05, "loss": 1.3833, "num_input_tokens_seen": 2240976, "step": 139 }, { "epoch": 0.009806754402094442, "grad_norm": 4.889583587646484, "learning_rate": 9.902024518388792e-05, "loss": 1.2095, "num_input_tokens_seen": 2257360, "step": 140 }, { "epoch": 0.00987680264782369, "grad_norm": 4.532052516937256, "learning_rate": 9.901324693520141e-05, "loss": 1.143, "num_input_tokens_seen": 2272848, "step": 141 }, { "epoch": 0.009946850893552935, "grad_norm": 5.278079032897949, "learning_rate": 9.900624868651488e-05, "loss": 1.2849, "num_input_tokens_seen": 2289232, "step": 142 }, { "epoch": 0.01001689913928218, "grad_norm": 4.549891948699951, "learning_rate": 9.899925043782839e-05, "loss": 1.0482, "num_input_tokens_seen": 2305424, "step": 143 }, { "epoch": 0.010086947385011426, "grad_norm": 4.7777180671691895, "learning_rate": 9.899225218914186e-05, "loss": 1.1926, "num_input_tokens_seen": 2320968, "step": 144 }, { "epoch": 0.010156995630740673, "grad_norm": 4.320313453674316, "learning_rate": 9.898525394045535e-05, "loss": 1.0468, "num_input_tokens_seen": 2337352, "step": 145 }, { "epoch": 0.010227043876469919, "grad_norm": 4.915202617645264, "learning_rate": 9.897825569176882e-05, "loss": 1.1326, "num_input_tokens_seen": 2353064, "step": 146 }, { "epoch": 0.010297092122199165, "grad_norm": 4.569783687591553, "learning_rate": 9.897125744308231e-05, "loss": 0.8586, "num_input_tokens_seen": 2369128, "step": 147 }, { "epoch": 0.01036714036792841, "grad_norm": 4.591664791107178, "learning_rate": 9.89642591943958e-05, "loss": 1.1369, "num_input_tokens_seen": 2385512, "step": 148 }, { "epoch": 0.010437188613657656, "grad_norm": 4.913016319274902, "learning_rate": 9.895726094570929e-05, "loss": 1.1564, "num_input_tokens_seen": 2401208, "step": 149 }, { "epoch": 0.010507236859386903, "grad_norm": 4.908018112182617, "learning_rate": 9.895026269702278e-05, "loss": 1.1247, "num_input_tokens_seen": 2417592, "step": 150 }, { "epoch": 0.010577285105116149, "grad_norm": 4.536910057067871, "learning_rate": 9.894326444833625e-05, "loss": 1.014, "num_input_tokens_seen": 2433976, "step": 151 }, { "epoch": 0.010647333350845395, "grad_norm": 4.899227142333984, "learning_rate": 9.893626619964974e-05, "loss": 1.0418, "num_input_tokens_seen": 2448072, "step": 152 }, { "epoch": 0.01071738159657464, "grad_norm": 4.600861072540283, "learning_rate": 9.892926795096323e-05, "loss": 1.0459, "num_input_tokens_seen": 2464240, "step": 153 }, { "epoch": 0.010787429842303888, "grad_norm": 4.707681179046631, "learning_rate": 9.89222697022767e-05, "loss": 1.0859, "num_input_tokens_seen": 2480624, "step": 154 }, { "epoch": 0.010857478088033133, "grad_norm": 4.748518466949463, "learning_rate": 9.89152714535902e-05, "loss": 1.0608, "num_input_tokens_seen": 2497008, "step": 155 }, { "epoch": 0.010927526333762379, "grad_norm": 4.794179439544678, "learning_rate": 9.890827320490368e-05, "loss": 1.2243, "num_input_tokens_seen": 2513392, "step": 156 }, { "epoch": 0.010997574579491624, "grad_norm": 4.593925476074219, "learning_rate": 9.890127495621717e-05, "loss": 1.1002, "num_input_tokens_seen": 2529776, "step": 157 }, { "epoch": 0.011067622825220872, "grad_norm": 4.318257808685303, "learning_rate": 9.889427670753066e-05, "loss": 0.9561, "num_input_tokens_seen": 2546160, "step": 158 }, { "epoch": 0.011137671070950117, "grad_norm": 4.631777286529541, "learning_rate": 9.888727845884414e-05, "loss": 1.1553, "num_input_tokens_seen": 2562544, "step": 159 }, { "epoch": 0.011207719316679363, "grad_norm": 4.896609783172607, "learning_rate": 9.888028021015762e-05, "loss": 1.1779, "num_input_tokens_seen": 2578088, "step": 160 }, { "epoch": 0.011277767562408609, "grad_norm": 4.3978681564331055, "learning_rate": 9.88732819614711e-05, "loss": 1.1778, "num_input_tokens_seen": 2594416, "step": 161 }, { "epoch": 0.011347815808137854, "grad_norm": 4.82927942276001, "learning_rate": 9.886628371278459e-05, "loss": 1.0339, "num_input_tokens_seen": 2609776, "step": 162 }, { "epoch": 0.011417864053867102, "grad_norm": 4.413319110870361, "learning_rate": 9.885928546409809e-05, "loss": 1.0992, "num_input_tokens_seen": 2626160, "step": 163 }, { "epoch": 0.011487912299596347, "grad_norm": 4.626354694366455, "learning_rate": 9.885228721541156e-05, "loss": 1.1948, "num_input_tokens_seen": 2642464, "step": 164 }, { "epoch": 0.011557960545325593, "grad_norm": 4.328434467315674, "learning_rate": 9.884528896672505e-05, "loss": 1.1493, "num_input_tokens_seen": 2658528, "step": 165 }, { "epoch": 0.011628008791054838, "grad_norm": 4.57839822769165, "learning_rate": 9.883829071803853e-05, "loss": 1.0775, "num_input_tokens_seen": 2674912, "step": 166 }, { "epoch": 0.011698057036784086, "grad_norm": 5.103973865509033, "learning_rate": 9.883129246935202e-05, "loss": 1.2458, "num_input_tokens_seen": 2690792, "step": 167 }, { "epoch": 0.011768105282513331, "grad_norm": 4.558016300201416, "learning_rate": 9.88242942206655e-05, "loss": 1.0122, "num_input_tokens_seen": 2705616, "step": 168 }, { "epoch": 0.011838153528242577, "grad_norm": 4.811260223388672, "learning_rate": 9.8817295971979e-05, "loss": 1.2989, "num_input_tokens_seen": 2721704, "step": 169 }, { "epoch": 0.011908201773971823, "grad_norm": 4.726966857910156, "learning_rate": 9.881029772329248e-05, "loss": 1.176, "num_input_tokens_seen": 2738088, "step": 170 }, { "epoch": 0.01197825001970107, "grad_norm": 4.874902725219727, "learning_rate": 9.880329947460596e-05, "loss": 1.2586, "num_input_tokens_seen": 2754040, "step": 171 }, { "epoch": 0.012048298265430316, "grad_norm": 4.379549980163574, "learning_rate": 9.879630122591945e-05, "loss": 1.1771, "num_input_tokens_seen": 2770424, "step": 172 }, { "epoch": 0.012118346511159561, "grad_norm": 4.455331802368164, "learning_rate": 9.878930297723292e-05, "loss": 1.0714, "num_input_tokens_seen": 2786808, "step": 173 }, { "epoch": 0.012188394756888807, "grad_norm": 4.42273473739624, "learning_rate": 9.878230472854641e-05, "loss": 1.1798, "num_input_tokens_seen": 2803176, "step": 174 }, { "epoch": 0.012258443002618052, "grad_norm": 4.4078874588012695, "learning_rate": 9.87753064798599e-05, "loss": 1.1672, "num_input_tokens_seen": 2819448, "step": 175 }, { "epoch": 0.0123284912483473, "grad_norm": 4.79048490524292, "learning_rate": 9.876830823117339e-05, "loss": 1.3331, "num_input_tokens_seen": 2835832, "step": 176 }, { "epoch": 0.012398539494076545, "grad_norm": 4.212133884429932, "learning_rate": 9.876130998248688e-05, "loss": 1.0007, "num_input_tokens_seen": 2851776, "step": 177 }, { "epoch": 0.012468587739805791, "grad_norm": 5.7587738037109375, "learning_rate": 9.875431173380035e-05, "loss": 1.4729, "num_input_tokens_seen": 2867896, "step": 178 }, { "epoch": 0.012538635985535037, "grad_norm": 4.3469462394714355, "learning_rate": 9.874731348511384e-05, "loss": 0.957, "num_input_tokens_seen": 2884280, "step": 179 }, { "epoch": 0.012608684231264284, "grad_norm": 4.584625244140625, "learning_rate": 9.874031523642733e-05, "loss": 1.0753, "num_input_tokens_seen": 2899208, "step": 180 }, { "epoch": 0.01267873247699353, "grad_norm": 4.544627666473389, "learning_rate": 9.87333169877408e-05, "loss": 1.1706, "num_input_tokens_seen": 2915416, "step": 181 }, { "epoch": 0.012748780722722775, "grad_norm": 4.8749237060546875, "learning_rate": 9.872631873905429e-05, "loss": 1.3382, "num_input_tokens_seen": 2931360, "step": 182 }, { "epoch": 0.01281882896845202, "grad_norm": 4.593903541564941, "learning_rate": 9.871932049036778e-05, "loss": 1.1588, "num_input_tokens_seen": 2947744, "step": 183 }, { "epoch": 0.012888877214181268, "grad_norm": 4.478219509124756, "learning_rate": 9.871232224168127e-05, "loss": 1.1013, "num_input_tokens_seen": 2963664, "step": 184 }, { "epoch": 0.012958925459910514, "grad_norm": 5.028106212615967, "learning_rate": 9.870532399299476e-05, "loss": 1.3223, "num_input_tokens_seen": 2980048, "step": 185 }, { "epoch": 0.01302897370563976, "grad_norm": 4.866946697235107, "learning_rate": 9.869832574430823e-05, "loss": 1.2376, "num_input_tokens_seen": 2995992, "step": 186 }, { "epoch": 0.013099021951369005, "grad_norm": 4.421341419219971, "learning_rate": 9.869132749562172e-05, "loss": 1.2252, "num_input_tokens_seen": 3012000, "step": 187 }, { "epoch": 0.01316907019709825, "grad_norm": 4.88083028793335, "learning_rate": 9.86843292469352e-05, "loss": 1.2951, "num_input_tokens_seen": 3028384, "step": 188 }, { "epoch": 0.013239118442827498, "grad_norm": 4.654318809509277, "learning_rate": 9.86773309982487e-05, "loss": 1.2839, "num_input_tokens_seen": 3044768, "step": 189 }, { "epoch": 0.013309166688556744, "grad_norm": 4.626763820648193, "learning_rate": 9.867033274956219e-05, "loss": 1.2389, "num_input_tokens_seen": 3061152, "step": 190 }, { "epoch": 0.01337921493428599, "grad_norm": 4.178484916687012, "learning_rate": 9.866333450087566e-05, "loss": 1.1186, "num_input_tokens_seen": 3077056, "step": 191 }, { "epoch": 0.013449263180015235, "grad_norm": 4.755034923553467, "learning_rate": 9.865633625218915e-05, "loss": 1.0594, "num_input_tokens_seen": 3093400, "step": 192 }, { "epoch": 0.013519311425744482, "grad_norm": 4.437506198883057, "learning_rate": 9.864933800350263e-05, "loss": 1.2078, "num_input_tokens_seen": 3109784, "step": 193 }, { "epoch": 0.013589359671473728, "grad_norm": 5.140488624572754, "learning_rate": 9.864233975481611e-05, "loss": 1.4312, "num_input_tokens_seen": 3124976, "step": 194 }, { "epoch": 0.013659407917202973, "grad_norm": 4.72155237197876, "learning_rate": 9.86353415061296e-05, "loss": 1.1752, "num_input_tokens_seen": 3140632, "step": 195 }, { "epoch": 0.013729456162932219, "grad_norm": 4.914645671844482, "learning_rate": 9.862834325744309e-05, "loss": 1.2464, "num_input_tokens_seen": 3156616, "step": 196 }, { "epoch": 0.013799504408661466, "grad_norm": 4.23387336730957, "learning_rate": 9.862134500875658e-05, "loss": 0.9722, "num_input_tokens_seen": 3172840, "step": 197 }, { "epoch": 0.013869552654390712, "grad_norm": 4.659370422363281, "learning_rate": 9.861434676007005e-05, "loss": 1.1981, "num_input_tokens_seen": 3188584, "step": 198 }, { "epoch": 0.013939600900119958, "grad_norm": 4.580902576446533, "learning_rate": 9.860734851138354e-05, "loss": 1.1913, "num_input_tokens_seen": 3204432, "step": 199 }, { "epoch": 0.014009649145849203, "grad_norm": 4.208237648010254, "learning_rate": 9.860035026269702e-05, "loss": 1.2056, "num_input_tokens_seen": 3220816, "step": 200 }, { "epoch": 0.014009649145849203, "eval_loss": 1.2226407527923584, "eval_runtime": 0.3992, "eval_samples_per_second": 2.505, "eval_steps_per_second": 2.505, "num_input_tokens_seen": 3220816, "step": 200 }, { "epoch": 0.014079697391578449, "grad_norm": 4.526260852813721, "learning_rate": 9.85933520140105e-05, "loss": 1.0488, "num_input_tokens_seen": 3237200, "step": 201 }, { "epoch": 0.014149745637307696, "grad_norm": 4.46895170211792, "learning_rate": 9.8586353765324e-05, "loss": 1.1101, "num_input_tokens_seen": 3253336, "step": 202 }, { "epoch": 0.014219793883036942, "grad_norm": 4.367347717285156, "learning_rate": 9.857935551663748e-05, "loss": 1.0425, "num_input_tokens_seen": 3269632, "step": 203 }, { "epoch": 0.014289842128766187, "grad_norm": 4.860860347747803, "learning_rate": 9.857235726795097e-05, "loss": 1.4068, "num_input_tokens_seen": 3285432, "step": 204 }, { "epoch": 0.014359890374495433, "grad_norm": 4.336480617523193, "learning_rate": 9.856535901926445e-05, "loss": 1.2579, "num_input_tokens_seen": 3301632, "step": 205 }, { "epoch": 0.01442993862022468, "grad_norm": 4.587873458862305, "learning_rate": 9.855836077057794e-05, "loss": 1.1508, "num_input_tokens_seen": 3318016, "step": 206 }, { "epoch": 0.014499986865953926, "grad_norm": 4.719262599945068, "learning_rate": 9.855136252189142e-05, "loss": 1.0208, "num_input_tokens_seen": 3333168, "step": 207 }, { "epoch": 0.014570035111683172, "grad_norm": 4.419138431549072, "learning_rate": 9.85443642732049e-05, "loss": 1.2576, "num_input_tokens_seen": 3349384, "step": 208 }, { "epoch": 0.014640083357412417, "grad_norm": 4.3150835037231445, "learning_rate": 9.85373660245184e-05, "loss": 1.1786, "num_input_tokens_seen": 3365768, "step": 209 }, { "epoch": 0.014710131603141665, "grad_norm": 4.5917649269104, "learning_rate": 9.853036777583188e-05, "loss": 1.2821, "num_input_tokens_seen": 3382152, "step": 210 }, { "epoch": 0.01478017984887091, "grad_norm": 4.9094343185424805, "learning_rate": 9.852336952714537e-05, "loss": 1.2415, "num_input_tokens_seen": 3397896, "step": 211 }, { "epoch": 0.014850228094600156, "grad_norm": 4.394861698150635, "learning_rate": 9.851637127845885e-05, "loss": 1.1776, "num_input_tokens_seen": 3414280, "step": 212 }, { "epoch": 0.014920276340329401, "grad_norm": 4.196374416351318, "learning_rate": 9.850937302977233e-05, "loss": 1.065, "num_input_tokens_seen": 3430584, "step": 213 }, { "epoch": 0.014990324586058647, "grad_norm": 4.728682518005371, "learning_rate": 9.850237478108582e-05, "loss": 1.2686, "num_input_tokens_seen": 3446968, "step": 214 }, { "epoch": 0.015060372831787894, "grad_norm": 4.291411876678467, "learning_rate": 9.84953765323993e-05, "loss": 1.1877, "num_input_tokens_seen": 3462568, "step": 215 }, { "epoch": 0.01513042107751714, "grad_norm": 4.405060768127441, "learning_rate": 9.84883782837128e-05, "loss": 1.2873, "num_input_tokens_seen": 3478952, "step": 216 }, { "epoch": 0.015200469323246386, "grad_norm": 4.254365921020508, "learning_rate": 9.848138003502628e-05, "loss": 1.1062, "num_input_tokens_seen": 3495304, "step": 217 }, { "epoch": 0.015270517568975631, "grad_norm": 4.741672039031982, "learning_rate": 9.847438178633976e-05, "loss": 1.1983, "num_input_tokens_seen": 3511688, "step": 218 }, { "epoch": 0.015340565814704879, "grad_norm": 4.352742671966553, "learning_rate": 9.846738353765325e-05, "loss": 1.2028, "num_input_tokens_seen": 3528072, "step": 219 }, { "epoch": 0.015410614060434124, "grad_norm": 4.996603488922119, "learning_rate": 9.846038528896672e-05, "loss": 1.1561, "num_input_tokens_seen": 3542904, "step": 220 }, { "epoch": 0.01548066230616337, "grad_norm": 4.911815166473389, "learning_rate": 9.845338704028021e-05, "loss": 1.3375, "num_input_tokens_seen": 3558352, "step": 221 }, { "epoch": 0.015550710551892616, "grad_norm": 4.638419151306152, "learning_rate": 9.84463887915937e-05, "loss": 1.1963, "num_input_tokens_seen": 3574736, "step": 222 }, { "epoch": 0.015620758797621863, "grad_norm": 4.323521614074707, "learning_rate": 9.843939054290719e-05, "loss": 1.1224, "num_input_tokens_seen": 3591120, "step": 223 }, { "epoch": 0.01569080704335111, "grad_norm": 4.466544151306152, "learning_rate": 9.843239229422068e-05, "loss": 1.3988, "num_input_tokens_seen": 3607392, "step": 224 }, { "epoch": 0.015760855289080354, "grad_norm": 4.476973533630371, "learning_rate": 9.842539404553415e-05, "loss": 1.184, "num_input_tokens_seen": 3623776, "step": 225 }, { "epoch": 0.0158309035348096, "grad_norm": 4.648625373840332, "learning_rate": 9.841839579684764e-05, "loss": 1.1768, "num_input_tokens_seen": 3640008, "step": 226 }, { "epoch": 0.015900951780538845, "grad_norm": 4.364476203918457, "learning_rate": 9.841139754816112e-05, "loss": 1.0208, "num_input_tokens_seen": 3656392, "step": 227 }, { "epoch": 0.01597100002626809, "grad_norm": 4.3054633140563965, "learning_rate": 9.84043992994746e-05, "loss": 1.1215, "num_input_tokens_seen": 3672392, "step": 228 }, { "epoch": 0.016041048271997337, "grad_norm": 4.83436918258667, "learning_rate": 9.83974010507881e-05, "loss": 1.2284, "num_input_tokens_seen": 3688776, "step": 229 }, { "epoch": 0.016111096517726586, "grad_norm": 4.447519779205322, "learning_rate": 9.839040280210158e-05, "loss": 1.1765, "num_input_tokens_seen": 3705080, "step": 230 }, { "epoch": 0.01618114476345583, "grad_norm": 4.269217491149902, "learning_rate": 9.838340455341507e-05, "loss": 1.0466, "num_input_tokens_seen": 3721464, "step": 231 }, { "epoch": 0.016251193009185077, "grad_norm": 4.41223669052124, "learning_rate": 9.837640630472854e-05, "loss": 1.2098, "num_input_tokens_seen": 3737184, "step": 232 }, { "epoch": 0.016321241254914323, "grad_norm": 4.632737159729004, "learning_rate": 9.836940805604203e-05, "loss": 1.1562, "num_input_tokens_seen": 3753192, "step": 233 }, { "epoch": 0.016391289500643568, "grad_norm": 4.379425525665283, "learning_rate": 9.836240980735552e-05, "loss": 1.1219, "num_input_tokens_seen": 3767976, "step": 234 }, { "epoch": 0.016461337746372814, "grad_norm": 4.28551721572876, "learning_rate": 9.835541155866901e-05, "loss": 1.0259, "num_input_tokens_seen": 3784008, "step": 235 }, { "epoch": 0.01653138599210206, "grad_norm": 4.642453670501709, "learning_rate": 9.83484133099825e-05, "loss": 1.1684, "num_input_tokens_seen": 3800000, "step": 236 }, { "epoch": 0.016601434237831305, "grad_norm": 4.367178440093994, "learning_rate": 9.834141506129597e-05, "loss": 1.2877, "num_input_tokens_seen": 3816384, "step": 237 }, { "epoch": 0.01667148248356055, "grad_norm": 4.5724005699157715, "learning_rate": 9.833441681260946e-05, "loss": 1.1814, "num_input_tokens_seen": 3830328, "step": 238 }, { "epoch": 0.0167415307292898, "grad_norm": 4.318159580230713, "learning_rate": 9.832741856392295e-05, "loss": 1.1143, "num_input_tokens_seen": 3846712, "step": 239 }, { "epoch": 0.016811578975019045, "grad_norm": 4.408501625061035, "learning_rate": 9.832042031523643e-05, "loss": 1.1508, "num_input_tokens_seen": 3861776, "step": 240 }, { "epoch": 0.01688162722074829, "grad_norm": 4.20060920715332, "learning_rate": 9.831342206654991e-05, "loss": 1.209, "num_input_tokens_seen": 3877736, "step": 241 }, { "epoch": 0.016951675466477537, "grad_norm": 4.431649208068848, "learning_rate": 9.83064238178634e-05, "loss": 1.2458, "num_input_tokens_seen": 3893320, "step": 242 }, { "epoch": 0.017021723712206782, "grad_norm": 4.000490188598633, "learning_rate": 9.829942556917689e-05, "loss": 1.0274, "num_input_tokens_seen": 3909704, "step": 243 }, { "epoch": 0.017091771957936028, "grad_norm": 4.703495025634766, "learning_rate": 9.829242732049038e-05, "loss": 1.1711, "num_input_tokens_seen": 3925808, "step": 244 }, { "epoch": 0.017161820203665273, "grad_norm": 4.639338970184326, "learning_rate": 9.828542907180386e-05, "loss": 1.3046, "num_input_tokens_seen": 3942192, "step": 245 }, { "epoch": 0.01723186844939452, "grad_norm": 4.414276599884033, "learning_rate": 9.827843082311734e-05, "loss": 1.271, "num_input_tokens_seen": 3958528, "step": 246 }, { "epoch": 0.017301916695123768, "grad_norm": 4.404853820800781, "learning_rate": 9.827143257443082e-05, "loss": 1.0693, "num_input_tokens_seen": 3974912, "step": 247 }, { "epoch": 0.017371964940853014, "grad_norm": 4.519491195678711, "learning_rate": 9.826443432574431e-05, "loss": 1.2894, "num_input_tokens_seen": 3991296, "step": 248 }, { "epoch": 0.01744201318658226, "grad_norm": 4.261727809906006, "learning_rate": 9.825743607705781e-05, "loss": 1.2059, "num_input_tokens_seen": 4006544, "step": 249 }, { "epoch": 0.017512061432311505, "grad_norm": 4.102485656738281, "learning_rate": 9.825043782837129e-05, "loss": 0.9365, "num_input_tokens_seen": 4022320, "step": 250 }, { "epoch": 0.01758210967804075, "grad_norm": 4.804764270782471, "learning_rate": 9.824343957968477e-05, "loss": 1.3344, "num_input_tokens_seen": 4037048, "step": 251 }, { "epoch": 0.017652157923769996, "grad_norm": 4.130600452423096, "learning_rate": 9.823644133099825e-05, "loss": 1.2349, "num_input_tokens_seen": 4053432, "step": 252 }, { "epoch": 0.017722206169499242, "grad_norm": 4.234742641448975, "learning_rate": 9.822944308231174e-05, "loss": 1.1371, "num_input_tokens_seen": 4069816, "step": 253 }, { "epoch": 0.017792254415228487, "grad_norm": 4.754928112030029, "learning_rate": 9.822244483362521e-05, "loss": 1.5168, "num_input_tokens_seen": 4085864, "step": 254 }, { "epoch": 0.017862302660957733, "grad_norm": 4.542768478393555, "learning_rate": 9.821544658493871e-05, "loss": 1.1943, "num_input_tokens_seen": 4102240, "step": 255 }, { "epoch": 0.017932350906686982, "grad_norm": 4.411310195922852, "learning_rate": 9.82084483362522e-05, "loss": 1.2694, "num_input_tokens_seen": 4118544, "step": 256 }, { "epoch": 0.018002399152416228, "grad_norm": 4.205377101898193, "learning_rate": 9.820145008756568e-05, "loss": 1.1581, "num_input_tokens_seen": 4134928, "step": 257 }, { "epoch": 0.018072447398145473, "grad_norm": 4.451165199279785, "learning_rate": 9.819445183887917e-05, "loss": 1.089, "num_input_tokens_seen": 4150848, "step": 258 }, { "epoch": 0.01814249564387472, "grad_norm": 4.366336822509766, "learning_rate": 9.818745359019264e-05, "loss": 1.1767, "num_input_tokens_seen": 4167184, "step": 259 }, { "epoch": 0.018212543889603965, "grad_norm": 4.394649982452393, "learning_rate": 9.818045534150613e-05, "loss": 1.0741, "num_input_tokens_seen": 4183376, "step": 260 }, { "epoch": 0.01828259213533321, "grad_norm": 4.344518184661865, "learning_rate": 9.817345709281962e-05, "loss": 1.2282, "num_input_tokens_seen": 4199760, "step": 261 }, { "epoch": 0.018352640381062456, "grad_norm": 4.403041362762451, "learning_rate": 9.816645884413311e-05, "loss": 1.2317, "num_input_tokens_seen": 4215816, "step": 262 }, { "epoch": 0.0184226886267917, "grad_norm": 4.715320110321045, "learning_rate": 9.81594605954466e-05, "loss": 1.3074, "num_input_tokens_seen": 4231504, "step": 263 }, { "epoch": 0.01849273687252095, "grad_norm": 4.5754265785217285, "learning_rate": 9.815246234676007e-05, "loss": 1.253, "num_input_tokens_seen": 4247888, "step": 264 }, { "epoch": 0.018562785118250196, "grad_norm": 4.2346930503845215, "learning_rate": 9.814546409807356e-05, "loss": 1.1727, "num_input_tokens_seen": 4264248, "step": 265 }, { "epoch": 0.018632833363979442, "grad_norm": 4.186713218688965, "learning_rate": 9.813846584938705e-05, "loss": 1.2693, "num_input_tokens_seen": 4280632, "step": 266 }, { "epoch": 0.018702881609708687, "grad_norm": 4.6356706619262695, "learning_rate": 9.813146760070052e-05, "loss": 1.3755, "num_input_tokens_seen": 4296648, "step": 267 }, { "epoch": 0.018772929855437933, "grad_norm": 4.466466903686523, "learning_rate": 9.812446935201401e-05, "loss": 1.283, "num_input_tokens_seen": 4311408, "step": 268 }, { "epoch": 0.01884297810116718, "grad_norm": 4.3369140625, "learning_rate": 9.81174711033275e-05, "loss": 1.1555, "num_input_tokens_seen": 4326736, "step": 269 }, { "epoch": 0.018913026346896424, "grad_norm": 4.434782028198242, "learning_rate": 9.811047285464099e-05, "loss": 1.2859, "num_input_tokens_seen": 4343120, "step": 270 }, { "epoch": 0.01898307459262567, "grad_norm": 4.346708297729492, "learning_rate": 9.810347460595448e-05, "loss": 1.1421, "num_input_tokens_seen": 4359504, "step": 271 }, { "epoch": 0.019053122838354915, "grad_norm": 4.529878616333008, "learning_rate": 9.809647635726795e-05, "loss": 1.2654, "num_input_tokens_seen": 4375888, "step": 272 }, { "epoch": 0.019123171084084165, "grad_norm": 4.051745891571045, "learning_rate": 9.808947810858144e-05, "loss": 1.1469, "num_input_tokens_seen": 4392224, "step": 273 }, { "epoch": 0.01919321932981341, "grad_norm": 4.403522491455078, "learning_rate": 9.808247985989492e-05, "loss": 1.233, "num_input_tokens_seen": 4408608, "step": 274 }, { "epoch": 0.019263267575542656, "grad_norm": 4.166261196136475, "learning_rate": 9.807548161120842e-05, "loss": 1.1697, "num_input_tokens_seen": 4424992, "step": 275 }, { "epoch": 0.0193333158212719, "grad_norm": 4.29187536239624, "learning_rate": 9.806848336252191e-05, "loss": 1.0503, "num_input_tokens_seen": 4441376, "step": 276 }, { "epoch": 0.019403364067001147, "grad_norm": 4.4056172370910645, "learning_rate": 9.806148511383538e-05, "loss": 1.1965, "num_input_tokens_seen": 4457760, "step": 277 }, { "epoch": 0.019473412312730393, "grad_norm": 4.355875015258789, "learning_rate": 9.805448686514887e-05, "loss": 1.1024, "num_input_tokens_seen": 4474144, "step": 278 }, { "epoch": 0.019543460558459638, "grad_norm": 4.46420955657959, "learning_rate": 9.804748861646235e-05, "loss": 1.203, "num_input_tokens_seen": 4488912, "step": 279 }, { "epoch": 0.019613508804188884, "grad_norm": 4.48052453994751, "learning_rate": 9.804049036777583e-05, "loss": 1.2089, "num_input_tokens_seen": 4505296, "step": 280 }, { "epoch": 0.01968355704991813, "grad_norm": 4.458749294281006, "learning_rate": 9.803349211908932e-05, "loss": 1.1557, "num_input_tokens_seen": 4520576, "step": 281 }, { "epoch": 0.01975360529564738, "grad_norm": 4.551771640777588, "learning_rate": 9.802649387040281e-05, "loss": 1.1671, "num_input_tokens_seen": 4536960, "step": 282 }, { "epoch": 0.019823653541376624, "grad_norm": 4.038064956665039, "learning_rate": 9.80194956217163e-05, "loss": 1.1562, "num_input_tokens_seen": 4553344, "step": 283 }, { "epoch": 0.01989370178710587, "grad_norm": 4.647075653076172, "learning_rate": 9.801249737302978e-05, "loss": 1.3069, "num_input_tokens_seen": 4568928, "step": 284 }, { "epoch": 0.019963750032835115, "grad_norm": 4.258941650390625, "learning_rate": 9.800549912434326e-05, "loss": 1.0349, "num_input_tokens_seen": 4585312, "step": 285 }, { "epoch": 0.02003379827856436, "grad_norm": 4.348769664764404, "learning_rate": 9.799850087565674e-05, "loss": 1.1163, "num_input_tokens_seen": 4601696, "step": 286 }, { "epoch": 0.020103846524293607, "grad_norm": 4.105901718139648, "learning_rate": 9.799150262697023e-05, "loss": 1.0313, "num_input_tokens_seen": 4617312, "step": 287 }, { "epoch": 0.020173894770022852, "grad_norm": 4.079495429992676, "learning_rate": 9.798450437828372e-05, "loss": 1.0828, "num_input_tokens_seen": 4633696, "step": 288 }, { "epoch": 0.020243943015752098, "grad_norm": 4.03472375869751, "learning_rate": 9.79775061295972e-05, "loss": 0.9475, "num_input_tokens_seen": 4650080, "step": 289 }, { "epoch": 0.020313991261481347, "grad_norm": 4.077049732208252, "learning_rate": 9.797050788091069e-05, "loss": 1.1323, "num_input_tokens_seen": 4666328, "step": 290 }, { "epoch": 0.020384039507210593, "grad_norm": 4.086606025695801, "learning_rate": 9.796350963222417e-05, "loss": 1.1218, "num_input_tokens_seen": 4682256, "step": 291 }, { "epoch": 0.020454087752939838, "grad_norm": 4.296900749206543, "learning_rate": 9.795651138353766e-05, "loss": 1.2964, "num_input_tokens_seen": 4698640, "step": 292 }, { "epoch": 0.020524135998669084, "grad_norm": 4.040759086608887, "learning_rate": 9.794951313485115e-05, "loss": 1.1077, "num_input_tokens_seen": 4714928, "step": 293 }, { "epoch": 0.02059418424439833, "grad_norm": 3.8260273933410645, "learning_rate": 9.794251488616462e-05, "loss": 0.9667, "num_input_tokens_seen": 4731312, "step": 294 }, { "epoch": 0.020664232490127575, "grad_norm": 4.294517993927002, "learning_rate": 9.793551663747811e-05, "loss": 1.2704, "num_input_tokens_seen": 4747544, "step": 295 }, { "epoch": 0.02073428073585682, "grad_norm": 4.206037521362305, "learning_rate": 9.79285183887916e-05, "loss": 1.1593, "num_input_tokens_seen": 4763928, "step": 296 }, { "epoch": 0.020804328981586066, "grad_norm": 4.147867202758789, "learning_rate": 9.792152014010509e-05, "loss": 1.1256, "num_input_tokens_seen": 4780312, "step": 297 }, { "epoch": 0.020874377227315312, "grad_norm": 4.23718786239624, "learning_rate": 9.791452189141857e-05, "loss": 1.2353, "num_input_tokens_seen": 4796384, "step": 298 }, { "epoch": 0.02094442547304456, "grad_norm": 4.172685146331787, "learning_rate": 9.790752364273205e-05, "loss": 1.1868, "num_input_tokens_seen": 4812768, "step": 299 }, { "epoch": 0.021014473718773807, "grad_norm": 4.167289733886719, "learning_rate": 9.790052539404554e-05, "loss": 1.0606, "num_input_tokens_seen": 4829152, "step": 300 }, { "epoch": 0.021084521964503052, "grad_norm": 4.096963882446289, "learning_rate": 9.789352714535903e-05, "loss": 1.0557, "num_input_tokens_seen": 4845384, "step": 301 }, { "epoch": 0.021154570210232298, "grad_norm": 4.223779678344727, "learning_rate": 9.788652889667252e-05, "loss": 1.1485, "num_input_tokens_seen": 4861768, "step": 302 }, { "epoch": 0.021224618455961543, "grad_norm": 3.8243472576141357, "learning_rate": 9.7879530647986e-05, "loss": 1.004, "num_input_tokens_seen": 4878152, "step": 303 }, { "epoch": 0.02129466670169079, "grad_norm": 4.092590808868408, "learning_rate": 9.787253239929948e-05, "loss": 1.0211, "num_input_tokens_seen": 4894536, "step": 304 }, { "epoch": 0.021364714947420035, "grad_norm": 4.42412805557251, "learning_rate": 9.786553415061297e-05, "loss": 0.9915, "num_input_tokens_seen": 4910320, "step": 305 }, { "epoch": 0.02143476319314928, "grad_norm": 4.488316535949707, "learning_rate": 9.785853590192644e-05, "loss": 1.1782, "num_input_tokens_seen": 4926704, "step": 306 }, { "epoch": 0.021504811438878526, "grad_norm": 4.110256195068359, "learning_rate": 9.785153765323993e-05, "loss": 1.102, "num_input_tokens_seen": 4943088, "step": 307 }, { "epoch": 0.021574859684607775, "grad_norm": 4.246950149536133, "learning_rate": 9.784453940455342e-05, "loss": 1.067, "num_input_tokens_seen": 4958736, "step": 308 }, { "epoch": 0.02164490793033702, "grad_norm": 4.175214767456055, "learning_rate": 9.783754115586691e-05, "loss": 1.0638, "num_input_tokens_seen": 4975120, "step": 309 }, { "epoch": 0.021714956176066266, "grad_norm": 4.427795886993408, "learning_rate": 9.78305429071804e-05, "loss": 1.1347, "num_input_tokens_seen": 4991504, "step": 310 }, { "epoch": 0.021785004421795512, "grad_norm": 4.158191204071045, "learning_rate": 9.782354465849387e-05, "loss": 1.1662, "num_input_tokens_seen": 5007152, "step": 311 }, { "epoch": 0.021855052667524758, "grad_norm": 4.184347629547119, "learning_rate": 9.781654640980736e-05, "loss": 1.0791, "num_input_tokens_seen": 5023536, "step": 312 }, { "epoch": 0.021925100913254003, "grad_norm": 3.8506295680999756, "learning_rate": 9.780954816112084e-05, "loss": 1.0615, "num_input_tokens_seen": 5039728, "step": 313 }, { "epoch": 0.02199514915898325, "grad_norm": 4.310062408447266, "learning_rate": 9.780254991243432e-05, "loss": 1.1363, "num_input_tokens_seen": 5056008, "step": 314 }, { "epoch": 0.022065197404712494, "grad_norm": 4.215006351470947, "learning_rate": 9.779555166374781e-05, "loss": 1.1715, "num_input_tokens_seen": 5072096, "step": 315 }, { "epoch": 0.022135245650441743, "grad_norm": 4.219073295593262, "learning_rate": 9.77885534150613e-05, "loss": 1.219, "num_input_tokens_seen": 5088432, "step": 316 }, { "epoch": 0.02220529389617099, "grad_norm": 4.319522857666016, "learning_rate": 9.778155516637479e-05, "loss": 1.3085, "num_input_tokens_seen": 5104240, "step": 317 }, { "epoch": 0.022275342141900235, "grad_norm": 4.118961334228516, "learning_rate": 9.777455691768827e-05, "loss": 1.0926, "num_input_tokens_seen": 5120624, "step": 318 }, { "epoch": 0.02234539038762948, "grad_norm": 4.195051193237305, "learning_rate": 9.776755866900175e-05, "loss": 1.0894, "num_input_tokens_seen": 5137008, "step": 319 }, { "epoch": 0.022415438633358726, "grad_norm": 4.114197254180908, "learning_rate": 9.776056042031524e-05, "loss": 1.1897, "num_input_tokens_seen": 5153272, "step": 320 }, { "epoch": 0.02248548687908797, "grad_norm": 4.014908313751221, "learning_rate": 9.775356217162872e-05, "loss": 1.0932, "num_input_tokens_seen": 5169472, "step": 321 }, { "epoch": 0.022555535124817217, "grad_norm": 4.190642356872559, "learning_rate": 9.774656392294222e-05, "loss": 1.1413, "num_input_tokens_seen": 5185856, "step": 322 }, { "epoch": 0.022625583370546463, "grad_norm": 4.562993049621582, "learning_rate": 9.77395656742557e-05, "loss": 1.2865, "num_input_tokens_seen": 5202240, "step": 323 }, { "epoch": 0.02269563161627571, "grad_norm": 4.607022762298584, "learning_rate": 9.773256742556918e-05, "loss": 1.1465, "num_input_tokens_seen": 5218168, "step": 324 }, { "epoch": 0.022765679862004957, "grad_norm": 3.956439256668091, "learning_rate": 9.772556917688267e-05, "loss": 1.028, "num_input_tokens_seen": 5234368, "step": 325 }, { "epoch": 0.022835728107734203, "grad_norm": 4.20713472366333, "learning_rate": 9.771857092819615e-05, "loss": 1.2332, "num_input_tokens_seen": 5249808, "step": 326 }, { "epoch": 0.02290577635346345, "grad_norm": 4.4092864990234375, "learning_rate": 9.771157267950964e-05, "loss": 1.104, "num_input_tokens_seen": 5266120, "step": 327 }, { "epoch": 0.022975824599192694, "grad_norm": 4.529845237731934, "learning_rate": 9.770457443082312e-05, "loss": 1.3475, "num_input_tokens_seen": 5282504, "step": 328 }, { "epoch": 0.02304587284492194, "grad_norm": 4.221986293792725, "learning_rate": 9.769757618213661e-05, "loss": 1.4115, "num_input_tokens_seen": 5298344, "step": 329 }, { "epoch": 0.023115921090651186, "grad_norm": 4.29000186920166, "learning_rate": 9.76905779334501e-05, "loss": 1.2855, "num_input_tokens_seen": 5314728, "step": 330 }, { "epoch": 0.02318596933638043, "grad_norm": 4.426812648773193, "learning_rate": 9.768357968476358e-05, "loss": 1.514, "num_input_tokens_seen": 5330816, "step": 331 }, { "epoch": 0.023256017582109677, "grad_norm": 4.210752964019775, "learning_rate": 9.767658143607706e-05, "loss": 1.0854, "num_input_tokens_seen": 5346552, "step": 332 }, { "epoch": 0.023326065827838922, "grad_norm": 4.216427326202393, "learning_rate": 9.766958318739054e-05, "loss": 1.1573, "num_input_tokens_seen": 5362936, "step": 333 }, { "epoch": 0.02339611407356817, "grad_norm": 4.132325649261475, "learning_rate": 9.766258493870403e-05, "loss": 1.0942, "num_input_tokens_seen": 5379320, "step": 334 }, { "epoch": 0.023466162319297417, "grad_norm": 4.277027130126953, "learning_rate": 9.765558669001752e-05, "loss": 1.1227, "num_input_tokens_seen": 5395704, "step": 335 }, { "epoch": 0.023536210565026663, "grad_norm": 4.228096961975098, "learning_rate": 9.7648588441331e-05, "loss": 1.1094, "num_input_tokens_seen": 5412088, "step": 336 }, { "epoch": 0.02360625881075591, "grad_norm": 4.194522380828857, "learning_rate": 9.76415901926445e-05, "loss": 1.2066, "num_input_tokens_seen": 5428472, "step": 337 }, { "epoch": 0.023676307056485154, "grad_norm": 4.336326599121094, "learning_rate": 9.763459194395797e-05, "loss": 1.2251, "num_input_tokens_seen": 5444856, "step": 338 }, { "epoch": 0.0237463553022144, "grad_norm": 4.2723307609558105, "learning_rate": 9.762759369527146e-05, "loss": 1.0927, "num_input_tokens_seen": 5460304, "step": 339 }, { "epoch": 0.023816403547943645, "grad_norm": 4.190036773681641, "learning_rate": 9.762059544658493e-05, "loss": 1.2036, "num_input_tokens_seen": 5476688, "step": 340 }, { "epoch": 0.02388645179367289, "grad_norm": 4.477560043334961, "learning_rate": 9.761359719789842e-05, "loss": 1.362, "num_input_tokens_seen": 5493072, "step": 341 }, { "epoch": 0.02395650003940214, "grad_norm": 4.160232067108154, "learning_rate": 9.760659894921192e-05, "loss": 1.1602, "num_input_tokens_seen": 5509456, "step": 342 }, { "epoch": 0.024026548285131386, "grad_norm": 3.857335090637207, "learning_rate": 9.75996007005254e-05, "loss": 1.0963, "num_input_tokens_seen": 5525840, "step": 343 }, { "epoch": 0.02409659653086063, "grad_norm": 4.141246318817139, "learning_rate": 9.759260245183889e-05, "loss": 1.2009, "num_input_tokens_seen": 5541888, "step": 344 }, { "epoch": 0.024166644776589877, "grad_norm": 4.50364875793457, "learning_rate": 9.758560420315236e-05, "loss": 1.1483, "num_input_tokens_seen": 5557848, "step": 345 }, { "epoch": 0.024236693022319122, "grad_norm": 4.3343353271484375, "learning_rate": 9.757860595446585e-05, "loss": 1.3594, "num_input_tokens_seen": 5573504, "step": 346 }, { "epoch": 0.024306741268048368, "grad_norm": 4.050408363342285, "learning_rate": 9.757160770577934e-05, "loss": 1.0563, "num_input_tokens_seen": 5589544, "step": 347 }, { "epoch": 0.024376789513777614, "grad_norm": 4.051811695098877, "learning_rate": 9.756460945709283e-05, "loss": 1.0288, "num_input_tokens_seen": 5605368, "step": 348 }, { "epoch": 0.02444683775950686, "grad_norm": 4.365113258361816, "learning_rate": 9.755761120840632e-05, "loss": 1.3054, "num_input_tokens_seen": 5621752, "step": 349 }, { "epoch": 0.024516886005236105, "grad_norm": 4.0057501792907715, "learning_rate": 9.755061295971979e-05, "loss": 1.1302, "num_input_tokens_seen": 5638136, "step": 350 }, { "epoch": 0.024586934250965354, "grad_norm": 4.254896640777588, "learning_rate": 9.754361471103328e-05, "loss": 1.0495, "num_input_tokens_seen": 5653168, "step": 351 }, { "epoch": 0.0246569824966946, "grad_norm": 3.8119771480560303, "learning_rate": 9.753661646234677e-05, "loss": 1.0349, "num_input_tokens_seen": 5669504, "step": 352 }, { "epoch": 0.024727030742423845, "grad_norm": 4.5082621574401855, "learning_rate": 9.752961821366024e-05, "loss": 1.2537, "num_input_tokens_seen": 5685168, "step": 353 }, { "epoch": 0.02479707898815309, "grad_norm": 4.392731189727783, "learning_rate": 9.752261996497373e-05, "loss": 1.2534, "num_input_tokens_seen": 5701240, "step": 354 }, { "epoch": 0.024867127233882336, "grad_norm": 4.293395519256592, "learning_rate": 9.751562171628722e-05, "loss": 1.2774, "num_input_tokens_seen": 5717624, "step": 355 }, { "epoch": 0.024937175479611582, "grad_norm": 4.64813756942749, "learning_rate": 9.750862346760071e-05, "loss": 1.2795, "num_input_tokens_seen": 5733104, "step": 356 }, { "epoch": 0.025007223725340828, "grad_norm": 4.5166778564453125, "learning_rate": 9.75016252189142e-05, "loss": 1.1301, "num_input_tokens_seen": 5749488, "step": 357 }, { "epoch": 0.025077271971070073, "grad_norm": 3.894291400909424, "learning_rate": 9.749462697022767e-05, "loss": 0.901, "num_input_tokens_seen": 5765872, "step": 358 }, { "epoch": 0.02514732021679932, "grad_norm": 4.10056209564209, "learning_rate": 9.748762872154116e-05, "loss": 1.0529, "num_input_tokens_seen": 5780856, "step": 359 }, { "epoch": 0.025217368462528568, "grad_norm": 4.6277666091918945, "learning_rate": 9.748063047285464e-05, "loss": 1.3649, "num_input_tokens_seen": 5796856, "step": 360 }, { "epoch": 0.025287416708257814, "grad_norm": 4.029720306396484, "learning_rate": 9.747363222416813e-05, "loss": 0.8863, "num_input_tokens_seen": 5812176, "step": 361 }, { "epoch": 0.02535746495398706, "grad_norm": 3.7772202491760254, "learning_rate": 9.746663397548161e-05, "loss": 1.0448, "num_input_tokens_seen": 5828064, "step": 362 }, { "epoch": 0.025427513199716305, "grad_norm": 4.379861354827881, "learning_rate": 9.74596357267951e-05, "loss": 1.3274, "num_input_tokens_seen": 5843680, "step": 363 }, { "epoch": 0.02549756144544555, "grad_norm": 4.254587173461914, "learning_rate": 9.745263747810859e-05, "loss": 1.1502, "num_input_tokens_seen": 5859024, "step": 364 }, { "epoch": 0.025567609691174796, "grad_norm": 4.271276473999023, "learning_rate": 9.744563922942207e-05, "loss": 1.2785, "num_input_tokens_seen": 5874320, "step": 365 }, { "epoch": 0.02563765793690404, "grad_norm": 4.224324703216553, "learning_rate": 9.743864098073555e-05, "loss": 1.0926, "num_input_tokens_seen": 5890704, "step": 366 }, { "epoch": 0.025707706182633287, "grad_norm": 4.289444446563721, "learning_rate": 9.743164273204903e-05, "loss": 1.1913, "num_input_tokens_seen": 5906016, "step": 367 }, { "epoch": 0.025777754428362536, "grad_norm": 4.280707359313965, "learning_rate": 9.742464448336253e-05, "loss": 1.2238, "num_input_tokens_seen": 5921784, "step": 368 }, { "epoch": 0.025847802674091782, "grad_norm": 4.554803848266602, "learning_rate": 9.741764623467602e-05, "loss": 1.2491, "num_input_tokens_seen": 5938072, "step": 369 }, { "epoch": 0.025917850919821028, "grad_norm": 4.677784442901611, "learning_rate": 9.74106479859895e-05, "loss": 1.2387, "num_input_tokens_seen": 5954456, "step": 370 }, { "epoch": 0.025987899165550273, "grad_norm": 4.268225193023682, "learning_rate": 9.740364973730298e-05, "loss": 1.2983, "num_input_tokens_seen": 5970664, "step": 371 }, { "epoch": 0.02605794741127952, "grad_norm": 4.361818790435791, "learning_rate": 9.739665148861646e-05, "loss": 1.199, "num_input_tokens_seen": 5987048, "step": 372 }, { "epoch": 0.026127995657008764, "grad_norm": 3.9990735054016113, "learning_rate": 9.738965323992995e-05, "loss": 1.0777, "num_input_tokens_seen": 6003432, "step": 373 }, { "epoch": 0.02619804390273801, "grad_norm": 3.992142915725708, "learning_rate": 9.738265499124344e-05, "loss": 1.0443, "num_input_tokens_seen": 6019816, "step": 374 }, { "epoch": 0.026268092148467256, "grad_norm": 4.270167827606201, "learning_rate": 9.737565674255693e-05, "loss": 1.1764, "num_input_tokens_seen": 6036200, "step": 375 }, { "epoch": 0.0263381403941965, "grad_norm": 4.362086296081543, "learning_rate": 9.736865849387041e-05, "loss": 1.2735, "num_input_tokens_seen": 6052120, "step": 376 }, { "epoch": 0.02640818863992575, "grad_norm": 3.6900475025177, "learning_rate": 9.736166024518389e-05, "loss": 0.8729, "num_input_tokens_seen": 6068264, "step": 377 }, { "epoch": 0.026478236885654996, "grad_norm": 3.8281285762786865, "learning_rate": 9.735466199649738e-05, "loss": 1.1096, "num_input_tokens_seen": 6084504, "step": 378 }, { "epoch": 0.02654828513138424, "grad_norm": 3.9335553646087646, "learning_rate": 9.734766374781087e-05, "loss": 1.0763, "num_input_tokens_seen": 6100592, "step": 379 }, { "epoch": 0.026618333377113487, "grad_norm": 4.332645416259766, "learning_rate": 9.734066549912434e-05, "loss": 1.1751, "num_input_tokens_seen": 6116976, "step": 380 }, { "epoch": 0.026688381622842733, "grad_norm": 4.160863399505615, "learning_rate": 9.733366725043783e-05, "loss": 1.0778, "num_input_tokens_seen": 6133360, "step": 381 }, { "epoch": 0.02675842986857198, "grad_norm": 4.388178825378418, "learning_rate": 9.732666900175132e-05, "loss": 1.2214, "num_input_tokens_seen": 6149744, "step": 382 }, { "epoch": 0.026828478114301224, "grad_norm": 4.354910373687744, "learning_rate": 9.73196707530648e-05, "loss": 1.4115, "num_input_tokens_seen": 6166048, "step": 383 }, { "epoch": 0.02689852636003047, "grad_norm": 4.058071613311768, "learning_rate": 9.73126725043783e-05, "loss": 1.0934, "num_input_tokens_seen": 6181840, "step": 384 }, { "epoch": 0.026968574605759715, "grad_norm": 4.060855865478516, "learning_rate": 9.730567425569177e-05, "loss": 1.1395, "num_input_tokens_seen": 6198224, "step": 385 }, { "epoch": 0.027038622851488964, "grad_norm": 4.316681385040283, "learning_rate": 9.729867600700526e-05, "loss": 1.1052, "num_input_tokens_seen": 6214608, "step": 386 }, { "epoch": 0.02710867109721821, "grad_norm": 4.322516918182373, "learning_rate": 9.729167775831873e-05, "loss": 1.2512, "num_input_tokens_seen": 6230992, "step": 387 }, { "epoch": 0.027178719342947456, "grad_norm": 4.090857028961182, "learning_rate": 9.728467950963224e-05, "loss": 1.0772, "num_input_tokens_seen": 6246760, "step": 388 }, { "epoch": 0.0272487675886767, "grad_norm": 4.0143961906433105, "learning_rate": 9.727768126094571e-05, "loss": 1.0578, "num_input_tokens_seen": 6261968, "step": 389 }, { "epoch": 0.027318815834405947, "grad_norm": 4.911194324493408, "learning_rate": 9.72706830122592e-05, "loss": 1.3016, "num_input_tokens_seen": 6276664, "step": 390 }, { "epoch": 0.027388864080135192, "grad_norm": 4.057498931884766, "learning_rate": 9.726368476357269e-05, "loss": 1.026, "num_input_tokens_seen": 6293048, "step": 391 }, { "epoch": 0.027458912325864438, "grad_norm": 3.9827401638031006, "learning_rate": 9.725668651488616e-05, "loss": 1.136, "num_input_tokens_seen": 6309432, "step": 392 }, { "epoch": 0.027528960571593684, "grad_norm": 4.640822887420654, "learning_rate": 9.724968826619965e-05, "loss": 1.2823, "num_input_tokens_seen": 6325568, "step": 393 }, { "epoch": 0.027599008817322933, "grad_norm": 4.372538089752197, "learning_rate": 9.724269001751314e-05, "loss": 1.0354, "num_input_tokens_seen": 6341952, "step": 394 }, { "epoch": 0.02766905706305218, "grad_norm": 4.018289566040039, "learning_rate": 9.723569176882663e-05, "loss": 1.029, "num_input_tokens_seen": 6358336, "step": 395 }, { "epoch": 0.027739105308781424, "grad_norm": 4.440858364105225, "learning_rate": 9.722869352014012e-05, "loss": 1.2272, "num_input_tokens_seen": 6374680, "step": 396 }, { "epoch": 0.02780915355451067, "grad_norm": 4.246788024902344, "learning_rate": 9.722169527145359e-05, "loss": 1.0161, "num_input_tokens_seen": 6390672, "step": 397 }, { "epoch": 0.027879201800239915, "grad_norm": 4.27274751663208, "learning_rate": 9.721469702276708e-05, "loss": 1.293, "num_input_tokens_seen": 6407056, "step": 398 }, { "epoch": 0.02794925004596916, "grad_norm": 4.171760559082031, "learning_rate": 9.720769877408056e-05, "loss": 1.2766, "num_input_tokens_seen": 6423440, "step": 399 }, { "epoch": 0.028019298291698407, "grad_norm": 4.174622535705566, "learning_rate": 9.720070052539405e-05, "loss": 1.049, "num_input_tokens_seen": 6439824, "step": 400 }, { "epoch": 0.028019298291698407, "eval_loss": 1.1994441747665405, "eval_runtime": 0.2131, "eval_samples_per_second": 4.693, "eval_steps_per_second": 4.693, "num_input_tokens_seen": 6439824, "step": 400 }, { "epoch": 0.028089346537427652, "grad_norm": 4.199150562286377, "learning_rate": 9.719370227670753e-05, "loss": 1.3432, "num_input_tokens_seen": 6456208, "step": 401 }, { "epoch": 0.028159394783156898, "grad_norm": 3.9011733531951904, "learning_rate": 9.718670402802102e-05, "loss": 1.0895, "num_input_tokens_seen": 6472592, "step": 402 }, { "epoch": 0.028229443028886147, "grad_norm": 4.142306327819824, "learning_rate": 9.717970577933451e-05, "loss": 0.9031, "num_input_tokens_seen": 6488976, "step": 403 }, { "epoch": 0.028299491274615392, "grad_norm": 3.9745633602142334, "learning_rate": 9.717270753064799e-05, "loss": 0.9951, "num_input_tokens_seen": 6505360, "step": 404 }, { "epoch": 0.028369539520344638, "grad_norm": 3.838865280151367, "learning_rate": 9.716570928196147e-05, "loss": 0.809, "num_input_tokens_seen": 6521744, "step": 405 }, { "epoch": 0.028439587766073884, "grad_norm": 4.48146390914917, "learning_rate": 9.715871103327496e-05, "loss": 1.4985, "num_input_tokens_seen": 6538128, "step": 406 }, { "epoch": 0.02850963601180313, "grad_norm": 4.393556594848633, "learning_rate": 9.715171278458844e-05, "loss": 1.2355, "num_input_tokens_seen": 6554512, "step": 407 }, { "epoch": 0.028579684257532375, "grad_norm": 3.970860004425049, "learning_rate": 9.714471453590194e-05, "loss": 1.1513, "num_input_tokens_seen": 6570896, "step": 408 }, { "epoch": 0.02864973250326162, "grad_norm": 4.166610240936279, "learning_rate": 9.713771628721542e-05, "loss": 1.108, "num_input_tokens_seen": 6587216, "step": 409 }, { "epoch": 0.028719780748990866, "grad_norm": 3.9887096881866455, "learning_rate": 9.71307180385289e-05, "loss": 1.1639, "num_input_tokens_seen": 6603600, "step": 410 }, { "epoch": 0.028789828994720112, "grad_norm": 4.195802211761475, "learning_rate": 9.712371978984239e-05, "loss": 1.1478, "num_input_tokens_seen": 6619984, "step": 411 }, { "epoch": 0.02885987724044936, "grad_norm": 4.011331081390381, "learning_rate": 9.711672154115587e-05, "loss": 0.9554, "num_input_tokens_seen": 6635904, "step": 412 }, { "epoch": 0.028929925486178606, "grad_norm": 4.4170026779174805, "learning_rate": 9.710972329246936e-05, "loss": 1.1452, "num_input_tokens_seen": 6651944, "step": 413 }, { "epoch": 0.028999973731907852, "grad_norm": 4.073450088500977, "learning_rate": 9.710272504378284e-05, "loss": 1.1187, "num_input_tokens_seen": 6668096, "step": 414 }, { "epoch": 0.029070021977637098, "grad_norm": 4.161722183227539, "learning_rate": 9.709572679509633e-05, "loss": 1.1603, "num_input_tokens_seen": 6684480, "step": 415 }, { "epoch": 0.029140070223366343, "grad_norm": 4.540097713470459, "learning_rate": 9.708872854640981e-05, "loss": 1.2143, "num_input_tokens_seen": 6700536, "step": 416 }, { "epoch": 0.02921011846909559, "grad_norm": 4.030871868133545, "learning_rate": 9.70817302977233e-05, "loss": 0.9791, "num_input_tokens_seen": 6716920, "step": 417 }, { "epoch": 0.029280166714824835, "grad_norm": 4.1743268966674805, "learning_rate": 9.707473204903679e-05, "loss": 0.9818, "num_input_tokens_seen": 6733304, "step": 418 }, { "epoch": 0.02935021496055408, "grad_norm": 4.227272987365723, "learning_rate": 9.706773380035026e-05, "loss": 1.0945, "num_input_tokens_seen": 6749688, "step": 419 }, { "epoch": 0.02942026320628333, "grad_norm": 4.406428813934326, "learning_rate": 9.706073555166375e-05, "loss": 1.0302, "num_input_tokens_seen": 6766072, "step": 420 }, { "epoch": 0.029490311452012575, "grad_norm": 4.17899227142334, "learning_rate": 9.705373730297724e-05, "loss": 1.1048, "num_input_tokens_seen": 6782456, "step": 421 }, { "epoch": 0.02956035969774182, "grad_norm": 4.034752368927002, "learning_rate": 9.704673905429073e-05, "loss": 1.2639, "num_input_tokens_seen": 6798840, "step": 422 }, { "epoch": 0.029630407943471066, "grad_norm": 4.795727729797363, "learning_rate": 9.703974080560421e-05, "loss": 1.2448, "num_input_tokens_seen": 6814912, "step": 423 }, { "epoch": 0.029700456189200312, "grad_norm": 4.509056568145752, "learning_rate": 9.703274255691769e-05, "loss": 1.2157, "num_input_tokens_seen": 6830720, "step": 424 }, { "epoch": 0.029770504434929557, "grad_norm": 4.064620494842529, "learning_rate": 9.702574430823118e-05, "loss": 1.2042, "num_input_tokens_seen": 6847104, "step": 425 }, { "epoch": 0.029840552680658803, "grad_norm": 3.9060182571411133, "learning_rate": 9.701874605954465e-05, "loss": 0.9116, "num_input_tokens_seen": 6862952, "step": 426 }, { "epoch": 0.02991060092638805, "grad_norm": 3.9900951385498047, "learning_rate": 9.701174781085814e-05, "loss": 1.1621, "num_input_tokens_seen": 6879336, "step": 427 }, { "epoch": 0.029980649172117294, "grad_norm": 4.371436595916748, "learning_rate": 9.700474956217164e-05, "loss": 1.2731, "num_input_tokens_seen": 6895720, "step": 428 }, { "epoch": 0.030050697417846543, "grad_norm": 3.9422085285186768, "learning_rate": 9.699775131348512e-05, "loss": 0.9636, "num_input_tokens_seen": 6912104, "step": 429 }, { "epoch": 0.03012074566357579, "grad_norm": 4.080913543701172, "learning_rate": 9.699075306479861e-05, "loss": 1.1507, "num_input_tokens_seen": 6928488, "step": 430 }, { "epoch": 0.030190793909305035, "grad_norm": 4.493942737579346, "learning_rate": 9.698375481611208e-05, "loss": 1.2274, "num_input_tokens_seen": 6944664, "step": 431 }, { "epoch": 0.03026084215503428, "grad_norm": 4.073723793029785, "learning_rate": 9.697675656742557e-05, "loss": 1.0498, "num_input_tokens_seen": 6960344, "step": 432 }, { "epoch": 0.030330890400763526, "grad_norm": 3.9672274589538574, "learning_rate": 9.696975831873906e-05, "loss": 1.007, "num_input_tokens_seen": 6976720, "step": 433 }, { "epoch": 0.03040093864649277, "grad_norm": 4.497872829437256, "learning_rate": 9.696276007005255e-05, "loss": 1.1339, "num_input_tokens_seen": 6992552, "step": 434 }, { "epoch": 0.030470986892222017, "grad_norm": 4.422168731689453, "learning_rate": 9.695576182136604e-05, "loss": 1.34, "num_input_tokens_seen": 7008936, "step": 435 }, { "epoch": 0.030541035137951263, "grad_norm": 4.3009138107299805, "learning_rate": 9.694876357267951e-05, "loss": 1.2479, "num_input_tokens_seen": 7024512, "step": 436 }, { "epoch": 0.030611083383680508, "grad_norm": 4.04030704498291, "learning_rate": 9.6941765323993e-05, "loss": 1.097, "num_input_tokens_seen": 7040896, "step": 437 }, { "epoch": 0.030681131629409757, "grad_norm": 3.877417802810669, "learning_rate": 9.693476707530649e-05, "loss": 1.1363, "num_input_tokens_seen": 7057280, "step": 438 }, { "epoch": 0.030751179875139003, "grad_norm": 3.8185505867004395, "learning_rate": 9.692776882661996e-05, "loss": 0.9067, "num_input_tokens_seen": 7072544, "step": 439 }, { "epoch": 0.03082122812086825, "grad_norm": 4.028950214385986, "learning_rate": 9.692077057793345e-05, "loss": 1.1195, "num_input_tokens_seen": 7088928, "step": 440 }, { "epoch": 0.030891276366597494, "grad_norm": 4.2786431312561035, "learning_rate": 9.691377232924694e-05, "loss": 1.1199, "num_input_tokens_seen": 7105248, "step": 441 }, { "epoch": 0.03096132461232674, "grad_norm": 4.193462371826172, "learning_rate": 9.690677408056043e-05, "loss": 1.1812, "num_input_tokens_seen": 7121008, "step": 442 }, { "epoch": 0.031031372858055985, "grad_norm": 3.93597412109375, "learning_rate": 9.68997758318739e-05, "loss": 1.0677, "num_input_tokens_seen": 7136944, "step": 443 }, { "epoch": 0.03110142110378523, "grad_norm": 4.3208537101745605, "learning_rate": 9.68927775831874e-05, "loss": 1.1358, "num_input_tokens_seen": 7152928, "step": 444 }, { "epoch": 0.031171469349514477, "grad_norm": 3.9743378162384033, "learning_rate": 9.688577933450088e-05, "loss": 1.094, "num_input_tokens_seen": 7169312, "step": 445 }, { "epoch": 0.031241517595243726, "grad_norm": 4.226114273071289, "learning_rate": 9.687878108581436e-05, "loss": 1.1752, "num_input_tokens_seen": 7185696, "step": 446 }, { "epoch": 0.03131156584097297, "grad_norm": 4.210222244262695, "learning_rate": 9.687178283712785e-05, "loss": 1.1262, "num_input_tokens_seen": 7201784, "step": 447 }, { "epoch": 0.03138161408670222, "grad_norm": 4.311635971069336, "learning_rate": 9.686478458844133e-05, "loss": 1.2491, "num_input_tokens_seen": 7218168, "step": 448 }, { "epoch": 0.03145166233243146, "grad_norm": 4.56603479385376, "learning_rate": 9.685778633975482e-05, "loss": 1.3512, "num_input_tokens_seen": 7233360, "step": 449 }, { "epoch": 0.03152171057816071, "grad_norm": 4.232856750488281, "learning_rate": 9.685078809106831e-05, "loss": 0.9387, "num_input_tokens_seen": 7248280, "step": 450 }, { "epoch": 0.031591758823889954, "grad_norm": 4.512947082519531, "learning_rate": 9.684378984238179e-05, "loss": 1.1988, "num_input_tokens_seen": 7264664, "step": 451 }, { "epoch": 0.0316618070696192, "grad_norm": 4.273897171020508, "learning_rate": 9.683679159369528e-05, "loss": 1.2523, "num_input_tokens_seen": 7281048, "step": 452 }, { "epoch": 0.031731855315348445, "grad_norm": 4.288438320159912, "learning_rate": 9.682979334500875e-05, "loss": 1.1692, "num_input_tokens_seen": 7297424, "step": 453 }, { "epoch": 0.03180190356107769, "grad_norm": 4.27367639541626, "learning_rate": 9.682279509632225e-05, "loss": 1.1868, "num_input_tokens_seen": 7312792, "step": 454 }, { "epoch": 0.031871951806806936, "grad_norm": 3.978926181793213, "learning_rate": 9.681579684763574e-05, "loss": 1.0382, "num_input_tokens_seen": 7329176, "step": 455 }, { "epoch": 0.03194200005253618, "grad_norm": 4.4399919509887695, "learning_rate": 9.680879859894922e-05, "loss": 1.2072, "num_input_tokens_seen": 7345560, "step": 456 }, { "epoch": 0.03201204829826543, "grad_norm": 3.9786529541015625, "learning_rate": 9.68018003502627e-05, "loss": 1.1704, "num_input_tokens_seen": 7361944, "step": 457 }, { "epoch": 0.03208209654399467, "grad_norm": 4.171195030212402, "learning_rate": 9.679480210157618e-05, "loss": 1.1307, "num_input_tokens_seen": 7378328, "step": 458 }, { "epoch": 0.032152144789723926, "grad_norm": 3.9415268898010254, "learning_rate": 9.678780385288967e-05, "loss": 0.9971, "num_input_tokens_seen": 7394208, "step": 459 }, { "epoch": 0.03222219303545317, "grad_norm": 4.066036224365234, "learning_rate": 9.678080560420316e-05, "loss": 1.1227, "num_input_tokens_seen": 7410328, "step": 460 }, { "epoch": 0.03229224128118242, "grad_norm": 4.22513484954834, "learning_rate": 9.677380735551665e-05, "loss": 1.0883, "num_input_tokens_seen": 7426712, "step": 461 }, { "epoch": 0.03236228952691166, "grad_norm": 4.310954570770264, "learning_rate": 9.676680910683013e-05, "loss": 1.1695, "num_input_tokens_seen": 7442736, "step": 462 }, { "epoch": 0.03243233777264091, "grad_norm": 4.2868828773498535, "learning_rate": 9.675981085814361e-05, "loss": 1.0594, "num_input_tokens_seen": 7458560, "step": 463 }, { "epoch": 0.032502386018370154, "grad_norm": 4.318186283111572, "learning_rate": 9.67528126094571e-05, "loss": 1.1791, "num_input_tokens_seen": 7474944, "step": 464 }, { "epoch": 0.0325724342640994, "grad_norm": 4.040421009063721, "learning_rate": 9.674581436077059e-05, "loss": 1.0649, "num_input_tokens_seen": 7490344, "step": 465 }, { "epoch": 0.032642482509828645, "grad_norm": 3.914815902709961, "learning_rate": 9.673881611208406e-05, "loss": 1.1381, "num_input_tokens_seen": 7506728, "step": 466 }, { "epoch": 0.03271253075555789, "grad_norm": 4.054527282714844, "learning_rate": 9.673181786339755e-05, "loss": 1.2264, "num_input_tokens_seen": 7522912, "step": 467 }, { "epoch": 0.032782579001287136, "grad_norm": 4.295147895812988, "learning_rate": 9.672481961471104e-05, "loss": 1.1369, "num_input_tokens_seen": 7539040, "step": 468 }, { "epoch": 0.03285262724701638, "grad_norm": 4.109183311462402, "learning_rate": 9.671782136602453e-05, "loss": 1.1676, "num_input_tokens_seen": 7555424, "step": 469 }, { "epoch": 0.03292267549274563, "grad_norm": 4.131369590759277, "learning_rate": 9.6710823117338e-05, "loss": 1.1188, "num_input_tokens_seen": 7571808, "step": 470 }, { "epoch": 0.03299272373847487, "grad_norm": 3.998414993286133, "learning_rate": 9.670382486865149e-05, "loss": 1.0201, "num_input_tokens_seen": 7587528, "step": 471 }, { "epoch": 0.03306277198420412, "grad_norm": 4.1235551834106445, "learning_rate": 9.669682661996498e-05, "loss": 1.1265, "num_input_tokens_seen": 7603912, "step": 472 }, { "epoch": 0.033132820229933364, "grad_norm": 4.800798416137695, "learning_rate": 9.668982837127845e-05, "loss": 1.3634, "num_input_tokens_seen": 7617512, "step": 473 }, { "epoch": 0.03320286847566261, "grad_norm": 4.068000316619873, "learning_rate": 9.668283012259196e-05, "loss": 1.1427, "num_input_tokens_seen": 7633040, "step": 474 }, { "epoch": 0.033272916721391856, "grad_norm": 4.0715484619140625, "learning_rate": 9.667583187390543e-05, "loss": 1.0633, "num_input_tokens_seen": 7648416, "step": 475 }, { "epoch": 0.0333429649671211, "grad_norm": 3.937807321548462, "learning_rate": 9.666883362521892e-05, "loss": 1.1393, "num_input_tokens_seen": 7664624, "step": 476 }, { "epoch": 0.033413013212850354, "grad_norm": 4.195656776428223, "learning_rate": 9.666183537653241e-05, "loss": 1.1801, "num_input_tokens_seen": 7680480, "step": 477 }, { "epoch": 0.0334830614585796, "grad_norm": 4.227575778961182, "learning_rate": 9.665483712784588e-05, "loss": 1.0453, "num_input_tokens_seen": 7696632, "step": 478 }, { "epoch": 0.033553109704308845, "grad_norm": 4.328822135925293, "learning_rate": 9.664783887915937e-05, "loss": 1.221, "num_input_tokens_seen": 7713016, "step": 479 }, { "epoch": 0.03362315795003809, "grad_norm": 4.086736679077148, "learning_rate": 9.664084063047286e-05, "loss": 1.2817, "num_input_tokens_seen": 7729400, "step": 480 }, { "epoch": 0.033693206195767336, "grad_norm": 4.555233955383301, "learning_rate": 9.663384238178635e-05, "loss": 1.483, "num_input_tokens_seen": 7745784, "step": 481 }, { "epoch": 0.03376325444149658, "grad_norm": 4.118983745574951, "learning_rate": 9.662684413309984e-05, "loss": 0.9139, "num_input_tokens_seen": 7762168, "step": 482 }, { "epoch": 0.03383330268722583, "grad_norm": 4.232059001922607, "learning_rate": 9.661984588441331e-05, "loss": 1.1269, "num_input_tokens_seen": 7777920, "step": 483 }, { "epoch": 0.03390335093295507, "grad_norm": 6.288865089416504, "learning_rate": 9.66128476357268e-05, "loss": 1.0642, "num_input_tokens_seen": 7794304, "step": 484 }, { "epoch": 0.03397339917868432, "grad_norm": 4.133046627044678, "learning_rate": 9.660584938704028e-05, "loss": 1.2067, "num_input_tokens_seen": 7810200, "step": 485 }, { "epoch": 0.034043447424413564, "grad_norm": 4.147965431213379, "learning_rate": 9.659885113835377e-05, "loss": 1.0367, "num_input_tokens_seen": 7826384, "step": 486 }, { "epoch": 0.03411349567014281, "grad_norm": 4.1191020011901855, "learning_rate": 9.659185288966725e-05, "loss": 1.0972, "num_input_tokens_seen": 7841704, "step": 487 }, { "epoch": 0.034183543915872056, "grad_norm": 4.518441677093506, "learning_rate": 9.658485464098074e-05, "loss": 1.263, "num_input_tokens_seen": 7858088, "step": 488 }, { "epoch": 0.0342535921616013, "grad_norm": 4.321181297302246, "learning_rate": 9.657785639229423e-05, "loss": 1.1378, "num_input_tokens_seen": 7874472, "step": 489 }, { "epoch": 0.03432364040733055, "grad_norm": 4.366185665130615, "learning_rate": 9.65708581436077e-05, "loss": 1.1636, "num_input_tokens_seen": 7890856, "step": 490 }, { "epoch": 0.03439368865305979, "grad_norm": 4.042731761932373, "learning_rate": 9.65638598949212e-05, "loss": 1.0601, "num_input_tokens_seen": 7906776, "step": 491 }, { "epoch": 0.03446373689878904, "grad_norm": 3.743668556213379, "learning_rate": 9.655686164623468e-05, "loss": 1.0441, "num_input_tokens_seen": 7923160, "step": 492 }, { "epoch": 0.034533785144518284, "grad_norm": 3.8547139167785645, "learning_rate": 9.654986339754816e-05, "loss": 1.0842, "num_input_tokens_seen": 7939296, "step": 493 }, { "epoch": 0.034603833390247536, "grad_norm": 4.238414287567139, "learning_rate": 9.654286514886166e-05, "loss": 1.2498, "num_input_tokens_seen": 7955504, "step": 494 }, { "epoch": 0.03467388163597678, "grad_norm": 4.134857177734375, "learning_rate": 9.653586690017514e-05, "loss": 1.1241, "num_input_tokens_seen": 7971888, "step": 495 }, { "epoch": 0.03474392988170603, "grad_norm": 4.2501983642578125, "learning_rate": 9.652886865148862e-05, "loss": 1.1829, "num_input_tokens_seen": 7988272, "step": 496 }, { "epoch": 0.03481397812743527, "grad_norm": 7.4397053718566895, "learning_rate": 9.65218704028021e-05, "loss": 0.9952, "num_input_tokens_seen": 8003744, "step": 497 }, { "epoch": 0.03488402637316452, "grad_norm": 4.2750959396362305, "learning_rate": 9.651487215411559e-05, "loss": 1.2387, "num_input_tokens_seen": 8019184, "step": 498 }, { "epoch": 0.034954074618893764, "grad_norm": 4.156162261962891, "learning_rate": 9.650787390542908e-05, "loss": 1.1201, "num_input_tokens_seen": 8035176, "step": 499 }, { "epoch": 0.03502412286462301, "grad_norm": 4.178225040435791, "learning_rate": 9.650087565674257e-05, "loss": 1.2026, "num_input_tokens_seen": 8051560, "step": 500 }, { "epoch": 0.035094171110352256, "grad_norm": 4.147096157073975, "learning_rate": 9.649387740805605e-05, "loss": 1.2465, "num_input_tokens_seen": 8067944, "step": 501 }, { "epoch": 0.0351642193560815, "grad_norm": 4.329249858856201, "learning_rate": 9.648687915936953e-05, "loss": 1.2742, "num_input_tokens_seen": 8083824, "step": 502 }, { "epoch": 0.03523426760181075, "grad_norm": 4.404232978820801, "learning_rate": 9.647988091068302e-05, "loss": 1.1511, "num_input_tokens_seen": 8100208, "step": 503 }, { "epoch": 0.03530431584753999, "grad_norm": 4.190586090087891, "learning_rate": 9.64728826619965e-05, "loss": 0.9884, "num_input_tokens_seen": 8116048, "step": 504 }, { "epoch": 0.03537436409326924, "grad_norm": 4.262845516204834, "learning_rate": 9.646588441330998e-05, "loss": 1.1321, "num_input_tokens_seen": 8132432, "step": 505 }, { "epoch": 0.035444412338998484, "grad_norm": 4.452746391296387, "learning_rate": 9.645888616462347e-05, "loss": 1.1667, "num_input_tokens_seen": 8148816, "step": 506 }, { "epoch": 0.03551446058472773, "grad_norm": 4.111443042755127, "learning_rate": 9.645188791593696e-05, "loss": 1.0049, "num_input_tokens_seen": 8164856, "step": 507 }, { "epoch": 0.035584508830456975, "grad_norm": 4.292227268218994, "learning_rate": 9.644488966725045e-05, "loss": 1.1535, "num_input_tokens_seen": 8181240, "step": 508 }, { "epoch": 0.03565455707618622, "grad_norm": 4.295238971710205, "learning_rate": 9.643789141856394e-05, "loss": 1.236, "num_input_tokens_seen": 8197624, "step": 509 }, { "epoch": 0.035724605321915466, "grad_norm": 3.930659294128418, "learning_rate": 9.643089316987741e-05, "loss": 0.9195, "num_input_tokens_seen": 8213816, "step": 510 }, { "epoch": 0.03579465356764472, "grad_norm": 4.092316150665283, "learning_rate": 9.64238949211909e-05, "loss": 1.0799, "num_input_tokens_seen": 8229632, "step": 511 }, { "epoch": 0.035864701813373964, "grad_norm": 4.2939252853393555, "learning_rate": 9.641689667250437e-05, "loss": 1.111, "num_input_tokens_seen": 8245232, "step": 512 }, { "epoch": 0.03593475005910321, "grad_norm": 4.191503524780273, "learning_rate": 9.640989842381786e-05, "loss": 0.9399, "num_input_tokens_seen": 8260912, "step": 513 }, { "epoch": 0.036004798304832455, "grad_norm": 4.141485214233398, "learning_rate": 9.640290017513136e-05, "loss": 1.1334, "num_input_tokens_seen": 8276864, "step": 514 }, { "epoch": 0.0360748465505617, "grad_norm": 3.890547752380371, "learning_rate": 9.639590192644484e-05, "loss": 1.0055, "num_input_tokens_seen": 8292720, "step": 515 }, { "epoch": 0.03614489479629095, "grad_norm": 4.405922889709473, "learning_rate": 9.638890367775833e-05, "loss": 1.2238, "num_input_tokens_seen": 8309104, "step": 516 }, { "epoch": 0.03621494304202019, "grad_norm": 4.207942485809326, "learning_rate": 9.63819054290718e-05, "loss": 1.0688, "num_input_tokens_seen": 8325304, "step": 517 }, { "epoch": 0.03628499128774944, "grad_norm": 4.174366474151611, "learning_rate": 9.637490718038529e-05, "loss": 1.2303, "num_input_tokens_seen": 8341688, "step": 518 }, { "epoch": 0.036355039533478684, "grad_norm": 3.9641714096069336, "learning_rate": 9.636790893169878e-05, "loss": 1.2244, "num_input_tokens_seen": 8357760, "step": 519 }, { "epoch": 0.03642508777920793, "grad_norm": 5.832678318023682, "learning_rate": 9.636091068301227e-05, "loss": 1.0645, "num_input_tokens_seen": 8372712, "step": 520 }, { "epoch": 0.036495136024937175, "grad_norm": 3.7905161380767822, "learning_rate": 9.635391243432576e-05, "loss": 1.0551, "num_input_tokens_seen": 8389096, "step": 521 }, { "epoch": 0.03656518427066642, "grad_norm": 3.6744072437286377, "learning_rate": 9.634691418563923e-05, "loss": 1.0687, "num_input_tokens_seen": 8405216, "step": 522 }, { "epoch": 0.036635232516395666, "grad_norm": 4.897486209869385, "learning_rate": 9.633991593695272e-05, "loss": 1.1968, "num_input_tokens_seen": 8421600, "step": 523 }, { "epoch": 0.03670528076212491, "grad_norm": 3.821457862854004, "learning_rate": 9.63329176882662e-05, "loss": 1.0473, "num_input_tokens_seen": 8437984, "step": 524 }, { "epoch": 0.03677532900785416, "grad_norm": 3.873832941055298, "learning_rate": 9.632591943957969e-05, "loss": 0.9656, "num_input_tokens_seen": 8453760, "step": 525 }, { "epoch": 0.0368453772535834, "grad_norm": 4.139901161193848, "learning_rate": 9.631892119089317e-05, "loss": 1.0881, "num_input_tokens_seen": 8470144, "step": 526 }, { "epoch": 0.03691542549931265, "grad_norm": 3.9512782096862793, "learning_rate": 9.631192294220666e-05, "loss": 1.1093, "num_input_tokens_seen": 8486528, "step": 527 }, { "epoch": 0.0369854737450419, "grad_norm": 3.8937103748321533, "learning_rate": 9.630492469352015e-05, "loss": 0.9722, "num_input_tokens_seen": 8502912, "step": 528 }, { "epoch": 0.03705552199077115, "grad_norm": 4.482640743255615, "learning_rate": 9.629792644483363e-05, "loss": 1.056, "num_input_tokens_seen": 8519296, "step": 529 }, { "epoch": 0.03712557023650039, "grad_norm": 4.127941131591797, "learning_rate": 9.629092819614711e-05, "loss": 1.0285, "num_input_tokens_seen": 8535160, "step": 530 }, { "epoch": 0.03719561848222964, "grad_norm": 3.973585844039917, "learning_rate": 9.62839299474606e-05, "loss": 1.0356, "num_input_tokens_seen": 8551256, "step": 531 }, { "epoch": 0.037265666727958884, "grad_norm": 4.22855281829834, "learning_rate": 9.627693169877408e-05, "loss": 1.134, "num_input_tokens_seen": 8567640, "step": 532 }, { "epoch": 0.03733571497368813, "grad_norm": 4.144021511077881, "learning_rate": 9.626993345008757e-05, "loss": 1.0963, "num_input_tokens_seen": 8583504, "step": 533 }, { "epoch": 0.037405763219417375, "grad_norm": 3.8666226863861084, "learning_rate": 9.626293520140106e-05, "loss": 0.912, "num_input_tokens_seen": 8599888, "step": 534 }, { "epoch": 0.03747581146514662, "grad_norm": 4.215412616729736, "learning_rate": 9.625593695271454e-05, "loss": 1.1055, "num_input_tokens_seen": 8616256, "step": 535 }, { "epoch": 0.037545859710875866, "grad_norm": 4.353022575378418, "learning_rate": 9.624893870402803e-05, "loss": 1.0379, "num_input_tokens_seen": 8632640, "step": 536 }, { "epoch": 0.03761590795660511, "grad_norm": 3.778947591781616, "learning_rate": 9.624194045534151e-05, "loss": 1.0547, "num_input_tokens_seen": 8648624, "step": 537 }, { "epoch": 0.03768595620233436, "grad_norm": 4.481568336486816, "learning_rate": 9.6234942206655e-05, "loss": 1.3407, "num_input_tokens_seen": 8664200, "step": 538 }, { "epoch": 0.0377560044480636, "grad_norm": 4.066302299499512, "learning_rate": 9.622794395796847e-05, "loss": 0.995, "num_input_tokens_seen": 8680584, "step": 539 }, { "epoch": 0.03782605269379285, "grad_norm": 4.262768268585205, "learning_rate": 9.622094570928197e-05, "loss": 1.3054, "num_input_tokens_seen": 8696968, "step": 540 }, { "epoch": 0.037896100939522094, "grad_norm": 3.777597665786743, "learning_rate": 9.621394746059546e-05, "loss": 0.9831, "num_input_tokens_seen": 8713352, "step": 541 }, { "epoch": 0.03796614918525134, "grad_norm": 3.9732742309570312, "learning_rate": 9.620694921190894e-05, "loss": 1.0699, "num_input_tokens_seen": 8729048, "step": 542 }, { "epoch": 0.038036197430980585, "grad_norm": 4.543329238891602, "learning_rate": 9.619995096322243e-05, "loss": 1.1546, "num_input_tokens_seen": 8745432, "step": 543 }, { "epoch": 0.03810624567670983, "grad_norm": 4.903865814208984, "learning_rate": 9.61929527145359e-05, "loss": 1.1548, "num_input_tokens_seen": 8760296, "step": 544 }, { "epoch": 0.03817629392243908, "grad_norm": 4.197691917419434, "learning_rate": 9.618595446584939e-05, "loss": 1.1616, "num_input_tokens_seen": 8776680, "step": 545 }, { "epoch": 0.03824634216816833, "grad_norm": 3.912689208984375, "learning_rate": 9.617895621716288e-05, "loss": 0.9926, "num_input_tokens_seen": 8793064, "step": 546 }, { "epoch": 0.038316390413897575, "grad_norm": 4.291840076446533, "learning_rate": 9.617195796847637e-05, "loss": 1.1943, "num_input_tokens_seen": 8809448, "step": 547 }, { "epoch": 0.03838643865962682, "grad_norm": 3.9053072929382324, "learning_rate": 9.616495971978985e-05, "loss": 1.2437, "num_input_tokens_seen": 8825536, "step": 548 }, { "epoch": 0.038456486905356066, "grad_norm": 4.860696315765381, "learning_rate": 9.615796147110333e-05, "loss": 1.3045, "num_input_tokens_seen": 8841920, "step": 549 }, { "epoch": 0.03852653515108531, "grad_norm": 3.9394373893737793, "learning_rate": 9.615096322241682e-05, "loss": 1.1367, "num_input_tokens_seen": 8858304, "step": 550 }, { "epoch": 0.03859658339681456, "grad_norm": 3.8160409927368164, "learning_rate": 9.61439649737303e-05, "loss": 1.0864, "num_input_tokens_seen": 8874688, "step": 551 }, { "epoch": 0.0386666316425438, "grad_norm": 4.3792805671691895, "learning_rate": 9.613696672504378e-05, "loss": 1.2516, "num_input_tokens_seen": 8891072, "step": 552 }, { "epoch": 0.03873667988827305, "grad_norm": 4.103452682495117, "learning_rate": 9.612996847635727e-05, "loss": 0.9737, "num_input_tokens_seen": 8907456, "step": 553 }, { "epoch": 0.038806728134002294, "grad_norm": 4.117603302001953, "learning_rate": 9.612297022767076e-05, "loss": 1.096, "num_input_tokens_seen": 8923816, "step": 554 }, { "epoch": 0.03887677637973154, "grad_norm": 4.272468566894531, "learning_rate": 9.611597197898425e-05, "loss": 1.161, "num_input_tokens_seen": 8939344, "step": 555 }, { "epoch": 0.038946824625460785, "grad_norm": 4.323635578155518, "learning_rate": 9.610897373029772e-05, "loss": 1.1922, "num_input_tokens_seen": 8954920, "step": 556 }, { "epoch": 0.03901687287119003, "grad_norm": 3.783510684967041, "learning_rate": 9.610197548161121e-05, "loss": 1.0658, "num_input_tokens_seen": 8971304, "step": 557 }, { "epoch": 0.039086921116919277, "grad_norm": 4.3757548332214355, "learning_rate": 9.60949772329247e-05, "loss": 1.3186, "num_input_tokens_seen": 8987672, "step": 558 }, { "epoch": 0.03915696936264852, "grad_norm": 4.048824787139893, "learning_rate": 9.608797898423818e-05, "loss": 1.1452, "num_input_tokens_seen": 9003896, "step": 559 }, { "epoch": 0.03922701760837777, "grad_norm": 4.06865930557251, "learning_rate": 9.608098073555168e-05, "loss": 0.9861, "num_input_tokens_seen": 9020280, "step": 560 }, { "epoch": 0.03929706585410701, "grad_norm": 3.966737747192383, "learning_rate": 9.607398248686515e-05, "loss": 1.0323, "num_input_tokens_seen": 9036280, "step": 561 }, { "epoch": 0.03936711409983626, "grad_norm": 4.466656684875488, "learning_rate": 9.606698423817864e-05, "loss": 1.2462, "num_input_tokens_seen": 9052664, "step": 562 }, { "epoch": 0.03943716234556551, "grad_norm": 4.312132358551025, "learning_rate": 9.605998598949213e-05, "loss": 1.2133, "num_input_tokens_seen": 9068832, "step": 563 }, { "epoch": 0.03950721059129476, "grad_norm": 3.9202895164489746, "learning_rate": 9.60529877408056e-05, "loss": 1.0723, "num_input_tokens_seen": 9084680, "step": 564 }, { "epoch": 0.039577258837024, "grad_norm": 5.139899730682373, "learning_rate": 9.604598949211909e-05, "loss": 1.1165, "num_input_tokens_seen": 9099792, "step": 565 }, { "epoch": 0.03964730708275325, "grad_norm": 4.398557186126709, "learning_rate": 9.603899124343258e-05, "loss": 1.1737, "num_input_tokens_seen": 9116136, "step": 566 }, { "epoch": 0.039717355328482494, "grad_norm": 4.350982666015625, "learning_rate": 9.603199299474607e-05, "loss": 1.2174, "num_input_tokens_seen": 9132520, "step": 567 }, { "epoch": 0.03978740357421174, "grad_norm": 3.787644386291504, "learning_rate": 9.602499474605956e-05, "loss": 0.9914, "num_input_tokens_seen": 9148856, "step": 568 }, { "epoch": 0.039857451819940985, "grad_norm": 4.630245685577393, "learning_rate": 9.601799649737303e-05, "loss": 1.4135, "num_input_tokens_seen": 9164888, "step": 569 }, { "epoch": 0.03992750006567023, "grad_norm": 4.063969135284424, "learning_rate": 9.601099824868652e-05, "loss": 1.1312, "num_input_tokens_seen": 9181272, "step": 570 }, { "epoch": 0.039997548311399476, "grad_norm": 4.2443413734436035, "learning_rate": 9.6004e-05, "loss": 1.1627, "num_input_tokens_seen": 9197344, "step": 571 }, { "epoch": 0.04006759655712872, "grad_norm": 4.396352767944336, "learning_rate": 9.599700175131349e-05, "loss": 1.1222, "num_input_tokens_seen": 9212312, "step": 572 }, { "epoch": 0.04013764480285797, "grad_norm": 4.364585876464844, "learning_rate": 9.599000350262697e-05, "loss": 1.0522, "num_input_tokens_seen": 9228696, "step": 573 }, { "epoch": 0.04020769304858721, "grad_norm": 3.9348409175872803, "learning_rate": 9.598300525394046e-05, "loss": 1.1375, "num_input_tokens_seen": 9245080, "step": 574 }, { "epoch": 0.04027774129431646, "grad_norm": 4.051416873931885, "learning_rate": 9.597600700525395e-05, "loss": 1.0265, "num_input_tokens_seen": 9260752, "step": 575 }, { "epoch": 0.040347789540045705, "grad_norm": 4.661770820617676, "learning_rate": 9.596900875656743e-05, "loss": 1.192, "num_input_tokens_seen": 9276792, "step": 576 }, { "epoch": 0.04041783778577495, "grad_norm": 4.378422260284424, "learning_rate": 9.596201050788092e-05, "loss": 1.0497, "num_input_tokens_seen": 9292768, "step": 577 }, { "epoch": 0.040487886031504196, "grad_norm": 4.4690399169921875, "learning_rate": 9.595501225919439e-05, "loss": 1.2398, "num_input_tokens_seen": 9309152, "step": 578 }, { "epoch": 0.04055793427723344, "grad_norm": 4.1711273193359375, "learning_rate": 9.594801401050788e-05, "loss": 1.097, "num_input_tokens_seen": 9325536, "step": 579 }, { "epoch": 0.040627982522962694, "grad_norm": 3.8115949630737305, "learning_rate": 9.594101576182137e-05, "loss": 1.0317, "num_input_tokens_seen": 9341920, "step": 580 }, { "epoch": 0.04069803076869194, "grad_norm": 4.072190284729004, "learning_rate": 9.593401751313486e-05, "loss": 1.0649, "num_input_tokens_seen": 9357904, "step": 581 }, { "epoch": 0.040768079014421185, "grad_norm": 3.895766258239746, "learning_rate": 9.592701926444835e-05, "loss": 1.1906, "num_input_tokens_seen": 9373496, "step": 582 }, { "epoch": 0.04083812726015043, "grad_norm": 4.026490688323975, "learning_rate": 9.592002101576182e-05, "loss": 0.9913, "num_input_tokens_seen": 9389824, "step": 583 }, { "epoch": 0.040908175505879676, "grad_norm": 3.612987518310547, "learning_rate": 9.591302276707531e-05, "loss": 0.9376, "num_input_tokens_seen": 9406208, "step": 584 }, { "epoch": 0.04097822375160892, "grad_norm": 4.4619646072387695, "learning_rate": 9.59060245183888e-05, "loss": 1.2198, "num_input_tokens_seen": 9422592, "step": 585 }, { "epoch": 0.04104827199733817, "grad_norm": 3.990372896194458, "learning_rate": 9.589902626970229e-05, "loss": 1.082, "num_input_tokens_seen": 9438816, "step": 586 }, { "epoch": 0.04111832024306741, "grad_norm": 3.7697947025299072, "learning_rate": 9.589202802101577e-05, "loss": 1.0173, "num_input_tokens_seen": 9455200, "step": 587 }, { "epoch": 0.04118836848879666, "grad_norm": 4.066056728363037, "learning_rate": 9.588502977232925e-05, "loss": 1.124, "num_input_tokens_seen": 9471320, "step": 588 }, { "epoch": 0.041258416734525905, "grad_norm": 3.913506507873535, "learning_rate": 9.587803152364274e-05, "loss": 1.0501, "num_input_tokens_seen": 9487304, "step": 589 }, { "epoch": 0.04132846498025515, "grad_norm": 3.9049429893493652, "learning_rate": 9.587103327495623e-05, "loss": 1.0563, "num_input_tokens_seen": 9503688, "step": 590 }, { "epoch": 0.041398513225984396, "grad_norm": 4.316978454589844, "learning_rate": 9.58640350262697e-05, "loss": 1.1333, "num_input_tokens_seen": 9519488, "step": 591 }, { "epoch": 0.04146856147171364, "grad_norm": 3.7818517684936523, "learning_rate": 9.585703677758319e-05, "loss": 1.0537, "num_input_tokens_seen": 9535872, "step": 592 }, { "epoch": 0.04153860971744289, "grad_norm": 3.8751401901245117, "learning_rate": 9.585003852889668e-05, "loss": 1.1745, "num_input_tokens_seen": 9551928, "step": 593 }, { "epoch": 0.04160865796317213, "grad_norm": 4.357265949249268, "learning_rate": 9.584304028021017e-05, "loss": 1.1154, "num_input_tokens_seen": 9568312, "step": 594 }, { "epoch": 0.04167870620890138, "grad_norm": 4.184159755706787, "learning_rate": 9.583604203152366e-05, "loss": 1.125, "num_input_tokens_seen": 9583968, "step": 595 }, { "epoch": 0.041748754454630624, "grad_norm": 3.9540369510650635, "learning_rate": 9.582904378283713e-05, "loss": 1.2032, "num_input_tokens_seen": 9600152, "step": 596 }, { "epoch": 0.04181880270035987, "grad_norm": 4.401122093200684, "learning_rate": 9.582204553415062e-05, "loss": 1.4808, "num_input_tokens_seen": 9615632, "step": 597 }, { "epoch": 0.04188885094608912, "grad_norm": 4.418131351470947, "learning_rate": 9.58150472854641e-05, "loss": 1.0077, "num_input_tokens_seen": 9631712, "step": 598 }, { "epoch": 0.04195889919181837, "grad_norm": 4.362226963043213, "learning_rate": 9.580804903677758e-05, "loss": 1.1614, "num_input_tokens_seen": 9648096, "step": 599 }, { "epoch": 0.04202894743754761, "grad_norm": 4.051177024841309, "learning_rate": 9.580105078809107e-05, "loss": 1.0718, "num_input_tokens_seen": 9663792, "step": 600 }, { "epoch": 0.04202894743754761, "eval_loss": 1.1809133291244507, "eval_runtime": 0.2062, "eval_samples_per_second": 4.849, "eval_steps_per_second": 4.849, "num_input_tokens_seen": 9663792, "step": 600 }, { "epoch": 0.04209899568327686, "grad_norm": 4.478739261627197, "learning_rate": 9.579405253940456e-05, "loss": 1.1963, "num_input_tokens_seen": 9680176, "step": 601 }, { "epoch": 0.042169043929006104, "grad_norm": 4.05004358291626, "learning_rate": 9.578705429071805e-05, "loss": 1.1005, "num_input_tokens_seen": 9696560, "step": 602 }, { "epoch": 0.04223909217473535, "grad_norm": 4.092396259307861, "learning_rate": 9.578005604203152e-05, "loss": 1.1796, "num_input_tokens_seen": 9712944, "step": 603 }, { "epoch": 0.042309140420464596, "grad_norm": 4.428014278411865, "learning_rate": 9.577305779334501e-05, "loss": 0.9734, "num_input_tokens_seen": 9729096, "step": 604 }, { "epoch": 0.04237918866619384, "grad_norm": 4.202315807342529, "learning_rate": 9.576605954465849e-05, "loss": 1.0502, "num_input_tokens_seen": 9745480, "step": 605 }, { "epoch": 0.04244923691192309, "grad_norm": 3.7633514404296875, "learning_rate": 9.575906129597198e-05, "loss": 0.9218, "num_input_tokens_seen": 9761272, "step": 606 }, { "epoch": 0.04251928515765233, "grad_norm": 4.170671463012695, "learning_rate": 9.575206304728548e-05, "loss": 1.1196, "num_input_tokens_seen": 9777656, "step": 607 }, { "epoch": 0.04258933340338158, "grad_norm": 4.20021915435791, "learning_rate": 9.574506479859895e-05, "loss": 1.1146, "num_input_tokens_seen": 9794032, "step": 608 }, { "epoch": 0.042659381649110824, "grad_norm": 4.437755107879639, "learning_rate": 9.573806654991244e-05, "loss": 1.0911, "num_input_tokens_seen": 9809936, "step": 609 }, { "epoch": 0.04272942989484007, "grad_norm": 4.417452335357666, "learning_rate": 9.573106830122592e-05, "loss": 1.2079, "num_input_tokens_seen": 9825232, "step": 610 }, { "epoch": 0.042799478140569315, "grad_norm": 4.144030570983887, "learning_rate": 9.57240700525394e-05, "loss": 1.1229, "num_input_tokens_seen": 9840648, "step": 611 }, { "epoch": 0.04286952638629856, "grad_norm": 3.991605043411255, "learning_rate": 9.57170718038529e-05, "loss": 1.0762, "num_input_tokens_seen": 9857032, "step": 612 }, { "epoch": 0.042939574632027806, "grad_norm": 4.516556262969971, "learning_rate": 9.571007355516638e-05, "loss": 1.3056, "num_input_tokens_seen": 9872328, "step": 613 }, { "epoch": 0.04300962287775705, "grad_norm": 4.030200481414795, "learning_rate": 9.570307530647987e-05, "loss": 0.9493, "num_input_tokens_seen": 9887832, "step": 614 }, { "epoch": 0.043079671123486304, "grad_norm": 4.345893859863281, "learning_rate": 9.569607705779335e-05, "loss": 1.2707, "num_input_tokens_seen": 9904216, "step": 615 }, { "epoch": 0.04314971936921555, "grad_norm": 4.158145427703857, "learning_rate": 9.568907880910684e-05, "loss": 1.0377, "num_input_tokens_seen": 9920072, "step": 616 }, { "epoch": 0.043219767614944796, "grad_norm": 4.155702590942383, "learning_rate": 9.568208056042032e-05, "loss": 1.091, "num_input_tokens_seen": 9936416, "step": 617 }, { "epoch": 0.04328981586067404, "grad_norm": 3.76328444480896, "learning_rate": 9.56750823117338e-05, "loss": 1.1011, "num_input_tokens_seen": 9952456, "step": 618 }, { "epoch": 0.04335986410640329, "grad_norm": 4.252495765686035, "learning_rate": 9.566808406304729e-05, "loss": 1.0616, "num_input_tokens_seen": 9968608, "step": 619 }, { "epoch": 0.04342991235213253, "grad_norm": 9.254091262817383, "learning_rate": 9.566108581436078e-05, "loss": 1.0315, "num_input_tokens_seen": 9983016, "step": 620 }, { "epoch": 0.04349996059786178, "grad_norm": 4.028343200683594, "learning_rate": 9.565408756567426e-05, "loss": 1.0667, "num_input_tokens_seen": 9999400, "step": 621 }, { "epoch": 0.043570008843591024, "grad_norm": 4.051328659057617, "learning_rate": 9.564708931698775e-05, "loss": 1.1375, "num_input_tokens_seen": 10015384, "step": 622 }, { "epoch": 0.04364005708932027, "grad_norm": 4.495016098022461, "learning_rate": 9.564009106830123e-05, "loss": 1.0691, "num_input_tokens_seen": 10031152, "step": 623 }, { "epoch": 0.043710105335049515, "grad_norm": 4.876840114593506, "learning_rate": 9.563309281961472e-05, "loss": 1.17, "num_input_tokens_seen": 10047536, "step": 624 }, { "epoch": 0.04378015358077876, "grad_norm": 4.407329559326172, "learning_rate": 9.562609457092819e-05, "loss": 1.2381, "num_input_tokens_seen": 10063920, "step": 625 }, { "epoch": 0.043850201826508006, "grad_norm": 4.161394119262695, "learning_rate": 9.561909632224168e-05, "loss": 1.0903, "num_input_tokens_seen": 10079024, "step": 626 }, { "epoch": 0.04392025007223725, "grad_norm": 4.382974624633789, "learning_rate": 9.561209807355518e-05, "loss": 1.3156, "num_input_tokens_seen": 10095408, "step": 627 }, { "epoch": 0.0439902983179665, "grad_norm": 4.004157543182373, "learning_rate": 9.560509982486866e-05, "loss": 1.1333, "num_input_tokens_seen": 10111792, "step": 628 }, { "epoch": 0.04406034656369574, "grad_norm": 3.9019265174865723, "learning_rate": 9.559810157618215e-05, "loss": 1.0948, "num_input_tokens_seen": 10128144, "step": 629 }, { "epoch": 0.04413039480942499, "grad_norm": 4.410470485687256, "learning_rate": 9.559110332749562e-05, "loss": 1.3219, "num_input_tokens_seen": 10144288, "step": 630 }, { "epoch": 0.044200443055154234, "grad_norm": 4.233544826507568, "learning_rate": 9.558410507880911e-05, "loss": 0.999, "num_input_tokens_seen": 10160296, "step": 631 }, { "epoch": 0.04427049130088349, "grad_norm": 4.120091438293457, "learning_rate": 9.557710683012258e-05, "loss": 1.0166, "num_input_tokens_seen": 10176680, "step": 632 }, { "epoch": 0.04434053954661273, "grad_norm": 5.061972618103027, "learning_rate": 9.557010858143609e-05, "loss": 1.251, "num_input_tokens_seen": 10192088, "step": 633 }, { "epoch": 0.04441058779234198, "grad_norm": 4.3690948486328125, "learning_rate": 9.556311033274958e-05, "loss": 1.2113, "num_input_tokens_seen": 10208472, "step": 634 }, { "epoch": 0.044480636038071224, "grad_norm": 3.798710346221924, "learning_rate": 9.555611208406305e-05, "loss": 1.0286, "num_input_tokens_seen": 10224856, "step": 635 }, { "epoch": 0.04455068428380047, "grad_norm": 4.41818380355835, "learning_rate": 9.554911383537654e-05, "loss": 1.14, "num_input_tokens_seen": 10241200, "step": 636 }, { "epoch": 0.044620732529529715, "grad_norm": 4.256262302398682, "learning_rate": 9.554211558669001e-05, "loss": 1.3103, "num_input_tokens_seen": 10257584, "step": 637 }, { "epoch": 0.04469078077525896, "grad_norm": 4.176064968109131, "learning_rate": 9.55351173380035e-05, "loss": 1.1985, "num_input_tokens_seen": 10273760, "step": 638 }, { "epoch": 0.044760829020988206, "grad_norm": 3.9971530437469482, "learning_rate": 9.552811908931699e-05, "loss": 1.1579, "num_input_tokens_seen": 10290144, "step": 639 }, { "epoch": 0.04483087726671745, "grad_norm": 4.150514602661133, "learning_rate": 9.552112084063048e-05, "loss": 1.1144, "num_input_tokens_seen": 10306528, "step": 640 }, { "epoch": 0.0449009255124467, "grad_norm": 4.1868367195129395, "learning_rate": 9.551412259194397e-05, "loss": 1.0099, "num_input_tokens_seen": 10322480, "step": 641 }, { "epoch": 0.04497097375817594, "grad_norm": 4.409821510314941, "learning_rate": 9.550712434325744e-05, "loss": 1.2574, "num_input_tokens_seen": 10338864, "step": 642 }, { "epoch": 0.04504102200390519, "grad_norm": 4.500023365020752, "learning_rate": 9.550012609457093e-05, "loss": 1.35, "num_input_tokens_seen": 10355072, "step": 643 }, { "epoch": 0.045111070249634434, "grad_norm": 10.278129577636719, "learning_rate": 9.549312784588442e-05, "loss": 1.0618, "num_input_tokens_seen": 10371456, "step": 644 }, { "epoch": 0.04518111849536368, "grad_norm": 3.9800543785095215, "learning_rate": 9.54861295971979e-05, "loss": 1.0341, "num_input_tokens_seen": 10387720, "step": 645 }, { "epoch": 0.045251166741092926, "grad_norm": 3.855720281600952, "learning_rate": 9.547913134851138e-05, "loss": 1.1323, "num_input_tokens_seen": 10403936, "step": 646 }, { "epoch": 0.04532121498682217, "grad_norm": 4.719264984130859, "learning_rate": 9.547213309982487e-05, "loss": 1.1407, "num_input_tokens_seen": 10420320, "step": 647 }, { "epoch": 0.04539126323255142, "grad_norm": 4.6528167724609375, "learning_rate": 9.546513485113836e-05, "loss": 1.1014, "num_input_tokens_seen": 10436704, "step": 648 }, { "epoch": 0.04546131147828066, "grad_norm": 4.0597028732299805, "learning_rate": 9.545813660245185e-05, "loss": 1.116, "num_input_tokens_seen": 10452592, "step": 649 }, { "epoch": 0.045531359724009915, "grad_norm": 4.161896705627441, "learning_rate": 9.545113835376533e-05, "loss": 1.1373, "num_input_tokens_seen": 10468976, "step": 650 }, { "epoch": 0.04560140796973916, "grad_norm": 4.125041961669922, "learning_rate": 9.544414010507881e-05, "loss": 1.0947, "num_input_tokens_seen": 10484584, "step": 651 }, { "epoch": 0.045671456215468406, "grad_norm": 4.278462886810303, "learning_rate": 9.543714185639229e-05, "loss": 1.1369, "num_input_tokens_seen": 10500504, "step": 652 }, { "epoch": 0.04574150446119765, "grad_norm": 4.766538619995117, "learning_rate": 9.543014360770579e-05, "loss": 1.1876, "num_input_tokens_seen": 10516472, "step": 653 }, { "epoch": 0.0458115527069269, "grad_norm": 4.457921504974365, "learning_rate": 9.542314535901928e-05, "loss": 1.0788, "num_input_tokens_seen": 10532272, "step": 654 }, { "epoch": 0.04588160095265614, "grad_norm": 5.021823883056641, "learning_rate": 9.541614711033275e-05, "loss": 1.1152, "num_input_tokens_seen": 10547696, "step": 655 }, { "epoch": 0.04595164919838539, "grad_norm": 4.407228469848633, "learning_rate": 9.540914886164624e-05, "loss": 1.0863, "num_input_tokens_seen": 10564080, "step": 656 }, { "epoch": 0.046021697444114634, "grad_norm": 3.9986062049865723, "learning_rate": 9.540215061295972e-05, "loss": 1.1624, "num_input_tokens_seen": 10580464, "step": 657 }, { "epoch": 0.04609174568984388, "grad_norm": 7.9165191650390625, "learning_rate": 9.539515236427321e-05, "loss": 1.0809, "num_input_tokens_seen": 10595336, "step": 658 }, { "epoch": 0.046161793935573125, "grad_norm": 4.357856273651123, "learning_rate": 9.53881541155867e-05, "loss": 1.0324, "num_input_tokens_seen": 10611720, "step": 659 }, { "epoch": 0.04623184218130237, "grad_norm": 3.8115761280059814, "learning_rate": 9.538115586690018e-05, "loss": 1.1499, "num_input_tokens_seen": 10628104, "step": 660 }, { "epoch": 0.04630189042703162, "grad_norm": 3.879671096801758, "learning_rate": 9.537415761821367e-05, "loss": 1.0474, "num_input_tokens_seen": 10644096, "step": 661 }, { "epoch": 0.04637193867276086, "grad_norm": 4.324586391448975, "learning_rate": 9.536715936952715e-05, "loss": 1.1904, "num_input_tokens_seen": 10659408, "step": 662 }, { "epoch": 0.04644198691849011, "grad_norm": 4.020029067993164, "learning_rate": 9.536016112084064e-05, "loss": 1.0848, "num_input_tokens_seen": 10675792, "step": 663 }, { "epoch": 0.046512035164219354, "grad_norm": 4.563455581665039, "learning_rate": 9.535316287215411e-05, "loss": 1.1735, "num_input_tokens_seen": 10691632, "step": 664 }, { "epoch": 0.0465820834099486, "grad_norm": 4.444424629211426, "learning_rate": 9.53461646234676e-05, "loss": 1.258, "num_input_tokens_seen": 10708016, "step": 665 }, { "epoch": 0.046652131655677845, "grad_norm": 3.9864089488983154, "learning_rate": 9.533916637478109e-05, "loss": 1.1315, "num_input_tokens_seen": 10724176, "step": 666 }, { "epoch": 0.0467221799014071, "grad_norm": 4.860849857330322, "learning_rate": 9.533216812609458e-05, "loss": 1.2276, "num_input_tokens_seen": 10740560, "step": 667 }, { "epoch": 0.04679222814713634, "grad_norm": 3.9701120853424072, "learning_rate": 9.532516987740807e-05, "loss": 1.1406, "num_input_tokens_seen": 10756864, "step": 668 }, { "epoch": 0.04686227639286559, "grad_norm": 3.660257577896118, "learning_rate": 9.531817162872154e-05, "loss": 1.0182, "num_input_tokens_seen": 10773248, "step": 669 }, { "epoch": 0.046932324638594834, "grad_norm": 3.888510227203369, "learning_rate": 9.531117338003503e-05, "loss": 1.0223, "num_input_tokens_seen": 10789632, "step": 670 }, { "epoch": 0.04700237288432408, "grad_norm": 4.794105052947998, "learning_rate": 9.530417513134852e-05, "loss": 1.0565, "num_input_tokens_seen": 10804496, "step": 671 }, { "epoch": 0.047072421130053325, "grad_norm": 4.293116092681885, "learning_rate": 9.529717688266199e-05, "loss": 1.2509, "num_input_tokens_seen": 10819976, "step": 672 }, { "epoch": 0.04714246937578257, "grad_norm": 5.112069129943848, "learning_rate": 9.52901786339755e-05, "loss": 1.0964, "num_input_tokens_seen": 10836360, "step": 673 }, { "epoch": 0.04721251762151182, "grad_norm": 3.9091360569000244, "learning_rate": 9.528318038528897e-05, "loss": 1.0647, "num_input_tokens_seen": 10852744, "step": 674 }, { "epoch": 0.04728256586724106, "grad_norm": 4.032161235809326, "learning_rate": 9.527618213660246e-05, "loss": 1.2362, "num_input_tokens_seen": 10868928, "step": 675 }, { "epoch": 0.04735261411297031, "grad_norm": 3.931156635284424, "learning_rate": 9.526918388791595e-05, "loss": 1.0571, "num_input_tokens_seen": 10884776, "step": 676 }, { "epoch": 0.047422662358699554, "grad_norm": 3.9511048793792725, "learning_rate": 9.526218563922942e-05, "loss": 1.0249, "num_input_tokens_seen": 10901160, "step": 677 }, { "epoch": 0.0474927106044288, "grad_norm": 4.199029445648193, "learning_rate": 9.525518739054291e-05, "loss": 1.2813, "num_input_tokens_seen": 10917544, "step": 678 }, { "epoch": 0.047562758850158045, "grad_norm": 3.8590247631073, "learning_rate": 9.52481891418564e-05, "loss": 1.02, "num_input_tokens_seen": 10933928, "step": 679 }, { "epoch": 0.04763280709588729, "grad_norm": 5.530341625213623, "learning_rate": 9.524119089316989e-05, "loss": 1.2316, "num_input_tokens_seen": 10949600, "step": 680 }, { "epoch": 0.047702855341616536, "grad_norm": 4.17647123336792, "learning_rate": 9.523419264448338e-05, "loss": 1.2985, "num_input_tokens_seen": 10965984, "step": 681 }, { "epoch": 0.04777290358734578, "grad_norm": 4.250451564788818, "learning_rate": 9.522719439579685e-05, "loss": 1.1638, "num_input_tokens_seen": 10982368, "step": 682 }, { "epoch": 0.04784295183307503, "grad_norm": 4.132594108581543, "learning_rate": 9.522019614711034e-05, "loss": 0.9638, "num_input_tokens_seen": 10998752, "step": 683 }, { "epoch": 0.04791300007880428, "grad_norm": 5.863363265991211, "learning_rate": 9.521319789842382e-05, "loss": 1.0736, "num_input_tokens_seen": 11014376, "step": 684 }, { "epoch": 0.047983048324533525, "grad_norm": 3.740323543548584, "learning_rate": 9.52061996497373e-05, "loss": 0.9958, "num_input_tokens_seen": 11030440, "step": 685 }, { "epoch": 0.04805309657026277, "grad_norm": 4.927120685577393, "learning_rate": 9.519920140105079e-05, "loss": 1.156, "num_input_tokens_seen": 11046824, "step": 686 }, { "epoch": 0.04812314481599202, "grad_norm": 4.708818435668945, "learning_rate": 9.519220315236428e-05, "loss": 1.2139, "num_input_tokens_seen": 11063208, "step": 687 }, { "epoch": 0.04819319306172126, "grad_norm": 3.7547767162323, "learning_rate": 9.518520490367777e-05, "loss": 0.9557, "num_input_tokens_seen": 11079592, "step": 688 }, { "epoch": 0.04826324130745051, "grad_norm": 4.038534641265869, "learning_rate": 9.517820665499124e-05, "loss": 1.1124, "num_input_tokens_seen": 11095976, "step": 689 }, { "epoch": 0.048333289553179754, "grad_norm": 4.159554481506348, "learning_rate": 9.517120840630473e-05, "loss": 1.0043, "num_input_tokens_seen": 11112360, "step": 690 }, { "epoch": 0.048403337798909, "grad_norm": 7.104836463928223, "learning_rate": 9.516421015761821e-05, "loss": 0.9736, "num_input_tokens_seen": 11127800, "step": 691 }, { "epoch": 0.048473386044638245, "grad_norm": 4.073885917663574, "learning_rate": 9.51572119089317e-05, "loss": 1.1249, "num_input_tokens_seen": 11144184, "step": 692 }, { "epoch": 0.04854343429036749, "grad_norm": 3.7190351486206055, "learning_rate": 9.51502136602452e-05, "loss": 1.1035, "num_input_tokens_seen": 11160568, "step": 693 }, { "epoch": 0.048613482536096736, "grad_norm": 4.252142429351807, "learning_rate": 9.514321541155867e-05, "loss": 1.1588, "num_input_tokens_seen": 11176952, "step": 694 }, { "epoch": 0.04868353078182598, "grad_norm": 4.418105125427246, "learning_rate": 9.513621716287216e-05, "loss": 1.2496, "num_input_tokens_seen": 11193336, "step": 695 }, { "epoch": 0.04875357902755523, "grad_norm": 4.195918560028076, "learning_rate": 9.512921891418564e-05, "loss": 1.0193, "num_input_tokens_seen": 11209720, "step": 696 }, { "epoch": 0.04882362727328447, "grad_norm": 5.138080596923828, "learning_rate": 9.512222066549913e-05, "loss": 1.1861, "num_input_tokens_seen": 11225888, "step": 697 }, { "epoch": 0.04889367551901372, "grad_norm": 4.489223003387451, "learning_rate": 9.511522241681261e-05, "loss": 1.1497, "num_input_tokens_seen": 11241744, "step": 698 }, { "epoch": 0.048963723764742964, "grad_norm": 3.972590208053589, "learning_rate": 9.51082241681261e-05, "loss": 1.2765, "num_input_tokens_seen": 11257768, "step": 699 }, { "epoch": 0.04903377201047221, "grad_norm": 13.274886131286621, "learning_rate": 9.510122591943959e-05, "loss": 1.1124, "num_input_tokens_seen": 11273216, "step": 700 }, { "epoch": 0.049103820256201455, "grad_norm": 3.7899255752563477, "learning_rate": 9.509422767075307e-05, "loss": 1.0445, "num_input_tokens_seen": 11289600, "step": 701 }, { "epoch": 0.04917386850193071, "grad_norm": 4.226947784423828, "learning_rate": 9.508722942206656e-05, "loss": 1.4313, "num_input_tokens_seen": 11305920, "step": 702 }, { "epoch": 0.049243916747659953, "grad_norm": 4.098162651062012, "learning_rate": 9.508023117338003e-05, "loss": 0.952, "num_input_tokens_seen": 11322304, "step": 703 }, { "epoch": 0.0493139649933892, "grad_norm": 3.9205965995788574, "learning_rate": 9.507323292469352e-05, "loss": 1.1648, "num_input_tokens_seen": 11338688, "step": 704 }, { "epoch": 0.049384013239118445, "grad_norm": 4.06537389755249, "learning_rate": 9.506623467600701e-05, "loss": 1.1295, "num_input_tokens_seen": 11353544, "step": 705 }, { "epoch": 0.04945406148484769, "grad_norm": 4.309032440185547, "learning_rate": 9.50592364273205e-05, "loss": 1.1475, "num_input_tokens_seen": 11369928, "step": 706 }, { "epoch": 0.049524109730576936, "grad_norm": 4.320526599884033, "learning_rate": 9.505223817863399e-05, "loss": 1.0102, "num_input_tokens_seen": 11386312, "step": 707 }, { "epoch": 0.04959415797630618, "grad_norm": 5.025510787963867, "learning_rate": 9.504523992994747e-05, "loss": 1.1182, "num_input_tokens_seen": 11402696, "step": 708 }, { "epoch": 0.04966420622203543, "grad_norm": 3.9406464099884033, "learning_rate": 9.503824168126095e-05, "loss": 1.068, "num_input_tokens_seen": 11419080, "step": 709 }, { "epoch": 0.04973425446776467, "grad_norm": 3.9148502349853516, "learning_rate": 9.503124343257444e-05, "loss": 1.1062, "num_input_tokens_seen": 11435464, "step": 710 }, { "epoch": 0.04980430271349392, "grad_norm": 3.9386026859283447, "learning_rate": 9.502424518388791e-05, "loss": 0.9516, "num_input_tokens_seen": 11451848, "step": 711 }, { "epoch": 0.049874350959223164, "grad_norm": 3.9537665843963623, "learning_rate": 9.50172469352014e-05, "loss": 1.1372, "num_input_tokens_seen": 11468216, "step": 712 }, { "epoch": 0.04994439920495241, "grad_norm": 3.97929310798645, "learning_rate": 9.501024868651489e-05, "loss": 1.0705, "num_input_tokens_seen": 11484192, "step": 713 }, { "epoch": 0.050014447450681655, "grad_norm": 3.9326419830322266, "learning_rate": 9.500325043782838e-05, "loss": 1.0986, "num_input_tokens_seen": 11500576, "step": 714 }, { "epoch": 0.0500844956964109, "grad_norm": 3.769347667694092, "learning_rate": 9.499625218914187e-05, "loss": 0.9265, "num_input_tokens_seen": 11516960, "step": 715 }, { "epoch": 0.050154543942140146, "grad_norm": 4.264547348022461, "learning_rate": 9.498925394045534e-05, "loss": 1.3166, "num_input_tokens_seen": 11532616, "step": 716 }, { "epoch": 0.05022459218786939, "grad_norm": 4.885791778564453, "learning_rate": 9.498225569176883e-05, "loss": 1.0669, "num_input_tokens_seen": 11548552, "step": 717 }, { "epoch": 0.05029464043359864, "grad_norm": 5.4089741706848145, "learning_rate": 9.49752574430823e-05, "loss": 1.3986, "num_input_tokens_seen": 11564936, "step": 718 }, { "epoch": 0.05036468867932789, "grad_norm": 4.503393173217773, "learning_rate": 9.496825919439581e-05, "loss": 0.9947, "num_input_tokens_seen": 11580720, "step": 719 }, { "epoch": 0.050434736925057136, "grad_norm": 4.364518165588379, "learning_rate": 9.49612609457093e-05, "loss": 1.12, "num_input_tokens_seen": 11597104, "step": 720 }, { "epoch": 0.05050478517078638, "grad_norm": 4.229926109313965, "learning_rate": 9.495426269702277e-05, "loss": 1.098, "num_input_tokens_seen": 11612120, "step": 721 }, { "epoch": 0.05057483341651563, "grad_norm": 4.477171897888184, "learning_rate": 9.494726444833626e-05, "loss": 1.1565, "num_input_tokens_seen": 11627000, "step": 722 }, { "epoch": 0.05064488166224487, "grad_norm": 4.071736812591553, "learning_rate": 9.494026619964973e-05, "loss": 1.2951, "num_input_tokens_seen": 11643256, "step": 723 }, { "epoch": 0.05071492990797412, "grad_norm": 4.219758033752441, "learning_rate": 9.493326795096322e-05, "loss": 1.1408, "num_input_tokens_seen": 11659424, "step": 724 }, { "epoch": 0.050784978153703364, "grad_norm": 4.108195781707764, "learning_rate": 9.492626970227671e-05, "loss": 0.9847, "num_input_tokens_seen": 11675808, "step": 725 }, { "epoch": 0.05085502639943261, "grad_norm": 3.964359760284424, "learning_rate": 9.49192714535902e-05, "loss": 1.0935, "num_input_tokens_seen": 11691760, "step": 726 }, { "epoch": 0.050925074645161855, "grad_norm": 4.585779190063477, "learning_rate": 9.491227320490369e-05, "loss": 1.1561, "num_input_tokens_seen": 11706600, "step": 727 }, { "epoch": 0.0509951228908911, "grad_norm": 3.8540141582489014, "learning_rate": 9.490527495621716e-05, "loss": 1.0163, "num_input_tokens_seen": 11722984, "step": 728 }, { "epoch": 0.051065171136620346, "grad_norm": 4.138955593109131, "learning_rate": 9.489827670753065e-05, "loss": 1.2842, "num_input_tokens_seen": 11738968, "step": 729 }, { "epoch": 0.05113521938234959, "grad_norm": 4.138274192810059, "learning_rate": 9.489127845884413e-05, "loss": 1.1452, "num_input_tokens_seen": 11754952, "step": 730 }, { "epoch": 0.05120526762807884, "grad_norm": 4.374305248260498, "learning_rate": 9.488428021015762e-05, "loss": 1.3622, "num_input_tokens_seen": 11770832, "step": 731 }, { "epoch": 0.05127531587380808, "grad_norm": 4.242674350738525, "learning_rate": 9.48772819614711e-05, "loss": 1.1914, "num_input_tokens_seen": 11786872, "step": 732 }, { "epoch": 0.05134536411953733, "grad_norm": 4.173389911651611, "learning_rate": 9.48702837127846e-05, "loss": 1.1853, "num_input_tokens_seen": 11803256, "step": 733 }, { "epoch": 0.051415412365266575, "grad_norm": 4.014588356018066, "learning_rate": 9.486328546409808e-05, "loss": 1.0436, "num_input_tokens_seen": 11819608, "step": 734 }, { "epoch": 0.05148546061099582, "grad_norm": 4.759418964385986, "learning_rate": 9.485628721541157e-05, "loss": 1.1605, "num_input_tokens_seen": 11834296, "step": 735 }, { "epoch": 0.05155550885672507, "grad_norm": 4.258687973022461, "learning_rate": 9.484928896672505e-05, "loss": 1.2993, "num_input_tokens_seen": 11849728, "step": 736 }, { "epoch": 0.05162555710245432, "grad_norm": 4.690395832061768, "learning_rate": 9.484229071803853e-05, "loss": 1.0655, "num_input_tokens_seen": 11866112, "step": 737 }, { "epoch": 0.051695605348183564, "grad_norm": 4.373327255249023, "learning_rate": 9.483529246935201e-05, "loss": 1.1364, "num_input_tokens_seen": 11881960, "step": 738 }, { "epoch": 0.05176565359391281, "grad_norm": 4.008789539337158, "learning_rate": 9.482829422066551e-05, "loss": 1.1174, "num_input_tokens_seen": 11897936, "step": 739 }, { "epoch": 0.051835701839642055, "grad_norm": 4.391345977783203, "learning_rate": 9.482129597197899e-05, "loss": 1.2045, "num_input_tokens_seen": 11914320, "step": 740 }, { "epoch": 0.0519057500853713, "grad_norm": 4.119503021240234, "learning_rate": 9.481429772329248e-05, "loss": 0.927, "num_input_tokens_seen": 11930440, "step": 741 }, { "epoch": 0.051975798331100546, "grad_norm": 4.186014175415039, "learning_rate": 9.480729947460596e-05, "loss": 1.1583, "num_input_tokens_seen": 11946720, "step": 742 }, { "epoch": 0.05204584657682979, "grad_norm": 4.119131088256836, "learning_rate": 9.480030122591944e-05, "loss": 1.0792, "num_input_tokens_seen": 11962360, "step": 743 }, { "epoch": 0.05211589482255904, "grad_norm": 3.921030044555664, "learning_rate": 9.479330297723293e-05, "loss": 0.9966, "num_input_tokens_seen": 11978744, "step": 744 }, { "epoch": 0.05218594306828828, "grad_norm": 3.806251049041748, "learning_rate": 9.478630472854642e-05, "loss": 1.1207, "num_input_tokens_seen": 11994912, "step": 745 }, { "epoch": 0.05225599131401753, "grad_norm": 4.508687973022461, "learning_rate": 9.47793064798599e-05, "loss": 1.1038, "num_input_tokens_seen": 12011296, "step": 746 }, { "epoch": 0.052326039559746775, "grad_norm": 4.458346843719482, "learning_rate": 9.47723082311734e-05, "loss": 1.2878, "num_input_tokens_seen": 12027408, "step": 747 }, { "epoch": 0.05239608780547602, "grad_norm": 5.779678821563721, "learning_rate": 9.476530998248687e-05, "loss": 1.2722, "num_input_tokens_seen": 12043792, "step": 748 }, { "epoch": 0.052466136051205266, "grad_norm": 4.621145725250244, "learning_rate": 9.475831173380036e-05, "loss": 1.2636, "num_input_tokens_seen": 12059856, "step": 749 }, { "epoch": 0.05253618429693451, "grad_norm": 4.276626110076904, "learning_rate": 9.475131348511383e-05, "loss": 1.3378, "num_input_tokens_seen": 12076240, "step": 750 }, { "epoch": 0.05260623254266376, "grad_norm": 4.533468246459961, "learning_rate": 9.474431523642732e-05, "loss": 0.921, "num_input_tokens_seen": 12092416, "step": 751 }, { "epoch": 0.052676280788393, "grad_norm": 4.626596927642822, "learning_rate": 9.473731698774081e-05, "loss": 1.2807, "num_input_tokens_seen": 12108664, "step": 752 }, { "epoch": 0.052746329034122255, "grad_norm": 4.3372907638549805, "learning_rate": 9.47303187390543e-05, "loss": 1.2754, "num_input_tokens_seen": 12125048, "step": 753 }, { "epoch": 0.0528163772798515, "grad_norm": 3.6576266288757324, "learning_rate": 9.472332049036779e-05, "loss": 0.8487, "num_input_tokens_seen": 12141296, "step": 754 }, { "epoch": 0.052886425525580746, "grad_norm": 3.8973164558410645, "learning_rate": 9.471632224168126e-05, "loss": 1.1211, "num_input_tokens_seen": 12157544, "step": 755 }, { "epoch": 0.05295647377130999, "grad_norm": 3.9059019088745117, "learning_rate": 9.470932399299475e-05, "loss": 1.2484, "num_input_tokens_seen": 12173928, "step": 756 }, { "epoch": 0.05302652201703924, "grad_norm": 4.133029937744141, "learning_rate": 9.470232574430822e-05, "loss": 1.0762, "num_input_tokens_seen": 12189864, "step": 757 }, { "epoch": 0.05309657026276848, "grad_norm": 3.8380961418151855, "learning_rate": 9.469532749562171e-05, "loss": 0.9938, "num_input_tokens_seen": 12206248, "step": 758 }, { "epoch": 0.05316661850849773, "grad_norm": 4.753637790679932, "learning_rate": 9.468832924693522e-05, "loss": 1.1272, "num_input_tokens_seen": 12222632, "step": 759 }, { "epoch": 0.053236666754226974, "grad_norm": 4.704193592071533, "learning_rate": 9.468133099824869e-05, "loss": 1.2276, "num_input_tokens_seen": 12239016, "step": 760 }, { "epoch": 0.05330671499995622, "grad_norm": 3.870870351791382, "learning_rate": 9.467433274956218e-05, "loss": 0.916, "num_input_tokens_seen": 12254784, "step": 761 }, { "epoch": 0.053376763245685466, "grad_norm": 3.8597328662872314, "learning_rate": 9.466733450087567e-05, "loss": 0.9871, "num_input_tokens_seen": 12271160, "step": 762 }, { "epoch": 0.05344681149141471, "grad_norm": 3.7109553813934326, "learning_rate": 9.466033625218914e-05, "loss": 1.1248, "num_input_tokens_seen": 12286944, "step": 763 }, { "epoch": 0.05351685973714396, "grad_norm": 3.985595464706421, "learning_rate": 9.465333800350263e-05, "loss": 1.0524, "num_input_tokens_seen": 12303312, "step": 764 }, { "epoch": 0.0535869079828732, "grad_norm": 3.797247886657715, "learning_rate": 9.464633975481612e-05, "loss": 1.0799, "num_input_tokens_seen": 12319696, "step": 765 }, { "epoch": 0.05365695622860245, "grad_norm": 4.88303279876709, "learning_rate": 9.463934150612961e-05, "loss": 1.2865, "num_input_tokens_seen": 12335448, "step": 766 }, { "epoch": 0.053727004474331694, "grad_norm": 4.273831367492676, "learning_rate": 9.463234325744308e-05, "loss": 1.1724, "num_input_tokens_seen": 12351720, "step": 767 }, { "epoch": 0.05379705272006094, "grad_norm": 3.9505984783172607, "learning_rate": 9.462534500875657e-05, "loss": 1.1478, "num_input_tokens_seen": 12368104, "step": 768 }, { "epoch": 0.053867100965790185, "grad_norm": 4.20963191986084, "learning_rate": 9.461834676007006e-05, "loss": 1.1018, "num_input_tokens_seen": 12384488, "step": 769 }, { "epoch": 0.05393714921151943, "grad_norm": 4.106869220733643, "learning_rate": 9.461134851138354e-05, "loss": 1.1097, "num_input_tokens_seen": 12400128, "step": 770 }, { "epoch": 0.05400719745724868, "grad_norm": 4.28592586517334, "learning_rate": 9.460435026269702e-05, "loss": 1.036, "num_input_tokens_seen": 12416512, "step": 771 }, { "epoch": 0.05407724570297793, "grad_norm": 3.821927070617676, "learning_rate": 9.459735201401051e-05, "loss": 1.1215, "num_input_tokens_seen": 12432896, "step": 772 }, { "epoch": 0.054147293948707174, "grad_norm": 4.14424467086792, "learning_rate": 9.4590353765324e-05, "loss": 1.0092, "num_input_tokens_seen": 12449208, "step": 773 }, { "epoch": 0.05421734219443642, "grad_norm": 4.610694885253906, "learning_rate": 9.458335551663749e-05, "loss": 1.2265, "num_input_tokens_seen": 12464128, "step": 774 }, { "epoch": 0.054287390440165666, "grad_norm": 4.410182952880859, "learning_rate": 9.457635726795097e-05, "loss": 1.1904, "num_input_tokens_seen": 12479728, "step": 775 }, { "epoch": 0.05435743868589491, "grad_norm": 4.096780300140381, "learning_rate": 9.456935901926445e-05, "loss": 1.2317, "num_input_tokens_seen": 12495720, "step": 776 }, { "epoch": 0.05442748693162416, "grad_norm": 4.028350830078125, "learning_rate": 9.456236077057793e-05, "loss": 1.1825, "num_input_tokens_seen": 12511480, "step": 777 }, { "epoch": 0.0544975351773534, "grad_norm": 5.264276504516602, "learning_rate": 9.455536252189142e-05, "loss": 1.057, "num_input_tokens_seen": 12527864, "step": 778 }, { "epoch": 0.05456758342308265, "grad_norm": 4.371725082397461, "learning_rate": 9.454836427320492e-05, "loss": 1.1625, "num_input_tokens_seen": 12544168, "step": 779 }, { "epoch": 0.054637631668811894, "grad_norm": 4.692862510681152, "learning_rate": 9.45413660245184e-05, "loss": 1.2211, "num_input_tokens_seen": 12560552, "step": 780 }, { "epoch": 0.05470767991454114, "grad_norm": 3.7462823390960693, "learning_rate": 9.453436777583188e-05, "loss": 1.0815, "num_input_tokens_seen": 12576936, "step": 781 }, { "epoch": 0.054777728160270385, "grad_norm": 4.161571025848389, "learning_rate": 9.452736952714536e-05, "loss": 0.9788, "num_input_tokens_seen": 12593040, "step": 782 }, { "epoch": 0.05484777640599963, "grad_norm": 3.96793532371521, "learning_rate": 9.452037127845885e-05, "loss": 1.1396, "num_input_tokens_seen": 12609424, "step": 783 }, { "epoch": 0.054917824651728876, "grad_norm": 4.183755874633789, "learning_rate": 9.451337302977232e-05, "loss": 1.0868, "num_input_tokens_seen": 12625312, "step": 784 }, { "epoch": 0.05498787289745812, "grad_norm": 4.506673336029053, "learning_rate": 9.450637478108582e-05, "loss": 1.1112, "num_input_tokens_seen": 12641696, "step": 785 }, { "epoch": 0.05505792114318737, "grad_norm": 3.8601651191711426, "learning_rate": 9.449937653239931e-05, "loss": 1.2149, "num_input_tokens_seen": 12658080, "step": 786 }, { "epoch": 0.05512796938891661, "grad_norm": 5.190856456756592, "learning_rate": 9.449237828371279e-05, "loss": 1.2661, "num_input_tokens_seen": 12673032, "step": 787 }, { "epoch": 0.055198017634645866, "grad_norm": 4.323099136352539, "learning_rate": 9.448538003502628e-05, "loss": 1.139, "num_input_tokens_seen": 12689064, "step": 788 }, { "epoch": 0.05526806588037511, "grad_norm": 4.271193981170654, "learning_rate": 9.447838178633976e-05, "loss": 1.037, "num_input_tokens_seen": 12705448, "step": 789 }, { "epoch": 0.05533811412610436, "grad_norm": 3.793525218963623, "learning_rate": 9.447138353765324e-05, "loss": 1.0265, "num_input_tokens_seen": 12721832, "step": 790 }, { "epoch": 0.0554081623718336, "grad_norm": 3.747575283050537, "learning_rate": 9.446438528896673e-05, "loss": 0.9567, "num_input_tokens_seen": 12738216, "step": 791 }, { "epoch": 0.05547821061756285, "grad_norm": 4.222849369049072, "learning_rate": 9.445738704028022e-05, "loss": 1.1859, "num_input_tokens_seen": 12754600, "step": 792 }, { "epoch": 0.055548258863292094, "grad_norm": 9.102783203125, "learning_rate": 9.44503887915937e-05, "loss": 1.0361, "num_input_tokens_seen": 12770568, "step": 793 }, { "epoch": 0.05561830710902134, "grad_norm": 4.4447808265686035, "learning_rate": 9.444339054290718e-05, "loss": 1.2908, "num_input_tokens_seen": 12785768, "step": 794 }, { "epoch": 0.055688355354750585, "grad_norm": 4.038604259490967, "learning_rate": 9.443639229422067e-05, "loss": 0.9294, "num_input_tokens_seen": 12801704, "step": 795 }, { "epoch": 0.05575840360047983, "grad_norm": 4.492194652557373, "learning_rate": 9.442939404553416e-05, "loss": 1.0466, "num_input_tokens_seen": 12818088, "step": 796 }, { "epoch": 0.055828451846209076, "grad_norm": 3.978029489517212, "learning_rate": 9.442239579684763e-05, "loss": 1.1719, "num_input_tokens_seen": 12834432, "step": 797 }, { "epoch": 0.05589850009193832, "grad_norm": 4.014431476593018, "learning_rate": 9.441539754816112e-05, "loss": 1.1222, "num_input_tokens_seen": 12850816, "step": 798 }, { "epoch": 0.05596854833766757, "grad_norm": 4.0948638916015625, "learning_rate": 9.440839929947461e-05, "loss": 1.2013, "num_input_tokens_seen": 12867200, "step": 799 }, { "epoch": 0.05603859658339681, "grad_norm": 4.18120813369751, "learning_rate": 9.44014010507881e-05, "loss": 0.9403, "num_input_tokens_seen": 12883072, "step": 800 }, { "epoch": 0.05603859658339681, "eval_loss": 1.1718552112579346, "eval_runtime": 0.2039, "eval_samples_per_second": 4.905, "eval_steps_per_second": 4.905, "num_input_tokens_seen": 12883072, "step": 800 }, { "epoch": 0.05610864482912606, "grad_norm": 4.425891399383545, "learning_rate": 9.439440280210159e-05, "loss": 1.0435, "num_input_tokens_seen": 12899456, "step": 801 }, { "epoch": 0.056178693074855304, "grad_norm": 4.319190979003906, "learning_rate": 9.438740455341506e-05, "loss": 1.2612, "num_input_tokens_seen": 12915840, "step": 802 }, { "epoch": 0.05624874132058455, "grad_norm": 4.28010892868042, "learning_rate": 9.438040630472855e-05, "loss": 1.0853, "num_input_tokens_seen": 12932096, "step": 803 }, { "epoch": 0.056318789566313796, "grad_norm": 3.9454870223999023, "learning_rate": 9.437340805604203e-05, "loss": 1.055, "num_input_tokens_seen": 12948208, "step": 804 }, { "epoch": 0.05638883781204305, "grad_norm": 4.009400367736816, "learning_rate": 9.436640980735553e-05, "loss": 1.0681, "num_input_tokens_seen": 12964096, "step": 805 }, { "epoch": 0.056458886057772294, "grad_norm": 3.7949161529541016, "learning_rate": 9.435941155866902e-05, "loss": 1.0787, "num_input_tokens_seen": 12980480, "step": 806 }, { "epoch": 0.05652893430350154, "grad_norm": 3.910456418991089, "learning_rate": 9.435241330998249e-05, "loss": 0.9212, "num_input_tokens_seen": 12996864, "step": 807 }, { "epoch": 0.056598982549230785, "grad_norm": 4.744706630706787, "learning_rate": 9.434541506129598e-05, "loss": 1.0582, "num_input_tokens_seen": 13013248, "step": 808 }, { "epoch": 0.05666903079496003, "grad_norm": 4.4282732009887695, "learning_rate": 9.433841681260946e-05, "loss": 1.1353, "num_input_tokens_seen": 13029632, "step": 809 }, { "epoch": 0.056739079040689276, "grad_norm": 3.8422467708587646, "learning_rate": 9.433141856392294e-05, "loss": 0.9881, "num_input_tokens_seen": 13046016, "step": 810 }, { "epoch": 0.05680912728641852, "grad_norm": 4.1764445304870605, "learning_rate": 9.432442031523643e-05, "loss": 1.183, "num_input_tokens_seen": 13062400, "step": 811 }, { "epoch": 0.05687917553214777, "grad_norm": 4.713895320892334, "learning_rate": 9.431742206654992e-05, "loss": 1.0752, "num_input_tokens_seen": 13078584, "step": 812 }, { "epoch": 0.05694922377787701, "grad_norm": 4.265610694885254, "learning_rate": 9.431042381786341e-05, "loss": 0.9469, "num_input_tokens_seen": 13094968, "step": 813 }, { "epoch": 0.05701927202360626, "grad_norm": 3.9274330139160156, "learning_rate": 9.430342556917688e-05, "loss": 1.1765, "num_input_tokens_seen": 13111304, "step": 814 }, { "epoch": 0.057089320269335504, "grad_norm": 4.44935941696167, "learning_rate": 9.429642732049037e-05, "loss": 1.1014, "num_input_tokens_seen": 13127304, "step": 815 }, { "epoch": 0.05715936851506475, "grad_norm": 5.019375801086426, "learning_rate": 9.428942907180386e-05, "loss": 1.0535, "num_input_tokens_seen": 13143688, "step": 816 }, { "epoch": 0.057229416760793995, "grad_norm": 4.743424892425537, "learning_rate": 9.428243082311734e-05, "loss": 1.3912, "num_input_tokens_seen": 13160072, "step": 817 }, { "epoch": 0.05729946500652324, "grad_norm": 3.921475887298584, "learning_rate": 9.427543257443083e-05, "loss": 1.1116, "num_input_tokens_seen": 13176456, "step": 818 }, { "epoch": 0.05736951325225249, "grad_norm": 4.106019020080566, "learning_rate": 9.426843432574431e-05, "loss": 0.9, "num_input_tokens_seen": 13192840, "step": 819 }, { "epoch": 0.05743956149798173, "grad_norm": 4.298704147338867, "learning_rate": 9.42614360770578e-05, "loss": 1.281, "num_input_tokens_seen": 13209144, "step": 820 }, { "epoch": 0.05750960974371098, "grad_norm": 4.29774284362793, "learning_rate": 9.425443782837128e-05, "loss": 1.2703, "num_input_tokens_seen": 13224752, "step": 821 }, { "epoch": 0.057579657989440224, "grad_norm": 4.6176838874816895, "learning_rate": 9.424743957968477e-05, "loss": 1.232, "num_input_tokens_seen": 13240856, "step": 822 }, { "epoch": 0.057649706235169476, "grad_norm": 4.450786590576172, "learning_rate": 9.424044133099826e-05, "loss": 1.1369, "num_input_tokens_seen": 13256800, "step": 823 }, { "epoch": 0.05771975448089872, "grad_norm": 3.8302414417266846, "learning_rate": 9.423344308231173e-05, "loss": 0.9985, "num_input_tokens_seen": 13273032, "step": 824 }, { "epoch": 0.05778980272662797, "grad_norm": 4.641941070556641, "learning_rate": 9.422644483362523e-05, "loss": 1.2238, "num_input_tokens_seen": 13289104, "step": 825 }, { "epoch": 0.05785985097235721, "grad_norm": 4.369805335998535, "learning_rate": 9.421944658493871e-05, "loss": 1.2047, "num_input_tokens_seen": 13304752, "step": 826 }, { "epoch": 0.05792989921808646, "grad_norm": 3.863507032394409, "learning_rate": 9.42124483362522e-05, "loss": 1.1098, "num_input_tokens_seen": 13321088, "step": 827 }, { "epoch": 0.057999947463815704, "grad_norm": 5.323369979858398, "learning_rate": 9.420545008756568e-05, "loss": 1.1722, "num_input_tokens_seen": 13336912, "step": 828 }, { "epoch": 0.05806999570954495, "grad_norm": 4.006597995758057, "learning_rate": 9.419845183887916e-05, "loss": 1.0382, "num_input_tokens_seen": 13353280, "step": 829 }, { "epoch": 0.058140043955274195, "grad_norm": 4.1039886474609375, "learning_rate": 9.419145359019265e-05, "loss": 1.2037, "num_input_tokens_seen": 13369664, "step": 830 }, { "epoch": 0.05821009220100344, "grad_norm": 3.903517007827759, "learning_rate": 9.418445534150614e-05, "loss": 1.2185, "num_input_tokens_seen": 13386048, "step": 831 }, { "epoch": 0.05828014044673269, "grad_norm": 4.434885025024414, "learning_rate": 9.417745709281963e-05, "loss": 1.2444, "num_input_tokens_seen": 13402432, "step": 832 }, { "epoch": 0.05835018869246193, "grad_norm": 4.6121296882629395, "learning_rate": 9.417045884413311e-05, "loss": 1.2831, "num_input_tokens_seen": 13418816, "step": 833 }, { "epoch": 0.05842023693819118, "grad_norm": 3.6966841220855713, "learning_rate": 9.416346059544659e-05, "loss": 1.0751, "num_input_tokens_seen": 13435200, "step": 834 }, { "epoch": 0.058490285183920424, "grad_norm": 4.292221546173096, "learning_rate": 9.415646234676008e-05, "loss": 1.2068, "num_input_tokens_seen": 13451584, "step": 835 }, { "epoch": 0.05856033342964967, "grad_norm": 4.053999900817871, "learning_rate": 9.414946409807355e-05, "loss": 1.1735, "num_input_tokens_seen": 13467824, "step": 836 }, { "epoch": 0.058630381675378915, "grad_norm": 4.4411234855651855, "learning_rate": 9.414246584938704e-05, "loss": 1.0647, "num_input_tokens_seen": 13483200, "step": 837 }, { "epoch": 0.05870042992110816, "grad_norm": 3.956787347793579, "learning_rate": 9.413546760070053e-05, "loss": 0.9813, "num_input_tokens_seen": 13499584, "step": 838 }, { "epoch": 0.058770478166837406, "grad_norm": 5.050291061401367, "learning_rate": 9.412846935201402e-05, "loss": 1.1193, "num_input_tokens_seen": 13515448, "step": 839 }, { "epoch": 0.05884052641256666, "grad_norm": 3.8736393451690674, "learning_rate": 9.412147110332751e-05, "loss": 1.0294, "num_input_tokens_seen": 13531200, "step": 840 }, { "epoch": 0.058910574658295904, "grad_norm": 6.07747745513916, "learning_rate": 9.411447285464098e-05, "loss": 0.9684, "num_input_tokens_seen": 13547584, "step": 841 }, { "epoch": 0.05898062290402515, "grad_norm": 4.606445789337158, "learning_rate": 9.410747460595447e-05, "loss": 1.2119, "num_input_tokens_seen": 13563528, "step": 842 }, { "epoch": 0.059050671149754395, "grad_norm": 4.3981709480285645, "learning_rate": 9.410047635726796e-05, "loss": 1.3313, "num_input_tokens_seen": 13579912, "step": 843 }, { "epoch": 0.05912071939548364, "grad_norm": 3.64546799659729, "learning_rate": 9.409347810858143e-05, "loss": 0.8892, "num_input_tokens_seen": 13596296, "step": 844 }, { "epoch": 0.05919076764121289, "grad_norm": 4.15845251083374, "learning_rate": 9.408647985989494e-05, "loss": 1.1464, "num_input_tokens_seen": 13612680, "step": 845 }, { "epoch": 0.05926081588694213, "grad_norm": 6.049203872680664, "learning_rate": 9.407948161120841e-05, "loss": 1.1907, "num_input_tokens_seen": 13627832, "step": 846 }, { "epoch": 0.05933086413267138, "grad_norm": 3.7192461490631104, "learning_rate": 9.40724833625219e-05, "loss": 1.165, "num_input_tokens_seen": 13643824, "step": 847 }, { "epoch": 0.059400912378400623, "grad_norm": 4.183239936828613, "learning_rate": 9.406548511383537e-05, "loss": 1.1697, "num_input_tokens_seen": 13660208, "step": 848 }, { "epoch": 0.05947096062412987, "grad_norm": 4.126212120056152, "learning_rate": 9.405848686514886e-05, "loss": 1.0532, "num_input_tokens_seen": 13676592, "step": 849 }, { "epoch": 0.059541008869859115, "grad_norm": 4.033525466918945, "learning_rate": 9.405148861646235e-05, "loss": 1.1497, "num_input_tokens_seen": 13692600, "step": 850 }, { "epoch": 0.05961105711558836, "grad_norm": 4.162797451019287, "learning_rate": 9.404449036777584e-05, "loss": 1.162, "num_input_tokens_seen": 13708984, "step": 851 }, { "epoch": 0.059681105361317606, "grad_norm": 4.057224750518799, "learning_rate": 9.403749211908933e-05, "loss": 1.2166, "num_input_tokens_seen": 13724656, "step": 852 }, { "epoch": 0.05975115360704685, "grad_norm": 4.201955318450928, "learning_rate": 9.40304938704028e-05, "loss": 1.2195, "num_input_tokens_seen": 13741040, "step": 853 }, { "epoch": 0.0598212018527761, "grad_norm": 3.8704352378845215, "learning_rate": 9.402349562171629e-05, "loss": 0.8946, "num_input_tokens_seen": 13757424, "step": 854 }, { "epoch": 0.05989125009850534, "grad_norm": 6.010958671569824, "learning_rate": 9.401649737302978e-05, "loss": 1.2095, "num_input_tokens_seen": 13773808, "step": 855 }, { "epoch": 0.05996129834423459, "grad_norm": 4.975742816925049, "learning_rate": 9.400949912434326e-05, "loss": 1.1064, "num_input_tokens_seen": 13789704, "step": 856 }, { "epoch": 0.06003134658996384, "grad_norm": 4.021739959716797, "learning_rate": 9.400250087565675e-05, "loss": 1.2036, "num_input_tokens_seen": 13806088, "step": 857 }, { "epoch": 0.06010139483569309, "grad_norm": 4.262394905090332, "learning_rate": 9.399550262697023e-05, "loss": 1.1053, "num_input_tokens_seen": 13821928, "step": 858 }, { "epoch": 0.06017144308142233, "grad_norm": 4.3033671379089355, "learning_rate": 9.398850437828372e-05, "loss": 1.0213, "num_input_tokens_seen": 13838232, "step": 859 }, { "epoch": 0.06024149132715158, "grad_norm": 4.066610336303711, "learning_rate": 9.398150612959721e-05, "loss": 1.0579, "num_input_tokens_seen": 13853912, "step": 860 }, { "epoch": 0.06031153957288082, "grad_norm": 4.308155059814453, "learning_rate": 9.397450788091069e-05, "loss": 1.3624, "num_input_tokens_seen": 13870224, "step": 861 }, { "epoch": 0.06038158781861007, "grad_norm": 4.307553291320801, "learning_rate": 9.396750963222417e-05, "loss": 1.0942, "num_input_tokens_seen": 13886608, "step": 862 }, { "epoch": 0.060451636064339315, "grad_norm": 3.8107142448425293, "learning_rate": 9.396051138353765e-05, "loss": 1.1285, "num_input_tokens_seen": 13902992, "step": 863 }, { "epoch": 0.06052168431006856, "grad_norm": 4.530765533447266, "learning_rate": 9.395351313485114e-05, "loss": 1.2028, "num_input_tokens_seen": 13919376, "step": 864 }, { "epoch": 0.060591732555797806, "grad_norm": 4.035069465637207, "learning_rate": 9.394651488616463e-05, "loss": 1.0291, "num_input_tokens_seen": 13935664, "step": 865 }, { "epoch": 0.06066178080152705, "grad_norm": 4.028316497802734, "learning_rate": 9.393951663747812e-05, "loss": 1.21, "num_input_tokens_seen": 13951096, "step": 866 }, { "epoch": 0.0607318290472563, "grad_norm": 4.039167881011963, "learning_rate": 9.39325183887916e-05, "loss": 0.929, "num_input_tokens_seen": 13966272, "step": 867 }, { "epoch": 0.06080187729298554, "grad_norm": 4.139703273773193, "learning_rate": 9.392552014010508e-05, "loss": 1.2575, "num_input_tokens_seen": 13981848, "step": 868 }, { "epoch": 0.06087192553871479, "grad_norm": 4.222180366516113, "learning_rate": 9.391852189141857e-05, "loss": 1.2067, "num_input_tokens_seen": 13997920, "step": 869 }, { "epoch": 0.060941973784444034, "grad_norm": 3.7993030548095703, "learning_rate": 9.391152364273206e-05, "loss": 1.0865, "num_input_tokens_seen": 14014304, "step": 870 }, { "epoch": 0.06101202203017328, "grad_norm": 4.811493396759033, "learning_rate": 9.390452539404554e-05, "loss": 1.1331, "num_input_tokens_seen": 14030688, "step": 871 }, { "epoch": 0.061082070275902525, "grad_norm": 13.88792610168457, "learning_rate": 9.389752714535903e-05, "loss": 1.1368, "num_input_tokens_seen": 14045584, "step": 872 }, { "epoch": 0.06115211852163177, "grad_norm": 3.7678709030151367, "learning_rate": 9.389052889667251e-05, "loss": 1.1012, "num_input_tokens_seen": 14061968, "step": 873 }, { "epoch": 0.061222166767361016, "grad_norm": 4.252075672149658, "learning_rate": 9.3883530647986e-05, "loss": 1.0472, "num_input_tokens_seen": 14077584, "step": 874 }, { "epoch": 0.06129221501309027, "grad_norm": 3.555629253387451, "learning_rate": 9.387653239929947e-05, "loss": 0.8653, "num_input_tokens_seen": 14093704, "step": 875 }, { "epoch": 0.061362263258819515, "grad_norm": 4.122331619262695, "learning_rate": 9.386953415061296e-05, "loss": 1.0395, "num_input_tokens_seen": 14109624, "step": 876 }, { "epoch": 0.06143231150454876, "grad_norm": 3.6772518157958984, "learning_rate": 9.386253590192645e-05, "loss": 0.8842, "num_input_tokens_seen": 14126008, "step": 877 }, { "epoch": 0.061502359750278006, "grad_norm": 3.791351079940796, "learning_rate": 9.385553765323994e-05, "loss": 1.1118, "num_input_tokens_seen": 14142392, "step": 878 }, { "epoch": 0.06157240799600725, "grad_norm": 3.781759738922119, "learning_rate": 9.384853940455343e-05, "loss": 1.0577, "num_input_tokens_seen": 14158776, "step": 879 }, { "epoch": 0.0616424562417365, "grad_norm": 4.2420830726623535, "learning_rate": 9.38415411558669e-05, "loss": 1.268, "num_input_tokens_seen": 14173920, "step": 880 }, { "epoch": 0.06171250448746574, "grad_norm": 4.000860214233398, "learning_rate": 9.383454290718039e-05, "loss": 1.1626, "num_input_tokens_seen": 14190032, "step": 881 }, { "epoch": 0.06178255273319499, "grad_norm": 3.760969877243042, "learning_rate": 9.382754465849388e-05, "loss": 0.9684, "num_input_tokens_seen": 14206416, "step": 882 }, { "epoch": 0.061852600978924234, "grad_norm": 4.81919002532959, "learning_rate": 9.382054640980735e-05, "loss": 1.1056, "num_input_tokens_seen": 14222408, "step": 883 }, { "epoch": 0.06192264922465348, "grad_norm": 4.951950550079346, "learning_rate": 9.381354816112084e-05, "loss": 1.0334, "num_input_tokens_seen": 14238616, "step": 884 }, { "epoch": 0.061992697470382725, "grad_norm": 4.15132999420166, "learning_rate": 9.380654991243433e-05, "loss": 1.3171, "num_input_tokens_seen": 14254968, "step": 885 }, { "epoch": 0.06206274571611197, "grad_norm": 5.100244998931885, "learning_rate": 9.379955166374782e-05, "loss": 1.1684, "num_input_tokens_seen": 14271352, "step": 886 }, { "epoch": 0.062132793961841216, "grad_norm": 5.999105453491211, "learning_rate": 9.379255341506131e-05, "loss": 0.9824, "num_input_tokens_seen": 14287496, "step": 887 }, { "epoch": 0.06220284220757046, "grad_norm": 3.8826348781585693, "learning_rate": 9.378555516637478e-05, "loss": 1.0829, "num_input_tokens_seen": 14303880, "step": 888 }, { "epoch": 0.06227289045329971, "grad_norm": 5.308819770812988, "learning_rate": 9.377855691768827e-05, "loss": 1.1377, "num_input_tokens_seen": 14320264, "step": 889 }, { "epoch": 0.06234293869902895, "grad_norm": 4.383331775665283, "learning_rate": 9.377155866900175e-05, "loss": 1.0147, "num_input_tokens_seen": 14336232, "step": 890 }, { "epoch": 0.0624129869447582, "grad_norm": 4.335045337677002, "learning_rate": 9.376456042031524e-05, "loss": 0.9807, "num_input_tokens_seen": 14351704, "step": 891 }, { "epoch": 0.06248303519048745, "grad_norm": 3.6901326179504395, "learning_rate": 9.375756217162872e-05, "loss": 1.0494, "num_input_tokens_seen": 14368088, "step": 892 }, { "epoch": 0.0625530834362167, "grad_norm": 3.912727117538452, "learning_rate": 9.375056392294221e-05, "loss": 1.1191, "num_input_tokens_seen": 14383904, "step": 893 }, { "epoch": 0.06262313168194594, "grad_norm": 3.5688252449035645, "learning_rate": 9.37435656742557e-05, "loss": 0.833, "num_input_tokens_seen": 14399648, "step": 894 }, { "epoch": 0.06269317992767519, "grad_norm": 4.6460137367248535, "learning_rate": 9.373656742556918e-05, "loss": 1.2523, "num_input_tokens_seen": 14415640, "step": 895 }, { "epoch": 0.06276322817340443, "grad_norm": 3.8113012313842773, "learning_rate": 9.372956917688266e-05, "loss": 1.1789, "num_input_tokens_seen": 14432024, "step": 896 }, { "epoch": 0.06283327641913368, "grad_norm": 3.8755953311920166, "learning_rate": 9.372257092819615e-05, "loss": 1.1506, "num_input_tokens_seen": 14448152, "step": 897 }, { "epoch": 0.06290332466486293, "grad_norm": 4.225901126861572, "learning_rate": 9.371557267950964e-05, "loss": 1.0754, "num_input_tokens_seen": 14464536, "step": 898 }, { "epoch": 0.06297337291059217, "grad_norm": 3.9437992572784424, "learning_rate": 9.370857443082313e-05, "loss": 1.049, "num_input_tokens_seen": 14480072, "step": 899 }, { "epoch": 0.06304342115632142, "grad_norm": 3.8961846828460693, "learning_rate": 9.37015761821366e-05, "loss": 1.1925, "num_input_tokens_seen": 14496456, "step": 900 }, { "epoch": 0.06311346940205066, "grad_norm": 4.844581604003906, "learning_rate": 9.36945779334501e-05, "loss": 1.0867, "num_input_tokens_seen": 14512520, "step": 901 }, { "epoch": 0.06318351764777991, "grad_norm": 4.89027214050293, "learning_rate": 9.368757968476357e-05, "loss": 1.0997, "num_input_tokens_seen": 14528904, "step": 902 }, { "epoch": 0.06325356589350915, "grad_norm": 4.303073883056641, "learning_rate": 9.368058143607706e-05, "loss": 1.0626, "num_input_tokens_seen": 14545288, "step": 903 }, { "epoch": 0.0633236141392384, "grad_norm": 5.145171165466309, "learning_rate": 9.367358318739055e-05, "loss": 1.3597, "num_input_tokens_seen": 14561672, "step": 904 }, { "epoch": 0.06339366238496764, "grad_norm": 5.7905964851379395, "learning_rate": 9.366658493870403e-05, "loss": 1.1075, "num_input_tokens_seen": 14575896, "step": 905 }, { "epoch": 0.06346371063069689, "grad_norm": 3.7394728660583496, "learning_rate": 9.365958669001752e-05, "loss": 0.9347, "num_input_tokens_seen": 14592280, "step": 906 }, { "epoch": 0.06353375887642614, "grad_norm": 3.916626453399658, "learning_rate": 9.3652588441331e-05, "loss": 1.0793, "num_input_tokens_seen": 14608072, "step": 907 }, { "epoch": 0.06360380712215538, "grad_norm": 5.088227272033691, "learning_rate": 9.364559019264449e-05, "loss": 1.158, "num_input_tokens_seen": 14624360, "step": 908 }, { "epoch": 0.06367385536788463, "grad_norm": 3.8519606590270996, "learning_rate": 9.363859194395798e-05, "loss": 1.1235, "num_input_tokens_seen": 14640744, "step": 909 }, { "epoch": 0.06374390361361387, "grad_norm": 4.450200080871582, "learning_rate": 9.363159369527145e-05, "loss": 1.0145, "num_input_tokens_seen": 14657128, "step": 910 }, { "epoch": 0.06381395185934312, "grad_norm": 4.188115119934082, "learning_rate": 9.362459544658494e-05, "loss": 1.1457, "num_input_tokens_seen": 14673128, "step": 911 }, { "epoch": 0.06388400010507236, "grad_norm": 4.67346715927124, "learning_rate": 9.361759719789843e-05, "loss": 1.2841, "num_input_tokens_seen": 14689512, "step": 912 }, { "epoch": 0.06395404835080161, "grad_norm": 3.737790822982788, "learning_rate": 9.361059894921192e-05, "loss": 1.0114, "num_input_tokens_seen": 14705872, "step": 913 }, { "epoch": 0.06402409659653086, "grad_norm": 4.2486653327941895, "learning_rate": 9.36036007005254e-05, "loss": 1.1526, "num_input_tokens_seen": 14721816, "step": 914 }, { "epoch": 0.0640941448422601, "grad_norm": 4.120566368103027, "learning_rate": 9.359660245183888e-05, "loss": 1.1045, "num_input_tokens_seen": 14738200, "step": 915 }, { "epoch": 0.06416419308798935, "grad_norm": 5.259902477264404, "learning_rate": 9.358960420315237e-05, "loss": 1.3544, "num_input_tokens_seen": 14753920, "step": 916 }, { "epoch": 0.06423424133371859, "grad_norm": 3.900827646255493, "learning_rate": 9.358260595446584e-05, "loss": 1.1079, "num_input_tokens_seen": 14769640, "step": 917 }, { "epoch": 0.06430428957944785, "grad_norm": 4.103065490722656, "learning_rate": 9.357560770577935e-05, "loss": 0.963, "num_input_tokens_seen": 14786024, "step": 918 }, { "epoch": 0.0643743378251771, "grad_norm": 3.9913623332977295, "learning_rate": 9.356860945709282e-05, "loss": 1.0959, "num_input_tokens_seen": 14802408, "step": 919 }, { "epoch": 0.06444438607090634, "grad_norm": 3.7369885444641113, "learning_rate": 9.356161120840631e-05, "loss": 1.131, "num_input_tokens_seen": 14818792, "step": 920 }, { "epoch": 0.06451443431663559, "grad_norm": 4.029351711273193, "learning_rate": 9.35546129597198e-05, "loss": 1.0378, "num_input_tokens_seen": 14833792, "step": 921 }, { "epoch": 0.06458448256236483, "grad_norm": 4.043665885925293, "learning_rate": 9.354761471103327e-05, "loss": 1.179, "num_input_tokens_seen": 14850176, "step": 922 }, { "epoch": 0.06465453080809408, "grad_norm": 3.7803280353546143, "learning_rate": 9.354061646234676e-05, "loss": 0.9886, "num_input_tokens_seen": 14866096, "step": 923 }, { "epoch": 0.06472457905382333, "grad_norm": 5.537375450134277, "learning_rate": 9.353361821366025e-05, "loss": 1.2519, "num_input_tokens_seen": 14882480, "step": 924 }, { "epoch": 0.06479462729955257, "grad_norm": 4.944652557373047, "learning_rate": 9.352661996497374e-05, "loss": 1.1963, "num_input_tokens_seen": 14898864, "step": 925 }, { "epoch": 0.06486467554528182, "grad_norm": 4.3231611251831055, "learning_rate": 9.351962171628723e-05, "loss": 1.1858, "num_input_tokens_seen": 14913856, "step": 926 }, { "epoch": 0.06493472379101106, "grad_norm": 4.386692523956299, "learning_rate": 9.35126234676007e-05, "loss": 1.0464, "num_input_tokens_seen": 14929816, "step": 927 }, { "epoch": 0.06500477203674031, "grad_norm": 4.607088088989258, "learning_rate": 9.350562521891419e-05, "loss": 1.2197, "num_input_tokens_seen": 14946200, "step": 928 }, { "epoch": 0.06507482028246955, "grad_norm": 4.7108001708984375, "learning_rate": 9.349862697022767e-05, "loss": 1.2335, "num_input_tokens_seen": 14961816, "step": 929 }, { "epoch": 0.0651448685281988, "grad_norm": 3.844571352005005, "learning_rate": 9.349162872154115e-05, "loss": 1.2745, "num_input_tokens_seen": 14978200, "step": 930 }, { "epoch": 0.06521491677392804, "grad_norm": 4.078561782836914, "learning_rate": 9.348463047285464e-05, "loss": 1.1737, "num_input_tokens_seen": 14994440, "step": 931 }, { "epoch": 0.06528496501965729, "grad_norm": 4.317986011505127, "learning_rate": 9.347763222416813e-05, "loss": 1.3046, "num_input_tokens_seen": 15010824, "step": 932 }, { "epoch": 0.06535501326538654, "grad_norm": 4.459141254425049, "learning_rate": 9.347063397548162e-05, "loss": 1.2893, "num_input_tokens_seen": 15026608, "step": 933 }, { "epoch": 0.06542506151111578, "grad_norm": 4.251399993896484, "learning_rate": 9.34636357267951e-05, "loss": 1.2346, "num_input_tokens_seen": 15042328, "step": 934 }, { "epoch": 0.06549510975684503, "grad_norm": 4.568341255187988, "learning_rate": 9.345663747810858e-05, "loss": 1.4343, "num_input_tokens_seen": 15058712, "step": 935 }, { "epoch": 0.06556515800257427, "grad_norm": 4.7616424560546875, "learning_rate": 9.344963922942207e-05, "loss": 1.0925, "num_input_tokens_seen": 15075096, "step": 936 }, { "epoch": 0.06563520624830352, "grad_norm": 3.8224191665649414, "learning_rate": 9.344264098073555e-05, "loss": 1.0958, "num_input_tokens_seen": 15091480, "step": 937 }, { "epoch": 0.06570525449403276, "grad_norm": 4.985624313354492, "learning_rate": 9.343564273204905e-05, "loss": 1.233, "num_input_tokens_seen": 15107864, "step": 938 }, { "epoch": 0.06577530273976201, "grad_norm": 4.3780975341796875, "learning_rate": 9.342864448336252e-05, "loss": 1.1819, "num_input_tokens_seen": 15123656, "step": 939 }, { "epoch": 0.06584535098549125, "grad_norm": 4.435183525085449, "learning_rate": 9.342164623467601e-05, "loss": 1.1107, "num_input_tokens_seen": 15140040, "step": 940 }, { "epoch": 0.0659153992312205, "grad_norm": 4.560804843902588, "learning_rate": 9.34146479859895e-05, "loss": 1.1274, "num_input_tokens_seen": 15156424, "step": 941 }, { "epoch": 0.06598544747694975, "grad_norm": 5.184841156005859, "learning_rate": 9.340764973730298e-05, "loss": 1.3124, "num_input_tokens_seen": 15172504, "step": 942 }, { "epoch": 0.06605549572267899, "grad_norm": 3.5243096351623535, "learning_rate": 9.340065148861647e-05, "loss": 0.8203, "num_input_tokens_seen": 15188888, "step": 943 }, { "epoch": 0.06612554396840824, "grad_norm": 4.041544437408447, "learning_rate": 9.339365323992995e-05, "loss": 1.0602, "num_input_tokens_seen": 15204672, "step": 944 }, { "epoch": 0.06619559221413748, "grad_norm": 3.720906972885132, "learning_rate": 9.338665499124344e-05, "loss": 1.0722, "num_input_tokens_seen": 15220688, "step": 945 }, { "epoch": 0.06626564045986673, "grad_norm": 3.9778380393981934, "learning_rate": 9.337965674255692e-05, "loss": 1.2653, "num_input_tokens_seen": 15236856, "step": 946 }, { "epoch": 0.06633568870559597, "grad_norm": 4.486488342285156, "learning_rate": 9.33726584938704e-05, "loss": 1.2408, "num_input_tokens_seen": 15253240, "step": 947 }, { "epoch": 0.06640573695132522, "grad_norm": 8.369994163513184, "learning_rate": 9.33656602451839e-05, "loss": 1.4841, "num_input_tokens_seen": 15267728, "step": 948 }, { "epoch": 0.06647578519705447, "grad_norm": 4.2056732177734375, "learning_rate": 9.335866199649737e-05, "loss": 1.4258, "num_input_tokens_seen": 15284112, "step": 949 }, { "epoch": 0.06654583344278371, "grad_norm": 4.396723747253418, "learning_rate": 9.335166374781086e-05, "loss": 1.1578, "num_input_tokens_seen": 15300496, "step": 950 }, { "epoch": 0.06661588168851296, "grad_norm": 3.7177491188049316, "learning_rate": 9.334466549912435e-05, "loss": 1.0664, "num_input_tokens_seen": 15316608, "step": 951 }, { "epoch": 0.0666859299342422, "grad_norm": 4.080933094024658, "learning_rate": 9.333766725043784e-05, "loss": 1.1282, "num_input_tokens_seen": 15332976, "step": 952 }, { "epoch": 0.06675597817997146, "grad_norm": 5.188856601715088, "learning_rate": 9.333066900175132e-05, "loss": 1.2079, "num_input_tokens_seen": 15349080, "step": 953 }, { "epoch": 0.06682602642570071, "grad_norm": 4.583539962768555, "learning_rate": 9.33236707530648e-05, "loss": 0.9047, "num_input_tokens_seen": 15365256, "step": 954 }, { "epoch": 0.06689607467142995, "grad_norm": 3.873830795288086, "learning_rate": 9.331667250437829e-05, "loss": 1.159, "num_input_tokens_seen": 15381640, "step": 955 }, { "epoch": 0.0669661229171592, "grad_norm": 3.9574460983276367, "learning_rate": 9.330967425569176e-05, "loss": 1.0696, "num_input_tokens_seen": 15397800, "step": 956 }, { "epoch": 0.06703617116288844, "grad_norm": 3.8933448791503906, "learning_rate": 9.330267600700525e-05, "loss": 0.9844, "num_input_tokens_seen": 15414112, "step": 957 }, { "epoch": 0.06710621940861769, "grad_norm": 4.748478412628174, "learning_rate": 9.329567775831875e-05, "loss": 1.1308, "num_input_tokens_seen": 15430496, "step": 958 }, { "epoch": 0.06717626765434694, "grad_norm": 6.755379676818848, "learning_rate": 9.328867950963223e-05, "loss": 1.206, "num_input_tokens_seen": 15445072, "step": 959 }, { "epoch": 0.06724631590007618, "grad_norm": 4.382065773010254, "learning_rate": 9.328168126094572e-05, "loss": 1.0753, "num_input_tokens_seen": 15460336, "step": 960 }, { "epoch": 0.06731636414580543, "grad_norm": 5.037116527557373, "learning_rate": 9.327468301225919e-05, "loss": 1.0562, "num_input_tokens_seen": 15474752, "step": 961 }, { "epoch": 0.06738641239153467, "grad_norm": 5.838945388793945, "learning_rate": 9.326768476357268e-05, "loss": 1.314, "num_input_tokens_seen": 15491136, "step": 962 }, { "epoch": 0.06745646063726392, "grad_norm": 3.690436840057373, "learning_rate": 9.326068651488617e-05, "loss": 0.996, "num_input_tokens_seen": 15507520, "step": 963 }, { "epoch": 0.06752650888299316, "grad_norm": 4.1123247146606445, "learning_rate": 9.325368826619966e-05, "loss": 1.2031, "num_input_tokens_seen": 15523904, "step": 964 }, { "epoch": 0.06759655712872241, "grad_norm": 4.120308876037598, "learning_rate": 9.324669001751315e-05, "loss": 0.9671, "num_input_tokens_seen": 15540136, "step": 965 }, { "epoch": 0.06766660537445165, "grad_norm": 3.9849514961242676, "learning_rate": 9.323969176882662e-05, "loss": 1.1669, "num_input_tokens_seen": 15556312, "step": 966 }, { "epoch": 0.0677366536201809, "grad_norm": 3.9164884090423584, "learning_rate": 9.323269352014011e-05, "loss": 1.0883, "num_input_tokens_seen": 15571864, "step": 967 }, { "epoch": 0.06780670186591015, "grad_norm": 4.282434940338135, "learning_rate": 9.32256952714536e-05, "loss": 1.241, "num_input_tokens_seen": 15587800, "step": 968 }, { "epoch": 0.06787675011163939, "grad_norm": 4.118724346160889, "learning_rate": 9.321869702276707e-05, "loss": 1.0905, "num_input_tokens_seen": 15603128, "step": 969 }, { "epoch": 0.06794679835736864, "grad_norm": 4.233770847320557, "learning_rate": 9.321169877408056e-05, "loss": 1.0618, "num_input_tokens_seen": 15617864, "step": 970 }, { "epoch": 0.06801684660309788, "grad_norm": 3.933587074279785, "learning_rate": 9.320470052539405e-05, "loss": 0.982, "num_input_tokens_seen": 15634248, "step": 971 }, { "epoch": 0.06808689484882713, "grad_norm": 4.641788482666016, "learning_rate": 9.319770227670754e-05, "loss": 0.9793, "num_input_tokens_seen": 15650304, "step": 972 }, { "epoch": 0.06815694309455637, "grad_norm": 4.138880729675293, "learning_rate": 9.319070402802102e-05, "loss": 1.1991, "num_input_tokens_seen": 15666688, "step": 973 }, { "epoch": 0.06822699134028562, "grad_norm": 4.823685169219971, "learning_rate": 9.31837057793345e-05, "loss": 0.9162, "num_input_tokens_seen": 15682936, "step": 974 }, { "epoch": 0.06829703958601487, "grad_norm": 4.432481288909912, "learning_rate": 9.317670753064799e-05, "loss": 0.9626, "num_input_tokens_seen": 15699320, "step": 975 }, { "epoch": 0.06836708783174411, "grad_norm": 4.115868091583252, "learning_rate": 9.316970928196147e-05, "loss": 1.105, "num_input_tokens_seen": 15715296, "step": 976 }, { "epoch": 0.06843713607747336, "grad_norm": 3.964905023574829, "learning_rate": 9.316271103327496e-05, "loss": 1.0064, "num_input_tokens_seen": 15731680, "step": 977 }, { "epoch": 0.0685071843232026, "grad_norm": 3.686522960662842, "learning_rate": 9.315571278458846e-05, "loss": 0.9924, "num_input_tokens_seen": 15747808, "step": 978 }, { "epoch": 0.06857723256893185, "grad_norm": 4.0614423751831055, "learning_rate": 9.314871453590193e-05, "loss": 1.0425, "num_input_tokens_seen": 15764168, "step": 979 }, { "epoch": 0.0686472808146611, "grad_norm": 3.756350517272949, "learning_rate": 9.314171628721542e-05, "loss": 1.0757, "num_input_tokens_seen": 15780176, "step": 980 }, { "epoch": 0.06871732906039034, "grad_norm": 4.30344820022583, "learning_rate": 9.31347180385289e-05, "loss": 0.9496, "num_input_tokens_seen": 15795720, "step": 981 }, { "epoch": 0.06878737730611958, "grad_norm": 4.055768013000488, "learning_rate": 9.312771978984239e-05, "loss": 1.0189, "num_input_tokens_seen": 15811528, "step": 982 }, { "epoch": 0.06885742555184883, "grad_norm": 3.8779115676879883, "learning_rate": 9.312072154115586e-05, "loss": 1.0516, "num_input_tokens_seen": 15827392, "step": 983 }, { "epoch": 0.06892747379757808, "grad_norm": 5.014206886291504, "learning_rate": 9.311372329246936e-05, "loss": 1.3421, "num_input_tokens_seen": 15843776, "step": 984 }, { "epoch": 0.06899752204330732, "grad_norm": 4.548489570617676, "learning_rate": 9.310672504378285e-05, "loss": 1.1652, "num_input_tokens_seen": 15858880, "step": 985 }, { "epoch": 0.06906757028903657, "grad_norm": 4.312918186187744, "learning_rate": 9.309972679509633e-05, "loss": 1.2728, "num_input_tokens_seen": 15874840, "step": 986 }, { "epoch": 0.06913761853476583, "grad_norm": 3.9783735275268555, "learning_rate": 9.309272854640981e-05, "loss": 0.9377, "num_input_tokens_seen": 15890568, "step": 987 }, { "epoch": 0.06920766678049507, "grad_norm": 4.155986309051514, "learning_rate": 9.308573029772329e-05, "loss": 1.0278, "num_input_tokens_seen": 15906952, "step": 988 }, { "epoch": 0.06927771502622432, "grad_norm": 3.633018732070923, "learning_rate": 9.307873204903678e-05, "loss": 1.1276, "num_input_tokens_seen": 15923336, "step": 989 }, { "epoch": 0.06934776327195356, "grad_norm": 3.9513449668884277, "learning_rate": 9.307173380035027e-05, "loss": 0.9076, "num_input_tokens_seen": 15939720, "step": 990 }, { "epoch": 0.06941781151768281, "grad_norm": 4.296191692352295, "learning_rate": 9.306473555166376e-05, "loss": 1.0375, "num_input_tokens_seen": 15956104, "step": 991 }, { "epoch": 0.06948785976341205, "grad_norm": 5.266847133636475, "learning_rate": 9.305773730297724e-05, "loss": 1.1645, "num_input_tokens_seen": 15972488, "step": 992 }, { "epoch": 0.0695579080091413, "grad_norm": 4.321287155151367, "learning_rate": 9.305073905429072e-05, "loss": 1.046, "num_input_tokens_seen": 15988408, "step": 993 }, { "epoch": 0.06962795625487055, "grad_norm": 4.1421613693237305, "learning_rate": 9.304374080560421e-05, "loss": 1.0639, "num_input_tokens_seen": 16002904, "step": 994 }, { "epoch": 0.06969800450059979, "grad_norm": 6.811270713806152, "learning_rate": 9.30367425569177e-05, "loss": 1.1012, "num_input_tokens_seen": 16017424, "step": 995 }, { "epoch": 0.06976805274632904, "grad_norm": 4.968684196472168, "learning_rate": 9.302974430823117e-05, "loss": 1.0935, "num_input_tokens_seen": 16033808, "step": 996 }, { "epoch": 0.06983810099205828, "grad_norm": 4.592737197875977, "learning_rate": 9.302274605954466e-05, "loss": 0.9698, "num_input_tokens_seen": 16050192, "step": 997 }, { "epoch": 0.06990814923778753, "grad_norm": 3.7984917163848877, "learning_rate": 9.301574781085815e-05, "loss": 1.0976, "num_input_tokens_seen": 16066192, "step": 998 }, { "epoch": 0.06997819748351677, "grad_norm": 4.594212055206299, "learning_rate": 9.300874956217164e-05, "loss": 1.3718, "num_input_tokens_seen": 16082576, "step": 999 }, { "epoch": 0.07004824572924602, "grad_norm": 5.062666893005371, "learning_rate": 9.300175131348511e-05, "loss": 1.3139, "num_input_tokens_seen": 16098960, "step": 1000 }, { "epoch": 0.07004824572924602, "eval_loss": 1.1650840044021606, "eval_runtime": 0.192, "eval_samples_per_second": 5.208, "eval_steps_per_second": 5.208, "num_input_tokens_seen": 16098960, "step": 1000 }, { "epoch": 0.07011829397497527, "grad_norm": 4.100902557373047, "learning_rate": 9.29947530647986e-05, "loss": 1.2711, "num_input_tokens_seen": 16115216, "step": 1001 }, { "epoch": 0.07018834222070451, "grad_norm": 4.24728536605835, "learning_rate": 9.298775481611209e-05, "loss": 0.9946, "num_input_tokens_seen": 16130080, "step": 1002 }, { "epoch": 0.07025839046643376, "grad_norm": 3.4653356075286865, "learning_rate": 9.298075656742556e-05, "loss": 0.8736, "num_input_tokens_seen": 16146400, "step": 1003 }, { "epoch": 0.070328438712163, "grad_norm": 5.548775672912598, "learning_rate": 9.297375831873907e-05, "loss": 0.9841, "num_input_tokens_seen": 16162784, "step": 1004 }, { "epoch": 0.07039848695789225, "grad_norm": 4.11661958694458, "learning_rate": 9.296676007005256e-05, "loss": 0.9857, "num_input_tokens_seen": 16179024, "step": 1005 }, { "epoch": 0.0704685352036215, "grad_norm": 4.006300449371338, "learning_rate": 9.295976182136603e-05, "loss": 1.0587, "num_input_tokens_seen": 16195408, "step": 1006 }, { "epoch": 0.07053858344935074, "grad_norm": 4.418802261352539, "learning_rate": 9.295276357267952e-05, "loss": 1.3845, "num_input_tokens_seen": 16211792, "step": 1007 }, { "epoch": 0.07060863169507998, "grad_norm": 5.625720024108887, "learning_rate": 9.2945765323993e-05, "loss": 1.2198, "num_input_tokens_seen": 16226584, "step": 1008 }, { "epoch": 0.07067867994080923, "grad_norm": 4.209630489349365, "learning_rate": 9.293876707530648e-05, "loss": 0.9387, "num_input_tokens_seen": 16242256, "step": 1009 }, { "epoch": 0.07074872818653848, "grad_norm": 4.0324788093566895, "learning_rate": 9.293176882661997e-05, "loss": 1.0713, "num_input_tokens_seen": 16258640, "step": 1010 }, { "epoch": 0.07081877643226772, "grad_norm": 4.0557684898376465, "learning_rate": 9.292477057793346e-05, "loss": 1.2831, "num_input_tokens_seen": 16275024, "step": 1011 }, { "epoch": 0.07088882467799697, "grad_norm": 4.511384010314941, "learning_rate": 9.291777232924695e-05, "loss": 1.1949, "num_input_tokens_seen": 16291112, "step": 1012 }, { "epoch": 0.07095887292372621, "grad_norm": 3.8120172023773193, "learning_rate": 9.291077408056042e-05, "loss": 1.013, "num_input_tokens_seen": 16307496, "step": 1013 }, { "epoch": 0.07102892116945546, "grad_norm": 4.039558410644531, "learning_rate": 9.290377583187391e-05, "loss": 1.1575, "num_input_tokens_seen": 16323880, "step": 1014 }, { "epoch": 0.0710989694151847, "grad_norm": 3.9076366424560547, "learning_rate": 9.289677758318739e-05, "loss": 1.1776, "num_input_tokens_seen": 16339624, "step": 1015 }, { "epoch": 0.07116901766091395, "grad_norm": 3.8083527088165283, "learning_rate": 9.288977933450088e-05, "loss": 0.965, "num_input_tokens_seen": 16356008, "step": 1016 }, { "epoch": 0.0712390659066432, "grad_norm": 4.5387282371521, "learning_rate": 9.288278108581436e-05, "loss": 1.1113, "num_input_tokens_seen": 16372392, "step": 1017 }, { "epoch": 0.07130911415237244, "grad_norm": 3.9228522777557373, "learning_rate": 9.287578283712785e-05, "loss": 1.1609, "num_input_tokens_seen": 16388776, "step": 1018 }, { "epoch": 0.07137916239810169, "grad_norm": 4.170912742614746, "learning_rate": 9.286878458844134e-05, "loss": 1.1324, "num_input_tokens_seen": 16405160, "step": 1019 }, { "epoch": 0.07144921064383093, "grad_norm": 4.426759719848633, "learning_rate": 9.286178633975482e-05, "loss": 1.2825, "num_input_tokens_seen": 16421544, "step": 1020 }, { "epoch": 0.07151925888956018, "grad_norm": 3.8606133460998535, "learning_rate": 9.28547880910683e-05, "loss": 1.1734, "num_input_tokens_seen": 16437736, "step": 1021 }, { "epoch": 0.07158930713528944, "grad_norm": 4.040006637573242, "learning_rate": 9.28477898423818e-05, "loss": 1.0824, "num_input_tokens_seen": 16453776, "step": 1022 }, { "epoch": 0.07165935538101868, "grad_norm": 3.7698042392730713, "learning_rate": 9.284079159369527e-05, "loss": 1.0951, "num_input_tokens_seen": 16470160, "step": 1023 }, { "epoch": 0.07172940362674793, "grad_norm": 4.180328369140625, "learning_rate": 9.283379334500877e-05, "loss": 1.0087, "num_input_tokens_seen": 16486280, "step": 1024 }, { "epoch": 0.07179945187247717, "grad_norm": 6.02299690246582, "learning_rate": 9.282679509632225e-05, "loss": 0.9788, "num_input_tokens_seen": 16501784, "step": 1025 }, { "epoch": 0.07186950011820642, "grad_norm": 4.239454746246338, "learning_rate": 9.281979684763573e-05, "loss": 1.3031, "num_input_tokens_seen": 16518096, "step": 1026 }, { "epoch": 0.07193954836393567, "grad_norm": 3.446030616760254, "learning_rate": 9.281279859894921e-05, "loss": 0.9523, "num_input_tokens_seen": 16534480, "step": 1027 }, { "epoch": 0.07200959660966491, "grad_norm": 4.2813568115234375, "learning_rate": 9.28058003502627e-05, "loss": 1.1041, "num_input_tokens_seen": 16550864, "step": 1028 }, { "epoch": 0.07207964485539416, "grad_norm": 5.289443016052246, "learning_rate": 9.279880210157619e-05, "loss": 1.3036, "num_input_tokens_seen": 16567248, "step": 1029 }, { "epoch": 0.0721496931011234, "grad_norm": 3.680283308029175, "learning_rate": 9.279180385288967e-05, "loss": 1.1434, "num_input_tokens_seen": 16583632, "step": 1030 }, { "epoch": 0.07221974134685265, "grad_norm": 4.283925533294678, "learning_rate": 9.278480560420316e-05, "loss": 1.1569, "num_input_tokens_seen": 16600016, "step": 1031 }, { "epoch": 0.0722897895925819, "grad_norm": 4.913532733917236, "learning_rate": 9.277780735551665e-05, "loss": 1.218, "num_input_tokens_seen": 16616400, "step": 1032 }, { "epoch": 0.07235983783831114, "grad_norm": 4.344277381896973, "learning_rate": 9.277080910683013e-05, "loss": 1.1495, "num_input_tokens_seen": 16632024, "step": 1033 }, { "epoch": 0.07242988608404038, "grad_norm": 3.9231889247894287, "learning_rate": 9.276381085814362e-05, "loss": 1.0492, "num_input_tokens_seen": 16648408, "step": 1034 }, { "epoch": 0.07249993432976963, "grad_norm": 4.062288284301758, "learning_rate": 9.275681260945709e-05, "loss": 0.927, "num_input_tokens_seen": 16664792, "step": 1035 }, { "epoch": 0.07256998257549888, "grad_norm": 4.163131237030029, "learning_rate": 9.274981436077058e-05, "loss": 1.0782, "num_input_tokens_seen": 16680216, "step": 1036 }, { "epoch": 0.07264003082122812, "grad_norm": 5.220231056213379, "learning_rate": 9.274281611208407e-05, "loss": 1.125, "num_input_tokens_seen": 16696160, "step": 1037 }, { "epoch": 0.07271007906695737, "grad_norm": 3.63785457611084, "learning_rate": 9.273581786339756e-05, "loss": 1.0229, "num_input_tokens_seen": 16712544, "step": 1038 }, { "epoch": 0.07278012731268661, "grad_norm": 4.612295627593994, "learning_rate": 9.272881961471105e-05, "loss": 1.3076, "num_input_tokens_seen": 16728928, "step": 1039 }, { "epoch": 0.07285017555841586, "grad_norm": 5.278262615203857, "learning_rate": 9.272182136602452e-05, "loss": 1.2682, "num_input_tokens_seen": 16744184, "step": 1040 }, { "epoch": 0.0729202238041451, "grad_norm": 4.3274455070495605, "learning_rate": 9.271482311733801e-05, "loss": 1.3517, "num_input_tokens_seen": 16760056, "step": 1041 }, { "epoch": 0.07299027204987435, "grad_norm": 4.1077375411987305, "learning_rate": 9.270782486865148e-05, "loss": 1.175, "num_input_tokens_seen": 16776280, "step": 1042 }, { "epoch": 0.0730603202956036, "grad_norm": 3.954604148864746, "learning_rate": 9.270082661996497e-05, "loss": 1.189, "num_input_tokens_seen": 16792456, "step": 1043 }, { "epoch": 0.07313036854133284, "grad_norm": 4.111297607421875, "learning_rate": 9.269382837127847e-05, "loss": 1.0265, "num_input_tokens_seen": 16808840, "step": 1044 }, { "epoch": 0.07320041678706209, "grad_norm": 3.56953501701355, "learning_rate": 9.268683012259195e-05, "loss": 1.0114, "num_input_tokens_seen": 16824720, "step": 1045 }, { "epoch": 0.07327046503279133, "grad_norm": 4.962648868560791, "learning_rate": 9.267983187390544e-05, "loss": 1.1714, "num_input_tokens_seen": 16841104, "step": 1046 }, { "epoch": 0.07334051327852058, "grad_norm": 3.7930710315704346, "learning_rate": 9.267283362521891e-05, "loss": 1.0903, "num_input_tokens_seen": 16857488, "step": 1047 }, { "epoch": 0.07341056152424982, "grad_norm": 4.158027172088623, "learning_rate": 9.26658353765324e-05, "loss": 1.1823, "num_input_tokens_seen": 16873856, "step": 1048 }, { "epoch": 0.07348060976997907, "grad_norm": 4.1571197509765625, "learning_rate": 9.265883712784589e-05, "loss": 1.2572, "num_input_tokens_seen": 16890240, "step": 1049 }, { "epoch": 0.07355065801570831, "grad_norm": 4.330874443054199, "learning_rate": 9.265183887915938e-05, "loss": 1.194, "num_input_tokens_seen": 16906624, "step": 1050 }, { "epoch": 0.07362070626143756, "grad_norm": 6.105716705322266, "learning_rate": 9.264484063047287e-05, "loss": 1.0685, "num_input_tokens_seen": 16922864, "step": 1051 }, { "epoch": 0.0736907545071668, "grad_norm": 4.8344407081604, "learning_rate": 9.263784238178634e-05, "loss": 1.1992, "num_input_tokens_seen": 16939200, "step": 1052 }, { "epoch": 0.07376080275289605, "grad_norm": 3.553568124771118, "learning_rate": 9.263084413309983e-05, "loss": 0.7907, "num_input_tokens_seen": 16955584, "step": 1053 }, { "epoch": 0.0738308509986253, "grad_norm": 3.8178694248199463, "learning_rate": 9.26238458844133e-05, "loss": 1.2031, "num_input_tokens_seen": 16971968, "step": 1054 }, { "epoch": 0.07390089924435454, "grad_norm": 3.5509321689605713, "learning_rate": 9.26168476357268e-05, "loss": 1.1189, "num_input_tokens_seen": 16988352, "step": 1055 }, { "epoch": 0.0739709474900838, "grad_norm": 3.870811939239502, "learning_rate": 9.260984938704028e-05, "loss": 1.0205, "num_input_tokens_seen": 17004736, "step": 1056 }, { "epoch": 0.07404099573581305, "grad_norm": 11.86201286315918, "learning_rate": 9.260285113835377e-05, "loss": 1.037, "num_input_tokens_seen": 17020544, "step": 1057 }, { "epoch": 0.0741110439815423, "grad_norm": 5.2176127433776855, "learning_rate": 9.259585288966726e-05, "loss": 1.0797, "num_input_tokens_seen": 17036472, "step": 1058 }, { "epoch": 0.07418109222727154, "grad_norm": 3.72566819190979, "learning_rate": 9.258885464098075e-05, "loss": 0.9307, "num_input_tokens_seen": 17052360, "step": 1059 }, { "epoch": 0.07425114047300078, "grad_norm": 4.323361396789551, "learning_rate": 9.258185639229422e-05, "loss": 1.0783, "num_input_tokens_seen": 17067672, "step": 1060 }, { "epoch": 0.07432118871873003, "grad_norm": 4.01705265045166, "learning_rate": 9.257485814360771e-05, "loss": 1.0402, "num_input_tokens_seen": 17084056, "step": 1061 }, { "epoch": 0.07439123696445928, "grad_norm": 4.4460039138793945, "learning_rate": 9.256785989492119e-05, "loss": 1.2294, "num_input_tokens_seen": 17100096, "step": 1062 }, { "epoch": 0.07446128521018852, "grad_norm": 4.634500503540039, "learning_rate": 9.256086164623468e-05, "loss": 1.1479, "num_input_tokens_seen": 17116440, "step": 1063 }, { "epoch": 0.07453133345591777, "grad_norm": 4.146971702575684, "learning_rate": 9.255386339754817e-05, "loss": 0.9052, "num_input_tokens_seen": 17132592, "step": 1064 }, { "epoch": 0.07460138170164701, "grad_norm": 6.171874523162842, "learning_rate": 9.254686514886165e-05, "loss": 1.1135, "num_input_tokens_seen": 17148704, "step": 1065 }, { "epoch": 0.07467142994737626, "grad_norm": 6.25461483001709, "learning_rate": 9.253986690017514e-05, "loss": 1.0003, "num_input_tokens_seen": 17164920, "step": 1066 }, { "epoch": 0.0747414781931055, "grad_norm": 3.886582851409912, "learning_rate": 9.253286865148862e-05, "loss": 1.1917, "num_input_tokens_seen": 17181304, "step": 1067 }, { "epoch": 0.07481152643883475, "grad_norm": 5.067885398864746, "learning_rate": 9.25258704028021e-05, "loss": 1.4475, "num_input_tokens_seen": 17197208, "step": 1068 }, { "epoch": 0.074881574684564, "grad_norm": 4.186190128326416, "learning_rate": 9.251887215411558e-05, "loss": 1.1255, "num_input_tokens_seen": 17212680, "step": 1069 }, { "epoch": 0.07495162293029324, "grad_norm": 4.059047698974609, "learning_rate": 9.251187390542908e-05, "loss": 1.1467, "num_input_tokens_seen": 17229064, "step": 1070 }, { "epoch": 0.07502167117602249, "grad_norm": 4.154530048370361, "learning_rate": 9.250487565674257e-05, "loss": 1.0811, "num_input_tokens_seen": 17245448, "step": 1071 }, { "epoch": 0.07509171942175173, "grad_norm": 3.760453701019287, "learning_rate": 9.249787740805605e-05, "loss": 1.1493, "num_input_tokens_seen": 17261832, "step": 1072 }, { "epoch": 0.07516176766748098, "grad_norm": 3.8155417442321777, "learning_rate": 9.249087915936954e-05, "loss": 1.0934, "num_input_tokens_seen": 17278216, "step": 1073 }, { "epoch": 0.07523181591321022, "grad_norm": 4.807973384857178, "learning_rate": 9.248388091068301e-05, "loss": 1.0704, "num_input_tokens_seen": 17294600, "step": 1074 }, { "epoch": 0.07530186415893947, "grad_norm": 11.421661376953125, "learning_rate": 9.24768826619965e-05, "loss": 0.9472, "num_input_tokens_seen": 17308960, "step": 1075 }, { "epoch": 0.07537191240466871, "grad_norm": 3.7491819858551025, "learning_rate": 9.246988441330999e-05, "loss": 1.1395, "num_input_tokens_seen": 17324536, "step": 1076 }, { "epoch": 0.07544196065039796, "grad_norm": 3.6289992332458496, "learning_rate": 9.246288616462348e-05, "loss": 0.9375, "num_input_tokens_seen": 17340920, "step": 1077 }, { "epoch": 0.0755120088961272, "grad_norm": 5.741896629333496, "learning_rate": 9.245588791593696e-05, "loss": 1.1656, "num_input_tokens_seen": 17357304, "step": 1078 }, { "epoch": 0.07558205714185645, "grad_norm": 3.5879697799682617, "learning_rate": 9.244888966725044e-05, "loss": 0.9421, "num_input_tokens_seen": 17373592, "step": 1079 }, { "epoch": 0.0756521053875857, "grad_norm": 7.3384504318237305, "learning_rate": 9.244189141856393e-05, "loss": 1.1358, "num_input_tokens_seen": 17387872, "step": 1080 }, { "epoch": 0.07572215363331494, "grad_norm": 3.6677255630493164, "learning_rate": 9.24348931698774e-05, "loss": 0.892, "num_input_tokens_seen": 17403088, "step": 1081 }, { "epoch": 0.07579220187904419, "grad_norm": 3.953216075897217, "learning_rate": 9.242789492119089e-05, "loss": 0.9757, "num_input_tokens_seen": 17419392, "step": 1082 }, { "epoch": 0.07586225012477343, "grad_norm": 4.827987194061279, "learning_rate": 9.242089667250438e-05, "loss": 1.1493, "num_input_tokens_seen": 17435776, "step": 1083 }, { "epoch": 0.07593229837050268, "grad_norm": 4.416223526000977, "learning_rate": 9.241389842381787e-05, "loss": 0.9913, "num_input_tokens_seen": 17452080, "step": 1084 }, { "epoch": 0.07600234661623193, "grad_norm": 3.7776753902435303, "learning_rate": 9.240690017513136e-05, "loss": 1.0589, "num_input_tokens_seen": 17468160, "step": 1085 }, { "epoch": 0.07607239486196117, "grad_norm": 4.139477252960205, "learning_rate": 9.239990192644485e-05, "loss": 0.9475, "num_input_tokens_seen": 17484544, "step": 1086 }, { "epoch": 0.07614244310769042, "grad_norm": 5.218942642211914, "learning_rate": 9.239290367775832e-05, "loss": 1.1626, "num_input_tokens_seen": 17500928, "step": 1087 }, { "epoch": 0.07621249135341966, "grad_norm": 4.773080348968506, "learning_rate": 9.238590542907181e-05, "loss": 1.154, "num_input_tokens_seen": 17517312, "step": 1088 }, { "epoch": 0.07628253959914891, "grad_norm": 3.840151309967041, "learning_rate": 9.237890718038528e-05, "loss": 1.0862, "num_input_tokens_seen": 17533696, "step": 1089 }, { "epoch": 0.07635258784487815, "grad_norm": 4.201962471008301, "learning_rate": 9.237190893169879e-05, "loss": 1.0945, "num_input_tokens_seen": 17549512, "step": 1090 }, { "epoch": 0.07642263609060741, "grad_norm": 4.4583001136779785, "learning_rate": 9.236491068301226e-05, "loss": 1.074, "num_input_tokens_seen": 17565896, "step": 1091 }, { "epoch": 0.07649268433633666, "grad_norm": 4.013672351837158, "learning_rate": 9.235791243432575e-05, "loss": 1.2545, "num_input_tokens_seen": 17582264, "step": 1092 }, { "epoch": 0.0765627325820659, "grad_norm": 3.69555926322937, "learning_rate": 9.235091418563924e-05, "loss": 1.1615, "num_input_tokens_seen": 17597888, "step": 1093 }, { "epoch": 0.07663278082779515, "grad_norm": 4.341784954071045, "learning_rate": 9.234391593695271e-05, "loss": 1.0369, "num_input_tokens_seen": 17613392, "step": 1094 }, { "epoch": 0.0767028290735244, "grad_norm": 4.043522357940674, "learning_rate": 9.23369176882662e-05, "loss": 1.0509, "num_input_tokens_seen": 17629216, "step": 1095 }, { "epoch": 0.07677287731925364, "grad_norm": 4.330739498138428, "learning_rate": 9.232991943957969e-05, "loss": 1.2208, "num_input_tokens_seen": 17645600, "step": 1096 }, { "epoch": 0.07684292556498289, "grad_norm": 4.8433122634887695, "learning_rate": 9.232292119089318e-05, "loss": 0.9492, "num_input_tokens_seen": 17660952, "step": 1097 }, { "epoch": 0.07691297381071213, "grad_norm": 3.9039859771728516, "learning_rate": 9.231592294220667e-05, "loss": 1.0601, "num_input_tokens_seen": 17677336, "step": 1098 }, { "epoch": 0.07698302205644138, "grad_norm": 3.814103126525879, "learning_rate": 9.230892469352014e-05, "loss": 0.9902, "num_input_tokens_seen": 17693720, "step": 1099 }, { "epoch": 0.07705307030217062, "grad_norm": 3.9864039421081543, "learning_rate": 9.230192644483363e-05, "loss": 1.1622, "num_input_tokens_seen": 17710104, "step": 1100 }, { "epoch": 0.07712311854789987, "grad_norm": 4.469820499420166, "learning_rate": 9.229492819614711e-05, "loss": 1.044, "num_input_tokens_seen": 17726488, "step": 1101 }, { "epoch": 0.07719316679362911, "grad_norm": 3.8044216632843018, "learning_rate": 9.22879299474606e-05, "loss": 1.1283, "num_input_tokens_seen": 17742648, "step": 1102 }, { "epoch": 0.07726321503935836, "grad_norm": 4.859435558319092, "learning_rate": 9.228093169877408e-05, "loss": 1.0995, "num_input_tokens_seen": 17759032, "step": 1103 }, { "epoch": 0.0773332632850876, "grad_norm": 3.830214023590088, "learning_rate": 9.227393345008757e-05, "loss": 1.1731, "num_input_tokens_seen": 17774872, "step": 1104 }, { "epoch": 0.07740331153081685, "grad_norm": 4.196676254272461, "learning_rate": 9.226693520140106e-05, "loss": 1.2055, "num_input_tokens_seen": 17790832, "step": 1105 }, { "epoch": 0.0774733597765461, "grad_norm": 4.50007438659668, "learning_rate": 9.225993695271454e-05, "loss": 0.952, "num_input_tokens_seen": 17805024, "step": 1106 }, { "epoch": 0.07754340802227534, "grad_norm": 4.392070293426514, "learning_rate": 9.225293870402803e-05, "loss": 1.1548, "num_input_tokens_seen": 17820008, "step": 1107 }, { "epoch": 0.07761345626800459, "grad_norm": 4.09447717666626, "learning_rate": 9.22459404553415e-05, "loss": 1.1233, "num_input_tokens_seen": 17836392, "step": 1108 }, { "epoch": 0.07768350451373383, "grad_norm": 4.591554641723633, "learning_rate": 9.223894220665499e-05, "loss": 1.2772, "num_input_tokens_seen": 17852776, "step": 1109 }, { "epoch": 0.07775355275946308, "grad_norm": 5.629931926727295, "learning_rate": 9.223194395796849e-05, "loss": 1.1453, "num_input_tokens_seen": 17869160, "step": 1110 }, { "epoch": 0.07782360100519232, "grad_norm": 4.307553768157959, "learning_rate": 9.222494570928197e-05, "loss": 1.1479, "num_input_tokens_seen": 17885544, "step": 1111 }, { "epoch": 0.07789364925092157, "grad_norm": 4.599300384521484, "learning_rate": 9.221794746059545e-05, "loss": 1.1304, "num_input_tokens_seen": 17901848, "step": 1112 }, { "epoch": 0.07796369749665082, "grad_norm": 4.217408657073975, "learning_rate": 9.221094921190894e-05, "loss": 1.1611, "num_input_tokens_seen": 17918232, "step": 1113 }, { "epoch": 0.07803374574238006, "grad_norm": 3.885847568511963, "learning_rate": 9.220395096322242e-05, "loss": 0.968, "num_input_tokens_seen": 17934504, "step": 1114 }, { "epoch": 0.07810379398810931, "grad_norm": 4.280134677886963, "learning_rate": 9.219695271453591e-05, "loss": 1.0944, "num_input_tokens_seen": 17950888, "step": 1115 }, { "epoch": 0.07817384223383855, "grad_norm": 4.081259727478027, "learning_rate": 9.21899544658494e-05, "loss": 1.0872, "num_input_tokens_seen": 17967088, "step": 1116 }, { "epoch": 0.0782438904795678, "grad_norm": 4.206293106079102, "learning_rate": 9.218295621716288e-05, "loss": 1.2013, "num_input_tokens_seen": 17983312, "step": 1117 }, { "epoch": 0.07831393872529704, "grad_norm": 4.837226390838623, "learning_rate": 9.217595796847636e-05, "loss": 1.2628, "num_input_tokens_seen": 17998768, "step": 1118 }, { "epoch": 0.07838398697102629, "grad_norm": 4.344440460205078, "learning_rate": 9.216895971978985e-05, "loss": 1.0389, "num_input_tokens_seen": 18014840, "step": 1119 }, { "epoch": 0.07845403521675554, "grad_norm": 4.357896327972412, "learning_rate": 9.216196147110334e-05, "loss": 1.2444, "num_input_tokens_seen": 18030696, "step": 1120 }, { "epoch": 0.07852408346248478, "grad_norm": 3.6449878215789795, "learning_rate": 9.215496322241681e-05, "loss": 1.0622, "num_input_tokens_seen": 18047024, "step": 1121 }, { "epoch": 0.07859413170821403, "grad_norm": 4.154385566711426, "learning_rate": 9.21479649737303e-05, "loss": 1.1551, "num_input_tokens_seen": 18063408, "step": 1122 }, { "epoch": 0.07866417995394327, "grad_norm": 3.5929031372070312, "learning_rate": 9.214096672504379e-05, "loss": 0.9682, "num_input_tokens_seen": 18079280, "step": 1123 }, { "epoch": 0.07873422819967252, "grad_norm": 3.5724170207977295, "learning_rate": 9.213396847635728e-05, "loss": 0.8952, "num_input_tokens_seen": 18094488, "step": 1124 }, { "epoch": 0.07880427644540176, "grad_norm": 4.100067615509033, "learning_rate": 9.212697022767077e-05, "loss": 0.9066, "num_input_tokens_seen": 18110872, "step": 1125 }, { "epoch": 0.07887432469113102, "grad_norm": 4.431338787078857, "learning_rate": 9.211997197898424e-05, "loss": 1.0116, "num_input_tokens_seen": 18127256, "step": 1126 }, { "epoch": 0.07894437293686027, "grad_norm": 3.9577043056488037, "learning_rate": 9.211297373029773e-05, "loss": 1.1299, "num_input_tokens_seen": 18143208, "step": 1127 }, { "epoch": 0.07901442118258951, "grad_norm": 4.753921985626221, "learning_rate": 9.21059754816112e-05, "loss": 1.0686, "num_input_tokens_seen": 18158888, "step": 1128 }, { "epoch": 0.07908446942831876, "grad_norm": 3.763982057571411, "learning_rate": 9.209897723292469e-05, "loss": 1.0467, "num_input_tokens_seen": 18175192, "step": 1129 }, { "epoch": 0.079154517674048, "grad_norm": 3.729553699493408, "learning_rate": 9.20919789842382e-05, "loss": 1.1152, "num_input_tokens_seen": 18191384, "step": 1130 }, { "epoch": 0.07922456591977725, "grad_norm": 3.7760956287384033, "learning_rate": 9.208498073555167e-05, "loss": 1.0994, "num_input_tokens_seen": 18207768, "step": 1131 }, { "epoch": 0.0792946141655065, "grad_norm": 4.64035177230835, "learning_rate": 9.207798248686516e-05, "loss": 1.1037, "num_input_tokens_seen": 18224152, "step": 1132 }, { "epoch": 0.07936466241123574, "grad_norm": 4.1443352699279785, "learning_rate": 9.207098423817863e-05, "loss": 1.2329, "num_input_tokens_seen": 18240536, "step": 1133 }, { "epoch": 0.07943471065696499, "grad_norm": 5.332706451416016, "learning_rate": 9.206398598949212e-05, "loss": 1.1303, "num_input_tokens_seen": 18255528, "step": 1134 }, { "epoch": 0.07950475890269423, "grad_norm": 3.914705514907837, "learning_rate": 9.20569877408056e-05, "loss": 1.1182, "num_input_tokens_seen": 18271768, "step": 1135 }, { "epoch": 0.07957480714842348, "grad_norm": 4.994162559509277, "learning_rate": 9.20499894921191e-05, "loss": 1.175, "num_input_tokens_seen": 18288152, "step": 1136 }, { "epoch": 0.07964485539415272, "grad_norm": 4.132298946380615, "learning_rate": 9.204299124343259e-05, "loss": 0.9402, "num_input_tokens_seen": 18303784, "step": 1137 }, { "epoch": 0.07971490363988197, "grad_norm": 3.9048449993133545, "learning_rate": 9.203599299474606e-05, "loss": 1.1283, "num_input_tokens_seen": 18319968, "step": 1138 }, { "epoch": 0.07978495188561122, "grad_norm": 3.981844425201416, "learning_rate": 9.202899474605955e-05, "loss": 1.0472, "num_input_tokens_seen": 18335976, "step": 1139 }, { "epoch": 0.07985500013134046, "grad_norm": 4.491240501403809, "learning_rate": 9.202199649737304e-05, "loss": 1.1022, "num_input_tokens_seen": 18352360, "step": 1140 }, { "epoch": 0.07992504837706971, "grad_norm": 4.152430534362793, "learning_rate": 9.201499824868652e-05, "loss": 1.0688, "num_input_tokens_seen": 18368736, "step": 1141 }, { "epoch": 0.07999509662279895, "grad_norm": 4.337832450866699, "learning_rate": 9.2008e-05, "loss": 1.0397, "num_input_tokens_seen": 18385120, "step": 1142 }, { "epoch": 0.0800651448685282, "grad_norm": 4.865042209625244, "learning_rate": 9.200100175131349e-05, "loss": 0.9616, "num_input_tokens_seen": 18401504, "step": 1143 }, { "epoch": 0.08013519311425744, "grad_norm": 3.783113479614258, "learning_rate": 9.199400350262698e-05, "loss": 1.0001, "num_input_tokens_seen": 18417176, "step": 1144 }, { "epoch": 0.08020524135998669, "grad_norm": 4.98455286026001, "learning_rate": 9.198700525394046e-05, "loss": 1.2139, "num_input_tokens_seen": 18432584, "step": 1145 }, { "epoch": 0.08027528960571594, "grad_norm": 4.1859517097473145, "learning_rate": 9.198000700525394e-05, "loss": 1.1333, "num_input_tokens_seen": 18448968, "step": 1146 }, { "epoch": 0.08034533785144518, "grad_norm": 3.7193386554718018, "learning_rate": 9.197300875656743e-05, "loss": 1.0055, "num_input_tokens_seen": 18465352, "step": 1147 }, { "epoch": 0.08041538609717443, "grad_norm": 4.280893325805664, "learning_rate": 9.196601050788091e-05, "loss": 1.1261, "num_input_tokens_seen": 18481736, "step": 1148 }, { "epoch": 0.08048543434290367, "grad_norm": 3.9979352951049805, "learning_rate": 9.19590122591944e-05, "loss": 1.025, "num_input_tokens_seen": 18498120, "step": 1149 }, { "epoch": 0.08055548258863292, "grad_norm": 5.594225883483887, "learning_rate": 9.195201401050789e-05, "loss": 1.0527, "num_input_tokens_seen": 18513944, "step": 1150 }, { "epoch": 0.08062553083436216, "grad_norm": 4.758842468261719, "learning_rate": 9.194501576182137e-05, "loss": 1.0915, "num_input_tokens_seen": 18530328, "step": 1151 }, { "epoch": 0.08069557908009141, "grad_norm": 5.597489356994629, "learning_rate": 9.193801751313486e-05, "loss": 1.0673, "num_input_tokens_seen": 18546632, "step": 1152 }, { "epoch": 0.08076562732582065, "grad_norm": 5.279472827911377, "learning_rate": 9.193101926444834e-05, "loss": 1.2897, "num_input_tokens_seen": 18561856, "step": 1153 }, { "epoch": 0.0808356755715499, "grad_norm": 4.672069072723389, "learning_rate": 9.192402101576183e-05, "loss": 1.0298, "num_input_tokens_seen": 18577944, "step": 1154 }, { "epoch": 0.08090572381727915, "grad_norm": 3.65533447265625, "learning_rate": 9.19170227670753e-05, "loss": 0.933, "num_input_tokens_seen": 18593720, "step": 1155 }, { "epoch": 0.08097577206300839, "grad_norm": 4.212414741516113, "learning_rate": 9.19100245183888e-05, "loss": 1.0496, "num_input_tokens_seen": 18609864, "step": 1156 }, { "epoch": 0.08104582030873764, "grad_norm": 4.471503734588623, "learning_rate": 9.190302626970229e-05, "loss": 1.2261, "num_input_tokens_seen": 18626248, "step": 1157 }, { "epoch": 0.08111586855446688, "grad_norm": 4.952723979949951, "learning_rate": 9.189602802101577e-05, "loss": 1.056, "num_input_tokens_seen": 18642632, "step": 1158 }, { "epoch": 0.08118591680019613, "grad_norm": 3.921449661254883, "learning_rate": 9.188902977232926e-05, "loss": 1.1617, "num_input_tokens_seen": 18659016, "step": 1159 }, { "epoch": 0.08125596504592539, "grad_norm": 3.728752374649048, "learning_rate": 9.188203152364273e-05, "loss": 1.1217, "num_input_tokens_seen": 18675400, "step": 1160 }, { "epoch": 0.08132601329165463, "grad_norm": 3.8742613792419434, "learning_rate": 9.187503327495622e-05, "loss": 1.1538, "num_input_tokens_seen": 18691232, "step": 1161 }, { "epoch": 0.08139606153738388, "grad_norm": 3.827157735824585, "learning_rate": 9.186803502626971e-05, "loss": 1.1457, "num_input_tokens_seen": 18707616, "step": 1162 }, { "epoch": 0.08146610978311312, "grad_norm": 3.8507778644561768, "learning_rate": 9.18610367775832e-05, "loss": 1.0317, "num_input_tokens_seen": 18724000, "step": 1163 }, { "epoch": 0.08153615802884237, "grad_norm": 5.328095436096191, "learning_rate": 9.185403852889669e-05, "loss": 1.0921, "num_input_tokens_seen": 18740384, "step": 1164 }, { "epoch": 0.08160620627457162, "grad_norm": 4.8900322914123535, "learning_rate": 9.184704028021016e-05, "loss": 1.1308, "num_input_tokens_seen": 18756768, "step": 1165 }, { "epoch": 0.08167625452030086, "grad_norm": 3.810084104537964, "learning_rate": 9.184004203152365e-05, "loss": 1.1244, "num_input_tokens_seen": 18772632, "step": 1166 }, { "epoch": 0.08174630276603011, "grad_norm": 4.318419456481934, "learning_rate": 9.183304378283714e-05, "loss": 1.0372, "num_input_tokens_seen": 18788272, "step": 1167 }, { "epoch": 0.08181635101175935, "grad_norm": 4.093379020690918, "learning_rate": 9.182604553415061e-05, "loss": 1.18, "num_input_tokens_seen": 18803672, "step": 1168 }, { "epoch": 0.0818863992574886, "grad_norm": 4.630450248718262, "learning_rate": 9.18190472854641e-05, "loss": 1.1439, "num_input_tokens_seen": 18820056, "step": 1169 }, { "epoch": 0.08195644750321784, "grad_norm": 4.388457775115967, "learning_rate": 9.181204903677759e-05, "loss": 1.0971, "num_input_tokens_seen": 18836440, "step": 1170 }, { "epoch": 0.08202649574894709, "grad_norm": 3.6942262649536133, "learning_rate": 9.180505078809108e-05, "loss": 1.1594, "num_input_tokens_seen": 18852824, "step": 1171 }, { "epoch": 0.08209654399467634, "grad_norm": 3.937696933746338, "learning_rate": 9.179805253940455e-05, "loss": 1.1841, "num_input_tokens_seen": 18869208, "step": 1172 }, { "epoch": 0.08216659224040558, "grad_norm": 4.062703609466553, "learning_rate": 9.179105429071804e-05, "loss": 1.083, "num_input_tokens_seen": 18885320, "step": 1173 }, { "epoch": 0.08223664048613483, "grad_norm": 7.794081211090088, "learning_rate": 9.178405604203153e-05, "loss": 1.2287, "num_input_tokens_seen": 18900224, "step": 1174 }, { "epoch": 0.08230668873186407, "grad_norm": 4.429391860961914, "learning_rate": 9.1777057793345e-05, "loss": 1.0504, "num_input_tokens_seen": 18916456, "step": 1175 }, { "epoch": 0.08237673697759332, "grad_norm": 3.954869508743286, "learning_rate": 9.17700595446585e-05, "loss": 1.1558, "num_input_tokens_seen": 18932840, "step": 1176 }, { "epoch": 0.08244678522332256, "grad_norm": 5.555337429046631, "learning_rate": 9.176306129597198e-05, "loss": 1.3628, "num_input_tokens_seen": 18949224, "step": 1177 }, { "epoch": 0.08251683346905181, "grad_norm": 3.575295925140381, "learning_rate": 9.175606304728547e-05, "loss": 1.0651, "num_input_tokens_seen": 18965552, "step": 1178 }, { "epoch": 0.08258688171478105, "grad_norm": 5.927703380584717, "learning_rate": 9.174906479859896e-05, "loss": 1.0582, "num_input_tokens_seen": 18981496, "step": 1179 }, { "epoch": 0.0826569299605103, "grad_norm": 6.553986549377441, "learning_rate": 9.174206654991243e-05, "loss": 1.4058, "num_input_tokens_seen": 18996808, "step": 1180 }, { "epoch": 0.08272697820623955, "grad_norm": 4.315832138061523, "learning_rate": 9.173506830122592e-05, "loss": 1.1166, "num_input_tokens_seen": 19013192, "step": 1181 }, { "epoch": 0.08279702645196879, "grad_norm": 3.818033218383789, "learning_rate": 9.172807005253941e-05, "loss": 1.0744, "num_input_tokens_seen": 19029464, "step": 1182 }, { "epoch": 0.08286707469769804, "grad_norm": 3.4207711219787598, "learning_rate": 9.17210718038529e-05, "loss": 0.8952, "num_input_tokens_seen": 19045592, "step": 1183 }, { "epoch": 0.08293712294342728, "grad_norm": 4.3305864334106445, "learning_rate": 9.171407355516639e-05, "loss": 0.9617, "num_input_tokens_seen": 19061864, "step": 1184 }, { "epoch": 0.08300717118915653, "grad_norm": 5.365218162536621, "learning_rate": 9.170707530647986e-05, "loss": 1.1669, "num_input_tokens_seen": 19075448, "step": 1185 }, { "epoch": 0.08307721943488577, "grad_norm": 3.9939708709716797, "learning_rate": 9.170007705779335e-05, "loss": 1.1325, "num_input_tokens_seen": 19091832, "step": 1186 }, { "epoch": 0.08314726768061502, "grad_norm": 3.8088884353637695, "learning_rate": 9.169307880910683e-05, "loss": 1.0132, "num_input_tokens_seen": 19107920, "step": 1187 }, { "epoch": 0.08321731592634427, "grad_norm": 3.858799457550049, "learning_rate": 9.168608056042032e-05, "loss": 0.9805, "num_input_tokens_seen": 19123776, "step": 1188 }, { "epoch": 0.08328736417207351, "grad_norm": 4.042770862579346, "learning_rate": 9.16790823117338e-05, "loss": 1.1668, "num_input_tokens_seen": 19139752, "step": 1189 }, { "epoch": 0.08335741241780276, "grad_norm": 4.2054762840271, "learning_rate": 9.16720840630473e-05, "loss": 1.0702, "num_input_tokens_seen": 19156136, "step": 1190 }, { "epoch": 0.083427460663532, "grad_norm": 4.450238227844238, "learning_rate": 9.166508581436078e-05, "loss": 1.0751, "num_input_tokens_seen": 19172240, "step": 1191 }, { "epoch": 0.08349750890926125, "grad_norm": 4.126129627227783, "learning_rate": 9.165808756567426e-05, "loss": 0.9957, "num_input_tokens_seen": 19188624, "step": 1192 }, { "epoch": 0.0835675571549905, "grad_norm": 4.131893157958984, "learning_rate": 9.165108931698775e-05, "loss": 1.2004, "num_input_tokens_seen": 19205008, "step": 1193 }, { "epoch": 0.08363760540071974, "grad_norm": 4.25187873840332, "learning_rate": 9.164409106830123e-05, "loss": 1.3571, "num_input_tokens_seen": 19220856, "step": 1194 }, { "epoch": 0.083707653646449, "grad_norm": 3.842498302459717, "learning_rate": 9.163709281961471e-05, "loss": 1.0963, "num_input_tokens_seen": 19237208, "step": 1195 }, { "epoch": 0.08377770189217824, "grad_norm": 3.694279432296753, "learning_rate": 9.16300945709282e-05, "loss": 1.1177, "num_input_tokens_seen": 19253592, "step": 1196 }, { "epoch": 0.08384775013790749, "grad_norm": 4.382254123687744, "learning_rate": 9.162309632224169e-05, "loss": 1.0344, "num_input_tokens_seen": 19269976, "step": 1197 }, { "epoch": 0.08391779838363674, "grad_norm": 4.267289161682129, "learning_rate": 9.161609807355518e-05, "loss": 1.1211, "num_input_tokens_seen": 19286360, "step": 1198 }, { "epoch": 0.08398784662936598, "grad_norm": 5.554534435272217, "learning_rate": 9.160909982486865e-05, "loss": 0.9674, "num_input_tokens_seen": 19301800, "step": 1199 }, { "epoch": 0.08405789487509523, "grad_norm": 4.1479668617248535, "learning_rate": 9.160210157618214e-05, "loss": 1.2334, "num_input_tokens_seen": 19317392, "step": 1200 }, { "epoch": 0.08405789487509523, "eval_loss": 1.1600490808486938, "eval_runtime": 0.2015, "eval_samples_per_second": 4.962, "eval_steps_per_second": 4.962, "num_input_tokens_seen": 19317392, "step": 1200 }, { "epoch": 0.08412794312082447, "grad_norm": 4.1876349449157715, "learning_rate": 9.159510332749563e-05, "loss": 1.2036, "num_input_tokens_seen": 19333776, "step": 1201 }, { "epoch": 0.08419799136655372, "grad_norm": 4.031203746795654, "learning_rate": 9.15881050788091e-05, "loss": 1.2127, "num_input_tokens_seen": 19349616, "step": 1202 }, { "epoch": 0.08426803961228296, "grad_norm": 4.013350963592529, "learning_rate": 9.15811068301226e-05, "loss": 1.2147, "num_input_tokens_seen": 19366000, "step": 1203 }, { "epoch": 0.08433808785801221, "grad_norm": 4.509790897369385, "learning_rate": 9.157410858143608e-05, "loss": 1.3484, "num_input_tokens_seen": 19381904, "step": 1204 }, { "epoch": 0.08440813610374145, "grad_norm": 4.630336761474609, "learning_rate": 9.156711033274957e-05, "loss": 1.0246, "num_input_tokens_seen": 19398288, "step": 1205 }, { "epoch": 0.0844781843494707, "grad_norm": 3.819884777069092, "learning_rate": 9.156011208406304e-05, "loss": 1.1242, "num_input_tokens_seen": 19414248, "step": 1206 }, { "epoch": 0.08454823259519995, "grad_norm": 3.7933132648468018, "learning_rate": 9.155311383537653e-05, "loss": 1.0766, "num_input_tokens_seen": 19430632, "step": 1207 }, { "epoch": 0.08461828084092919, "grad_norm": 5.7384934425354, "learning_rate": 9.154611558669002e-05, "loss": 1.0691, "num_input_tokens_seen": 19446248, "step": 1208 }, { "epoch": 0.08468832908665844, "grad_norm": 3.9594175815582275, "learning_rate": 9.153911733800351e-05, "loss": 1.2029, "num_input_tokens_seen": 19462632, "step": 1209 }, { "epoch": 0.08475837733238768, "grad_norm": 3.8251891136169434, "learning_rate": 9.1532119089317e-05, "loss": 0.9994, "num_input_tokens_seen": 19479016, "step": 1210 }, { "epoch": 0.08482842557811693, "grad_norm": 3.9750332832336426, "learning_rate": 9.152512084063049e-05, "loss": 1.1737, "num_input_tokens_seen": 19495112, "step": 1211 }, { "epoch": 0.08489847382384617, "grad_norm": 3.986170530319214, "learning_rate": 9.151812259194396e-05, "loss": 1.1441, "num_input_tokens_seen": 19511216, "step": 1212 }, { "epoch": 0.08496852206957542, "grad_norm": 3.914065361022949, "learning_rate": 9.151112434325745e-05, "loss": 1.2233, "num_input_tokens_seen": 19527600, "step": 1213 }, { "epoch": 0.08503857031530467, "grad_norm": 4.328094482421875, "learning_rate": 9.150412609457093e-05, "loss": 1.2076, "num_input_tokens_seen": 19543984, "step": 1214 }, { "epoch": 0.08510861856103391, "grad_norm": 4.112467288970947, "learning_rate": 9.149712784588441e-05, "loss": 1.1732, "num_input_tokens_seen": 19560368, "step": 1215 }, { "epoch": 0.08517866680676316, "grad_norm": 4.680009365081787, "learning_rate": 9.14901295971979e-05, "loss": 0.985, "num_input_tokens_seen": 19575616, "step": 1216 }, { "epoch": 0.0852487150524924, "grad_norm": 4.4872660636901855, "learning_rate": 9.148313134851139e-05, "loss": 1.1799, "num_input_tokens_seen": 19592000, "step": 1217 }, { "epoch": 0.08531876329822165, "grad_norm": 3.7546637058258057, "learning_rate": 9.147613309982488e-05, "loss": 1.1989, "num_input_tokens_seen": 19608384, "step": 1218 }, { "epoch": 0.0853888115439509, "grad_norm": 5.590888500213623, "learning_rate": 9.146913485113835e-05, "loss": 1.1411, "num_input_tokens_seen": 19624768, "step": 1219 }, { "epoch": 0.08545885978968014, "grad_norm": 3.958021640777588, "learning_rate": 9.146213660245184e-05, "loss": 0.9309, "num_input_tokens_seen": 19641152, "step": 1220 }, { "epoch": 0.08552890803540938, "grad_norm": 3.7641196250915527, "learning_rate": 9.145513835376533e-05, "loss": 1.0299, "num_input_tokens_seen": 19657536, "step": 1221 }, { "epoch": 0.08559895628113863, "grad_norm": 4.395461559295654, "learning_rate": 9.14481401050788e-05, "loss": 1.1404, "num_input_tokens_seen": 19673712, "step": 1222 }, { "epoch": 0.08566900452686788, "grad_norm": 3.8162319660186768, "learning_rate": 9.144114185639231e-05, "loss": 1.1638, "num_input_tokens_seen": 19689336, "step": 1223 }, { "epoch": 0.08573905277259712, "grad_norm": 3.7025444507598877, "learning_rate": 9.143414360770578e-05, "loss": 0.9995, "num_input_tokens_seen": 19705464, "step": 1224 }, { "epoch": 0.08580910101832637, "grad_norm": 3.8621439933776855, "learning_rate": 9.142714535901927e-05, "loss": 1.1639, "num_input_tokens_seen": 19721848, "step": 1225 }, { "epoch": 0.08587914926405561, "grad_norm": 4.243250846862793, "learning_rate": 9.142014711033275e-05, "loss": 1.0104, "num_input_tokens_seen": 19738072, "step": 1226 }, { "epoch": 0.08594919750978486, "grad_norm": 4.05800724029541, "learning_rate": 9.141314886164624e-05, "loss": 1.0257, "num_input_tokens_seen": 19754456, "step": 1227 }, { "epoch": 0.0860192457555141, "grad_norm": 4.0894455909729, "learning_rate": 9.140615061295972e-05, "loss": 1.254, "num_input_tokens_seen": 19770840, "step": 1228 }, { "epoch": 0.08608929400124336, "grad_norm": 4.296894073486328, "learning_rate": 9.139915236427321e-05, "loss": 1.1298, "num_input_tokens_seen": 19786864, "step": 1229 }, { "epoch": 0.08615934224697261, "grad_norm": 4.0352888107299805, "learning_rate": 9.13921541155867e-05, "loss": 1.0611, "num_input_tokens_seen": 19801800, "step": 1230 }, { "epoch": 0.08622939049270185, "grad_norm": 4.087375640869141, "learning_rate": 9.138515586690018e-05, "loss": 0.9686, "num_input_tokens_seen": 19818184, "step": 1231 }, { "epoch": 0.0862994387384311, "grad_norm": 4.045078754425049, "learning_rate": 9.137815761821367e-05, "loss": 1.0915, "num_input_tokens_seen": 19833016, "step": 1232 }, { "epoch": 0.08636948698416035, "grad_norm": 4.399363040924072, "learning_rate": 9.137115936952714e-05, "loss": 1.1875, "num_input_tokens_seen": 19848912, "step": 1233 }, { "epoch": 0.08643953522988959, "grad_norm": 4.420406818389893, "learning_rate": 9.136416112084063e-05, "loss": 1.0534, "num_input_tokens_seen": 19865296, "step": 1234 }, { "epoch": 0.08650958347561884, "grad_norm": 4.131808280944824, "learning_rate": 9.135716287215412e-05, "loss": 1.1865, "num_input_tokens_seen": 19881376, "step": 1235 }, { "epoch": 0.08657963172134808, "grad_norm": 3.8256850242614746, "learning_rate": 9.13501646234676e-05, "loss": 1.0539, "num_input_tokens_seen": 19897704, "step": 1236 }, { "epoch": 0.08664967996707733, "grad_norm": 4.3497233390808105, "learning_rate": 9.13431663747811e-05, "loss": 1.191, "num_input_tokens_seen": 19914088, "step": 1237 }, { "epoch": 0.08671972821280657, "grad_norm": 4.18136739730835, "learning_rate": 9.133616812609458e-05, "loss": 1.0539, "num_input_tokens_seen": 19930128, "step": 1238 }, { "epoch": 0.08678977645853582, "grad_norm": 4.782970905303955, "learning_rate": 9.132916987740806e-05, "loss": 1.1992, "num_input_tokens_seen": 19946512, "step": 1239 }, { "epoch": 0.08685982470426507, "grad_norm": 4.16589879989624, "learning_rate": 9.132217162872155e-05, "loss": 1.1463, "num_input_tokens_seen": 19962488, "step": 1240 }, { "epoch": 0.08692987294999431, "grad_norm": 3.73541522026062, "learning_rate": 9.131517338003502e-05, "loss": 1.0272, "num_input_tokens_seen": 19978584, "step": 1241 }, { "epoch": 0.08699992119572356, "grad_norm": 4.225815773010254, "learning_rate": 9.130817513134851e-05, "loss": 1.177, "num_input_tokens_seen": 19994816, "step": 1242 }, { "epoch": 0.0870699694414528, "grad_norm": 7.807470321655273, "learning_rate": 9.1301176882662e-05, "loss": 1.1635, "num_input_tokens_seen": 20010576, "step": 1243 }, { "epoch": 0.08714001768718205, "grad_norm": 4.818174839019775, "learning_rate": 9.129417863397549e-05, "loss": 1.1892, "num_input_tokens_seen": 20025712, "step": 1244 }, { "epoch": 0.0872100659329113, "grad_norm": 3.8367979526519775, "learning_rate": 9.128718038528898e-05, "loss": 1.0096, "num_input_tokens_seen": 20041904, "step": 1245 }, { "epoch": 0.08728011417864054, "grad_norm": 3.9912586212158203, "learning_rate": 9.128018213660245e-05, "loss": 1.097, "num_input_tokens_seen": 20058288, "step": 1246 }, { "epoch": 0.08735016242436978, "grad_norm": 4.842557907104492, "learning_rate": 9.127318388791594e-05, "loss": 1.2012, "num_input_tokens_seen": 20074672, "step": 1247 }, { "epoch": 0.08742021067009903, "grad_norm": 3.816938877105713, "learning_rate": 9.126618563922943e-05, "loss": 1.1683, "num_input_tokens_seen": 20090664, "step": 1248 }, { "epoch": 0.08749025891582828, "grad_norm": 3.712480306625366, "learning_rate": 9.125918739054292e-05, "loss": 1.1978, "num_input_tokens_seen": 20107048, "step": 1249 }, { "epoch": 0.08756030716155752, "grad_norm": 4.185492515563965, "learning_rate": 9.12521891418564e-05, "loss": 1.2042, "num_input_tokens_seen": 20123432, "step": 1250 }, { "epoch": 0.08763035540728677, "grad_norm": 5.510714530944824, "learning_rate": 9.124519089316988e-05, "loss": 0.9757, "num_input_tokens_seen": 20139112, "step": 1251 }, { "epoch": 0.08770040365301601, "grad_norm": 3.9170289039611816, "learning_rate": 9.123819264448337e-05, "loss": 1.0213, "num_input_tokens_seen": 20155496, "step": 1252 }, { "epoch": 0.08777045189874526, "grad_norm": 3.738008975982666, "learning_rate": 9.123119439579684e-05, "loss": 0.9446, "num_input_tokens_seen": 20171760, "step": 1253 }, { "epoch": 0.0878405001444745, "grad_norm": 4.845873832702637, "learning_rate": 9.122419614711033e-05, "loss": 1.2135, "num_input_tokens_seen": 20188056, "step": 1254 }, { "epoch": 0.08791054839020375, "grad_norm": 4.166906356811523, "learning_rate": 9.121719789842382e-05, "loss": 1.1558, "num_input_tokens_seen": 20204440, "step": 1255 }, { "epoch": 0.087980596635933, "grad_norm": 4.039194107055664, "learning_rate": 9.121019964973731e-05, "loss": 1.0297, "num_input_tokens_seen": 20220824, "step": 1256 }, { "epoch": 0.08805064488166224, "grad_norm": 3.545482635498047, "learning_rate": 9.12032014010508e-05, "loss": 0.9757, "num_input_tokens_seen": 20236888, "step": 1257 }, { "epoch": 0.08812069312739149, "grad_norm": 3.82114839553833, "learning_rate": 9.119620315236427e-05, "loss": 1.1637, "num_input_tokens_seen": 20253272, "step": 1258 }, { "epoch": 0.08819074137312073, "grad_norm": 4.770678997039795, "learning_rate": 9.118920490367776e-05, "loss": 1.1421, "num_input_tokens_seen": 20269656, "step": 1259 }, { "epoch": 0.08826078961884998, "grad_norm": 4.4319539070129395, "learning_rate": 9.118220665499124e-05, "loss": 1.1565, "num_input_tokens_seen": 20285456, "step": 1260 }, { "epoch": 0.08833083786457922, "grad_norm": 4.0923357009887695, "learning_rate": 9.117520840630473e-05, "loss": 1.2328, "num_input_tokens_seen": 20301232, "step": 1261 }, { "epoch": 0.08840088611030847, "grad_norm": 5.8347344398498535, "learning_rate": 9.116821015761821e-05, "loss": 0.8824, "num_input_tokens_seen": 20317224, "step": 1262 }, { "epoch": 0.08847093435603771, "grad_norm": 4.525367259979248, "learning_rate": 9.11612119089317e-05, "loss": 1.1554, "num_input_tokens_seen": 20332616, "step": 1263 }, { "epoch": 0.08854098260176697, "grad_norm": 3.9754436016082764, "learning_rate": 9.115421366024519e-05, "loss": 1.0423, "num_input_tokens_seen": 20348336, "step": 1264 }, { "epoch": 0.08861103084749622, "grad_norm": 4.40745735168457, "learning_rate": 9.114721541155868e-05, "loss": 1.0485, "num_input_tokens_seen": 20364312, "step": 1265 }, { "epoch": 0.08868107909322547, "grad_norm": 7.126221179962158, "learning_rate": 9.114021716287216e-05, "loss": 1.2035, "num_input_tokens_seen": 20380696, "step": 1266 }, { "epoch": 0.08875112733895471, "grad_norm": 4.306386947631836, "learning_rate": 9.113321891418564e-05, "loss": 1.0399, "num_input_tokens_seen": 20397080, "step": 1267 }, { "epoch": 0.08882117558468396, "grad_norm": 3.566943407058716, "learning_rate": 9.112622066549912e-05, "loss": 1.0463, "num_input_tokens_seen": 20413464, "step": 1268 }, { "epoch": 0.0888912238304132, "grad_norm": 3.975228786468506, "learning_rate": 9.111922241681262e-05, "loss": 1.2576, "num_input_tokens_seen": 20429848, "step": 1269 }, { "epoch": 0.08896127207614245, "grad_norm": 4.928854465484619, "learning_rate": 9.11122241681261e-05, "loss": 1.1555, "num_input_tokens_seen": 20446192, "step": 1270 }, { "epoch": 0.0890313203218717, "grad_norm": 4.288821697235107, "learning_rate": 9.110522591943958e-05, "loss": 1.2559, "num_input_tokens_seen": 20462576, "step": 1271 }, { "epoch": 0.08910136856760094, "grad_norm": 3.9346396923065186, "learning_rate": 9.109822767075307e-05, "loss": 1.1479, "num_input_tokens_seen": 20478520, "step": 1272 }, { "epoch": 0.08917141681333018, "grad_norm": 3.7976620197296143, "learning_rate": 9.109122942206655e-05, "loss": 0.9903, "num_input_tokens_seen": 20494408, "step": 1273 }, { "epoch": 0.08924146505905943, "grad_norm": 5.373577117919922, "learning_rate": 9.108423117338004e-05, "loss": 0.8863, "num_input_tokens_seen": 20510792, "step": 1274 }, { "epoch": 0.08931151330478868, "grad_norm": 4.248324394226074, "learning_rate": 9.107723292469353e-05, "loss": 1.3492, "num_input_tokens_seen": 20527064, "step": 1275 }, { "epoch": 0.08938156155051792, "grad_norm": 4.453672885894775, "learning_rate": 9.107023467600701e-05, "loss": 0.9763, "num_input_tokens_seen": 20543448, "step": 1276 }, { "epoch": 0.08945160979624717, "grad_norm": 4.8721184730529785, "learning_rate": 9.10632364273205e-05, "loss": 0.9455, "num_input_tokens_seen": 20559832, "step": 1277 }, { "epoch": 0.08952165804197641, "grad_norm": 5.0173540115356445, "learning_rate": 9.105623817863398e-05, "loss": 1.0303, "num_input_tokens_seen": 20576216, "step": 1278 }, { "epoch": 0.08959170628770566, "grad_norm": 5.00100040435791, "learning_rate": 9.104923992994747e-05, "loss": 1.0393, "num_input_tokens_seen": 20592600, "step": 1279 }, { "epoch": 0.0896617545334349, "grad_norm": 4.271099090576172, "learning_rate": 9.104224168126094e-05, "loss": 1.2307, "num_input_tokens_seen": 20608632, "step": 1280 }, { "epoch": 0.08973180277916415, "grad_norm": 4.246976852416992, "learning_rate": 9.103524343257443e-05, "loss": 1.1405, "num_input_tokens_seen": 20625016, "step": 1281 }, { "epoch": 0.0898018510248934, "grad_norm": 5.033923149108887, "learning_rate": 9.102824518388792e-05, "loss": 1.0849, "num_input_tokens_seen": 20641400, "step": 1282 }, { "epoch": 0.08987189927062264, "grad_norm": 4.4118571281433105, "learning_rate": 9.102124693520141e-05, "loss": 1.118, "num_input_tokens_seen": 20657448, "step": 1283 }, { "epoch": 0.08994194751635189, "grad_norm": 4.150144577026367, "learning_rate": 9.10142486865149e-05, "loss": 1.0676, "num_input_tokens_seen": 20673080, "step": 1284 }, { "epoch": 0.09001199576208113, "grad_norm": 3.767683744430542, "learning_rate": 9.100725043782837e-05, "loss": 0.8968, "num_input_tokens_seen": 20689464, "step": 1285 }, { "epoch": 0.09008204400781038, "grad_norm": 4.816582202911377, "learning_rate": 9.100025218914186e-05, "loss": 1.0039, "num_input_tokens_seen": 20703896, "step": 1286 }, { "epoch": 0.09015209225353962, "grad_norm": 3.8913414478302, "learning_rate": 9.099325394045533e-05, "loss": 1.0077, "num_input_tokens_seen": 20720280, "step": 1287 }, { "epoch": 0.09022214049926887, "grad_norm": 4.305298328399658, "learning_rate": 9.098625569176882e-05, "loss": 1.1555, "num_input_tokens_seen": 20735944, "step": 1288 }, { "epoch": 0.09029218874499811, "grad_norm": 3.3120992183685303, "learning_rate": 9.097925744308233e-05, "loss": 0.8591, "num_input_tokens_seen": 20752128, "step": 1289 }, { "epoch": 0.09036223699072736, "grad_norm": 4.705013751983643, "learning_rate": 9.09722591943958e-05, "loss": 1.4579, "num_input_tokens_seen": 20768512, "step": 1290 }, { "epoch": 0.0904322852364566, "grad_norm": 5.08630895614624, "learning_rate": 9.096526094570929e-05, "loss": 1.1049, "num_input_tokens_seen": 20783976, "step": 1291 }, { "epoch": 0.09050233348218585, "grad_norm": 3.634686231613159, "learning_rate": 9.095826269702278e-05, "loss": 1.0344, "num_input_tokens_seen": 20800360, "step": 1292 }, { "epoch": 0.0905723817279151, "grad_norm": 4.220744609832764, "learning_rate": 9.095126444833625e-05, "loss": 1.1843, "num_input_tokens_seen": 20816744, "step": 1293 }, { "epoch": 0.09064242997364434, "grad_norm": 4.724472522735596, "learning_rate": 9.094426619964974e-05, "loss": 1.1365, "num_input_tokens_seen": 20833128, "step": 1294 }, { "epoch": 0.09071247821937359, "grad_norm": 3.9398090839385986, "learning_rate": 9.093726795096323e-05, "loss": 1.0703, "num_input_tokens_seen": 20849448, "step": 1295 }, { "epoch": 0.09078252646510283, "grad_norm": 4.260062217712402, "learning_rate": 9.093026970227672e-05, "loss": 1.0968, "num_input_tokens_seen": 20865832, "step": 1296 }, { "epoch": 0.09085257471083208, "grad_norm": 4.383310317993164, "learning_rate": 9.09232714535902e-05, "loss": 1.2542, "num_input_tokens_seen": 20881288, "step": 1297 }, { "epoch": 0.09092262295656132, "grad_norm": 4.479433059692383, "learning_rate": 9.091627320490368e-05, "loss": 0.9533, "num_input_tokens_seen": 20897328, "step": 1298 }, { "epoch": 0.09099267120229058, "grad_norm": 4.911858081817627, "learning_rate": 9.090927495621717e-05, "loss": 1.3399, "num_input_tokens_seen": 20913712, "step": 1299 }, { "epoch": 0.09106271944801983, "grad_norm": 4.015485763549805, "learning_rate": 9.090227670753065e-05, "loss": 1.1156, "num_input_tokens_seen": 20929984, "step": 1300 }, { "epoch": 0.09113276769374908, "grad_norm": 3.8690338134765625, "learning_rate": 9.089527845884413e-05, "loss": 1.0634, "num_input_tokens_seen": 20946368, "step": 1301 }, { "epoch": 0.09120281593947832, "grad_norm": 5.142012596130371, "learning_rate": 9.088828021015762e-05, "loss": 1.0579, "num_input_tokens_seen": 20962752, "step": 1302 }, { "epoch": 0.09127286418520757, "grad_norm": 3.954049587249756, "learning_rate": 9.088128196147111e-05, "loss": 1.0862, "num_input_tokens_seen": 20979136, "step": 1303 }, { "epoch": 0.09134291243093681, "grad_norm": 4.13312292098999, "learning_rate": 9.08742837127846e-05, "loss": 1.0548, "num_input_tokens_seen": 20995520, "step": 1304 }, { "epoch": 0.09141296067666606, "grad_norm": 4.24699592590332, "learning_rate": 9.086728546409808e-05, "loss": 1.0126, "num_input_tokens_seen": 21011904, "step": 1305 }, { "epoch": 0.0914830089223953, "grad_norm": 4.847048759460449, "learning_rate": 9.086028721541156e-05, "loss": 0.9973, "num_input_tokens_seen": 21027664, "step": 1306 }, { "epoch": 0.09155305716812455, "grad_norm": 4.573661804199219, "learning_rate": 9.085328896672504e-05, "loss": 1.005, "num_input_tokens_seen": 21044048, "step": 1307 }, { "epoch": 0.0916231054138538, "grad_norm": 4.13530158996582, "learning_rate": 9.084629071803853e-05, "loss": 1.1033, "num_input_tokens_seen": 21060432, "step": 1308 }, { "epoch": 0.09169315365958304, "grad_norm": 4.017937183380127, "learning_rate": 9.083929246935203e-05, "loss": 1.1971, "num_input_tokens_seen": 21076816, "step": 1309 }, { "epoch": 0.09176320190531229, "grad_norm": 5.928586483001709, "learning_rate": 9.08322942206655e-05, "loss": 1.0547, "num_input_tokens_seen": 21093200, "step": 1310 }, { "epoch": 0.09183325015104153, "grad_norm": 4.2442169189453125, "learning_rate": 9.082529597197899e-05, "loss": 1.2794, "num_input_tokens_seen": 21109256, "step": 1311 }, { "epoch": 0.09190329839677078, "grad_norm": 4.891444683074951, "learning_rate": 9.081829772329247e-05, "loss": 1.1833, "num_input_tokens_seen": 21124848, "step": 1312 }, { "epoch": 0.09197334664250002, "grad_norm": 4.323850154876709, "learning_rate": 9.081129947460596e-05, "loss": 1.1683, "num_input_tokens_seen": 21141176, "step": 1313 }, { "epoch": 0.09204339488822927, "grad_norm": 4.239765644073486, "learning_rate": 9.080430122591943e-05, "loss": 1.1073, "num_input_tokens_seen": 21157240, "step": 1314 }, { "epoch": 0.09211344313395851, "grad_norm": 4.12881326675415, "learning_rate": 9.079730297723293e-05, "loss": 1.2522, "num_input_tokens_seen": 21173624, "step": 1315 }, { "epoch": 0.09218349137968776, "grad_norm": 4.238161087036133, "learning_rate": 9.079030472854642e-05, "loss": 1.1828, "num_input_tokens_seen": 21190008, "step": 1316 }, { "epoch": 0.092253539625417, "grad_norm": 4.124176502227783, "learning_rate": 9.07833064798599e-05, "loss": 1.1388, "num_input_tokens_seen": 21206392, "step": 1317 }, { "epoch": 0.09232358787114625, "grad_norm": 3.772136926651001, "learning_rate": 9.077630823117339e-05, "loss": 1.068, "num_input_tokens_seen": 21222776, "step": 1318 }, { "epoch": 0.0923936361168755, "grad_norm": 4.628321170806885, "learning_rate": 9.076930998248687e-05, "loss": 1.2363, "num_input_tokens_seen": 21239160, "step": 1319 }, { "epoch": 0.09246368436260474, "grad_norm": 5.3034348487854, "learning_rate": 9.076231173380035e-05, "loss": 1.0638, "num_input_tokens_seen": 21255544, "step": 1320 }, { "epoch": 0.09253373260833399, "grad_norm": 3.6543760299682617, "learning_rate": 9.075531348511384e-05, "loss": 1.0071, "num_input_tokens_seen": 21271928, "step": 1321 }, { "epoch": 0.09260378085406323, "grad_norm": 4.1335062980651855, "learning_rate": 9.074831523642733e-05, "loss": 1.084, "num_input_tokens_seen": 21288312, "step": 1322 }, { "epoch": 0.09267382909979248, "grad_norm": 3.6392204761505127, "learning_rate": 9.074131698774082e-05, "loss": 1.1146, "num_input_tokens_seen": 21304696, "step": 1323 }, { "epoch": 0.09274387734552172, "grad_norm": 4.035269737243652, "learning_rate": 9.073431873905429e-05, "loss": 0.9578, "num_input_tokens_seen": 21321080, "step": 1324 }, { "epoch": 0.09281392559125097, "grad_norm": 4.650269508361816, "learning_rate": 9.072732049036778e-05, "loss": 1.0242, "num_input_tokens_seen": 21337464, "step": 1325 }, { "epoch": 0.09288397383698022, "grad_norm": 5.850543022155762, "learning_rate": 9.072032224168127e-05, "loss": 1.1196, "num_input_tokens_seen": 21352968, "step": 1326 }, { "epoch": 0.09295402208270946, "grad_norm": 4.177901744842529, "learning_rate": 9.071332399299474e-05, "loss": 1.1351, "num_input_tokens_seen": 21368968, "step": 1327 }, { "epoch": 0.09302407032843871, "grad_norm": 4.582173824310303, "learning_rate": 9.070632574430823e-05, "loss": 0.9115, "num_input_tokens_seen": 21385352, "step": 1328 }, { "epoch": 0.09309411857416795, "grad_norm": 4.7911787033081055, "learning_rate": 9.069932749562173e-05, "loss": 1.0413, "num_input_tokens_seen": 21401144, "step": 1329 }, { "epoch": 0.0931641668198972, "grad_norm": 4.058457374572754, "learning_rate": 9.069232924693521e-05, "loss": 1.0611, "num_input_tokens_seen": 21416640, "step": 1330 }, { "epoch": 0.09323421506562644, "grad_norm": 4.972208499908447, "learning_rate": 9.06853309982487e-05, "loss": 1.016, "num_input_tokens_seen": 21433024, "step": 1331 }, { "epoch": 0.09330426331135569, "grad_norm": 4.0875091552734375, "learning_rate": 9.067833274956217e-05, "loss": 1.089, "num_input_tokens_seen": 21448888, "step": 1332 }, { "epoch": 0.09337431155708495, "grad_norm": 3.923112154006958, "learning_rate": 9.067133450087566e-05, "loss": 0.9824, "num_input_tokens_seen": 21465272, "step": 1333 }, { "epoch": 0.0934443598028142, "grad_norm": 4.067697525024414, "learning_rate": 9.066433625218914e-05, "loss": 1.0492, "num_input_tokens_seen": 21481656, "step": 1334 }, { "epoch": 0.09351440804854344, "grad_norm": 4.185417652130127, "learning_rate": 9.065733800350264e-05, "loss": 1.1073, "num_input_tokens_seen": 21498040, "step": 1335 }, { "epoch": 0.09358445629427269, "grad_norm": 7.31542444229126, "learning_rate": 9.065033975481613e-05, "loss": 1.4322, "num_input_tokens_seen": 21514088, "step": 1336 }, { "epoch": 0.09365450454000193, "grad_norm": 4.754745006561279, "learning_rate": 9.06433415061296e-05, "loss": 0.9953, "num_input_tokens_seen": 21530472, "step": 1337 }, { "epoch": 0.09372455278573118, "grad_norm": 5.81265926361084, "learning_rate": 9.063634325744309e-05, "loss": 1.1434, "num_input_tokens_seen": 21545728, "step": 1338 }, { "epoch": 0.09379460103146042, "grad_norm": 5.586238861083984, "learning_rate": 9.062934500875657e-05, "loss": 0.9818, "num_input_tokens_seen": 21562112, "step": 1339 }, { "epoch": 0.09386464927718967, "grad_norm": 4.096534729003906, "learning_rate": 9.062234676007005e-05, "loss": 1.1856, "num_input_tokens_seen": 21578496, "step": 1340 }, { "epoch": 0.09393469752291891, "grad_norm": 4.913814544677734, "learning_rate": 9.061534851138354e-05, "loss": 1.041, "num_input_tokens_seen": 21594792, "step": 1341 }, { "epoch": 0.09400474576864816, "grad_norm": 3.8853912353515625, "learning_rate": 9.060835026269703e-05, "loss": 1.1651, "num_input_tokens_seen": 21611176, "step": 1342 }, { "epoch": 0.0940747940143774, "grad_norm": 4.187959671020508, "learning_rate": 9.060135201401052e-05, "loss": 1.1757, "num_input_tokens_seen": 21627560, "step": 1343 }, { "epoch": 0.09414484226010665, "grad_norm": 4.128627777099609, "learning_rate": 9.0594353765324e-05, "loss": 0.9243, "num_input_tokens_seen": 21643576, "step": 1344 }, { "epoch": 0.0942148905058359, "grad_norm": 4.7016825675964355, "learning_rate": 9.058735551663748e-05, "loss": 1.2425, "num_input_tokens_seen": 21658600, "step": 1345 }, { "epoch": 0.09428493875156514, "grad_norm": 3.970548391342163, "learning_rate": 9.058035726795097e-05, "loss": 1.0495, "num_input_tokens_seen": 21674264, "step": 1346 }, { "epoch": 0.09435498699729439, "grad_norm": 3.812196731567383, "learning_rate": 9.057335901926445e-05, "loss": 0.9558, "num_input_tokens_seen": 21690112, "step": 1347 }, { "epoch": 0.09442503524302363, "grad_norm": 3.6845176219940186, "learning_rate": 9.056636077057794e-05, "loss": 0.9758, "num_input_tokens_seen": 21705744, "step": 1348 }, { "epoch": 0.09449508348875288, "grad_norm": 4.119202136993408, "learning_rate": 9.055936252189142e-05, "loss": 1.0948, "num_input_tokens_seen": 21721776, "step": 1349 }, { "epoch": 0.09456513173448212, "grad_norm": 4.176985740661621, "learning_rate": 9.055236427320491e-05, "loss": 0.9475, "num_input_tokens_seen": 21737912, "step": 1350 }, { "epoch": 0.09463517998021137, "grad_norm": 4.057264804840088, "learning_rate": 9.054536602451839e-05, "loss": 1.1746, "num_input_tokens_seen": 21754296, "step": 1351 }, { "epoch": 0.09470522822594062, "grad_norm": 4.5631914138793945, "learning_rate": 9.053836777583188e-05, "loss": 1.0894, "num_input_tokens_seen": 21770680, "step": 1352 }, { "epoch": 0.09477527647166986, "grad_norm": 4.854849815368652, "learning_rate": 9.053136952714536e-05, "loss": 1.0686, "num_input_tokens_seen": 21787064, "step": 1353 }, { "epoch": 0.09484532471739911, "grad_norm": 5.326946258544922, "learning_rate": 9.052437127845884e-05, "loss": 0.872, "num_input_tokens_seen": 21803448, "step": 1354 }, { "epoch": 0.09491537296312835, "grad_norm": 4.283742904663086, "learning_rate": 9.051737302977234e-05, "loss": 1.2683, "num_input_tokens_seen": 21819832, "step": 1355 }, { "epoch": 0.0949854212088576, "grad_norm": 4.165935039520264, "learning_rate": 9.051037478108582e-05, "loss": 0.977, "num_input_tokens_seen": 21836216, "step": 1356 }, { "epoch": 0.09505546945458684, "grad_norm": 4.502480983734131, "learning_rate": 9.05033765323993e-05, "loss": 1.2854, "num_input_tokens_seen": 21852600, "step": 1357 }, { "epoch": 0.09512551770031609, "grad_norm": 4.185445308685303, "learning_rate": 9.04963782837128e-05, "loss": 1.2225, "num_input_tokens_seen": 21868984, "step": 1358 }, { "epoch": 0.09519556594604534, "grad_norm": 7.288909435272217, "learning_rate": 9.048938003502627e-05, "loss": 1.154, "num_input_tokens_seen": 21884648, "step": 1359 }, { "epoch": 0.09526561419177458, "grad_norm": 4.038896560668945, "learning_rate": 9.048238178633976e-05, "loss": 1.1437, "num_input_tokens_seen": 21900704, "step": 1360 }, { "epoch": 0.09533566243750383, "grad_norm": 4.216241836547852, "learning_rate": 9.047538353765325e-05, "loss": 1.1379, "num_input_tokens_seen": 21916520, "step": 1361 }, { "epoch": 0.09540571068323307, "grad_norm": 4.2549147605896, "learning_rate": 9.046838528896673e-05, "loss": 1.2578, "num_input_tokens_seen": 21932904, "step": 1362 }, { "epoch": 0.09547575892896232, "grad_norm": 3.6919445991516113, "learning_rate": 9.046138704028022e-05, "loss": 0.9876, "num_input_tokens_seen": 21949288, "step": 1363 }, { "epoch": 0.09554580717469156, "grad_norm": 5.467876434326172, "learning_rate": 9.04543887915937e-05, "loss": 0.9735, "num_input_tokens_seen": 21965672, "step": 1364 }, { "epoch": 0.09561585542042081, "grad_norm": 4.036736011505127, "learning_rate": 9.044739054290719e-05, "loss": 1.0712, "num_input_tokens_seen": 21980792, "step": 1365 }, { "epoch": 0.09568590366615005, "grad_norm": 4.083346843719482, "learning_rate": 9.044039229422066e-05, "loss": 1.0883, "num_input_tokens_seen": 21996888, "step": 1366 }, { "epoch": 0.0957559519118793, "grad_norm": 3.553262948989868, "learning_rate": 9.043339404553415e-05, "loss": 1.0116, "num_input_tokens_seen": 22013160, "step": 1367 }, { "epoch": 0.09582600015760856, "grad_norm": 4.787721633911133, "learning_rate": 9.042639579684764e-05, "loss": 1.1444, "num_input_tokens_seen": 22029544, "step": 1368 }, { "epoch": 0.0958960484033378, "grad_norm": 3.8053700923919678, "learning_rate": 9.041939754816113e-05, "loss": 1.1654, "num_input_tokens_seen": 22045888, "step": 1369 }, { "epoch": 0.09596609664906705, "grad_norm": 3.7679660320281982, "learning_rate": 9.041239929947462e-05, "loss": 1.1753, "num_input_tokens_seen": 22062272, "step": 1370 }, { "epoch": 0.0960361448947963, "grad_norm": 5.086554527282715, "learning_rate": 9.040540105078809e-05, "loss": 0.9579, "num_input_tokens_seen": 22078080, "step": 1371 }, { "epoch": 0.09610619314052554, "grad_norm": 4.255527496337891, "learning_rate": 9.039840280210158e-05, "loss": 1.0953, "num_input_tokens_seen": 22093808, "step": 1372 }, { "epoch": 0.09617624138625479, "grad_norm": 6.081700325012207, "learning_rate": 9.039140455341507e-05, "loss": 1.0363, "num_input_tokens_seen": 22110192, "step": 1373 }, { "epoch": 0.09624628963198403, "grad_norm": 4.376565456390381, "learning_rate": 9.038440630472854e-05, "loss": 1.1737, "num_input_tokens_seen": 22126576, "step": 1374 }, { "epoch": 0.09631633787771328, "grad_norm": 4.051114559173584, "learning_rate": 9.037740805604205e-05, "loss": 1.1921, "num_input_tokens_seen": 22142768, "step": 1375 }, { "epoch": 0.09638638612344252, "grad_norm": 4.46164083480835, "learning_rate": 9.037040980735552e-05, "loss": 1.1541, "num_input_tokens_seen": 22158600, "step": 1376 }, { "epoch": 0.09645643436917177, "grad_norm": 4.242503643035889, "learning_rate": 9.036341155866901e-05, "loss": 1.1314, "num_input_tokens_seen": 22174984, "step": 1377 }, { "epoch": 0.09652648261490102, "grad_norm": 3.6338908672332764, "learning_rate": 9.035641330998248e-05, "loss": 0.9257, "num_input_tokens_seen": 22190880, "step": 1378 }, { "epoch": 0.09659653086063026, "grad_norm": 4.73402738571167, "learning_rate": 9.034941506129597e-05, "loss": 1.1981, "num_input_tokens_seen": 22206632, "step": 1379 }, { "epoch": 0.09666657910635951, "grad_norm": 4.450289726257324, "learning_rate": 9.034241681260946e-05, "loss": 1.0851, "num_input_tokens_seen": 22222896, "step": 1380 }, { "epoch": 0.09673662735208875, "grad_norm": 5.578179359436035, "learning_rate": 9.033541856392295e-05, "loss": 1.2856, "num_input_tokens_seen": 22238280, "step": 1381 }, { "epoch": 0.096806675597818, "grad_norm": 3.8745546340942383, "learning_rate": 9.032842031523644e-05, "loss": 0.9841, "num_input_tokens_seen": 22254664, "step": 1382 }, { "epoch": 0.09687672384354724, "grad_norm": 5.7268548011779785, "learning_rate": 9.032142206654991e-05, "loss": 1.2024, "num_input_tokens_seen": 22270000, "step": 1383 }, { "epoch": 0.09694677208927649, "grad_norm": 4.380898952484131, "learning_rate": 9.03144238178634e-05, "loss": 1.0589, "num_input_tokens_seen": 22286384, "step": 1384 }, { "epoch": 0.09701682033500574, "grad_norm": 5.762500762939453, "learning_rate": 9.030742556917689e-05, "loss": 1.2061, "num_input_tokens_seen": 22302272, "step": 1385 }, { "epoch": 0.09708686858073498, "grad_norm": 3.739488363265991, "learning_rate": 9.030042732049037e-05, "loss": 0.9867, "num_input_tokens_seen": 22318656, "step": 1386 }, { "epoch": 0.09715691682646423, "grad_norm": 4.584897994995117, "learning_rate": 9.029342907180385e-05, "loss": 1.1934, "num_input_tokens_seen": 22334704, "step": 1387 }, { "epoch": 0.09722696507219347, "grad_norm": 4.161139488220215, "learning_rate": 9.028643082311734e-05, "loss": 1.1638, "num_input_tokens_seen": 22349800, "step": 1388 }, { "epoch": 0.09729701331792272, "grad_norm": 4.115293979644775, "learning_rate": 9.027943257443083e-05, "loss": 1.0181, "num_input_tokens_seen": 22366184, "step": 1389 }, { "epoch": 0.09736706156365196, "grad_norm": 3.7355988025665283, "learning_rate": 9.027243432574432e-05, "loss": 1.1182, "num_input_tokens_seen": 22382568, "step": 1390 }, { "epoch": 0.09743710980938121, "grad_norm": 4.15507173538208, "learning_rate": 9.02654360770578e-05, "loss": 1.0272, "num_input_tokens_seen": 22398480, "step": 1391 }, { "epoch": 0.09750715805511045, "grad_norm": 3.770918607711792, "learning_rate": 9.025843782837128e-05, "loss": 0.9834, "num_input_tokens_seen": 22414864, "step": 1392 }, { "epoch": 0.0975772063008397, "grad_norm": 4.214321136474609, "learning_rate": 9.025143957968476e-05, "loss": 1.1738, "num_input_tokens_seen": 22429752, "step": 1393 }, { "epoch": 0.09764725454656895, "grad_norm": 3.9854986667633057, "learning_rate": 9.024444133099825e-05, "loss": 1.2832, "num_input_tokens_seen": 22446136, "step": 1394 }, { "epoch": 0.09771730279229819, "grad_norm": 4.996057510375977, "learning_rate": 9.023744308231174e-05, "loss": 1.1691, "num_input_tokens_seen": 22461160, "step": 1395 }, { "epoch": 0.09778735103802744, "grad_norm": 3.682765007019043, "learning_rate": 9.023044483362523e-05, "loss": 0.9548, "num_input_tokens_seen": 22477336, "step": 1396 }, { "epoch": 0.09785739928375668, "grad_norm": 4.367272853851318, "learning_rate": 9.022344658493871e-05, "loss": 1.0512, "num_input_tokens_seen": 22492952, "step": 1397 }, { "epoch": 0.09792744752948593, "grad_norm": 3.9716336727142334, "learning_rate": 9.021644833625219e-05, "loss": 1.103, "num_input_tokens_seen": 22509336, "step": 1398 }, { "epoch": 0.09799749577521517, "grad_norm": 4.043631553649902, "learning_rate": 9.020945008756568e-05, "loss": 1.1439, "num_input_tokens_seen": 22525568, "step": 1399 }, { "epoch": 0.09806754402094442, "grad_norm": 4.343166351318359, "learning_rate": 9.020245183887917e-05, "loss": 1.1948, "num_input_tokens_seen": 22541328, "step": 1400 }, { "epoch": 0.09806754402094442, "eval_loss": 1.1561514139175415, "eval_runtime": 0.1977, "eval_samples_per_second": 5.058, "eval_steps_per_second": 5.058, "num_input_tokens_seen": 22541328, "step": 1400 }, { "epoch": 0.09813759226667366, "grad_norm": 4.709417819976807, "learning_rate": 9.019545359019265e-05, "loss": 1.1398, "num_input_tokens_seen": 22557304, "step": 1401 }, { "epoch": 0.09820764051240291, "grad_norm": 7.022638320922852, "learning_rate": 9.018845534150614e-05, "loss": 1.0342, "num_input_tokens_seen": 22573688, "step": 1402 }, { "epoch": 0.09827768875813217, "grad_norm": 3.7976694107055664, "learning_rate": 9.018145709281962e-05, "loss": 0.9829, "num_input_tokens_seen": 22589848, "step": 1403 }, { "epoch": 0.09834773700386142, "grad_norm": 3.70877742767334, "learning_rate": 9.01744588441331e-05, "loss": 0.9707, "num_input_tokens_seen": 22606232, "step": 1404 }, { "epoch": 0.09841778524959066, "grad_norm": 7.724960803985596, "learning_rate": 9.016746059544658e-05, "loss": 0.9602, "num_input_tokens_seen": 22621912, "step": 1405 }, { "epoch": 0.09848783349531991, "grad_norm": 3.9619522094726562, "learning_rate": 9.016046234676007e-05, "loss": 0.998, "num_input_tokens_seen": 22638296, "step": 1406 }, { "epoch": 0.09855788174104915, "grad_norm": 3.8303041458129883, "learning_rate": 9.015346409807356e-05, "loss": 1.0762, "num_input_tokens_seen": 22654496, "step": 1407 }, { "epoch": 0.0986279299867784, "grad_norm": 4.029507637023926, "learning_rate": 9.014646584938705e-05, "loss": 1.2072, "num_input_tokens_seen": 22670544, "step": 1408 }, { "epoch": 0.09869797823250764, "grad_norm": 3.8487346172332764, "learning_rate": 9.013946760070054e-05, "loss": 1.1834, "num_input_tokens_seen": 22686592, "step": 1409 }, { "epoch": 0.09876802647823689, "grad_norm": 3.700751543045044, "learning_rate": 9.013246935201401e-05, "loss": 0.8698, "num_input_tokens_seen": 22702976, "step": 1410 }, { "epoch": 0.09883807472396614, "grad_norm": 3.686884641647339, "learning_rate": 9.01254711033275e-05, "loss": 0.9591, "num_input_tokens_seen": 22719360, "step": 1411 }, { "epoch": 0.09890812296969538, "grad_norm": 4.176409721374512, "learning_rate": 9.011847285464099e-05, "loss": 1.1578, "num_input_tokens_seen": 22735744, "step": 1412 }, { "epoch": 0.09897817121542463, "grad_norm": 4.331852912902832, "learning_rate": 9.011147460595446e-05, "loss": 0.9769, "num_input_tokens_seen": 22752128, "step": 1413 }, { "epoch": 0.09904821946115387, "grad_norm": 3.8534255027770996, "learning_rate": 9.010447635726795e-05, "loss": 1.1536, "num_input_tokens_seen": 22768512, "step": 1414 }, { "epoch": 0.09911826770688312, "grad_norm": 4.066548824310303, "learning_rate": 9.009747810858144e-05, "loss": 1.1199, "num_input_tokens_seen": 22784760, "step": 1415 }, { "epoch": 0.09918831595261236, "grad_norm": 4.076517581939697, "learning_rate": 9.009047985989493e-05, "loss": 1.1132, "num_input_tokens_seen": 22801144, "step": 1416 }, { "epoch": 0.09925836419834161, "grad_norm": 3.8858346939086914, "learning_rate": 9.008348161120842e-05, "loss": 0.9509, "num_input_tokens_seen": 22817320, "step": 1417 }, { "epoch": 0.09932841244407085, "grad_norm": 6.4605584144592285, "learning_rate": 9.007648336252189e-05, "loss": 1.2701, "num_input_tokens_seen": 22833704, "step": 1418 }, { "epoch": 0.0993984606898001, "grad_norm": 4.157481670379639, "learning_rate": 9.006948511383538e-05, "loss": 1.0169, "num_input_tokens_seen": 22850088, "step": 1419 }, { "epoch": 0.09946850893552935, "grad_norm": 3.725755214691162, "learning_rate": 9.006248686514886e-05, "loss": 1.0183, "num_input_tokens_seen": 22866472, "step": 1420 }, { "epoch": 0.09953855718125859, "grad_norm": 4.012838363647461, "learning_rate": 9.005548861646236e-05, "loss": 0.8425, "num_input_tokens_seen": 22882856, "step": 1421 }, { "epoch": 0.09960860542698784, "grad_norm": 3.8754239082336426, "learning_rate": 9.004849036777583e-05, "loss": 1.1375, "num_input_tokens_seen": 22899240, "step": 1422 }, { "epoch": 0.09967865367271708, "grad_norm": 3.90873384475708, "learning_rate": 9.004149211908932e-05, "loss": 1.0574, "num_input_tokens_seen": 22915160, "step": 1423 }, { "epoch": 0.09974870191844633, "grad_norm": 5.698948860168457, "learning_rate": 9.003449387040281e-05, "loss": 1.1338, "num_input_tokens_seen": 22930592, "step": 1424 }, { "epoch": 0.09981875016417557, "grad_norm": 4.103662014007568, "learning_rate": 9.002749562171629e-05, "loss": 1.2384, "num_input_tokens_seen": 22946976, "step": 1425 }, { "epoch": 0.09988879840990482, "grad_norm": 4.404048442840576, "learning_rate": 9.002049737302977e-05, "loss": 1.3855, "num_input_tokens_seen": 22963360, "step": 1426 }, { "epoch": 0.09995884665563406, "grad_norm": 4.043710708618164, "learning_rate": 9.001349912434326e-05, "loss": 1.2713, "num_input_tokens_seen": 22979544, "step": 1427 }, { "epoch": 0.10002889490136331, "grad_norm": 4.169802188873291, "learning_rate": 9.000650087565675e-05, "loss": 1.0777, "num_input_tokens_seen": 22995072, "step": 1428 }, { "epoch": 0.10009894314709256, "grad_norm": 4.010350227355957, "learning_rate": 8.999950262697024e-05, "loss": 1.1245, "num_input_tokens_seen": 23010904, "step": 1429 }, { "epoch": 0.1001689913928218, "grad_norm": 4.496591567993164, "learning_rate": 8.999250437828372e-05, "loss": 1.3372, "num_input_tokens_seen": 23027288, "step": 1430 }, { "epoch": 0.10023903963855105, "grad_norm": 4.2428765296936035, "learning_rate": 8.99855061295972e-05, "loss": 1.0258, "num_input_tokens_seen": 23043352, "step": 1431 }, { "epoch": 0.10030908788428029, "grad_norm": 4.083342552185059, "learning_rate": 8.997850788091068e-05, "loss": 1.227, "num_input_tokens_seen": 23059736, "step": 1432 }, { "epoch": 0.10037913613000954, "grad_norm": 3.860734462738037, "learning_rate": 8.997150963222417e-05, "loss": 1.0791, "num_input_tokens_seen": 23075400, "step": 1433 }, { "epoch": 0.10044918437573878, "grad_norm": 3.985151767730713, "learning_rate": 8.996451138353766e-05, "loss": 1.0486, "num_input_tokens_seen": 23091704, "step": 1434 }, { "epoch": 0.10051923262146803, "grad_norm": 4.039731502532959, "learning_rate": 8.995751313485114e-05, "loss": 0.9793, "num_input_tokens_seen": 23108088, "step": 1435 }, { "epoch": 0.10058928086719728, "grad_norm": 6.1780619621276855, "learning_rate": 8.995051488616463e-05, "loss": 1.0645, "num_input_tokens_seen": 23123128, "step": 1436 }, { "epoch": 0.10065932911292653, "grad_norm": 4.5783586502075195, "learning_rate": 8.994351663747811e-05, "loss": 1.1634, "num_input_tokens_seen": 23139168, "step": 1437 }, { "epoch": 0.10072937735865578, "grad_norm": 3.889927864074707, "learning_rate": 8.99365183887916e-05, "loss": 0.97, "num_input_tokens_seen": 23154952, "step": 1438 }, { "epoch": 0.10079942560438503, "grad_norm": 3.927945852279663, "learning_rate": 8.992952014010509e-05, "loss": 1.2428, "num_input_tokens_seen": 23170688, "step": 1439 }, { "epoch": 0.10086947385011427, "grad_norm": 3.8991434574127197, "learning_rate": 8.992252189141856e-05, "loss": 0.9519, "num_input_tokens_seen": 23186432, "step": 1440 }, { "epoch": 0.10093952209584352, "grad_norm": 3.6479310989379883, "learning_rate": 8.991552364273206e-05, "loss": 0.9656, "num_input_tokens_seen": 23202816, "step": 1441 }, { "epoch": 0.10100957034157276, "grad_norm": 4.637960910797119, "learning_rate": 8.990852539404554e-05, "loss": 1.2853, "num_input_tokens_seen": 23218304, "step": 1442 }, { "epoch": 0.10107961858730201, "grad_norm": 4.000091552734375, "learning_rate": 8.990152714535903e-05, "loss": 1.0421, "num_input_tokens_seen": 23234688, "step": 1443 }, { "epoch": 0.10114966683303125, "grad_norm": 4.959738731384277, "learning_rate": 8.989452889667251e-05, "loss": 1.0904, "num_input_tokens_seen": 23250656, "step": 1444 }, { "epoch": 0.1012197150787605, "grad_norm": 3.9251675605773926, "learning_rate": 8.988753064798599e-05, "loss": 0.9219, "num_input_tokens_seen": 23266984, "step": 1445 }, { "epoch": 0.10128976332448975, "grad_norm": 4.28665828704834, "learning_rate": 8.988053239929948e-05, "loss": 1.1465, "num_input_tokens_seen": 23283368, "step": 1446 }, { "epoch": 0.10135981157021899, "grad_norm": 4.421731472015381, "learning_rate": 8.987353415061297e-05, "loss": 1.1098, "num_input_tokens_seen": 23298728, "step": 1447 }, { "epoch": 0.10142985981594824, "grad_norm": 5.080065727233887, "learning_rate": 8.986653590192646e-05, "loss": 1.1172, "num_input_tokens_seen": 23315112, "step": 1448 }, { "epoch": 0.10149990806167748, "grad_norm": 5.618803977966309, "learning_rate": 8.985953765323993e-05, "loss": 0.9551, "num_input_tokens_seen": 23329864, "step": 1449 }, { "epoch": 0.10156995630740673, "grad_norm": 3.756836414337158, "learning_rate": 8.985253940455342e-05, "loss": 1.0981, "num_input_tokens_seen": 23345672, "step": 1450 }, { "epoch": 0.10164000455313597, "grad_norm": 4.461424827575684, "learning_rate": 8.984554115586691e-05, "loss": 1.1914, "num_input_tokens_seen": 23362056, "step": 1451 }, { "epoch": 0.10171005279886522, "grad_norm": 5.267919063568115, "learning_rate": 8.983854290718038e-05, "loss": 1.1928, "num_input_tokens_seen": 23378440, "step": 1452 }, { "epoch": 0.10178010104459446, "grad_norm": 5.513551235198975, "learning_rate": 8.983154465849387e-05, "loss": 1.2405, "num_input_tokens_seen": 23394824, "step": 1453 }, { "epoch": 0.10185014929032371, "grad_norm": 4.46366548538208, "learning_rate": 8.982454640980736e-05, "loss": 1.1436, "num_input_tokens_seen": 23410568, "step": 1454 }, { "epoch": 0.10192019753605296, "grad_norm": 5.066692352294922, "learning_rate": 8.981754816112085e-05, "loss": 1.1389, "num_input_tokens_seen": 23426952, "step": 1455 }, { "epoch": 0.1019902457817822, "grad_norm": 3.980743169784546, "learning_rate": 8.981054991243434e-05, "loss": 1.0623, "num_input_tokens_seen": 23443256, "step": 1456 }, { "epoch": 0.10206029402751145, "grad_norm": 4.088611125946045, "learning_rate": 8.980355166374781e-05, "loss": 1.0388, "num_input_tokens_seen": 23459640, "step": 1457 }, { "epoch": 0.10213034227324069, "grad_norm": 3.9585626125335693, "learning_rate": 8.97965534150613e-05, "loss": 1.2051, "num_input_tokens_seen": 23475176, "step": 1458 }, { "epoch": 0.10220039051896994, "grad_norm": 3.7923290729522705, "learning_rate": 8.978955516637478e-05, "loss": 1.0001, "num_input_tokens_seen": 23490704, "step": 1459 }, { "epoch": 0.10227043876469918, "grad_norm": 3.9089629650115967, "learning_rate": 8.978255691768826e-05, "loss": 0.9786, "num_input_tokens_seen": 23506168, "step": 1460 }, { "epoch": 0.10234048701042843, "grad_norm": 6.2259039878845215, "learning_rate": 8.977555866900175e-05, "loss": 1.2854, "num_input_tokens_seen": 23522552, "step": 1461 }, { "epoch": 0.10241053525615768, "grad_norm": 4.071867942810059, "learning_rate": 8.976856042031524e-05, "loss": 1.0724, "num_input_tokens_seen": 23538936, "step": 1462 }, { "epoch": 0.10248058350188692, "grad_norm": 4.587897777557373, "learning_rate": 8.976156217162873e-05, "loss": 1.1307, "num_input_tokens_seen": 23554536, "step": 1463 }, { "epoch": 0.10255063174761617, "grad_norm": 3.944937229156494, "learning_rate": 8.97545639229422e-05, "loss": 1.1503, "num_input_tokens_seen": 23570888, "step": 1464 }, { "epoch": 0.10262067999334541, "grad_norm": 3.7418766021728516, "learning_rate": 8.97475656742557e-05, "loss": 1.0414, "num_input_tokens_seen": 23587272, "step": 1465 }, { "epoch": 0.10269072823907466, "grad_norm": 3.9055676460266113, "learning_rate": 8.974056742556918e-05, "loss": 1.2284, "num_input_tokens_seen": 23603640, "step": 1466 }, { "epoch": 0.1027607764848039, "grad_norm": 3.9338066577911377, "learning_rate": 8.973356917688267e-05, "loss": 1.2389, "num_input_tokens_seen": 23620024, "step": 1467 }, { "epoch": 0.10283082473053315, "grad_norm": 4.024602890014648, "learning_rate": 8.972657092819616e-05, "loss": 1.1358, "num_input_tokens_seen": 23636408, "step": 1468 }, { "epoch": 0.1029008729762624, "grad_norm": 4.09812068939209, "learning_rate": 8.971957267950963e-05, "loss": 1.0734, "num_input_tokens_seen": 23652480, "step": 1469 }, { "epoch": 0.10297092122199164, "grad_norm": 4.7382025718688965, "learning_rate": 8.971257443082312e-05, "loss": 1.4112, "num_input_tokens_seen": 23668424, "step": 1470 }, { "epoch": 0.10304096946772089, "grad_norm": 4.518669605255127, "learning_rate": 8.970557618213661e-05, "loss": 1.3466, "num_input_tokens_seen": 23684808, "step": 1471 }, { "epoch": 0.10311101771345015, "grad_norm": 4.023036003112793, "learning_rate": 8.969857793345009e-05, "loss": 1.0246, "num_input_tokens_seen": 23701192, "step": 1472 }, { "epoch": 0.10318106595917939, "grad_norm": 4.6244215965271, "learning_rate": 8.969157968476358e-05, "loss": 1.2391, "num_input_tokens_seen": 23717576, "step": 1473 }, { "epoch": 0.10325111420490864, "grad_norm": 4.517683506011963, "learning_rate": 8.968458143607706e-05, "loss": 1.3872, "num_input_tokens_seen": 23733960, "step": 1474 }, { "epoch": 0.10332116245063788, "grad_norm": 4.048764705657959, "learning_rate": 8.967758318739055e-05, "loss": 1.0453, "num_input_tokens_seen": 23750344, "step": 1475 }, { "epoch": 0.10339121069636713, "grad_norm": 4.248376369476318, "learning_rate": 8.967058493870403e-05, "loss": 1.176, "num_input_tokens_seen": 23766160, "step": 1476 }, { "epoch": 0.10346125894209637, "grad_norm": 3.780548095703125, "learning_rate": 8.966358669001752e-05, "loss": 0.9048, "num_input_tokens_seen": 23782544, "step": 1477 }, { "epoch": 0.10353130718782562, "grad_norm": 4.26375675201416, "learning_rate": 8.9656588441331e-05, "loss": 0.8651, "num_input_tokens_seen": 23798928, "step": 1478 }, { "epoch": 0.10360135543355486, "grad_norm": 3.9202687740325928, "learning_rate": 8.964959019264448e-05, "loss": 1.1058, "num_input_tokens_seen": 23815312, "step": 1479 }, { "epoch": 0.10367140367928411, "grad_norm": 3.983797788619995, "learning_rate": 8.964259194395797e-05, "loss": 1.0778, "num_input_tokens_seen": 23831696, "step": 1480 }, { "epoch": 0.10374145192501336, "grad_norm": 4.471195220947266, "learning_rate": 8.963559369527146e-05, "loss": 1.1858, "num_input_tokens_seen": 23847768, "step": 1481 }, { "epoch": 0.1038115001707426, "grad_norm": 3.560317039489746, "learning_rate": 8.962859544658495e-05, "loss": 1.0205, "num_input_tokens_seen": 23864152, "step": 1482 }, { "epoch": 0.10388154841647185, "grad_norm": 3.8699846267700195, "learning_rate": 8.962159719789843e-05, "loss": 1.1438, "num_input_tokens_seen": 23880536, "step": 1483 }, { "epoch": 0.10395159666220109, "grad_norm": 4.547862529754639, "learning_rate": 8.961459894921191e-05, "loss": 1.0303, "num_input_tokens_seen": 23896704, "step": 1484 }, { "epoch": 0.10402164490793034, "grad_norm": 4.669456481933594, "learning_rate": 8.96076007005254e-05, "loss": 1.1994, "num_input_tokens_seen": 23913088, "step": 1485 }, { "epoch": 0.10409169315365958, "grad_norm": 4.346285343170166, "learning_rate": 8.960060245183887e-05, "loss": 1.2677, "num_input_tokens_seen": 23929472, "step": 1486 }, { "epoch": 0.10416174139938883, "grad_norm": 6.5028581619262695, "learning_rate": 8.959360420315236e-05, "loss": 0.989, "num_input_tokens_seen": 23945216, "step": 1487 }, { "epoch": 0.10423178964511808, "grad_norm": 3.935488224029541, "learning_rate": 8.958660595446586e-05, "loss": 1.2657, "num_input_tokens_seen": 23961600, "step": 1488 }, { "epoch": 0.10430183789084732, "grad_norm": 3.772397518157959, "learning_rate": 8.957960770577934e-05, "loss": 1.1038, "num_input_tokens_seen": 23977984, "step": 1489 }, { "epoch": 0.10437188613657657, "grad_norm": 4.508286476135254, "learning_rate": 8.957260945709283e-05, "loss": 1.2694, "num_input_tokens_seen": 23993752, "step": 1490 }, { "epoch": 0.10444193438230581, "grad_norm": 4.667380332946777, "learning_rate": 8.95656112084063e-05, "loss": 1.2837, "num_input_tokens_seen": 24009832, "step": 1491 }, { "epoch": 0.10451198262803506, "grad_norm": 7.675503730773926, "learning_rate": 8.955861295971979e-05, "loss": 1.121, "num_input_tokens_seen": 24025784, "step": 1492 }, { "epoch": 0.1045820308737643, "grad_norm": 4.427548408508301, "learning_rate": 8.955161471103328e-05, "loss": 0.835, "num_input_tokens_seen": 24041568, "step": 1493 }, { "epoch": 0.10465207911949355, "grad_norm": 3.9065396785736084, "learning_rate": 8.954461646234677e-05, "loss": 1.1322, "num_input_tokens_seen": 24057952, "step": 1494 }, { "epoch": 0.1047221273652228, "grad_norm": 4.052605628967285, "learning_rate": 8.953761821366026e-05, "loss": 1.1133, "num_input_tokens_seen": 24074336, "step": 1495 }, { "epoch": 0.10479217561095204, "grad_norm": 3.758476734161377, "learning_rate": 8.953061996497373e-05, "loss": 1.1302, "num_input_tokens_seen": 24090720, "step": 1496 }, { "epoch": 0.10486222385668129, "grad_norm": 4.4470014572143555, "learning_rate": 8.952362171628722e-05, "loss": 1.0969, "num_input_tokens_seen": 24107024, "step": 1497 }, { "epoch": 0.10493227210241053, "grad_norm": 4.222001075744629, "learning_rate": 8.951662346760071e-05, "loss": 1.147, "num_input_tokens_seen": 24123408, "step": 1498 }, { "epoch": 0.10500232034813978, "grad_norm": 4.72997522354126, "learning_rate": 8.950962521891418e-05, "loss": 1.1086, "num_input_tokens_seen": 24137672, "step": 1499 }, { "epoch": 0.10507236859386902, "grad_norm": 4.342312812805176, "learning_rate": 8.950262697022767e-05, "loss": 1.2044, "num_input_tokens_seen": 24153248, "step": 1500 }, { "epoch": 0.10514241683959827, "grad_norm": 4.723706245422363, "learning_rate": 8.949562872154116e-05, "loss": 1.1075, "num_input_tokens_seen": 24169240, "step": 1501 }, { "epoch": 0.10521246508532751, "grad_norm": 4.244345188140869, "learning_rate": 8.948863047285465e-05, "loss": 1.1839, "num_input_tokens_seen": 24184608, "step": 1502 }, { "epoch": 0.10528251333105676, "grad_norm": 3.6271615028381348, "learning_rate": 8.948163222416812e-05, "loss": 1.0755, "num_input_tokens_seen": 24200992, "step": 1503 }, { "epoch": 0.105352561576786, "grad_norm": 3.858696937561035, "learning_rate": 8.947463397548161e-05, "loss": 1.0598, "num_input_tokens_seen": 24217376, "step": 1504 }, { "epoch": 0.10542260982251525, "grad_norm": 7.14077091217041, "learning_rate": 8.94676357267951e-05, "loss": 1.0362, "num_input_tokens_seen": 24232368, "step": 1505 }, { "epoch": 0.10549265806824451, "grad_norm": 4.203495979309082, "learning_rate": 8.946063747810858e-05, "loss": 1.2491, "num_input_tokens_seen": 24248520, "step": 1506 }, { "epoch": 0.10556270631397376, "grad_norm": 4.344188213348389, "learning_rate": 8.945363922942207e-05, "loss": 0.905, "num_input_tokens_seen": 24264824, "step": 1507 }, { "epoch": 0.105632754559703, "grad_norm": 6.156280517578125, "learning_rate": 8.944664098073557e-05, "loss": 1.3046, "num_input_tokens_seen": 24281208, "step": 1508 }, { "epoch": 0.10570280280543225, "grad_norm": 4.687212944030762, "learning_rate": 8.943964273204904e-05, "loss": 1.1898, "num_input_tokens_seen": 24297592, "step": 1509 }, { "epoch": 0.10577285105116149, "grad_norm": 3.9128546714782715, "learning_rate": 8.943264448336253e-05, "loss": 1.0506, "num_input_tokens_seen": 24313976, "step": 1510 }, { "epoch": 0.10584289929689074, "grad_norm": 5.766979694366455, "learning_rate": 8.9425646234676e-05, "loss": 1.119, "num_input_tokens_seen": 24330296, "step": 1511 }, { "epoch": 0.10591294754261998, "grad_norm": 3.9610238075256348, "learning_rate": 8.94186479859895e-05, "loss": 1.279, "num_input_tokens_seen": 24346680, "step": 1512 }, { "epoch": 0.10598299578834923, "grad_norm": 4.262688636779785, "learning_rate": 8.941164973730297e-05, "loss": 1.3217, "num_input_tokens_seen": 24362408, "step": 1513 }, { "epoch": 0.10605304403407848, "grad_norm": 5.190121173858643, "learning_rate": 8.940465148861647e-05, "loss": 1.0615, "num_input_tokens_seen": 24378248, "step": 1514 }, { "epoch": 0.10612309227980772, "grad_norm": 4.5859479904174805, "learning_rate": 8.939765323992996e-05, "loss": 1.1377, "num_input_tokens_seen": 24394632, "step": 1515 }, { "epoch": 0.10619314052553697, "grad_norm": 4.021294593811035, "learning_rate": 8.939065499124344e-05, "loss": 0.9913, "num_input_tokens_seen": 24411016, "step": 1516 }, { "epoch": 0.10626318877126621, "grad_norm": 4.296265602111816, "learning_rate": 8.938365674255692e-05, "loss": 1.1753, "num_input_tokens_seen": 24426792, "step": 1517 }, { "epoch": 0.10633323701699546, "grad_norm": 3.4397289752960205, "learning_rate": 8.93766584938704e-05, "loss": 0.8159, "num_input_tokens_seen": 24443176, "step": 1518 }, { "epoch": 0.1064032852627247, "grad_norm": 4.009952545166016, "learning_rate": 8.936966024518389e-05, "loss": 1.0728, "num_input_tokens_seen": 24459384, "step": 1519 }, { "epoch": 0.10647333350845395, "grad_norm": 4.786280632019043, "learning_rate": 8.936266199649738e-05, "loss": 1.1303, "num_input_tokens_seen": 24474904, "step": 1520 }, { "epoch": 0.1065433817541832, "grad_norm": 3.869297981262207, "learning_rate": 8.935566374781087e-05, "loss": 1.0829, "num_input_tokens_seen": 24490456, "step": 1521 }, { "epoch": 0.10661342999991244, "grad_norm": 3.995553731918335, "learning_rate": 8.934866549912435e-05, "loss": 1.0813, "num_input_tokens_seen": 24506840, "step": 1522 }, { "epoch": 0.10668347824564169, "grad_norm": 4.195018291473389, "learning_rate": 8.934166725043783e-05, "loss": 1.0585, "num_input_tokens_seen": 24522440, "step": 1523 }, { "epoch": 0.10675352649137093, "grad_norm": 4.0432515144348145, "learning_rate": 8.933466900175132e-05, "loss": 1.0757, "num_input_tokens_seen": 24538824, "step": 1524 }, { "epoch": 0.10682357473710018, "grad_norm": 5.120638847351074, "learning_rate": 8.93276707530648e-05, "loss": 1.1328, "num_input_tokens_seen": 24555208, "step": 1525 }, { "epoch": 0.10689362298282942, "grad_norm": 3.925096035003662, "learning_rate": 8.932067250437828e-05, "loss": 1.1569, "num_input_tokens_seen": 24571544, "step": 1526 }, { "epoch": 0.10696367122855867, "grad_norm": 3.930328130722046, "learning_rate": 8.931367425569177e-05, "loss": 0.9385, "num_input_tokens_seen": 24587736, "step": 1527 }, { "epoch": 0.10703371947428791, "grad_norm": 3.7056055068969727, "learning_rate": 8.930667600700526e-05, "loss": 0.8675, "num_input_tokens_seen": 24604120, "step": 1528 }, { "epoch": 0.10710376772001716, "grad_norm": 5.945568561553955, "learning_rate": 8.929967775831875e-05, "loss": 1.0395, "num_input_tokens_seen": 24620504, "step": 1529 }, { "epoch": 0.1071738159657464, "grad_norm": 3.7765939235687256, "learning_rate": 8.929267950963222e-05, "loss": 0.8796, "num_input_tokens_seen": 24635440, "step": 1530 }, { "epoch": 0.10724386421147565, "grad_norm": 4.229284286499023, "learning_rate": 8.928568126094571e-05, "loss": 1.0941, "num_input_tokens_seen": 24651824, "step": 1531 }, { "epoch": 0.1073139124572049, "grad_norm": 4.198834419250488, "learning_rate": 8.92786830122592e-05, "loss": 1.118, "num_input_tokens_seen": 24668208, "step": 1532 }, { "epoch": 0.10738396070293414, "grad_norm": 8.091620445251465, "learning_rate": 8.927168476357267e-05, "loss": 1.1515, "num_input_tokens_seen": 24684248, "step": 1533 }, { "epoch": 0.10745400894866339, "grad_norm": 4.091879844665527, "learning_rate": 8.926468651488618e-05, "loss": 1.1283, "num_input_tokens_seen": 24700632, "step": 1534 }, { "epoch": 0.10752405719439263, "grad_norm": 3.90326189994812, "learning_rate": 8.925768826619966e-05, "loss": 1.047, "num_input_tokens_seen": 24717016, "step": 1535 }, { "epoch": 0.10759410544012188, "grad_norm": 4.097111225128174, "learning_rate": 8.925069001751314e-05, "loss": 1.1623, "num_input_tokens_seen": 24732776, "step": 1536 }, { "epoch": 0.10766415368585112, "grad_norm": 3.5537095069885254, "learning_rate": 8.924369176882663e-05, "loss": 0.989, "num_input_tokens_seen": 24749064, "step": 1537 }, { "epoch": 0.10773420193158037, "grad_norm": 4.3086256980896, "learning_rate": 8.92366935201401e-05, "loss": 1.0864, "num_input_tokens_seen": 24765448, "step": 1538 }, { "epoch": 0.10780425017730962, "grad_norm": 4.177425861358643, "learning_rate": 8.922969527145359e-05, "loss": 1.0652, "num_input_tokens_seen": 24780816, "step": 1539 }, { "epoch": 0.10787429842303886, "grad_norm": 3.6013338565826416, "learning_rate": 8.922269702276708e-05, "loss": 1.1045, "num_input_tokens_seen": 24796600, "step": 1540 }, { "epoch": 0.10794434666876812, "grad_norm": 4.05686092376709, "learning_rate": 8.921569877408057e-05, "loss": 1.1408, "num_input_tokens_seen": 24812984, "step": 1541 }, { "epoch": 0.10801439491449737, "grad_norm": 4.245424747467041, "learning_rate": 8.920870052539406e-05, "loss": 1.2634, "num_input_tokens_seen": 24829368, "step": 1542 }, { "epoch": 0.10808444316022661, "grad_norm": 3.9563350677490234, "learning_rate": 8.920170227670753e-05, "loss": 1.1015, "num_input_tokens_seen": 24845752, "step": 1543 }, { "epoch": 0.10815449140595586, "grad_norm": 4.209373474121094, "learning_rate": 8.919470402802102e-05, "loss": 1.2394, "num_input_tokens_seen": 24862136, "step": 1544 }, { "epoch": 0.1082245396516851, "grad_norm": 3.6590163707733154, "learning_rate": 8.91877057793345e-05, "loss": 1.0168, "num_input_tokens_seen": 24878520, "step": 1545 }, { "epoch": 0.10829458789741435, "grad_norm": 3.937568187713623, "learning_rate": 8.918070753064799e-05, "loss": 1.0999, "num_input_tokens_seen": 24894696, "step": 1546 }, { "epoch": 0.1083646361431436, "grad_norm": 3.948453426361084, "learning_rate": 8.917370928196147e-05, "loss": 1.0565, "num_input_tokens_seen": 24910208, "step": 1547 }, { "epoch": 0.10843468438887284, "grad_norm": 3.61549711227417, "learning_rate": 8.916671103327496e-05, "loss": 1.0294, "num_input_tokens_seen": 24926592, "step": 1548 }, { "epoch": 0.10850473263460209, "grad_norm": 4.091664791107178, "learning_rate": 8.915971278458845e-05, "loss": 1.0596, "num_input_tokens_seen": 24942976, "step": 1549 }, { "epoch": 0.10857478088033133, "grad_norm": 5.494830131530762, "learning_rate": 8.915271453590193e-05, "loss": 1.1564, "num_input_tokens_seen": 24957984, "step": 1550 }, { "epoch": 0.10864482912606058, "grad_norm": 4.546476364135742, "learning_rate": 8.914571628721541e-05, "loss": 1.0753, "num_input_tokens_seen": 24974368, "step": 1551 }, { "epoch": 0.10871487737178982, "grad_norm": 3.775996446609497, "learning_rate": 8.91387180385289e-05, "loss": 1.11, "num_input_tokens_seen": 24990200, "step": 1552 }, { "epoch": 0.10878492561751907, "grad_norm": 3.9989728927612305, "learning_rate": 8.913171978984238e-05, "loss": 1.0121, "num_input_tokens_seen": 25006584, "step": 1553 }, { "epoch": 0.10885497386324831, "grad_norm": 4.417224884033203, "learning_rate": 8.912472154115588e-05, "loss": 1.1891, "num_input_tokens_seen": 25022464, "step": 1554 }, { "epoch": 0.10892502210897756, "grad_norm": 4.604903697967529, "learning_rate": 8.911772329246936e-05, "loss": 0.9414, "num_input_tokens_seen": 25038848, "step": 1555 }, { "epoch": 0.1089950703547068, "grad_norm": 4.823176860809326, "learning_rate": 8.911072504378284e-05, "loss": 1.1259, "num_input_tokens_seen": 25053776, "step": 1556 }, { "epoch": 0.10906511860043605, "grad_norm": 3.6778531074523926, "learning_rate": 8.910372679509632e-05, "loss": 0.9995, "num_input_tokens_seen": 25069872, "step": 1557 }, { "epoch": 0.1091351668461653, "grad_norm": 4.344213485717773, "learning_rate": 8.909672854640981e-05, "loss": 1.1984, "num_input_tokens_seen": 25086256, "step": 1558 }, { "epoch": 0.10920521509189454, "grad_norm": 4.592464923858643, "learning_rate": 8.90897302977233e-05, "loss": 1.502, "num_input_tokens_seen": 25102640, "step": 1559 }, { "epoch": 0.10927526333762379, "grad_norm": 4.103248119354248, "learning_rate": 8.908273204903678e-05, "loss": 0.9454, "num_input_tokens_seen": 25118328, "step": 1560 }, { "epoch": 0.10934531158335303, "grad_norm": 4.637456893920898, "learning_rate": 8.907573380035027e-05, "loss": 1.3611, "num_input_tokens_seen": 25134712, "step": 1561 }, { "epoch": 0.10941535982908228, "grad_norm": 4.4709930419921875, "learning_rate": 8.906873555166376e-05, "loss": 1.1147, "num_input_tokens_seen": 25149304, "step": 1562 }, { "epoch": 0.10948540807481152, "grad_norm": 4.154660701751709, "learning_rate": 8.906173730297724e-05, "loss": 1.2855, "num_input_tokens_seen": 25165360, "step": 1563 }, { "epoch": 0.10955545632054077, "grad_norm": 4.1212334632873535, "learning_rate": 8.905473905429073e-05, "loss": 1.2015, "num_input_tokens_seen": 25181744, "step": 1564 }, { "epoch": 0.10962550456627002, "grad_norm": 3.8060882091522217, "learning_rate": 8.90477408056042e-05, "loss": 1.0333, "num_input_tokens_seen": 25197800, "step": 1565 }, { "epoch": 0.10969555281199926, "grad_norm": 3.4948956966400146, "learning_rate": 8.904074255691769e-05, "loss": 0.941, "num_input_tokens_seen": 25214008, "step": 1566 }, { "epoch": 0.1097656010577285, "grad_norm": 4.181606292724609, "learning_rate": 8.903374430823118e-05, "loss": 1.1185, "num_input_tokens_seen": 25229496, "step": 1567 }, { "epoch": 0.10983564930345775, "grad_norm": 4.206098556518555, "learning_rate": 8.902674605954467e-05, "loss": 1.0363, "num_input_tokens_seen": 25244864, "step": 1568 }, { "epoch": 0.109905697549187, "grad_norm": 3.797475576400757, "learning_rate": 8.901974781085815e-05, "loss": 1.0443, "num_input_tokens_seen": 25261248, "step": 1569 }, { "epoch": 0.10997574579491624, "grad_norm": 4.131814479827881, "learning_rate": 8.901274956217163e-05, "loss": 0.9977, "num_input_tokens_seen": 25277632, "step": 1570 }, { "epoch": 0.11004579404064549, "grad_norm": 3.9447309970855713, "learning_rate": 8.900575131348512e-05, "loss": 1.0839, "num_input_tokens_seen": 25294016, "step": 1571 }, { "epoch": 0.11011584228637473, "grad_norm": 3.916949510574341, "learning_rate": 8.89987530647986e-05, "loss": 1.1793, "num_input_tokens_seen": 25309912, "step": 1572 }, { "epoch": 0.11018589053210398, "grad_norm": 3.7132885456085205, "learning_rate": 8.899175481611208e-05, "loss": 1.081, "num_input_tokens_seen": 25326296, "step": 1573 }, { "epoch": 0.11025593877783323, "grad_norm": 4.5842390060424805, "learning_rate": 8.898475656742558e-05, "loss": 0.926, "num_input_tokens_seen": 25342328, "step": 1574 }, { "epoch": 0.11032598702356247, "grad_norm": 3.578962802886963, "learning_rate": 8.897775831873906e-05, "loss": 1.0599, "num_input_tokens_seen": 25357640, "step": 1575 }, { "epoch": 0.11039603526929173, "grad_norm": 3.5823471546173096, "learning_rate": 8.897076007005255e-05, "loss": 0.9519, "num_input_tokens_seen": 25373424, "step": 1576 }, { "epoch": 0.11046608351502098, "grad_norm": 3.721482515335083, "learning_rate": 8.896376182136602e-05, "loss": 0.976, "num_input_tokens_seen": 25389808, "step": 1577 }, { "epoch": 0.11053613176075022, "grad_norm": 4.874295711517334, "learning_rate": 8.895676357267951e-05, "loss": 1.3507, "num_input_tokens_seen": 25406192, "step": 1578 }, { "epoch": 0.11060618000647947, "grad_norm": 3.8547258377075195, "learning_rate": 8.8949765323993e-05, "loss": 0.9444, "num_input_tokens_seen": 25421632, "step": 1579 }, { "epoch": 0.11067622825220871, "grad_norm": 4.847586631774902, "learning_rate": 8.894276707530649e-05, "loss": 1.0526, "num_input_tokens_seen": 25438016, "step": 1580 }, { "epoch": 0.11074627649793796, "grad_norm": 3.950594425201416, "learning_rate": 8.893576882661998e-05, "loss": 1.0688, "num_input_tokens_seen": 25454400, "step": 1581 }, { "epoch": 0.1108163247436672, "grad_norm": 3.7372758388519287, "learning_rate": 8.892877057793345e-05, "loss": 1.2211, "num_input_tokens_seen": 25470304, "step": 1582 }, { "epoch": 0.11088637298939645, "grad_norm": 3.8695788383483887, "learning_rate": 8.892177232924694e-05, "loss": 1.1006, "num_input_tokens_seen": 25486688, "step": 1583 }, { "epoch": 0.1109564212351257, "grad_norm": 4.623810768127441, "learning_rate": 8.891477408056042e-05, "loss": 1.034, "num_input_tokens_seen": 25503072, "step": 1584 }, { "epoch": 0.11102646948085494, "grad_norm": 4.03538179397583, "learning_rate": 8.89077758318739e-05, "loss": 1.0915, "num_input_tokens_seen": 25519008, "step": 1585 }, { "epoch": 0.11109651772658419, "grad_norm": 7.486603736877441, "learning_rate": 8.890077758318739e-05, "loss": 1.0137, "num_input_tokens_seen": 25533808, "step": 1586 }, { "epoch": 0.11116656597231343, "grad_norm": 4.660414218902588, "learning_rate": 8.889377933450088e-05, "loss": 1.0172, "num_input_tokens_seen": 25549784, "step": 1587 }, { "epoch": 0.11123661421804268, "grad_norm": 3.9375548362731934, "learning_rate": 8.888678108581437e-05, "loss": 0.9843, "num_input_tokens_seen": 25566168, "step": 1588 }, { "epoch": 0.11130666246377192, "grad_norm": 4.275035858154297, "learning_rate": 8.887978283712786e-05, "loss": 1.1802, "num_input_tokens_seen": 25582552, "step": 1589 }, { "epoch": 0.11137671070950117, "grad_norm": 4.799124240875244, "learning_rate": 8.887278458844133e-05, "loss": 1.2702, "num_input_tokens_seen": 25598936, "step": 1590 }, { "epoch": 0.11144675895523042, "grad_norm": 4.143614768981934, "learning_rate": 8.886578633975482e-05, "loss": 1.1797, "num_input_tokens_seen": 25615320, "step": 1591 }, { "epoch": 0.11151680720095966, "grad_norm": 4.490556716918945, "learning_rate": 8.88587880910683e-05, "loss": 1.1351, "num_input_tokens_seen": 25630624, "step": 1592 }, { "epoch": 0.1115868554466889, "grad_norm": 6.010688781738281, "learning_rate": 8.885178984238179e-05, "loss": 1.059, "num_input_tokens_seen": 25646048, "step": 1593 }, { "epoch": 0.11165690369241815, "grad_norm": 3.7447726726531982, "learning_rate": 8.884479159369527e-05, "loss": 0.9902, "num_input_tokens_seen": 25661528, "step": 1594 }, { "epoch": 0.1117269519381474, "grad_norm": 4.77920389175415, "learning_rate": 8.883779334500876e-05, "loss": 1.1158, "num_input_tokens_seen": 25677912, "step": 1595 }, { "epoch": 0.11179700018387664, "grad_norm": 3.9812231063842773, "learning_rate": 8.883079509632225e-05, "loss": 1.096, "num_input_tokens_seen": 25694296, "step": 1596 }, { "epoch": 0.11186704842960589, "grad_norm": 3.7404634952545166, "learning_rate": 8.882379684763573e-05, "loss": 0.9965, "num_input_tokens_seen": 25710448, "step": 1597 }, { "epoch": 0.11193709667533513, "grad_norm": 4.466211318969727, "learning_rate": 8.881679859894922e-05, "loss": 1.1495, "num_input_tokens_seen": 25726624, "step": 1598 }, { "epoch": 0.11200714492106438, "grad_norm": 3.6850225925445557, "learning_rate": 8.880980035026269e-05, "loss": 0.9685, "num_input_tokens_seen": 25742456, "step": 1599 }, { "epoch": 0.11207719316679363, "grad_norm": 4.128363609313965, "learning_rate": 8.880280210157619e-05, "loss": 1.1052, "num_input_tokens_seen": 25758840, "step": 1600 }, { "epoch": 0.11207719316679363, "eval_loss": 1.1512293815612793, "eval_runtime": 0.1988, "eval_samples_per_second": 5.031, "eval_steps_per_second": 5.031, "num_input_tokens_seen": 25758840, "step": 1600 }, { "epoch": 0.11214724141252287, "grad_norm": 4.852661609649658, "learning_rate": 8.879580385288968e-05, "loss": 1.0778, "num_input_tokens_seen": 25774312, "step": 1601 }, { "epoch": 0.11221728965825212, "grad_norm": 4.501857280731201, "learning_rate": 8.878880560420316e-05, "loss": 1.302, "num_input_tokens_seen": 25790696, "step": 1602 }, { "epoch": 0.11228733790398136, "grad_norm": 4.142490863800049, "learning_rate": 8.878180735551665e-05, "loss": 1.0375, "num_input_tokens_seen": 25807080, "step": 1603 }, { "epoch": 0.11235738614971061, "grad_norm": 3.606905698776245, "learning_rate": 8.877480910683012e-05, "loss": 0.9254, "num_input_tokens_seen": 25822552, "step": 1604 }, { "epoch": 0.11242743439543985, "grad_norm": 3.837010145187378, "learning_rate": 8.876781085814361e-05, "loss": 1.1756, "num_input_tokens_seen": 25838088, "step": 1605 }, { "epoch": 0.1124974826411691, "grad_norm": 3.9082963466644287, "learning_rate": 8.87608126094571e-05, "loss": 1.0201, "num_input_tokens_seen": 25854240, "step": 1606 }, { "epoch": 0.11256753088689835, "grad_norm": 4.062923908233643, "learning_rate": 8.875381436077059e-05, "loss": 1.1034, "num_input_tokens_seen": 25870624, "step": 1607 }, { "epoch": 0.11263757913262759, "grad_norm": 4.331594944000244, "learning_rate": 8.874681611208407e-05, "loss": 1.2043, "num_input_tokens_seen": 25886656, "step": 1608 }, { "epoch": 0.11270762737835684, "grad_norm": 3.77466082572937, "learning_rate": 8.873981786339755e-05, "loss": 0.936, "num_input_tokens_seen": 25902704, "step": 1609 }, { "epoch": 0.1127776756240861, "grad_norm": 3.3747365474700928, "learning_rate": 8.873281961471104e-05, "loss": 0.9071, "num_input_tokens_seen": 25919088, "step": 1610 }, { "epoch": 0.11284772386981534, "grad_norm": 5.377493381500244, "learning_rate": 8.872582136602451e-05, "loss": 0.9246, "num_input_tokens_seen": 25935472, "step": 1611 }, { "epoch": 0.11291777211554459, "grad_norm": 5.506969451904297, "learning_rate": 8.8718823117338e-05, "loss": 0.9211, "num_input_tokens_seen": 25951664, "step": 1612 }, { "epoch": 0.11298782036127383, "grad_norm": 4.874104976654053, "learning_rate": 8.871182486865149e-05, "loss": 1.1654, "num_input_tokens_seen": 25968048, "step": 1613 }, { "epoch": 0.11305786860700308, "grad_norm": 4.666824817657471, "learning_rate": 8.870482661996498e-05, "loss": 1.2155, "num_input_tokens_seen": 25983784, "step": 1614 }, { "epoch": 0.11312791685273232, "grad_norm": 3.949862241744995, "learning_rate": 8.869782837127847e-05, "loss": 1.1243, "num_input_tokens_seen": 26000168, "step": 1615 }, { "epoch": 0.11319796509846157, "grad_norm": 3.866542339324951, "learning_rate": 8.869083012259196e-05, "loss": 1.1302, "num_input_tokens_seen": 26015456, "step": 1616 }, { "epoch": 0.11326801334419082, "grad_norm": 3.8679909706115723, "learning_rate": 8.868383187390543e-05, "loss": 1.0886, "num_input_tokens_seen": 26031224, "step": 1617 }, { "epoch": 0.11333806158992006, "grad_norm": 4.7508087158203125, "learning_rate": 8.867683362521892e-05, "loss": 1.2837, "num_input_tokens_seen": 26046952, "step": 1618 }, { "epoch": 0.1134081098356493, "grad_norm": 3.878549337387085, "learning_rate": 8.86698353765324e-05, "loss": 0.99, "num_input_tokens_seen": 26063280, "step": 1619 }, { "epoch": 0.11347815808137855, "grad_norm": 3.8016276359558105, "learning_rate": 8.86628371278459e-05, "loss": 1.1682, "num_input_tokens_seen": 26079616, "step": 1620 }, { "epoch": 0.1135482063271078, "grad_norm": 4.040102481842041, "learning_rate": 8.865583887915937e-05, "loss": 1.1008, "num_input_tokens_seen": 26095232, "step": 1621 }, { "epoch": 0.11361825457283704, "grad_norm": 3.932529926300049, "learning_rate": 8.864884063047286e-05, "loss": 1.1663, "num_input_tokens_seen": 26111616, "step": 1622 }, { "epoch": 0.11368830281856629, "grad_norm": 4.568112373352051, "learning_rate": 8.864184238178635e-05, "loss": 1.1932, "num_input_tokens_seen": 26128000, "step": 1623 }, { "epoch": 0.11375835106429553, "grad_norm": 4.23036527633667, "learning_rate": 8.863484413309982e-05, "loss": 1.0223, "num_input_tokens_seen": 26144384, "step": 1624 }, { "epoch": 0.11382839931002478, "grad_norm": 4.209012031555176, "learning_rate": 8.862784588441331e-05, "loss": 1.0992, "num_input_tokens_seen": 26160768, "step": 1625 }, { "epoch": 0.11389844755575403, "grad_norm": 3.865983724594116, "learning_rate": 8.86208476357268e-05, "loss": 1.1213, "num_input_tokens_seen": 26177152, "step": 1626 }, { "epoch": 0.11396849580148327, "grad_norm": 3.781083822250366, "learning_rate": 8.861384938704029e-05, "loss": 1.0132, "num_input_tokens_seen": 26193536, "step": 1627 }, { "epoch": 0.11403854404721252, "grad_norm": 4.330471038818359, "learning_rate": 8.860685113835378e-05, "loss": 0.9749, "num_input_tokens_seen": 26208976, "step": 1628 }, { "epoch": 0.11410859229294176, "grad_norm": 4.772238254547119, "learning_rate": 8.859985288966725e-05, "loss": 1.2796, "num_input_tokens_seen": 26225360, "step": 1629 }, { "epoch": 0.11417864053867101, "grad_norm": 4.0468668937683105, "learning_rate": 8.859285464098074e-05, "loss": 1.0056, "num_input_tokens_seen": 26241744, "step": 1630 }, { "epoch": 0.11424868878440025, "grad_norm": 3.9648735523223877, "learning_rate": 8.858585639229422e-05, "loss": 1.2185, "num_input_tokens_seen": 26258128, "step": 1631 }, { "epoch": 0.1143187370301295, "grad_norm": 4.7014079093933105, "learning_rate": 8.85788581436077e-05, "loss": 1.1795, "num_input_tokens_seen": 26274512, "step": 1632 }, { "epoch": 0.11438878527585875, "grad_norm": 4.6375627517700195, "learning_rate": 8.85718598949212e-05, "loss": 1.0074, "num_input_tokens_seen": 26290008, "step": 1633 }, { "epoch": 0.11445883352158799, "grad_norm": 4.427719593048096, "learning_rate": 8.856486164623468e-05, "loss": 1.2769, "num_input_tokens_seen": 26305512, "step": 1634 }, { "epoch": 0.11452888176731724, "grad_norm": 6.001821994781494, "learning_rate": 8.855786339754817e-05, "loss": 1.0606, "num_input_tokens_seen": 26319504, "step": 1635 }, { "epoch": 0.11459893001304648, "grad_norm": 3.970672369003296, "learning_rate": 8.855086514886165e-05, "loss": 1.1944, "num_input_tokens_seen": 26335888, "step": 1636 }, { "epoch": 0.11466897825877573, "grad_norm": 3.924450635910034, "learning_rate": 8.854386690017514e-05, "loss": 0.9607, "num_input_tokens_seen": 26351536, "step": 1637 }, { "epoch": 0.11473902650450497, "grad_norm": 4.400977611541748, "learning_rate": 8.853686865148861e-05, "loss": 1.0641, "num_input_tokens_seen": 26367808, "step": 1638 }, { "epoch": 0.11480907475023422, "grad_norm": 3.9734365940093994, "learning_rate": 8.85298704028021e-05, "loss": 1.2258, "num_input_tokens_seen": 26383864, "step": 1639 }, { "epoch": 0.11487912299596346, "grad_norm": 3.792949914932251, "learning_rate": 8.85228721541156e-05, "loss": 1.0401, "num_input_tokens_seen": 26400248, "step": 1640 }, { "epoch": 0.11494917124169271, "grad_norm": 5.14591121673584, "learning_rate": 8.851587390542908e-05, "loss": 1.0484, "num_input_tokens_seen": 26416056, "step": 1641 }, { "epoch": 0.11501921948742196, "grad_norm": 5.0158162117004395, "learning_rate": 8.850887565674256e-05, "loss": 1.2823, "num_input_tokens_seen": 26431400, "step": 1642 }, { "epoch": 0.1150892677331512, "grad_norm": 4.459201812744141, "learning_rate": 8.850187740805605e-05, "loss": 1.2371, "num_input_tokens_seen": 26446920, "step": 1643 }, { "epoch": 0.11515931597888045, "grad_norm": 3.717949867248535, "learning_rate": 8.849487915936953e-05, "loss": 1.1299, "num_input_tokens_seen": 26463304, "step": 1644 }, { "epoch": 0.1152293642246097, "grad_norm": 3.7555253505706787, "learning_rate": 8.848788091068302e-05, "loss": 1.0835, "num_input_tokens_seen": 26479296, "step": 1645 }, { "epoch": 0.11529941247033895, "grad_norm": 4.3726325035095215, "learning_rate": 8.84808826619965e-05, "loss": 0.9606, "num_input_tokens_seen": 26495024, "step": 1646 }, { "epoch": 0.1153694607160682, "grad_norm": 3.728700876235962, "learning_rate": 8.847388441331e-05, "loss": 1.0486, "num_input_tokens_seen": 26511408, "step": 1647 }, { "epoch": 0.11543950896179744, "grad_norm": 4.276855945587158, "learning_rate": 8.846688616462347e-05, "loss": 0.9869, "num_input_tokens_seen": 26527688, "step": 1648 }, { "epoch": 0.11550955720752669, "grad_norm": 5.386009693145752, "learning_rate": 8.845988791593696e-05, "loss": 1.0021, "num_input_tokens_seen": 26544072, "step": 1649 }, { "epoch": 0.11557960545325593, "grad_norm": 4.978610992431641, "learning_rate": 8.845288966725045e-05, "loss": 1.2531, "num_input_tokens_seen": 26560456, "step": 1650 }, { "epoch": 0.11564965369898518, "grad_norm": 5.325594425201416, "learning_rate": 8.844589141856392e-05, "loss": 0.9983, "num_input_tokens_seen": 26576840, "step": 1651 }, { "epoch": 0.11571970194471443, "grad_norm": 4.359868049621582, "learning_rate": 8.843889316987741e-05, "loss": 0.9652, "num_input_tokens_seen": 26593224, "step": 1652 }, { "epoch": 0.11578975019044367, "grad_norm": 7.921500205993652, "learning_rate": 8.84318949211909e-05, "loss": 1.0767, "num_input_tokens_seen": 26607352, "step": 1653 }, { "epoch": 0.11585979843617292, "grad_norm": 3.51788330078125, "learning_rate": 8.842489667250439e-05, "loss": 1.0677, "num_input_tokens_seen": 26623696, "step": 1654 }, { "epoch": 0.11592984668190216, "grad_norm": 4.120747089385986, "learning_rate": 8.841789842381788e-05, "loss": 1.2139, "num_input_tokens_seen": 26639832, "step": 1655 }, { "epoch": 0.11599989492763141, "grad_norm": 4.077361106872559, "learning_rate": 8.841090017513135e-05, "loss": 1.0639, "num_input_tokens_seen": 26655432, "step": 1656 }, { "epoch": 0.11606994317336065, "grad_norm": 3.9629955291748047, "learning_rate": 8.840390192644484e-05, "loss": 1.0846, "num_input_tokens_seen": 26671816, "step": 1657 }, { "epoch": 0.1161399914190899, "grad_norm": 3.933544635772705, "learning_rate": 8.839690367775831e-05, "loss": 1.1543, "num_input_tokens_seen": 26688096, "step": 1658 }, { "epoch": 0.11621003966481915, "grad_norm": 4.702983379364014, "learning_rate": 8.83899054290718e-05, "loss": 1.0699, "num_input_tokens_seen": 26704480, "step": 1659 }, { "epoch": 0.11628008791054839, "grad_norm": 4.536739826202393, "learning_rate": 8.83829071803853e-05, "loss": 1.149, "num_input_tokens_seen": 26720864, "step": 1660 }, { "epoch": 0.11635013615627764, "grad_norm": 4.419711589813232, "learning_rate": 8.837590893169878e-05, "loss": 1.1994, "num_input_tokens_seen": 26737248, "step": 1661 }, { "epoch": 0.11642018440200688, "grad_norm": 4.106175899505615, "learning_rate": 8.836891068301227e-05, "loss": 1.0682, "num_input_tokens_seen": 26753632, "step": 1662 }, { "epoch": 0.11649023264773613, "grad_norm": 3.469658374786377, "learning_rate": 8.836191243432574e-05, "loss": 1.0356, "num_input_tokens_seen": 26769944, "step": 1663 }, { "epoch": 0.11656028089346537, "grad_norm": 7.273227691650391, "learning_rate": 8.835491418563923e-05, "loss": 1.1699, "num_input_tokens_seen": 26784520, "step": 1664 }, { "epoch": 0.11663032913919462, "grad_norm": 3.611165761947632, "learning_rate": 8.834791593695271e-05, "loss": 0.8595, "num_input_tokens_seen": 26800360, "step": 1665 }, { "epoch": 0.11670037738492386, "grad_norm": 4.405304908752441, "learning_rate": 8.834091768826621e-05, "loss": 1.2055, "num_input_tokens_seen": 26816744, "step": 1666 }, { "epoch": 0.11677042563065311, "grad_norm": 3.897247791290283, "learning_rate": 8.83339194395797e-05, "loss": 0.9599, "num_input_tokens_seen": 26832520, "step": 1667 }, { "epoch": 0.11684047387638236, "grad_norm": 3.898019313812256, "learning_rate": 8.832692119089317e-05, "loss": 1.0838, "num_input_tokens_seen": 26848080, "step": 1668 }, { "epoch": 0.1169105221221116, "grad_norm": 4.6351542472839355, "learning_rate": 8.831992294220666e-05, "loss": 1.2776, "num_input_tokens_seen": 26864464, "step": 1669 }, { "epoch": 0.11698057036784085, "grad_norm": 4.020237922668457, "learning_rate": 8.831292469352015e-05, "loss": 0.9955, "num_input_tokens_seen": 26880848, "step": 1670 }, { "epoch": 0.11705061861357009, "grad_norm": 5.813192367553711, "learning_rate": 8.830592644483363e-05, "loss": 1.2867, "num_input_tokens_seen": 26897232, "step": 1671 }, { "epoch": 0.11712066685929934, "grad_norm": 4.058423042297363, "learning_rate": 8.829892819614711e-05, "loss": 1.0697, "num_input_tokens_seen": 26912872, "step": 1672 }, { "epoch": 0.11719071510502858, "grad_norm": 4.76987361907959, "learning_rate": 8.82919299474606e-05, "loss": 0.9226, "num_input_tokens_seen": 26929256, "step": 1673 }, { "epoch": 0.11726076335075783, "grad_norm": 3.8400967121124268, "learning_rate": 8.828493169877409e-05, "loss": 1.0089, "num_input_tokens_seen": 26945624, "step": 1674 }, { "epoch": 0.11733081159648708, "grad_norm": 4.49709415435791, "learning_rate": 8.827793345008757e-05, "loss": 1.0898, "num_input_tokens_seen": 26961464, "step": 1675 }, { "epoch": 0.11740085984221632, "grad_norm": 4.143093109130859, "learning_rate": 8.827093520140105e-05, "loss": 1.0493, "num_input_tokens_seen": 26976720, "step": 1676 }, { "epoch": 0.11747090808794557, "grad_norm": 4.138030529022217, "learning_rate": 8.826393695271454e-05, "loss": 1.1555, "num_input_tokens_seen": 26993056, "step": 1677 }, { "epoch": 0.11754095633367481, "grad_norm": 3.8191847801208496, "learning_rate": 8.825693870402802e-05, "loss": 1.0993, "num_input_tokens_seen": 27009440, "step": 1678 }, { "epoch": 0.11761100457940406, "grad_norm": 3.8392176628112793, "learning_rate": 8.824994045534151e-05, "loss": 1.1067, "num_input_tokens_seen": 27024880, "step": 1679 }, { "epoch": 0.11768105282513332, "grad_norm": 4.468568801879883, "learning_rate": 8.8242942206655e-05, "loss": 1.1424, "num_input_tokens_seen": 27040672, "step": 1680 }, { "epoch": 0.11775110107086256, "grad_norm": 3.6515510082244873, "learning_rate": 8.823594395796848e-05, "loss": 1.0659, "num_input_tokens_seen": 27057056, "step": 1681 }, { "epoch": 0.11782114931659181, "grad_norm": 4.479739189147949, "learning_rate": 8.822894570928197e-05, "loss": 1.0399, "num_input_tokens_seen": 27073440, "step": 1682 }, { "epoch": 0.11789119756232105, "grad_norm": 3.762479782104492, "learning_rate": 8.822194746059545e-05, "loss": 1.1041, "num_input_tokens_seen": 27089824, "step": 1683 }, { "epoch": 0.1179612458080503, "grad_norm": 4.694389343261719, "learning_rate": 8.821494921190894e-05, "loss": 1.2785, "num_input_tokens_seen": 27106208, "step": 1684 }, { "epoch": 0.11803129405377955, "grad_norm": 3.738931179046631, "learning_rate": 8.820795096322241e-05, "loss": 0.9039, "num_input_tokens_seen": 27122352, "step": 1685 }, { "epoch": 0.11810134229950879, "grad_norm": 4.065624237060547, "learning_rate": 8.820095271453591e-05, "loss": 1.0048, "num_input_tokens_seen": 27138160, "step": 1686 }, { "epoch": 0.11817139054523804, "grad_norm": 3.5373826026916504, "learning_rate": 8.81939544658494e-05, "loss": 0.8786, "num_input_tokens_seen": 27154544, "step": 1687 }, { "epoch": 0.11824143879096728, "grad_norm": 3.773066282272339, "learning_rate": 8.818695621716288e-05, "loss": 1.0043, "num_input_tokens_seen": 27170928, "step": 1688 }, { "epoch": 0.11831148703669653, "grad_norm": 3.3876242637634277, "learning_rate": 8.817995796847637e-05, "loss": 0.9909, "num_input_tokens_seen": 27187312, "step": 1689 }, { "epoch": 0.11838153528242577, "grad_norm": 4.526343822479248, "learning_rate": 8.817295971978984e-05, "loss": 1.0899, "num_input_tokens_seen": 27202208, "step": 1690 }, { "epoch": 0.11845158352815502, "grad_norm": 4.691114902496338, "learning_rate": 8.816596147110333e-05, "loss": 1.0823, "num_input_tokens_seen": 27218592, "step": 1691 }, { "epoch": 0.11852163177388426, "grad_norm": 3.90531849861145, "learning_rate": 8.815896322241682e-05, "loss": 1.1438, "num_input_tokens_seen": 27234976, "step": 1692 }, { "epoch": 0.11859168001961351, "grad_norm": 3.5546317100524902, "learning_rate": 8.81519649737303e-05, "loss": 1.0326, "num_input_tokens_seen": 27251360, "step": 1693 }, { "epoch": 0.11866172826534276, "grad_norm": 5.117360591888428, "learning_rate": 8.81449667250438e-05, "loss": 1.1921, "num_input_tokens_seen": 27267744, "step": 1694 }, { "epoch": 0.118731776511072, "grad_norm": 4.055267810821533, "learning_rate": 8.813796847635727e-05, "loss": 1.0607, "num_input_tokens_seen": 27283688, "step": 1695 }, { "epoch": 0.11880182475680125, "grad_norm": 4.04268741607666, "learning_rate": 8.813097022767076e-05, "loss": 1.1862, "num_input_tokens_seen": 27300072, "step": 1696 }, { "epoch": 0.11887187300253049, "grad_norm": 4.048800945281982, "learning_rate": 8.812397197898425e-05, "loss": 0.9231, "num_input_tokens_seen": 27316456, "step": 1697 }, { "epoch": 0.11894192124825974, "grad_norm": 4.445494174957275, "learning_rate": 8.811697373029772e-05, "loss": 1.241, "num_input_tokens_seen": 27332464, "step": 1698 }, { "epoch": 0.11901196949398898, "grad_norm": 4.522054672241211, "learning_rate": 8.810997548161121e-05, "loss": 1.3945, "num_input_tokens_seen": 27348848, "step": 1699 }, { "epoch": 0.11908201773971823, "grad_norm": 4.106349468231201, "learning_rate": 8.81029772329247e-05, "loss": 1.1457, "num_input_tokens_seen": 27365232, "step": 1700 }, { "epoch": 0.11915206598544748, "grad_norm": 6.059356689453125, "learning_rate": 8.809597898423819e-05, "loss": 1.3381, "num_input_tokens_seen": 27380448, "step": 1701 }, { "epoch": 0.11922211423117672, "grad_norm": 3.8089959621429443, "learning_rate": 8.808898073555166e-05, "loss": 1.0699, "num_input_tokens_seen": 27396832, "step": 1702 }, { "epoch": 0.11929216247690597, "grad_norm": 4.21024227142334, "learning_rate": 8.808198248686515e-05, "loss": 1.306, "num_input_tokens_seen": 27413096, "step": 1703 }, { "epoch": 0.11936221072263521, "grad_norm": 4.286004066467285, "learning_rate": 8.807498423817864e-05, "loss": 1.2325, "num_input_tokens_seen": 27429480, "step": 1704 }, { "epoch": 0.11943225896836446, "grad_norm": 3.512561559677124, "learning_rate": 8.806798598949212e-05, "loss": 0.8804, "num_input_tokens_seen": 27445864, "step": 1705 }, { "epoch": 0.1195023072140937, "grad_norm": 4.096526145935059, "learning_rate": 8.806098774080562e-05, "loss": 1.0591, "num_input_tokens_seen": 27462248, "step": 1706 }, { "epoch": 0.11957235545982295, "grad_norm": 5.032350063323975, "learning_rate": 8.805398949211909e-05, "loss": 0.8948, "num_input_tokens_seen": 27478312, "step": 1707 }, { "epoch": 0.1196424037055522, "grad_norm": 4.756420612335205, "learning_rate": 8.804699124343258e-05, "loss": 1.0584, "num_input_tokens_seen": 27494696, "step": 1708 }, { "epoch": 0.11971245195128144, "grad_norm": 4.869518756866455, "learning_rate": 8.803999299474607e-05, "loss": 0.9394, "num_input_tokens_seen": 27511080, "step": 1709 }, { "epoch": 0.11978250019701069, "grad_norm": 3.451759099960327, "learning_rate": 8.803299474605954e-05, "loss": 0.9171, "num_input_tokens_seen": 27527328, "step": 1710 }, { "epoch": 0.11985254844273993, "grad_norm": 4.247021675109863, "learning_rate": 8.802599649737303e-05, "loss": 1.1204, "num_input_tokens_seen": 27543712, "step": 1711 }, { "epoch": 0.11992259668846918, "grad_norm": 4.597024917602539, "learning_rate": 8.801899824868652e-05, "loss": 1.196, "num_input_tokens_seen": 27560096, "step": 1712 }, { "epoch": 0.11999264493419842, "grad_norm": 4.242952823638916, "learning_rate": 8.801200000000001e-05, "loss": 1.1747, "num_input_tokens_seen": 27576320, "step": 1713 }, { "epoch": 0.12006269317992768, "grad_norm": 5.1166486740112305, "learning_rate": 8.80050017513135e-05, "loss": 1.4222, "num_input_tokens_seen": 27591024, "step": 1714 }, { "epoch": 0.12013274142565693, "grad_norm": 4.6713714599609375, "learning_rate": 8.799800350262697e-05, "loss": 1.1869, "num_input_tokens_seen": 27606352, "step": 1715 }, { "epoch": 0.12020278967138617, "grad_norm": 4.62678861618042, "learning_rate": 8.799100525394046e-05, "loss": 1.1524, "num_input_tokens_seen": 27622736, "step": 1716 }, { "epoch": 0.12027283791711542, "grad_norm": 3.611985206604004, "learning_rate": 8.798400700525394e-05, "loss": 1.1179, "num_input_tokens_seen": 27639120, "step": 1717 }, { "epoch": 0.12034288616284466, "grad_norm": 4.165099143981934, "learning_rate": 8.797700875656743e-05, "loss": 1.0104, "num_input_tokens_seen": 27654024, "step": 1718 }, { "epoch": 0.12041293440857391, "grad_norm": 4.532061576843262, "learning_rate": 8.797001050788091e-05, "loss": 1.05, "num_input_tokens_seen": 27670408, "step": 1719 }, { "epoch": 0.12048298265430316, "grad_norm": 4.880197048187256, "learning_rate": 8.79630122591944e-05, "loss": 1.0321, "num_input_tokens_seen": 27686792, "step": 1720 }, { "epoch": 0.1205530309000324, "grad_norm": 3.521052360534668, "learning_rate": 8.795601401050789e-05, "loss": 0.9048, "num_input_tokens_seen": 27703176, "step": 1721 }, { "epoch": 0.12062307914576165, "grad_norm": 3.965725898742676, "learning_rate": 8.794901576182137e-05, "loss": 1.1348, "num_input_tokens_seen": 27719024, "step": 1722 }, { "epoch": 0.12069312739149089, "grad_norm": 3.936962842941284, "learning_rate": 8.794201751313486e-05, "loss": 1.1531, "num_input_tokens_seen": 27734736, "step": 1723 }, { "epoch": 0.12076317563722014, "grad_norm": 5.225526332855225, "learning_rate": 8.793501926444834e-05, "loss": 1.2784, "num_input_tokens_seen": 27751120, "step": 1724 }, { "epoch": 0.12083322388294938, "grad_norm": 4.125289440155029, "learning_rate": 8.792802101576182e-05, "loss": 1.1893, "num_input_tokens_seen": 27767288, "step": 1725 }, { "epoch": 0.12090327212867863, "grad_norm": 3.9352405071258545, "learning_rate": 8.792102276707532e-05, "loss": 1.1867, "num_input_tokens_seen": 27783672, "step": 1726 }, { "epoch": 0.12097332037440787, "grad_norm": 3.908578634262085, "learning_rate": 8.79140245183888e-05, "loss": 1.0024, "num_input_tokens_seen": 27799640, "step": 1727 }, { "epoch": 0.12104336862013712, "grad_norm": 3.694387435913086, "learning_rate": 8.790702626970229e-05, "loss": 1.0652, "num_input_tokens_seen": 27816024, "step": 1728 }, { "epoch": 0.12111341686586637, "grad_norm": 4.0100016593933105, "learning_rate": 8.790002802101576e-05, "loss": 1.0511, "num_input_tokens_seen": 27832408, "step": 1729 }, { "epoch": 0.12118346511159561, "grad_norm": 5.454882621765137, "learning_rate": 8.789302977232925e-05, "loss": 1.1096, "num_input_tokens_seen": 27848792, "step": 1730 }, { "epoch": 0.12125351335732486, "grad_norm": 5.065526485443115, "learning_rate": 8.788603152364274e-05, "loss": 1.0354, "num_input_tokens_seen": 27864688, "step": 1731 }, { "epoch": 0.1213235616030541, "grad_norm": 3.73103666305542, "learning_rate": 8.787903327495623e-05, "loss": 1.0328, "num_input_tokens_seen": 27881072, "step": 1732 }, { "epoch": 0.12139360984878335, "grad_norm": 3.971198081970215, "learning_rate": 8.787203502626971e-05, "loss": 1.1908, "num_input_tokens_seen": 27896912, "step": 1733 }, { "epoch": 0.1214636580945126, "grad_norm": 3.933809518814087, "learning_rate": 8.786503677758319e-05, "loss": 1.1125, "num_input_tokens_seen": 27913104, "step": 1734 }, { "epoch": 0.12153370634024184, "grad_norm": 3.92167329788208, "learning_rate": 8.785803852889668e-05, "loss": 1.0007, "num_input_tokens_seen": 27929488, "step": 1735 }, { "epoch": 0.12160375458597109, "grad_norm": 4.441089630126953, "learning_rate": 8.785104028021017e-05, "loss": 0.9748, "num_input_tokens_seen": 27945504, "step": 1736 }, { "epoch": 0.12167380283170033, "grad_norm": 4.023623466491699, "learning_rate": 8.784404203152364e-05, "loss": 0.8826, "num_input_tokens_seen": 27961888, "step": 1737 }, { "epoch": 0.12174385107742958, "grad_norm": 4.0328826904296875, "learning_rate": 8.783704378283713e-05, "loss": 1.2769, "num_input_tokens_seen": 27978024, "step": 1738 }, { "epoch": 0.12181389932315882, "grad_norm": 4.5445733070373535, "learning_rate": 8.783004553415062e-05, "loss": 1.3745, "num_input_tokens_seen": 27993840, "step": 1739 }, { "epoch": 0.12188394756888807, "grad_norm": 3.609834671020508, "learning_rate": 8.782304728546411e-05, "loss": 0.916, "num_input_tokens_seen": 28010224, "step": 1740 }, { "epoch": 0.12195399581461731, "grad_norm": 3.849306344985962, "learning_rate": 8.78160490367776e-05, "loss": 1.1135, "num_input_tokens_seen": 28026232, "step": 1741 }, { "epoch": 0.12202404406034656, "grad_norm": 4.11102294921875, "learning_rate": 8.780905078809107e-05, "loss": 1.2269, "num_input_tokens_seen": 28041880, "step": 1742 }, { "epoch": 0.1220940923060758, "grad_norm": 4.156986713409424, "learning_rate": 8.780205253940456e-05, "loss": 1.0321, "num_input_tokens_seen": 28058264, "step": 1743 }, { "epoch": 0.12216414055180505, "grad_norm": 3.9670159816741943, "learning_rate": 8.779505429071803e-05, "loss": 0.9752, "num_input_tokens_seen": 28073168, "step": 1744 }, { "epoch": 0.1222341887975343, "grad_norm": 5.342650890350342, "learning_rate": 8.778805604203152e-05, "loss": 1.1416, "num_input_tokens_seen": 28089552, "step": 1745 }, { "epoch": 0.12230423704326354, "grad_norm": 4.031285285949707, "learning_rate": 8.778105779334501e-05, "loss": 1.1134, "num_input_tokens_seen": 28105264, "step": 1746 }, { "epoch": 0.12237428528899279, "grad_norm": 3.5976450443267822, "learning_rate": 8.77740595446585e-05, "loss": 1.0342, "num_input_tokens_seen": 28121648, "step": 1747 }, { "epoch": 0.12244433353472203, "grad_norm": 4.947859764099121, "learning_rate": 8.776706129597199e-05, "loss": 1.0809, "num_input_tokens_seen": 28137640, "step": 1748 }, { "epoch": 0.12251438178045129, "grad_norm": 4.004949569702148, "learning_rate": 8.776006304728546e-05, "loss": 1.0921, "num_input_tokens_seen": 28154024, "step": 1749 }, { "epoch": 0.12258443002618054, "grad_norm": 3.9022445678710938, "learning_rate": 8.775306479859895e-05, "loss": 1.0844, "num_input_tokens_seen": 28170408, "step": 1750 }, { "epoch": 0.12265447827190978, "grad_norm": 4.171925067901611, "learning_rate": 8.774606654991244e-05, "loss": 1.1894, "num_input_tokens_seen": 28186792, "step": 1751 }, { "epoch": 0.12272452651763903, "grad_norm": 3.9387433528900146, "learning_rate": 8.773906830122592e-05, "loss": 1.0303, "num_input_tokens_seen": 28203176, "step": 1752 }, { "epoch": 0.12279457476336827, "grad_norm": 5.067278861999512, "learning_rate": 8.773207005253942e-05, "loss": 1.1924, "num_input_tokens_seen": 28219192, "step": 1753 }, { "epoch": 0.12286462300909752, "grad_norm": 3.673807144165039, "learning_rate": 8.77250718038529e-05, "loss": 1.0438, "num_input_tokens_seen": 28235576, "step": 1754 }, { "epoch": 0.12293467125482677, "grad_norm": 5.303588390350342, "learning_rate": 8.771807355516638e-05, "loss": 1.2601, "num_input_tokens_seen": 28251960, "step": 1755 }, { "epoch": 0.12300471950055601, "grad_norm": 5.343825340270996, "learning_rate": 8.771107530647986e-05, "loss": 1.1126, "num_input_tokens_seen": 28268344, "step": 1756 }, { "epoch": 0.12307476774628526, "grad_norm": 4.125874996185303, "learning_rate": 8.770407705779335e-05, "loss": 1.1497, "num_input_tokens_seen": 28284144, "step": 1757 }, { "epoch": 0.1231448159920145, "grad_norm": 4.628546714782715, "learning_rate": 8.769707880910683e-05, "loss": 1.1757, "num_input_tokens_seen": 28299896, "step": 1758 }, { "epoch": 0.12321486423774375, "grad_norm": 3.946603775024414, "learning_rate": 8.769008056042032e-05, "loss": 1.2739, "num_input_tokens_seen": 28316280, "step": 1759 }, { "epoch": 0.123284912483473, "grad_norm": 3.4837770462036133, "learning_rate": 8.768308231173381e-05, "loss": 0.9682, "num_input_tokens_seen": 28332128, "step": 1760 }, { "epoch": 0.12335496072920224, "grad_norm": 3.9601573944091797, "learning_rate": 8.767608406304729e-05, "loss": 1.2647, "num_input_tokens_seen": 28347488, "step": 1761 }, { "epoch": 0.12342500897493149, "grad_norm": 4.178001403808594, "learning_rate": 8.766908581436078e-05, "loss": 1.0055, "num_input_tokens_seen": 28363872, "step": 1762 }, { "epoch": 0.12349505722066073, "grad_norm": 3.9182498455047607, "learning_rate": 8.766208756567426e-05, "loss": 1.1407, "num_input_tokens_seen": 28380208, "step": 1763 }, { "epoch": 0.12356510546638998, "grad_norm": 4.071939468383789, "learning_rate": 8.765508931698774e-05, "loss": 1.3196, "num_input_tokens_seen": 28396592, "step": 1764 }, { "epoch": 0.12363515371211922, "grad_norm": 4.657908916473389, "learning_rate": 8.764809106830123e-05, "loss": 1.0739, "num_input_tokens_seen": 28412976, "step": 1765 }, { "epoch": 0.12370520195784847, "grad_norm": 3.9706201553344727, "learning_rate": 8.764109281961472e-05, "loss": 1.0904, "num_input_tokens_seen": 28429088, "step": 1766 }, { "epoch": 0.12377525020357771, "grad_norm": 4.571341514587402, "learning_rate": 8.76340945709282e-05, "loss": 1.1314, "num_input_tokens_seen": 28445472, "step": 1767 }, { "epoch": 0.12384529844930696, "grad_norm": 4.197002410888672, "learning_rate": 8.762709632224169e-05, "loss": 0.8251, "num_input_tokens_seen": 28461656, "step": 1768 }, { "epoch": 0.1239153466950362, "grad_norm": 5.376040935516357, "learning_rate": 8.762009807355517e-05, "loss": 1.1626, "num_input_tokens_seen": 28477088, "step": 1769 }, { "epoch": 0.12398539494076545, "grad_norm": 3.987495183944702, "learning_rate": 8.761309982486866e-05, "loss": 1.2449, "num_input_tokens_seen": 28493472, "step": 1770 }, { "epoch": 0.1240554431864947, "grad_norm": 4.379208564758301, "learning_rate": 8.760610157618213e-05, "loss": 1.2834, "num_input_tokens_seen": 28509856, "step": 1771 }, { "epoch": 0.12412549143222394, "grad_norm": 3.7258729934692383, "learning_rate": 8.759910332749562e-05, "loss": 1.1115, "num_input_tokens_seen": 28525664, "step": 1772 }, { "epoch": 0.12419553967795319, "grad_norm": 4.0574774742126465, "learning_rate": 8.759210507880911e-05, "loss": 1.1005, "num_input_tokens_seen": 28541920, "step": 1773 }, { "epoch": 0.12426558792368243, "grad_norm": 3.8423895835876465, "learning_rate": 8.75851068301226e-05, "loss": 1.1067, "num_input_tokens_seen": 28558216, "step": 1774 }, { "epoch": 0.12433563616941168, "grad_norm": 3.8898398876190186, "learning_rate": 8.757810858143609e-05, "loss": 1.1963, "num_input_tokens_seen": 28574536, "step": 1775 }, { "epoch": 0.12440568441514092, "grad_norm": 3.286412000656128, "learning_rate": 8.757111033274956e-05, "loss": 0.9159, "num_input_tokens_seen": 28590920, "step": 1776 }, { "epoch": 0.12447573266087017, "grad_norm": 3.7219464778900146, "learning_rate": 8.756411208406305e-05, "loss": 1.0883, "num_input_tokens_seen": 28607192, "step": 1777 }, { "epoch": 0.12454578090659942, "grad_norm": 3.8907012939453125, "learning_rate": 8.755711383537654e-05, "loss": 1.0226, "num_input_tokens_seen": 28623176, "step": 1778 }, { "epoch": 0.12461582915232866, "grad_norm": 3.8087925910949707, "learning_rate": 8.755011558669003e-05, "loss": 1.0115, "num_input_tokens_seen": 28639528, "step": 1779 }, { "epoch": 0.1246858773980579, "grad_norm": 4.8956217765808105, "learning_rate": 8.754311733800352e-05, "loss": 1.0108, "num_input_tokens_seen": 28654976, "step": 1780 }, { "epoch": 0.12475592564378715, "grad_norm": 3.7400572299957275, "learning_rate": 8.753611908931699e-05, "loss": 0.8787, "num_input_tokens_seen": 28671064, "step": 1781 }, { "epoch": 0.1248259738895164, "grad_norm": 4.689199924468994, "learning_rate": 8.752912084063048e-05, "loss": 1.2326, "num_input_tokens_seen": 28686664, "step": 1782 }, { "epoch": 0.12489602213524566, "grad_norm": 3.6594929695129395, "learning_rate": 8.752212259194395e-05, "loss": 1.1626, "num_input_tokens_seen": 28703048, "step": 1783 }, { "epoch": 0.1249660703809749, "grad_norm": 4.6070356369018555, "learning_rate": 8.751512434325744e-05, "loss": 1.358, "num_input_tokens_seen": 28719000, "step": 1784 }, { "epoch": 0.12503611862670413, "grad_norm": 4.658362865447998, "learning_rate": 8.750812609457093e-05, "loss": 1.2852, "num_input_tokens_seen": 28735384, "step": 1785 }, { "epoch": 0.1251061668724334, "grad_norm": 3.6963465213775635, "learning_rate": 8.750112784588442e-05, "loss": 1.1068, "num_input_tokens_seen": 28750856, "step": 1786 }, { "epoch": 0.12517621511816263, "grad_norm": 4.419562816619873, "learning_rate": 8.749412959719791e-05, "loss": 1.1559, "num_input_tokens_seen": 28766824, "step": 1787 }, { "epoch": 0.12524626336389189, "grad_norm": 4.601676940917969, "learning_rate": 8.74871313485114e-05, "loss": 1.0642, "num_input_tokens_seen": 28783208, "step": 1788 }, { "epoch": 0.12531631160962112, "grad_norm": 3.8597445487976074, "learning_rate": 8.748013309982487e-05, "loss": 1.1149, "num_input_tokens_seen": 28799160, "step": 1789 }, { "epoch": 0.12538635985535038, "grad_norm": 3.654649257659912, "learning_rate": 8.747313485113835e-05, "loss": 1.3127, "num_input_tokens_seen": 28815440, "step": 1790 }, { "epoch": 0.1254564081010796, "grad_norm": 4.043321132659912, "learning_rate": 8.746613660245184e-05, "loss": 1.0844, "num_input_tokens_seen": 28831824, "step": 1791 }, { "epoch": 0.12552645634680887, "grad_norm": 4.5223894119262695, "learning_rate": 8.745913835376532e-05, "loss": 1.0627, "num_input_tokens_seen": 28846984, "step": 1792 }, { "epoch": 0.1255965045925381, "grad_norm": 4.074361801147461, "learning_rate": 8.745214010507881e-05, "loss": 0.9772, "num_input_tokens_seen": 28863368, "step": 1793 }, { "epoch": 0.12566655283826736, "grad_norm": 4.661183834075928, "learning_rate": 8.74451418563923e-05, "loss": 1.152, "num_input_tokens_seen": 28879752, "step": 1794 }, { "epoch": 0.1257366010839966, "grad_norm": 3.95831561088562, "learning_rate": 8.743814360770579e-05, "loss": 1.117, "num_input_tokens_seen": 28895728, "step": 1795 }, { "epoch": 0.12580664932972585, "grad_norm": 4.271726131439209, "learning_rate": 8.743114535901927e-05, "loss": 1.0935, "num_input_tokens_seen": 28912112, "step": 1796 }, { "epoch": 0.12587669757545508, "grad_norm": 4.079075336456299, "learning_rate": 8.742414711033275e-05, "loss": 1.1397, "num_input_tokens_seen": 28928496, "step": 1797 }, { "epoch": 0.12594674582118434, "grad_norm": 4.030980587005615, "learning_rate": 8.741714886164623e-05, "loss": 0.9405, "num_input_tokens_seen": 28943968, "step": 1798 }, { "epoch": 0.12601679406691357, "grad_norm": 3.7285454273223877, "learning_rate": 8.741015061295973e-05, "loss": 0.8448, "num_input_tokens_seen": 28959800, "step": 1799 }, { "epoch": 0.12608684231264283, "grad_norm": 3.964663028717041, "learning_rate": 8.74031523642732e-05, "loss": 1.1614, "num_input_tokens_seen": 28976184, "step": 1800 }, { "epoch": 0.12608684231264283, "eval_loss": 1.1493111848831177, "eval_runtime": 0.196, "eval_samples_per_second": 5.102, "eval_steps_per_second": 5.102, "num_input_tokens_seen": 28976184, "step": 1800 }, { "epoch": 0.1261568905583721, "grad_norm": 4.2887396812438965, "learning_rate": 8.73961541155867e-05, "loss": 0.9047, "num_input_tokens_seen": 28992552, "step": 1801 }, { "epoch": 0.12622693880410132, "grad_norm": 5.139194011688232, "learning_rate": 8.738915586690018e-05, "loss": 1.1656, "num_input_tokens_seen": 29007480, "step": 1802 }, { "epoch": 0.12629698704983058, "grad_norm": 4.023421287536621, "learning_rate": 8.738215761821366e-05, "loss": 1.0585, "num_input_tokens_seen": 29023864, "step": 1803 }, { "epoch": 0.12636703529555982, "grad_norm": 3.6131162643432617, "learning_rate": 8.737515936952715e-05, "loss": 1.0964, "num_input_tokens_seen": 29039640, "step": 1804 }, { "epoch": 0.12643708354128907, "grad_norm": 4.477705478668213, "learning_rate": 8.736816112084064e-05, "loss": 0.8054, "num_input_tokens_seen": 29055816, "step": 1805 }, { "epoch": 0.1265071317870183, "grad_norm": 3.7637252807617188, "learning_rate": 8.736116287215412e-05, "loss": 1.0389, "num_input_tokens_seen": 29071456, "step": 1806 }, { "epoch": 0.12657718003274757, "grad_norm": 3.9611611366271973, "learning_rate": 8.735416462346761e-05, "loss": 1.1907, "num_input_tokens_seen": 29087840, "step": 1807 }, { "epoch": 0.1266472282784768, "grad_norm": 3.6022791862487793, "learning_rate": 8.734716637478109e-05, "loss": 0.9538, "num_input_tokens_seen": 29104224, "step": 1808 }, { "epoch": 0.12671727652420606, "grad_norm": 3.7403485774993896, "learning_rate": 8.734016812609458e-05, "loss": 1.12, "num_input_tokens_seen": 29120608, "step": 1809 }, { "epoch": 0.1267873247699353, "grad_norm": 3.5624709129333496, "learning_rate": 8.733316987740805e-05, "loss": 1.0931, "num_input_tokens_seen": 29136840, "step": 1810 }, { "epoch": 0.12685737301566455, "grad_norm": 3.961516857147217, "learning_rate": 8.732617162872154e-05, "loss": 0.9529, "num_input_tokens_seen": 29153224, "step": 1811 }, { "epoch": 0.12692742126139378, "grad_norm": 4.895046234130859, "learning_rate": 8.731917338003503e-05, "loss": 1.0697, "num_input_tokens_seen": 29168336, "step": 1812 }, { "epoch": 0.12699746950712304, "grad_norm": 4.290217876434326, "learning_rate": 8.731217513134852e-05, "loss": 0.8945, "num_input_tokens_seen": 29184720, "step": 1813 }, { "epoch": 0.12706751775285227, "grad_norm": 3.6602399349212646, "learning_rate": 8.7305176882662e-05, "loss": 1.0465, "num_input_tokens_seen": 29200920, "step": 1814 }, { "epoch": 0.12713756599858153, "grad_norm": 3.7980921268463135, "learning_rate": 8.72981786339755e-05, "loss": 0.8915, "num_input_tokens_seen": 29217304, "step": 1815 }, { "epoch": 0.12720761424431076, "grad_norm": 3.646242141723633, "learning_rate": 8.729118038528897e-05, "loss": 1.0058, "num_input_tokens_seen": 29233688, "step": 1816 }, { "epoch": 0.12727766249004002, "grad_norm": 5.226564884185791, "learning_rate": 8.728418213660244e-05, "loss": 0.9569, "num_input_tokens_seen": 29247896, "step": 1817 }, { "epoch": 0.12734771073576925, "grad_norm": 3.8191912174224854, "learning_rate": 8.727718388791593e-05, "loss": 1.1548, "num_input_tokens_seen": 29263896, "step": 1818 }, { "epoch": 0.1274177589814985, "grad_norm": 4.349045276641846, "learning_rate": 8.727018563922944e-05, "loss": 1.1368, "num_input_tokens_seen": 29280224, "step": 1819 }, { "epoch": 0.12748780722722775, "grad_norm": 3.842888116836548, "learning_rate": 8.726318739054291e-05, "loss": 1.0052, "num_input_tokens_seen": 29296608, "step": 1820 }, { "epoch": 0.127557855472957, "grad_norm": 3.8854012489318848, "learning_rate": 8.72561891418564e-05, "loss": 1.0584, "num_input_tokens_seen": 29312992, "step": 1821 }, { "epoch": 0.12762790371868624, "grad_norm": 4.102949619293213, "learning_rate": 8.724919089316989e-05, "loss": 0.9004, "num_input_tokens_seen": 29328416, "step": 1822 }, { "epoch": 0.1276979519644155, "grad_norm": 5.0174336433410645, "learning_rate": 8.724219264448336e-05, "loss": 1.0837, "num_input_tokens_seen": 29344800, "step": 1823 }, { "epoch": 0.12776800021014473, "grad_norm": 3.6122186183929443, "learning_rate": 8.723519439579685e-05, "loss": 0.924, "num_input_tokens_seen": 29361184, "step": 1824 }, { "epoch": 0.127838048455874, "grad_norm": 4.086683750152588, "learning_rate": 8.722819614711034e-05, "loss": 1.0945, "num_input_tokens_seen": 29376840, "step": 1825 }, { "epoch": 0.12790809670160322, "grad_norm": 4.279770851135254, "learning_rate": 8.722119789842383e-05, "loss": 0.9831, "num_input_tokens_seen": 29393016, "step": 1826 }, { "epoch": 0.12797814494733248, "grad_norm": 5.032819747924805, "learning_rate": 8.72141996497373e-05, "loss": 1.1691, "num_input_tokens_seen": 29409400, "step": 1827 }, { "epoch": 0.1280481931930617, "grad_norm": 4.480144023895264, "learning_rate": 8.720720140105079e-05, "loss": 1.1481, "num_input_tokens_seen": 29425472, "step": 1828 }, { "epoch": 0.12811824143879097, "grad_norm": 3.6843478679656982, "learning_rate": 8.720020315236428e-05, "loss": 1.1302, "num_input_tokens_seen": 29441472, "step": 1829 }, { "epoch": 0.1281882896845202, "grad_norm": 3.7091941833496094, "learning_rate": 8.719320490367776e-05, "loss": 1.0351, "num_input_tokens_seen": 29457600, "step": 1830 }, { "epoch": 0.12825833793024946, "grad_norm": 4.122303009033203, "learning_rate": 8.718620665499124e-05, "loss": 1.0791, "num_input_tokens_seen": 29473984, "step": 1831 }, { "epoch": 0.1283283861759787, "grad_norm": 5.282047748565674, "learning_rate": 8.717920840630473e-05, "loss": 1.4479, "num_input_tokens_seen": 29490336, "step": 1832 }, { "epoch": 0.12839843442170795, "grad_norm": 4.0706586837768555, "learning_rate": 8.717221015761822e-05, "loss": 1.0026, "num_input_tokens_seen": 29506432, "step": 1833 }, { "epoch": 0.12846848266743718, "grad_norm": 3.856018543243408, "learning_rate": 8.716521190893171e-05, "loss": 1.0545, "num_input_tokens_seen": 29521744, "step": 1834 }, { "epoch": 0.12853853091316644, "grad_norm": 3.7059905529022217, "learning_rate": 8.715821366024518e-05, "loss": 0.9876, "num_input_tokens_seen": 29537104, "step": 1835 }, { "epoch": 0.1286085791588957, "grad_norm": 3.915038585662842, "learning_rate": 8.715121541155867e-05, "loss": 1.2072, "num_input_tokens_seen": 29552928, "step": 1836 }, { "epoch": 0.12867862740462493, "grad_norm": 3.6828839778900146, "learning_rate": 8.714421716287215e-05, "loss": 0.9849, "num_input_tokens_seen": 29569312, "step": 1837 }, { "epoch": 0.1287486756503542, "grad_norm": 4.3285441398620605, "learning_rate": 8.713721891418564e-05, "loss": 1.2812, "num_input_tokens_seen": 29584376, "step": 1838 }, { "epoch": 0.12881872389608343, "grad_norm": 4.646363258361816, "learning_rate": 8.713022066549914e-05, "loss": 1.1107, "num_input_tokens_seen": 29599856, "step": 1839 }, { "epoch": 0.12888877214181269, "grad_norm": 4.180859088897705, "learning_rate": 8.712322241681261e-05, "loss": 1.0751, "num_input_tokens_seen": 29616224, "step": 1840 }, { "epoch": 0.12895882038754192, "grad_norm": 3.666090250015259, "learning_rate": 8.71162241681261e-05, "loss": 1.0568, "num_input_tokens_seen": 29632608, "step": 1841 }, { "epoch": 0.12902886863327118, "grad_norm": 3.4623513221740723, "learning_rate": 8.710922591943959e-05, "loss": 0.9662, "num_input_tokens_seen": 29648992, "step": 1842 }, { "epoch": 0.1290989168790004, "grad_norm": 4.720603942871094, "learning_rate": 8.710222767075307e-05, "loss": 1.0566, "num_input_tokens_seen": 29665136, "step": 1843 }, { "epoch": 0.12916896512472967, "grad_norm": 4.208099365234375, "learning_rate": 8.709522942206654e-05, "loss": 1.1878, "num_input_tokens_seen": 29681520, "step": 1844 }, { "epoch": 0.1292390133704589, "grad_norm": 4.145462989807129, "learning_rate": 8.708823117338004e-05, "loss": 1.0159, "num_input_tokens_seen": 29697480, "step": 1845 }, { "epoch": 0.12930906161618816, "grad_norm": 3.9043843746185303, "learning_rate": 8.708123292469353e-05, "loss": 1.0809, "num_input_tokens_seen": 29713560, "step": 1846 }, { "epoch": 0.1293791098619174, "grad_norm": 4.092489242553711, "learning_rate": 8.707423467600701e-05, "loss": 1.0432, "num_input_tokens_seen": 29729944, "step": 1847 }, { "epoch": 0.12944915810764665, "grad_norm": 4.73677396774292, "learning_rate": 8.70672364273205e-05, "loss": 1.0276, "num_input_tokens_seen": 29746328, "step": 1848 }, { "epoch": 0.12951920635337588, "grad_norm": 6.134850025177002, "learning_rate": 8.706023817863398e-05, "loss": 0.9749, "num_input_tokens_seen": 29762584, "step": 1849 }, { "epoch": 0.12958925459910514, "grad_norm": 3.4841954708099365, "learning_rate": 8.705323992994746e-05, "loss": 0.9534, "num_input_tokens_seen": 29778968, "step": 1850 }, { "epoch": 0.12965930284483437, "grad_norm": 3.8816237449645996, "learning_rate": 8.704624168126095e-05, "loss": 0.7471, "num_input_tokens_seen": 29795352, "step": 1851 }, { "epoch": 0.12972935109056363, "grad_norm": 3.596538543701172, "learning_rate": 8.703924343257444e-05, "loss": 0.9753, "num_input_tokens_seen": 29811608, "step": 1852 }, { "epoch": 0.12979939933629286, "grad_norm": 3.9403269290924072, "learning_rate": 8.703224518388793e-05, "loss": 1.0667, "num_input_tokens_seen": 29827608, "step": 1853 }, { "epoch": 0.12986944758202212, "grad_norm": 4.586714744567871, "learning_rate": 8.70252469352014e-05, "loss": 0.9335, "num_input_tokens_seen": 29843992, "step": 1854 }, { "epoch": 0.12993949582775136, "grad_norm": 3.905280590057373, "learning_rate": 8.701824868651489e-05, "loss": 0.9115, "num_input_tokens_seen": 29860376, "step": 1855 }, { "epoch": 0.13000954407348062, "grad_norm": 4.974122524261475, "learning_rate": 8.701125043782838e-05, "loss": 0.9887, "num_input_tokens_seen": 29875880, "step": 1856 }, { "epoch": 0.13007959231920985, "grad_norm": 4.33966064453125, "learning_rate": 8.700425218914185e-05, "loss": 1.1955, "num_input_tokens_seen": 29891088, "step": 1857 }, { "epoch": 0.1301496405649391, "grad_norm": 4.593107223510742, "learning_rate": 8.699725394045534e-05, "loss": 0.9012, "num_input_tokens_seen": 29907472, "step": 1858 }, { "epoch": 0.13021968881066834, "grad_norm": 4.036941051483154, "learning_rate": 8.699025569176884e-05, "loss": 1.048, "num_input_tokens_seen": 29923856, "step": 1859 }, { "epoch": 0.1302897370563976, "grad_norm": 3.887981653213501, "learning_rate": 8.698325744308232e-05, "loss": 1.2116, "num_input_tokens_seen": 29939872, "step": 1860 }, { "epoch": 0.13035978530212683, "grad_norm": 3.796053886413574, "learning_rate": 8.697625919439581e-05, "loss": 1.1678, "num_input_tokens_seen": 29955928, "step": 1861 }, { "epoch": 0.1304298335478561, "grad_norm": 4.5357184410095215, "learning_rate": 8.696926094570928e-05, "loss": 0.9246, "num_input_tokens_seen": 29970760, "step": 1862 }, { "epoch": 0.13049988179358532, "grad_norm": 5.54911994934082, "learning_rate": 8.696226269702277e-05, "loss": 1.1874, "num_input_tokens_seen": 29986408, "step": 1863 }, { "epoch": 0.13056993003931458, "grad_norm": 3.6517300605773926, "learning_rate": 8.695526444833625e-05, "loss": 1.0949, "num_input_tokens_seen": 30002792, "step": 1864 }, { "epoch": 0.1306399782850438, "grad_norm": 3.6885063648223877, "learning_rate": 8.694826619964975e-05, "loss": 1.0027, "num_input_tokens_seen": 30019176, "step": 1865 }, { "epoch": 0.13071002653077307, "grad_norm": 4.417117118835449, "learning_rate": 8.694126795096324e-05, "loss": 1.1017, "num_input_tokens_seen": 30034856, "step": 1866 }, { "epoch": 0.1307800747765023, "grad_norm": 4.070515155792236, "learning_rate": 8.693426970227671e-05, "loss": 1.0393, "num_input_tokens_seen": 30051240, "step": 1867 }, { "epoch": 0.13085012302223156, "grad_norm": 4.135226726531982, "learning_rate": 8.69272714535902e-05, "loss": 1.0886, "num_input_tokens_seen": 30067392, "step": 1868 }, { "epoch": 0.1309201712679608, "grad_norm": 4.304529666900635, "learning_rate": 8.692027320490369e-05, "loss": 1.0851, "num_input_tokens_seen": 30083640, "step": 1869 }, { "epoch": 0.13099021951369005, "grad_norm": 4.633643627166748, "learning_rate": 8.691327495621716e-05, "loss": 1.1934, "num_input_tokens_seen": 30099968, "step": 1870 }, { "epoch": 0.1310602677594193, "grad_norm": 3.6481478214263916, "learning_rate": 8.690627670753065e-05, "loss": 1.0661, "num_input_tokens_seen": 30116352, "step": 1871 }, { "epoch": 0.13113031600514855, "grad_norm": 4.15482234954834, "learning_rate": 8.689927845884414e-05, "loss": 1.1083, "num_input_tokens_seen": 30132256, "step": 1872 }, { "epoch": 0.1312003642508778, "grad_norm": 3.6562340259552, "learning_rate": 8.689228021015763e-05, "loss": 0.9322, "num_input_tokens_seen": 30147520, "step": 1873 }, { "epoch": 0.13127041249660704, "grad_norm": 5.323586463928223, "learning_rate": 8.68852819614711e-05, "loss": 1.4077, "num_input_tokens_seen": 30163880, "step": 1874 }, { "epoch": 0.1313404607423363, "grad_norm": 4.068235397338867, "learning_rate": 8.687828371278459e-05, "loss": 1.144, "num_input_tokens_seen": 30180264, "step": 1875 }, { "epoch": 0.13141050898806553, "grad_norm": 3.743837594985962, "learning_rate": 8.687128546409808e-05, "loss": 0.9754, "num_input_tokens_seen": 30196520, "step": 1876 }, { "epoch": 0.1314805572337948, "grad_norm": 4.344557285308838, "learning_rate": 8.686428721541156e-05, "loss": 1.2745, "num_input_tokens_seen": 30212904, "step": 1877 }, { "epoch": 0.13155060547952402, "grad_norm": 4.048375129699707, "learning_rate": 8.685728896672505e-05, "loss": 1.1916, "num_input_tokens_seen": 30228464, "step": 1878 }, { "epoch": 0.13162065372525328, "grad_norm": 3.893768548965454, "learning_rate": 8.685029071803853e-05, "loss": 1.1462, "num_input_tokens_seen": 30244848, "step": 1879 }, { "epoch": 0.1316907019709825, "grad_norm": 4.469354629516602, "learning_rate": 8.684329246935202e-05, "loss": 1.0267, "num_input_tokens_seen": 30260744, "step": 1880 }, { "epoch": 0.13176075021671177, "grad_norm": 3.8471877574920654, "learning_rate": 8.68362942206655e-05, "loss": 0.8467, "num_input_tokens_seen": 30277128, "step": 1881 }, { "epoch": 0.131830798462441, "grad_norm": 4.37143611907959, "learning_rate": 8.682929597197899e-05, "loss": 0.9103, "num_input_tokens_seen": 30293184, "step": 1882 }, { "epoch": 0.13190084670817026, "grad_norm": 4.4709601402282715, "learning_rate": 8.682229772329247e-05, "loss": 0.9975, "num_input_tokens_seen": 30309568, "step": 1883 }, { "epoch": 0.1319708949538995, "grad_norm": 4.016445159912109, "learning_rate": 8.681529947460595e-05, "loss": 1.1499, "num_input_tokens_seen": 30325952, "step": 1884 }, { "epoch": 0.13204094319962875, "grad_norm": 3.6610453128814697, "learning_rate": 8.680830122591945e-05, "loss": 1.1407, "num_input_tokens_seen": 30341608, "step": 1885 }, { "epoch": 0.13211099144535798, "grad_norm": 4.226510524749756, "learning_rate": 8.680130297723294e-05, "loss": 0.8327, "num_input_tokens_seen": 30357992, "step": 1886 }, { "epoch": 0.13218103969108724, "grad_norm": 4.135020732879639, "learning_rate": 8.679430472854642e-05, "loss": 1.0807, "num_input_tokens_seen": 30373464, "step": 1887 }, { "epoch": 0.13225108793681647, "grad_norm": 3.858785629272461, "learning_rate": 8.67873064798599e-05, "loss": 0.9305, "num_input_tokens_seen": 30389336, "step": 1888 }, { "epoch": 0.13232113618254573, "grad_norm": 3.5424365997314453, "learning_rate": 8.678030823117338e-05, "loss": 1.0885, "num_input_tokens_seen": 30405720, "step": 1889 }, { "epoch": 0.13239118442827497, "grad_norm": 4.177000522613525, "learning_rate": 8.677330998248687e-05, "loss": 1.2172, "num_input_tokens_seen": 30422104, "step": 1890 }, { "epoch": 0.13246123267400423, "grad_norm": 4.08710241317749, "learning_rate": 8.676631173380036e-05, "loss": 1.0063, "num_input_tokens_seen": 30437560, "step": 1891 }, { "epoch": 0.13253128091973346, "grad_norm": 3.889277219772339, "learning_rate": 8.675931348511384e-05, "loss": 1.0227, "num_input_tokens_seen": 30453944, "step": 1892 }, { "epoch": 0.13260132916546272, "grad_norm": 3.7967042922973633, "learning_rate": 8.675231523642733e-05, "loss": 0.8988, "num_input_tokens_seen": 30469480, "step": 1893 }, { "epoch": 0.13267137741119195, "grad_norm": 4.2189202308654785, "learning_rate": 8.674531698774081e-05, "loss": 1.0591, "num_input_tokens_seen": 30485536, "step": 1894 }, { "epoch": 0.1327414256569212, "grad_norm": 4.682656764984131, "learning_rate": 8.67383187390543e-05, "loss": 1.2001, "num_input_tokens_seen": 30501720, "step": 1895 }, { "epoch": 0.13281147390265044, "grad_norm": 4.151151657104492, "learning_rate": 8.673132049036779e-05, "loss": 1.027, "num_input_tokens_seen": 30518104, "step": 1896 }, { "epoch": 0.1328815221483797, "grad_norm": 3.700916290283203, "learning_rate": 8.672432224168126e-05, "loss": 1.0545, "num_input_tokens_seen": 30534488, "step": 1897 }, { "epoch": 0.13295157039410893, "grad_norm": 3.512343406677246, "learning_rate": 8.671732399299475e-05, "loss": 1.0569, "num_input_tokens_seen": 30550872, "step": 1898 }, { "epoch": 0.1330216186398382, "grad_norm": 3.5579488277435303, "learning_rate": 8.671032574430824e-05, "loss": 0.9725, "num_input_tokens_seen": 30567256, "step": 1899 }, { "epoch": 0.13309166688556742, "grad_norm": 3.7006070613861084, "learning_rate": 8.670332749562173e-05, "loss": 0.9628, "num_input_tokens_seen": 30582784, "step": 1900 }, { "epoch": 0.13316171513129668, "grad_norm": 4.373071670532227, "learning_rate": 8.66963292469352e-05, "loss": 1.2223, "num_input_tokens_seen": 30599168, "step": 1901 }, { "epoch": 0.1332317633770259, "grad_norm": 4.459958076477051, "learning_rate": 8.668933099824869e-05, "loss": 1.2149, "num_input_tokens_seen": 30615552, "step": 1902 }, { "epoch": 0.13330181162275517, "grad_norm": 4.919619560241699, "learning_rate": 8.668233274956218e-05, "loss": 1.069, "num_input_tokens_seen": 30631936, "step": 1903 }, { "epoch": 0.1333718598684844, "grad_norm": 3.709568977355957, "learning_rate": 8.667533450087565e-05, "loss": 0.9867, "num_input_tokens_seen": 30648320, "step": 1904 }, { "epoch": 0.13344190811421366, "grad_norm": 4.097365379333496, "learning_rate": 8.666833625218916e-05, "loss": 1.2128, "num_input_tokens_seen": 30664704, "step": 1905 }, { "epoch": 0.13351195635994292, "grad_norm": 4.702358722686768, "learning_rate": 8.666133800350263e-05, "loss": 1.2809, "num_input_tokens_seen": 30681088, "step": 1906 }, { "epoch": 0.13358200460567216, "grad_norm": 3.7732086181640625, "learning_rate": 8.665433975481612e-05, "loss": 1.1529, "num_input_tokens_seen": 30697472, "step": 1907 }, { "epoch": 0.13365205285140142, "grad_norm": 5.318485260009766, "learning_rate": 8.66473415061296e-05, "loss": 1.0414, "num_input_tokens_seen": 30712336, "step": 1908 }, { "epoch": 0.13372210109713065, "grad_norm": 4.364311695098877, "learning_rate": 8.664034325744308e-05, "loss": 1.0634, "num_input_tokens_seen": 30728600, "step": 1909 }, { "epoch": 0.1337921493428599, "grad_norm": 4.860876083374023, "learning_rate": 8.663334500875657e-05, "loss": 1.0945, "num_input_tokens_seen": 30744832, "step": 1910 }, { "epoch": 0.13386219758858914, "grad_norm": 4.455454349517822, "learning_rate": 8.662634676007006e-05, "loss": 1.1765, "num_input_tokens_seen": 30761216, "step": 1911 }, { "epoch": 0.1339322458343184, "grad_norm": 4.70845365524292, "learning_rate": 8.661934851138355e-05, "loss": 1.2774, "num_input_tokens_seen": 30776600, "step": 1912 }, { "epoch": 0.13400229408004763, "grad_norm": 3.9769747257232666, "learning_rate": 8.661235026269704e-05, "loss": 1.006, "num_input_tokens_seen": 30792632, "step": 1913 }, { "epoch": 0.1340723423257769, "grad_norm": 4.387015342712402, "learning_rate": 8.660535201401051e-05, "loss": 1.1839, "num_input_tokens_seen": 30809016, "step": 1914 }, { "epoch": 0.13414239057150612, "grad_norm": 4.786890506744385, "learning_rate": 8.6598353765324e-05, "loss": 1.2352, "num_input_tokens_seen": 30825136, "step": 1915 }, { "epoch": 0.13421243881723538, "grad_norm": 3.502570629119873, "learning_rate": 8.659135551663748e-05, "loss": 1.0175, "num_input_tokens_seen": 30841472, "step": 1916 }, { "epoch": 0.1342824870629646, "grad_norm": 4.2404913902282715, "learning_rate": 8.658435726795096e-05, "loss": 1.1882, "num_input_tokens_seen": 30857856, "step": 1917 }, { "epoch": 0.13435253530869387, "grad_norm": 4.230425834655762, "learning_rate": 8.657735901926445e-05, "loss": 1.098, "num_input_tokens_seen": 30874240, "step": 1918 }, { "epoch": 0.1344225835544231, "grad_norm": 3.9034597873687744, "learning_rate": 8.657036077057794e-05, "loss": 1.0441, "num_input_tokens_seen": 30890560, "step": 1919 }, { "epoch": 0.13449263180015236, "grad_norm": 3.829190492630005, "learning_rate": 8.656336252189143e-05, "loss": 1.0675, "num_input_tokens_seen": 30906480, "step": 1920 }, { "epoch": 0.1345626800458816, "grad_norm": 3.9801993370056152, "learning_rate": 8.65563642732049e-05, "loss": 1.0407, "num_input_tokens_seen": 30922160, "step": 1921 }, { "epoch": 0.13463272829161085, "grad_norm": 5.018815994262695, "learning_rate": 8.65493660245184e-05, "loss": 1.1155, "num_input_tokens_seen": 30938544, "step": 1922 }, { "epoch": 0.13470277653734009, "grad_norm": 3.6515283584594727, "learning_rate": 8.654236777583188e-05, "loss": 1.0436, "num_input_tokens_seen": 30954088, "step": 1923 }, { "epoch": 0.13477282478306934, "grad_norm": 4.440131664276123, "learning_rate": 8.653536952714536e-05, "loss": 1.002, "num_input_tokens_seen": 30970472, "step": 1924 }, { "epoch": 0.13484287302879858, "grad_norm": 5.27577543258667, "learning_rate": 8.652837127845885e-05, "loss": 1.0783, "num_input_tokens_seen": 30985544, "step": 1925 }, { "epoch": 0.13491292127452784, "grad_norm": 4.632978916168213, "learning_rate": 8.652137302977233e-05, "loss": 1.1539, "num_input_tokens_seen": 31001928, "step": 1926 }, { "epoch": 0.13498296952025707, "grad_norm": 3.9239861965179443, "learning_rate": 8.651437478108582e-05, "loss": 1.0231, "num_input_tokens_seen": 31018312, "step": 1927 }, { "epoch": 0.13505301776598633, "grad_norm": 4.819107532501221, "learning_rate": 8.65073765323993e-05, "loss": 1.1631, "num_input_tokens_seen": 31033568, "step": 1928 }, { "epoch": 0.13512306601171556, "grad_norm": 3.5287766456604004, "learning_rate": 8.650037828371279e-05, "loss": 1.0172, "num_input_tokens_seen": 31049952, "step": 1929 }, { "epoch": 0.13519311425744482, "grad_norm": 3.536736488342285, "learning_rate": 8.649338003502628e-05, "loss": 0.9576, "num_input_tokens_seen": 31066336, "step": 1930 }, { "epoch": 0.13526316250317405, "grad_norm": 5.148278713226318, "learning_rate": 8.648638178633976e-05, "loss": 1.2137, "num_input_tokens_seen": 31082136, "step": 1931 }, { "epoch": 0.1353332107489033, "grad_norm": 4.076564788818359, "learning_rate": 8.647938353765325e-05, "loss": 1.081, "num_input_tokens_seen": 31098520, "step": 1932 }, { "epoch": 0.13540325899463254, "grad_norm": 4.747740745544434, "learning_rate": 8.647238528896673e-05, "loss": 1.1989, "num_input_tokens_seen": 31114560, "step": 1933 }, { "epoch": 0.1354733072403618, "grad_norm": 3.662280797958374, "learning_rate": 8.646538704028022e-05, "loss": 1.0797, "num_input_tokens_seen": 31130944, "step": 1934 }, { "epoch": 0.13554335548609103, "grad_norm": 3.8747782707214355, "learning_rate": 8.645838879159369e-05, "loss": 0.9258, "num_input_tokens_seen": 31146544, "step": 1935 }, { "epoch": 0.1356134037318203, "grad_norm": 3.465095281600952, "learning_rate": 8.645139054290718e-05, "loss": 1.0582, "num_input_tokens_seen": 31162928, "step": 1936 }, { "epoch": 0.13568345197754952, "grad_norm": 4.640190124511719, "learning_rate": 8.644439229422067e-05, "loss": 1.1265, "num_input_tokens_seen": 31177712, "step": 1937 }, { "epoch": 0.13575350022327878, "grad_norm": 3.88620924949646, "learning_rate": 8.643739404553416e-05, "loss": 1.0244, "num_input_tokens_seen": 31193640, "step": 1938 }, { "epoch": 0.13582354846900804, "grad_norm": 3.657331705093384, "learning_rate": 8.643039579684765e-05, "loss": 0.9715, "num_input_tokens_seen": 31209112, "step": 1939 }, { "epoch": 0.13589359671473727, "grad_norm": 6.8866448402404785, "learning_rate": 8.642339754816113e-05, "loss": 0.9734, "num_input_tokens_seen": 31223968, "step": 1940 }, { "epoch": 0.13596364496046653, "grad_norm": 5.0794172286987305, "learning_rate": 8.641639929947461e-05, "loss": 1.1988, "num_input_tokens_seen": 31240352, "step": 1941 }, { "epoch": 0.13603369320619577, "grad_norm": 4.631995677947998, "learning_rate": 8.64094010507881e-05, "loss": 1.1814, "num_input_tokens_seen": 31256736, "step": 1942 }, { "epoch": 0.13610374145192503, "grad_norm": 5.566014766693115, "learning_rate": 8.640240280210157e-05, "loss": 1.1769, "num_input_tokens_seen": 31273120, "step": 1943 }, { "epoch": 0.13617378969765426, "grad_norm": 3.940988302230835, "learning_rate": 8.639540455341506e-05, "loss": 1.0196, "num_input_tokens_seen": 31289504, "step": 1944 }, { "epoch": 0.13624383794338352, "grad_norm": 3.9979453086853027, "learning_rate": 8.638840630472855e-05, "loss": 1.0467, "num_input_tokens_seen": 31305888, "step": 1945 }, { "epoch": 0.13631388618911275, "grad_norm": 5.303500175476074, "learning_rate": 8.638140805604204e-05, "loss": 1.0938, "num_input_tokens_seen": 31321512, "step": 1946 }, { "epoch": 0.136383934434842, "grad_norm": 4.6745429039001465, "learning_rate": 8.637440980735553e-05, "loss": 1.3665, "num_input_tokens_seen": 31337896, "step": 1947 }, { "epoch": 0.13645398268057124, "grad_norm": 4.203839302062988, "learning_rate": 8.6367411558669e-05, "loss": 0.8949, "num_input_tokens_seen": 31354176, "step": 1948 }, { "epoch": 0.1365240309263005, "grad_norm": 4.802511215209961, "learning_rate": 8.636041330998249e-05, "loss": 1.2427, "num_input_tokens_seen": 31369976, "step": 1949 }, { "epoch": 0.13659407917202973, "grad_norm": 4.077885627746582, "learning_rate": 8.635341506129598e-05, "loss": 1.1259, "num_input_tokens_seen": 31386360, "step": 1950 }, { "epoch": 0.136664127417759, "grad_norm": 5.009285926818848, "learning_rate": 8.634641681260947e-05, "loss": 1.0278, "num_input_tokens_seen": 31402744, "step": 1951 }, { "epoch": 0.13673417566348822, "grad_norm": 3.539872646331787, "learning_rate": 8.633941856392294e-05, "loss": 1.0522, "num_input_tokens_seen": 31419128, "step": 1952 }, { "epoch": 0.13680422390921748, "grad_norm": 4.664520740509033, "learning_rate": 8.633242031523643e-05, "loss": 1.1559, "num_input_tokens_seen": 31435400, "step": 1953 }, { "epoch": 0.1368742721549467, "grad_norm": 3.8469269275665283, "learning_rate": 8.632542206654992e-05, "loss": 1.1237, "num_input_tokens_seen": 31451408, "step": 1954 }, { "epoch": 0.13694432040067597, "grad_norm": 4.064670085906982, "learning_rate": 8.63184238178634e-05, "loss": 0.8825, "num_input_tokens_seen": 31467504, "step": 1955 }, { "epoch": 0.1370143686464052, "grad_norm": 3.9931817054748535, "learning_rate": 8.631142556917688e-05, "loss": 1.17, "num_input_tokens_seen": 31483528, "step": 1956 }, { "epoch": 0.13708441689213446, "grad_norm": 4.136581897735596, "learning_rate": 8.630442732049037e-05, "loss": 1.069, "num_input_tokens_seen": 31499912, "step": 1957 }, { "epoch": 0.1371544651378637, "grad_norm": 3.7189536094665527, "learning_rate": 8.629742907180386e-05, "loss": 1.0509, "num_input_tokens_seen": 31515560, "step": 1958 }, { "epoch": 0.13722451338359296, "grad_norm": 3.7821719646453857, "learning_rate": 8.629043082311735e-05, "loss": 1.0583, "num_input_tokens_seen": 31531944, "step": 1959 }, { "epoch": 0.1372945616293222, "grad_norm": 6.815886497497559, "learning_rate": 8.628343257443082e-05, "loss": 0.9118, "num_input_tokens_seen": 31548248, "step": 1960 }, { "epoch": 0.13736460987505145, "grad_norm": 7.490451812744141, "learning_rate": 8.627643432574431e-05, "loss": 1.1145, "num_input_tokens_seen": 31562560, "step": 1961 }, { "epoch": 0.13743465812078068, "grad_norm": 4.918768405914307, "learning_rate": 8.626943607705779e-05, "loss": 1.2198, "num_input_tokens_seen": 31578944, "step": 1962 }, { "epoch": 0.13750470636650994, "grad_norm": 5.567696571350098, "learning_rate": 8.626243782837128e-05, "loss": 1.1083, "num_input_tokens_seen": 31594312, "step": 1963 }, { "epoch": 0.13757475461223917, "grad_norm": 4.24015474319458, "learning_rate": 8.625543957968477e-05, "loss": 1.1807, "num_input_tokens_seen": 31609656, "step": 1964 }, { "epoch": 0.13764480285796843, "grad_norm": 5.664759635925293, "learning_rate": 8.624844133099825e-05, "loss": 1.1775, "num_input_tokens_seen": 31626040, "step": 1965 }, { "epoch": 0.13771485110369766, "grad_norm": 3.7281267642974854, "learning_rate": 8.624144308231174e-05, "loss": 1.0994, "num_input_tokens_seen": 31642424, "step": 1966 }, { "epoch": 0.13778489934942692, "grad_norm": 4.112753391265869, "learning_rate": 8.623444483362523e-05, "loss": 1.2113, "num_input_tokens_seen": 31658808, "step": 1967 }, { "epoch": 0.13785494759515615, "grad_norm": 3.8851754665374756, "learning_rate": 8.62274465849387e-05, "loss": 1.0596, "num_input_tokens_seen": 31675192, "step": 1968 }, { "epoch": 0.1379249958408854, "grad_norm": 4.161825656890869, "learning_rate": 8.62204483362522e-05, "loss": 1.03, "num_input_tokens_seen": 31691576, "step": 1969 }, { "epoch": 0.13799504408661464, "grad_norm": 4.802804470062256, "learning_rate": 8.621345008756567e-05, "loss": 1.4374, "num_input_tokens_seen": 31707960, "step": 1970 }, { "epoch": 0.1380650923323439, "grad_norm": 3.752012252807617, "learning_rate": 8.620645183887917e-05, "loss": 1.124, "num_input_tokens_seen": 31724344, "step": 1971 }, { "epoch": 0.13813514057807313, "grad_norm": 3.8039815425872803, "learning_rate": 8.619945359019265e-05, "loss": 1.0051, "num_input_tokens_seen": 31740456, "step": 1972 }, { "epoch": 0.1382051888238024, "grad_norm": 4.029634952545166, "learning_rate": 8.619245534150614e-05, "loss": 1.2221, "num_input_tokens_seen": 31756776, "step": 1973 }, { "epoch": 0.13827523706953165, "grad_norm": 5.531665802001953, "learning_rate": 8.618545709281962e-05, "loss": 1.1534, "num_input_tokens_seen": 31772480, "step": 1974 }, { "epoch": 0.13834528531526089, "grad_norm": 4.6494646072387695, "learning_rate": 8.61784588441331e-05, "loss": 0.9723, "num_input_tokens_seen": 31788504, "step": 1975 }, { "epoch": 0.13841533356099014, "grad_norm": 4.201340675354004, "learning_rate": 8.617146059544659e-05, "loss": 1.0648, "num_input_tokens_seen": 31804888, "step": 1976 }, { "epoch": 0.13848538180671938, "grad_norm": 4.272038459777832, "learning_rate": 8.616446234676008e-05, "loss": 1.2557, "num_input_tokens_seen": 31821272, "step": 1977 }, { "epoch": 0.13855543005244864, "grad_norm": 3.729841947555542, "learning_rate": 8.615746409807357e-05, "loss": 1.0346, "num_input_tokens_seen": 31837656, "step": 1978 }, { "epoch": 0.13862547829817787, "grad_norm": 3.5615944862365723, "learning_rate": 8.615046584938704e-05, "loss": 0.9986, "num_input_tokens_seen": 31854040, "step": 1979 }, { "epoch": 0.13869552654390713, "grad_norm": 3.7658376693725586, "learning_rate": 8.614346760070053e-05, "loss": 1.3268, "num_input_tokens_seen": 31870424, "step": 1980 }, { "epoch": 0.13876557478963636, "grad_norm": 4.124275207519531, "learning_rate": 8.613646935201402e-05, "loss": 1.2736, "num_input_tokens_seen": 31886808, "step": 1981 }, { "epoch": 0.13883562303536562, "grad_norm": 5.348685264587402, "learning_rate": 8.612947110332749e-05, "loss": 1.0492, "num_input_tokens_seen": 31902880, "step": 1982 }, { "epoch": 0.13890567128109485, "grad_norm": 5.311651706695557, "learning_rate": 8.612247285464098e-05, "loss": 1.2034, "num_input_tokens_seen": 31918704, "step": 1983 }, { "epoch": 0.1389757195268241, "grad_norm": 4.194555759429932, "learning_rate": 8.611547460595447e-05, "loss": 1.2802, "num_input_tokens_seen": 31935088, "step": 1984 }, { "epoch": 0.13904576777255334, "grad_norm": 3.6576390266418457, "learning_rate": 8.610847635726796e-05, "loss": 1.0618, "num_input_tokens_seen": 31951472, "step": 1985 }, { "epoch": 0.1391158160182826, "grad_norm": 4.169801235198975, "learning_rate": 8.610147810858145e-05, "loss": 1.1668, "num_input_tokens_seen": 31967856, "step": 1986 }, { "epoch": 0.13918586426401183, "grad_norm": 3.79791259765625, "learning_rate": 8.609447985989492e-05, "loss": 1.2546, "num_input_tokens_seen": 31984232, "step": 1987 }, { "epoch": 0.1392559125097411, "grad_norm": 3.726701021194458, "learning_rate": 8.608748161120841e-05, "loss": 1.177, "num_input_tokens_seen": 31999720, "step": 1988 }, { "epoch": 0.13932596075547032, "grad_norm": 3.7376129627227783, "learning_rate": 8.608048336252189e-05, "loss": 1.0174, "num_input_tokens_seen": 32016104, "step": 1989 }, { "epoch": 0.13939600900119958, "grad_norm": 4.290423393249512, "learning_rate": 8.607348511383537e-05, "loss": 1.1556, "num_input_tokens_seen": 32031992, "step": 1990 }, { "epoch": 0.13946605724692882, "grad_norm": 3.592384099960327, "learning_rate": 8.606648686514888e-05, "loss": 1.0629, "num_input_tokens_seen": 32047904, "step": 1991 }, { "epoch": 0.13953610549265807, "grad_norm": 3.753692626953125, "learning_rate": 8.605948861646235e-05, "loss": 1.0111, "num_input_tokens_seen": 32063720, "step": 1992 }, { "epoch": 0.1396061537383873, "grad_norm": 4.698465347290039, "learning_rate": 8.605249036777584e-05, "loss": 1.0255, "num_input_tokens_seen": 32079768, "step": 1993 }, { "epoch": 0.13967620198411657, "grad_norm": 4.187407970428467, "learning_rate": 8.604549211908933e-05, "loss": 0.9006, "num_input_tokens_seen": 32095120, "step": 1994 }, { "epoch": 0.1397462502298458, "grad_norm": 4.256275653839111, "learning_rate": 8.60384938704028e-05, "loss": 1.1607, "num_input_tokens_seen": 32111408, "step": 1995 }, { "epoch": 0.13981629847557506, "grad_norm": 6.693331241607666, "learning_rate": 8.603149562171629e-05, "loss": 1.2317, "num_input_tokens_seen": 32127792, "step": 1996 }, { "epoch": 0.1398863467213043, "grad_norm": 3.69393253326416, "learning_rate": 8.602449737302978e-05, "loss": 0.9747, "num_input_tokens_seen": 32143792, "step": 1997 }, { "epoch": 0.13995639496703355, "grad_norm": 4.117836952209473, "learning_rate": 8.601749912434327e-05, "loss": 1.0732, "num_input_tokens_seen": 32158624, "step": 1998 }, { "epoch": 0.14002644321276278, "grad_norm": 5.14541482925415, "learning_rate": 8.601050087565674e-05, "loss": 1.1787, "num_input_tokens_seen": 32175008, "step": 1999 }, { "epoch": 0.14009649145849204, "grad_norm": 4.0103535652160645, "learning_rate": 8.600350262697023e-05, "loss": 1.1308, "num_input_tokens_seen": 32191392, "step": 2000 }, { "epoch": 0.14009649145849204, "eval_loss": 1.1461617946624756, "eval_runtime": 0.1945, "eval_samples_per_second": 5.141, "eval_steps_per_second": 5.141, "num_input_tokens_seen": 32191392, "step": 2000 }, { "epoch": 0.14016653970422127, "grad_norm": 3.8072049617767334, "learning_rate": 8.599650437828372e-05, "loss": 0.8617, "num_input_tokens_seen": 32207712, "step": 2001 }, { "epoch": 0.14023658794995053, "grad_norm": 4.034494400024414, "learning_rate": 8.59895061295972e-05, "loss": 1.1719, "num_input_tokens_seen": 32223440, "step": 2002 }, { "epoch": 0.14030663619567976, "grad_norm": 3.9485251903533936, "learning_rate": 8.598250788091069e-05, "loss": 1.2242, "num_input_tokens_seen": 32239824, "step": 2003 }, { "epoch": 0.14037668444140902, "grad_norm": 5.427109241485596, "learning_rate": 8.597550963222417e-05, "loss": 1.1922, "num_input_tokens_seen": 32255976, "step": 2004 }, { "epoch": 0.14044673268713825, "grad_norm": 4.4832000732421875, "learning_rate": 8.596851138353766e-05, "loss": 1.2791, "num_input_tokens_seen": 32272304, "step": 2005 }, { "epoch": 0.1405167809328675, "grad_norm": 4.4699859619140625, "learning_rate": 8.596151313485114e-05, "loss": 1.0175, "num_input_tokens_seen": 32288688, "step": 2006 }, { "epoch": 0.14058682917859674, "grad_norm": 6.007316589355469, "learning_rate": 8.595451488616463e-05, "loss": 1.2402, "num_input_tokens_seen": 32304992, "step": 2007 }, { "epoch": 0.140656877424326, "grad_norm": 5.460748195648193, "learning_rate": 8.594751663747811e-05, "loss": 1.2683, "num_input_tokens_seen": 32320104, "step": 2008 }, { "epoch": 0.14072692567005526, "grad_norm": 4.430675029754639, "learning_rate": 8.594051838879159e-05, "loss": 0.9664, "num_input_tokens_seen": 32336040, "step": 2009 }, { "epoch": 0.1407969739157845, "grad_norm": 4.469089508056641, "learning_rate": 8.593352014010508e-05, "loss": 1.0335, "num_input_tokens_seen": 32352424, "step": 2010 }, { "epoch": 0.14086702216151376, "grad_norm": 4.94099760055542, "learning_rate": 8.592652189141858e-05, "loss": 1.1091, "num_input_tokens_seen": 32367944, "step": 2011 }, { "epoch": 0.140937070407243, "grad_norm": 5.430322170257568, "learning_rate": 8.591952364273206e-05, "loss": 1.2256, "num_input_tokens_seen": 32384328, "step": 2012 }, { "epoch": 0.14100711865297225, "grad_norm": 3.847569704055786, "learning_rate": 8.591252539404554e-05, "loss": 1.019, "num_input_tokens_seen": 32400712, "step": 2013 }, { "epoch": 0.14107716689870148, "grad_norm": 3.7531189918518066, "learning_rate": 8.590552714535902e-05, "loss": 0.9409, "num_input_tokens_seen": 32417096, "step": 2014 }, { "epoch": 0.14114721514443074, "grad_norm": 4.070606708526611, "learning_rate": 8.589852889667251e-05, "loss": 1.0857, "num_input_tokens_seen": 32432504, "step": 2015 }, { "epoch": 0.14121726339015997, "grad_norm": 4.791952610015869, "learning_rate": 8.589153064798598e-05, "loss": 0.8467, "num_input_tokens_seen": 32448008, "step": 2016 }, { "epoch": 0.14128731163588923, "grad_norm": 4.672977924346924, "learning_rate": 8.588453239929948e-05, "loss": 1.081, "num_input_tokens_seen": 32463792, "step": 2017 }, { "epoch": 0.14135735988161846, "grad_norm": 6.187239170074463, "learning_rate": 8.587753415061297e-05, "loss": 1.0416, "num_input_tokens_seen": 32480104, "step": 2018 }, { "epoch": 0.14142740812734772, "grad_norm": 4.058189392089844, "learning_rate": 8.587053590192645e-05, "loss": 1.0598, "num_input_tokens_seen": 32495824, "step": 2019 }, { "epoch": 0.14149745637307695, "grad_norm": 3.862661838531494, "learning_rate": 8.586353765323994e-05, "loss": 0.9371, "num_input_tokens_seen": 32512208, "step": 2020 }, { "epoch": 0.1415675046188062, "grad_norm": 3.7348716259002686, "learning_rate": 8.585653940455343e-05, "loss": 1.1021, "num_input_tokens_seen": 32528592, "step": 2021 }, { "epoch": 0.14163755286453544, "grad_norm": 4.405923843383789, "learning_rate": 8.58495411558669e-05, "loss": 1.1405, "num_input_tokens_seen": 32544120, "step": 2022 }, { "epoch": 0.1417076011102647, "grad_norm": 3.73984694480896, "learning_rate": 8.584254290718039e-05, "loss": 1.0797, "num_input_tokens_seen": 32560504, "step": 2023 }, { "epoch": 0.14177764935599393, "grad_norm": 5.73613166809082, "learning_rate": 8.583554465849388e-05, "loss": 1.2119, "num_input_tokens_seen": 32576888, "step": 2024 }, { "epoch": 0.1418476976017232, "grad_norm": 6.435116291046143, "learning_rate": 8.582854640980737e-05, "loss": 1.3408, "num_input_tokens_seen": 32591592, "step": 2025 }, { "epoch": 0.14191774584745243, "grad_norm": 4.520002365112305, "learning_rate": 8.582154816112084e-05, "loss": 1.1654, "num_input_tokens_seen": 32607448, "step": 2026 }, { "epoch": 0.14198779409318169, "grad_norm": 4.01891565322876, "learning_rate": 8.581454991243433e-05, "loss": 1.1203, "num_input_tokens_seen": 32623104, "step": 2027 }, { "epoch": 0.14205784233891092, "grad_norm": 3.8237030506134033, "learning_rate": 8.580755166374782e-05, "loss": 1.0548, "num_input_tokens_seen": 32639376, "step": 2028 }, { "epoch": 0.14212789058464018, "grad_norm": 4.893499851226807, "learning_rate": 8.58005534150613e-05, "loss": 1.1629, "num_input_tokens_seen": 32654800, "step": 2029 }, { "epoch": 0.1421979388303694, "grad_norm": 3.6075315475463867, "learning_rate": 8.579355516637478e-05, "loss": 1.0889, "num_input_tokens_seen": 32671184, "step": 2030 }, { "epoch": 0.14226798707609867, "grad_norm": 4.696410179138184, "learning_rate": 8.578655691768827e-05, "loss": 1.1777, "num_input_tokens_seen": 32687360, "step": 2031 }, { "epoch": 0.1423380353218279, "grad_norm": 3.9465558528900146, "learning_rate": 8.577955866900176e-05, "loss": 1.1378, "num_input_tokens_seen": 32703744, "step": 2032 }, { "epoch": 0.14240808356755716, "grad_norm": 3.933898448944092, "learning_rate": 8.577256042031523e-05, "loss": 0.8353, "num_input_tokens_seen": 32720128, "step": 2033 }, { "epoch": 0.1424781318132864, "grad_norm": 3.865894317626953, "learning_rate": 8.576556217162872e-05, "loss": 0.9827, "num_input_tokens_seen": 32735976, "step": 2034 }, { "epoch": 0.14254818005901565, "grad_norm": 3.9533474445343018, "learning_rate": 8.575856392294221e-05, "loss": 1.1028, "num_input_tokens_seen": 32752240, "step": 2035 }, { "epoch": 0.14261822830474488, "grad_norm": 3.5534164905548096, "learning_rate": 8.575156567425569e-05, "loss": 1.0887, "num_input_tokens_seen": 32768624, "step": 2036 }, { "epoch": 0.14268827655047414, "grad_norm": 5.689724922180176, "learning_rate": 8.574456742556918e-05, "loss": 1.0588, "num_input_tokens_seen": 32784600, "step": 2037 }, { "epoch": 0.14275832479620337, "grad_norm": 4.010136604309082, "learning_rate": 8.573756917688268e-05, "loss": 0.8989, "num_input_tokens_seen": 32799824, "step": 2038 }, { "epoch": 0.14282837304193263, "grad_norm": 4.153547763824463, "learning_rate": 8.573057092819615e-05, "loss": 1.162, "num_input_tokens_seen": 32815744, "step": 2039 }, { "epoch": 0.14289842128766186, "grad_norm": 3.976120948791504, "learning_rate": 8.572357267950964e-05, "loss": 1.2234, "num_input_tokens_seen": 32831664, "step": 2040 }, { "epoch": 0.14296846953339112, "grad_norm": 3.9593231678009033, "learning_rate": 8.571657443082312e-05, "loss": 1.0482, "num_input_tokens_seen": 32848048, "step": 2041 }, { "epoch": 0.14303851777912036, "grad_norm": 3.920823097229004, "learning_rate": 8.57095761821366e-05, "loss": 1.1891, "num_input_tokens_seen": 32863168, "step": 2042 }, { "epoch": 0.14310856602484961, "grad_norm": 4.754055976867676, "learning_rate": 8.57025779334501e-05, "loss": 1.1123, "num_input_tokens_seen": 32879552, "step": 2043 }, { "epoch": 0.14317861427057887, "grad_norm": 3.6835105419158936, "learning_rate": 8.569557968476358e-05, "loss": 1.0919, "num_input_tokens_seen": 32895864, "step": 2044 }, { "epoch": 0.1432486625163081, "grad_norm": 4.115698337554932, "learning_rate": 8.568858143607707e-05, "loss": 1.166, "num_input_tokens_seen": 32912232, "step": 2045 }, { "epoch": 0.14331871076203737, "grad_norm": 6.536626815795898, "learning_rate": 8.568158318739055e-05, "loss": 1.1534, "num_input_tokens_seen": 32928616, "step": 2046 }, { "epoch": 0.1433887590077666, "grad_norm": 5.43113899230957, "learning_rate": 8.567458493870403e-05, "loss": 0.9645, "num_input_tokens_seen": 32945000, "step": 2047 }, { "epoch": 0.14345880725349586, "grad_norm": 3.8677239418029785, "learning_rate": 8.566758669001752e-05, "loss": 1.2213, "num_input_tokens_seen": 32961384, "step": 2048 }, { "epoch": 0.1435288554992251, "grad_norm": 6.913444995880127, "learning_rate": 8.5660588441331e-05, "loss": 1.2204, "num_input_tokens_seen": 32977768, "step": 2049 }, { "epoch": 0.14359890374495435, "grad_norm": 4.870579719543457, "learning_rate": 8.565359019264449e-05, "loss": 1.1022, "num_input_tokens_seen": 32994152, "step": 2050 }, { "epoch": 0.14366895199068358, "grad_norm": 4.057044982910156, "learning_rate": 8.564659194395797e-05, "loss": 1.0599, "num_input_tokens_seen": 33010536, "step": 2051 }, { "epoch": 0.14373900023641284, "grad_norm": 8.405828475952148, "learning_rate": 8.563959369527146e-05, "loss": 1.0928, "num_input_tokens_seen": 33025192, "step": 2052 }, { "epoch": 0.14380904848214207, "grad_norm": 4.188510894775391, "learning_rate": 8.563259544658494e-05, "loss": 1.1207, "num_input_tokens_seen": 33041576, "step": 2053 }, { "epoch": 0.14387909672787133, "grad_norm": 6.505815505981445, "learning_rate": 8.562559719789843e-05, "loss": 1.1196, "num_input_tokens_seen": 33057800, "step": 2054 }, { "epoch": 0.14394914497360056, "grad_norm": 4.021209716796875, "learning_rate": 8.561859894921192e-05, "loss": 0.953, "num_input_tokens_seen": 33073872, "step": 2055 }, { "epoch": 0.14401919321932982, "grad_norm": 3.924671173095703, "learning_rate": 8.561160070052539e-05, "loss": 1.022, "num_input_tokens_seen": 33090256, "step": 2056 }, { "epoch": 0.14408924146505905, "grad_norm": 4.1323418617248535, "learning_rate": 8.560460245183888e-05, "loss": 1.0995, "num_input_tokens_seen": 33106256, "step": 2057 }, { "epoch": 0.1441592897107883, "grad_norm": 4.236043930053711, "learning_rate": 8.559760420315237e-05, "loss": 1.0842, "num_input_tokens_seen": 33122352, "step": 2058 }, { "epoch": 0.14422933795651754, "grad_norm": 3.4836020469665527, "learning_rate": 8.559060595446586e-05, "loss": 1.0136, "num_input_tokens_seen": 33138736, "step": 2059 }, { "epoch": 0.1442993862022468, "grad_norm": 4.363439083099365, "learning_rate": 8.558360770577933e-05, "loss": 1.1382, "num_input_tokens_seen": 33153936, "step": 2060 }, { "epoch": 0.14436943444797604, "grad_norm": 5.099925994873047, "learning_rate": 8.557660945709282e-05, "loss": 1.0027, "num_input_tokens_seen": 33170320, "step": 2061 }, { "epoch": 0.1444394826937053, "grad_norm": 4.438295364379883, "learning_rate": 8.556961120840631e-05, "loss": 1.11, "num_input_tokens_seen": 33186704, "step": 2062 }, { "epoch": 0.14450953093943453, "grad_norm": 3.7912747859954834, "learning_rate": 8.556261295971978e-05, "loss": 1.0708, "num_input_tokens_seen": 33203088, "step": 2063 }, { "epoch": 0.1445795791851638, "grad_norm": 4.679794788360596, "learning_rate": 8.555561471103329e-05, "loss": 0.9707, "num_input_tokens_seen": 33218936, "step": 2064 }, { "epoch": 0.14464962743089302, "grad_norm": 4.092919826507568, "learning_rate": 8.554861646234677e-05, "loss": 1.2103, "num_input_tokens_seen": 33235320, "step": 2065 }, { "epoch": 0.14471967567662228, "grad_norm": 4.13189172744751, "learning_rate": 8.554161821366025e-05, "loss": 0.9919, "num_input_tokens_seen": 33251704, "step": 2066 }, { "epoch": 0.1447897239223515, "grad_norm": 3.618739366531372, "learning_rate": 8.553461996497374e-05, "loss": 1.0026, "num_input_tokens_seen": 33268088, "step": 2067 }, { "epoch": 0.14485977216808077, "grad_norm": 4.197813034057617, "learning_rate": 8.552762171628721e-05, "loss": 1.3134, "num_input_tokens_seen": 33284472, "step": 2068 }, { "epoch": 0.14492982041381, "grad_norm": 4.159245491027832, "learning_rate": 8.55206234676007e-05, "loss": 0.9579, "num_input_tokens_seen": 33300560, "step": 2069 }, { "epoch": 0.14499986865953926, "grad_norm": 3.970898389816284, "learning_rate": 8.551362521891419e-05, "loss": 1.0587, "num_input_tokens_seen": 33316744, "step": 2070 }, { "epoch": 0.1450699169052685, "grad_norm": 5.635775089263916, "learning_rate": 8.550662697022768e-05, "loss": 1.2284, "num_input_tokens_seen": 33333128, "step": 2071 }, { "epoch": 0.14513996515099775, "grad_norm": 5.239542007446289, "learning_rate": 8.549962872154117e-05, "loss": 1.166, "num_input_tokens_seen": 33348392, "step": 2072 }, { "epoch": 0.14521001339672698, "grad_norm": 3.8646957874298096, "learning_rate": 8.549263047285464e-05, "loss": 1.0343, "num_input_tokens_seen": 33364504, "step": 2073 }, { "epoch": 0.14528006164245624, "grad_norm": 4.49400520324707, "learning_rate": 8.548563222416813e-05, "loss": 0.9953, "num_input_tokens_seen": 33379680, "step": 2074 }, { "epoch": 0.14535010988818547, "grad_norm": 3.782107353210449, "learning_rate": 8.547863397548162e-05, "loss": 1.1396, "num_input_tokens_seen": 33396064, "step": 2075 }, { "epoch": 0.14542015813391473, "grad_norm": 3.8171703815460205, "learning_rate": 8.54716357267951e-05, "loss": 1.1364, "num_input_tokens_seen": 33411640, "step": 2076 }, { "epoch": 0.14549020637964397, "grad_norm": 3.56487774848938, "learning_rate": 8.546463747810858e-05, "loss": 1.0396, "num_input_tokens_seen": 33428024, "step": 2077 }, { "epoch": 0.14556025462537323, "grad_norm": 5.169209003448486, "learning_rate": 8.545763922942207e-05, "loss": 1.1262, "num_input_tokens_seen": 33444408, "step": 2078 }, { "epoch": 0.14563030287110248, "grad_norm": 3.718086004257202, "learning_rate": 8.545064098073556e-05, "loss": 1.0769, "num_input_tokens_seen": 33460416, "step": 2079 }, { "epoch": 0.14570035111683172, "grad_norm": 4.2451372146606445, "learning_rate": 8.544364273204904e-05, "loss": 1.0298, "num_input_tokens_seen": 33476800, "step": 2080 }, { "epoch": 0.14577039936256098, "grad_norm": 3.7441632747650146, "learning_rate": 8.543664448336252e-05, "loss": 0.9785, "num_input_tokens_seen": 33492536, "step": 2081 }, { "epoch": 0.1458404476082902, "grad_norm": 3.8453383445739746, "learning_rate": 8.542964623467601e-05, "loss": 1.2527, "num_input_tokens_seen": 33508920, "step": 2082 }, { "epoch": 0.14591049585401947, "grad_norm": 3.6744494438171387, "learning_rate": 8.542264798598949e-05, "loss": 1.0739, "num_input_tokens_seen": 33525304, "step": 2083 }, { "epoch": 0.1459805440997487, "grad_norm": 4.209956645965576, "learning_rate": 8.541564973730299e-05, "loss": 1.1538, "num_input_tokens_seen": 33541544, "step": 2084 }, { "epoch": 0.14605059234547796, "grad_norm": 4.347019672393799, "learning_rate": 8.540865148861647e-05, "loss": 1.1078, "num_input_tokens_seen": 33557928, "step": 2085 }, { "epoch": 0.1461206405912072, "grad_norm": 5.323390483856201, "learning_rate": 8.540165323992995e-05, "loss": 1.0776, "num_input_tokens_seen": 33573152, "step": 2086 }, { "epoch": 0.14619068883693645, "grad_norm": 3.632425308227539, "learning_rate": 8.539465499124343e-05, "loss": 1.0595, "num_input_tokens_seen": 33588848, "step": 2087 }, { "epoch": 0.14626073708266568, "grad_norm": 4.460893154144287, "learning_rate": 8.538765674255692e-05, "loss": 1.1131, "num_input_tokens_seen": 33604984, "step": 2088 }, { "epoch": 0.14633078532839494, "grad_norm": 4.059104919433594, "learning_rate": 8.53806584938704e-05, "loss": 1.1818, "num_input_tokens_seen": 33620384, "step": 2089 }, { "epoch": 0.14640083357412417, "grad_norm": 6.023964881896973, "learning_rate": 8.53736602451839e-05, "loss": 1.1589, "num_input_tokens_seen": 33636416, "step": 2090 }, { "epoch": 0.14647088181985343, "grad_norm": 4.462921619415283, "learning_rate": 8.536666199649738e-05, "loss": 0.9362, "num_input_tokens_seen": 33652504, "step": 2091 }, { "epoch": 0.14654093006558266, "grad_norm": 4.003902435302734, "learning_rate": 8.535966374781087e-05, "loss": 1.1062, "num_input_tokens_seen": 33668888, "step": 2092 }, { "epoch": 0.14661097831131192, "grad_norm": 4.161351680755615, "learning_rate": 8.535266549912435e-05, "loss": 1.0252, "num_input_tokens_seen": 33685272, "step": 2093 }, { "epoch": 0.14668102655704116, "grad_norm": 4.424163341522217, "learning_rate": 8.534566725043784e-05, "loss": 1.0225, "num_input_tokens_seen": 33700872, "step": 2094 }, { "epoch": 0.14675107480277041, "grad_norm": 4.2255072593688965, "learning_rate": 8.533866900175131e-05, "loss": 1.2044, "num_input_tokens_seen": 33717256, "step": 2095 }, { "epoch": 0.14682112304849965, "grad_norm": 4.204975605010986, "learning_rate": 8.53316707530648e-05, "loss": 1.1861, "num_input_tokens_seen": 33732544, "step": 2096 }, { "epoch": 0.1468911712942289, "grad_norm": 3.7058298587799072, "learning_rate": 8.532467250437829e-05, "loss": 1.1568, "num_input_tokens_seen": 33748928, "step": 2097 }, { "epoch": 0.14696121953995814, "grad_norm": 6.157133102416992, "learning_rate": 8.531767425569178e-05, "loss": 1.0704, "num_input_tokens_seen": 33765312, "step": 2098 }, { "epoch": 0.1470312677856874, "grad_norm": 4.3684210777282715, "learning_rate": 8.531067600700526e-05, "loss": 1.0977, "num_input_tokens_seen": 33781552, "step": 2099 }, { "epoch": 0.14710131603141663, "grad_norm": 3.957848310470581, "learning_rate": 8.530367775831874e-05, "loss": 1.1412, "num_input_tokens_seen": 33797464, "step": 2100 }, { "epoch": 0.1471713642771459, "grad_norm": 4.9368486404418945, "learning_rate": 8.529667950963223e-05, "loss": 0.9986, "num_input_tokens_seen": 33812672, "step": 2101 }, { "epoch": 0.14724141252287512, "grad_norm": 3.8515660762786865, "learning_rate": 8.528968126094572e-05, "loss": 0.8715, "num_input_tokens_seen": 33829024, "step": 2102 }, { "epoch": 0.14731146076860438, "grad_norm": 3.961448907852173, "learning_rate": 8.528268301225919e-05, "loss": 1.1416, "num_input_tokens_seen": 33845408, "step": 2103 }, { "epoch": 0.1473815090143336, "grad_norm": 4.101677894592285, "learning_rate": 8.52756847635727e-05, "loss": 1.001, "num_input_tokens_seen": 33861240, "step": 2104 }, { "epoch": 0.14745155726006287, "grad_norm": 3.886634349822998, "learning_rate": 8.526868651488617e-05, "loss": 1.1546, "num_input_tokens_seen": 33876832, "step": 2105 }, { "epoch": 0.1475216055057921, "grad_norm": 3.7241156101226807, "learning_rate": 8.526168826619966e-05, "loss": 1.2116, "num_input_tokens_seen": 33893216, "step": 2106 }, { "epoch": 0.14759165375152136, "grad_norm": 3.829458236694336, "learning_rate": 8.525469001751313e-05, "loss": 0.8324, "num_input_tokens_seen": 33909224, "step": 2107 }, { "epoch": 0.1476617019972506, "grad_norm": 5.275660514831543, "learning_rate": 8.524769176882662e-05, "loss": 1.0253, "num_input_tokens_seen": 33924768, "step": 2108 }, { "epoch": 0.14773175024297985, "grad_norm": 4.207718372344971, "learning_rate": 8.524069352014011e-05, "loss": 1.1871, "num_input_tokens_seen": 33940288, "step": 2109 }, { "epoch": 0.14780179848870909, "grad_norm": 4.205242156982422, "learning_rate": 8.52336952714536e-05, "loss": 1.0834, "num_input_tokens_seen": 33956512, "step": 2110 }, { "epoch": 0.14787184673443834, "grad_norm": 4.365423202514648, "learning_rate": 8.522669702276709e-05, "loss": 1.2043, "num_input_tokens_seen": 33972896, "step": 2111 }, { "epoch": 0.1479418949801676, "grad_norm": 4.416136741638184, "learning_rate": 8.521969877408056e-05, "loss": 1.001, "num_input_tokens_seen": 33989280, "step": 2112 }, { "epoch": 0.14801194322589684, "grad_norm": 4.375226974487305, "learning_rate": 8.521270052539405e-05, "loss": 1.1186, "num_input_tokens_seen": 34005664, "step": 2113 }, { "epoch": 0.1480819914716261, "grad_norm": 5.2603840827941895, "learning_rate": 8.520570227670753e-05, "loss": 1.0723, "num_input_tokens_seen": 34021576, "step": 2114 }, { "epoch": 0.14815203971735533, "grad_norm": 4.02445125579834, "learning_rate": 8.519870402802101e-05, "loss": 1.11, "num_input_tokens_seen": 34037960, "step": 2115 }, { "epoch": 0.1482220879630846, "grad_norm": 3.6527910232543945, "learning_rate": 8.51917057793345e-05, "loss": 1.0293, "num_input_tokens_seen": 34053240, "step": 2116 }, { "epoch": 0.14829213620881382, "grad_norm": 4.170680999755859, "learning_rate": 8.518470753064799e-05, "loss": 1.2068, "num_input_tokens_seen": 34068896, "step": 2117 }, { "epoch": 0.14836218445454308, "grad_norm": 4.366664886474609, "learning_rate": 8.517770928196148e-05, "loss": 0.9541, "num_input_tokens_seen": 34085280, "step": 2118 }, { "epoch": 0.1484322327002723, "grad_norm": 3.50757098197937, "learning_rate": 8.517071103327497e-05, "loss": 0.9992, "num_input_tokens_seen": 34101664, "step": 2119 }, { "epoch": 0.14850228094600157, "grad_norm": 4.607417106628418, "learning_rate": 8.516371278458844e-05, "loss": 1.1974, "num_input_tokens_seen": 34117752, "step": 2120 }, { "epoch": 0.1485723291917308, "grad_norm": 3.959874391555786, "learning_rate": 8.515671453590193e-05, "loss": 0.9902, "num_input_tokens_seen": 34133576, "step": 2121 }, { "epoch": 0.14864237743746006, "grad_norm": 4.708366870880127, "learning_rate": 8.514971628721541e-05, "loss": 1.1201, "num_input_tokens_seen": 34149952, "step": 2122 }, { "epoch": 0.1487124256831893, "grad_norm": 3.6237339973449707, "learning_rate": 8.51427180385289e-05, "loss": 1.1091, "num_input_tokens_seen": 34166336, "step": 2123 }, { "epoch": 0.14878247392891855, "grad_norm": 4.606329917907715, "learning_rate": 8.513571978984238e-05, "loss": 1.0986, "num_input_tokens_seen": 34181128, "step": 2124 }, { "epoch": 0.14885252217464778, "grad_norm": 4.559760093688965, "learning_rate": 8.512872154115587e-05, "loss": 1.1022, "num_input_tokens_seen": 34197512, "step": 2125 }, { "epoch": 0.14892257042037704, "grad_norm": 3.870089292526245, "learning_rate": 8.512172329246936e-05, "loss": 1.0378, "num_input_tokens_seen": 34213896, "step": 2126 }, { "epoch": 0.14899261866610627, "grad_norm": 4.441296100616455, "learning_rate": 8.511472504378284e-05, "loss": 1.1473, "num_input_tokens_seen": 34229472, "step": 2127 }, { "epoch": 0.14906266691183553, "grad_norm": 3.8565545082092285, "learning_rate": 8.510772679509633e-05, "loss": 1.1465, "num_input_tokens_seen": 34245856, "step": 2128 }, { "epoch": 0.14913271515756477, "grad_norm": 3.563889741897583, "learning_rate": 8.510072854640981e-05, "loss": 0.8612, "num_input_tokens_seen": 34262240, "step": 2129 }, { "epoch": 0.14920276340329403, "grad_norm": 4.2634429931640625, "learning_rate": 8.50937302977233e-05, "loss": 1.1825, "num_input_tokens_seen": 34278624, "step": 2130 }, { "epoch": 0.14927281164902326, "grad_norm": 5.418450355529785, "learning_rate": 8.508673204903679e-05, "loss": 0.9869, "num_input_tokens_seen": 34294216, "step": 2131 }, { "epoch": 0.14934285989475252, "grad_norm": 3.511815309524536, "learning_rate": 8.507973380035027e-05, "loss": 0.9725, "num_input_tokens_seen": 34310592, "step": 2132 }, { "epoch": 0.14941290814048175, "grad_norm": 4.088070392608643, "learning_rate": 8.507273555166375e-05, "loss": 1.1299, "num_input_tokens_seen": 34326352, "step": 2133 }, { "epoch": 0.149482956386211, "grad_norm": 3.8594932556152344, "learning_rate": 8.506573730297723e-05, "loss": 1.0281, "num_input_tokens_seen": 34342512, "step": 2134 }, { "epoch": 0.14955300463194024, "grad_norm": 5.410063743591309, "learning_rate": 8.505873905429072e-05, "loss": 1.1376, "num_input_tokens_seen": 34358896, "step": 2135 }, { "epoch": 0.1496230528776695, "grad_norm": 4.02821159362793, "learning_rate": 8.505174080560421e-05, "loss": 0.9707, "num_input_tokens_seen": 34375280, "step": 2136 }, { "epoch": 0.14969310112339873, "grad_norm": 3.866480827331543, "learning_rate": 8.50447425569177e-05, "loss": 1.0727, "num_input_tokens_seen": 34391584, "step": 2137 }, { "epoch": 0.149763149369128, "grad_norm": 3.667064905166626, "learning_rate": 8.503774430823118e-05, "loss": 1.0609, "num_input_tokens_seen": 34407264, "step": 2138 }, { "epoch": 0.14983319761485722, "grad_norm": 5.41308069229126, "learning_rate": 8.503074605954466e-05, "loss": 1.0779, "num_input_tokens_seen": 34423648, "step": 2139 }, { "epoch": 0.14990324586058648, "grad_norm": 4.1716485023498535, "learning_rate": 8.502374781085815e-05, "loss": 1.2112, "num_input_tokens_seen": 34439512, "step": 2140 }, { "epoch": 0.1499732941063157, "grad_norm": 4.1403913497924805, "learning_rate": 8.501674956217162e-05, "loss": 1.0773, "num_input_tokens_seen": 34455896, "step": 2141 }, { "epoch": 0.15004334235204497, "grad_norm": 3.75219988822937, "learning_rate": 8.500975131348511e-05, "loss": 1.0685, "num_input_tokens_seen": 34472280, "step": 2142 }, { "epoch": 0.1501133905977742, "grad_norm": 4.339532852172852, "learning_rate": 8.50027530647986e-05, "loss": 1.0439, "num_input_tokens_seen": 34488664, "step": 2143 }, { "epoch": 0.15018343884350346, "grad_norm": 4.259124755859375, "learning_rate": 8.499575481611209e-05, "loss": 1.0576, "num_input_tokens_seen": 34505048, "step": 2144 }, { "epoch": 0.1502534870892327, "grad_norm": 5.031396865844727, "learning_rate": 8.498875656742558e-05, "loss": 0.9932, "num_input_tokens_seen": 34521432, "step": 2145 }, { "epoch": 0.15032353533496196, "grad_norm": 5.313172340393066, "learning_rate": 8.498175831873907e-05, "loss": 1.1737, "num_input_tokens_seen": 34536344, "step": 2146 }, { "epoch": 0.15039358358069121, "grad_norm": 4.844740390777588, "learning_rate": 8.497476007005254e-05, "loss": 1.4095, "num_input_tokens_seen": 34552728, "step": 2147 }, { "epoch": 0.15046363182642045, "grad_norm": 4.231154441833496, "learning_rate": 8.496776182136603e-05, "loss": 1.1196, "num_input_tokens_seen": 34569016, "step": 2148 }, { "epoch": 0.1505336800721497, "grad_norm": 4.176802635192871, "learning_rate": 8.49607635726795e-05, "loss": 1.0856, "num_input_tokens_seen": 34585376, "step": 2149 }, { "epoch": 0.15060372831787894, "grad_norm": 4.710334777832031, "learning_rate": 8.4953765323993e-05, "loss": 1.0085, "num_input_tokens_seen": 34600400, "step": 2150 }, { "epoch": 0.1506737765636082, "grad_norm": 3.9053258895874023, "learning_rate": 8.494676707530648e-05, "loss": 1.2191, "num_input_tokens_seen": 34616688, "step": 2151 }, { "epoch": 0.15074382480933743, "grad_norm": 4.043003559112549, "learning_rate": 8.493976882661997e-05, "loss": 1.0541, "num_input_tokens_seen": 34631920, "step": 2152 }, { "epoch": 0.1508138730550667, "grad_norm": 5.230721473693848, "learning_rate": 8.493277057793346e-05, "loss": 1.1491, "num_input_tokens_seen": 34648128, "step": 2153 }, { "epoch": 0.15088392130079592, "grad_norm": 4.098349094390869, "learning_rate": 8.492577232924693e-05, "loss": 1.1302, "num_input_tokens_seen": 34664512, "step": 2154 }, { "epoch": 0.15095396954652518, "grad_norm": 4.803813457489014, "learning_rate": 8.491877408056042e-05, "loss": 0.9653, "num_input_tokens_seen": 34680560, "step": 2155 }, { "epoch": 0.1510240177922544, "grad_norm": 4.25751256942749, "learning_rate": 8.491177583187391e-05, "loss": 1.2481, "num_input_tokens_seen": 34696944, "step": 2156 }, { "epoch": 0.15109406603798367, "grad_norm": 6.600613117218018, "learning_rate": 8.49047775831874e-05, "loss": 1.1786, "num_input_tokens_seen": 34712416, "step": 2157 }, { "epoch": 0.1511641142837129, "grad_norm": 5.649744987487793, "learning_rate": 8.489777933450087e-05, "loss": 1.3045, "num_input_tokens_seen": 34728520, "step": 2158 }, { "epoch": 0.15123416252944216, "grad_norm": 5.778639316558838, "learning_rate": 8.489078108581436e-05, "loss": 1.1224, "num_input_tokens_seen": 34744776, "step": 2159 }, { "epoch": 0.1513042107751714, "grad_norm": 5.944733619689941, "learning_rate": 8.488378283712785e-05, "loss": 1.3293, "num_input_tokens_seen": 34761160, "step": 2160 }, { "epoch": 0.15137425902090065, "grad_norm": 3.7783594131469727, "learning_rate": 8.487678458844133e-05, "loss": 1.0975, "num_input_tokens_seen": 34777544, "step": 2161 }, { "epoch": 0.15144430726662989, "grad_norm": 5.126344680786133, "learning_rate": 8.486978633975482e-05, "loss": 1.0509, "num_input_tokens_seen": 34793072, "step": 2162 }, { "epoch": 0.15151435551235914, "grad_norm": 4.689150333404541, "learning_rate": 8.48627880910683e-05, "loss": 1.1454, "num_input_tokens_seen": 34809456, "step": 2163 }, { "epoch": 0.15158440375808838, "grad_norm": 3.7559547424316406, "learning_rate": 8.485578984238179e-05, "loss": 1.1414, "num_input_tokens_seen": 34825208, "step": 2164 }, { "epoch": 0.15165445200381764, "grad_norm": 3.9225172996520996, "learning_rate": 8.484879159369528e-05, "loss": 1.1771, "num_input_tokens_seen": 34841592, "step": 2165 }, { "epoch": 0.15172450024954687, "grad_norm": 4.264125347137451, "learning_rate": 8.484179334500876e-05, "loss": 1.0046, "num_input_tokens_seen": 34857928, "step": 2166 }, { "epoch": 0.15179454849527613, "grad_norm": 4.0784382820129395, "learning_rate": 8.483479509632224e-05, "loss": 1.0638, "num_input_tokens_seen": 34873224, "step": 2167 }, { "epoch": 0.15186459674100536, "grad_norm": 4.371130466461182, "learning_rate": 8.482779684763572e-05, "loss": 1.3854, "num_input_tokens_seen": 34889608, "step": 2168 }, { "epoch": 0.15193464498673462, "grad_norm": 3.7022883892059326, "learning_rate": 8.482079859894921e-05, "loss": 0.9892, "num_input_tokens_seen": 34905984, "step": 2169 }, { "epoch": 0.15200469323246385, "grad_norm": 4.196985721588135, "learning_rate": 8.481380035026271e-05, "loss": 0.9674, "num_input_tokens_seen": 34922368, "step": 2170 }, { "epoch": 0.1520747414781931, "grad_norm": 4.0252580642700195, "learning_rate": 8.480680210157619e-05, "loss": 1.0478, "num_input_tokens_seen": 34938752, "step": 2171 }, { "epoch": 0.15214478972392234, "grad_norm": 4.03692626953125, "learning_rate": 8.479980385288967e-05, "loss": 1.1801, "num_input_tokens_seen": 34954176, "step": 2172 }, { "epoch": 0.1522148379696516, "grad_norm": 4.183175563812256, "learning_rate": 8.479280560420316e-05, "loss": 1.1117, "num_input_tokens_seen": 34969880, "step": 2173 }, { "epoch": 0.15228488621538083, "grad_norm": 3.757636070251465, "learning_rate": 8.478580735551664e-05, "loss": 1.1507, "num_input_tokens_seen": 34985576, "step": 2174 }, { "epoch": 0.1523549344611101, "grad_norm": 3.9442903995513916, "learning_rate": 8.477880910683013e-05, "loss": 1.034, "num_input_tokens_seen": 35001896, "step": 2175 }, { "epoch": 0.15242498270683932, "grad_norm": 4.092566013336182, "learning_rate": 8.477181085814362e-05, "loss": 1.0542, "num_input_tokens_seen": 35018280, "step": 2176 }, { "epoch": 0.15249503095256858, "grad_norm": 5.494921684265137, "learning_rate": 8.47648126094571e-05, "loss": 1.0988, "num_input_tokens_seen": 35034544, "step": 2177 }, { "epoch": 0.15256507919829781, "grad_norm": 7.327289581298828, "learning_rate": 8.475781436077058e-05, "loss": 1.1879, "num_input_tokens_seen": 35050928, "step": 2178 }, { "epoch": 0.15263512744402707, "grad_norm": 4.048150539398193, "learning_rate": 8.475081611208407e-05, "loss": 1.1071, "num_input_tokens_seen": 35067000, "step": 2179 }, { "epoch": 0.1527051756897563, "grad_norm": 6.388006210327148, "learning_rate": 8.474381786339756e-05, "loss": 0.9821, "num_input_tokens_seen": 35082064, "step": 2180 }, { "epoch": 0.15277522393548557, "grad_norm": 4.289052963256836, "learning_rate": 8.473681961471103e-05, "loss": 1.077, "num_input_tokens_seen": 35098448, "step": 2181 }, { "epoch": 0.15284527218121483, "grad_norm": 4.288560390472412, "learning_rate": 8.472982136602452e-05, "loss": 1.2723, "num_input_tokens_seen": 35114832, "step": 2182 }, { "epoch": 0.15291532042694406, "grad_norm": 4.17701530456543, "learning_rate": 8.472282311733801e-05, "loss": 1.1691, "num_input_tokens_seen": 35131216, "step": 2183 }, { "epoch": 0.15298536867267332, "grad_norm": 4.975949764251709, "learning_rate": 8.47158248686515e-05, "loss": 1.057, "num_input_tokens_seen": 35147600, "step": 2184 }, { "epoch": 0.15305541691840255, "grad_norm": 5.465437889099121, "learning_rate": 8.470882661996497e-05, "loss": 1.0328, "num_input_tokens_seen": 35162464, "step": 2185 }, { "epoch": 0.1531254651641318, "grad_norm": 3.329401731491089, "learning_rate": 8.470182837127846e-05, "loss": 1.0596, "num_input_tokens_seen": 35178744, "step": 2186 }, { "epoch": 0.15319551340986104, "grad_norm": 5.962124824523926, "learning_rate": 8.469483012259195e-05, "loss": 1.2799, "num_input_tokens_seen": 35194736, "step": 2187 }, { "epoch": 0.1532655616555903, "grad_norm": 3.897841691970825, "learning_rate": 8.468783187390542e-05, "loss": 1.1701, "num_input_tokens_seen": 35211120, "step": 2188 }, { "epoch": 0.15333560990131953, "grad_norm": 3.9668943881988525, "learning_rate": 8.468083362521891e-05, "loss": 1.1302, "num_input_tokens_seen": 35227504, "step": 2189 }, { "epoch": 0.1534056581470488, "grad_norm": 3.8960444927215576, "learning_rate": 8.467383537653241e-05, "loss": 0.891, "num_input_tokens_seen": 35243584, "step": 2190 }, { "epoch": 0.15347570639277802, "grad_norm": 3.7700982093811035, "learning_rate": 8.466683712784589e-05, "loss": 1.1744, "num_input_tokens_seen": 35259968, "step": 2191 }, { "epoch": 0.15354575463850728, "grad_norm": 4.65008020401001, "learning_rate": 8.465983887915938e-05, "loss": 1.2807, "num_input_tokens_seen": 35276352, "step": 2192 }, { "epoch": 0.1536158028842365, "grad_norm": 3.5371146202087402, "learning_rate": 8.465284063047285e-05, "loss": 0.9699, "num_input_tokens_seen": 35292736, "step": 2193 }, { "epoch": 0.15368585112996577, "grad_norm": 4.395732879638672, "learning_rate": 8.464584238178634e-05, "loss": 0.9862, "num_input_tokens_seen": 35309120, "step": 2194 }, { "epoch": 0.153755899375695, "grad_norm": 5.01919412612915, "learning_rate": 8.463884413309982e-05, "loss": 1.0143, "num_input_tokens_seen": 35325504, "step": 2195 }, { "epoch": 0.15382594762142426, "grad_norm": 3.7417054176330566, "learning_rate": 8.463184588441332e-05, "loss": 1.0712, "num_input_tokens_seen": 35341376, "step": 2196 }, { "epoch": 0.1538959958671535, "grad_norm": 4.119459629058838, "learning_rate": 8.462484763572681e-05, "loss": 1.0919, "num_input_tokens_seen": 35357520, "step": 2197 }, { "epoch": 0.15396604411288276, "grad_norm": 6.938751220703125, "learning_rate": 8.461784938704028e-05, "loss": 1.1272, "num_input_tokens_seen": 35372920, "step": 2198 }, { "epoch": 0.154036092358612, "grad_norm": 5.000339984893799, "learning_rate": 8.461085113835377e-05, "loss": 1.1508, "num_input_tokens_seen": 35389304, "step": 2199 }, { "epoch": 0.15410614060434125, "grad_norm": 3.6554362773895264, "learning_rate": 8.460385288966726e-05, "loss": 1.0765, "num_input_tokens_seen": 35405688, "step": 2200 }, { "epoch": 0.15410614060434125, "eval_loss": 1.145054578781128, "eval_runtime": 0.1886, "eval_samples_per_second": 5.303, "eval_steps_per_second": 5.303, "num_input_tokens_seen": 35405688, "step": 2200 }, { "epoch": 0.15417618885007048, "grad_norm": 3.718207836151123, "learning_rate": 8.459685464098073e-05, "loss": 0.8814, "num_input_tokens_seen": 35422072, "step": 2201 }, { "epoch": 0.15424623709579974, "grad_norm": 4.98813533782959, "learning_rate": 8.458985639229422e-05, "loss": 1.1814, "num_input_tokens_seen": 35438456, "step": 2202 }, { "epoch": 0.15431628534152897, "grad_norm": 3.550008535385132, "learning_rate": 8.458285814360771e-05, "loss": 1.1281, "num_input_tokens_seen": 35454840, "step": 2203 }, { "epoch": 0.15438633358725823, "grad_norm": 3.8408641815185547, "learning_rate": 8.45758598949212e-05, "loss": 0.9759, "num_input_tokens_seen": 35471080, "step": 2204 }, { "epoch": 0.15445638183298746, "grad_norm": 4.515852451324463, "learning_rate": 8.456886164623468e-05, "loss": 0.9394, "num_input_tokens_seen": 35486904, "step": 2205 }, { "epoch": 0.15452643007871672, "grad_norm": 3.6536715030670166, "learning_rate": 8.456186339754816e-05, "loss": 0.9649, "num_input_tokens_seen": 35503064, "step": 2206 }, { "epoch": 0.15459647832444595, "grad_norm": 4.071808338165283, "learning_rate": 8.455486514886165e-05, "loss": 1.0972, "num_input_tokens_seen": 35518880, "step": 2207 }, { "epoch": 0.1546665265701752, "grad_norm": 4.329566955566406, "learning_rate": 8.454786690017513e-05, "loss": 1.0843, "num_input_tokens_seen": 35535256, "step": 2208 }, { "epoch": 0.15473657481590444, "grad_norm": 4.243298053741455, "learning_rate": 8.454086865148862e-05, "loss": 1.1688, "num_input_tokens_seen": 35551376, "step": 2209 }, { "epoch": 0.1548066230616337, "grad_norm": 4.154253959655762, "learning_rate": 8.453387040280212e-05, "loss": 1.0458, "num_input_tokens_seen": 35567696, "step": 2210 }, { "epoch": 0.15487667130736293, "grad_norm": 4.0564494132995605, "learning_rate": 8.45268721541156e-05, "loss": 1.0585, "num_input_tokens_seen": 35583576, "step": 2211 }, { "epoch": 0.1549467195530922, "grad_norm": 3.735724687576294, "learning_rate": 8.451987390542907e-05, "loss": 0.92, "num_input_tokens_seen": 35599536, "step": 2212 }, { "epoch": 0.15501676779882143, "grad_norm": 4.651454925537109, "learning_rate": 8.451287565674256e-05, "loss": 1.2097, "num_input_tokens_seen": 35615920, "step": 2213 }, { "epoch": 0.15508681604455068, "grad_norm": 5.01883602142334, "learning_rate": 8.450587740805605e-05, "loss": 0.9275, "num_input_tokens_seen": 35631208, "step": 2214 }, { "epoch": 0.15515686429027992, "grad_norm": 4.435250282287598, "learning_rate": 8.449887915936952e-05, "loss": 1.003, "num_input_tokens_seen": 35647328, "step": 2215 }, { "epoch": 0.15522691253600918, "grad_norm": 3.495476245880127, "learning_rate": 8.449188091068302e-05, "loss": 0.9968, "num_input_tokens_seen": 35663472, "step": 2216 }, { "epoch": 0.15529696078173844, "grad_norm": 4.461013317108154, "learning_rate": 8.448488266199651e-05, "loss": 1.1098, "num_input_tokens_seen": 35679856, "step": 2217 }, { "epoch": 0.15536700902746767, "grad_norm": 5.4857683181762695, "learning_rate": 8.447788441330999e-05, "loss": 1.143, "num_input_tokens_seen": 35695616, "step": 2218 }, { "epoch": 0.15543705727319693, "grad_norm": 4.20158052444458, "learning_rate": 8.447088616462348e-05, "loss": 1.1643, "num_input_tokens_seen": 35711432, "step": 2219 }, { "epoch": 0.15550710551892616, "grad_norm": 4.289988040924072, "learning_rate": 8.446388791593695e-05, "loss": 1.1582, "num_input_tokens_seen": 35727552, "step": 2220 }, { "epoch": 0.15557715376465542, "grad_norm": 3.7897555828094482, "learning_rate": 8.445688966725044e-05, "loss": 1.255, "num_input_tokens_seen": 35743800, "step": 2221 }, { "epoch": 0.15564720201038465, "grad_norm": 4.405816078186035, "learning_rate": 8.444989141856393e-05, "loss": 1.1057, "num_input_tokens_seen": 35760184, "step": 2222 }, { "epoch": 0.1557172502561139, "grad_norm": 4.2683610916137695, "learning_rate": 8.444289316987742e-05, "loss": 1.1042, "num_input_tokens_seen": 35776568, "step": 2223 }, { "epoch": 0.15578729850184314, "grad_norm": 3.9999659061431885, "learning_rate": 8.44358949211909e-05, "loss": 1.0504, "num_input_tokens_seen": 35792952, "step": 2224 }, { "epoch": 0.1558573467475724, "grad_norm": 3.6252965927124023, "learning_rate": 8.442889667250438e-05, "loss": 0.9755, "num_input_tokens_seen": 35809176, "step": 2225 }, { "epoch": 0.15592739499330163, "grad_norm": 3.9726274013519287, "learning_rate": 8.442189842381787e-05, "loss": 1.1104, "num_input_tokens_seen": 35825560, "step": 2226 }, { "epoch": 0.1559974432390309, "grad_norm": 5.004739761352539, "learning_rate": 8.441490017513136e-05, "loss": 1.2484, "num_input_tokens_seen": 35841936, "step": 2227 }, { "epoch": 0.15606749148476012, "grad_norm": 5.432271480560303, "learning_rate": 8.440790192644483e-05, "loss": 0.9799, "num_input_tokens_seen": 35857944, "step": 2228 }, { "epoch": 0.15613753973048938, "grad_norm": 4.553518295288086, "learning_rate": 8.440090367775832e-05, "loss": 1.1077, "num_input_tokens_seen": 35873920, "step": 2229 }, { "epoch": 0.15620758797621861, "grad_norm": 5.924668312072754, "learning_rate": 8.439390542907181e-05, "loss": 1.2937, "num_input_tokens_seen": 35888872, "step": 2230 }, { "epoch": 0.15627763622194787, "grad_norm": 4.276167392730713, "learning_rate": 8.43869071803853e-05, "loss": 1.1883, "num_input_tokens_seen": 35905256, "step": 2231 }, { "epoch": 0.1563476844676771, "grad_norm": 3.719632863998413, "learning_rate": 8.437990893169877e-05, "loss": 1.0713, "num_input_tokens_seen": 35921640, "step": 2232 }, { "epoch": 0.15641773271340637, "grad_norm": 4.769368648529053, "learning_rate": 8.437291068301226e-05, "loss": 1.079, "num_input_tokens_seen": 35936256, "step": 2233 }, { "epoch": 0.1564877809591356, "grad_norm": 4.957282543182373, "learning_rate": 8.436591243432575e-05, "loss": 1.0535, "num_input_tokens_seen": 35952640, "step": 2234 }, { "epoch": 0.15655782920486486, "grad_norm": 4.782018661499023, "learning_rate": 8.435891418563923e-05, "loss": 1.0799, "num_input_tokens_seen": 35967880, "step": 2235 }, { "epoch": 0.1566278774505941, "grad_norm": 4.716582775115967, "learning_rate": 8.435191593695273e-05, "loss": 1.1388, "num_input_tokens_seen": 35984016, "step": 2236 }, { "epoch": 0.15669792569632335, "grad_norm": 4.36606502532959, "learning_rate": 8.434491768826622e-05, "loss": 0.954, "num_input_tokens_seen": 35999904, "step": 2237 }, { "epoch": 0.15676797394205258, "grad_norm": 3.8300321102142334, "learning_rate": 8.433791943957969e-05, "loss": 1.0903, "num_input_tokens_seen": 36016216, "step": 2238 }, { "epoch": 0.15683802218778184, "grad_norm": 3.7595677375793457, "learning_rate": 8.433092119089317e-05, "loss": 1.0214, "num_input_tokens_seen": 36032600, "step": 2239 }, { "epoch": 0.15690807043351107, "grad_norm": 4.783555030822754, "learning_rate": 8.432392294220665e-05, "loss": 1.1621, "num_input_tokens_seen": 36048984, "step": 2240 }, { "epoch": 0.15697811867924033, "grad_norm": 4.393221855163574, "learning_rate": 8.431692469352014e-05, "loss": 1.2196, "num_input_tokens_seen": 36065368, "step": 2241 }, { "epoch": 0.15704816692496956, "grad_norm": 3.8634722232818604, "learning_rate": 8.430992644483363e-05, "loss": 1.0227, "num_input_tokens_seen": 36081752, "step": 2242 }, { "epoch": 0.15711821517069882, "grad_norm": 4.5091233253479, "learning_rate": 8.430292819614712e-05, "loss": 0.9261, "num_input_tokens_seen": 36097672, "step": 2243 }, { "epoch": 0.15718826341642805, "grad_norm": 3.89699387550354, "learning_rate": 8.429592994746061e-05, "loss": 1.0023, "num_input_tokens_seen": 36114048, "step": 2244 }, { "epoch": 0.1572583116621573, "grad_norm": 3.8859546184539795, "learning_rate": 8.428893169877408e-05, "loss": 0.9597, "num_input_tokens_seen": 36130024, "step": 2245 }, { "epoch": 0.15732835990788654, "grad_norm": 4.236848831176758, "learning_rate": 8.428193345008757e-05, "loss": 1.1777, "num_input_tokens_seen": 36146408, "step": 2246 }, { "epoch": 0.1573984081536158, "grad_norm": 6.742307662963867, "learning_rate": 8.427493520140105e-05, "loss": 0.9674, "num_input_tokens_seen": 36161440, "step": 2247 }, { "epoch": 0.15746845639934504, "grad_norm": 3.332416534423828, "learning_rate": 8.426793695271454e-05, "loss": 0.7694, "num_input_tokens_seen": 36177824, "step": 2248 }, { "epoch": 0.1575385046450743, "grad_norm": 4.672734260559082, "learning_rate": 8.426093870402802e-05, "loss": 0.9228, "num_input_tokens_seen": 36193320, "step": 2249 }, { "epoch": 0.15760855289080353, "grad_norm": 4.437155246734619, "learning_rate": 8.425394045534151e-05, "loss": 1.2712, "num_input_tokens_seen": 36209704, "step": 2250 }, { "epoch": 0.1576786011365328, "grad_norm": 4.112512111663818, "learning_rate": 8.4246942206655e-05, "loss": 1.3494, "num_input_tokens_seen": 36226088, "step": 2251 }, { "epoch": 0.15774864938226205, "grad_norm": 4.432194709777832, "learning_rate": 8.423994395796848e-05, "loss": 1.1303, "num_input_tokens_seen": 36242472, "step": 2252 }, { "epoch": 0.15781869762799128, "grad_norm": 4.322375297546387, "learning_rate": 8.423294570928197e-05, "loss": 1.084, "num_input_tokens_seen": 36258680, "step": 2253 }, { "epoch": 0.15788874587372054, "grad_norm": 3.848836660385132, "learning_rate": 8.422594746059545e-05, "loss": 1.2057, "num_input_tokens_seen": 36274512, "step": 2254 }, { "epoch": 0.15795879411944977, "grad_norm": 4.022729396820068, "learning_rate": 8.421894921190893e-05, "loss": 1.0584, "num_input_tokens_seen": 36289568, "step": 2255 }, { "epoch": 0.15802884236517903, "grad_norm": 3.8060622215270996, "learning_rate": 8.421195096322243e-05, "loss": 1.1144, "num_input_tokens_seen": 36305256, "step": 2256 }, { "epoch": 0.15809889061090826, "grad_norm": 4.685004234313965, "learning_rate": 8.42049527145359e-05, "loss": 1.1341, "num_input_tokens_seen": 36321008, "step": 2257 }, { "epoch": 0.15816893885663752, "grad_norm": 3.4483463764190674, "learning_rate": 8.41979544658494e-05, "loss": 0.9563, "num_input_tokens_seen": 36337000, "step": 2258 }, { "epoch": 0.15823898710236675, "grad_norm": 3.7172203063964844, "learning_rate": 8.419095621716287e-05, "loss": 1.1463, "num_input_tokens_seen": 36353160, "step": 2259 }, { "epoch": 0.158309035348096, "grad_norm": 5.734589099884033, "learning_rate": 8.418395796847636e-05, "loss": 0.9321, "num_input_tokens_seen": 36369248, "step": 2260 }, { "epoch": 0.15837908359382524, "grad_norm": 4.060257911682129, "learning_rate": 8.417695971978985e-05, "loss": 1.2162, "num_input_tokens_seen": 36384736, "step": 2261 }, { "epoch": 0.1584491318395545, "grad_norm": 5.240515232086182, "learning_rate": 8.416996147110334e-05, "loss": 0.9652, "num_input_tokens_seen": 36401120, "step": 2262 }, { "epoch": 0.15851918008528373, "grad_norm": 5.482649803161621, "learning_rate": 8.416296322241682e-05, "loss": 1.207, "num_input_tokens_seen": 36417504, "step": 2263 }, { "epoch": 0.158589228331013, "grad_norm": 3.9862253665924072, "learning_rate": 8.415596497373031e-05, "loss": 1.1354, "num_input_tokens_seen": 36433888, "step": 2264 }, { "epoch": 0.15865927657674223, "grad_norm": 6.322808742523193, "learning_rate": 8.414896672504379e-05, "loss": 1.1144, "num_input_tokens_seen": 36449552, "step": 2265 }, { "epoch": 0.15872932482247148, "grad_norm": 4.312921524047852, "learning_rate": 8.414196847635726e-05, "loss": 1.1254, "num_input_tokens_seen": 36465936, "step": 2266 }, { "epoch": 0.15879937306820072, "grad_norm": 4.178677082061768, "learning_rate": 8.413497022767075e-05, "loss": 1.2539, "num_input_tokens_seen": 36482184, "step": 2267 }, { "epoch": 0.15886942131392998, "grad_norm": 4.304810523986816, "learning_rate": 8.412797197898424e-05, "loss": 1.199, "num_input_tokens_seen": 36498320, "step": 2268 }, { "epoch": 0.1589394695596592, "grad_norm": 3.723483085632324, "learning_rate": 8.412097373029773e-05, "loss": 1.0335, "num_input_tokens_seen": 36514704, "step": 2269 }, { "epoch": 0.15900951780538847, "grad_norm": 4.285789489746094, "learning_rate": 8.411397548161122e-05, "loss": 1.2463, "num_input_tokens_seen": 36531032, "step": 2270 }, { "epoch": 0.1590795660511177, "grad_norm": 3.5788466930389404, "learning_rate": 8.41069772329247e-05, "loss": 0.7809, "num_input_tokens_seen": 36547416, "step": 2271 }, { "epoch": 0.15914961429684696, "grad_norm": 5.785874366760254, "learning_rate": 8.409997898423818e-05, "loss": 1.2832, "num_input_tokens_seen": 36563800, "step": 2272 }, { "epoch": 0.1592196625425762, "grad_norm": 3.914402723312378, "learning_rate": 8.409298073555167e-05, "loss": 1.2065, "num_input_tokens_seen": 36580184, "step": 2273 }, { "epoch": 0.15928971078830545, "grad_norm": 3.878512144088745, "learning_rate": 8.408598248686514e-05, "loss": 1.1457, "num_input_tokens_seen": 36596568, "step": 2274 }, { "epoch": 0.15935975903403468, "grad_norm": 4.195454120635986, "learning_rate": 8.407898423817863e-05, "loss": 1.2628, "num_input_tokens_seen": 36612952, "step": 2275 }, { "epoch": 0.15942980727976394, "grad_norm": 3.847649097442627, "learning_rate": 8.407198598949212e-05, "loss": 1.0678, "num_input_tokens_seen": 36628752, "step": 2276 }, { "epoch": 0.15949985552549317, "grad_norm": 5.284397125244141, "learning_rate": 8.406498774080561e-05, "loss": 1.0508, "num_input_tokens_seen": 36645136, "step": 2277 }, { "epoch": 0.15956990377122243, "grad_norm": 4.10982084274292, "learning_rate": 8.40579894921191e-05, "loss": 1.0558, "num_input_tokens_seen": 36661392, "step": 2278 }, { "epoch": 0.15963995201695166, "grad_norm": 3.8282828330993652, "learning_rate": 8.405099124343257e-05, "loss": 1.1064, "num_input_tokens_seen": 36676856, "step": 2279 }, { "epoch": 0.15971000026268092, "grad_norm": 4.115365028381348, "learning_rate": 8.404399299474606e-05, "loss": 1.0081, "num_input_tokens_seen": 36693080, "step": 2280 }, { "epoch": 0.15978004850841016, "grad_norm": 3.6131088733673096, "learning_rate": 8.403699474605955e-05, "loss": 0.8565, "num_input_tokens_seen": 36709440, "step": 2281 }, { "epoch": 0.15985009675413941, "grad_norm": 3.83146071434021, "learning_rate": 8.402999649737304e-05, "loss": 1.0762, "num_input_tokens_seen": 36725496, "step": 2282 }, { "epoch": 0.15992014499986865, "grad_norm": 3.8456339836120605, "learning_rate": 8.402299824868653e-05, "loss": 1.053, "num_input_tokens_seen": 36741544, "step": 2283 }, { "epoch": 0.1599901932455979, "grad_norm": 3.717014789581299, "learning_rate": 8.4016e-05, "loss": 1.0053, "num_input_tokens_seen": 36757928, "step": 2284 }, { "epoch": 0.16006024149132717, "grad_norm": 4.3730854988098145, "learning_rate": 8.400900175131349e-05, "loss": 1.1639, "num_input_tokens_seen": 36774144, "step": 2285 }, { "epoch": 0.1601302897370564, "grad_norm": 3.6635241508483887, "learning_rate": 8.400200350262697e-05, "loss": 0.9721, "num_input_tokens_seen": 36790248, "step": 2286 }, { "epoch": 0.16020033798278566, "grad_norm": 3.9058330059051514, "learning_rate": 8.399500525394046e-05, "loss": 1.0814, "num_input_tokens_seen": 36806632, "step": 2287 }, { "epoch": 0.1602703862285149, "grad_norm": 3.60127854347229, "learning_rate": 8.398800700525394e-05, "loss": 1.1541, "num_input_tokens_seen": 36823016, "step": 2288 }, { "epoch": 0.16034043447424415, "grad_norm": 5.762889385223389, "learning_rate": 8.398100875656743e-05, "loss": 0.9572, "num_input_tokens_seen": 36838576, "step": 2289 }, { "epoch": 0.16041048271997338, "grad_norm": 3.495436191558838, "learning_rate": 8.397401050788092e-05, "loss": 1.0156, "num_input_tokens_seen": 36854960, "step": 2290 }, { "epoch": 0.16048053096570264, "grad_norm": 4.083384037017822, "learning_rate": 8.396701225919441e-05, "loss": 1.1724, "num_input_tokens_seen": 36870672, "step": 2291 }, { "epoch": 0.16055057921143187, "grad_norm": 3.7010245323181152, "learning_rate": 8.396001401050788e-05, "loss": 0.8871, "num_input_tokens_seen": 36887056, "step": 2292 }, { "epoch": 0.16062062745716113, "grad_norm": 3.419485330581665, "learning_rate": 8.395301576182136e-05, "loss": 0.9586, "num_input_tokens_seen": 36903144, "step": 2293 }, { "epoch": 0.16069067570289036, "grad_norm": 3.593970537185669, "learning_rate": 8.394601751313485e-05, "loss": 1.0109, "num_input_tokens_seen": 36919192, "step": 2294 }, { "epoch": 0.16076072394861962, "grad_norm": 3.729038953781128, "learning_rate": 8.393901926444834e-05, "loss": 1.288, "num_input_tokens_seen": 36935576, "step": 2295 }, { "epoch": 0.16083077219434885, "grad_norm": 3.60687255859375, "learning_rate": 8.393202101576183e-05, "loss": 0.9423, "num_input_tokens_seen": 36951960, "step": 2296 }, { "epoch": 0.1609008204400781, "grad_norm": 3.4520435333251953, "learning_rate": 8.392502276707531e-05, "loss": 0.9515, "num_input_tokens_seen": 36968344, "step": 2297 }, { "epoch": 0.16097086868580734, "grad_norm": 3.71907639503479, "learning_rate": 8.39180245183888e-05, "loss": 1.1141, "num_input_tokens_seen": 36984440, "step": 2298 }, { "epoch": 0.1610409169315366, "grad_norm": 3.8897864818573, "learning_rate": 8.391102626970228e-05, "loss": 1.1124, "num_input_tokens_seen": 37000824, "step": 2299 }, { "epoch": 0.16111096517726584, "grad_norm": 3.579921245574951, "learning_rate": 8.390402802101577e-05, "loss": 1.0998, "num_input_tokens_seen": 37017088, "step": 2300 }, { "epoch": 0.1611810134229951, "grad_norm": 3.9658427238464355, "learning_rate": 8.389702977232924e-05, "loss": 1.034, "num_input_tokens_seen": 37033232, "step": 2301 }, { "epoch": 0.16125106166872433, "grad_norm": 4.2862725257873535, "learning_rate": 8.389003152364274e-05, "loss": 0.9662, "num_input_tokens_seen": 37049616, "step": 2302 }, { "epoch": 0.1613211099144536, "grad_norm": 3.7523694038391113, "learning_rate": 8.388303327495622e-05, "loss": 1.0806, "num_input_tokens_seen": 37065784, "step": 2303 }, { "epoch": 0.16139115816018282, "grad_norm": 3.9068679809570312, "learning_rate": 8.387603502626971e-05, "loss": 1.0985, "num_input_tokens_seen": 37082168, "step": 2304 }, { "epoch": 0.16146120640591208, "grad_norm": 5.876891613006592, "learning_rate": 8.38690367775832e-05, "loss": 1.2938, "num_input_tokens_seen": 37097072, "step": 2305 }, { "epoch": 0.1615312546516413, "grad_norm": 4.040335655212402, "learning_rate": 8.386203852889667e-05, "loss": 1.1376, "num_input_tokens_seen": 37112936, "step": 2306 }, { "epoch": 0.16160130289737057, "grad_norm": 3.566763401031494, "learning_rate": 8.385504028021016e-05, "loss": 0.9164, "num_input_tokens_seen": 37129320, "step": 2307 }, { "epoch": 0.1616713511430998, "grad_norm": 3.7780325412750244, "learning_rate": 8.384804203152365e-05, "loss": 0.9541, "num_input_tokens_seen": 37144832, "step": 2308 }, { "epoch": 0.16174139938882906, "grad_norm": 4.291510105133057, "learning_rate": 8.384104378283714e-05, "loss": 1.2579, "num_input_tokens_seen": 37160312, "step": 2309 }, { "epoch": 0.1618114476345583, "grad_norm": 3.721531629562378, "learning_rate": 8.383404553415063e-05, "loss": 1.0108, "num_input_tokens_seen": 37176696, "step": 2310 }, { "epoch": 0.16188149588028755, "grad_norm": 3.883301258087158, "learning_rate": 8.38270472854641e-05, "loss": 1.17, "num_input_tokens_seen": 37192632, "step": 2311 }, { "epoch": 0.16195154412601678, "grad_norm": 4.240591049194336, "learning_rate": 8.382004903677759e-05, "loss": 1.2263, "num_input_tokens_seen": 37208448, "step": 2312 }, { "epoch": 0.16202159237174604, "grad_norm": 4.711728572845459, "learning_rate": 8.381305078809106e-05, "loss": 1.1743, "num_input_tokens_seen": 37223176, "step": 2313 }, { "epoch": 0.16209164061747527, "grad_norm": 4.733399391174316, "learning_rate": 8.380605253940455e-05, "loss": 1.2512, "num_input_tokens_seen": 37239560, "step": 2314 }, { "epoch": 0.16216168886320453, "grad_norm": 5.842257976531982, "learning_rate": 8.379905429071804e-05, "loss": 1.029, "num_input_tokens_seen": 37255536, "step": 2315 }, { "epoch": 0.16223173710893377, "grad_norm": 3.9891135692596436, "learning_rate": 8.379205604203153e-05, "loss": 1.0767, "num_input_tokens_seen": 37271920, "step": 2316 }, { "epoch": 0.16230178535466303, "grad_norm": 3.3596630096435547, "learning_rate": 8.378505779334502e-05, "loss": 0.8963, "num_input_tokens_seen": 37288024, "step": 2317 }, { "epoch": 0.16237183360039226, "grad_norm": 4.346104621887207, "learning_rate": 8.377805954465851e-05, "loss": 1.0947, "num_input_tokens_seen": 37304264, "step": 2318 }, { "epoch": 0.16244188184612152, "grad_norm": 3.5524039268493652, "learning_rate": 8.377106129597198e-05, "loss": 0.9435, "num_input_tokens_seen": 37320648, "step": 2319 }, { "epoch": 0.16251193009185078, "grad_norm": 4.335781574249268, "learning_rate": 8.376406304728546e-05, "loss": 0.9151, "num_input_tokens_seen": 37336104, "step": 2320 }, { "epoch": 0.16258197833758, "grad_norm": 3.7356534004211426, "learning_rate": 8.375706479859895e-05, "loss": 1.0195, "num_input_tokens_seen": 37352488, "step": 2321 }, { "epoch": 0.16265202658330927, "grad_norm": 3.842710494995117, "learning_rate": 8.375006654991243e-05, "loss": 1.0543, "num_input_tokens_seen": 37368872, "step": 2322 }, { "epoch": 0.1627220748290385, "grad_norm": 3.9485390186309814, "learning_rate": 8.374306830122592e-05, "loss": 1.2149, "num_input_tokens_seen": 37385256, "step": 2323 }, { "epoch": 0.16279212307476776, "grad_norm": 3.9196622371673584, "learning_rate": 8.373607005253941e-05, "loss": 1.0907, "num_input_tokens_seen": 37401224, "step": 2324 }, { "epoch": 0.162862171320497, "grad_norm": 4.2444844245910645, "learning_rate": 8.37290718038529e-05, "loss": 1.0201, "num_input_tokens_seen": 37417016, "step": 2325 }, { "epoch": 0.16293221956622625, "grad_norm": 3.974438190460205, "learning_rate": 8.372207355516638e-05, "loss": 1.0733, "num_input_tokens_seen": 37433400, "step": 2326 }, { "epoch": 0.16300226781195548, "grad_norm": 3.833350658416748, "learning_rate": 8.371507530647986e-05, "loss": 1.1536, "num_input_tokens_seen": 37449784, "step": 2327 }, { "epoch": 0.16307231605768474, "grad_norm": 4.566055774688721, "learning_rate": 8.370807705779335e-05, "loss": 1.093, "num_input_tokens_seen": 37465720, "step": 2328 }, { "epoch": 0.16314236430341397, "grad_norm": 3.455068588256836, "learning_rate": 8.370107880910684e-05, "loss": 0.9396, "num_input_tokens_seen": 37482104, "step": 2329 }, { "epoch": 0.16321241254914323, "grad_norm": 4.584096908569336, "learning_rate": 8.369408056042032e-05, "loss": 1.0109, "num_input_tokens_seen": 37498488, "step": 2330 }, { "epoch": 0.16328246079487246, "grad_norm": 4.0225958824157715, "learning_rate": 8.36870823117338e-05, "loss": 1.1507, "num_input_tokens_seen": 37514264, "step": 2331 }, { "epoch": 0.16335250904060172, "grad_norm": 5.311272144317627, "learning_rate": 8.368008406304729e-05, "loss": 1.2248, "num_input_tokens_seen": 37529280, "step": 2332 }, { "epoch": 0.16342255728633096, "grad_norm": 3.752720594406128, "learning_rate": 8.367308581436077e-05, "loss": 0.964, "num_input_tokens_seen": 37545664, "step": 2333 }, { "epoch": 0.16349260553206021, "grad_norm": 3.8337442874908447, "learning_rate": 8.366608756567426e-05, "loss": 1.1928, "num_input_tokens_seen": 37562048, "step": 2334 }, { "epoch": 0.16356265377778945, "grad_norm": 3.818251132965088, "learning_rate": 8.365908931698775e-05, "loss": 1.0032, "num_input_tokens_seen": 37577848, "step": 2335 }, { "epoch": 0.1636327020235187, "grad_norm": 3.7170960903167725, "learning_rate": 8.365209106830123e-05, "loss": 1.2297, "num_input_tokens_seen": 37594232, "step": 2336 }, { "epoch": 0.16370275026924794, "grad_norm": 3.984950304031372, "learning_rate": 8.364509281961472e-05, "loss": 1.0744, "num_input_tokens_seen": 37610248, "step": 2337 }, { "epoch": 0.1637727985149772, "grad_norm": 3.4384636878967285, "learning_rate": 8.36380945709282e-05, "loss": 1.0015, "num_input_tokens_seen": 37626632, "step": 2338 }, { "epoch": 0.16384284676070643, "grad_norm": 3.952625274658203, "learning_rate": 8.363109632224169e-05, "loss": 1.1604, "num_input_tokens_seen": 37643016, "step": 2339 }, { "epoch": 0.1639128950064357, "grad_norm": 3.7193119525909424, "learning_rate": 8.362409807355516e-05, "loss": 0.9054, "num_input_tokens_seen": 37658216, "step": 2340 }, { "epoch": 0.16398294325216492, "grad_norm": 3.977997303009033, "learning_rate": 8.361709982486865e-05, "loss": 1.2904, "num_input_tokens_seen": 37674600, "step": 2341 }, { "epoch": 0.16405299149789418, "grad_norm": 5.108094215393066, "learning_rate": 8.361010157618214e-05, "loss": 1.0664, "num_input_tokens_seen": 37690184, "step": 2342 }, { "epoch": 0.1641230397436234, "grad_norm": 4.881065845489502, "learning_rate": 8.360310332749563e-05, "loss": 1.0787, "num_input_tokens_seen": 37705352, "step": 2343 }, { "epoch": 0.16419308798935267, "grad_norm": 4.128891468048096, "learning_rate": 8.359610507880912e-05, "loss": 0.8745, "num_input_tokens_seen": 37721736, "step": 2344 }, { "epoch": 0.1642631362350819, "grad_norm": 4.006495475769043, "learning_rate": 8.35891068301226e-05, "loss": 0.9992, "num_input_tokens_seen": 37738120, "step": 2345 }, { "epoch": 0.16433318448081116, "grad_norm": 3.877427101135254, "learning_rate": 8.358210858143608e-05, "loss": 0.9334, "num_input_tokens_seen": 37754504, "step": 2346 }, { "epoch": 0.1644032327265404, "grad_norm": 3.7013916969299316, "learning_rate": 8.357511033274955e-05, "loss": 1.069, "num_input_tokens_seen": 37770792, "step": 2347 }, { "epoch": 0.16447328097226965, "grad_norm": 3.675049066543579, "learning_rate": 8.356811208406304e-05, "loss": 0.9863, "num_input_tokens_seen": 37786800, "step": 2348 }, { "epoch": 0.16454332921799888, "grad_norm": 4.831826210021973, "learning_rate": 8.356111383537654e-05, "loss": 0.9077, "num_input_tokens_seen": 37801760, "step": 2349 }, { "epoch": 0.16461337746372814, "grad_norm": 4.207952499389648, "learning_rate": 8.355411558669002e-05, "loss": 1.0585, "num_input_tokens_seen": 37818144, "step": 2350 }, { "epoch": 0.16468342570945738, "grad_norm": 3.9083497524261475, "learning_rate": 8.354711733800351e-05, "loss": 1.1437, "num_input_tokens_seen": 37833896, "step": 2351 }, { "epoch": 0.16475347395518664, "grad_norm": 4.307275295257568, "learning_rate": 8.3540119089317e-05, "loss": 1.0692, "num_input_tokens_seen": 37850280, "step": 2352 }, { "epoch": 0.16482352220091587, "grad_norm": 3.9434409141540527, "learning_rate": 8.353312084063047e-05, "loss": 0.9842, "num_input_tokens_seen": 37866664, "step": 2353 }, { "epoch": 0.16489357044664513, "grad_norm": 4.162476539611816, "learning_rate": 8.352612259194396e-05, "loss": 1.2, "num_input_tokens_seen": 37883048, "step": 2354 }, { "epoch": 0.1649636186923744, "grad_norm": 4.3073506355285645, "learning_rate": 8.351912434325745e-05, "loss": 1.2625, "num_input_tokens_seen": 37899264, "step": 2355 }, { "epoch": 0.16503366693810362, "grad_norm": 3.9900870323181152, "learning_rate": 8.351212609457094e-05, "loss": 1.079, "num_input_tokens_seen": 37915648, "step": 2356 }, { "epoch": 0.16510371518383288, "grad_norm": 3.599282741546631, "learning_rate": 8.350512784588441e-05, "loss": 0.9226, "num_input_tokens_seen": 37932032, "step": 2357 }, { "epoch": 0.1651737634295621, "grad_norm": 3.796546697616577, "learning_rate": 8.34981295971979e-05, "loss": 0.9095, "num_input_tokens_seen": 37948416, "step": 2358 }, { "epoch": 0.16524381167529137, "grad_norm": 4.0810017585754395, "learning_rate": 8.349113134851139e-05, "loss": 0.9083, "num_input_tokens_seen": 37964072, "step": 2359 }, { "epoch": 0.1653138599210206, "grad_norm": 4.155765533447266, "learning_rate": 8.348413309982487e-05, "loss": 1.1827, "num_input_tokens_seen": 37980320, "step": 2360 }, { "epoch": 0.16538390816674986, "grad_norm": 4.131893634796143, "learning_rate": 8.347713485113835e-05, "loss": 1.1245, "num_input_tokens_seen": 37995872, "step": 2361 }, { "epoch": 0.1654539564124791, "grad_norm": 4.266848564147949, "learning_rate": 8.347013660245184e-05, "loss": 1.1084, "num_input_tokens_seen": 38011856, "step": 2362 }, { "epoch": 0.16552400465820835, "grad_norm": 3.8229875564575195, "learning_rate": 8.346313835376533e-05, "loss": 1.0592, "num_input_tokens_seen": 38028080, "step": 2363 }, { "epoch": 0.16559405290393758, "grad_norm": 4.0808234214782715, "learning_rate": 8.345614010507882e-05, "loss": 0.9667, "num_input_tokens_seen": 38043992, "step": 2364 }, { "epoch": 0.16566410114966684, "grad_norm": 4.470417022705078, "learning_rate": 8.34491418563923e-05, "loss": 1.2859, "num_input_tokens_seen": 38059848, "step": 2365 }, { "epoch": 0.16573414939539607, "grad_norm": 3.459963798522949, "learning_rate": 8.344214360770578e-05, "loss": 1.0801, "num_input_tokens_seen": 38076232, "step": 2366 }, { "epoch": 0.16580419764112533, "grad_norm": 3.6845312118530273, "learning_rate": 8.343514535901926e-05, "loss": 1.1277, "num_input_tokens_seen": 38092616, "step": 2367 }, { "epoch": 0.16587424588685457, "grad_norm": 3.683866500854492, "learning_rate": 8.342814711033275e-05, "loss": 1.0821, "num_input_tokens_seen": 38108880, "step": 2368 }, { "epoch": 0.16594429413258382, "grad_norm": 4.3266191482543945, "learning_rate": 8.342114886164625e-05, "loss": 1.1432, "num_input_tokens_seen": 38125264, "step": 2369 }, { "epoch": 0.16601434237831306, "grad_norm": 3.9031660556793213, "learning_rate": 8.341415061295972e-05, "loss": 1.0378, "num_input_tokens_seen": 38141648, "step": 2370 }, { "epoch": 0.16608439062404232, "grad_norm": 5.415440082550049, "learning_rate": 8.340715236427321e-05, "loss": 1.2011, "num_input_tokens_seen": 38157328, "step": 2371 }, { "epoch": 0.16615443886977155, "grad_norm": 4.017500877380371, "learning_rate": 8.34001541155867e-05, "loss": 1.0771, "num_input_tokens_seen": 38173096, "step": 2372 }, { "epoch": 0.1662244871155008, "grad_norm": 3.855212926864624, "learning_rate": 8.339315586690018e-05, "loss": 1.173, "num_input_tokens_seen": 38189480, "step": 2373 }, { "epoch": 0.16629453536123004, "grad_norm": 3.8502743244171143, "learning_rate": 8.338615761821365e-05, "loss": 1.0241, "num_input_tokens_seen": 38205416, "step": 2374 }, { "epoch": 0.1663645836069593, "grad_norm": 6.8746867179870605, "learning_rate": 8.337915936952715e-05, "loss": 1.0459, "num_input_tokens_seen": 38221800, "step": 2375 }, { "epoch": 0.16643463185268853, "grad_norm": 3.9708571434020996, "learning_rate": 8.337216112084064e-05, "loss": 0.9832, "num_input_tokens_seen": 38237208, "step": 2376 }, { "epoch": 0.1665046800984178, "grad_norm": 4.927229404449463, "learning_rate": 8.336516287215412e-05, "loss": 1.1103, "num_input_tokens_seen": 38253592, "step": 2377 }, { "epoch": 0.16657472834414702, "grad_norm": 3.9976963996887207, "learning_rate": 8.33581646234676e-05, "loss": 1.1451, "num_input_tokens_seen": 38269184, "step": 2378 }, { "epoch": 0.16664477658987628, "grad_norm": 3.680177927017212, "learning_rate": 8.33511663747811e-05, "loss": 1.0602, "num_input_tokens_seen": 38285568, "step": 2379 }, { "epoch": 0.1667148248356055, "grad_norm": 3.768069267272949, "learning_rate": 8.334416812609457e-05, "loss": 1.0822, "num_input_tokens_seen": 38301952, "step": 2380 }, { "epoch": 0.16678487308133477, "grad_norm": 4.554010391235352, "learning_rate": 8.333716987740806e-05, "loss": 1.3037, "num_input_tokens_seen": 38318336, "step": 2381 }, { "epoch": 0.166854921327064, "grad_norm": 3.6799368858337402, "learning_rate": 8.333017162872155e-05, "loss": 1.0152, "num_input_tokens_seen": 38333544, "step": 2382 }, { "epoch": 0.16692496957279326, "grad_norm": 3.5584356784820557, "learning_rate": 8.332317338003503e-05, "loss": 0.9617, "num_input_tokens_seen": 38349632, "step": 2383 }, { "epoch": 0.1669950178185225, "grad_norm": 5.978849411010742, "learning_rate": 8.331617513134851e-05, "loss": 0.9975, "num_input_tokens_seen": 38364872, "step": 2384 }, { "epoch": 0.16706506606425175, "grad_norm": 4.641121864318848, "learning_rate": 8.3309176882662e-05, "loss": 1.0021, "num_input_tokens_seen": 38379800, "step": 2385 }, { "epoch": 0.167135114309981, "grad_norm": 3.895772695541382, "learning_rate": 8.330217863397549e-05, "loss": 1.1187, "num_input_tokens_seen": 38395744, "step": 2386 }, { "epoch": 0.16720516255571025, "grad_norm": 3.48437762260437, "learning_rate": 8.329518038528896e-05, "loss": 1.0527, "num_input_tokens_seen": 38412056, "step": 2387 }, { "epoch": 0.16727521080143948, "grad_norm": 4.2831549644470215, "learning_rate": 8.328818213660245e-05, "loss": 0.8967, "num_input_tokens_seen": 38426768, "step": 2388 }, { "epoch": 0.16734525904716874, "grad_norm": 3.7090001106262207, "learning_rate": 8.328118388791595e-05, "loss": 0.9903, "num_input_tokens_seen": 38442296, "step": 2389 }, { "epoch": 0.167415307292898, "grad_norm": 4.253223896026611, "learning_rate": 8.327418563922943e-05, "loss": 1.0169, "num_input_tokens_seen": 38458664, "step": 2390 }, { "epoch": 0.16748535553862723, "grad_norm": 4.919910907745361, "learning_rate": 8.326718739054292e-05, "loss": 1.233, "num_input_tokens_seen": 38475048, "step": 2391 }, { "epoch": 0.1675554037843565, "grad_norm": 7.881314277648926, "learning_rate": 8.326018914185639e-05, "loss": 1.1, "num_input_tokens_seen": 38491432, "step": 2392 }, { "epoch": 0.16762545203008572, "grad_norm": 6.979029655456543, "learning_rate": 8.325319089316988e-05, "loss": 0.9189, "num_input_tokens_seen": 38506312, "step": 2393 }, { "epoch": 0.16769550027581498, "grad_norm": 4.141571044921875, "learning_rate": 8.324619264448336e-05, "loss": 1.0821, "num_input_tokens_seen": 38522696, "step": 2394 }, { "epoch": 0.1677655485215442, "grad_norm": 4.306760311126709, "learning_rate": 8.323919439579686e-05, "loss": 1.1857, "num_input_tokens_seen": 38539080, "step": 2395 }, { "epoch": 0.16783559676727347, "grad_norm": 4.089770793914795, "learning_rate": 8.323219614711035e-05, "loss": 1.0994, "num_input_tokens_seen": 38555464, "step": 2396 }, { "epoch": 0.1679056450130027, "grad_norm": 3.648800849914551, "learning_rate": 8.322519789842382e-05, "loss": 1.1015, "num_input_tokens_seen": 38571848, "step": 2397 }, { "epoch": 0.16797569325873196, "grad_norm": 4.310317516326904, "learning_rate": 8.321819964973731e-05, "loss": 1.171, "num_input_tokens_seen": 38587616, "step": 2398 }, { "epoch": 0.1680457415044612, "grad_norm": 5.373032093048096, "learning_rate": 8.32112014010508e-05, "loss": 0.9952, "num_input_tokens_seen": 38604000, "step": 2399 }, { "epoch": 0.16811578975019045, "grad_norm": 3.7830634117126465, "learning_rate": 8.320420315236427e-05, "loss": 0.9953, "num_input_tokens_seen": 38620384, "step": 2400 }, { "epoch": 0.16811578975019045, "eval_loss": 1.1429402828216553, "eval_runtime": 0.205, "eval_samples_per_second": 4.878, "eval_steps_per_second": 4.878, "num_input_tokens_seen": 38620384, "step": 2400 }, { "epoch": 0.16818583799591968, "grad_norm": 6.3896684646606445, "learning_rate": 8.319720490367776e-05, "loss": 1.184, "num_input_tokens_seen": 38636288, "step": 2401 }, { "epoch": 0.16825588624164894, "grad_norm": 4.178726673126221, "learning_rate": 8.319020665499125e-05, "loss": 1.0362, "num_input_tokens_seen": 38652352, "step": 2402 }, { "epoch": 0.16832593448737818, "grad_norm": 3.7572708129882812, "learning_rate": 8.318320840630474e-05, "loss": 0.9756, "num_input_tokens_seen": 38668712, "step": 2403 }, { "epoch": 0.16839598273310744, "grad_norm": 3.688552141189575, "learning_rate": 8.317621015761821e-05, "loss": 1.0644, "num_input_tokens_seen": 38685096, "step": 2404 }, { "epoch": 0.16846603097883667, "grad_norm": 4.2040510177612305, "learning_rate": 8.31692119089317e-05, "loss": 1.1251, "num_input_tokens_seen": 38701480, "step": 2405 }, { "epoch": 0.16853607922456593, "grad_norm": 3.9412119388580322, "learning_rate": 8.316221366024519e-05, "loss": 1.0243, "num_input_tokens_seen": 38716904, "step": 2406 }, { "epoch": 0.16860612747029516, "grad_norm": 3.9538826942443848, "learning_rate": 8.315521541155867e-05, "loss": 1.0361, "num_input_tokens_seen": 38733288, "step": 2407 }, { "epoch": 0.16867617571602442, "grad_norm": 3.803135871887207, "learning_rate": 8.314821716287215e-05, "loss": 1.0684, "num_input_tokens_seen": 38749672, "step": 2408 }, { "epoch": 0.16874622396175365, "grad_norm": 4.323539733886719, "learning_rate": 8.314121891418564e-05, "loss": 1.1091, "num_input_tokens_seen": 38766056, "step": 2409 }, { "epoch": 0.1688162722074829, "grad_norm": 3.84000825881958, "learning_rate": 8.313422066549913e-05, "loss": 1.0052, "num_input_tokens_seen": 38782440, "step": 2410 }, { "epoch": 0.16888632045321214, "grad_norm": 6.76428747177124, "learning_rate": 8.312722241681261e-05, "loss": 1.1773, "num_input_tokens_seen": 38798824, "step": 2411 }, { "epoch": 0.1689563686989414, "grad_norm": 5.8638224601745605, "learning_rate": 8.31202241681261e-05, "loss": 0.9515, "num_input_tokens_seen": 38815112, "step": 2412 }, { "epoch": 0.16902641694467063, "grad_norm": 4.254051685333252, "learning_rate": 8.311322591943958e-05, "loss": 1.1365, "num_input_tokens_seen": 38831192, "step": 2413 }, { "epoch": 0.1690964651903999, "grad_norm": 3.641663074493408, "learning_rate": 8.310622767075306e-05, "loss": 0.9888, "num_input_tokens_seen": 38847360, "step": 2414 }, { "epoch": 0.16916651343612912, "grad_norm": 3.594768762588501, "learning_rate": 8.309922942206656e-05, "loss": 1.1156, "num_input_tokens_seen": 38863744, "step": 2415 }, { "epoch": 0.16923656168185838, "grad_norm": 3.6955742835998535, "learning_rate": 8.309223117338005e-05, "loss": 0.9514, "num_input_tokens_seen": 38879880, "step": 2416 }, { "epoch": 0.16930660992758761, "grad_norm": 3.64803409576416, "learning_rate": 8.308523292469353e-05, "loss": 1.0045, "num_input_tokens_seen": 38896264, "step": 2417 }, { "epoch": 0.16937665817331687, "grad_norm": 3.7921512126922607, "learning_rate": 8.307823467600701e-05, "loss": 1.0838, "num_input_tokens_seen": 38912648, "step": 2418 }, { "epoch": 0.1694467064190461, "grad_norm": 4.777346611022949, "learning_rate": 8.307123642732049e-05, "loss": 1.239, "num_input_tokens_seen": 38929032, "step": 2419 }, { "epoch": 0.16951675466477537, "grad_norm": 4.417767524719238, "learning_rate": 8.306423817863398e-05, "loss": 1.0101, "num_input_tokens_seen": 38945416, "step": 2420 }, { "epoch": 0.1695868029105046, "grad_norm": 4.257672309875488, "learning_rate": 8.305723992994747e-05, "loss": 0.8461, "num_input_tokens_seen": 38961800, "step": 2421 }, { "epoch": 0.16965685115623386, "grad_norm": 4.098975658416748, "learning_rate": 8.305024168126095e-05, "loss": 1.158, "num_input_tokens_seen": 38978184, "step": 2422 }, { "epoch": 0.1697268994019631, "grad_norm": 5.206361293792725, "learning_rate": 8.304324343257444e-05, "loss": 1.041, "num_input_tokens_seen": 38994568, "step": 2423 }, { "epoch": 0.16979694764769235, "grad_norm": 3.638395309448242, "learning_rate": 8.303624518388792e-05, "loss": 0.8883, "num_input_tokens_seen": 39010136, "step": 2424 }, { "epoch": 0.1698669958934216, "grad_norm": 3.4154045581817627, "learning_rate": 8.30292469352014e-05, "loss": 1.0024, "num_input_tokens_seen": 39026520, "step": 2425 }, { "epoch": 0.16993704413915084, "grad_norm": 3.923617362976074, "learning_rate": 8.30222486865149e-05, "loss": 1.1696, "num_input_tokens_seen": 39042816, "step": 2426 }, { "epoch": 0.1700070923848801, "grad_norm": 4.469310760498047, "learning_rate": 8.301525043782837e-05, "loss": 1.3424, "num_input_tokens_seen": 39059040, "step": 2427 }, { "epoch": 0.17007714063060933, "grad_norm": 4.111564636230469, "learning_rate": 8.300825218914186e-05, "loss": 0.9867, "num_input_tokens_seen": 39074992, "step": 2428 }, { "epoch": 0.1701471888763386, "grad_norm": 3.7809438705444336, "learning_rate": 8.300125394045535e-05, "loss": 0.965, "num_input_tokens_seen": 39090840, "step": 2429 }, { "epoch": 0.17021723712206782, "grad_norm": 3.704542875289917, "learning_rate": 8.299425569176884e-05, "loss": 1.1784, "num_input_tokens_seen": 39107136, "step": 2430 }, { "epoch": 0.17028728536779708, "grad_norm": 4.356417179107666, "learning_rate": 8.298725744308231e-05, "loss": 1.149, "num_input_tokens_seen": 39123520, "step": 2431 }, { "epoch": 0.1703573336135263, "grad_norm": 3.400228500366211, "learning_rate": 8.29802591943958e-05, "loss": 0.867, "num_input_tokens_seen": 39139904, "step": 2432 }, { "epoch": 0.17042738185925557, "grad_norm": 4.777987480163574, "learning_rate": 8.297326094570929e-05, "loss": 1.1159, "num_input_tokens_seen": 39156288, "step": 2433 }, { "epoch": 0.1704974301049848, "grad_norm": 5.600007057189941, "learning_rate": 8.296626269702276e-05, "loss": 0.8863, "num_input_tokens_seen": 39171928, "step": 2434 }, { "epoch": 0.17056747835071406, "grad_norm": 3.72717022895813, "learning_rate": 8.295926444833627e-05, "loss": 1.079, "num_input_tokens_seen": 39188032, "step": 2435 }, { "epoch": 0.1706375265964433, "grad_norm": 7.264038562774658, "learning_rate": 8.295226619964974e-05, "loss": 0.8546, "num_input_tokens_seen": 39203816, "step": 2436 }, { "epoch": 0.17070757484217255, "grad_norm": 4.103509426116943, "learning_rate": 8.294526795096323e-05, "loss": 1.0138, "num_input_tokens_seen": 39220200, "step": 2437 }, { "epoch": 0.1707776230879018, "grad_norm": 3.6456661224365234, "learning_rate": 8.29382697022767e-05, "loss": 0.9107, "num_input_tokens_seen": 39236584, "step": 2438 }, { "epoch": 0.17084767133363105, "grad_norm": 3.750075340270996, "learning_rate": 8.293127145359019e-05, "loss": 1.0773, "num_input_tokens_seen": 39252968, "step": 2439 }, { "epoch": 0.17091771957936028, "grad_norm": 4.5003581047058105, "learning_rate": 8.292427320490368e-05, "loss": 1.1834, "num_input_tokens_seen": 39269192, "step": 2440 }, { "epoch": 0.17098776782508954, "grad_norm": 4.513885498046875, "learning_rate": 8.291727495621717e-05, "loss": 1.0337, "num_input_tokens_seen": 39285576, "step": 2441 }, { "epoch": 0.17105781607081877, "grad_norm": 4.220343589782715, "learning_rate": 8.291027670753066e-05, "loss": 1.3044, "num_input_tokens_seen": 39300864, "step": 2442 }, { "epoch": 0.17112786431654803, "grad_norm": 4.986631393432617, "learning_rate": 8.290327845884415e-05, "loss": 1.0377, "num_input_tokens_seen": 39317208, "step": 2443 }, { "epoch": 0.17119791256227726, "grad_norm": 7.632670879364014, "learning_rate": 8.289628021015762e-05, "loss": 1.1749, "num_input_tokens_seen": 39332392, "step": 2444 }, { "epoch": 0.17126796080800652, "grad_norm": 3.588841199874878, "learning_rate": 8.288928196147111e-05, "loss": 0.8124, "num_input_tokens_seen": 39348600, "step": 2445 }, { "epoch": 0.17133800905373575, "grad_norm": 4.311728477478027, "learning_rate": 8.288228371278459e-05, "loss": 1.035, "num_input_tokens_seen": 39364456, "step": 2446 }, { "epoch": 0.171408057299465, "grad_norm": 6.236140251159668, "learning_rate": 8.287528546409807e-05, "loss": 1.1243, "num_input_tokens_seen": 39379496, "step": 2447 }, { "epoch": 0.17147810554519424, "grad_norm": 4.228808403015137, "learning_rate": 8.286828721541156e-05, "loss": 1.0185, "num_input_tokens_seen": 39395880, "step": 2448 }, { "epoch": 0.1715481537909235, "grad_norm": 3.873366117477417, "learning_rate": 8.286128896672505e-05, "loss": 0.9684, "num_input_tokens_seen": 39412264, "step": 2449 }, { "epoch": 0.17161820203665273, "grad_norm": 3.797846794128418, "learning_rate": 8.285429071803854e-05, "loss": 1.0562, "num_input_tokens_seen": 39428648, "step": 2450 }, { "epoch": 0.171688250282382, "grad_norm": 3.798875093460083, "learning_rate": 8.284729246935202e-05, "loss": 1.0409, "num_input_tokens_seen": 39445032, "step": 2451 }, { "epoch": 0.17175829852811123, "grad_norm": 5.118900299072266, "learning_rate": 8.28402942206655e-05, "loss": 1.14, "num_input_tokens_seen": 39460168, "step": 2452 }, { "epoch": 0.17182834677384048, "grad_norm": 4.157371520996094, "learning_rate": 8.283329597197899e-05, "loss": 1.1676, "num_input_tokens_seen": 39476544, "step": 2453 }, { "epoch": 0.17189839501956972, "grad_norm": 3.760786771774292, "learning_rate": 8.282629772329247e-05, "loss": 1.0482, "num_input_tokens_seen": 39492928, "step": 2454 }, { "epoch": 0.17196844326529898, "grad_norm": 4.252779960632324, "learning_rate": 8.281929947460596e-05, "loss": 1.1538, "num_input_tokens_seen": 39509312, "step": 2455 }, { "epoch": 0.1720384915110282, "grad_norm": 4.374740123748779, "learning_rate": 8.281230122591944e-05, "loss": 1.0132, "num_input_tokens_seen": 39525696, "step": 2456 }, { "epoch": 0.17210853975675747, "grad_norm": 4.460380554199219, "learning_rate": 8.280530297723293e-05, "loss": 1.1876, "num_input_tokens_seen": 39541864, "step": 2457 }, { "epoch": 0.17217858800248673, "grad_norm": 4.22148323059082, "learning_rate": 8.279830472854641e-05, "loss": 1.11, "num_input_tokens_seen": 39557944, "step": 2458 }, { "epoch": 0.17224863624821596, "grad_norm": 4.310081481933594, "learning_rate": 8.27913064798599e-05, "loss": 1.0506, "num_input_tokens_seen": 39574328, "step": 2459 }, { "epoch": 0.17231868449394522, "grad_norm": 4.15192174911499, "learning_rate": 8.278430823117339e-05, "loss": 0.7793, "num_input_tokens_seen": 39589312, "step": 2460 }, { "epoch": 0.17238873273967445, "grad_norm": 4.6561455726623535, "learning_rate": 8.277730998248687e-05, "loss": 1.2239, "num_input_tokens_seen": 39605456, "step": 2461 }, { "epoch": 0.1724587809854037, "grad_norm": 4.273087978363037, "learning_rate": 8.277031173380036e-05, "loss": 1.1436, "num_input_tokens_seen": 39621840, "step": 2462 }, { "epoch": 0.17252882923113294, "grad_norm": 4.575830459594727, "learning_rate": 8.276331348511384e-05, "loss": 1.2589, "num_input_tokens_seen": 39638224, "step": 2463 }, { "epoch": 0.1725988774768622, "grad_norm": 3.9122824668884277, "learning_rate": 8.275631523642733e-05, "loss": 1.0634, "num_input_tokens_seen": 39654608, "step": 2464 }, { "epoch": 0.17266892572259143, "grad_norm": 4.991362571716309, "learning_rate": 8.27493169877408e-05, "loss": 1.2077, "num_input_tokens_seen": 39669824, "step": 2465 }, { "epoch": 0.1727389739683207, "grad_norm": 4.688175678253174, "learning_rate": 8.274231873905429e-05, "loss": 1.0955, "num_input_tokens_seen": 39686208, "step": 2466 }, { "epoch": 0.17280902221404992, "grad_norm": 3.779524087905884, "learning_rate": 8.273532049036778e-05, "loss": 1.004, "num_input_tokens_seen": 39702336, "step": 2467 }, { "epoch": 0.17287907045977918, "grad_norm": 4.117679595947266, "learning_rate": 8.272832224168127e-05, "loss": 1.0321, "num_input_tokens_seen": 39718232, "step": 2468 }, { "epoch": 0.17294911870550841, "grad_norm": 3.810084819793701, "learning_rate": 8.272132399299476e-05, "loss": 1.0325, "num_input_tokens_seen": 39733584, "step": 2469 }, { "epoch": 0.17301916695123767, "grad_norm": 3.7730038166046143, "learning_rate": 8.271432574430824e-05, "loss": 0.9207, "num_input_tokens_seen": 39749968, "step": 2470 }, { "epoch": 0.1730892151969669, "grad_norm": 7.299304008483887, "learning_rate": 8.270732749562172e-05, "loss": 1.3425, "num_input_tokens_seen": 39765552, "step": 2471 }, { "epoch": 0.17315926344269617, "grad_norm": 4.079380512237549, "learning_rate": 8.270032924693521e-05, "loss": 1.0336, "num_input_tokens_seen": 39781936, "step": 2472 }, { "epoch": 0.1732293116884254, "grad_norm": 3.736607789993286, "learning_rate": 8.269333099824868e-05, "loss": 1.0126, "num_input_tokens_seen": 39797688, "step": 2473 }, { "epoch": 0.17329935993415466, "grad_norm": 5.587291240692139, "learning_rate": 8.268633274956217e-05, "loss": 1.1422, "num_input_tokens_seen": 39814072, "step": 2474 }, { "epoch": 0.1733694081798839, "grad_norm": 3.5963592529296875, "learning_rate": 8.267933450087566e-05, "loss": 0.9947, "num_input_tokens_seen": 39830456, "step": 2475 }, { "epoch": 0.17343945642561315, "grad_norm": 5.241317272186279, "learning_rate": 8.267233625218915e-05, "loss": 1.0661, "num_input_tokens_seen": 39846728, "step": 2476 }, { "epoch": 0.17350950467134238, "grad_norm": 4.194108009338379, "learning_rate": 8.266533800350264e-05, "loss": 1.1659, "num_input_tokens_seen": 39863112, "step": 2477 }, { "epoch": 0.17357955291707164, "grad_norm": 4.698538780212402, "learning_rate": 8.265833975481611e-05, "loss": 1.3673, "num_input_tokens_seen": 39878624, "step": 2478 }, { "epoch": 0.17364960116280087, "grad_norm": 5.960018634796143, "learning_rate": 8.26513415061296e-05, "loss": 1.104, "num_input_tokens_seen": 39894944, "step": 2479 }, { "epoch": 0.17371964940853013, "grad_norm": 4.386090278625488, "learning_rate": 8.264434325744309e-05, "loss": 1.1284, "num_input_tokens_seen": 39911040, "step": 2480 }, { "epoch": 0.17378969765425936, "grad_norm": 3.7272467613220215, "learning_rate": 8.263734500875658e-05, "loss": 1.1066, "num_input_tokens_seen": 39927408, "step": 2481 }, { "epoch": 0.17385974589998862, "grad_norm": 4.296888828277588, "learning_rate": 8.263034676007005e-05, "loss": 1.1014, "num_input_tokens_seen": 39943792, "step": 2482 }, { "epoch": 0.17392979414571785, "grad_norm": 5.469056606292725, "learning_rate": 8.262334851138354e-05, "loss": 1.1672, "num_input_tokens_seen": 39958176, "step": 2483 }, { "epoch": 0.1739998423914471, "grad_norm": 5.6080498695373535, "learning_rate": 8.261635026269703e-05, "loss": 1.2713, "num_input_tokens_seen": 39973592, "step": 2484 }, { "epoch": 0.17406989063717634, "grad_norm": 3.6164181232452393, "learning_rate": 8.26093520140105e-05, "loss": 0.9019, "num_input_tokens_seen": 39989792, "step": 2485 }, { "epoch": 0.1741399388829056, "grad_norm": 3.757291078567505, "learning_rate": 8.2602353765324e-05, "loss": 1.1038, "num_input_tokens_seen": 40005672, "step": 2486 }, { "epoch": 0.17420998712863484, "grad_norm": 5.1490559577941895, "learning_rate": 8.259535551663748e-05, "loss": 1.1524, "num_input_tokens_seen": 40021816, "step": 2487 }, { "epoch": 0.1742800353743641, "grad_norm": 3.9055886268615723, "learning_rate": 8.258835726795097e-05, "loss": 1.1, "num_input_tokens_seen": 40038200, "step": 2488 }, { "epoch": 0.17435008362009333, "grad_norm": 5.496553897857666, "learning_rate": 8.258135901926446e-05, "loss": 1.3214, "num_input_tokens_seen": 40054584, "step": 2489 }, { "epoch": 0.1744201318658226, "grad_norm": 4.069197177886963, "learning_rate": 8.257436077057793e-05, "loss": 0.888, "num_input_tokens_seen": 40070968, "step": 2490 }, { "epoch": 0.17449018011155182, "grad_norm": 5.098565101623535, "learning_rate": 8.256736252189142e-05, "loss": 0.9918, "num_input_tokens_seen": 40087352, "step": 2491 }, { "epoch": 0.17456022835728108, "grad_norm": 4.083621025085449, "learning_rate": 8.25603642732049e-05, "loss": 0.9506, "num_input_tokens_seen": 40103736, "step": 2492 }, { "epoch": 0.17463027660301034, "grad_norm": 3.8676462173461914, "learning_rate": 8.255336602451839e-05, "loss": 1.0746, "num_input_tokens_seen": 40120120, "step": 2493 }, { "epoch": 0.17470032484873957, "grad_norm": 3.8799197673797607, "learning_rate": 8.254636777583188e-05, "loss": 1.0207, "num_input_tokens_seen": 40136504, "step": 2494 }, { "epoch": 0.17477037309446883, "grad_norm": 5.469006538391113, "learning_rate": 8.253936952714536e-05, "loss": 1.0081, "num_input_tokens_seen": 40152888, "step": 2495 }, { "epoch": 0.17484042134019806, "grad_norm": 4.163306713104248, "learning_rate": 8.253237127845885e-05, "loss": 1.2059, "num_input_tokens_seen": 40169272, "step": 2496 }, { "epoch": 0.17491046958592732, "grad_norm": 3.792062282562256, "learning_rate": 8.252537302977234e-05, "loss": 1.0806, "num_input_tokens_seen": 40185656, "step": 2497 }, { "epoch": 0.17498051783165655, "grad_norm": 3.6881046295166016, "learning_rate": 8.251837478108582e-05, "loss": 1.1557, "num_input_tokens_seen": 40202040, "step": 2498 }, { "epoch": 0.1750505660773858, "grad_norm": 3.818491220474243, "learning_rate": 8.25113765323993e-05, "loss": 1.2193, "num_input_tokens_seen": 40218424, "step": 2499 }, { "epoch": 0.17512061432311504, "grad_norm": 3.77933931350708, "learning_rate": 8.250437828371278e-05, "loss": 1.0508, "num_input_tokens_seen": 40234216, "step": 2500 }, { "epoch": 0.1751906625688443, "grad_norm": 4.106552600860596, "learning_rate": 8.249738003502628e-05, "loss": 0.8558, "num_input_tokens_seen": 40250368, "step": 2501 }, { "epoch": 0.17526071081457353, "grad_norm": 4.9382710456848145, "learning_rate": 8.249038178633976e-05, "loss": 1.3082, "num_input_tokens_seen": 40266600, "step": 2502 }, { "epoch": 0.1753307590603028, "grad_norm": 3.8894200325012207, "learning_rate": 8.248338353765325e-05, "loss": 1.035, "num_input_tokens_seen": 40282984, "step": 2503 }, { "epoch": 0.17540080730603202, "grad_norm": 3.793044328689575, "learning_rate": 8.247638528896673e-05, "loss": 1.0376, "num_input_tokens_seen": 40299368, "step": 2504 }, { "epoch": 0.17547085555176128, "grad_norm": 4.874731540679932, "learning_rate": 8.246938704028021e-05, "loss": 1.2598, "num_input_tokens_seen": 40315752, "step": 2505 }, { "epoch": 0.17554090379749052, "grad_norm": 3.908191680908203, "learning_rate": 8.24623887915937e-05, "loss": 0.9739, "num_input_tokens_seen": 40332136, "step": 2506 }, { "epoch": 0.17561095204321978, "grad_norm": 3.585002899169922, "learning_rate": 8.245539054290719e-05, "loss": 0.9736, "num_input_tokens_seen": 40348520, "step": 2507 }, { "epoch": 0.175681000288949, "grad_norm": 3.9742348194122314, "learning_rate": 8.244839229422068e-05, "loss": 1.0278, "num_input_tokens_seen": 40364760, "step": 2508 }, { "epoch": 0.17575104853467827, "grad_norm": 5.1725921630859375, "learning_rate": 8.244139404553415e-05, "loss": 1.1488, "num_input_tokens_seen": 40380072, "step": 2509 }, { "epoch": 0.1758210967804075, "grad_norm": 4.038326263427734, "learning_rate": 8.243439579684764e-05, "loss": 1.2252, "num_input_tokens_seen": 40395472, "step": 2510 }, { "epoch": 0.17589114502613676, "grad_norm": 3.7381017208099365, "learning_rate": 8.242739754816113e-05, "loss": 1.041, "num_input_tokens_seen": 40411280, "step": 2511 }, { "epoch": 0.175961193271866, "grad_norm": 4.327959060668945, "learning_rate": 8.24203992994746e-05, "loss": 1.0272, "num_input_tokens_seen": 40427664, "step": 2512 }, { "epoch": 0.17603124151759525, "grad_norm": 3.720078706741333, "learning_rate": 8.241340105078809e-05, "loss": 1.2306, "num_input_tokens_seen": 40443760, "step": 2513 }, { "epoch": 0.17610128976332448, "grad_norm": 4.0901618003845215, "learning_rate": 8.240640280210158e-05, "loss": 1.0098, "num_input_tokens_seen": 40460144, "step": 2514 }, { "epoch": 0.17617133800905374, "grad_norm": 4.013705730438232, "learning_rate": 8.239940455341507e-05, "loss": 1.0817, "num_input_tokens_seen": 40476528, "step": 2515 }, { "epoch": 0.17624138625478297, "grad_norm": 3.8833489418029785, "learning_rate": 8.239240630472856e-05, "loss": 1.119, "num_input_tokens_seen": 40492768, "step": 2516 }, { "epoch": 0.17631143450051223, "grad_norm": 7.381611347198486, "learning_rate": 8.238540805604203e-05, "loss": 1.3033, "num_input_tokens_seen": 40507344, "step": 2517 }, { "epoch": 0.17638148274624146, "grad_norm": 3.8792364597320557, "learning_rate": 8.237840980735552e-05, "loss": 1.1113, "num_input_tokens_seen": 40523552, "step": 2518 }, { "epoch": 0.17645153099197072, "grad_norm": 5.19634485244751, "learning_rate": 8.2371411558669e-05, "loss": 1.2186, "num_input_tokens_seen": 40538640, "step": 2519 }, { "epoch": 0.17652157923769995, "grad_norm": 4.081907749176025, "learning_rate": 8.236441330998248e-05, "loss": 1.1075, "num_input_tokens_seen": 40555024, "step": 2520 }, { "epoch": 0.17659162748342921, "grad_norm": 4.296377182006836, "learning_rate": 8.235741506129599e-05, "loss": 0.9319, "num_input_tokens_seen": 40570480, "step": 2521 }, { "epoch": 0.17666167572915845, "grad_norm": 4.143492221832275, "learning_rate": 8.235041681260946e-05, "loss": 0.964, "num_input_tokens_seen": 40586664, "step": 2522 }, { "epoch": 0.1767317239748877, "grad_norm": 3.9894590377807617, "learning_rate": 8.234341856392295e-05, "loss": 0.913, "num_input_tokens_seen": 40603048, "step": 2523 }, { "epoch": 0.17680177222061694, "grad_norm": 4.283662796020508, "learning_rate": 8.233642031523644e-05, "loss": 1.0709, "num_input_tokens_seen": 40618440, "step": 2524 }, { "epoch": 0.1768718204663462, "grad_norm": 4.126082420349121, "learning_rate": 8.232942206654991e-05, "loss": 1.1371, "num_input_tokens_seen": 40634824, "step": 2525 }, { "epoch": 0.17694186871207543, "grad_norm": 4.252981662750244, "learning_rate": 8.23224238178634e-05, "loss": 1.0351, "num_input_tokens_seen": 40650640, "step": 2526 }, { "epoch": 0.1770119169578047, "grad_norm": 3.768542528152466, "learning_rate": 8.231542556917689e-05, "loss": 0.8221, "num_input_tokens_seen": 40667000, "step": 2527 }, { "epoch": 0.17708196520353395, "grad_norm": 4.067849636077881, "learning_rate": 8.230842732049038e-05, "loss": 1.2117, "num_input_tokens_seen": 40683288, "step": 2528 }, { "epoch": 0.17715201344926318, "grad_norm": 4.7552995681762695, "learning_rate": 8.230142907180385e-05, "loss": 1.0001, "num_input_tokens_seen": 40699304, "step": 2529 }, { "epoch": 0.17722206169499244, "grad_norm": 4.099888324737549, "learning_rate": 8.229443082311734e-05, "loss": 1.3335, "num_input_tokens_seen": 40715688, "step": 2530 }, { "epoch": 0.17729210994072167, "grad_norm": 4.219737529754639, "learning_rate": 8.228743257443083e-05, "loss": 1.3004, "num_input_tokens_seen": 40731640, "step": 2531 }, { "epoch": 0.17736215818645093, "grad_norm": 4.125600337982178, "learning_rate": 8.22804343257443e-05, "loss": 1.1828, "num_input_tokens_seen": 40747664, "step": 2532 }, { "epoch": 0.17743220643218016, "grad_norm": 3.7761423587799072, "learning_rate": 8.22734360770578e-05, "loss": 1.1082, "num_input_tokens_seen": 40764048, "step": 2533 }, { "epoch": 0.17750225467790942, "grad_norm": 5.0669026374816895, "learning_rate": 8.226643782837128e-05, "loss": 1.1434, "num_input_tokens_seen": 40779160, "step": 2534 }, { "epoch": 0.17757230292363865, "grad_norm": 4.688200950622559, "learning_rate": 8.225943957968477e-05, "loss": 1.2135, "num_input_tokens_seen": 40795416, "step": 2535 }, { "epoch": 0.1776423511693679, "grad_norm": 3.62204647064209, "learning_rate": 8.225244133099825e-05, "loss": 1.0816, "num_input_tokens_seen": 40811800, "step": 2536 }, { "epoch": 0.17771239941509714, "grad_norm": 4.086390495300293, "learning_rate": 8.224544308231174e-05, "loss": 1.067, "num_input_tokens_seen": 40826960, "step": 2537 }, { "epoch": 0.1777824476608264, "grad_norm": 5.574249744415283, "learning_rate": 8.223844483362522e-05, "loss": 1.2678, "num_input_tokens_seen": 40843344, "step": 2538 }, { "epoch": 0.17785249590655564, "grad_norm": 3.9721264839172363, "learning_rate": 8.22314465849387e-05, "loss": 1.0381, "num_input_tokens_seen": 40859448, "step": 2539 }, { "epoch": 0.1779225441522849, "grad_norm": 4.220152854919434, "learning_rate": 8.222444833625219e-05, "loss": 1.1014, "num_input_tokens_seen": 40875128, "step": 2540 }, { "epoch": 0.17799259239801413, "grad_norm": 3.905205011367798, "learning_rate": 8.221745008756569e-05, "loss": 0.9568, "num_input_tokens_seen": 40890624, "step": 2541 }, { "epoch": 0.1780626406437434, "grad_norm": 4.114316463470459, "learning_rate": 8.221045183887917e-05, "loss": 0.9885, "num_input_tokens_seen": 40905624, "step": 2542 }, { "epoch": 0.17813268888947262, "grad_norm": 4.810879230499268, "learning_rate": 8.220345359019265e-05, "loss": 0.9447, "num_input_tokens_seen": 40922008, "step": 2543 }, { "epoch": 0.17820273713520188, "grad_norm": 4.224065780639648, "learning_rate": 8.219645534150613e-05, "loss": 1.1176, "num_input_tokens_seen": 40938392, "step": 2544 }, { "epoch": 0.1782727853809311, "grad_norm": 3.7086703777313232, "learning_rate": 8.218945709281962e-05, "loss": 0.8931, "num_input_tokens_seen": 40954776, "step": 2545 }, { "epoch": 0.17834283362666037, "grad_norm": 4.346426963806152, "learning_rate": 8.218245884413309e-05, "loss": 0.9808, "num_input_tokens_seen": 40971160, "step": 2546 }, { "epoch": 0.1784128818723896, "grad_norm": 3.9295589923858643, "learning_rate": 8.21754605954466e-05, "loss": 1.1054, "num_input_tokens_seen": 40987544, "step": 2547 }, { "epoch": 0.17848293011811886, "grad_norm": 4.224534034729004, "learning_rate": 8.216846234676008e-05, "loss": 1.1131, "num_input_tokens_seen": 41002816, "step": 2548 }, { "epoch": 0.1785529783638481, "grad_norm": 3.940401315689087, "learning_rate": 8.216146409807356e-05, "loss": 1.1551, "num_input_tokens_seen": 41018560, "step": 2549 }, { "epoch": 0.17862302660957735, "grad_norm": 4.010072231292725, "learning_rate": 8.215446584938705e-05, "loss": 1.0915, "num_input_tokens_seen": 41033976, "step": 2550 }, { "epoch": 0.17869307485530658, "grad_norm": 4.192416191101074, "learning_rate": 8.214746760070054e-05, "loss": 1.0954, "num_input_tokens_seen": 41049384, "step": 2551 }, { "epoch": 0.17876312310103584, "grad_norm": 3.765962600708008, "learning_rate": 8.214046935201401e-05, "loss": 1.1029, "num_input_tokens_seen": 41065528, "step": 2552 }, { "epoch": 0.17883317134676507, "grad_norm": 3.7856082916259766, "learning_rate": 8.21334711033275e-05, "loss": 1.1063, "num_input_tokens_seen": 41081912, "step": 2553 }, { "epoch": 0.17890321959249433, "grad_norm": 4.845935821533203, "learning_rate": 8.212647285464099e-05, "loss": 1.2907, "num_input_tokens_seen": 41098056, "step": 2554 }, { "epoch": 0.17897326783822357, "grad_norm": 4.835206985473633, "learning_rate": 8.211947460595448e-05, "loss": 1.0591, "num_input_tokens_seen": 41114376, "step": 2555 }, { "epoch": 0.17904331608395282, "grad_norm": 3.9637155532836914, "learning_rate": 8.211247635726795e-05, "loss": 1.1689, "num_input_tokens_seen": 41130760, "step": 2556 }, { "epoch": 0.17911336432968206, "grad_norm": 3.5001652240753174, "learning_rate": 8.210547810858144e-05, "loss": 0.9798, "num_input_tokens_seen": 41147040, "step": 2557 }, { "epoch": 0.17918341257541132, "grad_norm": 5.54505729675293, "learning_rate": 8.209847985989493e-05, "loss": 1.3004, "num_input_tokens_seen": 41163312, "step": 2558 }, { "epoch": 0.17925346082114055, "grad_norm": 4.122933387756348, "learning_rate": 8.20914816112084e-05, "loss": 1.1754, "num_input_tokens_seen": 41179632, "step": 2559 }, { "epoch": 0.1793235090668698, "grad_norm": 4.166035175323486, "learning_rate": 8.208448336252189e-05, "loss": 1.0022, "num_input_tokens_seen": 41196000, "step": 2560 }, { "epoch": 0.17939355731259904, "grad_norm": 4.129281520843506, "learning_rate": 8.20774851138354e-05, "loss": 1.2342, "num_input_tokens_seen": 41211944, "step": 2561 }, { "epoch": 0.1794636055583283, "grad_norm": 3.9011406898498535, "learning_rate": 8.207048686514887e-05, "loss": 1.0238, "num_input_tokens_seen": 41227680, "step": 2562 }, { "epoch": 0.17953365380405756, "grad_norm": 3.717945098876953, "learning_rate": 8.206348861646234e-05, "loss": 0.9601, "num_input_tokens_seen": 41244064, "step": 2563 }, { "epoch": 0.1796037020497868, "grad_norm": 5.05475378036499, "learning_rate": 8.205649036777583e-05, "loss": 1.1192, "num_input_tokens_seen": 41260448, "step": 2564 }, { "epoch": 0.17967375029551605, "grad_norm": 4.52910041809082, "learning_rate": 8.204949211908932e-05, "loss": 0.9443, "num_input_tokens_seen": 41276832, "step": 2565 }, { "epoch": 0.17974379854124528, "grad_norm": 4.6492695808410645, "learning_rate": 8.20424938704028e-05, "loss": 1.0729, "num_input_tokens_seen": 41293216, "step": 2566 }, { "epoch": 0.17981384678697454, "grad_norm": 4.7587456703186035, "learning_rate": 8.20354956217163e-05, "loss": 0.9702, "num_input_tokens_seen": 41309600, "step": 2567 }, { "epoch": 0.17988389503270377, "grad_norm": 6.8467817306518555, "learning_rate": 8.202849737302979e-05, "loss": 1.1385, "num_input_tokens_seen": 41325984, "step": 2568 }, { "epoch": 0.17995394327843303, "grad_norm": 3.7771074771881104, "learning_rate": 8.202149912434326e-05, "loss": 1.1603, "num_input_tokens_seen": 41342368, "step": 2569 }, { "epoch": 0.18002399152416226, "grad_norm": 3.8494906425476074, "learning_rate": 8.201450087565675e-05, "loss": 1.056, "num_input_tokens_seen": 41357992, "step": 2570 }, { "epoch": 0.18009403976989152, "grad_norm": 4.079790115356445, "learning_rate": 8.200750262697023e-05, "loss": 1.1159, "num_input_tokens_seen": 41374256, "step": 2571 }, { "epoch": 0.18016408801562075, "grad_norm": 7.093918800354004, "learning_rate": 8.200050437828371e-05, "loss": 1.1756, "num_input_tokens_seen": 41388728, "step": 2572 }, { "epoch": 0.18023413626135001, "grad_norm": 4.636250972747803, "learning_rate": 8.19935061295972e-05, "loss": 1.1599, "num_input_tokens_seen": 41404488, "step": 2573 }, { "epoch": 0.18030418450707925, "grad_norm": 3.789625644683838, "learning_rate": 8.198650788091069e-05, "loss": 1.162, "num_input_tokens_seen": 41420200, "step": 2574 }, { "epoch": 0.1803742327528085, "grad_norm": 3.849637508392334, "learning_rate": 8.197950963222418e-05, "loss": 1.1399, "num_input_tokens_seen": 41436496, "step": 2575 }, { "epoch": 0.18044428099853774, "grad_norm": 3.6819775104522705, "learning_rate": 8.197251138353766e-05, "loss": 1.1467, "num_input_tokens_seen": 41452736, "step": 2576 }, { "epoch": 0.180514329244267, "grad_norm": 4.505229473114014, "learning_rate": 8.196551313485114e-05, "loss": 1.0336, "num_input_tokens_seen": 41468976, "step": 2577 }, { "epoch": 0.18058437748999623, "grad_norm": 5.465007781982422, "learning_rate": 8.195851488616463e-05, "loss": 0.983, "num_input_tokens_seen": 41485064, "step": 2578 }, { "epoch": 0.1806544257357255, "grad_norm": 3.993953227996826, "learning_rate": 8.195151663747811e-05, "loss": 1.3406, "num_input_tokens_seen": 41501448, "step": 2579 }, { "epoch": 0.18072447398145472, "grad_norm": 5.29327392578125, "learning_rate": 8.19445183887916e-05, "loss": 1.2397, "num_input_tokens_seen": 41517832, "step": 2580 }, { "epoch": 0.18079452222718398, "grad_norm": 4.132434844970703, "learning_rate": 8.193752014010508e-05, "loss": 1.2522, "num_input_tokens_seen": 41532976, "step": 2581 }, { "epoch": 0.1808645704729132, "grad_norm": 5.620279788970947, "learning_rate": 8.193052189141857e-05, "loss": 1.06, "num_input_tokens_seen": 41548784, "step": 2582 }, { "epoch": 0.18093461871864247, "grad_norm": 3.9721081256866455, "learning_rate": 8.192352364273205e-05, "loss": 1.0458, "num_input_tokens_seen": 41565168, "step": 2583 }, { "epoch": 0.1810046669643717, "grad_norm": 5.015312194824219, "learning_rate": 8.191652539404554e-05, "loss": 0.9813, "num_input_tokens_seen": 41580584, "step": 2584 }, { "epoch": 0.18107471521010096, "grad_norm": 5.385783672332764, "learning_rate": 8.190952714535903e-05, "loss": 1.0853, "num_input_tokens_seen": 41596656, "step": 2585 }, { "epoch": 0.1811447634558302, "grad_norm": 4.1005120277404785, "learning_rate": 8.19025288966725e-05, "loss": 1.0509, "num_input_tokens_seen": 41611752, "step": 2586 }, { "epoch": 0.18121481170155945, "grad_norm": 3.6853153705596924, "learning_rate": 8.1895530647986e-05, "loss": 1.0736, "num_input_tokens_seen": 41627408, "step": 2587 }, { "epoch": 0.18128485994728868, "grad_norm": 3.7818400859832764, "learning_rate": 8.188853239929949e-05, "loss": 1.1182, "num_input_tokens_seen": 41643792, "step": 2588 }, { "epoch": 0.18135490819301794, "grad_norm": 4.564868450164795, "learning_rate": 8.188153415061297e-05, "loss": 1.1408, "num_input_tokens_seen": 41658768, "step": 2589 }, { "epoch": 0.18142495643874718, "grad_norm": 4.092021465301514, "learning_rate": 8.187453590192644e-05, "loss": 1.0978, "num_input_tokens_seen": 41675088, "step": 2590 }, { "epoch": 0.18149500468447644, "grad_norm": 5.051564693450928, "learning_rate": 8.186753765323993e-05, "loss": 0.9746, "num_input_tokens_seen": 41690376, "step": 2591 }, { "epoch": 0.18156505293020567, "grad_norm": 3.5786261558532715, "learning_rate": 8.186053940455342e-05, "loss": 0.9638, "num_input_tokens_seen": 41706760, "step": 2592 }, { "epoch": 0.18163510117593493, "grad_norm": 4.11420202255249, "learning_rate": 8.185354115586691e-05, "loss": 1.1234, "num_input_tokens_seen": 41721760, "step": 2593 }, { "epoch": 0.18170514942166416, "grad_norm": 4.445348262786865, "learning_rate": 8.18465429071804e-05, "loss": 1.0846, "num_input_tokens_seen": 41737640, "step": 2594 }, { "epoch": 0.18177519766739342, "grad_norm": 5.705301284790039, "learning_rate": 8.183954465849388e-05, "loss": 1.2254, "num_input_tokens_seen": 41753784, "step": 2595 }, { "epoch": 0.18184524591312265, "grad_norm": 3.7948646545410156, "learning_rate": 8.183254640980736e-05, "loss": 0.9929, "num_input_tokens_seen": 41770120, "step": 2596 }, { "epoch": 0.1819152941588519, "grad_norm": 4.296072959899902, "learning_rate": 8.182554816112085e-05, "loss": 1.1365, "num_input_tokens_seen": 41786504, "step": 2597 }, { "epoch": 0.18198534240458117, "grad_norm": 4.750889778137207, "learning_rate": 8.181854991243432e-05, "loss": 1.1295, "num_input_tokens_seen": 41802888, "step": 2598 }, { "epoch": 0.1820553906503104, "grad_norm": 4.031731128692627, "learning_rate": 8.181155166374781e-05, "loss": 1.1096, "num_input_tokens_seen": 41819264, "step": 2599 }, { "epoch": 0.18212543889603966, "grad_norm": 4.620563507080078, "learning_rate": 8.18045534150613e-05, "loss": 1.1862, "num_input_tokens_seen": 41835016, "step": 2600 }, { "epoch": 0.18212543889603966, "eval_loss": 1.1404880285263062, "eval_runtime": 0.189, "eval_samples_per_second": 5.291, "eval_steps_per_second": 5.291, "num_input_tokens_seen": 41835016, "step": 2600 }, { "epoch": 0.1821954871417689, "grad_norm": 3.8487257957458496, "learning_rate": 8.179755516637479e-05, "loss": 1.0103, "num_input_tokens_seen": 41851400, "step": 2601 }, { "epoch": 0.18226553538749815, "grad_norm": 4.221493244171143, "learning_rate": 8.179055691768828e-05, "loss": 1.1346, "num_input_tokens_seen": 41867784, "step": 2602 }, { "epoch": 0.18233558363322738, "grad_norm": 3.88747239112854, "learning_rate": 8.178355866900175e-05, "loss": 1.0679, "num_input_tokens_seen": 41884024, "step": 2603 }, { "epoch": 0.18240563187895664, "grad_norm": 3.845551013946533, "learning_rate": 8.177656042031524e-05, "loss": 0.9442, "num_input_tokens_seen": 41899936, "step": 2604 }, { "epoch": 0.18247568012468587, "grad_norm": 3.6964564323425293, "learning_rate": 8.176956217162873e-05, "loss": 0.815, "num_input_tokens_seen": 41915512, "step": 2605 }, { "epoch": 0.18254572837041513, "grad_norm": 4.105105400085449, "learning_rate": 8.17625639229422e-05, "loss": 1.0891, "num_input_tokens_seen": 41931728, "step": 2606 }, { "epoch": 0.18261577661614437, "grad_norm": 5.0245842933654785, "learning_rate": 8.175556567425569e-05, "loss": 0.9657, "num_input_tokens_seen": 41947528, "step": 2607 }, { "epoch": 0.18268582486187362, "grad_norm": 3.4683709144592285, "learning_rate": 8.174856742556918e-05, "loss": 0.8183, "num_input_tokens_seen": 41963912, "step": 2608 }, { "epoch": 0.18275587310760286, "grad_norm": 4.603201866149902, "learning_rate": 8.174156917688267e-05, "loss": 1.1339, "num_input_tokens_seen": 41979976, "step": 2609 }, { "epoch": 0.18282592135333212, "grad_norm": 3.9904422760009766, "learning_rate": 8.173457092819615e-05, "loss": 1.0661, "num_input_tokens_seen": 41996360, "step": 2610 }, { "epoch": 0.18289596959906135, "grad_norm": 6.363785743713379, "learning_rate": 8.172757267950963e-05, "loss": 0.9569, "num_input_tokens_seen": 42011712, "step": 2611 }, { "epoch": 0.1829660178447906, "grad_norm": 3.7257959842681885, "learning_rate": 8.172057443082312e-05, "loss": 1.0227, "num_input_tokens_seen": 42028096, "step": 2612 }, { "epoch": 0.18303606609051984, "grad_norm": 3.8486809730529785, "learning_rate": 8.171357618213661e-05, "loss": 1.0442, "num_input_tokens_seen": 42044480, "step": 2613 }, { "epoch": 0.1831061143362491, "grad_norm": 4.620292663574219, "learning_rate": 8.17065779334501e-05, "loss": 0.9917, "num_input_tokens_seen": 42060864, "step": 2614 }, { "epoch": 0.18317616258197833, "grad_norm": 3.52644944190979, "learning_rate": 8.169957968476359e-05, "loss": 1.1402, "num_input_tokens_seen": 42077072, "step": 2615 }, { "epoch": 0.1832462108277076, "grad_norm": 3.800718069076538, "learning_rate": 8.169258143607706e-05, "loss": 0.9864, "num_input_tokens_seen": 42093128, "step": 2616 }, { "epoch": 0.18331625907343682, "grad_norm": 3.9447405338287354, "learning_rate": 8.168558318739054e-05, "loss": 0.9923, "num_input_tokens_seen": 42109512, "step": 2617 }, { "epoch": 0.18338630731916608, "grad_norm": 4.109864234924316, "learning_rate": 8.167858493870403e-05, "loss": 0.9583, "num_input_tokens_seen": 42125776, "step": 2618 }, { "epoch": 0.1834563555648953, "grad_norm": 3.6538870334625244, "learning_rate": 8.167158669001752e-05, "loss": 1.0731, "num_input_tokens_seen": 42141760, "step": 2619 }, { "epoch": 0.18352640381062457, "grad_norm": 5.139223098754883, "learning_rate": 8.1664588441331e-05, "loss": 1.2108, "num_input_tokens_seen": 42157096, "step": 2620 }, { "epoch": 0.1835964520563538, "grad_norm": 4.420098781585693, "learning_rate": 8.165759019264449e-05, "loss": 1.0652, "num_input_tokens_seen": 42173480, "step": 2621 }, { "epoch": 0.18366650030208306, "grad_norm": 5.559954643249512, "learning_rate": 8.165059194395798e-05, "loss": 0.9224, "num_input_tokens_seen": 42188944, "step": 2622 }, { "epoch": 0.1837365485478123, "grad_norm": 3.827627420425415, "learning_rate": 8.164359369527146e-05, "loss": 0.9185, "num_input_tokens_seen": 42204952, "step": 2623 }, { "epoch": 0.18380659679354155, "grad_norm": 7.454338550567627, "learning_rate": 8.163659544658494e-05, "loss": 1.2441, "num_input_tokens_seen": 42221336, "step": 2624 }, { "epoch": 0.1838766450392708, "grad_norm": 4.34182071685791, "learning_rate": 8.162959719789842e-05, "loss": 1.122, "num_input_tokens_seen": 42237720, "step": 2625 }, { "epoch": 0.18394669328500005, "grad_norm": 3.9157843589782715, "learning_rate": 8.162259894921191e-05, "loss": 1.1206, "num_input_tokens_seen": 42253584, "step": 2626 }, { "epoch": 0.18401674153072928, "grad_norm": 3.952451467514038, "learning_rate": 8.16156007005254e-05, "loss": 1.2001, "num_input_tokens_seen": 42269968, "step": 2627 }, { "epoch": 0.18408678977645854, "grad_norm": 5.0545148849487305, "learning_rate": 8.160860245183889e-05, "loss": 1.0629, "num_input_tokens_seen": 42286232, "step": 2628 }, { "epoch": 0.18415683802218777, "grad_norm": 7.176907062530518, "learning_rate": 8.160160420315237e-05, "loss": 1.1248, "num_input_tokens_seen": 42302616, "step": 2629 }, { "epoch": 0.18422688626791703, "grad_norm": 3.994748830795288, "learning_rate": 8.159460595446585e-05, "loss": 0.9938, "num_input_tokens_seen": 42319000, "step": 2630 }, { "epoch": 0.18429693451364626, "grad_norm": 3.5744547843933105, "learning_rate": 8.158760770577934e-05, "loss": 1.0644, "num_input_tokens_seen": 42335384, "step": 2631 }, { "epoch": 0.18436698275937552, "grad_norm": 3.528723955154419, "learning_rate": 8.158060945709283e-05, "loss": 0.9955, "num_input_tokens_seen": 42351768, "step": 2632 }, { "epoch": 0.18443703100510478, "grad_norm": 3.9958291053771973, "learning_rate": 8.15736112084063e-05, "loss": 1.076, "num_input_tokens_seen": 42368152, "step": 2633 }, { "epoch": 0.184507079250834, "grad_norm": 4.1659650802612305, "learning_rate": 8.15666129597198e-05, "loss": 1.1427, "num_input_tokens_seen": 42384536, "step": 2634 }, { "epoch": 0.18457712749656327, "grad_norm": 5.116000652313232, "learning_rate": 8.155961471103328e-05, "loss": 1.1418, "num_input_tokens_seen": 42399704, "step": 2635 }, { "epoch": 0.1846471757422925, "grad_norm": 4.01514196395874, "learning_rate": 8.155261646234677e-05, "loss": 0.9521, "num_input_tokens_seen": 42416056, "step": 2636 }, { "epoch": 0.18471722398802176, "grad_norm": 4.290152072906494, "learning_rate": 8.154561821366024e-05, "loss": 1.06, "num_input_tokens_seen": 42431544, "step": 2637 }, { "epoch": 0.184787272233751, "grad_norm": 4.267684459686279, "learning_rate": 8.153861996497373e-05, "loss": 1.0247, "num_input_tokens_seen": 42447928, "step": 2638 }, { "epoch": 0.18485732047948025, "grad_norm": 3.593191385269165, "learning_rate": 8.153162171628722e-05, "loss": 0.9917, "num_input_tokens_seen": 42464312, "step": 2639 }, { "epoch": 0.18492736872520948, "grad_norm": 4.322700023651123, "learning_rate": 8.152462346760071e-05, "loss": 1.1686, "num_input_tokens_seen": 42480696, "step": 2640 }, { "epoch": 0.18499741697093874, "grad_norm": 4.176753520965576, "learning_rate": 8.15176252189142e-05, "loss": 1.046, "num_input_tokens_seen": 42496520, "step": 2641 }, { "epoch": 0.18506746521666798, "grad_norm": 4.405294895172119, "learning_rate": 8.151062697022769e-05, "loss": 1.0884, "num_input_tokens_seen": 42512904, "step": 2642 }, { "epoch": 0.18513751346239724, "grad_norm": 3.8770217895507812, "learning_rate": 8.150362872154116e-05, "loss": 1.124, "num_input_tokens_seen": 42529288, "step": 2643 }, { "epoch": 0.18520756170812647, "grad_norm": 3.5909271240234375, "learning_rate": 8.149663047285464e-05, "loss": 1.107, "num_input_tokens_seen": 42545672, "step": 2644 }, { "epoch": 0.18527760995385573, "grad_norm": 3.73958420753479, "learning_rate": 8.148963222416812e-05, "loss": 0.9943, "num_input_tokens_seen": 42562056, "step": 2645 }, { "epoch": 0.18534765819958496, "grad_norm": 3.6813879013061523, "learning_rate": 8.148263397548161e-05, "loss": 0.9861, "num_input_tokens_seen": 42577720, "step": 2646 }, { "epoch": 0.18541770644531422, "grad_norm": 4.13958215713501, "learning_rate": 8.14756357267951e-05, "loss": 1.0882, "num_input_tokens_seen": 42594104, "step": 2647 }, { "epoch": 0.18548775469104345, "grad_norm": 3.757805109024048, "learning_rate": 8.146863747810859e-05, "loss": 1.0872, "num_input_tokens_seen": 42610256, "step": 2648 }, { "epoch": 0.1855578029367727, "grad_norm": 4.57798957824707, "learning_rate": 8.146163922942208e-05, "loss": 0.9471, "num_input_tokens_seen": 42626424, "step": 2649 }, { "epoch": 0.18562785118250194, "grad_norm": 3.797257423400879, "learning_rate": 8.145464098073555e-05, "loss": 0.9336, "num_input_tokens_seen": 42642200, "step": 2650 }, { "epoch": 0.1856978994282312, "grad_norm": 4.258513450622559, "learning_rate": 8.144764273204904e-05, "loss": 1.1557, "num_input_tokens_seen": 42657416, "step": 2651 }, { "epoch": 0.18576794767396043, "grad_norm": 4.369161605834961, "learning_rate": 8.144064448336252e-05, "loss": 1.0013, "num_input_tokens_seen": 42673752, "step": 2652 }, { "epoch": 0.1858379959196897, "grad_norm": 4.159987926483154, "learning_rate": 8.1433646234676e-05, "loss": 1.151, "num_input_tokens_seen": 42690136, "step": 2653 }, { "epoch": 0.18590804416541892, "grad_norm": 7.164428234100342, "learning_rate": 8.14266479859895e-05, "loss": 1.1637, "num_input_tokens_seen": 42706520, "step": 2654 }, { "epoch": 0.18597809241114818, "grad_norm": 3.4230172634124756, "learning_rate": 8.141964973730298e-05, "loss": 0.9291, "num_input_tokens_seen": 42722904, "step": 2655 }, { "epoch": 0.18604814065687741, "grad_norm": 4.316817283630371, "learning_rate": 8.141265148861647e-05, "loss": 1.2645, "num_input_tokens_seen": 42738640, "step": 2656 }, { "epoch": 0.18611818890260667, "grad_norm": 3.7894318103790283, "learning_rate": 8.140565323992995e-05, "loss": 1.1287, "num_input_tokens_seen": 42754848, "step": 2657 }, { "epoch": 0.1861882371483359, "grad_norm": 4.198835849761963, "learning_rate": 8.139865499124344e-05, "loss": 1.1525, "num_input_tokens_seen": 42771232, "step": 2658 }, { "epoch": 0.18625828539406517, "grad_norm": 3.796414852142334, "learning_rate": 8.139165674255692e-05, "loss": 1.0313, "num_input_tokens_seen": 42787344, "step": 2659 }, { "epoch": 0.1863283336397944, "grad_norm": 3.6421244144439697, "learning_rate": 8.138465849387041e-05, "loss": 1.0497, "num_input_tokens_seen": 42803728, "step": 2660 }, { "epoch": 0.18639838188552366, "grad_norm": 4.391780376434326, "learning_rate": 8.13776602451839e-05, "loss": 1.0564, "num_input_tokens_seen": 42820112, "step": 2661 }, { "epoch": 0.1864684301312529, "grad_norm": 4.187370777130127, "learning_rate": 8.137066199649738e-05, "loss": 1.0289, "num_input_tokens_seen": 42836496, "step": 2662 }, { "epoch": 0.18653847837698215, "grad_norm": 3.794281244277954, "learning_rate": 8.136366374781086e-05, "loss": 1.172, "num_input_tokens_seen": 42852880, "step": 2663 }, { "epoch": 0.18660852662271138, "grad_norm": 4.386116981506348, "learning_rate": 8.135666549912434e-05, "loss": 1.1443, "num_input_tokens_seen": 42869264, "step": 2664 }, { "epoch": 0.18667857486844064, "grad_norm": 4.223747253417969, "learning_rate": 8.134966725043783e-05, "loss": 1.074, "num_input_tokens_seen": 42885528, "step": 2665 }, { "epoch": 0.1867486231141699, "grad_norm": 5.020680904388428, "learning_rate": 8.134266900175132e-05, "loss": 1.1927, "num_input_tokens_seen": 42901912, "step": 2666 }, { "epoch": 0.18681867135989913, "grad_norm": 8.58757495880127, "learning_rate": 8.13356707530648e-05, "loss": 1.1377, "num_input_tokens_seen": 42917072, "step": 2667 }, { "epoch": 0.1868887196056284, "grad_norm": 3.6986234188079834, "learning_rate": 8.13286725043783e-05, "loss": 1.0536, "num_input_tokens_seen": 42933296, "step": 2668 }, { "epoch": 0.18695876785135762, "grad_norm": 4.196423053741455, "learning_rate": 8.132167425569178e-05, "loss": 1.0484, "num_input_tokens_seen": 42948968, "step": 2669 }, { "epoch": 0.18702881609708688, "grad_norm": 4.019235610961914, "learning_rate": 8.131467600700526e-05, "loss": 1.1241, "num_input_tokens_seen": 42965352, "step": 2670 }, { "epoch": 0.1870988643428161, "grad_norm": 4.035778045654297, "learning_rate": 8.130767775831873e-05, "loss": 0.8962, "num_input_tokens_seen": 42980872, "step": 2671 }, { "epoch": 0.18716891258854537, "grad_norm": 4.193873882293701, "learning_rate": 8.130067950963222e-05, "loss": 1.0494, "num_input_tokens_seen": 42996848, "step": 2672 }, { "epoch": 0.1872389608342746, "grad_norm": 4.011183738708496, "learning_rate": 8.129368126094571e-05, "loss": 1.1151, "num_input_tokens_seen": 43012728, "step": 2673 }, { "epoch": 0.18730900908000386, "grad_norm": 5.662332534790039, "learning_rate": 8.12866830122592e-05, "loss": 1.1238, "num_input_tokens_seen": 43028728, "step": 2674 }, { "epoch": 0.1873790573257331, "grad_norm": 4.4699387550354, "learning_rate": 8.127968476357269e-05, "loss": 1.0712, "num_input_tokens_seen": 43044504, "step": 2675 }, { "epoch": 0.18744910557146235, "grad_norm": 3.857011556625366, "learning_rate": 8.127268651488618e-05, "loss": 0.9866, "num_input_tokens_seen": 43060496, "step": 2676 }, { "epoch": 0.1875191538171916, "grad_norm": 3.5458414554595947, "learning_rate": 8.126568826619965e-05, "loss": 0.9317, "num_input_tokens_seen": 43076880, "step": 2677 }, { "epoch": 0.18758920206292085, "grad_norm": 5.4007744789123535, "learning_rate": 8.125869001751314e-05, "loss": 1.2016, "num_input_tokens_seen": 43091368, "step": 2678 }, { "epoch": 0.18765925030865008, "grad_norm": 5.15717077255249, "learning_rate": 8.125169176882661e-05, "loss": 1.0662, "num_input_tokens_seen": 43107752, "step": 2679 }, { "epoch": 0.18772929855437934, "grad_norm": 4.891427516937256, "learning_rate": 8.124469352014012e-05, "loss": 1.1684, "num_input_tokens_seen": 43122808, "step": 2680 }, { "epoch": 0.18779934680010857, "grad_norm": 4.651966571807861, "learning_rate": 8.123769527145359e-05, "loss": 1.1457, "num_input_tokens_seen": 43139056, "step": 2681 }, { "epoch": 0.18786939504583783, "grad_norm": 3.844129800796509, "learning_rate": 8.123069702276708e-05, "loss": 0.9282, "num_input_tokens_seen": 43155440, "step": 2682 }, { "epoch": 0.18793944329156706, "grad_norm": 3.669360876083374, "learning_rate": 8.122369877408057e-05, "loss": 1.1418, "num_input_tokens_seen": 43171824, "step": 2683 }, { "epoch": 0.18800949153729632, "grad_norm": 3.6102185249328613, "learning_rate": 8.121670052539404e-05, "loss": 1.0786, "num_input_tokens_seen": 43188208, "step": 2684 }, { "epoch": 0.18807953978302555, "grad_norm": 3.593414783477783, "learning_rate": 8.120970227670753e-05, "loss": 0.9982, "num_input_tokens_seen": 43204248, "step": 2685 }, { "epoch": 0.1881495880287548, "grad_norm": 5.017848491668701, "learning_rate": 8.120270402802102e-05, "loss": 0.9573, "num_input_tokens_seen": 43219808, "step": 2686 }, { "epoch": 0.18821963627448404, "grad_norm": 4.083794593811035, "learning_rate": 8.119570577933451e-05, "loss": 1.0678, "num_input_tokens_seen": 43235712, "step": 2687 }, { "epoch": 0.1882896845202133, "grad_norm": 4.265167713165283, "learning_rate": 8.1188707530648e-05, "loss": 1.2967, "num_input_tokens_seen": 43252048, "step": 2688 }, { "epoch": 0.18835973276594253, "grad_norm": 4.24991512298584, "learning_rate": 8.118170928196147e-05, "loss": 1.0267, "num_input_tokens_seen": 43268152, "step": 2689 }, { "epoch": 0.1884297810116718, "grad_norm": 4.059658050537109, "learning_rate": 8.117471103327496e-05, "loss": 1.1356, "num_input_tokens_seen": 43284240, "step": 2690 }, { "epoch": 0.18849982925740102, "grad_norm": 4.807305812835693, "learning_rate": 8.116771278458844e-05, "loss": 1.0424, "num_input_tokens_seen": 43299368, "step": 2691 }, { "epoch": 0.18856987750313028, "grad_norm": 5.590726852416992, "learning_rate": 8.116071453590193e-05, "loss": 1.1008, "num_input_tokens_seen": 43315648, "step": 2692 }, { "epoch": 0.18863992574885952, "grad_norm": 5.114964485168457, "learning_rate": 8.115371628721541e-05, "loss": 0.8916, "num_input_tokens_seen": 43331688, "step": 2693 }, { "epoch": 0.18870997399458878, "grad_norm": 4.323836803436279, "learning_rate": 8.11467180385289e-05, "loss": 1.1858, "num_input_tokens_seen": 43346672, "step": 2694 }, { "epoch": 0.188780022240318, "grad_norm": 4.290014743804932, "learning_rate": 8.113971978984239e-05, "loss": 1.2498, "num_input_tokens_seen": 43362872, "step": 2695 }, { "epoch": 0.18885007048604727, "grad_norm": 3.5292484760284424, "learning_rate": 8.113272154115588e-05, "loss": 1.0045, "num_input_tokens_seen": 43379256, "step": 2696 }, { "epoch": 0.1889201187317765, "grad_norm": 4.21523380279541, "learning_rate": 8.112572329246935e-05, "loss": 1.0515, "num_input_tokens_seen": 43395152, "step": 2697 }, { "epoch": 0.18899016697750576, "grad_norm": 4.900782108306885, "learning_rate": 8.111872504378283e-05, "loss": 1.1038, "num_input_tokens_seen": 43411536, "step": 2698 }, { "epoch": 0.189060215223235, "grad_norm": 3.613231658935547, "learning_rate": 8.111172679509632e-05, "loss": 1.017, "num_input_tokens_seen": 43427920, "step": 2699 }, { "epoch": 0.18913026346896425, "grad_norm": 3.681725263595581, "learning_rate": 8.110472854640982e-05, "loss": 1.1396, "num_input_tokens_seen": 43444304, "step": 2700 }, { "epoch": 0.1892003117146935, "grad_norm": 3.801785707473755, "learning_rate": 8.10977302977233e-05, "loss": 0.9856, "num_input_tokens_seen": 43459960, "step": 2701 }, { "epoch": 0.18927035996042274, "grad_norm": 3.4208626747131348, "learning_rate": 8.109073204903678e-05, "loss": 1.0048, "num_input_tokens_seen": 43476344, "step": 2702 }, { "epoch": 0.189340408206152, "grad_norm": 4.169189453125, "learning_rate": 8.108373380035027e-05, "loss": 1.0014, "num_input_tokens_seen": 43492728, "step": 2703 }, { "epoch": 0.18941045645188123, "grad_norm": 3.7125117778778076, "learning_rate": 8.107673555166375e-05, "loss": 0.9707, "num_input_tokens_seen": 43508168, "step": 2704 }, { "epoch": 0.1894805046976105, "grad_norm": 4.550642490386963, "learning_rate": 8.106973730297724e-05, "loss": 1.0832, "num_input_tokens_seen": 43524480, "step": 2705 }, { "epoch": 0.18955055294333972, "grad_norm": 4.219499588012695, "learning_rate": 8.106273905429072e-05, "loss": 1.148, "num_input_tokens_seen": 43540864, "step": 2706 }, { "epoch": 0.18962060118906898, "grad_norm": 4.605996131896973, "learning_rate": 8.105574080560421e-05, "loss": 1.0564, "num_input_tokens_seen": 43557248, "step": 2707 }, { "epoch": 0.18969064943479821, "grad_norm": 3.740314245223999, "learning_rate": 8.104874255691769e-05, "loss": 1.0194, "num_input_tokens_seen": 43573632, "step": 2708 }, { "epoch": 0.18976069768052747, "grad_norm": 3.92555832862854, "learning_rate": 8.104174430823118e-05, "loss": 1.1663, "num_input_tokens_seen": 43589728, "step": 2709 }, { "epoch": 0.1898307459262567, "grad_norm": 3.5653927326202393, "learning_rate": 8.103474605954467e-05, "loss": 1.1165, "num_input_tokens_seen": 43606112, "step": 2710 }, { "epoch": 0.18990079417198596, "grad_norm": 5.943650245666504, "learning_rate": 8.102774781085814e-05, "loss": 1.309, "num_input_tokens_seen": 43621072, "step": 2711 }, { "epoch": 0.1899708424177152, "grad_norm": 3.7632322311401367, "learning_rate": 8.102074956217163e-05, "loss": 1.0963, "num_input_tokens_seen": 43636976, "step": 2712 }, { "epoch": 0.19004089066344446, "grad_norm": 3.605536699295044, "learning_rate": 8.101375131348512e-05, "loss": 0.9509, "num_input_tokens_seen": 43653360, "step": 2713 }, { "epoch": 0.1901109389091737, "grad_norm": 3.7717363834381104, "learning_rate": 8.10067530647986e-05, "loss": 0.9407, "num_input_tokens_seen": 43669488, "step": 2714 }, { "epoch": 0.19018098715490295, "grad_norm": 4.55484676361084, "learning_rate": 8.09997548161121e-05, "loss": 0.8501, "num_input_tokens_seen": 43684704, "step": 2715 }, { "epoch": 0.19025103540063218, "grad_norm": 4.155830383300781, "learning_rate": 8.099275656742557e-05, "loss": 0.9936, "num_input_tokens_seen": 43700112, "step": 2716 }, { "epoch": 0.19032108364636144, "grad_norm": 5.615505695343018, "learning_rate": 8.098575831873906e-05, "loss": 1.2055, "num_input_tokens_seen": 43716136, "step": 2717 }, { "epoch": 0.19039113189209067, "grad_norm": 4.60966157913208, "learning_rate": 8.097876007005253e-05, "loss": 1.0531, "num_input_tokens_seen": 43731576, "step": 2718 }, { "epoch": 0.19046118013781993, "grad_norm": 5.698062896728516, "learning_rate": 8.097176182136602e-05, "loss": 0.9692, "num_input_tokens_seen": 43747960, "step": 2719 }, { "epoch": 0.19053122838354916, "grad_norm": 3.760756492614746, "learning_rate": 8.096476357267952e-05, "loss": 0.9638, "num_input_tokens_seen": 43764304, "step": 2720 }, { "epoch": 0.19060127662927842, "grad_norm": 4.084067344665527, "learning_rate": 8.0957765323993e-05, "loss": 1.083, "num_input_tokens_seen": 43780688, "step": 2721 }, { "epoch": 0.19067132487500765, "grad_norm": 3.9934301376342773, "learning_rate": 8.095076707530649e-05, "loss": 0.9757, "num_input_tokens_seen": 43797072, "step": 2722 }, { "epoch": 0.1907413731207369, "grad_norm": 3.915512800216675, "learning_rate": 8.094376882661998e-05, "loss": 1.1031, "num_input_tokens_seen": 43813456, "step": 2723 }, { "epoch": 0.19081142136646614, "grad_norm": 3.967040777206421, "learning_rate": 8.093677057793345e-05, "loss": 0.9821, "num_input_tokens_seen": 43829656, "step": 2724 }, { "epoch": 0.1908814696121954, "grad_norm": 3.707667827606201, "learning_rate": 8.092977232924693e-05, "loss": 1.1489, "num_input_tokens_seen": 43846040, "step": 2725 }, { "epoch": 0.19095151785792464, "grad_norm": 3.3822734355926514, "learning_rate": 8.092277408056043e-05, "loss": 1.0051, "num_input_tokens_seen": 43862144, "step": 2726 }, { "epoch": 0.1910215661036539, "grad_norm": 3.7703781127929688, "learning_rate": 8.091577583187392e-05, "loss": 1.0363, "num_input_tokens_seen": 43878328, "step": 2727 }, { "epoch": 0.19109161434938313, "grad_norm": 3.902003049850464, "learning_rate": 8.090877758318739e-05, "loss": 1.0051, "num_input_tokens_seen": 43893480, "step": 2728 }, { "epoch": 0.19116166259511239, "grad_norm": 3.971395969390869, "learning_rate": 8.090177933450088e-05, "loss": 1.0469, "num_input_tokens_seen": 43909752, "step": 2729 }, { "epoch": 0.19123171084084162, "grad_norm": 3.4233641624450684, "learning_rate": 8.089478108581437e-05, "loss": 0.8821, "num_input_tokens_seen": 43926136, "step": 2730 }, { "epoch": 0.19130175908657088, "grad_norm": 5.967614650726318, "learning_rate": 8.088778283712784e-05, "loss": 1.1995, "num_input_tokens_seen": 43941592, "step": 2731 }, { "epoch": 0.1913718073323001, "grad_norm": 4.431912899017334, "learning_rate": 8.088078458844133e-05, "loss": 1.2471, "num_input_tokens_seen": 43957784, "step": 2732 }, { "epoch": 0.19144185557802937, "grad_norm": 3.659182071685791, "learning_rate": 8.087378633975482e-05, "loss": 0.9701, "num_input_tokens_seen": 43973648, "step": 2733 }, { "epoch": 0.1915119038237586, "grad_norm": 4.983634948730469, "learning_rate": 8.086678809106831e-05, "loss": 1.1023, "num_input_tokens_seen": 43990032, "step": 2734 }, { "epoch": 0.19158195206948786, "grad_norm": 4.236748695373535, "learning_rate": 8.085978984238179e-05, "loss": 1.0724, "num_input_tokens_seen": 44005064, "step": 2735 }, { "epoch": 0.19165200031521712, "grad_norm": 3.3617727756500244, "learning_rate": 8.085279159369527e-05, "loss": 0.9986, "num_input_tokens_seen": 44021448, "step": 2736 }, { "epoch": 0.19172204856094635, "grad_norm": 3.4514083862304688, "learning_rate": 8.084579334500876e-05, "loss": 0.8738, "num_input_tokens_seen": 44037832, "step": 2737 }, { "epoch": 0.1917920968066756, "grad_norm": 4.126194000244141, "learning_rate": 8.083879509632224e-05, "loss": 1.1142, "num_input_tokens_seen": 44053384, "step": 2738 }, { "epoch": 0.19186214505240484, "grad_norm": 5.12385368347168, "learning_rate": 8.083179684763573e-05, "loss": 1.251, "num_input_tokens_seen": 44068728, "step": 2739 }, { "epoch": 0.1919321932981341, "grad_norm": 3.457253932952881, "learning_rate": 8.082479859894923e-05, "loss": 0.8251, "num_input_tokens_seen": 44085112, "step": 2740 }, { "epoch": 0.19200224154386333, "grad_norm": 3.8708858489990234, "learning_rate": 8.08178003502627e-05, "loss": 1.1838, "num_input_tokens_seen": 44101456, "step": 2741 }, { "epoch": 0.1920722897895926, "grad_norm": 4.175468921661377, "learning_rate": 8.081080210157619e-05, "loss": 1.0062, "num_input_tokens_seen": 44116640, "step": 2742 }, { "epoch": 0.19214233803532182, "grad_norm": 4.141748428344727, "learning_rate": 8.080380385288967e-05, "loss": 1.1609, "num_input_tokens_seen": 44132328, "step": 2743 }, { "epoch": 0.19221238628105108, "grad_norm": 5.1061692237854, "learning_rate": 8.079680560420316e-05, "loss": 1.172, "num_input_tokens_seen": 44148712, "step": 2744 }, { "epoch": 0.19228243452678032, "grad_norm": 3.990196704864502, "learning_rate": 8.078980735551663e-05, "loss": 0.9997, "num_input_tokens_seen": 44164600, "step": 2745 }, { "epoch": 0.19235248277250958, "grad_norm": 4.365367412567139, "learning_rate": 8.078280910683013e-05, "loss": 1.0672, "num_input_tokens_seen": 44180984, "step": 2746 }, { "epoch": 0.1924225310182388, "grad_norm": 4.092031002044678, "learning_rate": 8.077581085814362e-05, "loss": 1.1405, "num_input_tokens_seen": 44196400, "step": 2747 }, { "epoch": 0.19249257926396807, "grad_norm": 3.4052438735961914, "learning_rate": 8.07688126094571e-05, "loss": 1.0128, "num_input_tokens_seen": 44212736, "step": 2748 }, { "epoch": 0.1925626275096973, "grad_norm": 4.703436374664307, "learning_rate": 8.076181436077059e-05, "loss": 1.2058, "num_input_tokens_seen": 44229120, "step": 2749 }, { "epoch": 0.19263267575542656, "grad_norm": 3.7579853534698486, "learning_rate": 8.075481611208407e-05, "loss": 0.9081, "num_input_tokens_seen": 44245144, "step": 2750 }, { "epoch": 0.1927027240011558, "grad_norm": 3.6251869201660156, "learning_rate": 8.074781786339755e-05, "loss": 0.9854, "num_input_tokens_seen": 44260920, "step": 2751 }, { "epoch": 0.19277277224688505, "grad_norm": 3.4949889183044434, "learning_rate": 8.074081961471104e-05, "loss": 1.1115, "num_input_tokens_seen": 44277280, "step": 2752 }, { "epoch": 0.19284282049261428, "grad_norm": 4.28520393371582, "learning_rate": 8.073382136602453e-05, "loss": 1.2536, "num_input_tokens_seen": 44293664, "step": 2753 }, { "epoch": 0.19291286873834354, "grad_norm": 3.9574859142303467, "learning_rate": 8.072682311733801e-05, "loss": 1.1584, "num_input_tokens_seen": 44309328, "step": 2754 }, { "epoch": 0.19298291698407277, "grad_norm": 3.6340646743774414, "learning_rate": 8.071982486865149e-05, "loss": 1.0116, "num_input_tokens_seen": 44325336, "step": 2755 }, { "epoch": 0.19305296522980203, "grad_norm": 5.131178855895996, "learning_rate": 8.071282661996498e-05, "loss": 1.1226, "num_input_tokens_seen": 44341264, "step": 2756 }, { "epoch": 0.19312301347553126, "grad_norm": 4.273870944976807, "learning_rate": 8.070582837127847e-05, "loss": 1.0953, "num_input_tokens_seen": 44357648, "step": 2757 }, { "epoch": 0.19319306172126052, "grad_norm": 3.883690118789673, "learning_rate": 8.069883012259194e-05, "loss": 1.2978, "num_input_tokens_seen": 44373984, "step": 2758 }, { "epoch": 0.19326310996698975, "grad_norm": 4.284129619598389, "learning_rate": 8.069183187390543e-05, "loss": 1.0356, "num_input_tokens_seen": 44389160, "step": 2759 }, { "epoch": 0.19333315821271901, "grad_norm": 4.517998695373535, "learning_rate": 8.068483362521892e-05, "loss": 1.0378, "num_input_tokens_seen": 44405544, "step": 2760 }, { "epoch": 0.19340320645844825, "grad_norm": 4.098707675933838, "learning_rate": 8.067783537653241e-05, "loss": 1.2235, "num_input_tokens_seen": 44421560, "step": 2761 }, { "epoch": 0.1934732547041775, "grad_norm": 3.656461477279663, "learning_rate": 8.067083712784588e-05, "loss": 0.8462, "num_input_tokens_seen": 44437944, "step": 2762 }, { "epoch": 0.19354330294990674, "grad_norm": 3.8305914402008057, "learning_rate": 8.066383887915937e-05, "loss": 1.1084, "num_input_tokens_seen": 44454208, "step": 2763 }, { "epoch": 0.193613351195636, "grad_norm": 4.0582990646362305, "learning_rate": 8.065684063047286e-05, "loss": 1.2152, "num_input_tokens_seen": 44470592, "step": 2764 }, { "epoch": 0.19368339944136523, "grad_norm": 4.159184455871582, "learning_rate": 8.064984238178633e-05, "loss": 1.0183, "num_input_tokens_seen": 44486976, "step": 2765 }, { "epoch": 0.1937534476870945, "grad_norm": 3.7490620613098145, "learning_rate": 8.064284413309984e-05, "loss": 1.0883, "num_input_tokens_seen": 44503360, "step": 2766 }, { "epoch": 0.19382349593282372, "grad_norm": 4.3000288009643555, "learning_rate": 8.063584588441333e-05, "loss": 1.2323, "num_input_tokens_seen": 44519744, "step": 2767 }, { "epoch": 0.19389354417855298, "grad_norm": 3.9175477027893066, "learning_rate": 8.06288476357268e-05, "loss": 0.8758, "num_input_tokens_seen": 44535664, "step": 2768 }, { "epoch": 0.1939635924242822, "grad_norm": 4.4328293800354, "learning_rate": 8.062184938704029e-05, "loss": 1.0173, "num_input_tokens_seen": 44550984, "step": 2769 }, { "epoch": 0.19403364067001147, "grad_norm": 4.556321620941162, "learning_rate": 8.061485113835376e-05, "loss": 1.1389, "num_input_tokens_seen": 44566808, "step": 2770 }, { "epoch": 0.19410368891574073, "grad_norm": 4.382159233093262, "learning_rate": 8.060785288966725e-05, "loss": 1.1211, "num_input_tokens_seen": 44583192, "step": 2771 }, { "epoch": 0.19417373716146996, "grad_norm": 3.920137405395508, "learning_rate": 8.060085464098074e-05, "loss": 0.9815, "num_input_tokens_seen": 44599480, "step": 2772 }, { "epoch": 0.19424378540719922, "grad_norm": 4.23013162612915, "learning_rate": 8.059385639229423e-05, "loss": 1.2268, "num_input_tokens_seen": 44615240, "step": 2773 }, { "epoch": 0.19431383365292845, "grad_norm": 3.7917346954345703, "learning_rate": 8.058685814360772e-05, "loss": 1.0935, "num_input_tokens_seen": 44630952, "step": 2774 }, { "epoch": 0.1943838818986577, "grad_norm": 4.798681259155273, "learning_rate": 8.05798598949212e-05, "loss": 1.1321, "num_input_tokens_seen": 44647336, "step": 2775 }, { "epoch": 0.19445393014438694, "grad_norm": 3.563124418258667, "learning_rate": 8.057286164623468e-05, "loss": 1.1231, "num_input_tokens_seen": 44663720, "step": 2776 }, { "epoch": 0.1945239783901162, "grad_norm": 6.6064019203186035, "learning_rate": 8.056586339754817e-05, "loss": 0.8685, "num_input_tokens_seen": 44679616, "step": 2777 }, { "epoch": 0.19459402663584544, "grad_norm": 4.1651291847229, "learning_rate": 8.055886514886165e-05, "loss": 1.1634, "num_input_tokens_seen": 44695800, "step": 2778 }, { "epoch": 0.1946640748815747, "grad_norm": 3.929474353790283, "learning_rate": 8.055186690017513e-05, "loss": 1.1127, "num_input_tokens_seen": 44711744, "step": 2779 }, { "epoch": 0.19473412312730393, "grad_norm": 3.758721351623535, "learning_rate": 8.054486865148862e-05, "loss": 0.9218, "num_input_tokens_seen": 44728128, "step": 2780 }, { "epoch": 0.19480417137303319, "grad_norm": 4.988550662994385, "learning_rate": 8.053787040280211e-05, "loss": 1.222, "num_input_tokens_seen": 44744512, "step": 2781 }, { "epoch": 0.19487421961876242, "grad_norm": 3.7875940799713135, "learning_rate": 8.053087215411559e-05, "loss": 1.0393, "num_input_tokens_seen": 44760896, "step": 2782 }, { "epoch": 0.19494426786449168, "grad_norm": 3.877729654312134, "learning_rate": 8.052387390542908e-05, "loss": 1.1748, "num_input_tokens_seen": 44777280, "step": 2783 }, { "epoch": 0.1950143161102209, "grad_norm": 4.979894161224365, "learning_rate": 8.051687565674256e-05, "loss": 1.1506, "num_input_tokens_seen": 44793664, "step": 2784 }, { "epoch": 0.19508436435595017, "grad_norm": 4.3148579597473145, "learning_rate": 8.050987740805604e-05, "loss": 1.1587, "num_input_tokens_seen": 44809688, "step": 2785 }, { "epoch": 0.1951544126016794, "grad_norm": 4.082404136657715, "learning_rate": 8.050287915936954e-05, "loss": 1.1488, "num_input_tokens_seen": 44825600, "step": 2786 }, { "epoch": 0.19522446084740866, "grad_norm": 3.6951189041137695, "learning_rate": 8.049588091068302e-05, "loss": 1.1542, "num_input_tokens_seen": 44841984, "step": 2787 }, { "epoch": 0.1952945090931379, "grad_norm": 3.797136068344116, "learning_rate": 8.04888826619965e-05, "loss": 0.964, "num_input_tokens_seen": 44858368, "step": 2788 }, { "epoch": 0.19536455733886715, "grad_norm": 3.8912811279296875, "learning_rate": 8.048188441330998e-05, "loss": 0.8985, "num_input_tokens_seen": 44873752, "step": 2789 }, { "epoch": 0.19543460558459638, "grad_norm": 4.355793476104736, "learning_rate": 8.047488616462347e-05, "loss": 1.1546, "num_input_tokens_seen": 44889336, "step": 2790 }, { "epoch": 0.19550465383032564, "grad_norm": 4.216153144836426, "learning_rate": 8.046788791593696e-05, "loss": 1.0922, "num_input_tokens_seen": 44905720, "step": 2791 }, { "epoch": 0.19557470207605487, "grad_norm": 3.5346696376800537, "learning_rate": 8.046088966725045e-05, "loss": 1.1628, "num_input_tokens_seen": 44921864, "step": 2792 }, { "epoch": 0.19564475032178413, "grad_norm": 4.2197489738464355, "learning_rate": 8.045389141856393e-05, "loss": 1.0177, "num_input_tokens_seen": 44938248, "step": 2793 }, { "epoch": 0.19571479856751337, "grad_norm": 3.66995906829834, "learning_rate": 8.044689316987742e-05, "loss": 1.0401, "num_input_tokens_seen": 44954632, "step": 2794 }, { "epoch": 0.19578484681324262, "grad_norm": 5.062297821044922, "learning_rate": 8.04398949211909e-05, "loss": 1.2106, "num_input_tokens_seen": 44971016, "step": 2795 }, { "epoch": 0.19585489505897186, "grad_norm": 4.473872661590576, "learning_rate": 8.043289667250439e-05, "loss": 1.153, "num_input_tokens_seen": 44987400, "step": 2796 }, { "epoch": 0.19592494330470112, "grad_norm": 4.724556922912598, "learning_rate": 8.042589842381786e-05, "loss": 1.23, "num_input_tokens_seen": 45002968, "step": 2797 }, { "epoch": 0.19599499155043035, "grad_norm": 4.324196815490723, "learning_rate": 8.041890017513135e-05, "loss": 0.8708, "num_input_tokens_seen": 45019352, "step": 2798 }, { "epoch": 0.1960650397961596, "grad_norm": 4.309204578399658, "learning_rate": 8.041190192644484e-05, "loss": 1.0769, "num_input_tokens_seen": 45034960, "step": 2799 }, { "epoch": 0.19613508804188884, "grad_norm": 3.4928808212280273, "learning_rate": 8.040490367775833e-05, "loss": 0.9394, "num_input_tokens_seen": 45051344, "step": 2800 }, { "epoch": 0.19613508804188884, "eval_loss": 1.1401225328445435, "eval_runtime": 0.185, "eval_samples_per_second": 5.405, "eval_steps_per_second": 5.405, "num_input_tokens_seen": 45051344, "step": 2800 }, { "epoch": 0.1962051362876181, "grad_norm": 6.388762474060059, "learning_rate": 8.039790542907182e-05, "loss": 1.047, "num_input_tokens_seen": 45066712, "step": 2801 }, { "epoch": 0.19627518453334733, "grad_norm": 3.8386781215667725, "learning_rate": 8.039090718038529e-05, "loss": 1.0248, "num_input_tokens_seen": 45082472, "step": 2802 }, { "epoch": 0.1963452327790766, "grad_norm": 3.540064573287964, "learning_rate": 8.038390893169878e-05, "loss": 0.846, "num_input_tokens_seen": 45098072, "step": 2803 }, { "epoch": 0.19641528102480582, "grad_norm": 3.9858322143554688, "learning_rate": 8.037691068301227e-05, "loss": 1.1443, "num_input_tokens_seen": 45114456, "step": 2804 }, { "epoch": 0.19648532927053508, "grad_norm": 4.418299674987793, "learning_rate": 8.036991243432574e-05, "loss": 1.0391, "num_input_tokens_seen": 45130416, "step": 2805 }, { "epoch": 0.19655537751626434, "grad_norm": 4.6108880043029785, "learning_rate": 8.036291418563923e-05, "loss": 0.9911, "num_input_tokens_seen": 45146800, "step": 2806 }, { "epoch": 0.19662542576199357, "grad_norm": 3.686781883239746, "learning_rate": 8.035591593695272e-05, "loss": 0.904, "num_input_tokens_seen": 45163016, "step": 2807 }, { "epoch": 0.19669547400772283, "grad_norm": 3.7459771633148193, "learning_rate": 8.034891768826621e-05, "loss": 1.0635, "num_input_tokens_seen": 45178912, "step": 2808 }, { "epoch": 0.19676552225345206, "grad_norm": 4.955589771270752, "learning_rate": 8.034191943957968e-05, "loss": 0.951, "num_input_tokens_seen": 45193928, "step": 2809 }, { "epoch": 0.19683557049918132, "grad_norm": 4.901642322540283, "learning_rate": 8.033492119089317e-05, "loss": 1.0751, "num_input_tokens_seen": 45209080, "step": 2810 }, { "epoch": 0.19690561874491055, "grad_norm": 3.685493230819702, "learning_rate": 8.032792294220666e-05, "loss": 1.0408, "num_input_tokens_seen": 45225400, "step": 2811 }, { "epoch": 0.19697566699063981, "grad_norm": 4.731873512268066, "learning_rate": 8.032092469352015e-05, "loss": 0.9684, "num_input_tokens_seen": 45241152, "step": 2812 }, { "epoch": 0.19704571523636905, "grad_norm": 4.52595853805542, "learning_rate": 8.031392644483364e-05, "loss": 1.142, "num_input_tokens_seen": 45256976, "step": 2813 }, { "epoch": 0.1971157634820983, "grad_norm": 4.4693074226379395, "learning_rate": 8.030692819614711e-05, "loss": 1.0846, "num_input_tokens_seen": 45273360, "step": 2814 }, { "epoch": 0.19718581172782754, "grad_norm": 5.599058151245117, "learning_rate": 8.02999299474606e-05, "loss": 1.1544, "num_input_tokens_seen": 45289744, "step": 2815 }, { "epoch": 0.1972558599735568, "grad_norm": 3.758751153945923, "learning_rate": 8.029293169877408e-05, "loss": 1.1877, "num_input_tokens_seen": 45305960, "step": 2816 }, { "epoch": 0.19732590821928603, "grad_norm": 4.059335231781006, "learning_rate": 8.028593345008757e-05, "loss": 1.0294, "num_input_tokens_seen": 45321536, "step": 2817 }, { "epoch": 0.1973959564650153, "grad_norm": 3.8090553283691406, "learning_rate": 8.027893520140105e-05, "loss": 1.1264, "num_input_tokens_seen": 45337920, "step": 2818 }, { "epoch": 0.19746600471074452, "grad_norm": 3.7900006771087646, "learning_rate": 8.027193695271454e-05, "loss": 1.2042, "num_input_tokens_seen": 45353632, "step": 2819 }, { "epoch": 0.19753605295647378, "grad_norm": 4.279977321624756, "learning_rate": 8.026493870402803e-05, "loss": 1.0786, "num_input_tokens_seen": 45369712, "step": 2820 }, { "epoch": 0.197606101202203, "grad_norm": 3.7417356967926025, "learning_rate": 8.025794045534152e-05, "loss": 1.0756, "num_input_tokens_seen": 45384816, "step": 2821 }, { "epoch": 0.19767614944793227, "grad_norm": 4.084759712219238, "learning_rate": 8.0250942206655e-05, "loss": 0.9187, "num_input_tokens_seen": 45401200, "step": 2822 }, { "epoch": 0.1977461976936615, "grad_norm": 4.963731288909912, "learning_rate": 8.024394395796848e-05, "loss": 1.2548, "num_input_tokens_seen": 45417096, "step": 2823 }, { "epoch": 0.19781624593939076, "grad_norm": 4.115303993225098, "learning_rate": 8.023694570928196e-05, "loss": 1.2127, "num_input_tokens_seen": 45433480, "step": 2824 }, { "epoch": 0.19788629418512, "grad_norm": 3.908439874649048, "learning_rate": 8.022994746059545e-05, "loss": 1.0171, "num_input_tokens_seen": 45448984, "step": 2825 }, { "epoch": 0.19795634243084925, "grad_norm": 4.0723090171813965, "learning_rate": 8.022294921190894e-05, "loss": 0.9883, "num_input_tokens_seen": 45465192, "step": 2826 }, { "epoch": 0.19802639067657848, "grad_norm": 4.219478607177734, "learning_rate": 8.021595096322242e-05, "loss": 1.109, "num_input_tokens_seen": 45480904, "step": 2827 }, { "epoch": 0.19809643892230774, "grad_norm": 4.246188163757324, "learning_rate": 8.020895271453591e-05, "loss": 1.3058, "num_input_tokens_seen": 45497288, "step": 2828 }, { "epoch": 0.19816648716803698, "grad_norm": 4.898525714874268, "learning_rate": 8.020195446584939e-05, "loss": 1.1058, "num_input_tokens_seen": 45513456, "step": 2829 }, { "epoch": 0.19823653541376623, "grad_norm": 4.1247239112854, "learning_rate": 8.019495621716288e-05, "loss": 1.031, "num_input_tokens_seen": 45528752, "step": 2830 }, { "epoch": 0.19830658365949547, "grad_norm": 4.352110385894775, "learning_rate": 8.018795796847636e-05, "loss": 1.3602, "num_input_tokens_seen": 45545136, "step": 2831 }, { "epoch": 0.19837663190522473, "grad_norm": 3.731719732284546, "learning_rate": 8.018095971978985e-05, "loss": 0.9833, "num_input_tokens_seen": 45561160, "step": 2832 }, { "epoch": 0.19844668015095396, "grad_norm": 4.234768867492676, "learning_rate": 8.017396147110333e-05, "loss": 1.2279, "num_input_tokens_seen": 45577288, "step": 2833 }, { "epoch": 0.19851672839668322, "grad_norm": 4.682285308837891, "learning_rate": 8.016696322241682e-05, "loss": 1.0376, "num_input_tokens_seen": 45593152, "step": 2834 }, { "epoch": 0.19858677664241245, "grad_norm": 4.576408863067627, "learning_rate": 8.01599649737303e-05, "loss": 1.225, "num_input_tokens_seen": 45609408, "step": 2835 }, { "epoch": 0.1986568248881417, "grad_norm": 4.209808826446533, "learning_rate": 8.015296672504378e-05, "loss": 1.0308, "num_input_tokens_seen": 45625792, "step": 2836 }, { "epoch": 0.19872687313387094, "grad_norm": 4.383143901824951, "learning_rate": 8.014596847635727e-05, "loss": 1.2079, "num_input_tokens_seen": 45642176, "step": 2837 }, { "epoch": 0.1987969213796002, "grad_norm": 4.105413913726807, "learning_rate": 8.013897022767076e-05, "loss": 1.0623, "num_input_tokens_seen": 45657480, "step": 2838 }, { "epoch": 0.19886696962532946, "grad_norm": 5.339532852172852, "learning_rate": 8.013197197898425e-05, "loss": 1.1131, "num_input_tokens_seen": 45673168, "step": 2839 }, { "epoch": 0.1989370178710587, "grad_norm": 3.8160016536712646, "learning_rate": 8.012497373029774e-05, "loss": 1.1392, "num_input_tokens_seen": 45689088, "step": 2840 }, { "epoch": 0.19900706611678795, "grad_norm": 3.763986587524414, "learning_rate": 8.011797548161121e-05, "loss": 1.1852, "num_input_tokens_seen": 45705472, "step": 2841 }, { "epoch": 0.19907711436251718, "grad_norm": 4.034756183624268, "learning_rate": 8.01109772329247e-05, "loss": 1.1856, "num_input_tokens_seen": 45721168, "step": 2842 }, { "epoch": 0.19914716260824644, "grad_norm": 3.971479654312134, "learning_rate": 8.010397898423817e-05, "loss": 1.1443, "num_input_tokens_seen": 45737312, "step": 2843 }, { "epoch": 0.19921721085397567, "grad_norm": 4.118296146392822, "learning_rate": 8.009698073555166e-05, "loss": 0.9964, "num_input_tokens_seen": 45752792, "step": 2844 }, { "epoch": 0.19928725909970493, "grad_norm": 3.628143310546875, "learning_rate": 8.008998248686515e-05, "loss": 1.1102, "num_input_tokens_seen": 45769008, "step": 2845 }, { "epoch": 0.19935730734543416, "grad_norm": 3.9946494102478027, "learning_rate": 8.008298423817864e-05, "loss": 1.1199, "num_input_tokens_seen": 45785392, "step": 2846 }, { "epoch": 0.19942735559116342, "grad_norm": 3.7445459365844727, "learning_rate": 8.007598598949213e-05, "loss": 1.1245, "num_input_tokens_seen": 45801320, "step": 2847 }, { "epoch": 0.19949740383689266, "grad_norm": 3.745481491088867, "learning_rate": 8.006898774080562e-05, "loss": 1.0969, "num_input_tokens_seen": 45817504, "step": 2848 }, { "epoch": 0.19956745208262192, "grad_norm": 4.1305766105651855, "learning_rate": 8.006198949211909e-05, "loss": 1.0953, "num_input_tokens_seen": 45833888, "step": 2849 }, { "epoch": 0.19963750032835115, "grad_norm": 3.7843470573425293, "learning_rate": 8.005499124343258e-05, "loss": 1.111, "num_input_tokens_seen": 45850272, "step": 2850 }, { "epoch": 0.1997075485740804, "grad_norm": 3.9884989261627197, "learning_rate": 8.004799299474606e-05, "loss": 1.083, "num_input_tokens_seen": 45866656, "step": 2851 }, { "epoch": 0.19977759681980964, "grad_norm": 3.7280545234680176, "learning_rate": 8.004099474605956e-05, "loss": 1.0036, "num_input_tokens_seen": 45882776, "step": 2852 }, { "epoch": 0.1998476450655389, "grad_norm": 5.151428699493408, "learning_rate": 8.003399649737303e-05, "loss": 1.2988, "num_input_tokens_seen": 45898520, "step": 2853 }, { "epoch": 0.19991769331126813, "grad_norm": 6.738519191741943, "learning_rate": 8.002699824868652e-05, "loss": 1.1934, "num_input_tokens_seen": 45914904, "step": 2854 }, { "epoch": 0.1999877415569974, "grad_norm": 4.689775466918945, "learning_rate": 8.002000000000001e-05, "loss": 1.3534, "num_input_tokens_seen": 45931288, "step": 2855 }, { "epoch": 0.20005778980272662, "grad_norm": 4.047792911529541, "learning_rate": 8.001300175131348e-05, "loss": 1.2926, "num_input_tokens_seen": 45947672, "step": 2856 }, { "epoch": 0.20012783804845588, "grad_norm": 4.609661102294922, "learning_rate": 8.000600350262697e-05, "loss": 1.0717, "num_input_tokens_seen": 45964056, "step": 2857 }, { "epoch": 0.2001978862941851, "grad_norm": 4.188840389251709, "learning_rate": 7.999900525394046e-05, "loss": 1.0872, "num_input_tokens_seen": 45980152, "step": 2858 }, { "epoch": 0.20026793453991437, "grad_norm": 3.558335781097412, "learning_rate": 7.999200700525395e-05, "loss": 1.1207, "num_input_tokens_seen": 45996536, "step": 2859 }, { "epoch": 0.2003379827856436, "grad_norm": 10.145834922790527, "learning_rate": 7.998500875656743e-05, "loss": 1.0649, "num_input_tokens_seen": 46011616, "step": 2860 }, { "epoch": 0.20040803103137286, "grad_norm": 5.534536838531494, "learning_rate": 7.997801050788091e-05, "loss": 1.3019, "num_input_tokens_seen": 46027016, "step": 2861 }, { "epoch": 0.2004780792771021, "grad_norm": 4.258336544036865, "learning_rate": 7.99710122591944e-05, "loss": 1.1192, "num_input_tokens_seen": 46043400, "step": 2862 }, { "epoch": 0.20054812752283135, "grad_norm": 5.266301155090332, "learning_rate": 7.996401401050788e-05, "loss": 1.0048, "num_input_tokens_seen": 46059784, "step": 2863 }, { "epoch": 0.20061817576856059, "grad_norm": 4.502764701843262, "learning_rate": 7.995701576182137e-05, "loss": 0.9435, "num_input_tokens_seen": 46075584, "step": 2864 }, { "epoch": 0.20068822401428985, "grad_norm": 4.39752197265625, "learning_rate": 7.995001751313485e-05, "loss": 0.9992, "num_input_tokens_seen": 46091520, "step": 2865 }, { "epoch": 0.20075827226001908, "grad_norm": 3.9562480449676514, "learning_rate": 7.994301926444834e-05, "loss": 0.9935, "num_input_tokens_seen": 46107568, "step": 2866 }, { "epoch": 0.20082832050574834, "grad_norm": 4.466681957244873, "learning_rate": 7.993602101576183e-05, "loss": 1.0067, "num_input_tokens_seen": 46123952, "step": 2867 }, { "epoch": 0.20089836875147757, "grad_norm": 3.9317095279693604, "learning_rate": 7.992902276707531e-05, "loss": 1.0353, "num_input_tokens_seen": 46140336, "step": 2868 }, { "epoch": 0.20096841699720683, "grad_norm": 5.025266170501709, "learning_rate": 7.99220245183888e-05, "loss": 1.1297, "num_input_tokens_seen": 46155504, "step": 2869 }, { "epoch": 0.20103846524293606, "grad_norm": 3.82340931892395, "learning_rate": 7.991502626970227e-05, "loss": 1.1677, "num_input_tokens_seen": 46171888, "step": 2870 }, { "epoch": 0.20110851348866532, "grad_norm": 4.017914295196533, "learning_rate": 7.990802802101576e-05, "loss": 1.0779, "num_input_tokens_seen": 46187712, "step": 2871 }, { "epoch": 0.20117856173439455, "grad_norm": 4.053089618682861, "learning_rate": 7.990102977232926e-05, "loss": 0.9687, "num_input_tokens_seen": 46202912, "step": 2872 }, { "epoch": 0.2012486099801238, "grad_norm": 3.5664076805114746, "learning_rate": 7.989403152364274e-05, "loss": 1.0047, "num_input_tokens_seen": 46219296, "step": 2873 }, { "epoch": 0.20131865822585307, "grad_norm": 4.039318084716797, "learning_rate": 7.988703327495623e-05, "loss": 1.107, "num_input_tokens_seen": 46235680, "step": 2874 }, { "epoch": 0.2013887064715823, "grad_norm": 3.8851678371429443, "learning_rate": 7.988003502626971e-05, "loss": 1.0268, "num_input_tokens_seen": 46251408, "step": 2875 }, { "epoch": 0.20145875471731156, "grad_norm": 3.581632137298584, "learning_rate": 7.987303677758319e-05, "loss": 0.9255, "num_input_tokens_seen": 46267696, "step": 2876 }, { "epoch": 0.2015288029630408, "grad_norm": 4.135960102081299, "learning_rate": 7.986603852889668e-05, "loss": 1.1763, "num_input_tokens_seen": 46284080, "step": 2877 }, { "epoch": 0.20159885120877005, "grad_norm": 3.649959087371826, "learning_rate": 7.985904028021017e-05, "loss": 0.8932, "num_input_tokens_seen": 46300456, "step": 2878 }, { "epoch": 0.20166889945449928, "grad_norm": 4.564159393310547, "learning_rate": 7.985204203152365e-05, "loss": 0.9239, "num_input_tokens_seen": 46315928, "step": 2879 }, { "epoch": 0.20173894770022854, "grad_norm": 3.806626796722412, "learning_rate": 7.984504378283713e-05, "loss": 1.0011, "num_input_tokens_seen": 46331520, "step": 2880 }, { "epoch": 0.20180899594595778, "grad_norm": 6.621458530426025, "learning_rate": 7.983804553415062e-05, "loss": 1.045, "num_input_tokens_seen": 46347904, "step": 2881 }, { "epoch": 0.20187904419168703, "grad_norm": 4.554089546203613, "learning_rate": 7.983104728546411e-05, "loss": 0.9472, "num_input_tokens_seen": 46364288, "step": 2882 }, { "epoch": 0.20194909243741627, "grad_norm": 4.206694602966309, "learning_rate": 7.982404903677758e-05, "loss": 1.1913, "num_input_tokens_seen": 46380672, "step": 2883 }, { "epoch": 0.20201914068314553, "grad_norm": 6.333064079284668, "learning_rate": 7.981705078809107e-05, "loss": 1.1189, "num_input_tokens_seen": 46396384, "step": 2884 }, { "epoch": 0.20208918892887476, "grad_norm": 3.6293835639953613, "learning_rate": 7.981005253940456e-05, "loss": 0.9825, "num_input_tokens_seen": 46412712, "step": 2885 }, { "epoch": 0.20215923717460402, "grad_norm": 6.282841682434082, "learning_rate": 7.980305429071805e-05, "loss": 1.0498, "num_input_tokens_seen": 46429096, "step": 2886 }, { "epoch": 0.20222928542033325, "grad_norm": 3.661564350128174, "learning_rate": 7.979605604203152e-05, "loss": 0.9022, "num_input_tokens_seen": 46445480, "step": 2887 }, { "epoch": 0.2022993336660625, "grad_norm": 4.232359409332275, "learning_rate": 7.978905779334501e-05, "loss": 1.3196, "num_input_tokens_seen": 46461344, "step": 2888 }, { "epoch": 0.20236938191179174, "grad_norm": 3.9777348041534424, "learning_rate": 7.97820595446585e-05, "loss": 1.1121, "num_input_tokens_seen": 46477728, "step": 2889 }, { "epoch": 0.202439430157521, "grad_norm": 4.221210479736328, "learning_rate": 7.977506129597197e-05, "loss": 1.1899, "num_input_tokens_seen": 46493680, "step": 2890 }, { "epoch": 0.20250947840325023, "grad_norm": 4.210818767547607, "learning_rate": 7.976806304728546e-05, "loss": 1.1003, "num_input_tokens_seen": 46510064, "step": 2891 }, { "epoch": 0.2025795266489795, "grad_norm": 5.012551307678223, "learning_rate": 7.976106479859895e-05, "loss": 0.9933, "num_input_tokens_seen": 46526448, "step": 2892 }, { "epoch": 0.20264957489470872, "grad_norm": 3.4867520332336426, "learning_rate": 7.975406654991244e-05, "loss": 0.8495, "num_input_tokens_seen": 46542832, "step": 2893 }, { "epoch": 0.20271962314043798, "grad_norm": 4.74222993850708, "learning_rate": 7.974706830122593e-05, "loss": 1.1398, "num_input_tokens_seen": 46559048, "step": 2894 }, { "epoch": 0.20278967138616721, "grad_norm": 5.358060359954834, "learning_rate": 7.97400700525394e-05, "loss": 1.0004, "num_input_tokens_seen": 46575400, "step": 2895 }, { "epoch": 0.20285971963189647, "grad_norm": 4.2599053382873535, "learning_rate": 7.973307180385289e-05, "loss": 1.0021, "num_input_tokens_seen": 46591064, "step": 2896 }, { "epoch": 0.2029297678776257, "grad_norm": 5.993118762969971, "learning_rate": 7.972607355516637e-05, "loss": 1.2017, "num_input_tokens_seen": 46606504, "step": 2897 }, { "epoch": 0.20299981612335496, "grad_norm": 4.129568576812744, "learning_rate": 7.971907530647987e-05, "loss": 1.2929, "num_input_tokens_seen": 46622400, "step": 2898 }, { "epoch": 0.2030698643690842, "grad_norm": 3.8486111164093018, "learning_rate": 7.971207705779336e-05, "loss": 1.0113, "num_input_tokens_seen": 46638752, "step": 2899 }, { "epoch": 0.20313991261481346, "grad_norm": 4.262311935424805, "learning_rate": 7.970507880910683e-05, "loss": 1.1222, "num_input_tokens_seen": 46655136, "step": 2900 }, { "epoch": 0.2032099608605427, "grad_norm": 4.065335750579834, "learning_rate": 7.969808056042032e-05, "loss": 1.2965, "num_input_tokens_seen": 46671520, "step": 2901 }, { "epoch": 0.20328000910627195, "grad_norm": 3.8313064575195312, "learning_rate": 7.969108231173381e-05, "loss": 1.1245, "num_input_tokens_seen": 46687904, "step": 2902 }, { "epoch": 0.20335005735200118, "grad_norm": 3.711580276489258, "learning_rate": 7.968408406304729e-05, "loss": 1.1688, "num_input_tokens_seen": 46704088, "step": 2903 }, { "epoch": 0.20342010559773044, "grad_norm": 4.172581672668457, "learning_rate": 7.967708581436077e-05, "loss": 1.1609, "num_input_tokens_seen": 46720360, "step": 2904 }, { "epoch": 0.20349015384345967, "grad_norm": 4.7567267417907715, "learning_rate": 7.967008756567426e-05, "loss": 1.169, "num_input_tokens_seen": 46735560, "step": 2905 }, { "epoch": 0.20356020208918893, "grad_norm": 4.304897308349609, "learning_rate": 7.966308931698775e-05, "loss": 0.9359, "num_input_tokens_seen": 46751720, "step": 2906 }, { "epoch": 0.20363025033491816, "grad_norm": 4.0556864738464355, "learning_rate": 7.965609106830123e-05, "loss": 1.0763, "num_input_tokens_seen": 46767432, "step": 2907 }, { "epoch": 0.20370029858064742, "grad_norm": 3.7381911277770996, "learning_rate": 7.964909281961472e-05, "loss": 1.0158, "num_input_tokens_seen": 46783488, "step": 2908 }, { "epoch": 0.20377034682637668, "grad_norm": 4.363048553466797, "learning_rate": 7.96420945709282e-05, "loss": 0.9627, "num_input_tokens_seen": 46799016, "step": 2909 }, { "epoch": 0.2038403950721059, "grad_norm": 4.04617166519165, "learning_rate": 7.963509632224168e-05, "loss": 1.1312, "num_input_tokens_seen": 46815400, "step": 2910 }, { "epoch": 0.20391044331783517, "grad_norm": 3.8854830265045166, "learning_rate": 7.962809807355517e-05, "loss": 1.0525, "num_input_tokens_seen": 46831784, "step": 2911 }, { "epoch": 0.2039804915635644, "grad_norm": 4.197749614715576, "learning_rate": 7.962109982486866e-05, "loss": 1.0839, "num_input_tokens_seen": 46848168, "step": 2912 }, { "epoch": 0.20405053980929366, "grad_norm": 4.414098739624023, "learning_rate": 7.961410157618214e-05, "loss": 1.1576, "num_input_tokens_seen": 46864552, "step": 2913 }, { "epoch": 0.2041205880550229, "grad_norm": 3.7771573066711426, "learning_rate": 7.960710332749562e-05, "loss": 0.9597, "num_input_tokens_seen": 46880936, "step": 2914 }, { "epoch": 0.20419063630075215, "grad_norm": 4.179026126861572, "learning_rate": 7.960010507880911e-05, "loss": 1.0754, "num_input_tokens_seen": 46897192, "step": 2915 }, { "epoch": 0.20426068454648139, "grad_norm": 4.017509460449219, "learning_rate": 7.95931068301226e-05, "loss": 1.0476, "num_input_tokens_seen": 46913576, "step": 2916 }, { "epoch": 0.20433073279221065, "grad_norm": 5.863056182861328, "learning_rate": 7.958610858143607e-05, "loss": 1.235, "num_input_tokens_seen": 46929960, "step": 2917 }, { "epoch": 0.20440078103793988, "grad_norm": 5.267307281494141, "learning_rate": 7.957911033274956e-05, "loss": 1.2414, "num_input_tokens_seen": 46946344, "step": 2918 }, { "epoch": 0.20447082928366914, "grad_norm": 5.20788049697876, "learning_rate": 7.957211208406306e-05, "loss": 1.1215, "num_input_tokens_seen": 46961712, "step": 2919 }, { "epoch": 0.20454087752939837, "grad_norm": 4.609791278839111, "learning_rate": 7.956511383537654e-05, "loss": 1.0219, "num_input_tokens_seen": 46977752, "step": 2920 }, { "epoch": 0.20461092577512763, "grad_norm": 3.9752824306488037, "learning_rate": 7.955811558669003e-05, "loss": 1.1427, "num_input_tokens_seen": 46994136, "step": 2921 }, { "epoch": 0.20468097402085686, "grad_norm": 3.8456339836120605, "learning_rate": 7.95511173380035e-05, "loss": 1.1006, "num_input_tokens_seen": 47010520, "step": 2922 }, { "epoch": 0.20475102226658612, "grad_norm": 4.087759494781494, "learning_rate": 7.954411908931699e-05, "loss": 1.0535, "num_input_tokens_seen": 47026904, "step": 2923 }, { "epoch": 0.20482107051231535, "grad_norm": 3.9754104614257812, "learning_rate": 7.953712084063048e-05, "loss": 1.0334, "num_input_tokens_seen": 47043288, "step": 2924 }, { "epoch": 0.2048911187580446, "grad_norm": 3.61798357963562, "learning_rate": 7.953012259194397e-05, "loss": 1.1471, "num_input_tokens_seen": 47059672, "step": 2925 }, { "epoch": 0.20496116700377384, "grad_norm": 4.015439510345459, "learning_rate": 7.952312434325746e-05, "loss": 1.0836, "num_input_tokens_seen": 47074232, "step": 2926 }, { "epoch": 0.2050312152495031, "grad_norm": 5.869642734527588, "learning_rate": 7.951612609457093e-05, "loss": 1.275, "num_input_tokens_seen": 47090616, "step": 2927 }, { "epoch": 0.20510126349523233, "grad_norm": 4.0500922203063965, "learning_rate": 7.950912784588442e-05, "loss": 1.1142, "num_input_tokens_seen": 47106656, "step": 2928 }, { "epoch": 0.2051713117409616, "grad_norm": 5.468737602233887, "learning_rate": 7.950212959719791e-05, "loss": 1.2679, "num_input_tokens_seen": 47122648, "step": 2929 }, { "epoch": 0.20524135998669082, "grad_norm": 3.842905282974243, "learning_rate": 7.949513134851138e-05, "loss": 1.0889, "num_input_tokens_seen": 47139032, "step": 2930 }, { "epoch": 0.20531140823242008, "grad_norm": 4.24273681640625, "learning_rate": 7.948813309982487e-05, "loss": 1.0533, "num_input_tokens_seen": 47154344, "step": 2931 }, { "epoch": 0.20538145647814932, "grad_norm": 3.977433443069458, "learning_rate": 7.948113485113836e-05, "loss": 0.9184, "num_input_tokens_seen": 47170728, "step": 2932 }, { "epoch": 0.20545150472387858, "grad_norm": 3.8441646099090576, "learning_rate": 7.947413660245185e-05, "loss": 1.1266, "num_input_tokens_seen": 47187112, "step": 2933 }, { "epoch": 0.2055215529696078, "grad_norm": 3.3789381980895996, "learning_rate": 7.946713835376532e-05, "loss": 0.9244, "num_input_tokens_seen": 47203400, "step": 2934 }, { "epoch": 0.20559160121533707, "grad_norm": 3.817631483078003, "learning_rate": 7.946014010507881e-05, "loss": 1.198, "num_input_tokens_seen": 47219784, "step": 2935 }, { "epoch": 0.2056616494610663, "grad_norm": 3.788300037384033, "learning_rate": 7.94531418563923e-05, "loss": 1.1565, "num_input_tokens_seen": 47236168, "step": 2936 }, { "epoch": 0.20573169770679556, "grad_norm": 3.852132558822632, "learning_rate": 7.944614360770578e-05, "loss": 1.1259, "num_input_tokens_seen": 47252288, "step": 2937 }, { "epoch": 0.2058017459525248, "grad_norm": 3.8631093502044678, "learning_rate": 7.943914535901926e-05, "loss": 1.091, "num_input_tokens_seen": 47267000, "step": 2938 }, { "epoch": 0.20587179419825405, "grad_norm": 3.72165846824646, "learning_rate": 7.943214711033275e-05, "loss": 0.7975, "num_input_tokens_seen": 47282832, "step": 2939 }, { "epoch": 0.20594184244398328, "grad_norm": 4.04188871383667, "learning_rate": 7.942514886164624e-05, "loss": 1.0953, "num_input_tokens_seen": 47298320, "step": 2940 }, { "epoch": 0.20601189068971254, "grad_norm": 3.5907206535339355, "learning_rate": 7.941815061295972e-05, "loss": 0.9766, "num_input_tokens_seen": 47314704, "step": 2941 }, { "epoch": 0.20608193893544177, "grad_norm": 5.023667335510254, "learning_rate": 7.94111523642732e-05, "loss": 1.2083, "num_input_tokens_seen": 47331088, "step": 2942 }, { "epoch": 0.20615198718117103, "grad_norm": 3.8885724544525146, "learning_rate": 7.94041541155867e-05, "loss": 0.9374, "num_input_tokens_seen": 47347424, "step": 2943 }, { "epoch": 0.2062220354269003, "grad_norm": 4.289493560791016, "learning_rate": 7.939715586690017e-05, "loss": 1.0399, "num_input_tokens_seen": 47363808, "step": 2944 }, { "epoch": 0.20629208367262952, "grad_norm": 4.976572513580322, "learning_rate": 7.939015761821367e-05, "loss": 0.8901, "num_input_tokens_seen": 47379152, "step": 2945 }, { "epoch": 0.20636213191835878, "grad_norm": 4.0893425941467285, "learning_rate": 7.938315936952716e-05, "loss": 1.0622, "num_input_tokens_seen": 47395536, "step": 2946 }, { "epoch": 0.206432180164088, "grad_norm": 3.799873113632202, "learning_rate": 7.937616112084063e-05, "loss": 1.1433, "num_input_tokens_seen": 47410968, "step": 2947 }, { "epoch": 0.20650222840981727, "grad_norm": 4.688945293426514, "learning_rate": 7.936916287215412e-05, "loss": 1.1424, "num_input_tokens_seen": 47427352, "step": 2948 }, { "epoch": 0.2065722766555465, "grad_norm": 3.6503846645355225, "learning_rate": 7.93621646234676e-05, "loss": 0.9236, "num_input_tokens_seen": 47443736, "step": 2949 }, { "epoch": 0.20664232490127576, "grad_norm": 4.2314324378967285, "learning_rate": 7.935516637478109e-05, "loss": 1.2795, "num_input_tokens_seen": 47460120, "step": 2950 }, { "epoch": 0.206712373147005, "grad_norm": 5.159674644470215, "learning_rate": 7.934816812609458e-05, "loss": 0.8852, "num_input_tokens_seen": 47476256, "step": 2951 }, { "epoch": 0.20678242139273426, "grad_norm": 3.798804759979248, "learning_rate": 7.934116987740806e-05, "loss": 1.1161, "num_input_tokens_seen": 47492208, "step": 2952 }, { "epoch": 0.2068524696384635, "grad_norm": 4.233975887298584, "learning_rate": 7.933417162872155e-05, "loss": 1.0927, "num_input_tokens_seen": 47507728, "step": 2953 }, { "epoch": 0.20692251788419275, "grad_norm": 3.38350772857666, "learning_rate": 7.932717338003503e-05, "loss": 1.0429, "num_input_tokens_seen": 47523992, "step": 2954 }, { "epoch": 0.20699256612992198, "grad_norm": 3.94380521774292, "learning_rate": 7.932017513134852e-05, "loss": 0.9227, "num_input_tokens_seen": 47540376, "step": 2955 }, { "epoch": 0.20706261437565124, "grad_norm": 3.887354612350464, "learning_rate": 7.9313176882662e-05, "loss": 0.9709, "num_input_tokens_seen": 47555336, "step": 2956 }, { "epoch": 0.20713266262138047, "grad_norm": 4.271602153778076, "learning_rate": 7.930617863397548e-05, "loss": 1.3089, "num_input_tokens_seen": 47570520, "step": 2957 }, { "epoch": 0.20720271086710973, "grad_norm": 4.119933605194092, "learning_rate": 7.929918038528897e-05, "loss": 1.0162, "num_input_tokens_seen": 47586904, "step": 2958 }, { "epoch": 0.20727275911283896, "grad_norm": 6.137136936187744, "learning_rate": 7.929218213660246e-05, "loss": 0.7847, "num_input_tokens_seen": 47602424, "step": 2959 }, { "epoch": 0.20734280735856822, "grad_norm": 3.5264923572540283, "learning_rate": 7.928518388791595e-05, "loss": 1.0751, "num_input_tokens_seen": 47618808, "step": 2960 }, { "epoch": 0.20741285560429745, "grad_norm": 4.183988094329834, "learning_rate": 7.927818563922942e-05, "loss": 1.1901, "num_input_tokens_seen": 47634576, "step": 2961 }, { "epoch": 0.2074829038500267, "grad_norm": 3.486311197280884, "learning_rate": 7.927118739054291e-05, "loss": 0.8559, "num_input_tokens_seen": 47649920, "step": 2962 }, { "epoch": 0.20755295209575594, "grad_norm": 4.561336994171143, "learning_rate": 7.92641891418564e-05, "loss": 0.9521, "num_input_tokens_seen": 47666304, "step": 2963 }, { "epoch": 0.2076230003414852, "grad_norm": 4.002289295196533, "learning_rate": 7.925719089316987e-05, "loss": 1.1708, "num_input_tokens_seen": 47682688, "step": 2964 }, { "epoch": 0.20769304858721443, "grad_norm": 3.694175958633423, "learning_rate": 7.925019264448338e-05, "loss": 0.9635, "num_input_tokens_seen": 47699072, "step": 2965 }, { "epoch": 0.2077630968329437, "grad_norm": 3.7827298641204834, "learning_rate": 7.924319439579685e-05, "loss": 1.0921, "num_input_tokens_seen": 47714720, "step": 2966 }, { "epoch": 0.20783314507867293, "grad_norm": 3.8371527194976807, "learning_rate": 7.923619614711034e-05, "loss": 1.12, "num_input_tokens_seen": 47730904, "step": 2967 }, { "epoch": 0.20790319332440219, "grad_norm": 4.20089054107666, "learning_rate": 7.922919789842381e-05, "loss": 1.0999, "num_input_tokens_seen": 47747288, "step": 2968 }, { "epoch": 0.20797324157013142, "grad_norm": 3.978065252304077, "learning_rate": 7.92221996497373e-05, "loss": 1.0472, "num_input_tokens_seen": 47763672, "step": 2969 }, { "epoch": 0.20804328981586068, "grad_norm": 4.882012844085693, "learning_rate": 7.921520140105079e-05, "loss": 1.0838, "num_input_tokens_seen": 47778888, "step": 2970 }, { "epoch": 0.2081133380615899, "grad_norm": 4.202088356018066, "learning_rate": 7.920820315236428e-05, "loss": 1.178, "num_input_tokens_seen": 47795272, "step": 2971 }, { "epoch": 0.20818338630731917, "grad_norm": 3.623647928237915, "learning_rate": 7.920120490367777e-05, "loss": 0.9782, "num_input_tokens_seen": 47811656, "step": 2972 }, { "epoch": 0.2082534345530484, "grad_norm": 4.158148765563965, "learning_rate": 7.919420665499126e-05, "loss": 1.0585, "num_input_tokens_seen": 47827520, "step": 2973 }, { "epoch": 0.20832348279877766, "grad_norm": 4.016353130340576, "learning_rate": 7.918720840630473e-05, "loss": 1.0176, "num_input_tokens_seen": 47843904, "step": 2974 }, { "epoch": 0.2083935310445069, "grad_norm": 5.862729072570801, "learning_rate": 7.918021015761822e-05, "loss": 1.0233, "num_input_tokens_seen": 47860288, "step": 2975 }, { "epoch": 0.20846357929023615, "grad_norm": 4.194519519805908, "learning_rate": 7.91732119089317e-05, "loss": 1.13, "num_input_tokens_seen": 47876536, "step": 2976 }, { "epoch": 0.20853362753596538, "grad_norm": 3.925144672393799, "learning_rate": 7.916621366024518e-05, "loss": 1.0069, "num_input_tokens_seen": 47892216, "step": 2977 }, { "epoch": 0.20860367578169464, "grad_norm": 4.005881309509277, "learning_rate": 7.915921541155867e-05, "loss": 1.1126, "num_input_tokens_seen": 47907840, "step": 2978 }, { "epoch": 0.2086737240274239, "grad_norm": 3.6061627864837646, "learning_rate": 7.915221716287216e-05, "loss": 0.8235, "num_input_tokens_seen": 47923832, "step": 2979 }, { "epoch": 0.20874377227315313, "grad_norm": 4.407896041870117, "learning_rate": 7.914521891418565e-05, "loss": 0.962, "num_input_tokens_seen": 47940216, "step": 2980 }, { "epoch": 0.2088138205188824, "grad_norm": 4.089472770690918, "learning_rate": 7.913822066549912e-05, "loss": 1.0691, "num_input_tokens_seen": 47956600, "step": 2981 }, { "epoch": 0.20888386876461162, "grad_norm": 4.384250640869141, "learning_rate": 7.913122241681261e-05, "loss": 1.1681, "num_input_tokens_seen": 47972984, "step": 2982 }, { "epoch": 0.20895391701034088, "grad_norm": 3.881756544113159, "learning_rate": 7.91242241681261e-05, "loss": 1.1473, "num_input_tokens_seen": 47989368, "step": 2983 }, { "epoch": 0.20902396525607012, "grad_norm": 3.9435884952545166, "learning_rate": 7.911722591943958e-05, "loss": 1.0328, "num_input_tokens_seen": 48005608, "step": 2984 }, { "epoch": 0.20909401350179938, "grad_norm": 4.1196794509887695, "learning_rate": 7.911022767075308e-05, "loss": 1.0287, "num_input_tokens_seen": 48021992, "step": 2985 }, { "epoch": 0.2091640617475286, "grad_norm": 4.482571125030518, "learning_rate": 7.910322942206655e-05, "loss": 1.0663, "num_input_tokens_seen": 48037816, "step": 2986 }, { "epoch": 0.20923410999325787, "grad_norm": 5.359109401702881, "learning_rate": 7.909623117338004e-05, "loss": 1.2157, "num_input_tokens_seen": 48054200, "step": 2987 }, { "epoch": 0.2093041582389871, "grad_norm": 5.712708950042725, "learning_rate": 7.908923292469352e-05, "loss": 1.1454, "num_input_tokens_seen": 48070008, "step": 2988 }, { "epoch": 0.20937420648471636, "grad_norm": 3.980526924133301, "learning_rate": 7.9082234676007e-05, "loss": 1.1933, "num_input_tokens_seen": 48084864, "step": 2989 }, { "epoch": 0.2094442547304456, "grad_norm": 4.963679790496826, "learning_rate": 7.90752364273205e-05, "loss": 1.1465, "num_input_tokens_seen": 48101248, "step": 2990 }, { "epoch": 0.20951430297617485, "grad_norm": 6.20939302444458, "learning_rate": 7.906823817863398e-05, "loss": 1.1187, "num_input_tokens_seen": 48114984, "step": 2991 }, { "epoch": 0.20958435122190408, "grad_norm": 13.218465805053711, "learning_rate": 7.906123992994747e-05, "loss": 1.0589, "num_input_tokens_seen": 48129704, "step": 2992 }, { "epoch": 0.20965439946763334, "grad_norm": 6.285522937774658, "learning_rate": 7.905424168126095e-05, "loss": 1.0993, "num_input_tokens_seen": 48144280, "step": 2993 }, { "epoch": 0.20972444771336257, "grad_norm": 5.113750457763672, "learning_rate": 7.904724343257444e-05, "loss": 1.0187, "num_input_tokens_seen": 48160664, "step": 2994 }, { "epoch": 0.20979449595909183, "grad_norm": 3.5571322441101074, "learning_rate": 7.904024518388791e-05, "loss": 0.9789, "num_input_tokens_seen": 48177048, "step": 2995 }, { "epoch": 0.20986454420482106, "grad_norm": 4.965229511260986, "learning_rate": 7.90332469352014e-05, "loss": 1.0934, "num_input_tokens_seen": 48193400, "step": 2996 }, { "epoch": 0.20993459245055032, "grad_norm": 4.466450214385986, "learning_rate": 7.902624868651489e-05, "loss": 1.2786, "num_input_tokens_seen": 48209784, "step": 2997 }, { "epoch": 0.21000464069627955, "grad_norm": 3.556642770767212, "learning_rate": 7.901925043782838e-05, "loss": 1.0579, "num_input_tokens_seen": 48226096, "step": 2998 }, { "epoch": 0.2100746889420088, "grad_norm": 5.175073146820068, "learning_rate": 7.901225218914187e-05, "loss": 1.0822, "num_input_tokens_seen": 48242384, "step": 2999 }, { "epoch": 0.21014473718773805, "grad_norm": 4.901797771453857, "learning_rate": 7.900525394045535e-05, "loss": 0.9413, "num_input_tokens_seen": 48257944, "step": 3000 }, { "epoch": 0.21014473718773805, "eval_loss": 1.137844204902649, "eval_runtime": 0.2151, "eval_samples_per_second": 4.649, "eval_steps_per_second": 4.649, "num_input_tokens_seen": 48257944, "step": 3000 }, { "epoch": 0.2102147854334673, "grad_norm": 3.8474860191345215, "learning_rate": 7.899825569176883e-05, "loss": 0.9454, "num_input_tokens_seen": 48273144, "step": 3001 }, { "epoch": 0.21028483367919654, "grad_norm": 4.4164347648620605, "learning_rate": 7.899125744308232e-05, "loss": 1.2554, "num_input_tokens_seen": 48288896, "step": 3002 }, { "epoch": 0.2103548819249258, "grad_norm": 4.560143947601318, "learning_rate": 7.898425919439579e-05, "loss": 1.1129, "num_input_tokens_seen": 48305168, "step": 3003 }, { "epoch": 0.21042493017065503, "grad_norm": 4.310809135437012, "learning_rate": 7.897726094570928e-05, "loss": 1.1215, "num_input_tokens_seen": 48320936, "step": 3004 }, { "epoch": 0.2104949784163843, "grad_norm": 5.8606367111206055, "learning_rate": 7.897026269702277e-05, "loss": 0.7859, "num_input_tokens_seen": 48334752, "step": 3005 }, { "epoch": 0.21056502666211352, "grad_norm": 4.533644676208496, "learning_rate": 7.896326444833626e-05, "loss": 1.3134, "num_input_tokens_seen": 48351136, "step": 3006 }, { "epoch": 0.21063507490784278, "grad_norm": 3.955151081085205, "learning_rate": 7.895626619964975e-05, "loss": 1.3093, "num_input_tokens_seen": 48367520, "step": 3007 }, { "epoch": 0.210705123153572, "grad_norm": 4.857527732849121, "learning_rate": 7.894926795096322e-05, "loss": 0.9838, "num_input_tokens_seen": 48383584, "step": 3008 }, { "epoch": 0.21077517139930127, "grad_norm": 4.2091593742370605, "learning_rate": 7.894226970227671e-05, "loss": 0.9278, "num_input_tokens_seen": 48399968, "step": 3009 }, { "epoch": 0.2108452196450305, "grad_norm": 4.02255916595459, "learning_rate": 7.89352714535902e-05, "loss": 1.086, "num_input_tokens_seen": 48416016, "step": 3010 }, { "epoch": 0.21091526789075976, "grad_norm": 4.021467208862305, "learning_rate": 7.892827320490369e-05, "loss": 1.1088, "num_input_tokens_seen": 48432400, "step": 3011 }, { "epoch": 0.21098531613648902, "grad_norm": 4.211849212646484, "learning_rate": 7.892127495621716e-05, "loss": 1.1698, "num_input_tokens_seen": 48448784, "step": 3012 }, { "epoch": 0.21105536438221825, "grad_norm": 3.890512704849243, "learning_rate": 7.891427670753065e-05, "loss": 1.1048, "num_input_tokens_seen": 48465168, "step": 3013 }, { "epoch": 0.2111254126279475, "grad_norm": 3.9605376720428467, "learning_rate": 7.890727845884414e-05, "loss": 0.9904, "num_input_tokens_seen": 48481024, "step": 3014 }, { "epoch": 0.21119546087367674, "grad_norm": 3.6985483169555664, "learning_rate": 7.890028021015761e-05, "loss": 1.1033, "num_input_tokens_seen": 48497408, "step": 3015 }, { "epoch": 0.211265509119406, "grad_norm": 4.245354652404785, "learning_rate": 7.88932819614711e-05, "loss": 1.0609, "num_input_tokens_seen": 48513640, "step": 3016 }, { "epoch": 0.21133555736513523, "grad_norm": 4.163609027862549, "learning_rate": 7.888628371278459e-05, "loss": 1.2399, "num_input_tokens_seen": 48529704, "step": 3017 }, { "epoch": 0.2114056056108645, "grad_norm": 4.139742374420166, "learning_rate": 7.887928546409808e-05, "loss": 1.1029, "num_input_tokens_seen": 48545808, "step": 3018 }, { "epoch": 0.21147565385659373, "grad_norm": 4.119020938873291, "learning_rate": 7.887228721541157e-05, "loss": 1.233, "num_input_tokens_seen": 48561584, "step": 3019 }, { "epoch": 0.21154570210232299, "grad_norm": 3.467578172683716, "learning_rate": 7.886528896672504e-05, "loss": 0.9757, "num_input_tokens_seen": 48577912, "step": 3020 }, { "epoch": 0.21161575034805222, "grad_norm": 4.891791820526123, "learning_rate": 7.885829071803853e-05, "loss": 1.0507, "num_input_tokens_seen": 48591792, "step": 3021 }, { "epoch": 0.21168579859378148, "grad_norm": 3.8184545040130615, "learning_rate": 7.885129246935201e-05, "loss": 0.9845, "num_input_tokens_seen": 48606656, "step": 3022 }, { "epoch": 0.2117558468395107, "grad_norm": 3.909607410430908, "learning_rate": 7.88442942206655e-05, "loss": 1.2735, "num_input_tokens_seen": 48622608, "step": 3023 }, { "epoch": 0.21182589508523997, "grad_norm": 3.780740737915039, "learning_rate": 7.883729597197899e-05, "loss": 0.9796, "num_input_tokens_seen": 48638992, "step": 3024 }, { "epoch": 0.2118959433309692, "grad_norm": 3.95491099357605, "learning_rate": 7.883029772329247e-05, "loss": 1.0265, "num_input_tokens_seen": 48654344, "step": 3025 }, { "epoch": 0.21196599157669846, "grad_norm": 3.724346876144409, "learning_rate": 7.882329947460596e-05, "loss": 0.9352, "num_input_tokens_seen": 48670728, "step": 3026 }, { "epoch": 0.2120360398224277, "grad_norm": 4.314544200897217, "learning_rate": 7.881630122591945e-05, "loss": 1.145, "num_input_tokens_seen": 48685424, "step": 3027 }, { "epoch": 0.21210608806815695, "grad_norm": 3.9340150356292725, "learning_rate": 7.880930297723293e-05, "loss": 1.0337, "num_input_tokens_seen": 48700416, "step": 3028 }, { "epoch": 0.21217613631388618, "grad_norm": 4.978084087371826, "learning_rate": 7.880230472854641e-05, "loss": 1.2418, "num_input_tokens_seen": 48716800, "step": 3029 }, { "epoch": 0.21224618455961544, "grad_norm": 3.7038094997406006, "learning_rate": 7.879530647985989e-05, "loss": 1.0618, "num_input_tokens_seen": 48732616, "step": 3030 }, { "epoch": 0.21231623280534467, "grad_norm": 5.743021011352539, "learning_rate": 7.878830823117339e-05, "loss": 1.033, "num_input_tokens_seen": 48748656, "step": 3031 }, { "epoch": 0.21238628105107393, "grad_norm": 5.655540943145752, "learning_rate": 7.878130998248687e-05, "loss": 1.3541, "num_input_tokens_seen": 48765040, "step": 3032 }, { "epoch": 0.21245632929680316, "grad_norm": 4.291803359985352, "learning_rate": 7.877431173380036e-05, "loss": 1.1966, "num_input_tokens_seen": 48781424, "step": 3033 }, { "epoch": 0.21252637754253242, "grad_norm": 5.103096961975098, "learning_rate": 7.876731348511384e-05, "loss": 1.0543, "num_input_tokens_seen": 48797808, "step": 3034 }, { "epoch": 0.21259642578826166, "grad_norm": 5.048161029815674, "learning_rate": 7.876031523642732e-05, "loss": 0.9595, "num_input_tokens_seen": 48814192, "step": 3035 }, { "epoch": 0.21266647403399092, "grad_norm": 4.086791038513184, "learning_rate": 7.875331698774081e-05, "loss": 1.1128, "num_input_tokens_seen": 48829816, "step": 3036 }, { "epoch": 0.21273652227972015, "grad_norm": 3.8422605991363525, "learning_rate": 7.87463187390543e-05, "loss": 1.175, "num_input_tokens_seen": 48846200, "step": 3037 }, { "epoch": 0.2128065705254494, "grad_norm": 3.7120776176452637, "learning_rate": 7.873932049036778e-05, "loss": 1.0748, "num_input_tokens_seen": 48862584, "step": 3038 }, { "epoch": 0.21287661877117864, "grad_norm": 5.051353454589844, "learning_rate": 7.873232224168126e-05, "loss": 1.0278, "num_input_tokens_seen": 48878368, "step": 3039 }, { "epoch": 0.2129466670169079, "grad_norm": 3.9874653816223145, "learning_rate": 7.872532399299475e-05, "loss": 1.256, "num_input_tokens_seen": 48894696, "step": 3040 }, { "epoch": 0.21301671526263713, "grad_norm": 4.455258369445801, "learning_rate": 7.871832574430824e-05, "loss": 1.2226, "num_input_tokens_seen": 48911080, "step": 3041 }, { "epoch": 0.2130867635083664, "grad_norm": 5.521103382110596, "learning_rate": 7.871132749562171e-05, "loss": 1.2116, "num_input_tokens_seen": 48927464, "step": 3042 }, { "epoch": 0.21315681175409562, "grad_norm": 3.80818510055542, "learning_rate": 7.87043292469352e-05, "loss": 1.2213, "num_input_tokens_seen": 48943848, "step": 3043 }, { "epoch": 0.21322685999982488, "grad_norm": 4.319914817810059, "learning_rate": 7.869733099824869e-05, "loss": 0.9786, "num_input_tokens_seen": 48960232, "step": 3044 }, { "epoch": 0.2132969082455541, "grad_norm": 4.196371078491211, "learning_rate": 7.869033274956218e-05, "loss": 0.9782, "num_input_tokens_seen": 48976616, "step": 3045 }, { "epoch": 0.21336695649128337, "grad_norm": 3.988114595413208, "learning_rate": 7.868333450087567e-05, "loss": 1.0923, "num_input_tokens_seen": 48992248, "step": 3046 }, { "epoch": 0.21343700473701263, "grad_norm": 3.887589454650879, "learning_rate": 7.867633625218914e-05, "loss": 1.068, "num_input_tokens_seen": 49008632, "step": 3047 }, { "epoch": 0.21350705298274186, "grad_norm": 3.7942206859588623, "learning_rate": 7.866933800350263e-05, "loss": 1.1917, "num_input_tokens_seen": 49024560, "step": 3048 }, { "epoch": 0.21357710122847112, "grad_norm": 4.464767932891846, "learning_rate": 7.86623397548161e-05, "loss": 0.9137, "num_input_tokens_seen": 49040200, "step": 3049 }, { "epoch": 0.21364714947420035, "grad_norm": 4.411591529846191, "learning_rate": 7.86553415061296e-05, "loss": 1.2315, "num_input_tokens_seen": 49056328, "step": 3050 }, { "epoch": 0.2137171977199296, "grad_norm": 4.895592212677002, "learning_rate": 7.86483432574431e-05, "loss": 1.0756, "num_input_tokens_seen": 49072696, "step": 3051 }, { "epoch": 0.21378724596565885, "grad_norm": 4.46630859375, "learning_rate": 7.864134500875657e-05, "loss": 0.837, "num_input_tokens_seen": 49087256, "step": 3052 }, { "epoch": 0.2138572942113881, "grad_norm": 4.975766658782959, "learning_rate": 7.863434676007006e-05, "loss": 1.0508, "num_input_tokens_seen": 49103640, "step": 3053 }, { "epoch": 0.21392734245711734, "grad_norm": 4.441544532775879, "learning_rate": 7.862734851138355e-05, "loss": 0.9917, "num_input_tokens_seen": 49119032, "step": 3054 }, { "epoch": 0.2139973907028466, "grad_norm": 3.797757148742676, "learning_rate": 7.862035026269702e-05, "loss": 0.8701, "num_input_tokens_seen": 49134960, "step": 3055 }, { "epoch": 0.21406743894857583, "grad_norm": 4.021834373474121, "learning_rate": 7.861335201401051e-05, "loss": 1.0355, "num_input_tokens_seen": 49151344, "step": 3056 }, { "epoch": 0.2141374871943051, "grad_norm": 3.772587537765503, "learning_rate": 7.8606353765324e-05, "loss": 0.9717, "num_input_tokens_seen": 49167424, "step": 3057 }, { "epoch": 0.21420753544003432, "grad_norm": 5.356356143951416, "learning_rate": 7.859935551663749e-05, "loss": 1.027, "num_input_tokens_seen": 49183504, "step": 3058 }, { "epoch": 0.21427758368576358, "grad_norm": 4.314568042755127, "learning_rate": 7.859235726795096e-05, "loss": 1.0233, "num_input_tokens_seen": 49199320, "step": 3059 }, { "epoch": 0.2143476319314928, "grad_norm": 3.777794122695923, "learning_rate": 7.858535901926445e-05, "loss": 1.1218, "num_input_tokens_seen": 49215032, "step": 3060 }, { "epoch": 0.21441768017722207, "grad_norm": 3.788496732711792, "learning_rate": 7.857836077057794e-05, "loss": 0.9121, "num_input_tokens_seen": 49230248, "step": 3061 }, { "epoch": 0.2144877284229513, "grad_norm": 3.776698589324951, "learning_rate": 7.857136252189142e-05, "loss": 1.0687, "num_input_tokens_seen": 49246264, "step": 3062 }, { "epoch": 0.21455777666868056, "grad_norm": 3.8229172229766846, "learning_rate": 7.85643642732049e-05, "loss": 0.9773, "num_input_tokens_seen": 49262648, "step": 3063 }, { "epoch": 0.2146278249144098, "grad_norm": 3.7620902061462402, "learning_rate": 7.85573660245184e-05, "loss": 1.0162, "num_input_tokens_seen": 49278640, "step": 3064 }, { "epoch": 0.21469787316013905, "grad_norm": 3.953148126602173, "learning_rate": 7.855036777583188e-05, "loss": 1.1277, "num_input_tokens_seen": 49295024, "step": 3065 }, { "epoch": 0.21476792140586828, "grad_norm": 4.1923441886901855, "learning_rate": 7.854336952714536e-05, "loss": 0.9317, "num_input_tokens_seen": 49311408, "step": 3066 }, { "epoch": 0.21483796965159754, "grad_norm": 4.922461986541748, "learning_rate": 7.853637127845885e-05, "loss": 1.2234, "num_input_tokens_seen": 49327120, "step": 3067 }, { "epoch": 0.21490801789732678, "grad_norm": 3.7414777278900146, "learning_rate": 7.852937302977233e-05, "loss": 0.8628, "num_input_tokens_seen": 49343504, "step": 3068 }, { "epoch": 0.21497806614305603, "grad_norm": 6.1490912437438965, "learning_rate": 7.852237478108581e-05, "loss": 0.9836, "num_input_tokens_seen": 49359336, "step": 3069 }, { "epoch": 0.21504811438878527, "grad_norm": 4.232786178588867, "learning_rate": 7.85153765323993e-05, "loss": 1.1071, "num_input_tokens_seen": 49374888, "step": 3070 }, { "epoch": 0.21511816263451453, "grad_norm": 4.170281887054443, "learning_rate": 7.85083782837128e-05, "loss": 1.1863, "num_input_tokens_seen": 49391272, "step": 3071 }, { "epoch": 0.21518821088024376, "grad_norm": 4.096348285675049, "learning_rate": 7.850138003502627e-05, "loss": 1.1574, "num_input_tokens_seen": 49407656, "step": 3072 }, { "epoch": 0.21525825912597302, "grad_norm": 4.523014068603516, "learning_rate": 7.849438178633976e-05, "loss": 0.9481, "num_input_tokens_seen": 49424040, "step": 3073 }, { "epoch": 0.21532830737170225, "grad_norm": 5.029306888580322, "learning_rate": 7.848738353765324e-05, "loss": 1.2744, "num_input_tokens_seen": 49440208, "step": 3074 }, { "epoch": 0.2153983556174315, "grad_norm": 3.5349771976470947, "learning_rate": 7.848038528896673e-05, "loss": 0.8675, "num_input_tokens_seen": 49456520, "step": 3075 }, { "epoch": 0.21546840386316074, "grad_norm": 3.544787645339966, "learning_rate": 7.84733870402802e-05, "loss": 1.0082, "num_input_tokens_seen": 49472904, "step": 3076 }, { "epoch": 0.21553845210889, "grad_norm": 4.602756500244141, "learning_rate": 7.84663887915937e-05, "loss": 1.0747, "num_input_tokens_seen": 49489264, "step": 3077 }, { "epoch": 0.21560850035461923, "grad_norm": 6.479659080505371, "learning_rate": 7.845939054290719e-05, "loss": 1.0437, "num_input_tokens_seen": 49505232, "step": 3078 }, { "epoch": 0.2156785486003485, "grad_norm": 4.584348201751709, "learning_rate": 7.845239229422067e-05, "loss": 1.1054, "num_input_tokens_seen": 49521616, "step": 3079 }, { "epoch": 0.21574859684607772, "grad_norm": 4.339470386505127, "learning_rate": 7.844539404553416e-05, "loss": 1.2386, "num_input_tokens_seen": 49537376, "step": 3080 }, { "epoch": 0.21581864509180698, "grad_norm": 4.098686218261719, "learning_rate": 7.843839579684765e-05, "loss": 0.9376, "num_input_tokens_seen": 49552256, "step": 3081 }, { "epoch": 0.21588869333753624, "grad_norm": 4.619485855102539, "learning_rate": 7.843139754816112e-05, "loss": 1.0066, "num_input_tokens_seen": 49568640, "step": 3082 }, { "epoch": 0.21595874158326547, "grad_norm": 4.018712997436523, "learning_rate": 7.842439929947461e-05, "loss": 1.0062, "num_input_tokens_seen": 49584816, "step": 3083 }, { "epoch": 0.21602878982899473, "grad_norm": 5.898901462554932, "learning_rate": 7.84174010507881e-05, "loss": 1.1956, "num_input_tokens_seen": 49600872, "step": 3084 }, { "epoch": 0.21609883807472396, "grad_norm": 4.794529438018799, "learning_rate": 7.841040280210159e-05, "loss": 1.0035, "num_input_tokens_seen": 49616840, "step": 3085 }, { "epoch": 0.21616888632045322, "grad_norm": 4.934964656829834, "learning_rate": 7.840340455341506e-05, "loss": 1.0033, "num_input_tokens_seen": 49633224, "step": 3086 }, { "epoch": 0.21623893456618246, "grad_norm": 3.6171560287475586, "learning_rate": 7.839640630472855e-05, "loss": 1.1165, "num_input_tokens_seen": 49649056, "step": 3087 }, { "epoch": 0.21630898281191172, "grad_norm": 4.032123565673828, "learning_rate": 7.838940805604204e-05, "loss": 1.2411, "num_input_tokens_seen": 49665440, "step": 3088 }, { "epoch": 0.21637903105764095, "grad_norm": 3.4669382572174072, "learning_rate": 7.838240980735551e-05, "loss": 0.9666, "num_input_tokens_seen": 49681824, "step": 3089 }, { "epoch": 0.2164490793033702, "grad_norm": 3.6899688243865967, "learning_rate": 7.8375411558669e-05, "loss": 0.9657, "num_input_tokens_seen": 49698208, "step": 3090 }, { "epoch": 0.21651912754909944, "grad_norm": 4.231171131134033, "learning_rate": 7.83684133099825e-05, "loss": 1.1459, "num_input_tokens_seen": 49713664, "step": 3091 }, { "epoch": 0.2165891757948287, "grad_norm": 4.792253017425537, "learning_rate": 7.836141506129598e-05, "loss": 0.9982, "num_input_tokens_seen": 49730048, "step": 3092 }, { "epoch": 0.21665922404055793, "grad_norm": 5.7171478271484375, "learning_rate": 7.835441681260945e-05, "loss": 1.189, "num_input_tokens_seen": 49746432, "step": 3093 }, { "epoch": 0.2167292722862872, "grad_norm": 4.393872261047363, "learning_rate": 7.834741856392294e-05, "loss": 0.9969, "num_input_tokens_seen": 49762816, "step": 3094 }, { "epoch": 0.21679932053201642, "grad_norm": 6.388276100158691, "learning_rate": 7.834042031523643e-05, "loss": 1.2192, "num_input_tokens_seen": 49778680, "step": 3095 }, { "epoch": 0.21686936877774568, "grad_norm": 3.8204843997955322, "learning_rate": 7.83334220665499e-05, "loss": 1.0601, "num_input_tokens_seen": 49794344, "step": 3096 }, { "epoch": 0.2169394170234749, "grad_norm": 10.573785781860352, "learning_rate": 7.832642381786341e-05, "loss": 0.9257, "num_input_tokens_seen": 49810208, "step": 3097 }, { "epoch": 0.21700946526920417, "grad_norm": 3.437734603881836, "learning_rate": 7.83194255691769e-05, "loss": 0.8757, "num_input_tokens_seen": 49826448, "step": 3098 }, { "epoch": 0.2170795135149334, "grad_norm": 3.476918935775757, "learning_rate": 7.831242732049037e-05, "loss": 0.908, "num_input_tokens_seen": 49842832, "step": 3099 }, { "epoch": 0.21714956176066266, "grad_norm": 4.037630558013916, "learning_rate": 7.830542907180386e-05, "loss": 1.1305, "num_input_tokens_seen": 49859216, "step": 3100 }, { "epoch": 0.2172196100063919, "grad_norm": 3.7424814701080322, "learning_rate": 7.829843082311734e-05, "loss": 1.1701, "num_input_tokens_seen": 49875528, "step": 3101 }, { "epoch": 0.21728965825212115, "grad_norm": 4.222198486328125, "learning_rate": 7.829143257443082e-05, "loss": 1.0539, "num_input_tokens_seen": 49891912, "step": 3102 }, { "epoch": 0.21735970649785039, "grad_norm": 4.064510822296143, "learning_rate": 7.828443432574431e-05, "loss": 1.0524, "num_input_tokens_seen": 49908064, "step": 3103 }, { "epoch": 0.21742975474357965, "grad_norm": 3.822498083114624, "learning_rate": 7.82774360770578e-05, "loss": 0.9085, "num_input_tokens_seen": 49923776, "step": 3104 }, { "epoch": 0.21749980298930888, "grad_norm": 4.368459224700928, "learning_rate": 7.827043782837129e-05, "loss": 0.9599, "num_input_tokens_seen": 49940104, "step": 3105 }, { "epoch": 0.21756985123503814, "grad_norm": 3.722587823867798, "learning_rate": 7.826343957968477e-05, "loss": 1.0286, "num_input_tokens_seen": 49955624, "step": 3106 }, { "epoch": 0.21763989948076737, "grad_norm": 4.277473449707031, "learning_rate": 7.825644133099825e-05, "loss": 1.1797, "num_input_tokens_seen": 49971784, "step": 3107 }, { "epoch": 0.21770994772649663, "grad_norm": 4.586781024932861, "learning_rate": 7.824944308231174e-05, "loss": 1.0395, "num_input_tokens_seen": 49988168, "step": 3108 }, { "epoch": 0.21777999597222586, "grad_norm": 4.456960201263428, "learning_rate": 7.824244483362522e-05, "loss": 0.9449, "num_input_tokens_seen": 50003392, "step": 3109 }, { "epoch": 0.21785004421795512, "grad_norm": 4.115220069885254, "learning_rate": 7.82354465849387e-05, "loss": 1.1393, "num_input_tokens_seen": 50019776, "step": 3110 }, { "epoch": 0.21792009246368435, "grad_norm": 3.5760059356689453, "learning_rate": 7.82284483362522e-05, "loss": 1.0421, "num_input_tokens_seen": 50036160, "step": 3111 }, { "epoch": 0.2179901407094136, "grad_norm": 5.757627487182617, "learning_rate": 7.822145008756568e-05, "loss": 1.1382, "num_input_tokens_seen": 50052544, "step": 3112 }, { "epoch": 0.21806018895514284, "grad_norm": 3.4349796772003174, "learning_rate": 7.821445183887916e-05, "loss": 0.8474, "num_input_tokens_seen": 50068872, "step": 3113 }, { "epoch": 0.2181302372008721, "grad_norm": 5.546512603759766, "learning_rate": 7.820745359019265e-05, "loss": 1.4564, "num_input_tokens_seen": 50085256, "step": 3114 }, { "epoch": 0.21820028544660133, "grad_norm": 3.4954123497009277, "learning_rate": 7.820045534150614e-05, "loss": 1.0479, "num_input_tokens_seen": 50101640, "step": 3115 }, { "epoch": 0.2182703336923306, "grad_norm": 5.396134376525879, "learning_rate": 7.819345709281961e-05, "loss": 1.0834, "num_input_tokens_seen": 50117040, "step": 3116 }, { "epoch": 0.21834038193805985, "grad_norm": 3.7895803451538086, "learning_rate": 7.818645884413311e-05, "loss": 0.9567, "num_input_tokens_seen": 50133424, "step": 3117 }, { "epoch": 0.21841043018378908, "grad_norm": 5.321155548095703, "learning_rate": 7.81794605954466e-05, "loss": 1.2246, "num_input_tokens_seen": 50148520, "step": 3118 }, { "epoch": 0.21848047842951834, "grad_norm": 4.993834495544434, "learning_rate": 7.817246234676008e-05, "loss": 1.0944, "num_input_tokens_seen": 50164904, "step": 3119 }, { "epoch": 0.21855052667524758, "grad_norm": 3.69236159324646, "learning_rate": 7.816546409807355e-05, "loss": 0.9759, "num_input_tokens_seen": 50181288, "step": 3120 }, { "epoch": 0.21862057492097683, "grad_norm": 3.689748764038086, "learning_rate": 7.815846584938704e-05, "loss": 1.0594, "num_input_tokens_seen": 50197672, "step": 3121 }, { "epoch": 0.21869062316670607, "grad_norm": 5.904501914978027, "learning_rate": 7.815146760070053e-05, "loss": 1.2348, "num_input_tokens_seen": 50214016, "step": 3122 }, { "epoch": 0.21876067141243533, "grad_norm": 4.018721103668213, "learning_rate": 7.814446935201402e-05, "loss": 1.2688, "num_input_tokens_seen": 50229984, "step": 3123 }, { "epoch": 0.21883071965816456, "grad_norm": 3.6527509689331055, "learning_rate": 7.81374711033275e-05, "loss": 0.8508, "num_input_tokens_seen": 50246368, "step": 3124 }, { "epoch": 0.21890076790389382, "grad_norm": 3.6600260734558105, "learning_rate": 7.8130472854641e-05, "loss": 1.0952, "num_input_tokens_seen": 50262208, "step": 3125 }, { "epoch": 0.21897081614962305, "grad_norm": 3.415969133377075, "learning_rate": 7.812347460595447e-05, "loss": 1.1055, "num_input_tokens_seen": 50278592, "step": 3126 }, { "epoch": 0.2190408643953523, "grad_norm": 5.027013301849365, "learning_rate": 7.811647635726796e-05, "loss": 1.2189, "num_input_tokens_seen": 50294976, "step": 3127 }, { "epoch": 0.21911091264108154, "grad_norm": 3.806324005126953, "learning_rate": 7.810947810858143e-05, "loss": 1.094, "num_input_tokens_seen": 50311360, "step": 3128 }, { "epoch": 0.2191809608868108, "grad_norm": 5.208338260650635, "learning_rate": 7.810247985989492e-05, "loss": 1.0917, "num_input_tokens_seen": 50327744, "step": 3129 }, { "epoch": 0.21925100913254003, "grad_norm": 3.5902316570281982, "learning_rate": 7.809548161120841e-05, "loss": 1.0894, "num_input_tokens_seen": 50343616, "step": 3130 }, { "epoch": 0.2193210573782693, "grad_norm": 3.7159717082977295, "learning_rate": 7.80884833625219e-05, "loss": 1.1168, "num_input_tokens_seen": 50360000, "step": 3131 }, { "epoch": 0.21939110562399852, "grad_norm": 4.03640079498291, "learning_rate": 7.808148511383539e-05, "loss": 0.8906, "num_input_tokens_seen": 50376384, "step": 3132 }, { "epoch": 0.21946115386972778, "grad_norm": 3.763805627822876, "learning_rate": 7.807448686514886e-05, "loss": 1.0922, "num_input_tokens_seen": 50392328, "step": 3133 }, { "epoch": 0.219531202115457, "grad_norm": 4.242026329040527, "learning_rate": 7.806748861646235e-05, "loss": 1.1286, "num_input_tokens_seen": 50408712, "step": 3134 }, { "epoch": 0.21960125036118627, "grad_norm": 3.5783863067626953, "learning_rate": 7.806049036777584e-05, "loss": 0.946, "num_input_tokens_seen": 50424816, "step": 3135 }, { "epoch": 0.2196712986069155, "grad_norm": 3.8409011363983154, "learning_rate": 7.805349211908931e-05, "loss": 1.0901, "num_input_tokens_seen": 50440464, "step": 3136 }, { "epoch": 0.21974134685264476, "grad_norm": 3.642411231994629, "learning_rate": 7.804649387040282e-05, "loss": 1.097, "num_input_tokens_seen": 50456552, "step": 3137 }, { "epoch": 0.219811395098374, "grad_norm": 3.702481985092163, "learning_rate": 7.803949562171629e-05, "loss": 1.0843, "num_input_tokens_seen": 50472936, "step": 3138 }, { "epoch": 0.21988144334410326, "grad_norm": 3.776094913482666, "learning_rate": 7.803249737302978e-05, "loss": 1.0058, "num_input_tokens_seen": 50488760, "step": 3139 }, { "epoch": 0.2199514915898325, "grad_norm": 4.29668664932251, "learning_rate": 7.802549912434326e-05, "loss": 1.3095, "num_input_tokens_seen": 50505144, "step": 3140 }, { "epoch": 0.22002153983556175, "grad_norm": 3.8290088176727295, "learning_rate": 7.801850087565674e-05, "loss": 0.9331, "num_input_tokens_seen": 50521520, "step": 3141 }, { "epoch": 0.22009158808129098, "grad_norm": 3.9471163749694824, "learning_rate": 7.801150262697023e-05, "loss": 1.1064, "num_input_tokens_seen": 50537688, "step": 3142 }, { "epoch": 0.22016163632702024, "grad_norm": 4.3500657081604, "learning_rate": 7.800450437828372e-05, "loss": 1.0843, "num_input_tokens_seen": 50554072, "step": 3143 }, { "epoch": 0.22023168457274947, "grad_norm": 4.257317066192627, "learning_rate": 7.799750612959721e-05, "loss": 1.2822, "num_input_tokens_seen": 50570456, "step": 3144 }, { "epoch": 0.22030173281847873, "grad_norm": 3.881340265274048, "learning_rate": 7.79905078809107e-05, "loss": 1.2797, "num_input_tokens_seen": 50586840, "step": 3145 }, { "epoch": 0.22037178106420796, "grad_norm": 4.07082986831665, "learning_rate": 7.798350963222417e-05, "loss": 1.1659, "num_input_tokens_seen": 50603224, "step": 3146 }, { "epoch": 0.22044182930993722, "grad_norm": 3.740081310272217, "learning_rate": 7.797651138353765e-05, "loss": 1.1638, "num_input_tokens_seen": 50619608, "step": 3147 }, { "epoch": 0.22051187755566645, "grad_norm": 3.9368820190429688, "learning_rate": 7.796951313485114e-05, "loss": 1.2122, "num_input_tokens_seen": 50635240, "step": 3148 }, { "epoch": 0.2205819258013957, "grad_norm": 4.027481555938721, "learning_rate": 7.796251488616463e-05, "loss": 1.1479, "num_input_tokens_seen": 50651144, "step": 3149 }, { "epoch": 0.22065197404712494, "grad_norm": 3.53271222114563, "learning_rate": 7.795551663747811e-05, "loss": 0.8352, "num_input_tokens_seen": 50667320, "step": 3150 }, { "epoch": 0.2207220222928542, "grad_norm": 3.9494400024414062, "learning_rate": 7.79485183887916e-05, "loss": 1.1353, "num_input_tokens_seen": 50683704, "step": 3151 }, { "epoch": 0.22079207053858346, "grad_norm": 4.944929122924805, "learning_rate": 7.794152014010509e-05, "loss": 1.0833, "num_input_tokens_seen": 50699544, "step": 3152 }, { "epoch": 0.2208621187843127, "grad_norm": 4.625114440917969, "learning_rate": 7.793452189141857e-05, "loss": 1.0123, "num_input_tokens_seen": 50715096, "step": 3153 }, { "epoch": 0.22093216703004195, "grad_norm": 4.543829441070557, "learning_rate": 7.792752364273205e-05, "loss": 0.9669, "num_input_tokens_seen": 50731480, "step": 3154 }, { "epoch": 0.22100221527577119, "grad_norm": 4.038646221160889, "learning_rate": 7.792052539404553e-05, "loss": 1.1399, "num_input_tokens_seen": 50747864, "step": 3155 }, { "epoch": 0.22107226352150045, "grad_norm": 5.269920825958252, "learning_rate": 7.791352714535902e-05, "loss": 1.1412, "num_input_tokens_seen": 50764248, "step": 3156 }, { "epoch": 0.22114231176722968, "grad_norm": 3.661792278289795, "learning_rate": 7.790652889667251e-05, "loss": 0.8881, "num_input_tokens_seen": 50780296, "step": 3157 }, { "epoch": 0.22121236001295894, "grad_norm": 5.119567394256592, "learning_rate": 7.7899530647986e-05, "loss": 1.2316, "num_input_tokens_seen": 50796680, "step": 3158 }, { "epoch": 0.22128240825868817, "grad_norm": 4.011631965637207, "learning_rate": 7.789253239929948e-05, "loss": 1.0854, "num_input_tokens_seen": 50812648, "step": 3159 }, { "epoch": 0.22135245650441743, "grad_norm": 4.292233467102051, "learning_rate": 7.788553415061296e-05, "loss": 0.8441, "num_input_tokens_seen": 50829032, "step": 3160 }, { "epoch": 0.22142250475014666, "grad_norm": 3.9228122234344482, "learning_rate": 7.787853590192645e-05, "loss": 1.0963, "num_input_tokens_seen": 50844776, "step": 3161 }, { "epoch": 0.22149255299587592, "grad_norm": 4.396078109741211, "learning_rate": 7.787153765323994e-05, "loss": 1.2647, "num_input_tokens_seen": 50860792, "step": 3162 }, { "epoch": 0.22156260124160515, "grad_norm": 3.6809213161468506, "learning_rate": 7.786453940455342e-05, "loss": 1.0172, "num_input_tokens_seen": 50877176, "step": 3163 }, { "epoch": 0.2216326494873344, "grad_norm": 3.7879207134246826, "learning_rate": 7.785754115586691e-05, "loss": 0.9708, "num_input_tokens_seen": 50893560, "step": 3164 }, { "epoch": 0.22170269773306364, "grad_norm": 5.248175621032715, "learning_rate": 7.785054290718039e-05, "loss": 0.9575, "num_input_tokens_seen": 50909944, "step": 3165 }, { "epoch": 0.2217727459787929, "grad_norm": 5.437406539916992, "learning_rate": 7.784354465849388e-05, "loss": 1.1873, "num_input_tokens_seen": 50925848, "step": 3166 }, { "epoch": 0.22184279422452213, "grad_norm": 4.390413761138916, "learning_rate": 7.783654640980735e-05, "loss": 1.143, "num_input_tokens_seen": 50941488, "step": 3167 }, { "epoch": 0.2219128424702514, "grad_norm": 3.6923863887786865, "learning_rate": 7.782954816112084e-05, "loss": 1.012, "num_input_tokens_seen": 50956984, "step": 3168 }, { "epoch": 0.22198289071598062, "grad_norm": 4.338325023651123, "learning_rate": 7.782254991243433e-05, "loss": 1.0984, "num_input_tokens_seen": 50973096, "step": 3169 }, { "epoch": 0.22205293896170988, "grad_norm": 5.631222248077393, "learning_rate": 7.781555166374782e-05, "loss": 1.0325, "num_input_tokens_seen": 50989480, "step": 3170 }, { "epoch": 0.22212298720743912, "grad_norm": 3.852337598800659, "learning_rate": 7.78085534150613e-05, "loss": 1.0279, "num_input_tokens_seen": 51005864, "step": 3171 }, { "epoch": 0.22219303545316837, "grad_norm": 3.6684298515319824, "learning_rate": 7.78015551663748e-05, "loss": 0.9906, "num_input_tokens_seen": 51022248, "step": 3172 }, { "epoch": 0.2222630836988976, "grad_norm": 3.7521257400512695, "learning_rate": 7.779455691768827e-05, "loss": 0.9641, "num_input_tokens_seen": 51038632, "step": 3173 }, { "epoch": 0.22233313194462687, "grad_norm": 4.571293354034424, "learning_rate": 7.778755866900175e-05, "loss": 1.1655, "num_input_tokens_seen": 51055016, "step": 3174 }, { "epoch": 0.2224031801903561, "grad_norm": 3.921743154525757, "learning_rate": 7.778056042031523e-05, "loss": 1.0815, "num_input_tokens_seen": 51071288, "step": 3175 }, { "epoch": 0.22247322843608536, "grad_norm": 6.1666083335876465, "learning_rate": 7.777356217162872e-05, "loss": 1.1025, "num_input_tokens_seen": 51086712, "step": 3176 }, { "epoch": 0.2225432766818146, "grad_norm": 4.170863151550293, "learning_rate": 7.776656392294221e-05, "loss": 1.0547, "num_input_tokens_seen": 51102904, "step": 3177 }, { "epoch": 0.22261332492754385, "grad_norm": 4.218405246734619, "learning_rate": 7.77595656742557e-05, "loss": 1.0685, "num_input_tokens_seen": 51119288, "step": 3178 }, { "epoch": 0.22268337317327308, "grad_norm": 4.158823490142822, "learning_rate": 7.775256742556919e-05, "loss": 1.0053, "num_input_tokens_seen": 51135672, "step": 3179 }, { "epoch": 0.22275342141900234, "grad_norm": 3.900827407836914, "learning_rate": 7.774556917688266e-05, "loss": 0.9212, "num_input_tokens_seen": 51151880, "step": 3180 }, { "epoch": 0.22282346966473157, "grad_norm": 3.6363813877105713, "learning_rate": 7.773857092819615e-05, "loss": 1.0602, "num_input_tokens_seen": 51167712, "step": 3181 }, { "epoch": 0.22289351791046083, "grad_norm": 6.452186584472656, "learning_rate": 7.773157267950963e-05, "loss": 1.3543, "num_input_tokens_seen": 51184096, "step": 3182 }, { "epoch": 0.22296356615619006, "grad_norm": 4.324470043182373, "learning_rate": 7.772457443082313e-05, "loss": 1.3328, "num_input_tokens_seen": 51200480, "step": 3183 }, { "epoch": 0.22303361440191932, "grad_norm": 4.093019485473633, "learning_rate": 7.77175761821366e-05, "loss": 1.2647, "num_input_tokens_seen": 51216864, "step": 3184 }, { "epoch": 0.22310366264764858, "grad_norm": 3.923771619796753, "learning_rate": 7.771057793345009e-05, "loss": 1.0121, "num_input_tokens_seen": 51233248, "step": 3185 }, { "epoch": 0.2231737108933778, "grad_norm": 3.3340275287628174, "learning_rate": 7.770357968476358e-05, "loss": 0.8954, "num_input_tokens_seen": 51249400, "step": 3186 }, { "epoch": 0.22324375913910707, "grad_norm": 5.360925197601318, "learning_rate": 7.769658143607706e-05, "loss": 1.0391, "num_input_tokens_seen": 51264920, "step": 3187 }, { "epoch": 0.2233138073848363, "grad_norm": 4.377450466156006, "learning_rate": 7.768958318739054e-05, "loss": 1.2148, "num_input_tokens_seen": 51280528, "step": 3188 }, { "epoch": 0.22338385563056556, "grad_norm": 4.01370906829834, "learning_rate": 7.768258493870403e-05, "loss": 1.0084, "num_input_tokens_seen": 51296912, "step": 3189 }, { "epoch": 0.2234539038762948, "grad_norm": 5.112427711486816, "learning_rate": 7.767558669001752e-05, "loss": 1.0388, "num_input_tokens_seen": 51313296, "step": 3190 }, { "epoch": 0.22352395212202406, "grad_norm": 3.5889225006103516, "learning_rate": 7.766858844133101e-05, "loss": 1.0018, "num_input_tokens_seen": 51329680, "step": 3191 }, { "epoch": 0.2235940003677533, "grad_norm": 3.6924920082092285, "learning_rate": 7.766159019264449e-05, "loss": 1.1056, "num_input_tokens_seen": 51346064, "step": 3192 }, { "epoch": 0.22366404861348255, "grad_norm": 3.9349400997161865, "learning_rate": 7.765459194395797e-05, "loss": 0.9785, "num_input_tokens_seen": 51361200, "step": 3193 }, { "epoch": 0.22373409685921178, "grad_norm": 3.6980738639831543, "learning_rate": 7.764759369527145e-05, "loss": 0.9112, "num_input_tokens_seen": 51377584, "step": 3194 }, { "epoch": 0.22380414510494104, "grad_norm": 4.400575637817383, "learning_rate": 7.764059544658494e-05, "loss": 1.2927, "num_input_tokens_seen": 51393968, "step": 3195 }, { "epoch": 0.22387419335067027, "grad_norm": 3.758664846420288, "learning_rate": 7.763359719789843e-05, "loss": 0.8743, "num_input_tokens_seen": 51410160, "step": 3196 }, { "epoch": 0.22394424159639953, "grad_norm": 4.376255512237549, "learning_rate": 7.762659894921192e-05, "loss": 1.1239, "num_input_tokens_seen": 51426192, "step": 3197 }, { "epoch": 0.22401428984212876, "grad_norm": 4.371212959289551, "learning_rate": 7.76196007005254e-05, "loss": 1.4918, "num_input_tokens_seen": 51442576, "step": 3198 }, { "epoch": 0.22408433808785802, "grad_norm": 3.5152950286865234, "learning_rate": 7.761260245183889e-05, "loss": 1.0344, "num_input_tokens_seen": 51458648, "step": 3199 }, { "epoch": 0.22415438633358725, "grad_norm": 4.100535869598389, "learning_rate": 7.760560420315237e-05, "loss": 0.9969, "num_input_tokens_seen": 51475032, "step": 3200 }, { "epoch": 0.22415438633358725, "eval_loss": 1.1358542442321777, "eval_runtime": 0.2073, "eval_samples_per_second": 4.825, "eval_steps_per_second": 4.825, "num_input_tokens_seen": 51475032, "step": 3200 }, { "epoch": 0.2242244345793165, "grad_norm": 4.394073486328125, "learning_rate": 7.759860595446584e-05, "loss": 1.0951, "num_input_tokens_seen": 51490544, "step": 3201 }, { "epoch": 0.22429448282504574, "grad_norm": 4.041582107543945, "learning_rate": 7.759160770577933e-05, "loss": 1.1615, "num_input_tokens_seen": 51506928, "step": 3202 }, { "epoch": 0.224364531070775, "grad_norm": 4.268798351287842, "learning_rate": 7.758460945709282e-05, "loss": 1.0975, "num_input_tokens_seen": 51523232, "step": 3203 }, { "epoch": 0.22443457931650423, "grad_norm": 4.080141067504883, "learning_rate": 7.757761120840631e-05, "loss": 0.9809, "num_input_tokens_seen": 51539616, "step": 3204 }, { "epoch": 0.2245046275622335, "grad_norm": 7.690321445465088, "learning_rate": 7.75706129597198e-05, "loss": 1.1217, "num_input_tokens_seen": 51556000, "step": 3205 }, { "epoch": 0.22457467580796273, "grad_norm": 4.161118507385254, "learning_rate": 7.756361471103329e-05, "loss": 0.9672, "num_input_tokens_seen": 51572384, "step": 3206 }, { "epoch": 0.22464472405369199, "grad_norm": 3.922683000564575, "learning_rate": 7.755661646234676e-05, "loss": 1.0665, "num_input_tokens_seen": 51588768, "step": 3207 }, { "epoch": 0.22471477229942122, "grad_norm": 3.7474617958068848, "learning_rate": 7.754961821366025e-05, "loss": 1.1283, "num_input_tokens_seen": 51604792, "step": 3208 }, { "epoch": 0.22478482054515048, "grad_norm": 3.856959819793701, "learning_rate": 7.754261996497374e-05, "loss": 0.963, "num_input_tokens_seen": 51621176, "step": 3209 }, { "epoch": 0.2248548687908797, "grad_norm": 4.130929470062256, "learning_rate": 7.753562171628723e-05, "loss": 1.0563, "num_input_tokens_seen": 51636864, "step": 3210 }, { "epoch": 0.22492491703660897, "grad_norm": 3.5023388862609863, "learning_rate": 7.75286234676007e-05, "loss": 0.8926, "num_input_tokens_seen": 51653248, "step": 3211 }, { "epoch": 0.2249949652823382, "grad_norm": 3.736415386199951, "learning_rate": 7.752162521891419e-05, "loss": 1.08, "num_input_tokens_seen": 51669632, "step": 3212 }, { "epoch": 0.22506501352806746, "grad_norm": 4.355846881866455, "learning_rate": 7.751462697022768e-05, "loss": 1.0265, "num_input_tokens_seen": 51684632, "step": 3213 }, { "epoch": 0.2251350617737967, "grad_norm": 4.165436744689941, "learning_rate": 7.750762872154115e-05, "loss": 1.1594, "num_input_tokens_seen": 51701016, "step": 3214 }, { "epoch": 0.22520511001952595, "grad_norm": 4.4387946128845215, "learning_rate": 7.750063047285464e-05, "loss": 0.911, "num_input_tokens_seen": 51716176, "step": 3215 }, { "epoch": 0.22527515826525518, "grad_norm": 4.749145030975342, "learning_rate": 7.749363222416813e-05, "loss": 0.952, "num_input_tokens_seen": 51732560, "step": 3216 }, { "epoch": 0.22534520651098444, "grad_norm": 4.321863651275635, "learning_rate": 7.748663397548162e-05, "loss": 1.0974, "num_input_tokens_seen": 51748944, "step": 3217 }, { "epoch": 0.22541525475671367, "grad_norm": 5.319899082183838, "learning_rate": 7.747963572679511e-05, "loss": 0.9506, "num_input_tokens_seen": 51765328, "step": 3218 }, { "epoch": 0.22548530300244293, "grad_norm": 3.5695643424987793, "learning_rate": 7.747263747810858e-05, "loss": 1.1482, "num_input_tokens_seen": 51781712, "step": 3219 }, { "epoch": 0.2255553512481722, "grad_norm": 3.725698947906494, "learning_rate": 7.746563922942207e-05, "loss": 0.9205, "num_input_tokens_seen": 51798096, "step": 3220 }, { "epoch": 0.22562539949390142, "grad_norm": 3.795003652572632, "learning_rate": 7.745864098073555e-05, "loss": 1.0314, "num_input_tokens_seen": 51814480, "step": 3221 }, { "epoch": 0.22569544773963068, "grad_norm": 3.817578077316284, "learning_rate": 7.745164273204903e-05, "loss": 1.1218, "num_input_tokens_seen": 51830864, "step": 3222 }, { "epoch": 0.22576549598535992, "grad_norm": 5.982937812805176, "learning_rate": 7.744464448336252e-05, "loss": 0.9544, "num_input_tokens_seen": 51846104, "step": 3223 }, { "epoch": 0.22583554423108917, "grad_norm": 5.063079833984375, "learning_rate": 7.743764623467601e-05, "loss": 0.9191, "num_input_tokens_seen": 51862488, "step": 3224 }, { "epoch": 0.2259055924768184, "grad_norm": 3.620837450027466, "learning_rate": 7.74306479859895e-05, "loss": 1.0484, "num_input_tokens_seen": 51878784, "step": 3225 }, { "epoch": 0.22597564072254767, "grad_norm": 3.578369617462158, "learning_rate": 7.742364973730299e-05, "loss": 1.0146, "num_input_tokens_seen": 51894832, "step": 3226 }, { "epoch": 0.2260456889682769, "grad_norm": 4.0356974601745605, "learning_rate": 7.741665148861646e-05, "loss": 1.0664, "num_input_tokens_seen": 51911216, "step": 3227 }, { "epoch": 0.22611573721400616, "grad_norm": 4.133927822113037, "learning_rate": 7.740965323992994e-05, "loss": 1.1579, "num_input_tokens_seen": 51927600, "step": 3228 }, { "epoch": 0.2261857854597354, "grad_norm": 4.2958879470825195, "learning_rate": 7.740265499124343e-05, "loss": 1.0519, "num_input_tokens_seen": 51943688, "step": 3229 }, { "epoch": 0.22625583370546465, "grad_norm": 6.211035251617432, "learning_rate": 7.739565674255693e-05, "loss": 1.0097, "num_input_tokens_seen": 51960072, "step": 3230 }, { "epoch": 0.22632588195119388, "grad_norm": 4.073126316070557, "learning_rate": 7.73886584938704e-05, "loss": 1.0226, "num_input_tokens_seen": 51976456, "step": 3231 }, { "epoch": 0.22639593019692314, "grad_norm": 3.605041980743408, "learning_rate": 7.73816602451839e-05, "loss": 0.817, "num_input_tokens_seen": 51992840, "step": 3232 }, { "epoch": 0.22646597844265237, "grad_norm": 4.341184139251709, "learning_rate": 7.737466199649738e-05, "loss": 1.1391, "num_input_tokens_seen": 52008696, "step": 3233 }, { "epoch": 0.22653602668838163, "grad_norm": 4.676966667175293, "learning_rate": 7.736766374781086e-05, "loss": 1.0163, "num_input_tokens_seen": 52024944, "step": 3234 }, { "epoch": 0.22660607493411086, "grad_norm": 4.6688032150268555, "learning_rate": 7.736066549912435e-05, "loss": 0.972, "num_input_tokens_seen": 52041104, "step": 3235 }, { "epoch": 0.22667612317984012, "grad_norm": 4.6416916847229, "learning_rate": 7.735366725043783e-05, "loss": 1.1197, "num_input_tokens_seen": 52055864, "step": 3236 }, { "epoch": 0.22674617142556935, "grad_norm": 3.713846206665039, "learning_rate": 7.734666900175132e-05, "loss": 1.0498, "num_input_tokens_seen": 52071992, "step": 3237 }, { "epoch": 0.2268162196712986, "grad_norm": 3.694094657897949, "learning_rate": 7.73396707530648e-05, "loss": 1.083, "num_input_tokens_seen": 52088376, "step": 3238 }, { "epoch": 0.22688626791702785, "grad_norm": 4.250162601470947, "learning_rate": 7.733267250437829e-05, "loss": 0.9421, "num_input_tokens_seen": 52104320, "step": 3239 }, { "epoch": 0.2269563161627571, "grad_norm": 3.8184008598327637, "learning_rate": 7.732567425569178e-05, "loss": 1.0033, "num_input_tokens_seen": 52120416, "step": 3240 }, { "epoch": 0.22702636440848634, "grad_norm": 3.9957122802734375, "learning_rate": 7.731867600700525e-05, "loss": 0.9594, "num_input_tokens_seen": 52136704, "step": 3241 }, { "epoch": 0.2270964126542156, "grad_norm": 4.153292655944824, "learning_rate": 7.731167775831874e-05, "loss": 1.2315, "num_input_tokens_seen": 52153088, "step": 3242 }, { "epoch": 0.22716646089994483, "grad_norm": 3.628377914428711, "learning_rate": 7.730467950963223e-05, "loss": 0.9826, "num_input_tokens_seen": 52169032, "step": 3243 }, { "epoch": 0.2272365091456741, "grad_norm": 3.45796275138855, "learning_rate": 7.729768126094572e-05, "loss": 1.0942, "num_input_tokens_seen": 52185416, "step": 3244 }, { "epoch": 0.22730655739140332, "grad_norm": 3.9128968715667725, "learning_rate": 7.72906830122592e-05, "loss": 1.2954, "num_input_tokens_seen": 52201504, "step": 3245 }, { "epoch": 0.22737660563713258, "grad_norm": 4.4097394943237305, "learning_rate": 7.728368476357268e-05, "loss": 1.0171, "num_input_tokens_seen": 52217184, "step": 3246 }, { "epoch": 0.2274466538828618, "grad_norm": 4.110626220703125, "learning_rate": 7.727668651488617e-05, "loss": 1.0412, "num_input_tokens_seen": 52233432, "step": 3247 }, { "epoch": 0.22751670212859107, "grad_norm": 4.161354064941406, "learning_rate": 7.726968826619964e-05, "loss": 0.9371, "num_input_tokens_seen": 52249816, "step": 3248 }, { "epoch": 0.2275867503743203, "grad_norm": 5.910977363586426, "learning_rate": 7.726269001751313e-05, "loss": 0.8993, "num_input_tokens_seen": 52266200, "step": 3249 }, { "epoch": 0.22765679862004956, "grad_norm": 3.8264660835266113, "learning_rate": 7.725569176882663e-05, "loss": 1.0927, "num_input_tokens_seen": 52282136, "step": 3250 }, { "epoch": 0.2277268468657788, "grad_norm": 3.9992623329162598, "learning_rate": 7.724869352014011e-05, "loss": 0.9256, "num_input_tokens_seen": 52297368, "step": 3251 }, { "epoch": 0.22779689511150805, "grad_norm": 4.263967990875244, "learning_rate": 7.72416952714536e-05, "loss": 1.1708, "num_input_tokens_seen": 52313200, "step": 3252 }, { "epoch": 0.22786694335723728, "grad_norm": 3.8846871852874756, "learning_rate": 7.723469702276709e-05, "loss": 1.1445, "num_input_tokens_seen": 52329584, "step": 3253 }, { "epoch": 0.22793699160296654, "grad_norm": 4.3504533767700195, "learning_rate": 7.722769877408056e-05, "loss": 1.0332, "num_input_tokens_seen": 52345968, "step": 3254 }, { "epoch": 0.2280070398486958, "grad_norm": 3.9775991439819336, "learning_rate": 7.722070052539404e-05, "loss": 1.2149, "num_input_tokens_seen": 52362352, "step": 3255 }, { "epoch": 0.22807708809442503, "grad_norm": 4.098363399505615, "learning_rate": 7.721370227670754e-05, "loss": 1.1278, "num_input_tokens_seen": 52378736, "step": 3256 }, { "epoch": 0.2281471363401543, "grad_norm": 3.7094836235046387, "learning_rate": 7.720670402802103e-05, "loss": 1.0221, "num_input_tokens_seen": 52394896, "step": 3257 }, { "epoch": 0.22821718458588353, "grad_norm": 4.042232036590576, "learning_rate": 7.71997057793345e-05, "loss": 1.2902, "num_input_tokens_seen": 52410952, "step": 3258 }, { "epoch": 0.22828723283161279, "grad_norm": 3.725853443145752, "learning_rate": 7.719270753064799e-05, "loss": 1.0135, "num_input_tokens_seen": 52427200, "step": 3259 }, { "epoch": 0.22835728107734202, "grad_norm": 5.186229705810547, "learning_rate": 7.718570928196148e-05, "loss": 1.0539, "num_input_tokens_seen": 52443584, "step": 3260 }, { "epoch": 0.22842732932307128, "grad_norm": 3.8725364208221436, "learning_rate": 7.717871103327495e-05, "loss": 1.0782, "num_input_tokens_seen": 52458272, "step": 3261 }, { "epoch": 0.2284973775688005, "grad_norm": 5.006584644317627, "learning_rate": 7.717171278458844e-05, "loss": 1.0313, "num_input_tokens_seen": 52474456, "step": 3262 }, { "epoch": 0.22856742581452977, "grad_norm": 5.102536201477051, "learning_rate": 7.716471453590193e-05, "loss": 1.2077, "num_input_tokens_seen": 52490464, "step": 3263 }, { "epoch": 0.228637474060259, "grad_norm": 3.741029977798462, "learning_rate": 7.715771628721542e-05, "loss": 0.8978, "num_input_tokens_seen": 52506112, "step": 3264 }, { "epoch": 0.22870752230598826, "grad_norm": 5.656842231750488, "learning_rate": 7.71507180385289e-05, "loss": 1.1569, "num_input_tokens_seen": 52522496, "step": 3265 }, { "epoch": 0.2287775705517175, "grad_norm": 3.882403612136841, "learning_rate": 7.714371978984238e-05, "loss": 1.163, "num_input_tokens_seen": 52538240, "step": 3266 }, { "epoch": 0.22884761879744675, "grad_norm": 4.812796592712402, "learning_rate": 7.713672154115587e-05, "loss": 1.0478, "num_input_tokens_seen": 52554024, "step": 3267 }, { "epoch": 0.22891766704317598, "grad_norm": 3.9040687084198, "learning_rate": 7.712972329246935e-05, "loss": 1.0123, "num_input_tokens_seen": 52570408, "step": 3268 }, { "epoch": 0.22898771528890524, "grad_norm": 3.8387644290924072, "learning_rate": 7.712272504378284e-05, "loss": 0.9401, "num_input_tokens_seen": 52586512, "step": 3269 }, { "epoch": 0.22905776353463447, "grad_norm": 4.602542877197266, "learning_rate": 7.711572679509634e-05, "loss": 1.0196, "num_input_tokens_seen": 52602896, "step": 3270 }, { "epoch": 0.22912781178036373, "grad_norm": 4.209007263183594, "learning_rate": 7.710872854640981e-05, "loss": 1.1401, "num_input_tokens_seen": 52619080, "step": 3271 }, { "epoch": 0.22919786002609296, "grad_norm": 3.5082032680511475, "learning_rate": 7.71017302977233e-05, "loss": 0.9979, "num_input_tokens_seen": 52635464, "step": 3272 }, { "epoch": 0.22926790827182222, "grad_norm": 4.123980522155762, "learning_rate": 7.709473204903678e-05, "loss": 1.0201, "num_input_tokens_seen": 52651848, "step": 3273 }, { "epoch": 0.22933795651755146, "grad_norm": 4.267751216888428, "learning_rate": 7.708773380035027e-05, "loss": 1.1338, "num_input_tokens_seen": 52668232, "step": 3274 }, { "epoch": 0.22940800476328072, "grad_norm": 4.1165666580200195, "learning_rate": 7.708073555166374e-05, "loss": 1.1146, "num_input_tokens_seen": 52684616, "step": 3275 }, { "epoch": 0.22947805300900995, "grad_norm": 4.810427665710449, "learning_rate": 7.707373730297724e-05, "loss": 1.1785, "num_input_tokens_seen": 52701000, "step": 3276 }, { "epoch": 0.2295481012547392, "grad_norm": 6.566617488861084, "learning_rate": 7.706673905429073e-05, "loss": 0.8192, "num_input_tokens_seen": 52715920, "step": 3277 }, { "epoch": 0.22961814950046844, "grad_norm": 4.456092834472656, "learning_rate": 7.70597408056042e-05, "loss": 0.992, "num_input_tokens_seen": 52732304, "step": 3278 }, { "epoch": 0.2296881977461977, "grad_norm": 4.063642501831055, "learning_rate": 7.70527425569177e-05, "loss": 0.9306, "num_input_tokens_seen": 52748688, "step": 3279 }, { "epoch": 0.22975824599192693, "grad_norm": 3.337742567062378, "learning_rate": 7.704574430823118e-05, "loss": 0.8497, "num_input_tokens_seen": 52764800, "step": 3280 }, { "epoch": 0.2298282942376562, "grad_norm": 4.36488151550293, "learning_rate": 7.703874605954466e-05, "loss": 1.0851, "num_input_tokens_seen": 52780952, "step": 3281 }, { "epoch": 0.22989834248338542, "grad_norm": 4.948200702667236, "learning_rate": 7.703174781085815e-05, "loss": 0.9591, "num_input_tokens_seen": 52795728, "step": 3282 }, { "epoch": 0.22996839072911468, "grad_norm": 4.977625370025635, "learning_rate": 7.702474956217164e-05, "loss": 1.2094, "num_input_tokens_seen": 52812112, "step": 3283 }, { "epoch": 0.2300384389748439, "grad_norm": 3.7551944255828857, "learning_rate": 7.701775131348512e-05, "loss": 1.1018, "num_input_tokens_seen": 52828184, "step": 3284 }, { "epoch": 0.23010848722057317, "grad_norm": 3.700916051864624, "learning_rate": 7.70107530647986e-05, "loss": 1.0159, "num_input_tokens_seen": 52844568, "step": 3285 }, { "epoch": 0.2301785354663024, "grad_norm": 4.135788917541504, "learning_rate": 7.700375481611209e-05, "loss": 1.047, "num_input_tokens_seen": 52860952, "step": 3286 }, { "epoch": 0.23024858371203166, "grad_norm": 4.018477916717529, "learning_rate": 7.699675656742558e-05, "loss": 1.1124, "num_input_tokens_seen": 52876808, "step": 3287 }, { "epoch": 0.2303186319577609, "grad_norm": 5.230745315551758, "learning_rate": 7.698975831873905e-05, "loss": 1.0805, "num_input_tokens_seen": 52893192, "step": 3288 }, { "epoch": 0.23038868020349015, "grad_norm": 4.192041873931885, "learning_rate": 7.698276007005254e-05, "loss": 1.1476, "num_input_tokens_seen": 52909576, "step": 3289 }, { "epoch": 0.2304587284492194, "grad_norm": 4.28109073638916, "learning_rate": 7.697576182136603e-05, "loss": 0.9795, "num_input_tokens_seen": 52925592, "step": 3290 }, { "epoch": 0.23052877669494864, "grad_norm": 4.673538684844971, "learning_rate": 7.696876357267952e-05, "loss": 1.2104, "num_input_tokens_seen": 52941784, "step": 3291 }, { "epoch": 0.2305988249406779, "grad_norm": 3.791339159011841, "learning_rate": 7.696176532399299e-05, "loss": 1.0098, "num_input_tokens_seen": 52958168, "step": 3292 }, { "epoch": 0.23066887318640714, "grad_norm": 5.353015899658203, "learning_rate": 7.695476707530648e-05, "loss": 1.346, "num_input_tokens_seen": 52974552, "step": 3293 }, { "epoch": 0.2307389214321364, "grad_norm": 6.66793966293335, "learning_rate": 7.694776882661997e-05, "loss": 1.127, "num_input_tokens_seen": 52990512, "step": 3294 }, { "epoch": 0.23080896967786563, "grad_norm": 5.462240695953369, "learning_rate": 7.694077057793344e-05, "loss": 1.2397, "num_input_tokens_seen": 53006768, "step": 3295 }, { "epoch": 0.2308790179235949, "grad_norm": 4.212863445281982, "learning_rate": 7.693377232924695e-05, "loss": 0.9377, "num_input_tokens_seen": 53023152, "step": 3296 }, { "epoch": 0.23094906616932412, "grad_norm": 3.623929977416992, "learning_rate": 7.692677408056044e-05, "loss": 0.9086, "num_input_tokens_seen": 53039536, "step": 3297 }, { "epoch": 0.23101911441505338, "grad_norm": 4.791571617126465, "learning_rate": 7.691977583187391e-05, "loss": 1.0059, "num_input_tokens_seen": 53055920, "step": 3298 }, { "epoch": 0.2310891626607826, "grad_norm": 3.733243465423584, "learning_rate": 7.69127775831874e-05, "loss": 1.1729, "num_input_tokens_seen": 53072304, "step": 3299 }, { "epoch": 0.23115921090651187, "grad_norm": 3.916738986968994, "learning_rate": 7.690577933450087e-05, "loss": 1.2479, "num_input_tokens_seen": 53088568, "step": 3300 }, { "epoch": 0.2312292591522411, "grad_norm": 4.0346856117248535, "learning_rate": 7.689878108581436e-05, "loss": 1.0858, "num_input_tokens_seen": 53103656, "step": 3301 }, { "epoch": 0.23129930739797036, "grad_norm": 4.834316730499268, "learning_rate": 7.689178283712785e-05, "loss": 0.9328, "num_input_tokens_seen": 53120040, "step": 3302 }, { "epoch": 0.2313693556436996, "grad_norm": 4.5966291427612305, "learning_rate": 7.688478458844134e-05, "loss": 1.0108, "num_input_tokens_seen": 53136424, "step": 3303 }, { "epoch": 0.23143940388942885, "grad_norm": 5.17268705368042, "learning_rate": 7.687778633975483e-05, "loss": 1.1559, "num_input_tokens_seen": 53152080, "step": 3304 }, { "epoch": 0.23150945213515808, "grad_norm": 3.6322672367095947, "learning_rate": 7.68707880910683e-05, "loss": 1.0666, "num_input_tokens_seen": 53168464, "step": 3305 }, { "epoch": 0.23157950038088734, "grad_norm": 4.761613368988037, "learning_rate": 7.686378984238179e-05, "loss": 1.032, "num_input_tokens_seen": 53184848, "step": 3306 }, { "epoch": 0.23164954862661657, "grad_norm": 3.4870493412017822, "learning_rate": 7.685679159369528e-05, "loss": 1.026, "num_input_tokens_seen": 53201232, "step": 3307 }, { "epoch": 0.23171959687234583, "grad_norm": 4.122028827667236, "learning_rate": 7.684979334500876e-05, "loss": 1.2103, "num_input_tokens_seen": 53217616, "step": 3308 }, { "epoch": 0.23178964511807507, "grad_norm": 3.4486751556396484, "learning_rate": 7.684279509632224e-05, "loss": 0.6654, "num_input_tokens_seen": 53233936, "step": 3309 }, { "epoch": 0.23185969336380433, "grad_norm": 4.321650981903076, "learning_rate": 7.683579684763573e-05, "loss": 1.106, "num_input_tokens_seen": 53250320, "step": 3310 }, { "epoch": 0.23192974160953356, "grad_norm": 5.820108413696289, "learning_rate": 7.682879859894922e-05, "loss": 1.0225, "num_input_tokens_seen": 53266592, "step": 3311 }, { "epoch": 0.23199978985526282, "grad_norm": 5.5514912605285645, "learning_rate": 7.68218003502627e-05, "loss": 1.1083, "num_input_tokens_seen": 53282976, "step": 3312 }, { "epoch": 0.23206983810099205, "grad_norm": 4.108302116394043, "learning_rate": 7.681480210157618e-05, "loss": 1.1507, "num_input_tokens_seen": 53299184, "step": 3313 }, { "epoch": 0.2321398863467213, "grad_norm": 4.037779331207275, "learning_rate": 7.680780385288967e-05, "loss": 1.2858, "num_input_tokens_seen": 53315000, "step": 3314 }, { "epoch": 0.23220993459245054, "grad_norm": 4.5398383140563965, "learning_rate": 7.680080560420315e-05, "loss": 1.0374, "num_input_tokens_seen": 53331104, "step": 3315 }, { "epoch": 0.2322799828381798, "grad_norm": 4.2399067878723145, "learning_rate": 7.679380735551665e-05, "loss": 1.098, "num_input_tokens_seen": 53347488, "step": 3316 }, { "epoch": 0.23235003108390903, "grad_norm": 5.6600775718688965, "learning_rate": 7.678680910683013e-05, "loss": 0.9446, "num_input_tokens_seen": 53363872, "step": 3317 }, { "epoch": 0.2324200793296383, "grad_norm": 4.462069511413574, "learning_rate": 7.677981085814361e-05, "loss": 0.9313, "num_input_tokens_seen": 53379424, "step": 3318 }, { "epoch": 0.23249012757536752, "grad_norm": 4.644591808319092, "learning_rate": 7.677281260945709e-05, "loss": 1.3155, "num_input_tokens_seen": 53395728, "step": 3319 }, { "epoch": 0.23256017582109678, "grad_norm": 3.860954523086548, "learning_rate": 7.676581436077058e-05, "loss": 1.0917, "num_input_tokens_seen": 53412112, "step": 3320 }, { "epoch": 0.232630224066826, "grad_norm": 4.625146389007568, "learning_rate": 7.675881611208407e-05, "loss": 0.9253, "num_input_tokens_seen": 53427992, "step": 3321 }, { "epoch": 0.23270027231255527, "grad_norm": 6.473335266113281, "learning_rate": 7.675181786339756e-05, "loss": 0.9892, "num_input_tokens_seen": 53444376, "step": 3322 }, { "epoch": 0.2327703205582845, "grad_norm": 3.6846091747283936, "learning_rate": 7.674481961471104e-05, "loss": 0.9976, "num_input_tokens_seen": 53460760, "step": 3323 }, { "epoch": 0.23284036880401376, "grad_norm": 3.784900188446045, "learning_rate": 7.673782136602453e-05, "loss": 0.8865, "num_input_tokens_seen": 53477144, "step": 3324 }, { "epoch": 0.23291041704974302, "grad_norm": 4.175132751464844, "learning_rate": 7.673082311733801e-05, "loss": 1.1741, "num_input_tokens_seen": 53493496, "step": 3325 }, { "epoch": 0.23298046529547226, "grad_norm": 4.355600833892822, "learning_rate": 7.67238248686515e-05, "loss": 0.8686, "num_input_tokens_seen": 53509560, "step": 3326 }, { "epoch": 0.23305051354120151, "grad_norm": 4.32242488861084, "learning_rate": 7.671682661996497e-05, "loss": 0.9493, "num_input_tokens_seen": 53525944, "step": 3327 }, { "epoch": 0.23312056178693075, "grad_norm": 4.937814235687256, "learning_rate": 7.670982837127846e-05, "loss": 1.1617, "num_input_tokens_seen": 53541312, "step": 3328 }, { "epoch": 0.23319061003266, "grad_norm": 3.1939101219177246, "learning_rate": 7.670283012259195e-05, "loss": 0.8866, "num_input_tokens_seen": 53557696, "step": 3329 }, { "epoch": 0.23326065827838924, "grad_norm": 5.137113094329834, "learning_rate": 7.669583187390544e-05, "loss": 0.9911, "num_input_tokens_seen": 53573600, "step": 3330 }, { "epoch": 0.2333307065241185, "grad_norm": 3.777954578399658, "learning_rate": 7.668883362521893e-05, "loss": 1.0047, "num_input_tokens_seen": 53588808, "step": 3331 }, { "epoch": 0.23340075476984773, "grad_norm": 4.229750633239746, "learning_rate": 7.66818353765324e-05, "loss": 1.3247, "num_input_tokens_seen": 53603416, "step": 3332 }, { "epoch": 0.233470803015577, "grad_norm": 4.248676776885986, "learning_rate": 7.667483712784589e-05, "loss": 1.2149, "num_input_tokens_seen": 53618896, "step": 3333 }, { "epoch": 0.23354085126130622, "grad_norm": 3.7393991947174072, "learning_rate": 7.666783887915938e-05, "loss": 1.0339, "num_input_tokens_seen": 53635280, "step": 3334 }, { "epoch": 0.23361089950703548, "grad_norm": 3.6224875450134277, "learning_rate": 7.666084063047285e-05, "loss": 0.8727, "num_input_tokens_seen": 53651664, "step": 3335 }, { "epoch": 0.2336809477527647, "grad_norm": 4.2722063064575195, "learning_rate": 7.665384238178634e-05, "loss": 1.1982, "num_input_tokens_seen": 53668048, "step": 3336 }, { "epoch": 0.23375099599849397, "grad_norm": 3.4717535972595215, "learning_rate": 7.664684413309983e-05, "loss": 0.9695, "num_input_tokens_seen": 53684432, "step": 3337 }, { "epoch": 0.2338210442442232, "grad_norm": 3.6640021800994873, "learning_rate": 7.663984588441332e-05, "loss": 0.8621, "num_input_tokens_seen": 53700816, "step": 3338 }, { "epoch": 0.23389109248995246, "grad_norm": 5.14633321762085, "learning_rate": 7.66328476357268e-05, "loss": 1.1954, "num_input_tokens_seen": 53717200, "step": 3339 }, { "epoch": 0.2339611407356817, "grad_norm": 4.479960918426514, "learning_rate": 7.662584938704028e-05, "loss": 1.1001, "num_input_tokens_seen": 53733584, "step": 3340 }, { "epoch": 0.23403118898141095, "grad_norm": 5.33896017074585, "learning_rate": 7.661885113835377e-05, "loss": 0.8984, "num_input_tokens_seen": 53749072, "step": 3341 }, { "epoch": 0.23410123722714019, "grad_norm": 4.407443046569824, "learning_rate": 7.661185288966726e-05, "loss": 1.2437, "num_input_tokens_seen": 53765088, "step": 3342 }, { "epoch": 0.23417128547286944, "grad_norm": 3.8250956535339355, "learning_rate": 7.660485464098075e-05, "loss": 0.9243, "num_input_tokens_seen": 53781000, "step": 3343 }, { "epoch": 0.23424133371859868, "grad_norm": 4.316215515136719, "learning_rate": 7.659785639229422e-05, "loss": 1.0972, "num_input_tokens_seen": 53796744, "step": 3344 }, { "epoch": 0.23431138196432794, "grad_norm": 4.291647434234619, "learning_rate": 7.659085814360771e-05, "loss": 1.1376, "num_input_tokens_seen": 53813128, "step": 3345 }, { "epoch": 0.23438143021005717, "grad_norm": 3.704899787902832, "learning_rate": 7.658385989492119e-05, "loss": 1.2117, "num_input_tokens_seen": 53829512, "step": 3346 }, { "epoch": 0.23445147845578643, "grad_norm": 3.5979909896850586, "learning_rate": 7.657686164623468e-05, "loss": 0.9604, "num_input_tokens_seen": 53845536, "step": 3347 }, { "epoch": 0.23452152670151566, "grad_norm": 3.8820247650146484, "learning_rate": 7.656986339754816e-05, "loss": 1.2439, "num_input_tokens_seen": 53861920, "step": 3348 }, { "epoch": 0.23459157494724492, "grad_norm": 4.226894855499268, "learning_rate": 7.656286514886165e-05, "loss": 1.0884, "num_input_tokens_seen": 53878304, "step": 3349 }, { "epoch": 0.23466162319297415, "grad_norm": 4.507336616516113, "learning_rate": 7.655586690017514e-05, "loss": 1.0184, "num_input_tokens_seen": 53894688, "step": 3350 }, { "epoch": 0.2347316714387034, "grad_norm": 3.86645245552063, "learning_rate": 7.654886865148863e-05, "loss": 1.0895, "num_input_tokens_seen": 53910736, "step": 3351 }, { "epoch": 0.23480171968443264, "grad_norm": 3.8789820671081543, "learning_rate": 7.65418704028021e-05, "loss": 1.0078, "num_input_tokens_seen": 53926688, "step": 3352 }, { "epoch": 0.2348717679301619, "grad_norm": 3.893564462661743, "learning_rate": 7.653487215411559e-05, "loss": 1.0701, "num_input_tokens_seen": 53942904, "step": 3353 }, { "epoch": 0.23494181617589113, "grad_norm": 4.6554412841796875, "learning_rate": 7.652787390542907e-05, "loss": 1.1396, "num_input_tokens_seen": 53957976, "step": 3354 }, { "epoch": 0.2350118644216204, "grad_norm": 4.118137359619141, "learning_rate": 7.652087565674256e-05, "loss": 1.2019, "num_input_tokens_seen": 53973520, "step": 3355 }, { "epoch": 0.23508191266734962, "grad_norm": 5.099210262298584, "learning_rate": 7.651387740805605e-05, "loss": 0.892, "num_input_tokens_seen": 53989280, "step": 3356 }, { "epoch": 0.23515196091307888, "grad_norm": 3.868797779083252, "learning_rate": 7.650687915936953e-05, "loss": 1.0992, "num_input_tokens_seen": 54005664, "step": 3357 }, { "epoch": 0.23522200915880812, "grad_norm": 4.032477378845215, "learning_rate": 7.649988091068302e-05, "loss": 1.0356, "num_input_tokens_seen": 54022048, "step": 3358 }, { "epoch": 0.23529205740453737, "grad_norm": 3.907238483428955, "learning_rate": 7.64928826619965e-05, "loss": 1.0925, "num_input_tokens_seen": 54038432, "step": 3359 }, { "epoch": 0.23536210565026663, "grad_norm": 3.6504223346710205, "learning_rate": 7.648588441330999e-05, "loss": 0.9708, "num_input_tokens_seen": 54054272, "step": 3360 }, { "epoch": 0.23543215389599587, "grad_norm": 4.614812850952148, "learning_rate": 7.647888616462347e-05, "loss": 1.136, "num_input_tokens_seen": 54070656, "step": 3361 }, { "epoch": 0.23550220214172513, "grad_norm": 4.812591552734375, "learning_rate": 7.647188791593696e-05, "loss": 1.0714, "num_input_tokens_seen": 54086416, "step": 3362 }, { "epoch": 0.23557225038745436, "grad_norm": 3.709543466567993, "learning_rate": 7.646488966725044e-05, "loss": 1.106, "num_input_tokens_seen": 54102800, "step": 3363 }, { "epoch": 0.23564229863318362, "grad_norm": 3.9850802421569824, "learning_rate": 7.645789141856393e-05, "loss": 1.1509, "num_input_tokens_seen": 54119184, "step": 3364 }, { "epoch": 0.23571234687891285, "grad_norm": 4.59740686416626, "learning_rate": 7.645089316987742e-05, "loss": 1.1974, "num_input_tokens_seen": 54135568, "step": 3365 }, { "epoch": 0.2357823951246421, "grad_norm": 4.118459224700928, "learning_rate": 7.644389492119089e-05, "loss": 1.2196, "num_input_tokens_seen": 54151952, "step": 3366 }, { "epoch": 0.23585244337037134, "grad_norm": 4.172552108764648, "learning_rate": 7.643689667250438e-05, "loss": 1.0178, "num_input_tokens_seen": 54167776, "step": 3367 }, { "epoch": 0.2359224916161006, "grad_norm": 3.9671120643615723, "learning_rate": 7.642989842381787e-05, "loss": 1.0589, "num_input_tokens_seen": 54184160, "step": 3368 }, { "epoch": 0.23599253986182983, "grad_norm": 3.7376415729522705, "learning_rate": 7.642290017513136e-05, "loss": 1.1445, "num_input_tokens_seen": 54200280, "step": 3369 }, { "epoch": 0.2360625881075591, "grad_norm": 4.665002346038818, "learning_rate": 7.641590192644484e-05, "loss": 1.3347, "num_input_tokens_seen": 54216664, "step": 3370 }, { "epoch": 0.23613263635328832, "grad_norm": 3.669015884399414, "learning_rate": 7.640890367775832e-05, "loss": 0.8359, "num_input_tokens_seen": 54232320, "step": 3371 }, { "epoch": 0.23620268459901758, "grad_norm": 3.993393659591675, "learning_rate": 7.640190542907181e-05, "loss": 1.0298, "num_input_tokens_seen": 54248704, "step": 3372 }, { "epoch": 0.2362727328447468, "grad_norm": 3.808516263961792, "learning_rate": 7.639490718038528e-05, "loss": 1.1315, "num_input_tokens_seen": 54265088, "step": 3373 }, { "epoch": 0.23634278109047607, "grad_norm": 5.25230073928833, "learning_rate": 7.638790893169877e-05, "loss": 1.1273, "num_input_tokens_seen": 54281256, "step": 3374 }, { "epoch": 0.2364128293362053, "grad_norm": 5.724976062774658, "learning_rate": 7.638091068301226e-05, "loss": 1.3176, "num_input_tokens_seen": 54296832, "step": 3375 }, { "epoch": 0.23648287758193456, "grad_norm": 3.553737163543701, "learning_rate": 7.637391243432575e-05, "loss": 1.0288, "num_input_tokens_seen": 54313120, "step": 3376 }, { "epoch": 0.2365529258276638, "grad_norm": 6.614949703216553, "learning_rate": 7.636691418563924e-05, "loss": 1.0649, "num_input_tokens_seen": 54328184, "step": 3377 }, { "epoch": 0.23662297407339306, "grad_norm": 3.76234769821167, "learning_rate": 7.635991593695273e-05, "loss": 1.149, "num_input_tokens_seen": 54344568, "step": 3378 }, { "epoch": 0.2366930223191223, "grad_norm": 3.4564521312713623, "learning_rate": 7.63529176882662e-05, "loss": 0.9227, "num_input_tokens_seen": 54360952, "step": 3379 }, { "epoch": 0.23676307056485155, "grad_norm": 3.735978841781616, "learning_rate": 7.634591943957969e-05, "loss": 1.2159, "num_input_tokens_seen": 54377336, "step": 3380 }, { "epoch": 0.23683311881058078, "grad_norm": 4.106653690338135, "learning_rate": 7.633892119089317e-05, "loss": 1.0997, "num_input_tokens_seen": 54393232, "step": 3381 }, { "epoch": 0.23690316705631004, "grad_norm": 3.9169600009918213, "learning_rate": 7.633192294220667e-05, "loss": 1.247, "num_input_tokens_seen": 54409616, "step": 3382 }, { "epoch": 0.23697321530203927, "grad_norm": 3.8265388011932373, "learning_rate": 7.632492469352014e-05, "loss": 1.1391, "num_input_tokens_seen": 54425312, "step": 3383 }, { "epoch": 0.23704326354776853, "grad_norm": 3.6288204193115234, "learning_rate": 7.631792644483363e-05, "loss": 1.0445, "num_input_tokens_seen": 54441696, "step": 3384 }, { "epoch": 0.23711331179349776, "grad_norm": 4.207483291625977, "learning_rate": 7.631092819614712e-05, "loss": 1.2068, "num_input_tokens_seen": 54457720, "step": 3385 }, { "epoch": 0.23718336003922702, "grad_norm": 3.880786895751953, "learning_rate": 7.63039299474606e-05, "loss": 1.0471, "num_input_tokens_seen": 54474104, "step": 3386 }, { "epoch": 0.23725340828495625, "grad_norm": 4.493243217468262, "learning_rate": 7.629693169877408e-05, "loss": 1.1107, "num_input_tokens_seen": 54490080, "step": 3387 }, { "epoch": 0.2373234565306855, "grad_norm": 4.432561874389648, "learning_rate": 7.628993345008757e-05, "loss": 1.1474, "num_input_tokens_seen": 54506464, "step": 3388 }, { "epoch": 0.23739350477641474, "grad_norm": 4.210158824920654, "learning_rate": 7.628293520140106e-05, "loss": 1.1567, "num_input_tokens_seen": 54522848, "step": 3389 }, { "epoch": 0.237463553022144, "grad_norm": 4.561443328857422, "learning_rate": 7.627593695271454e-05, "loss": 1.2793, "num_input_tokens_seen": 54538192, "step": 3390 }, { "epoch": 0.23753360126787323, "grad_norm": 3.6792140007019043, "learning_rate": 7.626893870402802e-05, "loss": 0.9692, "num_input_tokens_seen": 54554576, "step": 3391 }, { "epoch": 0.2376036495136025, "grad_norm": 4.3415141105651855, "learning_rate": 7.626194045534151e-05, "loss": 1.1777, "num_input_tokens_seen": 54570960, "step": 3392 }, { "epoch": 0.23767369775933175, "grad_norm": 3.770224094390869, "learning_rate": 7.625494220665499e-05, "loss": 1.1923, "num_input_tokens_seen": 54587344, "step": 3393 }, { "epoch": 0.23774374600506099, "grad_norm": 3.7803759574890137, "learning_rate": 7.624794395796848e-05, "loss": 1.1631, "num_input_tokens_seen": 54603728, "step": 3394 }, { "epoch": 0.23781379425079024, "grad_norm": 4.559312343597412, "learning_rate": 7.624094570928196e-05, "loss": 1.0235, "num_input_tokens_seen": 54619760, "step": 3395 }, { "epoch": 0.23788384249651948, "grad_norm": 4.215981483459473, "learning_rate": 7.623394746059545e-05, "loss": 1.2803, "num_input_tokens_seen": 54636144, "step": 3396 }, { "epoch": 0.23795389074224874, "grad_norm": 4.108291149139404, "learning_rate": 7.622694921190894e-05, "loss": 1.0486, "num_input_tokens_seen": 54652136, "step": 3397 }, { "epoch": 0.23802393898797797, "grad_norm": 4.4075093269348145, "learning_rate": 7.621995096322242e-05, "loss": 1.0766, "num_input_tokens_seen": 54668520, "step": 3398 }, { "epoch": 0.23809398723370723, "grad_norm": 4.002575874328613, "learning_rate": 7.62129527145359e-05, "loss": 1.1793, "num_input_tokens_seen": 54684544, "step": 3399 }, { "epoch": 0.23816403547943646, "grad_norm": 3.5264174938201904, "learning_rate": 7.620595446584938e-05, "loss": 0.928, "num_input_tokens_seen": 54700680, "step": 3400 }, { "epoch": 0.23816403547943646, "eval_loss": 1.1361509561538696, "eval_runtime": 0.1856, "eval_samples_per_second": 5.389, "eval_steps_per_second": 5.389, "num_input_tokens_seen": 54700680, "step": 3400 }, { "epoch": 0.23823408372516572, "grad_norm": 3.585204839706421, "learning_rate": 7.619895621716287e-05, "loss": 1.0865, "num_input_tokens_seen": 54717064, "step": 3401 }, { "epoch": 0.23830413197089495, "grad_norm": 4.442777633666992, "learning_rate": 7.619195796847637e-05, "loss": 0.9445, "num_input_tokens_seen": 54732648, "step": 3402 }, { "epoch": 0.2383741802166242, "grad_norm": 3.807063102722168, "learning_rate": 7.618495971978985e-05, "loss": 1.0127, "num_input_tokens_seen": 54749032, "step": 3403 }, { "epoch": 0.23844422846235344, "grad_norm": 4.984583854675293, "learning_rate": 7.617796147110333e-05, "loss": 1.349, "num_input_tokens_seen": 54764192, "step": 3404 }, { "epoch": 0.2385142767080827, "grad_norm": 4.326750755310059, "learning_rate": 7.617096322241682e-05, "loss": 1.0875, "num_input_tokens_seen": 54780120, "step": 3405 }, { "epoch": 0.23858432495381193, "grad_norm": 5.707291126251221, "learning_rate": 7.61639649737303e-05, "loss": 1.0816, "num_input_tokens_seen": 54796168, "step": 3406 }, { "epoch": 0.2386543731995412, "grad_norm": 4.450499534606934, "learning_rate": 7.615696672504379e-05, "loss": 1.139, "num_input_tokens_seen": 54812056, "step": 3407 }, { "epoch": 0.23872442144527042, "grad_norm": 4.253554821014404, "learning_rate": 7.614996847635728e-05, "loss": 1.1798, "num_input_tokens_seen": 54828248, "step": 3408 }, { "epoch": 0.23879446969099968, "grad_norm": 5.04890251159668, "learning_rate": 7.614297022767076e-05, "loss": 0.9968, "num_input_tokens_seen": 54844632, "step": 3409 }, { "epoch": 0.23886451793672892, "grad_norm": 3.24513578414917, "learning_rate": 7.613597197898424e-05, "loss": 0.8901, "num_input_tokens_seen": 54861016, "step": 3410 }, { "epoch": 0.23893456618245817, "grad_norm": 4.008625507354736, "learning_rate": 7.612897373029773e-05, "loss": 1.1048, "num_input_tokens_seen": 54877168, "step": 3411 }, { "epoch": 0.2390046144281874, "grad_norm": 5.393536567687988, "learning_rate": 7.612197548161122e-05, "loss": 1.1554, "num_input_tokens_seen": 54892720, "step": 3412 }, { "epoch": 0.23907466267391667, "grad_norm": 4.388333797454834, "learning_rate": 7.611497723292469e-05, "loss": 1.0478, "num_input_tokens_seen": 54909104, "step": 3413 }, { "epoch": 0.2391447109196459, "grad_norm": 3.8056883811950684, "learning_rate": 7.610797898423818e-05, "loss": 0.9235, "num_input_tokens_seen": 54925280, "step": 3414 }, { "epoch": 0.23921475916537516, "grad_norm": 6.9983062744140625, "learning_rate": 7.610098073555167e-05, "loss": 1.0766, "num_input_tokens_seen": 54941384, "step": 3415 }, { "epoch": 0.2392848074111044, "grad_norm": 3.485119581222534, "learning_rate": 7.609398248686516e-05, "loss": 1.0811, "num_input_tokens_seen": 54957592, "step": 3416 }, { "epoch": 0.23935485565683365, "grad_norm": 4.450938701629639, "learning_rate": 7.608698423817863e-05, "loss": 0.9354, "num_input_tokens_seen": 54973976, "step": 3417 }, { "epoch": 0.23942490390256288, "grad_norm": 4.142702579498291, "learning_rate": 7.607998598949212e-05, "loss": 1.0336, "num_input_tokens_seen": 54990360, "step": 3418 }, { "epoch": 0.23949495214829214, "grad_norm": 4.341495513916016, "learning_rate": 7.607298774080561e-05, "loss": 0.9722, "num_input_tokens_seen": 55006744, "step": 3419 }, { "epoch": 0.23956500039402137, "grad_norm": 4.355419158935547, "learning_rate": 7.606598949211908e-05, "loss": 0.9972, "num_input_tokens_seen": 55022816, "step": 3420 }, { "epoch": 0.23963504863975063, "grad_norm": 4.295046806335449, "learning_rate": 7.605899124343257e-05, "loss": 1.1881, "num_input_tokens_seen": 55039200, "step": 3421 }, { "epoch": 0.23970509688547986, "grad_norm": 3.9299042224884033, "learning_rate": 7.605199299474608e-05, "loss": 1.0959, "num_input_tokens_seen": 55055552, "step": 3422 }, { "epoch": 0.23977514513120912, "grad_norm": 3.7252607345581055, "learning_rate": 7.604499474605955e-05, "loss": 0.9151, "num_input_tokens_seen": 55071936, "step": 3423 }, { "epoch": 0.23984519337693835, "grad_norm": 4.723415851593018, "learning_rate": 7.603799649737304e-05, "loss": 0.9568, "num_input_tokens_seen": 55088320, "step": 3424 }, { "epoch": 0.2399152416226676, "grad_norm": 3.9923605918884277, "learning_rate": 7.603099824868651e-05, "loss": 1.1124, "num_input_tokens_seen": 55104416, "step": 3425 }, { "epoch": 0.23998528986839684, "grad_norm": 4.510697364807129, "learning_rate": 7.6024e-05, "loss": 1.1397, "num_input_tokens_seen": 55120800, "step": 3426 }, { "epoch": 0.2400553381141261, "grad_norm": 4.161818027496338, "learning_rate": 7.601700175131348e-05, "loss": 1.0915, "num_input_tokens_seen": 55137184, "step": 3427 }, { "epoch": 0.24012538635985536, "grad_norm": 5.871128082275391, "learning_rate": 7.601000350262698e-05, "loss": 0.9465, "num_input_tokens_seen": 55152528, "step": 3428 }, { "epoch": 0.2401954346055846, "grad_norm": 4.180598258972168, "learning_rate": 7.600300525394047e-05, "loss": 1.0132, "num_input_tokens_seen": 55168552, "step": 3429 }, { "epoch": 0.24026548285131386, "grad_norm": 5.575338363647461, "learning_rate": 7.599600700525394e-05, "loss": 1.2578, "num_input_tokens_seen": 55184104, "step": 3430 }, { "epoch": 0.2403355310970431, "grad_norm": 4.503122329711914, "learning_rate": 7.598900875656743e-05, "loss": 1.1367, "num_input_tokens_seen": 55199768, "step": 3431 }, { "epoch": 0.24040557934277235, "grad_norm": 3.6931769847869873, "learning_rate": 7.598201050788092e-05, "loss": 1.0977, "num_input_tokens_seen": 55216016, "step": 3432 }, { "epoch": 0.24047562758850158, "grad_norm": 4.138489723205566, "learning_rate": 7.59750122591944e-05, "loss": 1.1163, "num_input_tokens_seen": 55232400, "step": 3433 }, { "epoch": 0.24054567583423084, "grad_norm": 3.603297710418701, "learning_rate": 7.596801401050788e-05, "loss": 1.1277, "num_input_tokens_seen": 55248784, "step": 3434 }, { "epoch": 0.24061572407996007, "grad_norm": 4.072240352630615, "learning_rate": 7.596101576182137e-05, "loss": 1.3073, "num_input_tokens_seen": 55264320, "step": 3435 }, { "epoch": 0.24068577232568933, "grad_norm": 5.015305519104004, "learning_rate": 7.595401751313486e-05, "loss": 1.3236, "num_input_tokens_seen": 55280528, "step": 3436 }, { "epoch": 0.24075582057141856, "grad_norm": 5.135364055633545, "learning_rate": 7.594701926444834e-05, "loss": 1.0322, "num_input_tokens_seen": 55296912, "step": 3437 }, { "epoch": 0.24082586881714782, "grad_norm": 4.737668991088867, "learning_rate": 7.594002101576183e-05, "loss": 1.0069, "num_input_tokens_seen": 55313296, "step": 3438 }, { "epoch": 0.24089591706287705, "grad_norm": 4.380087375640869, "learning_rate": 7.593302276707531e-05, "loss": 1.267, "num_input_tokens_seen": 55329152, "step": 3439 }, { "epoch": 0.2409659653086063, "grad_norm": 4.472866535186768, "learning_rate": 7.592602451838879e-05, "loss": 1.1577, "num_input_tokens_seen": 55345536, "step": 3440 }, { "epoch": 0.24103601355433554, "grad_norm": 4.323402404785156, "learning_rate": 7.591902626970228e-05, "loss": 1.1872, "num_input_tokens_seen": 55361920, "step": 3441 }, { "epoch": 0.2411060618000648, "grad_norm": 3.7247276306152344, "learning_rate": 7.591202802101578e-05, "loss": 1.0906, "num_input_tokens_seen": 55377344, "step": 3442 }, { "epoch": 0.24117611004579403, "grad_norm": 6.503116607666016, "learning_rate": 7.590502977232925e-05, "loss": 1.2304, "num_input_tokens_seen": 55393728, "step": 3443 }, { "epoch": 0.2412461582915233, "grad_norm": 4.590184688568115, "learning_rate": 7.589803152364273e-05, "loss": 1.1369, "num_input_tokens_seen": 55410112, "step": 3444 }, { "epoch": 0.24131620653725253, "grad_norm": 3.718323230743408, "learning_rate": 7.589103327495622e-05, "loss": 1.09, "num_input_tokens_seen": 55426392, "step": 3445 }, { "epoch": 0.24138625478298179, "grad_norm": 4.8696465492248535, "learning_rate": 7.58840350262697e-05, "loss": 1.2361, "num_input_tokens_seen": 55442632, "step": 3446 }, { "epoch": 0.24145630302871102, "grad_norm": 3.7620716094970703, "learning_rate": 7.587703677758318e-05, "loss": 0.9411, "num_input_tokens_seen": 55459016, "step": 3447 }, { "epoch": 0.24152635127444028, "grad_norm": 3.8696882724761963, "learning_rate": 7.587003852889668e-05, "loss": 0.992, "num_input_tokens_seen": 55474944, "step": 3448 }, { "epoch": 0.2415963995201695, "grad_norm": 4.628901481628418, "learning_rate": 7.586304028021017e-05, "loss": 1.1376, "num_input_tokens_seen": 55490416, "step": 3449 }, { "epoch": 0.24166644776589877, "grad_norm": 4.1568121910095215, "learning_rate": 7.585604203152365e-05, "loss": 1.1596, "num_input_tokens_seen": 55505760, "step": 3450 }, { "epoch": 0.241736496011628, "grad_norm": 4.089991569519043, "learning_rate": 7.584904378283714e-05, "loss": 1.1707, "num_input_tokens_seen": 55521528, "step": 3451 }, { "epoch": 0.24180654425735726, "grad_norm": 7.870755195617676, "learning_rate": 7.584204553415061e-05, "loss": 1.3069, "num_input_tokens_seen": 55536256, "step": 3452 }, { "epoch": 0.2418765925030865, "grad_norm": 4.898053169250488, "learning_rate": 7.58350472854641e-05, "loss": 1.0912, "num_input_tokens_seen": 55551904, "step": 3453 }, { "epoch": 0.24194664074881575, "grad_norm": 4.515797138214111, "learning_rate": 7.582804903677759e-05, "loss": 1.2266, "num_input_tokens_seen": 55567240, "step": 3454 }, { "epoch": 0.24201668899454498, "grad_norm": 3.7202370166778564, "learning_rate": 7.582105078809108e-05, "loss": 1.0118, "num_input_tokens_seen": 55583176, "step": 3455 }, { "epoch": 0.24208673724027424, "grad_norm": 5.834963321685791, "learning_rate": 7.581405253940457e-05, "loss": 1.3757, "num_input_tokens_seen": 55599144, "step": 3456 }, { "epoch": 0.24215678548600347, "grad_norm": 4.450705528259277, "learning_rate": 7.580705429071804e-05, "loss": 0.9523, "num_input_tokens_seen": 55615528, "step": 3457 }, { "epoch": 0.24222683373173273, "grad_norm": 3.796229839324951, "learning_rate": 7.580005604203153e-05, "loss": 1.0415, "num_input_tokens_seen": 55631912, "step": 3458 }, { "epoch": 0.24229688197746196, "grad_norm": 4.004448413848877, "learning_rate": 7.579305779334502e-05, "loss": 1.1538, "num_input_tokens_seen": 55647896, "step": 3459 }, { "epoch": 0.24236693022319122, "grad_norm": 4.511063098907471, "learning_rate": 7.578605954465849e-05, "loss": 1.0616, "num_input_tokens_seen": 55664280, "step": 3460 }, { "epoch": 0.24243697846892046, "grad_norm": 6.866496562957764, "learning_rate": 7.577906129597198e-05, "loss": 1.1126, "num_input_tokens_seen": 55679720, "step": 3461 }, { "epoch": 0.24250702671464971, "grad_norm": 5.447164058685303, "learning_rate": 7.577206304728547e-05, "loss": 1.0812, "num_input_tokens_seen": 55696104, "step": 3462 }, { "epoch": 0.24257707496037897, "grad_norm": 6.401725769042969, "learning_rate": 7.576506479859896e-05, "loss": 1.0097, "num_input_tokens_seen": 55712488, "step": 3463 }, { "epoch": 0.2426471232061082, "grad_norm": 3.7833733558654785, "learning_rate": 7.575806654991243e-05, "loss": 1.0064, "num_input_tokens_seen": 55728504, "step": 3464 }, { "epoch": 0.24271717145183747, "grad_norm": 4.157958984375, "learning_rate": 7.575106830122592e-05, "loss": 1.109, "num_input_tokens_seen": 55743904, "step": 3465 }, { "epoch": 0.2427872196975667, "grad_norm": 4.657470703125, "learning_rate": 7.574407005253941e-05, "loss": 1.2033, "num_input_tokens_seen": 55759920, "step": 3466 }, { "epoch": 0.24285726794329596, "grad_norm": 5.129040718078613, "learning_rate": 7.573707180385289e-05, "loss": 1.2959, "num_input_tokens_seen": 55776304, "step": 3467 }, { "epoch": 0.2429273161890252, "grad_norm": 4.887351036071777, "learning_rate": 7.573007355516639e-05, "loss": 1.0568, "num_input_tokens_seen": 55792688, "step": 3468 }, { "epoch": 0.24299736443475445, "grad_norm": 4.042501926422119, "learning_rate": 7.572307530647988e-05, "loss": 1.0509, "num_input_tokens_seen": 55809072, "step": 3469 }, { "epoch": 0.24306741268048368, "grad_norm": 4.162355422973633, "learning_rate": 7.571607705779335e-05, "loss": 1.0168, "num_input_tokens_seen": 55825456, "step": 3470 }, { "epoch": 0.24313746092621294, "grad_norm": 3.8478844165802, "learning_rate": 7.570907880910683e-05, "loss": 1.2378, "num_input_tokens_seen": 55841840, "step": 3471 }, { "epoch": 0.24320750917194217, "grad_norm": 6.2065815925598145, "learning_rate": 7.570208056042032e-05, "loss": 1.3644, "num_input_tokens_seen": 55858224, "step": 3472 }, { "epoch": 0.24327755741767143, "grad_norm": 4.8233642578125, "learning_rate": 7.56950823117338e-05, "loss": 1.1363, "num_input_tokens_seen": 55874608, "step": 3473 }, { "epoch": 0.24334760566340066, "grad_norm": 3.534205198287964, "learning_rate": 7.568808406304729e-05, "loss": 1.0236, "num_input_tokens_seen": 55890992, "step": 3474 }, { "epoch": 0.24341765390912992, "grad_norm": 4.218345642089844, "learning_rate": 7.568108581436078e-05, "loss": 1.0921, "num_input_tokens_seen": 55907376, "step": 3475 }, { "epoch": 0.24348770215485915, "grad_norm": 3.7262325286865234, "learning_rate": 7.567408756567427e-05, "loss": 1.1182, "num_input_tokens_seen": 55923752, "step": 3476 }, { "epoch": 0.2435577504005884, "grad_norm": 3.7530906200408936, "learning_rate": 7.566708931698774e-05, "loss": 1.0766, "num_input_tokens_seen": 55939176, "step": 3477 }, { "epoch": 0.24362779864631764, "grad_norm": 4.452608585357666, "learning_rate": 7.566009106830123e-05, "loss": 0.9421, "num_input_tokens_seen": 55955200, "step": 3478 }, { "epoch": 0.2436978468920469, "grad_norm": 4.049906253814697, "learning_rate": 7.565309281961471e-05, "loss": 1.1022, "num_input_tokens_seen": 55971584, "step": 3479 }, { "epoch": 0.24376789513777614, "grad_norm": 4.956455230712891, "learning_rate": 7.56460945709282e-05, "loss": 1.2684, "num_input_tokens_seen": 55987968, "step": 3480 }, { "epoch": 0.2438379433835054, "grad_norm": 4.846863746643066, "learning_rate": 7.563909632224169e-05, "loss": 1.0492, "num_input_tokens_seen": 56003000, "step": 3481 }, { "epoch": 0.24390799162923463, "grad_norm": 4.678101539611816, "learning_rate": 7.563209807355517e-05, "loss": 0.8577, "num_input_tokens_seen": 56019384, "step": 3482 }, { "epoch": 0.2439780398749639, "grad_norm": 4.13012170791626, "learning_rate": 7.562509982486866e-05, "loss": 0.9508, "num_input_tokens_seen": 56035768, "step": 3483 }, { "epoch": 0.24404808812069312, "grad_norm": 3.7858669757843018, "learning_rate": 7.561810157618214e-05, "loss": 1.1034, "num_input_tokens_seen": 56052152, "step": 3484 }, { "epoch": 0.24411813636642238, "grad_norm": 3.7132198810577393, "learning_rate": 7.561110332749563e-05, "loss": 1.0665, "num_input_tokens_seen": 56068536, "step": 3485 }, { "epoch": 0.2441881846121516, "grad_norm": 4.093675136566162, "learning_rate": 7.560410507880911e-05, "loss": 1.0994, "num_input_tokens_seen": 56084888, "step": 3486 }, { "epoch": 0.24425823285788087, "grad_norm": 3.8601670265197754, "learning_rate": 7.559710683012259e-05, "loss": 0.9648, "num_input_tokens_seen": 56101272, "step": 3487 }, { "epoch": 0.2443282811036101, "grad_norm": 3.9332170486450195, "learning_rate": 7.559010858143608e-05, "loss": 0.9559, "num_input_tokens_seen": 56117352, "step": 3488 }, { "epoch": 0.24439832934933936, "grad_norm": 3.7619807720184326, "learning_rate": 7.558311033274957e-05, "loss": 1.0948, "num_input_tokens_seen": 56133736, "step": 3489 }, { "epoch": 0.2444683775950686, "grad_norm": 3.9035656452178955, "learning_rate": 7.557611208406306e-05, "loss": 1.1255, "num_input_tokens_seen": 56149624, "step": 3490 }, { "epoch": 0.24453842584079785, "grad_norm": 5.9505486488342285, "learning_rate": 7.556911383537653e-05, "loss": 1.0192, "num_input_tokens_seen": 56163752, "step": 3491 }, { "epoch": 0.24460847408652708, "grad_norm": 4.006525039672852, "learning_rate": 7.556211558669002e-05, "loss": 1.0859, "num_input_tokens_seen": 56180136, "step": 3492 }, { "epoch": 0.24467852233225634, "grad_norm": 5.28178071975708, "learning_rate": 7.555511733800351e-05, "loss": 1.3704, "num_input_tokens_seen": 56196152, "step": 3493 }, { "epoch": 0.24474857057798557, "grad_norm": 3.577709674835205, "learning_rate": 7.5548119089317e-05, "loss": 1.0015, "num_input_tokens_seen": 56212528, "step": 3494 }, { "epoch": 0.24481861882371483, "grad_norm": 4.6045002937316895, "learning_rate": 7.554112084063048e-05, "loss": 1.1895, "num_input_tokens_seen": 56228912, "step": 3495 }, { "epoch": 0.24488866706944407, "grad_norm": 4.160959720611572, "learning_rate": 7.553412259194397e-05, "loss": 1.1253, "num_input_tokens_seen": 56244944, "step": 3496 }, { "epoch": 0.24495871531517333, "grad_norm": 4.380669593811035, "learning_rate": 7.552712434325745e-05, "loss": 1.0171, "num_input_tokens_seen": 56261072, "step": 3497 }, { "epoch": 0.24502876356090258, "grad_norm": 3.568679094314575, "learning_rate": 7.552012609457092e-05, "loss": 1.0005, "num_input_tokens_seen": 56277456, "step": 3498 }, { "epoch": 0.24509881180663182, "grad_norm": 4.006386756896973, "learning_rate": 7.551312784588441e-05, "loss": 1.0756, "num_input_tokens_seen": 56293840, "step": 3499 }, { "epoch": 0.24516886005236108, "grad_norm": 4.180081844329834, "learning_rate": 7.55061295971979e-05, "loss": 1.303, "num_input_tokens_seen": 56310224, "step": 3500 }, { "epoch": 0.2452389082980903, "grad_norm": 5.228555202484131, "learning_rate": 7.549913134851139e-05, "loss": 1.0267, "num_input_tokens_seen": 56326608, "step": 3501 }, { "epoch": 0.24530895654381957, "grad_norm": 3.60235595703125, "learning_rate": 7.549213309982488e-05, "loss": 0.9258, "num_input_tokens_seen": 56342752, "step": 3502 }, { "epoch": 0.2453790047895488, "grad_norm": 5.305295467376709, "learning_rate": 7.548513485113837e-05, "loss": 1.0366, "num_input_tokens_seen": 56358208, "step": 3503 }, { "epoch": 0.24544905303527806, "grad_norm": 4.5955939292907715, "learning_rate": 7.547813660245184e-05, "loss": 1.1016, "num_input_tokens_seen": 56374592, "step": 3504 }, { "epoch": 0.2455191012810073, "grad_norm": 4.503798961639404, "learning_rate": 7.547113835376533e-05, "loss": 0.9045, "num_input_tokens_seen": 56390320, "step": 3505 }, { "epoch": 0.24558914952673655, "grad_norm": 3.79156231880188, "learning_rate": 7.54641401050788e-05, "loss": 0.9796, "num_input_tokens_seen": 56406176, "step": 3506 }, { "epoch": 0.24565919777246578, "grad_norm": 4.054116249084473, "learning_rate": 7.54571418563923e-05, "loss": 1.0002, "num_input_tokens_seen": 56421120, "step": 3507 }, { "epoch": 0.24572924601819504, "grad_norm": 3.57210636138916, "learning_rate": 7.545014360770578e-05, "loss": 0.982, "num_input_tokens_seen": 56437504, "step": 3508 }, { "epoch": 0.24579929426392427, "grad_norm": 4.02463960647583, "learning_rate": 7.544314535901927e-05, "loss": 1.0651, "num_input_tokens_seen": 56452680, "step": 3509 }, { "epoch": 0.24586934250965353, "grad_norm": 3.712689161300659, "learning_rate": 7.543614711033276e-05, "loss": 1.1449, "num_input_tokens_seen": 56468424, "step": 3510 }, { "epoch": 0.24593939075538276, "grad_norm": 4.943066596984863, "learning_rate": 7.542914886164623e-05, "loss": 1.2289, "num_input_tokens_seen": 56484784, "step": 3511 }, { "epoch": 0.24600943900111202, "grad_norm": 4.94294548034668, "learning_rate": 7.542215061295972e-05, "loss": 1.0088, "num_input_tokens_seen": 56500992, "step": 3512 }, { "epoch": 0.24607948724684126, "grad_norm": 4.003958225250244, "learning_rate": 7.541515236427321e-05, "loss": 0.9778, "num_input_tokens_seen": 56516392, "step": 3513 }, { "epoch": 0.24614953549257051, "grad_norm": 4.173887252807617, "learning_rate": 7.540815411558669e-05, "loss": 1.0291, "num_input_tokens_seen": 56532776, "step": 3514 }, { "epoch": 0.24621958373829975, "grad_norm": 5.028538227081299, "learning_rate": 7.540115586690019e-05, "loss": 1.0489, "num_input_tokens_seen": 56549056, "step": 3515 }, { "epoch": 0.246289631984029, "grad_norm": 3.781219959259033, "learning_rate": 7.539415761821366e-05, "loss": 0.9056, "num_input_tokens_seen": 56564768, "step": 3516 }, { "epoch": 0.24635968022975824, "grad_norm": 4.070143222808838, "learning_rate": 7.538715936952715e-05, "loss": 1.0561, "num_input_tokens_seen": 56580856, "step": 3517 }, { "epoch": 0.2464297284754875, "grad_norm": 4.322885036468506, "learning_rate": 7.538016112084063e-05, "loss": 0.9834, "num_input_tokens_seen": 56596664, "step": 3518 }, { "epoch": 0.24649977672121673, "grad_norm": 7.525569438934326, "learning_rate": 7.537316287215412e-05, "loss": 1.278, "num_input_tokens_seen": 56611608, "step": 3519 }, { "epoch": 0.246569824966946, "grad_norm": 3.6520745754241943, "learning_rate": 7.53661646234676e-05, "loss": 0.9472, "num_input_tokens_seen": 56627992, "step": 3520 }, { "epoch": 0.24663987321267522, "grad_norm": 6.346038341522217, "learning_rate": 7.53591663747811e-05, "loss": 1.0585, "num_input_tokens_seen": 56644224, "step": 3521 }, { "epoch": 0.24670992145840448, "grad_norm": 5.049849987030029, "learning_rate": 7.535216812609458e-05, "loss": 1.1843, "num_input_tokens_seen": 56660464, "step": 3522 }, { "epoch": 0.2467799697041337, "grad_norm": 5.948208332061768, "learning_rate": 7.534516987740807e-05, "loss": 1.1426, "num_input_tokens_seen": 56676848, "step": 3523 }, { "epoch": 0.24685001794986297, "grad_norm": 4.2648210525512695, "learning_rate": 7.533817162872155e-05, "loss": 1.0703, "num_input_tokens_seen": 56692800, "step": 3524 }, { "epoch": 0.2469200661955922, "grad_norm": 4.268098831176758, "learning_rate": 7.533117338003502e-05, "loss": 1.123, "num_input_tokens_seen": 56709184, "step": 3525 }, { "epoch": 0.24699011444132146, "grad_norm": 3.987408399581909, "learning_rate": 7.532417513134851e-05, "loss": 1.1312, "num_input_tokens_seen": 56724744, "step": 3526 }, { "epoch": 0.2470601626870507, "grad_norm": 4.087530612945557, "learning_rate": 7.5317176882662e-05, "loss": 0.9904, "num_input_tokens_seen": 56741128, "step": 3527 }, { "epoch": 0.24713021093277995, "grad_norm": 3.484837770462036, "learning_rate": 7.531017863397549e-05, "loss": 1.0385, "num_input_tokens_seen": 56757512, "step": 3528 }, { "epoch": 0.24720025917850919, "grad_norm": 4.382214546203613, "learning_rate": 7.530318038528898e-05, "loss": 1.1713, "num_input_tokens_seen": 56773896, "step": 3529 }, { "epoch": 0.24727030742423844, "grad_norm": 4.361959457397461, "learning_rate": 7.529618213660246e-05, "loss": 1.2548, "num_input_tokens_seen": 56789184, "step": 3530 }, { "epoch": 0.24734035566996768, "grad_norm": 4.029329776763916, "learning_rate": 7.528918388791594e-05, "loss": 1.0637, "num_input_tokens_seen": 56804912, "step": 3531 }, { "epoch": 0.24741040391569694, "grad_norm": 4.577064514160156, "learning_rate": 7.528218563922943e-05, "loss": 0.9591, "num_input_tokens_seen": 56821296, "step": 3532 }, { "epoch": 0.2474804521614262, "grad_norm": 3.6799368858337402, "learning_rate": 7.52751873905429e-05, "loss": 1.0508, "num_input_tokens_seen": 56837680, "step": 3533 }, { "epoch": 0.24755050040715543, "grad_norm": 3.962989568710327, "learning_rate": 7.526818914185639e-05, "loss": 1.1347, "num_input_tokens_seen": 56853984, "step": 3534 }, { "epoch": 0.2476205486528847, "grad_norm": 3.610877275466919, "learning_rate": 7.526119089316988e-05, "loss": 1.075, "num_input_tokens_seen": 56870368, "step": 3535 }, { "epoch": 0.24769059689861392, "grad_norm": 4.16568660736084, "learning_rate": 7.525419264448337e-05, "loss": 0.9326, "num_input_tokens_seen": 56886032, "step": 3536 }, { "epoch": 0.24776064514434318, "grad_norm": 4.645627021789551, "learning_rate": 7.524719439579686e-05, "loss": 1.0221, "num_input_tokens_seen": 56900928, "step": 3537 }, { "epoch": 0.2478306933900724, "grad_norm": 3.652317523956299, "learning_rate": 7.524019614711033e-05, "loss": 1.1641, "num_input_tokens_seen": 56917048, "step": 3538 }, { "epoch": 0.24790074163580167, "grad_norm": 5.583502769470215, "learning_rate": 7.523319789842382e-05, "loss": 1.0303, "num_input_tokens_seen": 56933432, "step": 3539 }, { "epoch": 0.2479707898815309, "grad_norm": 3.6924219131469727, "learning_rate": 7.522619964973731e-05, "loss": 0.9386, "num_input_tokens_seen": 56949816, "step": 3540 }, { "epoch": 0.24804083812726016, "grad_norm": 6.476202487945557, "learning_rate": 7.52192014010508e-05, "loss": 1.1841, "num_input_tokens_seen": 56966064, "step": 3541 }, { "epoch": 0.2481108863729894, "grad_norm": 4.052863121032715, "learning_rate": 7.521220315236429e-05, "loss": 1.1133, "num_input_tokens_seen": 56982448, "step": 3542 }, { "epoch": 0.24818093461871865, "grad_norm": 6.569397926330566, "learning_rate": 7.520520490367776e-05, "loss": 1.1061, "num_input_tokens_seen": 56998832, "step": 3543 }, { "epoch": 0.24825098286444788, "grad_norm": 4.026796817779541, "learning_rate": 7.519820665499125e-05, "loss": 1.0121, "num_input_tokens_seen": 57014744, "step": 3544 }, { "epoch": 0.24832103111017714, "grad_norm": 3.705080032348633, "learning_rate": 7.519120840630472e-05, "loss": 1.0041, "num_input_tokens_seen": 57031128, "step": 3545 }, { "epoch": 0.24839107935590637, "grad_norm": 4.828441143035889, "learning_rate": 7.518421015761821e-05, "loss": 1.1551, "num_input_tokens_seen": 57047512, "step": 3546 }, { "epoch": 0.24846112760163563, "grad_norm": 5.6117777824401855, "learning_rate": 7.51772119089317e-05, "loss": 1.1555, "num_input_tokens_seen": 57063840, "step": 3547 }, { "epoch": 0.24853117584736487, "grad_norm": 3.955193281173706, "learning_rate": 7.517021366024519e-05, "loss": 1.0514, "num_input_tokens_seen": 57079936, "step": 3548 }, { "epoch": 0.24860122409309413, "grad_norm": 3.8878116607666016, "learning_rate": 7.516321541155868e-05, "loss": 1.0335, "num_input_tokens_seen": 57096320, "step": 3549 }, { "epoch": 0.24867127233882336, "grad_norm": 6.119873046875, "learning_rate": 7.515621716287217e-05, "loss": 1.0798, "num_input_tokens_seen": 57111632, "step": 3550 }, { "epoch": 0.24874132058455262, "grad_norm": 3.757984161376953, "learning_rate": 7.514921891418564e-05, "loss": 0.9911, "num_input_tokens_seen": 57128016, "step": 3551 }, { "epoch": 0.24881136883028185, "grad_norm": 4.173069953918457, "learning_rate": 7.514222066549912e-05, "loss": 1.1443, "num_input_tokens_seen": 57144400, "step": 3552 }, { "epoch": 0.2488814170760111, "grad_norm": 3.6985576152801514, "learning_rate": 7.51352224168126e-05, "loss": 1.1084, "num_input_tokens_seen": 57160784, "step": 3553 }, { "epoch": 0.24895146532174034, "grad_norm": 4.464880466461182, "learning_rate": 7.51282241681261e-05, "loss": 1.2571, "num_input_tokens_seen": 57177168, "step": 3554 }, { "epoch": 0.2490215135674696, "grad_norm": 3.699873447418213, "learning_rate": 7.512122591943958e-05, "loss": 0.9722, "num_input_tokens_seen": 57193208, "step": 3555 }, { "epoch": 0.24909156181319883, "grad_norm": 5.011424541473389, "learning_rate": 7.511422767075307e-05, "loss": 0.962, "num_input_tokens_seen": 57209592, "step": 3556 }, { "epoch": 0.2491616100589281, "grad_norm": 4.302685260772705, "learning_rate": 7.510722942206656e-05, "loss": 1.0718, "num_input_tokens_seen": 57225976, "step": 3557 }, { "epoch": 0.24923165830465732, "grad_norm": 3.991840362548828, "learning_rate": 7.510023117338004e-05, "loss": 1.0826, "num_input_tokens_seen": 57242168, "step": 3558 }, { "epoch": 0.24930170655038658, "grad_norm": 3.9910435676574707, "learning_rate": 7.509323292469352e-05, "loss": 1.2494, "num_input_tokens_seen": 57258552, "step": 3559 }, { "epoch": 0.2493717547961158, "grad_norm": 4.170960426330566, "learning_rate": 7.5086234676007e-05, "loss": 1.0068, "num_input_tokens_seen": 57274936, "step": 3560 }, { "epoch": 0.24944180304184507, "grad_norm": 4.317671298980713, "learning_rate": 7.50792364273205e-05, "loss": 1.0835, "num_input_tokens_seen": 57291320, "step": 3561 }, { "epoch": 0.2495118512875743, "grad_norm": 3.871293783187866, "learning_rate": 7.507223817863398e-05, "loss": 0.9352, "num_input_tokens_seen": 57307056, "step": 3562 }, { "epoch": 0.24958189953330356, "grad_norm": 4.014804840087891, "learning_rate": 7.506523992994747e-05, "loss": 1.2893, "num_input_tokens_seen": 57322376, "step": 3563 }, { "epoch": 0.2496519477790328, "grad_norm": 5.13847017288208, "learning_rate": 7.505824168126095e-05, "loss": 1.1943, "num_input_tokens_seen": 57338760, "step": 3564 }, { "epoch": 0.24972199602476206, "grad_norm": 3.7801826000213623, "learning_rate": 7.505124343257443e-05, "loss": 1.021, "num_input_tokens_seen": 57355144, "step": 3565 }, { "epoch": 0.24979204427049131, "grad_norm": 3.662065029144287, "learning_rate": 7.504424518388792e-05, "loss": 1.1461, "num_input_tokens_seen": 57371528, "step": 3566 }, { "epoch": 0.24986209251622055, "grad_norm": 4.548840522766113, "learning_rate": 7.50372469352014e-05, "loss": 1.058, "num_input_tokens_seen": 57387912, "step": 3567 }, { "epoch": 0.2499321407619498, "grad_norm": 4.732056140899658, "learning_rate": 7.50302486865149e-05, "loss": 1.0513, "num_input_tokens_seen": 57403120, "step": 3568 }, { "epoch": 0.25000218900767907, "grad_norm": 3.7986674308776855, "learning_rate": 7.502325043782837e-05, "loss": 0.9574, "num_input_tokens_seen": 57418800, "step": 3569 }, { "epoch": 0.25007223725340827, "grad_norm": 6.760079860687256, "learning_rate": 7.501625218914186e-05, "loss": 0.9101, "num_input_tokens_seen": 57432608, "step": 3570 }, { "epoch": 0.25014228549913753, "grad_norm": 4.0666985511779785, "learning_rate": 7.500925394045535e-05, "loss": 1.0564, "num_input_tokens_seen": 57448296, "step": 3571 }, { "epoch": 0.2502123337448668, "grad_norm": 3.7505650520324707, "learning_rate": 7.500225569176882e-05, "loss": 1.1593, "num_input_tokens_seen": 57464680, "step": 3572 }, { "epoch": 0.25028238199059605, "grad_norm": 5.1084675788879395, "learning_rate": 7.499525744308231e-05, "loss": 1.317, "num_input_tokens_seen": 57481032, "step": 3573 }, { "epoch": 0.25035243023632525, "grad_norm": 6.083080768585205, "learning_rate": 7.49882591943958e-05, "loss": 0.9305, "num_input_tokens_seen": 57497416, "step": 3574 }, { "epoch": 0.2504224784820545, "grad_norm": 4.197649955749512, "learning_rate": 7.498126094570929e-05, "loss": 1.0191, "num_input_tokens_seen": 57513800, "step": 3575 }, { "epoch": 0.25049252672778377, "grad_norm": 4.637972831726074, "learning_rate": 7.497426269702278e-05, "loss": 0.9914, "num_input_tokens_seen": 57529832, "step": 3576 }, { "epoch": 0.25056257497351303, "grad_norm": 4.096358776092529, "learning_rate": 7.496726444833626e-05, "loss": 1.1909, "num_input_tokens_seen": 57545432, "step": 3577 }, { "epoch": 0.25063262321924223, "grad_norm": 3.9253315925598145, "learning_rate": 7.496026619964974e-05, "loss": 1.1383, "num_input_tokens_seen": 57561816, "step": 3578 }, { "epoch": 0.2507026714649715, "grad_norm": 5.603836536407471, "learning_rate": 7.495326795096321e-05, "loss": 1.0744, "num_input_tokens_seen": 57577336, "step": 3579 }, { "epoch": 0.25077271971070075, "grad_norm": 4.588653564453125, "learning_rate": 7.49462697022767e-05, "loss": 1.0896, "num_input_tokens_seen": 57593720, "step": 3580 }, { "epoch": 0.25084276795643, "grad_norm": 3.989229917526245, "learning_rate": 7.49392714535902e-05, "loss": 0.9605, "num_input_tokens_seen": 57609656, "step": 3581 }, { "epoch": 0.2509128162021592, "grad_norm": 4.728183269500732, "learning_rate": 7.493227320490368e-05, "loss": 1.2626, "num_input_tokens_seen": 57626040, "step": 3582 }, { "epoch": 0.2509828644478885, "grad_norm": 4.269988059997559, "learning_rate": 7.492527495621717e-05, "loss": 1.0987, "num_input_tokens_seen": 57641280, "step": 3583 }, { "epoch": 0.25105291269361774, "grad_norm": 6.506377696990967, "learning_rate": 7.491827670753066e-05, "loss": 0.9327, "num_input_tokens_seen": 57657664, "step": 3584 }, { "epoch": 0.251122960939347, "grad_norm": 6.415282726287842, "learning_rate": 7.491127845884413e-05, "loss": 0.9515, "num_input_tokens_seen": 57672704, "step": 3585 }, { "epoch": 0.2511930091850762, "grad_norm": 3.969257116317749, "learning_rate": 7.490428021015761e-05, "loss": 1.1255, "num_input_tokens_seen": 57687504, "step": 3586 }, { "epoch": 0.25126305743080546, "grad_norm": 3.493469476699829, "learning_rate": 7.489728196147111e-05, "loss": 0.95, "num_input_tokens_seen": 57703512, "step": 3587 }, { "epoch": 0.2513331056765347, "grad_norm": 5.777353763580322, "learning_rate": 7.48902837127846e-05, "loss": 1.0089, "num_input_tokens_seen": 57719344, "step": 3588 }, { "epoch": 0.251403153922264, "grad_norm": 3.6840991973876953, "learning_rate": 7.488328546409807e-05, "loss": 1.0351, "num_input_tokens_seen": 57734848, "step": 3589 }, { "epoch": 0.2514732021679932, "grad_norm": 6.526551246643066, "learning_rate": 7.487628721541156e-05, "loss": 1.1651, "num_input_tokens_seen": 57751232, "step": 3590 }, { "epoch": 0.25154325041372244, "grad_norm": 3.7879719734191895, "learning_rate": 7.486928896672505e-05, "loss": 1.0128, "num_input_tokens_seen": 57767616, "step": 3591 }, { "epoch": 0.2516132986594517, "grad_norm": 6.891875267028809, "learning_rate": 7.486229071803853e-05, "loss": 1.2037, "num_input_tokens_seen": 57783592, "step": 3592 }, { "epoch": 0.25168334690518096, "grad_norm": 4.700318336486816, "learning_rate": 7.485529246935201e-05, "loss": 1.0291, "num_input_tokens_seen": 57799976, "step": 3593 }, { "epoch": 0.25175339515091016, "grad_norm": 6.47390604019165, "learning_rate": 7.48482942206655e-05, "loss": 0.9828, "num_input_tokens_seen": 57816360, "step": 3594 }, { "epoch": 0.2518234433966394, "grad_norm": 5.045449733734131, "learning_rate": 7.484129597197899e-05, "loss": 0.9569, "num_input_tokens_seen": 57832016, "step": 3595 }, { "epoch": 0.2518934916423687, "grad_norm": 4.258456230163574, "learning_rate": 7.483429772329247e-05, "loss": 0.9804, "num_input_tokens_seen": 57848400, "step": 3596 }, { "epoch": 0.25196353988809794, "grad_norm": 3.948582649230957, "learning_rate": 7.482729947460596e-05, "loss": 0.9898, "num_input_tokens_seen": 57864784, "step": 3597 }, { "epoch": 0.25203358813382715, "grad_norm": 4.017141342163086, "learning_rate": 7.482030122591944e-05, "loss": 0.8644, "num_input_tokens_seen": 57879696, "step": 3598 }, { "epoch": 0.2521036363795564, "grad_norm": 3.7428297996520996, "learning_rate": 7.481330297723292e-05, "loss": 0.9318, "num_input_tokens_seen": 57896080, "step": 3599 }, { "epoch": 0.25217368462528567, "grad_norm": 4.883368968963623, "learning_rate": 7.480630472854641e-05, "loss": 0.9771, "num_input_tokens_seen": 57911976, "step": 3600 }, { "epoch": 0.25217368462528567, "eval_loss": 1.136000633239746, "eval_runtime": 0.2016, "eval_samples_per_second": 4.959, "eval_steps_per_second": 4.959, "num_input_tokens_seen": 57911976, "step": 3600 }, { "epoch": 0.2522437328710149, "grad_norm": 4.399716377258301, "learning_rate": 7.479930647985991e-05, "loss": 0.9965, "num_input_tokens_seen": 57927440, "step": 3601 }, { "epoch": 0.2523137811167442, "grad_norm": 6.019199371337891, "learning_rate": 7.479230823117338e-05, "loss": 1.1172, "num_input_tokens_seen": 57943824, "step": 3602 }, { "epoch": 0.2523838293624734, "grad_norm": 4.42507266998291, "learning_rate": 7.478530998248687e-05, "loss": 1.1294, "num_input_tokens_seen": 57960208, "step": 3603 }, { "epoch": 0.25245387760820265, "grad_norm": 4.0232062339782715, "learning_rate": 7.477831173380036e-05, "loss": 1.031, "num_input_tokens_seen": 57976560, "step": 3604 }, { "epoch": 0.2525239258539319, "grad_norm": 3.6392862796783447, "learning_rate": 7.477131348511384e-05, "loss": 0.8717, "num_input_tokens_seen": 57992944, "step": 3605 }, { "epoch": 0.25259397409966117, "grad_norm": 3.849912643432617, "learning_rate": 7.476431523642731e-05, "loss": 0.994, "num_input_tokens_seen": 58009328, "step": 3606 }, { "epoch": 0.25266402234539037, "grad_norm": 3.5331156253814697, "learning_rate": 7.475731698774081e-05, "loss": 0.8999, "num_input_tokens_seen": 58025152, "step": 3607 }, { "epoch": 0.25273407059111963, "grad_norm": 4.343970775604248, "learning_rate": 7.47503187390543e-05, "loss": 1.0231, "num_input_tokens_seen": 58041536, "step": 3608 }, { "epoch": 0.2528041188368489, "grad_norm": 3.6736862659454346, "learning_rate": 7.474332049036778e-05, "loss": 1.161, "num_input_tokens_seen": 58057920, "step": 3609 }, { "epoch": 0.25287416708257815, "grad_norm": 6.599121570587158, "learning_rate": 7.473632224168127e-05, "loss": 1.2235, "num_input_tokens_seen": 58073784, "step": 3610 }, { "epoch": 0.25294421532830735, "grad_norm": 4.2448930740356445, "learning_rate": 7.472932399299475e-05, "loss": 1.0207, "num_input_tokens_seen": 58088776, "step": 3611 }, { "epoch": 0.2530142635740366, "grad_norm": 3.416584014892578, "learning_rate": 7.472232574430823e-05, "loss": 0.984, "num_input_tokens_seen": 58105160, "step": 3612 }, { "epoch": 0.2530843118197659, "grad_norm": 3.9348700046539307, "learning_rate": 7.471532749562172e-05, "loss": 1.0883, "num_input_tokens_seen": 58121528, "step": 3613 }, { "epoch": 0.25315436006549513, "grad_norm": 6.208236217498779, "learning_rate": 7.470832924693521e-05, "loss": 1.1842, "num_input_tokens_seen": 58137912, "step": 3614 }, { "epoch": 0.25322440831122434, "grad_norm": 3.9069888591766357, "learning_rate": 7.47013309982487e-05, "loss": 0.9958, "num_input_tokens_seen": 58154056, "step": 3615 }, { "epoch": 0.2532944565569536, "grad_norm": 4.482925891876221, "learning_rate": 7.469433274956217e-05, "loss": 1.0365, "num_input_tokens_seen": 58168904, "step": 3616 }, { "epoch": 0.25336450480268286, "grad_norm": 4.082488536834717, "learning_rate": 7.468733450087566e-05, "loss": 0.9116, "num_input_tokens_seen": 58185288, "step": 3617 }, { "epoch": 0.2534345530484121, "grad_norm": 5.994426250457764, "learning_rate": 7.468033625218915e-05, "loss": 1.1286, "num_input_tokens_seen": 58201600, "step": 3618 }, { "epoch": 0.2535046012941413, "grad_norm": 3.966487169265747, "learning_rate": 7.467333800350262e-05, "loss": 1.061, "num_input_tokens_seen": 58217752, "step": 3619 }, { "epoch": 0.2535746495398706, "grad_norm": 4.3370537757873535, "learning_rate": 7.466633975481611e-05, "loss": 0.9495, "num_input_tokens_seen": 58233672, "step": 3620 }, { "epoch": 0.25364469778559984, "grad_norm": 4.638936519622803, "learning_rate": 7.465934150612961e-05, "loss": 1.1593, "num_input_tokens_seen": 58249904, "step": 3621 }, { "epoch": 0.2537147460313291, "grad_norm": 3.42993426322937, "learning_rate": 7.465234325744309e-05, "loss": 0.9112, "num_input_tokens_seen": 58265272, "step": 3622 }, { "epoch": 0.2537847942770583, "grad_norm": 4.637670516967773, "learning_rate": 7.464534500875656e-05, "loss": 1.1578, "num_input_tokens_seen": 58281656, "step": 3623 }, { "epoch": 0.25385484252278756, "grad_norm": 4.470972061157227, "learning_rate": 7.463834676007005e-05, "loss": 0.9973, "num_input_tokens_seen": 58297696, "step": 3624 }, { "epoch": 0.2539248907685168, "grad_norm": 4.158536434173584, "learning_rate": 7.463134851138354e-05, "loss": 1.2625, "num_input_tokens_seen": 58313960, "step": 3625 }, { "epoch": 0.2539949390142461, "grad_norm": 5.2940850257873535, "learning_rate": 7.462435026269702e-05, "loss": 1.1649, "num_input_tokens_seen": 58329928, "step": 3626 }, { "epoch": 0.2540649872599753, "grad_norm": 4.270470142364502, "learning_rate": 7.461735201401052e-05, "loss": 0.9042, "num_input_tokens_seen": 58345544, "step": 3627 }, { "epoch": 0.25413503550570454, "grad_norm": 4.488008975982666, "learning_rate": 7.4610353765324e-05, "loss": 1.2652, "num_input_tokens_seen": 58361736, "step": 3628 }, { "epoch": 0.2542050837514338, "grad_norm": 3.9760642051696777, "learning_rate": 7.460335551663748e-05, "loss": 0.9522, "num_input_tokens_seen": 58377888, "step": 3629 }, { "epoch": 0.25427513199716306, "grad_norm": 4.022678852081299, "learning_rate": 7.459635726795097e-05, "loss": 1.0673, "num_input_tokens_seen": 58393744, "step": 3630 }, { "epoch": 0.25434518024289227, "grad_norm": 6.345690727233887, "learning_rate": 7.458935901926446e-05, "loss": 1.052, "num_input_tokens_seen": 58410064, "step": 3631 }, { "epoch": 0.2544152284886215, "grad_norm": 4.0159101486206055, "learning_rate": 7.458236077057793e-05, "loss": 1.1164, "num_input_tokens_seen": 58426352, "step": 3632 }, { "epoch": 0.2544852767343508, "grad_norm": 4.125208854675293, "learning_rate": 7.457536252189142e-05, "loss": 1.0113, "num_input_tokens_seen": 58441936, "step": 3633 }, { "epoch": 0.25455532498008004, "grad_norm": 4.429535865783691, "learning_rate": 7.456836427320491e-05, "loss": 1.158, "num_input_tokens_seen": 58457136, "step": 3634 }, { "epoch": 0.2546253732258093, "grad_norm": 3.655606269836426, "learning_rate": 7.45613660245184e-05, "loss": 1.0467, "num_input_tokens_seen": 58473520, "step": 3635 }, { "epoch": 0.2546954214715385, "grad_norm": 3.688188314437866, "learning_rate": 7.455436777583187e-05, "loss": 0.9309, "num_input_tokens_seen": 58489904, "step": 3636 }, { "epoch": 0.25476546971726777, "grad_norm": 3.95440411567688, "learning_rate": 7.454736952714536e-05, "loss": 1.2586, "num_input_tokens_seen": 58506032, "step": 3637 }, { "epoch": 0.254835517962997, "grad_norm": 3.950641632080078, "learning_rate": 7.454037127845885e-05, "loss": 0.9397, "num_input_tokens_seen": 58521464, "step": 3638 }, { "epoch": 0.2549055662087263, "grad_norm": 4.9607038497924805, "learning_rate": 7.453337302977233e-05, "loss": 1.0498, "num_input_tokens_seen": 58537848, "step": 3639 }, { "epoch": 0.2549756144544555, "grad_norm": 3.4168713092803955, "learning_rate": 7.452637478108582e-05, "loss": 0.8983, "num_input_tokens_seen": 58554232, "step": 3640 }, { "epoch": 0.25504566270018475, "grad_norm": 6.897549152374268, "learning_rate": 7.451937653239932e-05, "loss": 1.2782, "num_input_tokens_seen": 58570616, "step": 3641 }, { "epoch": 0.255115710945914, "grad_norm": 4.009060859680176, "learning_rate": 7.451237828371279e-05, "loss": 1.0205, "num_input_tokens_seen": 58587000, "step": 3642 }, { "epoch": 0.25518575919164327, "grad_norm": 4.245255470275879, "learning_rate": 7.450538003502627e-05, "loss": 0.98, "num_input_tokens_seen": 58602768, "step": 3643 }, { "epoch": 0.2552558074373725, "grad_norm": 3.7547385692596436, "learning_rate": 7.449838178633976e-05, "loss": 1.0763, "num_input_tokens_seen": 58619024, "step": 3644 }, { "epoch": 0.25532585568310173, "grad_norm": 5.7543745040893555, "learning_rate": 7.449138353765324e-05, "loss": 1.1535, "num_input_tokens_seen": 58635408, "step": 3645 }, { "epoch": 0.255395903928831, "grad_norm": 3.8786420822143555, "learning_rate": 7.448438528896672e-05, "loss": 1.0385, "num_input_tokens_seen": 58651392, "step": 3646 }, { "epoch": 0.25546595217456025, "grad_norm": 4.290858745574951, "learning_rate": 7.447738704028022e-05, "loss": 0.9459, "num_input_tokens_seen": 58667712, "step": 3647 }, { "epoch": 0.25553600042028946, "grad_norm": 3.8005576133728027, "learning_rate": 7.447038879159371e-05, "loss": 1.1709, "num_input_tokens_seen": 58683512, "step": 3648 }, { "epoch": 0.2556060486660187, "grad_norm": 3.574735403060913, "learning_rate": 7.446339054290719e-05, "loss": 1.0276, "num_input_tokens_seen": 58699296, "step": 3649 }, { "epoch": 0.255676096911748, "grad_norm": 4.487549304962158, "learning_rate": 7.445639229422066e-05, "loss": 1.0608, "num_input_tokens_seen": 58715680, "step": 3650 }, { "epoch": 0.25574614515747723, "grad_norm": 3.80549955368042, "learning_rate": 7.444939404553415e-05, "loss": 1.0916, "num_input_tokens_seen": 58732064, "step": 3651 }, { "epoch": 0.25581619340320644, "grad_norm": 6.745276927947998, "learning_rate": 7.444239579684764e-05, "loss": 0.9649, "num_input_tokens_seen": 58748416, "step": 3652 }, { "epoch": 0.2558862416489357, "grad_norm": 5.366410732269287, "learning_rate": 7.443539754816113e-05, "loss": 1.1205, "num_input_tokens_seen": 58764800, "step": 3653 }, { "epoch": 0.25595628989466496, "grad_norm": 4.889951705932617, "learning_rate": 7.442839929947462e-05, "loss": 1.0447, "num_input_tokens_seen": 58779776, "step": 3654 }, { "epoch": 0.2560263381403942, "grad_norm": 3.776078462600708, "learning_rate": 7.44214010507881e-05, "loss": 0.9146, "num_input_tokens_seen": 58796160, "step": 3655 }, { "epoch": 0.2560963863861234, "grad_norm": 4.999850749969482, "learning_rate": 7.441440280210158e-05, "loss": 1.053, "num_input_tokens_seen": 58812544, "step": 3656 }, { "epoch": 0.2561664346318527, "grad_norm": 4.111214637756348, "learning_rate": 7.440740455341507e-05, "loss": 1.02, "num_input_tokens_seen": 58828696, "step": 3657 }, { "epoch": 0.25623648287758194, "grad_norm": 4.49043083190918, "learning_rate": 7.440040630472856e-05, "loss": 0.8889, "num_input_tokens_seen": 58845080, "step": 3658 }, { "epoch": 0.2563065311233112, "grad_norm": 4.440788745880127, "learning_rate": 7.439340805604203e-05, "loss": 1.0635, "num_input_tokens_seen": 58861464, "step": 3659 }, { "epoch": 0.2563765793690404, "grad_norm": 5.642586708068848, "learning_rate": 7.438640980735552e-05, "loss": 1.3676, "num_input_tokens_seen": 58877624, "step": 3660 }, { "epoch": 0.25644662761476966, "grad_norm": 3.8768467903137207, "learning_rate": 7.437941155866901e-05, "loss": 0.9737, "num_input_tokens_seen": 58894008, "step": 3661 }, { "epoch": 0.2565166758604989, "grad_norm": 3.9855473041534424, "learning_rate": 7.43724133099825e-05, "loss": 1.0987, "num_input_tokens_seen": 58909600, "step": 3662 }, { "epoch": 0.2565867241062282, "grad_norm": 3.6692938804626465, "learning_rate": 7.436541506129597e-05, "loss": 1.0541, "num_input_tokens_seen": 58925776, "step": 3663 }, { "epoch": 0.2566567723519574, "grad_norm": 3.87776517868042, "learning_rate": 7.435841681260946e-05, "loss": 1.0616, "num_input_tokens_seen": 58941048, "step": 3664 }, { "epoch": 0.25672682059768664, "grad_norm": 3.5173263549804688, "learning_rate": 7.435141856392295e-05, "loss": 0.9046, "num_input_tokens_seen": 58957432, "step": 3665 }, { "epoch": 0.2567968688434159, "grad_norm": 4.312611103057861, "learning_rate": 7.434442031523642e-05, "loss": 0.8224, "num_input_tokens_seen": 58973816, "step": 3666 }, { "epoch": 0.25686691708914516, "grad_norm": 3.7889907360076904, "learning_rate": 7.433742206654991e-05, "loss": 1.1431, "num_input_tokens_seen": 58989472, "step": 3667 }, { "epoch": 0.25693696533487437, "grad_norm": 4.997755527496338, "learning_rate": 7.433042381786341e-05, "loss": 1.2147, "num_input_tokens_seen": 59005856, "step": 3668 }, { "epoch": 0.2570070135806036, "grad_norm": 5.839511871337891, "learning_rate": 7.432342556917689e-05, "loss": 1.0974, "num_input_tokens_seen": 59022176, "step": 3669 }, { "epoch": 0.2570770618263329, "grad_norm": 4.185897350311279, "learning_rate": 7.431642732049036e-05, "loss": 1.0769, "num_input_tokens_seen": 59038296, "step": 3670 }, { "epoch": 0.25714711007206215, "grad_norm": 3.6666383743286133, "learning_rate": 7.430942907180385e-05, "loss": 1.0051, "num_input_tokens_seen": 59054680, "step": 3671 }, { "epoch": 0.2572171583177914, "grad_norm": 3.8587453365325928, "learning_rate": 7.430243082311734e-05, "loss": 1.1001, "num_input_tokens_seen": 59070912, "step": 3672 }, { "epoch": 0.2572872065635206, "grad_norm": 3.6518352031707764, "learning_rate": 7.429543257443083e-05, "loss": 1.042, "num_input_tokens_seen": 59087296, "step": 3673 }, { "epoch": 0.25735725480924987, "grad_norm": 4.629798412322998, "learning_rate": 7.428843432574432e-05, "loss": 1.2649, "num_input_tokens_seen": 59103632, "step": 3674 }, { "epoch": 0.25742730305497913, "grad_norm": 6.353034496307373, "learning_rate": 7.428143607705781e-05, "loss": 1.3823, "num_input_tokens_seen": 59120016, "step": 3675 }, { "epoch": 0.2574973513007084, "grad_norm": 6.1848273277282715, "learning_rate": 7.427443782837128e-05, "loss": 1.2275, "num_input_tokens_seen": 59136232, "step": 3676 }, { "epoch": 0.2575673995464376, "grad_norm": 3.6022186279296875, "learning_rate": 7.426743957968476e-05, "loss": 0.9513, "num_input_tokens_seen": 59152616, "step": 3677 }, { "epoch": 0.25763744779216685, "grad_norm": 3.6495468616485596, "learning_rate": 7.426044133099825e-05, "loss": 1.0282, "num_input_tokens_seen": 59167792, "step": 3678 }, { "epoch": 0.2577074960378961, "grad_norm": 4.675189018249512, "learning_rate": 7.425344308231174e-05, "loss": 1.1248, "num_input_tokens_seen": 59184176, "step": 3679 }, { "epoch": 0.25777754428362537, "grad_norm": 3.657700538635254, "learning_rate": 7.424644483362522e-05, "loss": 1.0445, "num_input_tokens_seen": 59199632, "step": 3680 }, { "epoch": 0.2578475925293546, "grad_norm": 3.9934394359588623, "learning_rate": 7.423944658493871e-05, "loss": 1.0598, "num_input_tokens_seen": 59215720, "step": 3681 }, { "epoch": 0.25791764077508383, "grad_norm": 3.777191400527954, "learning_rate": 7.42324483362522e-05, "loss": 1.2255, "num_input_tokens_seen": 59231248, "step": 3682 }, { "epoch": 0.2579876890208131, "grad_norm": 3.9812276363372803, "learning_rate": 7.422545008756568e-05, "loss": 1.1253, "num_input_tokens_seen": 59247280, "step": 3683 }, { "epoch": 0.25805773726654235, "grad_norm": 3.631455183029175, "learning_rate": 7.421845183887916e-05, "loss": 1.0559, "num_input_tokens_seen": 59263664, "step": 3684 }, { "epoch": 0.25812778551227156, "grad_norm": 3.803898334503174, "learning_rate": 7.421145359019265e-05, "loss": 0.9847, "num_input_tokens_seen": 59279880, "step": 3685 }, { "epoch": 0.2581978337580008, "grad_norm": 3.649956703186035, "learning_rate": 7.420445534150613e-05, "loss": 0.951, "num_input_tokens_seen": 59296216, "step": 3686 }, { "epoch": 0.2582678820037301, "grad_norm": 4.010924339294434, "learning_rate": 7.419745709281962e-05, "loss": 1.1987, "num_input_tokens_seen": 59312448, "step": 3687 }, { "epoch": 0.25833793024945934, "grad_norm": 4.2410759925842285, "learning_rate": 7.41904588441331e-05, "loss": 0.9677, "num_input_tokens_seen": 59328456, "step": 3688 }, { "epoch": 0.25840797849518854, "grad_norm": 3.9170684814453125, "learning_rate": 7.41834605954466e-05, "loss": 1.0795, "num_input_tokens_seen": 59344840, "step": 3689 }, { "epoch": 0.2584780267409178, "grad_norm": 3.935624837875366, "learning_rate": 7.417646234676007e-05, "loss": 1.1149, "num_input_tokens_seen": 59360040, "step": 3690 }, { "epoch": 0.25854807498664706, "grad_norm": 4.3747782707214355, "learning_rate": 7.416946409807356e-05, "loss": 1.2462, "num_input_tokens_seen": 59375896, "step": 3691 }, { "epoch": 0.2586181232323763, "grad_norm": 7.553433418273926, "learning_rate": 7.416246584938705e-05, "loss": 1.4753, "num_input_tokens_seen": 59391144, "step": 3692 }, { "epoch": 0.2586881714781055, "grad_norm": 3.4443981647491455, "learning_rate": 7.415546760070053e-05, "loss": 1.0629, "num_input_tokens_seen": 59407528, "step": 3693 }, { "epoch": 0.2587582197238348, "grad_norm": 4.02165412902832, "learning_rate": 7.414846935201401e-05, "loss": 0.9951, "num_input_tokens_seen": 59422824, "step": 3694 }, { "epoch": 0.25882826796956404, "grad_norm": 3.8880200386047363, "learning_rate": 7.414147110332751e-05, "loss": 0.941, "num_input_tokens_seen": 59439208, "step": 3695 }, { "epoch": 0.2588983162152933, "grad_norm": 5.463441371917725, "learning_rate": 7.413447285464099e-05, "loss": 0.9333, "num_input_tokens_seen": 59455592, "step": 3696 }, { "epoch": 0.2589683644610225, "grad_norm": 7.555225372314453, "learning_rate": 7.412747460595446e-05, "loss": 1.2278, "num_input_tokens_seen": 59471976, "step": 3697 }, { "epoch": 0.25903841270675176, "grad_norm": 5.7154436111450195, "learning_rate": 7.412047635726795e-05, "loss": 1.151, "num_input_tokens_seen": 59488360, "step": 3698 }, { "epoch": 0.259108460952481, "grad_norm": 5.09559965133667, "learning_rate": 7.411347810858144e-05, "loss": 1.0998, "num_input_tokens_seen": 59504536, "step": 3699 }, { "epoch": 0.2591785091982103, "grad_norm": 4.7749738693237305, "learning_rate": 7.410647985989493e-05, "loss": 1.2971, "num_input_tokens_seen": 59520488, "step": 3700 }, { "epoch": 0.2592485574439395, "grad_norm": 4.323631763458252, "learning_rate": 7.409948161120842e-05, "loss": 1.1687, "num_input_tokens_seen": 59535384, "step": 3701 }, { "epoch": 0.25931860568966875, "grad_norm": 3.511822462081909, "learning_rate": 7.40924833625219e-05, "loss": 1.0547, "num_input_tokens_seen": 59550888, "step": 3702 }, { "epoch": 0.259388653935398, "grad_norm": 4.039402008056641, "learning_rate": 7.408548511383538e-05, "loss": 0.8453, "num_input_tokens_seen": 59567184, "step": 3703 }, { "epoch": 0.25945870218112727, "grad_norm": 3.6692605018615723, "learning_rate": 7.407848686514885e-05, "loss": 0.9705, "num_input_tokens_seen": 59583568, "step": 3704 }, { "epoch": 0.2595287504268565, "grad_norm": 4.414707660675049, "learning_rate": 7.407148861646234e-05, "loss": 0.8734, "num_input_tokens_seen": 59599088, "step": 3705 }, { "epoch": 0.25959879867258573, "grad_norm": 4.073670387268066, "learning_rate": 7.406449036777583e-05, "loss": 1.2958, "num_input_tokens_seen": 59615432, "step": 3706 }, { "epoch": 0.259668846918315, "grad_norm": 4.436419486999512, "learning_rate": 7.405749211908932e-05, "loss": 1.0019, "num_input_tokens_seen": 59631816, "step": 3707 }, { "epoch": 0.25973889516404425, "grad_norm": 5.866218090057373, "learning_rate": 7.405049387040281e-05, "loss": 1.043, "num_input_tokens_seen": 59648200, "step": 3708 }, { "epoch": 0.2598089434097735, "grad_norm": 4.133188247680664, "learning_rate": 7.40434956217163e-05, "loss": 1.1168, "num_input_tokens_seen": 59664584, "step": 3709 }, { "epoch": 0.2598789916555027, "grad_norm": 4.1976213455200195, "learning_rate": 7.403649737302977e-05, "loss": 1.1118, "num_input_tokens_seen": 59680288, "step": 3710 }, { "epoch": 0.25994903990123197, "grad_norm": 3.990983009338379, "learning_rate": 7.402949912434326e-05, "loss": 0.9963, "num_input_tokens_seen": 59696408, "step": 3711 }, { "epoch": 0.26001908814696123, "grad_norm": 4.427793025970459, "learning_rate": 7.402250087565675e-05, "loss": 1.1771, "num_input_tokens_seen": 59712792, "step": 3712 }, { "epoch": 0.2600891363926905, "grad_norm": 5.360867023468018, "learning_rate": 7.401550262697024e-05, "loss": 1.1428, "num_input_tokens_seen": 59728968, "step": 3713 }, { "epoch": 0.2601591846384197, "grad_norm": 3.8442916870117188, "learning_rate": 7.400850437828371e-05, "loss": 0.9544, "num_input_tokens_seen": 59745352, "step": 3714 }, { "epoch": 0.26022923288414895, "grad_norm": 3.7610833644866943, "learning_rate": 7.40015061295972e-05, "loss": 0.9969, "num_input_tokens_seen": 59761736, "step": 3715 }, { "epoch": 0.2602992811298782, "grad_norm": 3.9050705432891846, "learning_rate": 7.399450788091069e-05, "loss": 1.2099, "num_input_tokens_seen": 59778000, "step": 3716 }, { "epoch": 0.2603693293756075, "grad_norm": 4.293839454650879, "learning_rate": 7.398750963222417e-05, "loss": 1.0274, "num_input_tokens_seen": 59794216, "step": 3717 }, { "epoch": 0.2604393776213367, "grad_norm": 3.7403993606567383, "learning_rate": 7.398051138353765e-05, "loss": 1.0172, "num_input_tokens_seen": 59810600, "step": 3718 }, { "epoch": 0.26050942586706594, "grad_norm": 5.266970157623291, "learning_rate": 7.397351313485114e-05, "loss": 0.8695, "num_input_tokens_seen": 59826984, "step": 3719 }, { "epoch": 0.2605794741127952, "grad_norm": 4.385645866394043, "learning_rate": 7.396651488616463e-05, "loss": 1.0625, "num_input_tokens_seen": 59843368, "step": 3720 }, { "epoch": 0.26064952235852445, "grad_norm": 4.349147796630859, "learning_rate": 7.39595166374781e-05, "loss": 1.2092, "num_input_tokens_seen": 59859136, "step": 3721 }, { "epoch": 0.26071957060425366, "grad_norm": 4.69277286529541, "learning_rate": 7.395251838879161e-05, "loss": 1.1171, "num_input_tokens_seen": 59875024, "step": 3722 }, { "epoch": 0.2607896188499829, "grad_norm": 3.602949857711792, "learning_rate": 7.394552014010508e-05, "loss": 1.0994, "num_input_tokens_seen": 59891408, "step": 3723 }, { "epoch": 0.2608596670957122, "grad_norm": 4.137026786804199, "learning_rate": 7.393852189141856e-05, "loss": 1.0414, "num_input_tokens_seen": 59906360, "step": 3724 }, { "epoch": 0.26092971534144144, "grad_norm": 4.558672904968262, "learning_rate": 7.393152364273205e-05, "loss": 1.2051, "num_input_tokens_seen": 59922744, "step": 3725 }, { "epoch": 0.26099976358717064, "grad_norm": 3.977217197418213, "learning_rate": 7.392452539404554e-05, "loss": 0.9036, "num_input_tokens_seen": 59938448, "step": 3726 }, { "epoch": 0.2610698118328999, "grad_norm": 6.573578834533691, "learning_rate": 7.391752714535902e-05, "loss": 0.9693, "num_input_tokens_seen": 59954832, "step": 3727 }, { "epoch": 0.26113986007862916, "grad_norm": 4.253365516662598, "learning_rate": 7.391052889667251e-05, "loss": 1.1001, "num_input_tokens_seen": 59971216, "step": 3728 }, { "epoch": 0.2612099083243584, "grad_norm": 4.279355525970459, "learning_rate": 7.3903530647986e-05, "loss": 1.0456, "num_input_tokens_seen": 59987384, "step": 3729 }, { "epoch": 0.2612799565700876, "grad_norm": 5.5035505294799805, "learning_rate": 7.389653239929948e-05, "loss": 1.236, "num_input_tokens_seen": 60003720, "step": 3730 }, { "epoch": 0.2613500048158169, "grad_norm": 5.064812660217285, "learning_rate": 7.388953415061295e-05, "loss": 0.8739, "num_input_tokens_seen": 60020104, "step": 3731 }, { "epoch": 0.26142005306154614, "grad_norm": 4.716748237609863, "learning_rate": 7.388253590192644e-05, "loss": 1.2417, "num_input_tokens_seen": 60036488, "step": 3732 }, { "epoch": 0.2614901013072754, "grad_norm": 4.0947489738464355, "learning_rate": 7.387553765323994e-05, "loss": 1.1332, "num_input_tokens_seen": 60052384, "step": 3733 }, { "epoch": 0.2615601495530046, "grad_norm": 3.757126808166504, "learning_rate": 7.386853940455342e-05, "loss": 1.0442, "num_input_tokens_seen": 60068624, "step": 3734 }, { "epoch": 0.26163019779873387, "grad_norm": 7.364987850189209, "learning_rate": 7.38615411558669e-05, "loss": 1.0285, "num_input_tokens_seen": 60084248, "step": 3735 }, { "epoch": 0.2617002460444631, "grad_norm": 4.630516052246094, "learning_rate": 7.38545429071804e-05, "loss": 1.1585, "num_input_tokens_seen": 60100632, "step": 3736 }, { "epoch": 0.2617702942901924, "grad_norm": 5.3436760902404785, "learning_rate": 7.384754465849387e-05, "loss": 0.9723, "num_input_tokens_seen": 60116672, "step": 3737 }, { "epoch": 0.2618403425359216, "grad_norm": 3.843344211578369, "learning_rate": 7.384054640980736e-05, "loss": 0.8992, "num_input_tokens_seen": 60133056, "step": 3738 }, { "epoch": 0.26191039078165085, "grad_norm": 4.561652183532715, "learning_rate": 7.383354816112085e-05, "loss": 1.2304, "num_input_tokens_seen": 60149440, "step": 3739 }, { "epoch": 0.2619804390273801, "grad_norm": 3.951719045639038, "learning_rate": 7.382654991243434e-05, "loss": 0.8449, "num_input_tokens_seen": 60165824, "step": 3740 }, { "epoch": 0.26205048727310937, "grad_norm": 3.702449321746826, "learning_rate": 7.381955166374781e-05, "loss": 1.1251, "num_input_tokens_seen": 60181496, "step": 3741 }, { "epoch": 0.2621205355188386, "grad_norm": 5.43525505065918, "learning_rate": 7.38125534150613e-05, "loss": 1.1107, "num_input_tokens_seen": 60197040, "step": 3742 }, { "epoch": 0.26219058376456783, "grad_norm": 3.9709503650665283, "learning_rate": 7.380555516637479e-05, "loss": 1.1172, "num_input_tokens_seen": 60213424, "step": 3743 }, { "epoch": 0.2622606320102971, "grad_norm": 3.7183797359466553, "learning_rate": 7.379855691768826e-05, "loss": 1.0234, "num_input_tokens_seen": 60229696, "step": 3744 }, { "epoch": 0.26233068025602635, "grad_norm": 3.933479070663452, "learning_rate": 7.379155866900175e-05, "loss": 1.0702, "num_input_tokens_seen": 60246080, "step": 3745 }, { "epoch": 0.2624007285017556, "grad_norm": 4.837695598602295, "learning_rate": 7.378456042031524e-05, "loss": 1.1017, "num_input_tokens_seen": 60262464, "step": 3746 }, { "epoch": 0.2624707767474848, "grad_norm": 4.791194438934326, "learning_rate": 7.377756217162873e-05, "loss": 1.2467, "num_input_tokens_seen": 60278600, "step": 3747 }, { "epoch": 0.2625408249932141, "grad_norm": 4.53259801864624, "learning_rate": 7.37705639229422e-05, "loss": 1.1742, "num_input_tokens_seen": 60293856, "step": 3748 }, { "epoch": 0.26261087323894333, "grad_norm": 3.87522554397583, "learning_rate": 7.37635656742557e-05, "loss": 1.1901, "num_input_tokens_seen": 60309888, "step": 3749 }, { "epoch": 0.2626809214846726, "grad_norm": 4.46868896484375, "learning_rate": 7.375656742556918e-05, "loss": 0.9265, "num_input_tokens_seen": 60325784, "step": 3750 }, { "epoch": 0.2627509697304018, "grad_norm": 3.938703775405884, "learning_rate": 7.374956917688266e-05, "loss": 0.9785, "num_input_tokens_seen": 60340696, "step": 3751 }, { "epoch": 0.26282101797613105, "grad_norm": 3.5147759914398193, "learning_rate": 7.374257092819614e-05, "loss": 1.0984, "num_input_tokens_seen": 60357080, "step": 3752 }, { "epoch": 0.2628910662218603, "grad_norm": 4.008304119110107, "learning_rate": 7.373557267950965e-05, "loss": 1.046, "num_input_tokens_seen": 60373464, "step": 3753 }, { "epoch": 0.2629611144675896, "grad_norm": 3.9318859577178955, "learning_rate": 7.372857443082312e-05, "loss": 1.001, "num_input_tokens_seen": 60389848, "step": 3754 }, { "epoch": 0.2630311627133188, "grad_norm": 4.046808242797852, "learning_rate": 7.372157618213661e-05, "loss": 1.0768, "num_input_tokens_seen": 60406232, "step": 3755 }, { "epoch": 0.26310121095904804, "grad_norm": 5.451204299926758, "learning_rate": 7.37145779334501e-05, "loss": 0.9567, "num_input_tokens_seen": 60422544, "step": 3756 }, { "epoch": 0.2631712592047773, "grad_norm": 4.395990371704102, "learning_rate": 7.370757968476357e-05, "loss": 0.9173, "num_input_tokens_seen": 60438312, "step": 3757 }, { "epoch": 0.26324130745050656, "grad_norm": 5.997600078582764, "learning_rate": 7.370058143607705e-05, "loss": 1.049, "num_input_tokens_seen": 60454696, "step": 3758 }, { "epoch": 0.26331135569623576, "grad_norm": 5.588560104370117, "learning_rate": 7.369358318739055e-05, "loss": 0.9015, "num_input_tokens_seen": 60470232, "step": 3759 }, { "epoch": 0.263381403941965, "grad_norm": 3.2995078563690186, "learning_rate": 7.368658493870404e-05, "loss": 0.9814, "num_input_tokens_seen": 60486224, "step": 3760 }, { "epoch": 0.2634514521876943, "grad_norm": 4.141932964324951, "learning_rate": 7.367958669001751e-05, "loss": 1.0069, "num_input_tokens_seen": 60502608, "step": 3761 }, { "epoch": 0.26352150043342354, "grad_norm": 5.010983943939209, "learning_rate": 7.3672588441331e-05, "loss": 1.1533, "num_input_tokens_seen": 60518672, "step": 3762 }, { "epoch": 0.26359154867915274, "grad_norm": 3.555612802505493, "learning_rate": 7.366559019264449e-05, "loss": 1.1037, "num_input_tokens_seen": 60534408, "step": 3763 }, { "epoch": 0.263661596924882, "grad_norm": 4.006901264190674, "learning_rate": 7.365859194395797e-05, "loss": 1.0086, "num_input_tokens_seen": 60550760, "step": 3764 }, { "epoch": 0.26373164517061126, "grad_norm": 5.055272579193115, "learning_rate": 7.365159369527146e-05, "loss": 0.9645, "num_input_tokens_seen": 60567144, "step": 3765 }, { "epoch": 0.2638016934163405, "grad_norm": 3.860630989074707, "learning_rate": 7.364459544658494e-05, "loss": 1.0371, "num_input_tokens_seen": 60583528, "step": 3766 }, { "epoch": 0.2638717416620697, "grad_norm": 4.644535541534424, "learning_rate": 7.363759719789843e-05, "loss": 1.1461, "num_input_tokens_seen": 60599912, "step": 3767 }, { "epoch": 0.263941789907799, "grad_norm": 3.7196872234344482, "learning_rate": 7.363059894921191e-05, "loss": 1.1025, "num_input_tokens_seen": 60616296, "step": 3768 }, { "epoch": 0.26401183815352824, "grad_norm": 4.477166175842285, "learning_rate": 7.36236007005254e-05, "loss": 1.2221, "num_input_tokens_seen": 60631760, "step": 3769 }, { "epoch": 0.2640818863992575, "grad_norm": 4.906933784484863, "learning_rate": 7.361660245183889e-05, "loss": 0.9398, "num_input_tokens_seen": 60648144, "step": 3770 }, { "epoch": 0.2641519346449867, "grad_norm": 3.784450054168701, "learning_rate": 7.360960420315236e-05, "loss": 0.9521, "num_input_tokens_seen": 60664528, "step": 3771 }, { "epoch": 0.26422198289071597, "grad_norm": 4.5654191970825195, "learning_rate": 7.360260595446585e-05, "loss": 0.9199, "num_input_tokens_seen": 60680912, "step": 3772 }, { "epoch": 0.2642920311364452, "grad_norm": 3.965175151824951, "learning_rate": 7.359560770577934e-05, "loss": 0.9469, "num_input_tokens_seen": 60697296, "step": 3773 }, { "epoch": 0.2643620793821745, "grad_norm": 5.112542152404785, "learning_rate": 7.358860945709283e-05, "loss": 0.946, "num_input_tokens_seen": 60713328, "step": 3774 }, { "epoch": 0.26443212762790375, "grad_norm": 3.8610634803771973, "learning_rate": 7.35816112084063e-05, "loss": 1.1243, "num_input_tokens_seen": 60729712, "step": 3775 }, { "epoch": 0.26450217587363295, "grad_norm": 3.794217348098755, "learning_rate": 7.35746129597198e-05, "loss": 1.1127, "num_input_tokens_seen": 60745824, "step": 3776 }, { "epoch": 0.2645722241193622, "grad_norm": 3.7547152042388916, "learning_rate": 7.356761471103328e-05, "loss": 1.0774, "num_input_tokens_seen": 60762024, "step": 3777 }, { "epoch": 0.26464227236509147, "grad_norm": 3.492917537689209, "learning_rate": 7.356061646234675e-05, "loss": 1.0505, "num_input_tokens_seen": 60778096, "step": 3778 }, { "epoch": 0.26471232061082073, "grad_norm": 3.856019973754883, "learning_rate": 7.355361821366026e-05, "loss": 0.9716, "num_input_tokens_seen": 60794480, "step": 3779 }, { "epoch": 0.26478236885654993, "grad_norm": 3.68072509765625, "learning_rate": 7.354661996497374e-05, "loss": 1.2316, "num_input_tokens_seen": 60810584, "step": 3780 }, { "epoch": 0.2648524171022792, "grad_norm": 4.4739909172058105, "learning_rate": 7.353962171628722e-05, "loss": 1.2492, "num_input_tokens_seen": 60826240, "step": 3781 }, { "epoch": 0.26492246534800845, "grad_norm": 5.2342610359191895, "learning_rate": 7.353262346760071e-05, "loss": 1.1018, "num_input_tokens_seen": 60842216, "step": 3782 }, { "epoch": 0.2649925135937377, "grad_norm": 4.408970355987549, "learning_rate": 7.35256252189142e-05, "loss": 1.0485, "num_input_tokens_seen": 60857336, "step": 3783 }, { "epoch": 0.2650625618394669, "grad_norm": 3.8172199726104736, "learning_rate": 7.351862697022767e-05, "loss": 1.1399, "num_input_tokens_seen": 60873720, "step": 3784 }, { "epoch": 0.2651326100851962, "grad_norm": 4.250039100646973, "learning_rate": 7.351162872154116e-05, "loss": 1.003, "num_input_tokens_seen": 60890104, "step": 3785 }, { "epoch": 0.26520265833092543, "grad_norm": 4.257120609283447, "learning_rate": 7.350463047285465e-05, "loss": 1.0466, "num_input_tokens_seen": 60906488, "step": 3786 }, { "epoch": 0.2652727065766547, "grad_norm": 4.205286026000977, "learning_rate": 7.349763222416814e-05, "loss": 1.2149, "num_input_tokens_seen": 60922872, "step": 3787 }, { "epoch": 0.2653427548223839, "grad_norm": 4.304909706115723, "learning_rate": 7.349063397548161e-05, "loss": 1.0023, "num_input_tokens_seen": 60939256, "step": 3788 }, { "epoch": 0.26541280306811316, "grad_norm": 4.793664455413818, "learning_rate": 7.34836357267951e-05, "loss": 1.0475, "num_input_tokens_seen": 60955440, "step": 3789 }, { "epoch": 0.2654828513138424, "grad_norm": 4.383579730987549, "learning_rate": 7.347663747810859e-05, "loss": 1.1924, "num_input_tokens_seen": 60971824, "step": 3790 }, { "epoch": 0.2655528995595717, "grad_norm": 3.9962210655212402, "learning_rate": 7.346963922942206e-05, "loss": 1.0429, "num_input_tokens_seen": 60987168, "step": 3791 }, { "epoch": 0.2656229478053009, "grad_norm": 4.356331825256348, "learning_rate": 7.346264098073555e-05, "loss": 0.9332, "num_input_tokens_seen": 61002840, "step": 3792 }, { "epoch": 0.26569299605103014, "grad_norm": 5.836807727813721, "learning_rate": 7.345564273204904e-05, "loss": 1.205, "num_input_tokens_seen": 61019224, "step": 3793 }, { "epoch": 0.2657630442967594, "grad_norm": 4.778296947479248, "learning_rate": 7.344864448336253e-05, "loss": 1.0227, "num_input_tokens_seen": 61034712, "step": 3794 }, { "epoch": 0.26583309254248866, "grad_norm": 6.723006248474121, "learning_rate": 7.3441646234676e-05, "loss": 0.955, "num_input_tokens_seen": 61050328, "step": 3795 }, { "epoch": 0.26590314078821786, "grad_norm": 3.773984670639038, "learning_rate": 7.34346479859895e-05, "loss": 1.1262, "num_input_tokens_seen": 61066048, "step": 3796 }, { "epoch": 0.2659731890339471, "grad_norm": 3.915708065032959, "learning_rate": 7.342764973730298e-05, "loss": 1.1027, "num_input_tokens_seen": 61082136, "step": 3797 }, { "epoch": 0.2660432372796764, "grad_norm": 6.568943977355957, "learning_rate": 7.342065148861646e-05, "loss": 1.0457, "num_input_tokens_seen": 61097216, "step": 3798 }, { "epoch": 0.26611328552540564, "grad_norm": 5.0017499923706055, "learning_rate": 7.341365323992995e-05, "loss": 1.0194, "num_input_tokens_seen": 61112344, "step": 3799 }, { "epoch": 0.26618333377113484, "grad_norm": 4.1988935470581055, "learning_rate": 7.340665499124345e-05, "loss": 1.0794, "num_input_tokens_seen": 61128728, "step": 3800 }, { "epoch": 0.26618333377113484, "eval_loss": 1.1352765560150146, "eval_runtime": 0.2173, "eval_samples_per_second": 4.603, "eval_steps_per_second": 4.603, "num_input_tokens_seen": 61128728, "step": 3800 }, { "epoch": 0.2662533820168641, "grad_norm": 3.991041660308838, "learning_rate": 7.339965674255692e-05, "loss": 1.1468, "num_input_tokens_seen": 61145112, "step": 3801 }, { "epoch": 0.26632343026259336, "grad_norm": 4.921470642089844, "learning_rate": 7.33926584938704e-05, "loss": 1.1756, "num_input_tokens_seen": 61160952, "step": 3802 }, { "epoch": 0.2663934785083226, "grad_norm": 3.835486888885498, "learning_rate": 7.33856602451839e-05, "loss": 0.782, "num_input_tokens_seen": 61177024, "step": 3803 }, { "epoch": 0.2664635267540518, "grad_norm": 4.419501304626465, "learning_rate": 7.337866199649738e-05, "loss": 1.0029, "num_input_tokens_seen": 61193408, "step": 3804 }, { "epoch": 0.2665335749997811, "grad_norm": 4.003963947296143, "learning_rate": 7.337166374781086e-05, "loss": 1.0805, "num_input_tokens_seen": 61209792, "step": 3805 }, { "epoch": 0.26660362324551035, "grad_norm": 4.115198612213135, "learning_rate": 7.336466549912435e-05, "loss": 1.1718, "num_input_tokens_seen": 61226176, "step": 3806 }, { "epoch": 0.2666736714912396, "grad_norm": 3.663464307785034, "learning_rate": 7.335766725043784e-05, "loss": 1.1447, "num_input_tokens_seen": 61242560, "step": 3807 }, { "epoch": 0.2667437197369688, "grad_norm": 3.7513012886047363, "learning_rate": 7.335066900175132e-05, "loss": 1.1208, "num_input_tokens_seen": 61258944, "step": 3808 }, { "epoch": 0.26681376798269807, "grad_norm": 4.693987846374512, "learning_rate": 7.33436707530648e-05, "loss": 1.2823, "num_input_tokens_seen": 61275048, "step": 3809 }, { "epoch": 0.26688381622842733, "grad_norm": 6.161116600036621, "learning_rate": 7.333667250437829e-05, "loss": 1.1606, "num_input_tokens_seen": 61291368, "step": 3810 }, { "epoch": 0.2669538644741566, "grad_norm": 5.942180633544922, "learning_rate": 7.332967425569177e-05, "loss": 1.2382, "num_input_tokens_seen": 61307680, "step": 3811 }, { "epoch": 0.26702391271988585, "grad_norm": 4.940249443054199, "learning_rate": 7.332267600700526e-05, "loss": 1.0407, "num_input_tokens_seen": 61324064, "step": 3812 }, { "epoch": 0.26709396096561505, "grad_norm": 5.384439468383789, "learning_rate": 7.331567775831875e-05, "loss": 1.007, "num_input_tokens_seen": 61340416, "step": 3813 }, { "epoch": 0.2671640092113443, "grad_norm": 5.4137959480285645, "learning_rate": 7.330867950963223e-05, "loss": 1.0485, "num_input_tokens_seen": 61356800, "step": 3814 }, { "epoch": 0.26723405745707357, "grad_norm": 5.492247581481934, "learning_rate": 7.330168126094571e-05, "loss": 1.1623, "num_input_tokens_seen": 61371736, "step": 3815 }, { "epoch": 0.26730410570280283, "grad_norm": 5.316330909729004, "learning_rate": 7.32946830122592e-05, "loss": 1.0147, "num_input_tokens_seen": 61388120, "step": 3816 }, { "epoch": 0.26737415394853203, "grad_norm": 3.976797103881836, "learning_rate": 7.328768476357269e-05, "loss": 1.1049, "num_input_tokens_seen": 61403672, "step": 3817 }, { "epoch": 0.2674442021942613, "grad_norm": 7.333898544311523, "learning_rate": 7.328068651488616e-05, "loss": 1.0696, "num_input_tokens_seen": 61420056, "step": 3818 }, { "epoch": 0.26751425043999055, "grad_norm": 3.795746088027954, "learning_rate": 7.327368826619965e-05, "loss": 1.0545, "num_input_tokens_seen": 61436440, "step": 3819 }, { "epoch": 0.2675842986857198, "grad_norm": 6.624248027801514, "learning_rate": 7.326669001751315e-05, "loss": 1.0736, "num_input_tokens_seen": 61452824, "step": 3820 }, { "epoch": 0.267654346931449, "grad_norm": 4.991429805755615, "learning_rate": 7.325969176882663e-05, "loss": 1.0681, "num_input_tokens_seen": 61469208, "step": 3821 }, { "epoch": 0.2677243951771783, "grad_norm": 3.8505215644836426, "learning_rate": 7.32526935201401e-05, "loss": 1.0217, "num_input_tokens_seen": 61485592, "step": 3822 }, { "epoch": 0.26779444342290754, "grad_norm": 3.7079288959503174, "learning_rate": 7.324569527145359e-05, "loss": 1.049, "num_input_tokens_seen": 61501976, "step": 3823 }, { "epoch": 0.2678644916686368, "grad_norm": 3.8987131118774414, "learning_rate": 7.323869702276708e-05, "loss": 1.0152, "num_input_tokens_seen": 61518360, "step": 3824 }, { "epoch": 0.267934539914366, "grad_norm": 4.0447516441345215, "learning_rate": 7.323169877408055e-05, "loss": 1.0604, "num_input_tokens_seen": 61534744, "step": 3825 }, { "epoch": 0.26800458816009526, "grad_norm": 4.089504241943359, "learning_rate": 7.322470052539406e-05, "loss": 1.119, "num_input_tokens_seen": 61551128, "step": 3826 }, { "epoch": 0.2680746364058245, "grad_norm": 3.864943265914917, "learning_rate": 7.321770227670754e-05, "loss": 1.002, "num_input_tokens_seen": 61566872, "step": 3827 }, { "epoch": 0.2681446846515538, "grad_norm": 4.649239540100098, "learning_rate": 7.321070402802102e-05, "loss": 1.059, "num_input_tokens_seen": 61582704, "step": 3828 }, { "epoch": 0.268214732897283, "grad_norm": 7.537643909454346, "learning_rate": 7.32037057793345e-05, "loss": 1.289, "num_input_tokens_seen": 61599088, "step": 3829 }, { "epoch": 0.26828478114301224, "grad_norm": 3.312519073486328, "learning_rate": 7.3196707530648e-05, "loss": 0.872, "num_input_tokens_seen": 61615472, "step": 3830 }, { "epoch": 0.2683548293887415, "grad_norm": 7.833526134490967, "learning_rate": 7.318970928196147e-05, "loss": 1.0896, "num_input_tokens_seen": 61631288, "step": 3831 }, { "epoch": 0.26842487763447076, "grad_norm": 3.9574341773986816, "learning_rate": 7.318271103327496e-05, "loss": 1.1105, "num_input_tokens_seen": 61646400, "step": 3832 }, { "epoch": 0.26849492588019996, "grad_norm": 3.8763623237609863, "learning_rate": 7.317571278458845e-05, "loss": 1.0339, "num_input_tokens_seen": 61662784, "step": 3833 }, { "epoch": 0.2685649741259292, "grad_norm": 4.006046295166016, "learning_rate": 7.316871453590194e-05, "loss": 1.1266, "num_input_tokens_seen": 61678296, "step": 3834 }, { "epoch": 0.2686350223716585, "grad_norm": 4.0256500244140625, "learning_rate": 7.316171628721541e-05, "loss": 0.9773, "num_input_tokens_seen": 61694680, "step": 3835 }, { "epoch": 0.26870507061738774, "grad_norm": 4.045619964599609, "learning_rate": 7.31547180385289e-05, "loss": 1.0445, "num_input_tokens_seen": 61711064, "step": 3836 }, { "epoch": 0.26877511886311695, "grad_norm": 4.189207553863525, "learning_rate": 7.314771978984239e-05, "loss": 1.1357, "num_input_tokens_seen": 61727448, "step": 3837 }, { "epoch": 0.2688451671088462, "grad_norm": 6.098819255828857, "learning_rate": 7.314072154115587e-05, "loss": 1.0298, "num_input_tokens_seen": 61743600, "step": 3838 }, { "epoch": 0.26891521535457547, "grad_norm": 3.832962989807129, "learning_rate": 7.313372329246935e-05, "loss": 1.0985, "num_input_tokens_seen": 61759984, "step": 3839 }, { "epoch": 0.2689852636003047, "grad_norm": 4.448224067687988, "learning_rate": 7.312672504378284e-05, "loss": 0.9682, "num_input_tokens_seen": 61776368, "step": 3840 }, { "epoch": 0.26905531184603393, "grad_norm": 4.621326446533203, "learning_rate": 7.311972679509633e-05, "loss": 0.9866, "num_input_tokens_seen": 61791992, "step": 3841 }, { "epoch": 0.2691253600917632, "grad_norm": 4.979477882385254, "learning_rate": 7.31127285464098e-05, "loss": 1.1592, "num_input_tokens_seen": 61807912, "step": 3842 }, { "epoch": 0.26919540833749245, "grad_norm": 4.678060054779053, "learning_rate": 7.31057302977233e-05, "loss": 1.218, "num_input_tokens_seen": 61824296, "step": 3843 }, { "epoch": 0.2692654565832217, "grad_norm": 5.379042625427246, "learning_rate": 7.309873204903678e-05, "loss": 1.0687, "num_input_tokens_seen": 61840680, "step": 3844 }, { "epoch": 0.26933550482895097, "grad_norm": 5.836205005645752, "learning_rate": 7.309173380035026e-05, "loss": 1.0435, "num_input_tokens_seen": 61856296, "step": 3845 }, { "epoch": 0.26940555307468017, "grad_norm": 4.040728569030762, "learning_rate": 7.308473555166376e-05, "loss": 1.0494, "num_input_tokens_seen": 61872680, "step": 3846 }, { "epoch": 0.26947560132040943, "grad_norm": 5.207007884979248, "learning_rate": 7.307773730297725e-05, "loss": 0.9293, "num_input_tokens_seen": 61889064, "step": 3847 }, { "epoch": 0.2695456495661387, "grad_norm": 4.996053695678711, "learning_rate": 7.307073905429072e-05, "loss": 1.0765, "num_input_tokens_seen": 61905448, "step": 3848 }, { "epoch": 0.26961569781186795, "grad_norm": 3.9249801635742188, "learning_rate": 7.30637408056042e-05, "loss": 1.0971, "num_input_tokens_seen": 61921832, "step": 3849 }, { "epoch": 0.26968574605759715, "grad_norm": 4.512659072875977, "learning_rate": 7.305674255691769e-05, "loss": 1.0811, "num_input_tokens_seen": 61937928, "step": 3850 }, { "epoch": 0.2697557943033264, "grad_norm": 3.8067586421966553, "learning_rate": 7.304974430823118e-05, "loss": 1.0381, "num_input_tokens_seen": 61953992, "step": 3851 }, { "epoch": 0.26982584254905567, "grad_norm": 3.5481879711151123, "learning_rate": 7.304274605954466e-05, "loss": 0.9524, "num_input_tokens_seen": 61969856, "step": 3852 }, { "epoch": 0.26989589079478493, "grad_norm": 5.14021635055542, "learning_rate": 7.303574781085815e-05, "loss": 1.0893, "num_input_tokens_seen": 61985448, "step": 3853 }, { "epoch": 0.26996593904051414, "grad_norm": 4.729730606079102, "learning_rate": 7.302874956217164e-05, "loss": 0.955, "num_input_tokens_seen": 62001832, "step": 3854 }, { "epoch": 0.2700359872862434, "grad_norm": 4.081509113311768, "learning_rate": 7.302175131348512e-05, "loss": 1.3099, "num_input_tokens_seen": 62018216, "step": 3855 }, { "epoch": 0.27010603553197265, "grad_norm": 3.9220404624938965, "learning_rate": 7.301475306479859e-05, "loss": 1.256, "num_input_tokens_seen": 62034600, "step": 3856 }, { "epoch": 0.2701760837777019, "grad_norm": 3.9707326889038086, "learning_rate": 7.30077548161121e-05, "loss": 0.9347, "num_input_tokens_seen": 62050984, "step": 3857 }, { "epoch": 0.2702461320234311, "grad_norm": 3.985651731491089, "learning_rate": 7.300075656742557e-05, "loss": 1.0869, "num_input_tokens_seen": 62066496, "step": 3858 }, { "epoch": 0.2703161802691604, "grad_norm": 4.900750160217285, "learning_rate": 7.299375831873906e-05, "loss": 1.2112, "num_input_tokens_seen": 62082880, "step": 3859 }, { "epoch": 0.27038622851488964, "grad_norm": 3.7562901973724365, "learning_rate": 7.298676007005255e-05, "loss": 1.0372, "num_input_tokens_seen": 62099264, "step": 3860 }, { "epoch": 0.2704562767606189, "grad_norm": 4.3399271965026855, "learning_rate": 7.297976182136604e-05, "loss": 1.2113, "num_input_tokens_seen": 62115648, "step": 3861 }, { "epoch": 0.2705263250063481, "grad_norm": 3.792924642562866, "learning_rate": 7.297276357267951e-05, "loss": 1.0027, "num_input_tokens_seen": 62132032, "step": 3862 }, { "epoch": 0.27059637325207736, "grad_norm": 4.10078763961792, "learning_rate": 7.2965765323993e-05, "loss": 1.0485, "num_input_tokens_seen": 62148416, "step": 3863 }, { "epoch": 0.2706664214978066, "grad_norm": 3.6712818145751953, "learning_rate": 7.295876707530649e-05, "loss": 0.982, "num_input_tokens_seen": 62164080, "step": 3864 }, { "epoch": 0.2707364697435359, "grad_norm": 4.216330051422119, "learning_rate": 7.295176882661996e-05, "loss": 0.9988, "num_input_tokens_seen": 62179952, "step": 3865 }, { "epoch": 0.2708065179892651, "grad_norm": 3.803950548171997, "learning_rate": 7.294477057793345e-05, "loss": 1.1107, "num_input_tokens_seen": 62196336, "step": 3866 }, { "epoch": 0.27087656623499434, "grad_norm": 4.4687676429748535, "learning_rate": 7.293777232924694e-05, "loss": 1.1374, "num_input_tokens_seen": 62212072, "step": 3867 }, { "epoch": 0.2709466144807236, "grad_norm": 3.8923938274383545, "learning_rate": 7.293077408056043e-05, "loss": 1.0037, "num_input_tokens_seen": 62227384, "step": 3868 }, { "epoch": 0.27101666272645286, "grad_norm": 3.7378618717193604, "learning_rate": 7.29237758318739e-05, "loss": 0.9185, "num_input_tokens_seen": 62243768, "step": 3869 }, { "epoch": 0.27108671097218207, "grad_norm": 4.39946985244751, "learning_rate": 7.291677758318739e-05, "loss": 1.2908, "num_input_tokens_seen": 62259760, "step": 3870 }, { "epoch": 0.2711567592179113, "grad_norm": 4.526809215545654, "learning_rate": 7.290977933450088e-05, "loss": 1.1677, "num_input_tokens_seen": 62275880, "step": 3871 }, { "epoch": 0.2712268074636406, "grad_norm": 5.780641078948975, "learning_rate": 7.290278108581437e-05, "loss": 1.3366, "num_input_tokens_seen": 62291992, "step": 3872 }, { "epoch": 0.27129685570936984, "grad_norm": 3.932300329208374, "learning_rate": 7.289578283712786e-05, "loss": 0.9404, "num_input_tokens_seen": 62308168, "step": 3873 }, { "epoch": 0.27136690395509905, "grad_norm": 6.381493091583252, "learning_rate": 7.288878458844135e-05, "loss": 0.9909, "num_input_tokens_seen": 62324552, "step": 3874 }, { "epoch": 0.2714369522008283, "grad_norm": 6.920464515686035, "learning_rate": 7.288178633975482e-05, "loss": 1.0534, "num_input_tokens_seen": 62340712, "step": 3875 }, { "epoch": 0.27150700044655757, "grad_norm": 4.327527046203613, "learning_rate": 7.28747880910683e-05, "loss": 1.2133, "num_input_tokens_seen": 62355904, "step": 3876 }, { "epoch": 0.2715770486922868, "grad_norm": 6.8873610496521, "learning_rate": 7.286778984238178e-05, "loss": 1.1857, "num_input_tokens_seen": 62372288, "step": 3877 }, { "epoch": 0.2716470969380161, "grad_norm": 4.397764205932617, "learning_rate": 7.286079159369527e-05, "loss": 1.1458, "num_input_tokens_seen": 62388672, "step": 3878 }, { "epoch": 0.2717171451837453, "grad_norm": 4.200334072113037, "learning_rate": 7.285379334500876e-05, "loss": 1.1534, "num_input_tokens_seen": 62403728, "step": 3879 }, { "epoch": 0.27178719342947455, "grad_norm": 3.8102898597717285, "learning_rate": 7.284679509632225e-05, "loss": 1.2455, "num_input_tokens_seen": 62419712, "step": 3880 }, { "epoch": 0.2718572416752038, "grad_norm": 5.665886878967285, "learning_rate": 7.283979684763574e-05, "loss": 1.0506, "num_input_tokens_seen": 62435648, "step": 3881 }, { "epoch": 0.27192728992093307, "grad_norm": 5.59833288192749, "learning_rate": 7.283279859894921e-05, "loss": 1.1289, "num_input_tokens_seen": 62451760, "step": 3882 }, { "epoch": 0.2719973381666623, "grad_norm": 4.3096699714660645, "learning_rate": 7.282580035026269e-05, "loss": 1.1069, "num_input_tokens_seen": 62468144, "step": 3883 }, { "epoch": 0.27206738641239153, "grad_norm": 3.584202766418457, "learning_rate": 7.281880210157619e-05, "loss": 0.981, "num_input_tokens_seen": 62484528, "step": 3884 }, { "epoch": 0.2721374346581208, "grad_norm": 5.078696250915527, "learning_rate": 7.281180385288967e-05, "loss": 1.0727, "num_input_tokens_seen": 62500912, "step": 3885 }, { "epoch": 0.27220748290385005, "grad_norm": 3.4883761405944824, "learning_rate": 7.280480560420315e-05, "loss": 0.888, "num_input_tokens_seen": 62517296, "step": 3886 }, { "epoch": 0.27227753114957925, "grad_norm": 3.938286066055298, "learning_rate": 7.279780735551664e-05, "loss": 0.9736, "num_input_tokens_seen": 62532896, "step": 3887 }, { "epoch": 0.2723475793953085, "grad_norm": 3.7150652408599854, "learning_rate": 7.279080910683013e-05, "loss": 1.1163, "num_input_tokens_seen": 62549072, "step": 3888 }, { "epoch": 0.2724176276410378, "grad_norm": 5.31076717376709, "learning_rate": 7.278381085814361e-05, "loss": 0.9943, "num_input_tokens_seen": 62564384, "step": 3889 }, { "epoch": 0.27248767588676703, "grad_norm": 4.8600053787231445, "learning_rate": 7.27768126094571e-05, "loss": 1.1767, "num_input_tokens_seen": 62580768, "step": 3890 }, { "epoch": 0.27255772413249624, "grad_norm": 3.5890231132507324, "learning_rate": 7.276981436077058e-05, "loss": 1.0949, "num_input_tokens_seen": 62596928, "step": 3891 }, { "epoch": 0.2726277723782255, "grad_norm": 4.171263217926025, "learning_rate": 7.276281611208407e-05, "loss": 1.0013, "num_input_tokens_seen": 62613312, "step": 3892 }, { "epoch": 0.27269782062395476, "grad_norm": 5.907830715179443, "learning_rate": 7.275581786339755e-05, "loss": 1.0622, "num_input_tokens_seen": 62627840, "step": 3893 }, { "epoch": 0.272767868869684, "grad_norm": 3.912140369415283, "learning_rate": 7.274881961471104e-05, "loss": 1.1128, "num_input_tokens_seen": 62643760, "step": 3894 }, { "epoch": 0.2728379171154132, "grad_norm": 3.9871180057525635, "learning_rate": 7.274182136602453e-05, "loss": 1.0879, "num_input_tokens_seen": 62660144, "step": 3895 }, { "epoch": 0.2729079653611425, "grad_norm": 3.8014907836914062, "learning_rate": 7.2734823117338e-05, "loss": 1.0135, "num_input_tokens_seen": 62676200, "step": 3896 }, { "epoch": 0.27297801360687174, "grad_norm": 3.7584786415100098, "learning_rate": 7.272782486865149e-05, "loss": 1.0366, "num_input_tokens_seen": 62692584, "step": 3897 }, { "epoch": 0.273048061852601, "grad_norm": 3.573341131210327, "learning_rate": 7.272082661996498e-05, "loss": 0.8726, "num_input_tokens_seen": 62708968, "step": 3898 }, { "epoch": 0.2731181100983302, "grad_norm": 4.013971328735352, "learning_rate": 7.271382837127847e-05, "loss": 0.991, "num_input_tokens_seen": 62725352, "step": 3899 }, { "epoch": 0.27318815834405946, "grad_norm": 4.3081488609313965, "learning_rate": 7.270683012259195e-05, "loss": 1.0632, "num_input_tokens_seen": 62741736, "step": 3900 }, { "epoch": 0.2732582065897887, "grad_norm": 3.857982635498047, "learning_rate": 7.269983187390544e-05, "loss": 1.1116, "num_input_tokens_seen": 62757624, "step": 3901 }, { "epoch": 0.273328254835518, "grad_norm": 3.5167486667633057, "learning_rate": 7.269283362521892e-05, "loss": 0.9951, "num_input_tokens_seen": 62774008, "step": 3902 }, { "epoch": 0.2733983030812472, "grad_norm": 4.025612831115723, "learning_rate": 7.26858353765324e-05, "loss": 1.1632, "num_input_tokens_seen": 62789560, "step": 3903 }, { "epoch": 0.27346835132697644, "grad_norm": 3.6391422748565674, "learning_rate": 7.267883712784588e-05, "loss": 0.9442, "num_input_tokens_seen": 62805824, "step": 3904 }, { "epoch": 0.2735383995727057, "grad_norm": 4.352347373962402, "learning_rate": 7.267183887915937e-05, "loss": 1.0882, "num_input_tokens_seen": 62821368, "step": 3905 }, { "epoch": 0.27360844781843496, "grad_norm": 3.782601833343506, "learning_rate": 7.266484063047286e-05, "loss": 0.9795, "num_input_tokens_seen": 62837024, "step": 3906 }, { "epoch": 0.27367849606416417, "grad_norm": 3.860903263092041, "learning_rate": 7.265784238178635e-05, "loss": 1.1751, "num_input_tokens_seen": 62853408, "step": 3907 }, { "epoch": 0.2737485443098934, "grad_norm": 6.185113430023193, "learning_rate": 7.265084413309984e-05, "loss": 1.1976, "num_input_tokens_seen": 62869792, "step": 3908 }, { "epoch": 0.2738185925556227, "grad_norm": 6.02334451675415, "learning_rate": 7.264384588441331e-05, "loss": 1.0472, "num_input_tokens_seen": 62886088, "step": 3909 }, { "epoch": 0.27388864080135195, "grad_norm": 4.019417762756348, "learning_rate": 7.263684763572679e-05, "loss": 0.9597, "num_input_tokens_seen": 62902472, "step": 3910 }, { "epoch": 0.27395868904708115, "grad_norm": 4.0645527839660645, "learning_rate": 7.262984938704029e-05, "loss": 1.0267, "num_input_tokens_seen": 62918552, "step": 3911 }, { "epoch": 0.2740287372928104, "grad_norm": 3.978803873062134, "learning_rate": 7.262285113835378e-05, "loss": 1.1366, "num_input_tokens_seen": 62934272, "step": 3912 }, { "epoch": 0.27409878553853967, "grad_norm": 4.659839630126953, "learning_rate": 7.261585288966725e-05, "loss": 1.0485, "num_input_tokens_seen": 62950656, "step": 3913 }, { "epoch": 0.27416883378426893, "grad_norm": 4.378306865692139, "learning_rate": 7.260885464098074e-05, "loss": 0.9949, "num_input_tokens_seen": 62966120, "step": 3914 }, { "epoch": 0.2742388820299982, "grad_norm": 3.723999261856079, "learning_rate": 7.260185639229423e-05, "loss": 1.0575, "num_input_tokens_seen": 62982504, "step": 3915 }, { "epoch": 0.2743089302757274, "grad_norm": 4.133684158325195, "learning_rate": 7.25948581436077e-05, "loss": 0.9707, "num_input_tokens_seen": 62998888, "step": 3916 }, { "epoch": 0.27437897852145665, "grad_norm": 3.8377842903137207, "learning_rate": 7.258785989492119e-05, "loss": 1.1018, "num_input_tokens_seen": 63015272, "step": 3917 }, { "epoch": 0.2744490267671859, "grad_norm": 3.546846389770508, "learning_rate": 7.258086164623468e-05, "loss": 0.9544, "num_input_tokens_seen": 63031656, "step": 3918 }, { "epoch": 0.27451907501291517, "grad_norm": 3.8629097938537598, "learning_rate": 7.257386339754817e-05, "loss": 1.0174, "num_input_tokens_seen": 63047208, "step": 3919 }, { "epoch": 0.2745891232586444, "grad_norm": 3.780395984649658, "learning_rate": 7.256686514886165e-05, "loss": 1.0927, "num_input_tokens_seen": 63063592, "step": 3920 }, { "epoch": 0.27465917150437363, "grad_norm": 3.5188148021698, "learning_rate": 7.255986690017513e-05, "loss": 0.9973, "num_input_tokens_seen": 63079976, "step": 3921 }, { "epoch": 0.2747292197501029, "grad_norm": 4.295319080352783, "learning_rate": 7.255286865148862e-05, "loss": 1.1545, "num_input_tokens_seen": 63096360, "step": 3922 }, { "epoch": 0.27479926799583215, "grad_norm": 6.307181358337402, "learning_rate": 7.25458704028021e-05, "loss": 1.0283, "num_input_tokens_seen": 63112744, "step": 3923 }, { "epoch": 0.27486931624156136, "grad_norm": 4.0670342445373535, "learning_rate": 7.253887215411559e-05, "loss": 1.0834, "num_input_tokens_seen": 63129000, "step": 3924 }, { "epoch": 0.2749393644872906, "grad_norm": 4.441539287567139, "learning_rate": 7.253187390542907e-05, "loss": 1.1264, "num_input_tokens_seen": 63145304, "step": 3925 }, { "epoch": 0.2750094127330199, "grad_norm": 6.151254653930664, "learning_rate": 7.252487565674256e-05, "loss": 0.911, "num_input_tokens_seen": 63161688, "step": 3926 }, { "epoch": 0.27507946097874914, "grad_norm": 5.355491638183594, "learning_rate": 7.251787740805605e-05, "loss": 1.0604, "num_input_tokens_seen": 63176128, "step": 3927 }, { "epoch": 0.27514950922447834, "grad_norm": 3.4603800773620605, "learning_rate": 7.251087915936954e-05, "loss": 0.7811, "num_input_tokens_seen": 63192512, "step": 3928 }, { "epoch": 0.2752195574702076, "grad_norm": 5.412753105163574, "learning_rate": 7.250388091068302e-05, "loss": 0.9675, "num_input_tokens_seen": 63208896, "step": 3929 }, { "epoch": 0.27528960571593686, "grad_norm": 3.928074598312378, "learning_rate": 7.249688266199649e-05, "loss": 1.0562, "num_input_tokens_seen": 63224296, "step": 3930 }, { "epoch": 0.2753596539616661, "grad_norm": 4.239214897155762, "learning_rate": 7.248988441330998e-05, "loss": 0.9697, "num_input_tokens_seen": 63239312, "step": 3931 }, { "epoch": 0.2754297022073953, "grad_norm": 3.8074252605438232, "learning_rate": 7.248288616462348e-05, "loss": 1.0834, "num_input_tokens_seen": 63255664, "step": 3932 }, { "epoch": 0.2754997504531246, "grad_norm": 3.721026659011841, "learning_rate": 7.247588791593696e-05, "loss": 1.1663, "num_input_tokens_seen": 63272048, "step": 3933 }, { "epoch": 0.27556979869885384, "grad_norm": 4.076726913452148, "learning_rate": 7.246888966725044e-05, "loss": 1.1179, "num_input_tokens_seen": 63288432, "step": 3934 }, { "epoch": 0.2756398469445831, "grad_norm": 4.238835334777832, "learning_rate": 7.246189141856393e-05, "loss": 1.0894, "num_input_tokens_seen": 63304168, "step": 3935 }, { "epoch": 0.2757098951903123, "grad_norm": 4.4860148429870605, "learning_rate": 7.245489316987741e-05, "loss": 1.1763, "num_input_tokens_seen": 63320552, "step": 3936 }, { "epoch": 0.27577994343604156, "grad_norm": 6.002726078033447, "learning_rate": 7.244789492119088e-05, "loss": 1.158, "num_input_tokens_seen": 63336792, "step": 3937 }, { "epoch": 0.2758499916817708, "grad_norm": 3.799751043319702, "learning_rate": 7.244089667250439e-05, "loss": 1.0316, "num_input_tokens_seen": 63353176, "step": 3938 }, { "epoch": 0.2759200399275001, "grad_norm": 4.905911445617676, "learning_rate": 7.243389842381787e-05, "loss": 0.8847, "num_input_tokens_seen": 63369560, "step": 3939 }, { "epoch": 0.2759900881732293, "grad_norm": 5.141537666320801, "learning_rate": 7.242690017513135e-05, "loss": 1.109, "num_input_tokens_seen": 63385944, "step": 3940 }, { "epoch": 0.27606013641895855, "grad_norm": 5.276777267456055, "learning_rate": 7.241990192644484e-05, "loss": 0.9881, "num_input_tokens_seen": 63401672, "step": 3941 }, { "epoch": 0.2761301846646878, "grad_norm": 5.267075538635254, "learning_rate": 7.241290367775833e-05, "loss": 1.0048, "num_input_tokens_seen": 63417792, "step": 3942 }, { "epoch": 0.27620023291041707, "grad_norm": 4.065691947937012, "learning_rate": 7.24059054290718e-05, "loss": 1.0088, "num_input_tokens_seen": 63434176, "step": 3943 }, { "epoch": 0.27627028115614627, "grad_norm": 7.921762466430664, "learning_rate": 7.239890718038529e-05, "loss": 1.3552, "num_input_tokens_seen": 63450032, "step": 3944 }, { "epoch": 0.27634032940187553, "grad_norm": 3.55094313621521, "learning_rate": 7.239190893169878e-05, "loss": 0.9957, "num_input_tokens_seen": 63466416, "step": 3945 }, { "epoch": 0.2764103776476048, "grad_norm": 5.732813358306885, "learning_rate": 7.238491068301227e-05, "loss": 1.0968, "num_input_tokens_seen": 63482296, "step": 3946 }, { "epoch": 0.27648042589333405, "grad_norm": 3.9143989086151123, "learning_rate": 7.237791243432574e-05, "loss": 0.9218, "num_input_tokens_seen": 63498472, "step": 3947 }, { "epoch": 0.2765504741390633, "grad_norm": 4.123042106628418, "learning_rate": 7.237091418563923e-05, "loss": 1.0081, "num_input_tokens_seen": 63513856, "step": 3948 }, { "epoch": 0.2766205223847925, "grad_norm": 3.7550277709960938, "learning_rate": 7.236391593695272e-05, "loss": 1.0612, "num_input_tokens_seen": 63529432, "step": 3949 }, { "epoch": 0.27669057063052177, "grad_norm": 3.841831922531128, "learning_rate": 7.23569176882662e-05, "loss": 1.1208, "num_input_tokens_seen": 63545816, "step": 3950 }, { "epoch": 0.27676061887625103, "grad_norm": 4.626603126525879, "learning_rate": 7.234991943957968e-05, "loss": 1.3412, "num_input_tokens_seen": 63561960, "step": 3951 }, { "epoch": 0.2768306671219803, "grad_norm": 3.874140977859497, "learning_rate": 7.234292119089319e-05, "loss": 1.0549, "num_input_tokens_seen": 63578344, "step": 3952 }, { "epoch": 0.2769007153677095, "grad_norm": 3.6525163650512695, "learning_rate": 7.233592294220666e-05, "loss": 1.0905, "num_input_tokens_seen": 63594520, "step": 3953 }, { "epoch": 0.27697076361343875, "grad_norm": 5.065535068511963, "learning_rate": 7.232892469352015e-05, "loss": 1.1913, "num_input_tokens_seen": 63610904, "step": 3954 }, { "epoch": 0.277040811859168, "grad_norm": 7.97597599029541, "learning_rate": 7.232192644483364e-05, "loss": 0.9109, "num_input_tokens_seen": 63625896, "step": 3955 }, { "epoch": 0.27711086010489727, "grad_norm": 5.0254645347595215, "learning_rate": 7.231492819614711e-05, "loss": 1.0177, "num_input_tokens_seen": 63642280, "step": 3956 }, { "epoch": 0.2771809083506265, "grad_norm": 4.171605587005615, "learning_rate": 7.230792994746059e-05, "loss": 1.3166, "num_input_tokens_seen": 63658400, "step": 3957 }, { "epoch": 0.27725095659635574, "grad_norm": 4.036003589630127, "learning_rate": 7.230093169877409e-05, "loss": 1.0489, "num_input_tokens_seen": 63674784, "step": 3958 }, { "epoch": 0.277321004842085, "grad_norm": 4.664374828338623, "learning_rate": 7.229393345008758e-05, "loss": 1.3189, "num_input_tokens_seen": 63691168, "step": 3959 }, { "epoch": 0.27739105308781425, "grad_norm": 3.7217307090759277, "learning_rate": 7.228693520140105e-05, "loss": 1.2532, "num_input_tokens_seen": 63707552, "step": 3960 }, { "epoch": 0.27746110133354346, "grad_norm": 3.622593879699707, "learning_rate": 7.227993695271454e-05, "loss": 0.8604, "num_input_tokens_seen": 63723936, "step": 3961 }, { "epoch": 0.2775311495792727, "grad_norm": 4.154850006103516, "learning_rate": 7.227293870402803e-05, "loss": 1.1366, "num_input_tokens_seen": 63740320, "step": 3962 }, { "epoch": 0.277601197825002, "grad_norm": 4.157016754150391, "learning_rate": 7.22659404553415e-05, "loss": 0.8815, "num_input_tokens_seen": 63756456, "step": 3963 }, { "epoch": 0.27767124607073124, "grad_norm": 4.652394771575928, "learning_rate": 7.2258942206655e-05, "loss": 0.8966, "num_input_tokens_seen": 63772840, "step": 3964 }, { "epoch": 0.27774129431646044, "grad_norm": 7.87667989730835, "learning_rate": 7.225194395796848e-05, "loss": 1.1371, "num_input_tokens_seen": 63788800, "step": 3965 }, { "epoch": 0.2778113425621897, "grad_norm": 4.333608627319336, "learning_rate": 7.224494570928197e-05, "loss": 1.3465, "num_input_tokens_seen": 63805088, "step": 3966 }, { "epoch": 0.27788139080791896, "grad_norm": 7.2095184326171875, "learning_rate": 7.223794746059545e-05, "loss": 1.0276, "num_input_tokens_seen": 63821472, "step": 3967 }, { "epoch": 0.2779514390536482, "grad_norm": 3.9144251346588135, "learning_rate": 7.223094921190893e-05, "loss": 0.9954, "num_input_tokens_seen": 63837048, "step": 3968 }, { "epoch": 0.2780214872993774, "grad_norm": 4.380809783935547, "learning_rate": 7.222395096322242e-05, "loss": 0.9757, "num_input_tokens_seen": 63852872, "step": 3969 }, { "epoch": 0.2780915355451067, "grad_norm": 3.637685537338257, "learning_rate": 7.22169527145359e-05, "loss": 1.0264, "num_input_tokens_seen": 63868864, "step": 3970 }, { "epoch": 0.27816158379083594, "grad_norm": 4.742129802703857, "learning_rate": 7.220995446584939e-05, "loss": 1.2344, "num_input_tokens_seen": 63885248, "step": 3971 }, { "epoch": 0.2782316320365652, "grad_norm": 4.7221269607543945, "learning_rate": 7.220295621716289e-05, "loss": 1.1001, "num_input_tokens_seen": 63901632, "step": 3972 }, { "epoch": 0.2783016802822944, "grad_norm": 3.6607449054718018, "learning_rate": 7.219595796847636e-05, "loss": 1.1179, "num_input_tokens_seen": 63917688, "step": 3973 }, { "epoch": 0.27837172852802367, "grad_norm": 4.264851093292236, "learning_rate": 7.218895971978984e-05, "loss": 1.0158, "num_input_tokens_seen": 63934072, "step": 3974 }, { "epoch": 0.2784417767737529, "grad_norm": 5.0043511390686035, "learning_rate": 7.218196147110333e-05, "loss": 1.0359, "num_input_tokens_seen": 63950200, "step": 3975 }, { "epoch": 0.2785118250194822, "grad_norm": 4.323488235473633, "learning_rate": 7.217496322241682e-05, "loss": 1.1791, "num_input_tokens_seen": 63966584, "step": 3976 }, { "epoch": 0.2785818732652114, "grad_norm": 6.721888065338135, "learning_rate": 7.216796497373029e-05, "loss": 0.9446, "num_input_tokens_seen": 63982440, "step": 3977 }, { "epoch": 0.27865192151094065, "grad_norm": 6.3528289794921875, "learning_rate": 7.21609667250438e-05, "loss": 1.1506, "num_input_tokens_seen": 63998824, "step": 3978 }, { "epoch": 0.2787219697566699, "grad_norm": 5.293467044830322, "learning_rate": 7.215396847635728e-05, "loss": 1.2791, "num_input_tokens_seen": 64014984, "step": 3979 }, { "epoch": 0.27879201800239917, "grad_norm": 3.8228442668914795, "learning_rate": 7.214697022767076e-05, "loss": 1.1086, "num_input_tokens_seen": 64031080, "step": 3980 }, { "epoch": 0.27886206624812837, "grad_norm": 3.8407061100006104, "learning_rate": 7.213997197898425e-05, "loss": 1.1772, "num_input_tokens_seen": 64046416, "step": 3981 }, { "epoch": 0.27893211449385763, "grad_norm": 3.9471728801727295, "learning_rate": 7.213297373029773e-05, "loss": 0.9394, "num_input_tokens_seen": 64062784, "step": 3982 }, { "epoch": 0.2790021627395869, "grad_norm": 4.1796722412109375, "learning_rate": 7.212597548161121e-05, "loss": 0.9966, "num_input_tokens_seen": 64077504, "step": 3983 }, { "epoch": 0.27907221098531615, "grad_norm": 3.78998064994812, "learning_rate": 7.21189772329247e-05, "loss": 1.1219, "num_input_tokens_seen": 64093888, "step": 3984 }, { "epoch": 0.2791422592310454, "grad_norm": 3.383371591567993, "learning_rate": 7.211197898423819e-05, "loss": 0.8832, "num_input_tokens_seen": 64110272, "step": 3985 }, { "epoch": 0.2792123074767746, "grad_norm": 3.6502346992492676, "learning_rate": 7.210498073555168e-05, "loss": 1.0114, "num_input_tokens_seen": 64125464, "step": 3986 }, { "epoch": 0.27928235572250387, "grad_norm": 3.9421629905700684, "learning_rate": 7.209798248686515e-05, "loss": 1.1305, "num_input_tokens_seen": 64141848, "step": 3987 }, { "epoch": 0.27935240396823313, "grad_norm": 4.40875244140625, "learning_rate": 7.209098423817864e-05, "loss": 0.9603, "num_input_tokens_seen": 64158232, "step": 3988 }, { "epoch": 0.2794224522139624, "grad_norm": 5.909340858459473, "learning_rate": 7.208398598949213e-05, "loss": 1.121, "num_input_tokens_seen": 64174616, "step": 3989 }, { "epoch": 0.2794925004596916, "grad_norm": 4.548187732696533, "learning_rate": 7.20769877408056e-05, "loss": 0.9575, "num_input_tokens_seen": 64191000, "step": 3990 }, { "epoch": 0.27956254870542085, "grad_norm": 4.1479926109313965, "learning_rate": 7.206998949211909e-05, "loss": 1.1205, "num_input_tokens_seen": 64207384, "step": 3991 }, { "epoch": 0.2796325969511501, "grad_norm": 4.229100227355957, "learning_rate": 7.206299124343259e-05, "loss": 1.0597, "num_input_tokens_seen": 64223304, "step": 3992 }, { "epoch": 0.2797026451968794, "grad_norm": 7.431615352630615, "learning_rate": 7.205599299474607e-05, "loss": 1.2793, "num_input_tokens_seen": 64239688, "step": 3993 }, { "epoch": 0.2797726934426086, "grad_norm": 4.346622943878174, "learning_rate": 7.204899474605954e-05, "loss": 1.2385, "num_input_tokens_seen": 64255752, "step": 3994 }, { "epoch": 0.27984274168833784, "grad_norm": 3.771306276321411, "learning_rate": 7.204199649737303e-05, "loss": 1.0211, "num_input_tokens_seen": 64271760, "step": 3995 }, { "epoch": 0.2799127899340671, "grad_norm": 4.411479473114014, "learning_rate": 7.203499824868652e-05, "loss": 1.3369, "num_input_tokens_seen": 64288144, "step": 3996 }, { "epoch": 0.27998283817979636, "grad_norm": 4.591271877288818, "learning_rate": 7.2028e-05, "loss": 1.1021, "num_input_tokens_seen": 64304528, "step": 3997 }, { "epoch": 0.28005288642552556, "grad_norm": 3.88271427154541, "learning_rate": 7.20210017513135e-05, "loss": 0.9181, "num_input_tokens_seen": 64320912, "step": 3998 }, { "epoch": 0.2801229346712548, "grad_norm": 10.80846118927002, "learning_rate": 7.201400350262699e-05, "loss": 1.0922, "num_input_tokens_seen": 64337296, "step": 3999 }, { "epoch": 0.2801929829169841, "grad_norm": 3.7112953662872314, "learning_rate": 7.200700525394046e-05, "loss": 1.0157, "num_input_tokens_seen": 64353680, "step": 4000 }, { "epoch": 0.2801929829169841, "eval_loss": 1.1334750652313232, "eval_runtime": 0.1958, "eval_samples_per_second": 5.106, "eval_steps_per_second": 5.106, "num_input_tokens_seen": 64353680, "step": 4000 }, { "epoch": 0.28026303116271334, "grad_norm": 7.529544830322266, "learning_rate": 7.200000700525394e-05, "loss": 1.1264, "num_input_tokens_seen": 64368408, "step": 4001 }, { "epoch": 0.28033307940844254, "grad_norm": 3.761939764022827, "learning_rate": 7.199300875656742e-05, "loss": 1.1027, "num_input_tokens_seen": 64384792, "step": 4002 }, { "epoch": 0.2804031276541718, "grad_norm": 4.091811656951904, "learning_rate": 7.198601050788091e-05, "loss": 1.0368, "num_input_tokens_seen": 64400520, "step": 4003 }, { "epoch": 0.28047317589990106, "grad_norm": 5.5972795486450195, "learning_rate": 7.19790122591944e-05, "loss": 1.0957, "num_input_tokens_seen": 64416904, "step": 4004 }, { "epoch": 0.2805432241456303, "grad_norm": 3.4631423950195312, "learning_rate": 7.197201401050789e-05, "loss": 0.9517, "num_input_tokens_seen": 64432168, "step": 4005 }, { "epoch": 0.2806132723913595, "grad_norm": 6.3156938552856445, "learning_rate": 7.196501576182138e-05, "loss": 1.0554, "num_input_tokens_seen": 64447752, "step": 4006 }, { "epoch": 0.2806833206370888, "grad_norm": 10.07819652557373, "learning_rate": 7.195801751313485e-05, "loss": 0.99, "num_input_tokens_seen": 64464136, "step": 4007 }, { "epoch": 0.28075336888281804, "grad_norm": 4.695057392120361, "learning_rate": 7.195101926444834e-05, "loss": 0.9745, "num_input_tokens_seen": 64480520, "step": 4008 }, { "epoch": 0.2808234171285473, "grad_norm": 4.74672269821167, "learning_rate": 7.194402101576183e-05, "loss": 1.0648, "num_input_tokens_seen": 64496904, "step": 4009 }, { "epoch": 0.2808934653742765, "grad_norm": 3.834928512573242, "learning_rate": 7.19370227670753e-05, "loss": 1.0163, "num_input_tokens_seen": 64513288, "step": 4010 }, { "epoch": 0.28096351362000577, "grad_norm": 4.1937103271484375, "learning_rate": 7.19300245183888e-05, "loss": 1.1351, "num_input_tokens_seen": 64528992, "step": 4011 }, { "epoch": 0.281033561865735, "grad_norm": 4.1531243324279785, "learning_rate": 7.192302626970228e-05, "loss": 1.0835, "num_input_tokens_seen": 64544776, "step": 4012 }, { "epoch": 0.2811036101114643, "grad_norm": 5.006285190582275, "learning_rate": 7.191602802101577e-05, "loss": 1.1282, "num_input_tokens_seen": 64560944, "step": 4013 }, { "epoch": 0.2811736583571935, "grad_norm": 3.433964252471924, "learning_rate": 7.190902977232925e-05, "loss": 1.1164, "num_input_tokens_seen": 64577328, "step": 4014 }, { "epoch": 0.28124370660292275, "grad_norm": 6.165640354156494, "learning_rate": 7.190203152364274e-05, "loss": 0.9713, "num_input_tokens_seen": 64593672, "step": 4015 }, { "epoch": 0.281313754848652, "grad_norm": 6.037381649017334, "learning_rate": 7.189503327495622e-05, "loss": 1.032, "num_input_tokens_seen": 64610056, "step": 4016 }, { "epoch": 0.28138380309438127, "grad_norm": 4.2639923095703125, "learning_rate": 7.18880350262697e-05, "loss": 1.1842, "num_input_tokens_seen": 64625864, "step": 4017 }, { "epoch": 0.28145385134011053, "grad_norm": 3.8862967491149902, "learning_rate": 7.188103677758319e-05, "loss": 0.9448, "num_input_tokens_seen": 64642248, "step": 4018 }, { "epoch": 0.28152389958583973, "grad_norm": 3.9584991931915283, "learning_rate": 7.187403852889669e-05, "loss": 0.9602, "num_input_tokens_seen": 64658632, "step": 4019 }, { "epoch": 0.281593947831569, "grad_norm": 6.037077903747559, "learning_rate": 7.186704028021017e-05, "loss": 1.0913, "num_input_tokens_seen": 64675016, "step": 4020 }, { "epoch": 0.28166399607729825, "grad_norm": 3.750059127807617, "learning_rate": 7.186004203152364e-05, "loss": 1.1294, "num_input_tokens_seen": 64691400, "step": 4021 }, { "epoch": 0.2817340443230275, "grad_norm": 4.364743709564209, "learning_rate": 7.185304378283713e-05, "loss": 1.0983, "num_input_tokens_seen": 64706512, "step": 4022 }, { "epoch": 0.2818040925687567, "grad_norm": 3.463717460632324, "learning_rate": 7.184604553415062e-05, "loss": 1.0773, "num_input_tokens_seen": 64722688, "step": 4023 }, { "epoch": 0.281874140814486, "grad_norm": 3.939438819885254, "learning_rate": 7.18390472854641e-05, "loss": 1.3356, "num_input_tokens_seen": 64738856, "step": 4024 }, { "epoch": 0.28194418906021523, "grad_norm": 3.813849687576294, "learning_rate": 7.18320490367776e-05, "loss": 1.0521, "num_input_tokens_seen": 64755240, "step": 4025 }, { "epoch": 0.2820142373059445, "grad_norm": 3.5874619483947754, "learning_rate": 7.182505078809108e-05, "loss": 1.0328, "num_input_tokens_seen": 64771184, "step": 4026 }, { "epoch": 0.2820842855516737, "grad_norm": 4.544376850128174, "learning_rate": 7.181805253940456e-05, "loss": 1.1132, "num_input_tokens_seen": 64787568, "step": 4027 }, { "epoch": 0.28215433379740296, "grad_norm": 3.6816799640655518, "learning_rate": 7.181105429071803e-05, "loss": 1.1088, "num_input_tokens_seen": 64803064, "step": 4028 }, { "epoch": 0.2822243820431322, "grad_norm": 7.1433939933776855, "learning_rate": 7.180405604203152e-05, "loss": 1.0069, "num_input_tokens_seen": 64818736, "step": 4029 }, { "epoch": 0.2822944302888615, "grad_norm": 4.308315753936768, "learning_rate": 7.179705779334501e-05, "loss": 1.1992, "num_input_tokens_seen": 64834848, "step": 4030 }, { "epoch": 0.2823644785345907, "grad_norm": 4.985830783843994, "learning_rate": 7.17900595446585e-05, "loss": 1.1996, "num_input_tokens_seen": 64851224, "step": 4031 }, { "epoch": 0.28243452678031994, "grad_norm": 4.884370803833008, "learning_rate": 7.178306129597199e-05, "loss": 1.0541, "num_input_tokens_seen": 64867608, "step": 4032 }, { "epoch": 0.2825045750260492, "grad_norm": 4.335781097412109, "learning_rate": 7.177606304728548e-05, "loss": 1.0596, "num_input_tokens_seen": 64883840, "step": 4033 }, { "epoch": 0.28257462327177846, "grad_norm": 3.729811191558838, "learning_rate": 7.176906479859895e-05, "loss": 1.0167, "num_input_tokens_seen": 64899872, "step": 4034 }, { "epoch": 0.28264467151750766, "grad_norm": 3.7386136054992676, "learning_rate": 7.176206654991244e-05, "loss": 0.7835, "num_input_tokens_seen": 64916256, "step": 4035 }, { "epoch": 0.2827147197632369, "grad_norm": 3.8022067546844482, "learning_rate": 7.175506830122593e-05, "loss": 1.0571, "num_input_tokens_seen": 64932640, "step": 4036 }, { "epoch": 0.2827847680089662, "grad_norm": 4.713296890258789, "learning_rate": 7.17480700525394e-05, "loss": 1.2877, "num_input_tokens_seen": 64948520, "step": 4037 }, { "epoch": 0.28285481625469544, "grad_norm": 3.682568073272705, "learning_rate": 7.174107180385289e-05, "loss": 1.0193, "num_input_tokens_seen": 64964904, "step": 4038 }, { "epoch": 0.28292486450042464, "grad_norm": 4.533677101135254, "learning_rate": 7.173407355516638e-05, "loss": 1.133, "num_input_tokens_seen": 64981288, "step": 4039 }, { "epoch": 0.2829949127461539, "grad_norm": 4.343021392822266, "learning_rate": 7.172707530647987e-05, "loss": 1.2843, "num_input_tokens_seen": 64997640, "step": 4040 }, { "epoch": 0.28306496099188316, "grad_norm": 4.942739009857178, "learning_rate": 7.172007705779334e-05, "loss": 1.1391, "num_input_tokens_seen": 65012456, "step": 4041 }, { "epoch": 0.2831350092376124, "grad_norm": 6.1112213134765625, "learning_rate": 7.171307880910683e-05, "loss": 1.0135, "num_input_tokens_seen": 65028840, "step": 4042 }, { "epoch": 0.2832050574833416, "grad_norm": 4.650609016418457, "learning_rate": 7.170608056042032e-05, "loss": 0.8585, "num_input_tokens_seen": 65044608, "step": 4043 }, { "epoch": 0.2832751057290709, "grad_norm": 5.383882522583008, "learning_rate": 7.169908231173381e-05, "loss": 1.3442, "num_input_tokens_seen": 65060992, "step": 4044 }, { "epoch": 0.28334515397480015, "grad_norm": 3.569399118423462, "learning_rate": 7.169208406304729e-05, "loss": 1.0645, "num_input_tokens_seen": 65077096, "step": 4045 }, { "epoch": 0.2834152022205294, "grad_norm": 5.199350833892822, "learning_rate": 7.168508581436079e-05, "loss": 1.0178, "num_input_tokens_seen": 65093480, "step": 4046 }, { "epoch": 0.2834852504662586, "grad_norm": 4.172554969787598, "learning_rate": 7.167808756567426e-05, "loss": 1.1721, "num_input_tokens_seen": 65109864, "step": 4047 }, { "epoch": 0.28355529871198787, "grad_norm": 3.822197437286377, "learning_rate": 7.167108931698774e-05, "loss": 0.9076, "num_input_tokens_seen": 65126248, "step": 4048 }, { "epoch": 0.28362534695771713, "grad_norm": 3.8899435997009277, "learning_rate": 7.166409106830123e-05, "loss": 1.1228, "num_input_tokens_seen": 65141984, "step": 4049 }, { "epoch": 0.2836953952034464, "grad_norm": 4.559451580047607, "learning_rate": 7.165709281961471e-05, "loss": 1.0732, "num_input_tokens_seen": 65157984, "step": 4050 }, { "epoch": 0.28376544344917565, "grad_norm": 5.253831386566162, "learning_rate": 7.16500945709282e-05, "loss": 1.1104, "num_input_tokens_seen": 65174040, "step": 4051 }, { "epoch": 0.28383549169490485, "grad_norm": 3.827268123626709, "learning_rate": 7.164309632224169e-05, "loss": 1.0689, "num_input_tokens_seen": 65190424, "step": 4052 }, { "epoch": 0.2839055399406341, "grad_norm": 4.432236194610596, "learning_rate": 7.163609807355518e-05, "loss": 0.9357, "num_input_tokens_seen": 65206808, "step": 4053 }, { "epoch": 0.28397558818636337, "grad_norm": 5.008002281188965, "learning_rate": 7.162909982486866e-05, "loss": 1.1584, "num_input_tokens_seen": 65222744, "step": 4054 }, { "epoch": 0.28404563643209263, "grad_norm": 3.748089551925659, "learning_rate": 7.162210157618213e-05, "loss": 1.0242, "num_input_tokens_seen": 65238592, "step": 4055 }, { "epoch": 0.28411568467782183, "grad_norm": 4.073843002319336, "learning_rate": 7.161510332749562e-05, "loss": 1.0629, "num_input_tokens_seen": 65254464, "step": 4056 }, { "epoch": 0.2841857329235511, "grad_norm": 4.0271100997924805, "learning_rate": 7.160810507880911e-05, "loss": 1.0191, "num_input_tokens_seen": 65269744, "step": 4057 }, { "epoch": 0.28425578116928035, "grad_norm": 4.266842365264893, "learning_rate": 7.16011068301226e-05, "loss": 1.0061, "num_input_tokens_seen": 65286128, "step": 4058 }, { "epoch": 0.2843258294150096, "grad_norm": 3.4473531246185303, "learning_rate": 7.159410858143608e-05, "loss": 0.8837, "num_input_tokens_seen": 65301864, "step": 4059 }, { "epoch": 0.2843958776607388, "grad_norm": 3.717029333114624, "learning_rate": 7.158711033274957e-05, "loss": 1.0704, "num_input_tokens_seen": 65317880, "step": 4060 }, { "epoch": 0.2844659259064681, "grad_norm": 4.008082866668701, "learning_rate": 7.158011208406305e-05, "loss": 1.0322, "num_input_tokens_seen": 65334096, "step": 4061 }, { "epoch": 0.28453597415219734, "grad_norm": 5.350658893585205, "learning_rate": 7.157311383537654e-05, "loss": 1.1277, "num_input_tokens_seen": 65348288, "step": 4062 }, { "epoch": 0.2846060223979266, "grad_norm": 8.911882400512695, "learning_rate": 7.156611558669003e-05, "loss": 1.0978, "num_input_tokens_seen": 65364672, "step": 4063 }, { "epoch": 0.2846760706436558, "grad_norm": 4.207833766937256, "learning_rate": 7.155911733800351e-05, "loss": 1.1248, "num_input_tokens_seen": 65380600, "step": 4064 }, { "epoch": 0.28474611888938506, "grad_norm": 3.492713689804077, "learning_rate": 7.155211908931699e-05, "loss": 0.9513, "num_input_tokens_seen": 65396920, "step": 4065 }, { "epoch": 0.2848161671351143, "grad_norm": 3.866763114929199, "learning_rate": 7.154512084063048e-05, "loss": 0.9899, "num_input_tokens_seen": 65413136, "step": 4066 }, { "epoch": 0.2848862153808436, "grad_norm": 4.352143287658691, "learning_rate": 7.153812259194397e-05, "loss": 1.097, "num_input_tokens_seen": 65428368, "step": 4067 }, { "epoch": 0.2849562636265728, "grad_norm": 5.335500717163086, "learning_rate": 7.153112434325744e-05, "loss": 1.1697, "num_input_tokens_seen": 65444752, "step": 4068 }, { "epoch": 0.28502631187230204, "grad_norm": 3.7467970848083496, "learning_rate": 7.152412609457093e-05, "loss": 0.9655, "num_input_tokens_seen": 65461136, "step": 4069 }, { "epoch": 0.2850963601180313, "grad_norm": 3.410472869873047, "learning_rate": 7.151712784588442e-05, "loss": 0.8464, "num_input_tokens_seen": 65477520, "step": 4070 }, { "epoch": 0.28516640836376056, "grad_norm": 6.551929950714111, "learning_rate": 7.151012959719791e-05, "loss": 1.0369, "num_input_tokens_seen": 65493904, "step": 4071 }, { "epoch": 0.28523645660948976, "grad_norm": 3.4140212535858154, "learning_rate": 7.150313134851138e-05, "loss": 1.0508, "num_input_tokens_seen": 65510288, "step": 4072 }, { "epoch": 0.285306504855219, "grad_norm": 4.227553367614746, "learning_rate": 7.149613309982488e-05, "loss": 1.0793, "num_input_tokens_seen": 65526672, "step": 4073 }, { "epoch": 0.2853765531009483, "grad_norm": 4.202794551849365, "learning_rate": 7.148913485113836e-05, "loss": 1.1393, "num_input_tokens_seen": 65542456, "step": 4074 }, { "epoch": 0.28544660134667754, "grad_norm": 5.172013759613037, "learning_rate": 7.148213660245183e-05, "loss": 1.2451, "num_input_tokens_seen": 65558384, "step": 4075 }, { "epoch": 0.28551664959240675, "grad_norm": 3.716113567352295, "learning_rate": 7.147513835376532e-05, "loss": 0.8515, "num_input_tokens_seen": 65574768, "step": 4076 }, { "epoch": 0.285586697838136, "grad_norm": 8.10258674621582, "learning_rate": 7.146814010507881e-05, "loss": 1.0737, "num_input_tokens_seen": 65590632, "step": 4077 }, { "epoch": 0.28565674608386527, "grad_norm": 3.649273157119751, "learning_rate": 7.14611418563923e-05, "loss": 1.0376, "num_input_tokens_seen": 65607016, "step": 4078 }, { "epoch": 0.2857267943295945, "grad_norm": 4.202502250671387, "learning_rate": 7.145414360770579e-05, "loss": 1.1102, "num_input_tokens_seen": 65622856, "step": 4079 }, { "epoch": 0.28579684257532373, "grad_norm": 4.027415752410889, "learning_rate": 7.144714535901928e-05, "loss": 1.26, "num_input_tokens_seen": 65639240, "step": 4080 }, { "epoch": 0.285866890821053, "grad_norm": 4.549161434173584, "learning_rate": 7.144014711033275e-05, "loss": 1.1598, "num_input_tokens_seen": 65655624, "step": 4081 }, { "epoch": 0.28593693906678225, "grad_norm": 4.43501615524292, "learning_rate": 7.143314886164623e-05, "loss": 1.0735, "num_input_tokens_seen": 65671016, "step": 4082 }, { "epoch": 0.2860069873125115, "grad_norm": 3.739610433578491, "learning_rate": 7.142615061295972e-05, "loss": 1.0321, "num_input_tokens_seen": 65687072, "step": 4083 }, { "epoch": 0.2860770355582407, "grad_norm": 3.725759506225586, "learning_rate": 7.14191523642732e-05, "loss": 1.0712, "num_input_tokens_seen": 65703456, "step": 4084 }, { "epoch": 0.28614708380396997, "grad_norm": 3.706056594848633, "learning_rate": 7.14121541155867e-05, "loss": 1.0643, "num_input_tokens_seen": 65719552, "step": 4085 }, { "epoch": 0.28621713204969923, "grad_norm": 4.971164703369141, "learning_rate": 7.140515586690018e-05, "loss": 1.2084, "num_input_tokens_seen": 65735936, "step": 4086 }, { "epoch": 0.2862871802954285, "grad_norm": 7.377131938934326, "learning_rate": 7.139815761821367e-05, "loss": 0.8867, "num_input_tokens_seen": 65752320, "step": 4087 }, { "epoch": 0.28635722854115775, "grad_norm": 4.293169975280762, "learning_rate": 7.139115936952715e-05, "loss": 1.0805, "num_input_tokens_seen": 65768704, "step": 4088 }, { "epoch": 0.28642727678688695, "grad_norm": 3.4757955074310303, "learning_rate": 7.138416112084063e-05, "loss": 0.9749, "num_input_tokens_seen": 65785088, "step": 4089 }, { "epoch": 0.2864973250326162, "grad_norm": 4.5705695152282715, "learning_rate": 7.137716287215412e-05, "loss": 1.209, "num_input_tokens_seen": 65801472, "step": 4090 }, { "epoch": 0.28656737327834547, "grad_norm": 5.240487575531006, "learning_rate": 7.137016462346761e-05, "loss": 0.9684, "num_input_tokens_seen": 65817856, "step": 4091 }, { "epoch": 0.28663742152407473, "grad_norm": 3.7815425395965576, "learning_rate": 7.136316637478109e-05, "loss": 0.9431, "num_input_tokens_seen": 65833872, "step": 4092 }, { "epoch": 0.28670746976980394, "grad_norm": 5.411090850830078, "learning_rate": 7.135616812609457e-05, "loss": 1.1237, "num_input_tokens_seen": 65849064, "step": 4093 }, { "epoch": 0.2867775180155332, "grad_norm": 4.07004451751709, "learning_rate": 7.134916987740806e-05, "loss": 1.0168, "num_input_tokens_seen": 65865448, "step": 4094 }, { "epoch": 0.28684756626126245, "grad_norm": 3.636051893234253, "learning_rate": 7.134217162872154e-05, "loss": 0.9363, "num_input_tokens_seen": 65881320, "step": 4095 }, { "epoch": 0.2869176145069917, "grad_norm": 4.265620708465576, "learning_rate": 7.133517338003503e-05, "loss": 1.2098, "num_input_tokens_seen": 65896832, "step": 4096 }, { "epoch": 0.2869876627527209, "grad_norm": 4.145105838775635, "learning_rate": 7.132817513134852e-05, "loss": 0.9785, "num_input_tokens_seen": 65912960, "step": 4097 }, { "epoch": 0.2870577109984502, "grad_norm": 3.6198408603668213, "learning_rate": 7.1321176882662e-05, "loss": 1.0276, "num_input_tokens_seen": 65929344, "step": 4098 }, { "epoch": 0.28712775924417944, "grad_norm": 4.000823497772217, "learning_rate": 7.131417863397548e-05, "loss": 1.2109, "num_input_tokens_seen": 65945480, "step": 4099 }, { "epoch": 0.2871978074899087, "grad_norm": 4.2647271156311035, "learning_rate": 7.130718038528898e-05, "loss": 1.1588, "num_input_tokens_seen": 65961672, "step": 4100 }, { "epoch": 0.2872678557356379, "grad_norm": 4.704364776611328, "learning_rate": 7.130018213660246e-05, "loss": 1.0707, "num_input_tokens_seen": 65976848, "step": 4101 }, { "epoch": 0.28733790398136716, "grad_norm": 3.8795642852783203, "learning_rate": 7.129318388791593e-05, "loss": 1.0087, "num_input_tokens_seen": 65993120, "step": 4102 }, { "epoch": 0.2874079522270964, "grad_norm": 4.356956958770752, "learning_rate": 7.128618563922942e-05, "loss": 1.4218, "num_input_tokens_seen": 66008448, "step": 4103 }, { "epoch": 0.2874780004728257, "grad_norm": 3.5145177841186523, "learning_rate": 7.127918739054291e-05, "loss": 0.9055, "num_input_tokens_seen": 66024712, "step": 4104 }, { "epoch": 0.2875480487185549, "grad_norm": 3.7384872436523438, "learning_rate": 7.12721891418564e-05, "loss": 1.0574, "num_input_tokens_seen": 66041096, "step": 4105 }, { "epoch": 0.28761809696428414, "grad_norm": 3.9706084728240967, "learning_rate": 7.126519089316989e-05, "loss": 1.1538, "num_input_tokens_seen": 66056880, "step": 4106 }, { "epoch": 0.2876881452100134, "grad_norm": 3.692093849182129, "learning_rate": 7.125819264448337e-05, "loss": 0.9421, "num_input_tokens_seen": 66073264, "step": 4107 }, { "epoch": 0.28775819345574266, "grad_norm": 4.967808246612549, "learning_rate": 7.125119439579685e-05, "loss": 0.8829, "num_input_tokens_seen": 66089648, "step": 4108 }, { "epoch": 0.28782824170147187, "grad_norm": 3.8627805709838867, "learning_rate": 7.124419614711032e-05, "loss": 1.1056, "num_input_tokens_seen": 66105992, "step": 4109 }, { "epoch": 0.2878982899472011, "grad_norm": 3.7407474517822266, "learning_rate": 7.123719789842381e-05, "loss": 1.0241, "num_input_tokens_seen": 66122040, "step": 4110 }, { "epoch": 0.2879683381929304, "grad_norm": 4.028223514556885, "learning_rate": 7.123019964973732e-05, "loss": 1.161, "num_input_tokens_seen": 66138056, "step": 4111 }, { "epoch": 0.28803838643865964, "grad_norm": 4.248149394989014, "learning_rate": 7.122320140105079e-05, "loss": 1.083, "num_input_tokens_seen": 66154384, "step": 4112 }, { "epoch": 0.28810843468438885, "grad_norm": 3.49904465675354, "learning_rate": 7.121620315236428e-05, "loss": 1.0217, "num_input_tokens_seen": 66170016, "step": 4113 }, { "epoch": 0.2881784829301181, "grad_norm": 5.039339542388916, "learning_rate": 7.120920490367777e-05, "loss": 0.8658, "num_input_tokens_seen": 66185744, "step": 4114 }, { "epoch": 0.28824853117584737, "grad_norm": 3.800870656967163, "learning_rate": 7.120220665499124e-05, "loss": 1.1031, "num_input_tokens_seen": 66202128, "step": 4115 }, { "epoch": 0.2883185794215766, "grad_norm": 4.8073530197143555, "learning_rate": 7.119520840630473e-05, "loss": 1.1191, "num_input_tokens_seen": 66217840, "step": 4116 }, { "epoch": 0.28838862766730583, "grad_norm": 3.495415210723877, "learning_rate": 7.118821015761822e-05, "loss": 0.8693, "num_input_tokens_seen": 66234224, "step": 4117 }, { "epoch": 0.2884586759130351, "grad_norm": 4.46912956237793, "learning_rate": 7.118121190893171e-05, "loss": 1.2077, "num_input_tokens_seen": 66249968, "step": 4118 }, { "epoch": 0.28852872415876435, "grad_norm": 4.553129196166992, "learning_rate": 7.117421366024518e-05, "loss": 1.1039, "num_input_tokens_seen": 66265304, "step": 4119 }, { "epoch": 0.2885987724044936, "grad_norm": 3.713836193084717, "learning_rate": 7.116721541155867e-05, "loss": 1.0833, "num_input_tokens_seen": 66281680, "step": 4120 }, { "epoch": 0.28866882065022287, "grad_norm": 3.9745819568634033, "learning_rate": 7.116021716287216e-05, "loss": 1.1524, "num_input_tokens_seen": 66298064, "step": 4121 }, { "epoch": 0.28873886889595207, "grad_norm": 6.237453937530518, "learning_rate": 7.115321891418564e-05, "loss": 1.3598, "num_input_tokens_seen": 66314448, "step": 4122 }, { "epoch": 0.28880891714168133, "grad_norm": 3.7947497367858887, "learning_rate": 7.114622066549912e-05, "loss": 0.9342, "num_input_tokens_seen": 66330832, "step": 4123 }, { "epoch": 0.2888789653874106, "grad_norm": 5.574815273284912, "learning_rate": 7.113922241681261e-05, "loss": 1.1212, "num_input_tokens_seen": 66347216, "step": 4124 }, { "epoch": 0.28894901363313985, "grad_norm": 3.538344144821167, "learning_rate": 7.11322241681261e-05, "loss": 1.0205, "num_input_tokens_seen": 66363352, "step": 4125 }, { "epoch": 0.28901906187886905, "grad_norm": 3.792769193649292, "learning_rate": 7.112522591943958e-05, "loss": 1.1266, "num_input_tokens_seen": 66379736, "step": 4126 }, { "epoch": 0.2890891101245983, "grad_norm": 4.527935981750488, "learning_rate": 7.111822767075308e-05, "loss": 1.0124, "num_input_tokens_seen": 66396120, "step": 4127 }, { "epoch": 0.2891591583703276, "grad_norm": 3.753326416015625, "learning_rate": 7.111122942206655e-05, "loss": 0.9993, "num_input_tokens_seen": 66412424, "step": 4128 }, { "epoch": 0.28922920661605683, "grad_norm": 4.310519218444824, "learning_rate": 7.110423117338003e-05, "loss": 1.0481, "num_input_tokens_seen": 66428176, "step": 4129 }, { "epoch": 0.28929925486178604, "grad_norm": 3.9848945140838623, "learning_rate": 7.109723292469352e-05, "loss": 1.2687, "num_input_tokens_seen": 66444560, "step": 4130 }, { "epoch": 0.2893693031075153, "grad_norm": 4.654316425323486, "learning_rate": 7.109023467600702e-05, "loss": 1.0025, "num_input_tokens_seen": 66460944, "step": 4131 }, { "epoch": 0.28943935135324456, "grad_norm": 4.566670894622803, "learning_rate": 7.10832364273205e-05, "loss": 0.9224, "num_input_tokens_seen": 66475928, "step": 4132 }, { "epoch": 0.2895093995989738, "grad_norm": 4.4292988777160645, "learning_rate": 7.107623817863398e-05, "loss": 1.0922, "num_input_tokens_seen": 66491904, "step": 4133 }, { "epoch": 0.289579447844703, "grad_norm": 6.520173072814941, "learning_rate": 7.106923992994747e-05, "loss": 0.9938, "num_input_tokens_seen": 66507256, "step": 4134 }, { "epoch": 0.2896494960904323, "grad_norm": 3.8424220085144043, "learning_rate": 7.106224168126095e-05, "loss": 1.0857, "num_input_tokens_seen": 66522736, "step": 4135 }, { "epoch": 0.28971954433616154, "grad_norm": 4.742796897888184, "learning_rate": 7.105524343257442e-05, "loss": 1.0296, "num_input_tokens_seen": 66538480, "step": 4136 }, { "epoch": 0.2897895925818908, "grad_norm": 3.552365779876709, "learning_rate": 7.104824518388792e-05, "loss": 1.0597, "num_input_tokens_seen": 66554576, "step": 4137 }, { "epoch": 0.28985964082762, "grad_norm": 6.649835109710693, "learning_rate": 7.104124693520141e-05, "loss": 0.9729, "num_input_tokens_seen": 66570000, "step": 4138 }, { "epoch": 0.28992968907334926, "grad_norm": 3.9890356063842773, "learning_rate": 7.103424868651489e-05, "loss": 0.9774, "num_input_tokens_seen": 66585640, "step": 4139 }, { "epoch": 0.2899997373190785, "grad_norm": 3.80637526512146, "learning_rate": 7.102725043782838e-05, "loss": 1.0373, "num_input_tokens_seen": 66601696, "step": 4140 }, { "epoch": 0.2900697855648078, "grad_norm": 4.089916706085205, "learning_rate": 7.102025218914186e-05, "loss": 1.0919, "num_input_tokens_seen": 66618080, "step": 4141 }, { "epoch": 0.290139833810537, "grad_norm": 3.2609710693359375, "learning_rate": 7.101325394045534e-05, "loss": 0.9409, "num_input_tokens_seen": 66634216, "step": 4142 }, { "epoch": 0.29020988205626624, "grad_norm": 4.3664093017578125, "learning_rate": 7.100625569176883e-05, "loss": 0.9031, "num_input_tokens_seen": 66650600, "step": 4143 }, { "epoch": 0.2902799303019955, "grad_norm": 4.460801124572754, "learning_rate": 7.099925744308232e-05, "loss": 1.0582, "num_input_tokens_seen": 66666592, "step": 4144 }, { "epoch": 0.29034997854772476, "grad_norm": 4.474677562713623, "learning_rate": 7.09922591943958e-05, "loss": 1.0016, "num_input_tokens_seen": 66681544, "step": 4145 }, { "epoch": 0.29042002679345397, "grad_norm": 3.6482129096984863, "learning_rate": 7.098526094570928e-05, "loss": 1.0823, "num_input_tokens_seen": 66697928, "step": 4146 }, { "epoch": 0.2904900750391832, "grad_norm": 3.483290195465088, "learning_rate": 7.097826269702277e-05, "loss": 0.8853, "num_input_tokens_seen": 66714312, "step": 4147 }, { "epoch": 0.2905601232849125, "grad_norm": 4.703539848327637, "learning_rate": 7.097126444833626e-05, "loss": 0.9718, "num_input_tokens_seen": 66729632, "step": 4148 }, { "epoch": 0.29063017153064175, "grad_norm": 3.8614907264709473, "learning_rate": 7.096426619964973e-05, "loss": 1.0047, "num_input_tokens_seen": 66746016, "step": 4149 }, { "epoch": 0.29070021977637095, "grad_norm": 3.612683057785034, "learning_rate": 7.095726795096322e-05, "loss": 1.1783, "num_input_tokens_seen": 66762400, "step": 4150 }, { "epoch": 0.2907702680221002, "grad_norm": 3.980149984359741, "learning_rate": 7.095026970227672e-05, "loss": 0.9993, "num_input_tokens_seen": 66778392, "step": 4151 }, { "epoch": 0.29084031626782947, "grad_norm": 3.857588052749634, "learning_rate": 7.09432714535902e-05, "loss": 1.0506, "num_input_tokens_seen": 66794200, "step": 4152 }, { "epoch": 0.29091036451355873, "grad_norm": 5.106949806213379, "learning_rate": 7.093627320490367e-05, "loss": 1.2222, "num_input_tokens_seen": 66810584, "step": 4153 }, { "epoch": 0.29098041275928793, "grad_norm": 4.338438987731934, "learning_rate": 7.092927495621718e-05, "loss": 1.1203, "num_input_tokens_seen": 66826208, "step": 4154 }, { "epoch": 0.2910504610050172, "grad_norm": 3.962877035140991, "learning_rate": 7.092227670753065e-05, "loss": 1.1026, "num_input_tokens_seen": 66842592, "step": 4155 }, { "epoch": 0.29112050925074645, "grad_norm": 3.8490965366363525, "learning_rate": 7.091527845884413e-05, "loss": 0.9551, "num_input_tokens_seen": 66858832, "step": 4156 }, { "epoch": 0.2911905574964757, "grad_norm": 4.559625148773193, "learning_rate": 7.090828021015763e-05, "loss": 1.3951, "num_input_tokens_seen": 66875216, "step": 4157 }, { "epoch": 0.29126060574220497, "grad_norm": 8.37543773651123, "learning_rate": 7.090128196147112e-05, "loss": 1.2365, "num_input_tokens_seen": 66891600, "step": 4158 }, { "epoch": 0.2913306539879342, "grad_norm": 4.128559112548828, "learning_rate": 7.089428371278459e-05, "loss": 0.8789, "num_input_tokens_seen": 66907984, "step": 4159 }, { "epoch": 0.29140070223366343, "grad_norm": 4.81403112411499, "learning_rate": 7.088728546409808e-05, "loss": 1.1149, "num_input_tokens_seen": 66923240, "step": 4160 }, { "epoch": 0.2914707504793927, "grad_norm": 4.534300804138184, "learning_rate": 7.088028721541157e-05, "loss": 0.8906, "num_input_tokens_seen": 66939624, "step": 4161 }, { "epoch": 0.29154079872512195, "grad_norm": 4.46708869934082, "learning_rate": 7.087328896672504e-05, "loss": 0.873, "num_input_tokens_seen": 66955968, "step": 4162 }, { "epoch": 0.29161084697085116, "grad_norm": 4.142822265625, "learning_rate": 7.086629071803853e-05, "loss": 0.8286, "num_input_tokens_seen": 66971680, "step": 4163 }, { "epoch": 0.2916808952165804, "grad_norm": 3.686167001724243, "learning_rate": 7.085929246935202e-05, "loss": 0.897, "num_input_tokens_seen": 66987952, "step": 4164 }, { "epoch": 0.2917509434623097, "grad_norm": 8.076430320739746, "learning_rate": 7.085229422066551e-05, "loss": 1.1215, "num_input_tokens_seen": 67004336, "step": 4165 }, { "epoch": 0.29182099170803893, "grad_norm": 8.69857120513916, "learning_rate": 7.084529597197898e-05, "loss": 1.2295, "num_input_tokens_seen": 67020216, "step": 4166 }, { "epoch": 0.29189103995376814, "grad_norm": 3.7867684364318848, "learning_rate": 7.083829772329247e-05, "loss": 1.058, "num_input_tokens_seen": 67035600, "step": 4167 }, { "epoch": 0.2919610881994974, "grad_norm": 5.560591697692871, "learning_rate": 7.083129947460596e-05, "loss": 1.0864, "num_input_tokens_seen": 67051680, "step": 4168 }, { "epoch": 0.29203113644522666, "grad_norm": 3.857120990753174, "learning_rate": 7.082430122591944e-05, "loss": 1.1991, "num_input_tokens_seen": 67068064, "step": 4169 }, { "epoch": 0.2921011846909559, "grad_norm": 4.343360900878906, "learning_rate": 7.081730297723293e-05, "loss": 1.0973, "num_input_tokens_seen": 67084448, "step": 4170 }, { "epoch": 0.2921712329366851, "grad_norm": 4.198531150817871, "learning_rate": 7.081030472854643e-05, "loss": 1.1271, "num_input_tokens_seen": 67100832, "step": 4171 }, { "epoch": 0.2922412811824144, "grad_norm": 3.539684772491455, "learning_rate": 7.08033064798599e-05, "loss": 0.9532, "num_input_tokens_seen": 67117216, "step": 4172 }, { "epoch": 0.29231132942814364, "grad_norm": 4.2374444007873535, "learning_rate": 7.079630823117338e-05, "loss": 0.9965, "num_input_tokens_seen": 67133600, "step": 4173 }, { "epoch": 0.2923813776738729, "grad_norm": 4.106996059417725, "learning_rate": 7.078930998248687e-05, "loss": 1.1141, "num_input_tokens_seen": 67149984, "step": 4174 }, { "epoch": 0.2924514259196021, "grad_norm": 3.7100484371185303, "learning_rate": 7.078231173380035e-05, "loss": 1.0702, "num_input_tokens_seen": 67166168, "step": 4175 }, { "epoch": 0.29252147416533136, "grad_norm": 5.189118385314941, "learning_rate": 7.077531348511383e-05, "loss": 0.9642, "num_input_tokens_seen": 67181472, "step": 4176 }, { "epoch": 0.2925915224110606, "grad_norm": 4.540155410766602, "learning_rate": 7.076831523642733e-05, "loss": 1.0558, "num_input_tokens_seen": 67197856, "step": 4177 }, { "epoch": 0.2926615706567899, "grad_norm": 4.748345375061035, "learning_rate": 7.076131698774082e-05, "loss": 0.8845, "num_input_tokens_seen": 67214240, "step": 4178 }, { "epoch": 0.2927316189025191, "grad_norm": 4.252089023590088, "learning_rate": 7.07543187390543e-05, "loss": 1.1002, "num_input_tokens_seen": 67230312, "step": 4179 }, { "epoch": 0.29280166714824835, "grad_norm": 4.273370742797852, "learning_rate": 7.074732049036777e-05, "loss": 1.1759, "num_input_tokens_seen": 67246152, "step": 4180 }, { "epoch": 0.2928717153939776, "grad_norm": 3.9271481037139893, "learning_rate": 7.074032224168127e-05, "loss": 1.0159, "num_input_tokens_seen": 67261688, "step": 4181 }, { "epoch": 0.29294176363970686, "grad_norm": 3.875622034072876, "learning_rate": 7.073332399299475e-05, "loss": 1.2345, "num_input_tokens_seen": 67278072, "step": 4182 }, { "epoch": 0.29301181188543607, "grad_norm": 3.8089005947113037, "learning_rate": 7.072632574430824e-05, "loss": 1.1025, "num_input_tokens_seen": 67293760, "step": 4183 }, { "epoch": 0.29308186013116533, "grad_norm": 4.402803421020508, "learning_rate": 7.071932749562172e-05, "loss": 1.0397, "num_input_tokens_seen": 67310144, "step": 4184 }, { "epoch": 0.2931519083768946, "grad_norm": 4.4534783363342285, "learning_rate": 7.071232924693521e-05, "loss": 1.0222, "num_input_tokens_seen": 67326528, "step": 4185 }, { "epoch": 0.29322195662262385, "grad_norm": 4.247747898101807, "learning_rate": 7.070533099824869e-05, "loss": 1.0667, "num_input_tokens_seen": 67342080, "step": 4186 }, { "epoch": 0.29329200486835305, "grad_norm": 5.280468463897705, "learning_rate": 7.069833274956218e-05, "loss": 1.0492, "num_input_tokens_seen": 67357168, "step": 4187 }, { "epoch": 0.2933620531140823, "grad_norm": 5.14320707321167, "learning_rate": 7.069133450087567e-05, "loss": 1.1073, "num_input_tokens_seen": 67373552, "step": 4188 }, { "epoch": 0.29343210135981157, "grad_norm": 4.131645679473877, "learning_rate": 7.068433625218914e-05, "loss": 1.3795, "num_input_tokens_seen": 67389936, "step": 4189 }, { "epoch": 0.29350214960554083, "grad_norm": 4.727990627288818, "learning_rate": 7.067733800350263e-05, "loss": 1.2066, "num_input_tokens_seen": 67406320, "step": 4190 }, { "epoch": 0.2935721978512701, "grad_norm": 5.857666969299316, "learning_rate": 7.067033975481612e-05, "loss": 1.028, "num_input_tokens_seen": 67422680, "step": 4191 }, { "epoch": 0.2936422460969993, "grad_norm": 4.185948371887207, "learning_rate": 7.06633415061296e-05, "loss": 1.2738, "num_input_tokens_seen": 67439064, "step": 4192 }, { "epoch": 0.29371229434272855, "grad_norm": 3.749274969100952, "learning_rate": 7.065634325744308e-05, "loss": 1.0327, "num_input_tokens_seen": 67454680, "step": 4193 }, { "epoch": 0.2937823425884578, "grad_norm": 4.332368850708008, "learning_rate": 7.064934500875657e-05, "loss": 1.0986, "num_input_tokens_seen": 67470800, "step": 4194 }, { "epoch": 0.29385239083418707, "grad_norm": 5.514054775238037, "learning_rate": 7.064234676007006e-05, "loss": 1.2602, "num_input_tokens_seen": 67487184, "step": 4195 }, { "epoch": 0.2939224390799163, "grad_norm": 4.534146785736084, "learning_rate": 7.063534851138353e-05, "loss": 1.2929, "num_input_tokens_seen": 67503504, "step": 4196 }, { "epoch": 0.29399248732564554, "grad_norm": 4.86776876449585, "learning_rate": 7.062835026269702e-05, "loss": 1.111, "num_input_tokens_seen": 67519056, "step": 4197 }, { "epoch": 0.2940625355713748, "grad_norm": 3.8528504371643066, "learning_rate": 7.062135201401052e-05, "loss": 0.9151, "num_input_tokens_seen": 67535440, "step": 4198 }, { "epoch": 0.29413258381710405, "grad_norm": 4.244069576263428, "learning_rate": 7.0614353765324e-05, "loss": 1.1733, "num_input_tokens_seen": 67551264, "step": 4199 }, { "epoch": 0.29420263206283326, "grad_norm": 3.5963211059570312, "learning_rate": 7.060735551663747e-05, "loss": 1.008, "num_input_tokens_seen": 67567648, "step": 4200 }, { "epoch": 0.29420263206283326, "eval_loss": 1.1331984996795654, "eval_runtime": 0.203, "eval_samples_per_second": 4.927, "eval_steps_per_second": 4.927, "num_input_tokens_seen": 67567648, "step": 4200 }, { "epoch": 0.2942726803085625, "grad_norm": 4.51765775680542, "learning_rate": 7.060035726795096e-05, "loss": 1.1284, "num_input_tokens_seen": 67583792, "step": 4201 }, { "epoch": 0.2943427285542918, "grad_norm": 4.541067123413086, "learning_rate": 7.059335901926445e-05, "loss": 1.1246, "num_input_tokens_seen": 67599856, "step": 4202 }, { "epoch": 0.29441277680002104, "grad_norm": 4.095570087432861, "learning_rate": 7.058636077057794e-05, "loss": 1.0087, "num_input_tokens_seen": 67616240, "step": 4203 }, { "epoch": 0.29448282504575024, "grad_norm": 4.616795539855957, "learning_rate": 7.057936252189143e-05, "loss": 1.2549, "num_input_tokens_seen": 67632496, "step": 4204 }, { "epoch": 0.2945528732914795, "grad_norm": 3.8619420528411865, "learning_rate": 7.057236427320492e-05, "loss": 0.9626, "num_input_tokens_seen": 67648880, "step": 4205 }, { "epoch": 0.29462292153720876, "grad_norm": 4.194519996643066, "learning_rate": 7.056536602451839e-05, "loss": 0.958, "num_input_tokens_seen": 67665264, "step": 4206 }, { "epoch": 0.294692969782938, "grad_norm": 4.835122585296631, "learning_rate": 7.055836777583187e-05, "loss": 1.0201, "num_input_tokens_seen": 67681648, "step": 4207 }, { "epoch": 0.2947630180286672, "grad_norm": 4.2085280418396, "learning_rate": 7.055136952714537e-05, "loss": 0.9584, "num_input_tokens_seen": 67697960, "step": 4208 }, { "epoch": 0.2948330662743965, "grad_norm": 4.439855575561523, "learning_rate": 7.054437127845884e-05, "loss": 1.0693, "num_input_tokens_seen": 67714344, "step": 4209 }, { "epoch": 0.29490311452012574, "grad_norm": 5.427484035491943, "learning_rate": 7.053737302977233e-05, "loss": 1.1782, "num_input_tokens_seen": 67730728, "step": 4210 }, { "epoch": 0.294973162765855, "grad_norm": 3.6627275943756104, "learning_rate": 7.053037478108582e-05, "loss": 0.8507, "num_input_tokens_seen": 67746704, "step": 4211 }, { "epoch": 0.2950432110115842, "grad_norm": 4.450380325317383, "learning_rate": 7.052337653239931e-05, "loss": 1.0651, "num_input_tokens_seen": 67762320, "step": 4212 }, { "epoch": 0.29511325925731346, "grad_norm": 3.6644749641418457, "learning_rate": 7.051637828371279e-05, "loss": 1.1532, "num_input_tokens_seen": 67778704, "step": 4213 }, { "epoch": 0.2951833075030427, "grad_norm": 4.331392288208008, "learning_rate": 7.050938003502627e-05, "loss": 1.0854, "num_input_tokens_seen": 67795088, "step": 4214 }, { "epoch": 0.295253355748772, "grad_norm": 4.157777786254883, "learning_rate": 7.050238178633976e-05, "loss": 1.2039, "num_input_tokens_seen": 67811472, "step": 4215 }, { "epoch": 0.2953234039945012, "grad_norm": 3.858069896697998, "learning_rate": 7.049538353765324e-05, "loss": 1.1751, "num_input_tokens_seen": 67827488, "step": 4216 }, { "epoch": 0.29539345224023045, "grad_norm": 4.279262542724609, "learning_rate": 7.048838528896673e-05, "loss": 1.0344, "num_input_tokens_seen": 67843872, "step": 4217 }, { "epoch": 0.2954635004859597, "grad_norm": 4.539918422698975, "learning_rate": 7.048138704028021e-05, "loss": 1.0244, "num_input_tokens_seen": 67860256, "step": 4218 }, { "epoch": 0.29553354873168897, "grad_norm": 3.738811492919922, "learning_rate": 7.04743887915937e-05, "loss": 1.067, "num_input_tokens_seen": 67876224, "step": 4219 }, { "epoch": 0.29560359697741817, "grad_norm": 4.634495258331299, "learning_rate": 7.046739054290718e-05, "loss": 0.9273, "num_input_tokens_seen": 67892040, "step": 4220 }, { "epoch": 0.29567364522314743, "grad_norm": 5.988262176513672, "learning_rate": 7.046039229422067e-05, "loss": 0.956, "num_input_tokens_seen": 67908424, "step": 4221 }, { "epoch": 0.2957436934688767, "grad_norm": 7.2220258712768555, "learning_rate": 7.045339404553416e-05, "loss": 1.246, "num_input_tokens_seen": 67924808, "step": 4222 }, { "epoch": 0.29581374171460595, "grad_norm": 8.866394996643066, "learning_rate": 7.044639579684764e-05, "loss": 0.9932, "num_input_tokens_seen": 67941192, "step": 4223 }, { "epoch": 0.2958837899603352, "grad_norm": 4.791526794433594, "learning_rate": 7.043939754816112e-05, "loss": 1.1966, "num_input_tokens_seen": 67957576, "step": 4224 }, { "epoch": 0.2959538382060644, "grad_norm": 3.8345704078674316, "learning_rate": 7.043239929947462e-05, "loss": 0.9754, "num_input_tokens_seen": 67973112, "step": 4225 }, { "epoch": 0.29602388645179367, "grad_norm": 5.0572099685668945, "learning_rate": 7.04254010507881e-05, "loss": 1.3761, "num_input_tokens_seen": 67989360, "step": 4226 }, { "epoch": 0.29609393469752293, "grad_norm": 4.467088222503662, "learning_rate": 7.041840280210157e-05, "loss": 0.981, "num_input_tokens_seen": 68005744, "step": 4227 }, { "epoch": 0.2961639829432522, "grad_norm": 6.415910243988037, "learning_rate": 7.041140455341506e-05, "loss": 1.1376, "num_input_tokens_seen": 68021592, "step": 4228 }, { "epoch": 0.2962340311889814, "grad_norm": 4.432079315185547, "learning_rate": 7.040440630472855e-05, "loss": 1.1264, "num_input_tokens_seen": 68037976, "step": 4229 }, { "epoch": 0.29630407943471065, "grad_norm": 4.207062721252441, "learning_rate": 7.039740805604204e-05, "loss": 1.2702, "num_input_tokens_seen": 68054328, "step": 4230 }, { "epoch": 0.2963741276804399, "grad_norm": 4.825972557067871, "learning_rate": 7.039040980735553e-05, "loss": 1.3091, "num_input_tokens_seen": 68070416, "step": 4231 }, { "epoch": 0.2964441759261692, "grad_norm": 3.917593002319336, "learning_rate": 7.038341155866901e-05, "loss": 1.1863, "num_input_tokens_seen": 68086800, "step": 4232 }, { "epoch": 0.2965142241718984, "grad_norm": 3.8865675926208496, "learning_rate": 7.037641330998249e-05, "loss": 1.2023, "num_input_tokens_seen": 68103184, "step": 4233 }, { "epoch": 0.29658427241762764, "grad_norm": 3.8321971893310547, "learning_rate": 7.036941506129596e-05, "loss": 0.9507, "num_input_tokens_seen": 68119568, "step": 4234 }, { "epoch": 0.2966543206633569, "grad_norm": 5.020960807800293, "learning_rate": 7.036241681260947e-05, "loss": 1.0438, "num_input_tokens_seen": 68135416, "step": 4235 }, { "epoch": 0.29672436890908616, "grad_norm": 3.653468608856201, "learning_rate": 7.035541856392294e-05, "loss": 1.0664, "num_input_tokens_seen": 68151800, "step": 4236 }, { "epoch": 0.29679441715481536, "grad_norm": 3.8133575916290283, "learning_rate": 7.034842031523643e-05, "loss": 1.0867, "num_input_tokens_seen": 68168184, "step": 4237 }, { "epoch": 0.2968644654005446, "grad_norm": 3.6642141342163086, "learning_rate": 7.034142206654992e-05, "loss": 0.9505, "num_input_tokens_seen": 68184080, "step": 4238 }, { "epoch": 0.2969345136462739, "grad_norm": 4.362963676452637, "learning_rate": 7.033442381786341e-05, "loss": 1.0335, "num_input_tokens_seen": 68199928, "step": 4239 }, { "epoch": 0.29700456189200314, "grad_norm": 3.6831562519073486, "learning_rate": 7.032742556917688e-05, "loss": 0.9608, "num_input_tokens_seen": 68215952, "step": 4240 }, { "epoch": 0.29707461013773234, "grad_norm": 4.906534194946289, "learning_rate": 7.032042732049037e-05, "loss": 0.9434, "num_input_tokens_seen": 68232336, "step": 4241 }, { "epoch": 0.2971446583834616, "grad_norm": 3.446749687194824, "learning_rate": 7.031342907180386e-05, "loss": 0.8306, "num_input_tokens_seen": 68247832, "step": 4242 }, { "epoch": 0.29721470662919086, "grad_norm": 4.729014873504639, "learning_rate": 7.030643082311735e-05, "loss": 1.0787, "num_input_tokens_seen": 68264216, "step": 4243 }, { "epoch": 0.2972847548749201, "grad_norm": 4.196920871734619, "learning_rate": 7.029943257443082e-05, "loss": 1.1496, "num_input_tokens_seen": 68280600, "step": 4244 }, { "epoch": 0.2973548031206493, "grad_norm": 7.193357467651367, "learning_rate": 7.029243432574431e-05, "loss": 1.0509, "num_input_tokens_seen": 68296984, "step": 4245 }, { "epoch": 0.2974248513663786, "grad_norm": 4.00344181060791, "learning_rate": 7.02854360770578e-05, "loss": 0.9025, "num_input_tokens_seen": 68312720, "step": 4246 }, { "epoch": 0.29749489961210784, "grad_norm": 4.04103422164917, "learning_rate": 7.027843782837128e-05, "loss": 1.1307, "num_input_tokens_seen": 68328608, "step": 4247 }, { "epoch": 0.2975649478578371, "grad_norm": 4.010391712188721, "learning_rate": 7.027143957968476e-05, "loss": 1.153, "num_input_tokens_seen": 68343288, "step": 4248 }, { "epoch": 0.2976349961035663, "grad_norm": 6.364760398864746, "learning_rate": 7.026444133099825e-05, "loss": 1.1629, "num_input_tokens_seen": 68359672, "step": 4249 }, { "epoch": 0.29770504434929557, "grad_norm": 5.682034969329834, "learning_rate": 7.025744308231174e-05, "loss": 1.1388, "num_input_tokens_seen": 68376056, "step": 4250 }, { "epoch": 0.2977750925950248, "grad_norm": 3.6160550117492676, "learning_rate": 7.025044483362522e-05, "loss": 1.0105, "num_input_tokens_seen": 68392440, "step": 4251 }, { "epoch": 0.2978451408407541, "grad_norm": 4.839343070983887, "learning_rate": 7.024344658493872e-05, "loss": 0.9924, "num_input_tokens_seen": 68408608, "step": 4252 }, { "epoch": 0.2979151890864833, "grad_norm": 5.255819320678711, "learning_rate": 7.02364483362522e-05, "loss": 1.1425, "num_input_tokens_seen": 68424944, "step": 4253 }, { "epoch": 0.29798523733221255, "grad_norm": 3.7549142837524414, "learning_rate": 7.022945008756567e-05, "loss": 0.8801, "num_input_tokens_seen": 68441328, "step": 4254 }, { "epoch": 0.2980552855779418, "grad_norm": 5.159091472625732, "learning_rate": 7.022245183887916e-05, "loss": 1.0075, "num_input_tokens_seen": 68457712, "step": 4255 }, { "epoch": 0.29812533382367107, "grad_norm": 3.8031342029571533, "learning_rate": 7.021545359019265e-05, "loss": 0.9975, "num_input_tokens_seen": 68474072, "step": 4256 }, { "epoch": 0.29819538206940027, "grad_norm": 6.039318084716797, "learning_rate": 7.020845534150613e-05, "loss": 1.0791, "num_input_tokens_seen": 68490456, "step": 4257 }, { "epoch": 0.29826543031512953, "grad_norm": 3.9376237392425537, "learning_rate": 7.020145709281962e-05, "loss": 1.0753, "num_input_tokens_seen": 68506760, "step": 4258 }, { "epoch": 0.2983354785608588, "grad_norm": 4.599661827087402, "learning_rate": 7.019445884413311e-05, "loss": 0.9722, "num_input_tokens_seen": 68523144, "step": 4259 }, { "epoch": 0.29840552680658805, "grad_norm": 3.743640661239624, "learning_rate": 7.018746059544659e-05, "loss": 1.157, "num_input_tokens_seen": 68539448, "step": 4260 }, { "epoch": 0.2984755750523173, "grad_norm": 6.111955642700195, "learning_rate": 7.018046234676006e-05, "loss": 1.2148, "num_input_tokens_seen": 68555832, "step": 4261 }, { "epoch": 0.2985456232980465, "grad_norm": 4.297199249267578, "learning_rate": 7.017346409807356e-05, "loss": 0.9796, "num_input_tokens_seen": 68572216, "step": 4262 }, { "epoch": 0.2986156715437758, "grad_norm": 4.126640319824219, "learning_rate": 7.016646584938705e-05, "loss": 1.0781, "num_input_tokens_seen": 68588600, "step": 4263 }, { "epoch": 0.29868571978950503, "grad_norm": 3.8142640590667725, "learning_rate": 7.015946760070053e-05, "loss": 1.1943, "num_input_tokens_seen": 68604336, "step": 4264 }, { "epoch": 0.2987557680352343, "grad_norm": 3.9500539302825928, "learning_rate": 7.015246935201402e-05, "loss": 1.1179, "num_input_tokens_seen": 68620056, "step": 4265 }, { "epoch": 0.2988258162809635, "grad_norm": 4.431976318359375, "learning_rate": 7.01454711033275e-05, "loss": 1.3419, "num_input_tokens_seen": 68636328, "step": 4266 }, { "epoch": 0.29889586452669276, "grad_norm": 5.619480609893799, "learning_rate": 7.013847285464098e-05, "loss": 1.099, "num_input_tokens_seen": 68651984, "step": 4267 }, { "epoch": 0.298965912772422, "grad_norm": 3.8473827838897705, "learning_rate": 7.013147460595447e-05, "loss": 1.1273, "num_input_tokens_seen": 68668176, "step": 4268 }, { "epoch": 0.2990359610181513, "grad_norm": 5.942142486572266, "learning_rate": 7.012447635726796e-05, "loss": 1.1058, "num_input_tokens_seen": 68684560, "step": 4269 }, { "epoch": 0.2991060092638805, "grad_norm": 6.194666862487793, "learning_rate": 7.011747810858145e-05, "loss": 0.9782, "num_input_tokens_seen": 68699816, "step": 4270 }, { "epoch": 0.29917605750960974, "grad_norm": 4.336294651031494, "learning_rate": 7.011047985989492e-05, "loss": 1.0038, "num_input_tokens_seen": 68716200, "step": 4271 }, { "epoch": 0.299246105755339, "grad_norm": 4.277907371520996, "learning_rate": 7.010348161120841e-05, "loss": 1.0151, "num_input_tokens_seen": 68732584, "step": 4272 }, { "epoch": 0.29931615400106826, "grad_norm": 5.045118808746338, "learning_rate": 7.00964833625219e-05, "loss": 1.0992, "num_input_tokens_seen": 68748840, "step": 4273 }, { "epoch": 0.29938620224679746, "grad_norm": 4.3978400230407715, "learning_rate": 7.008948511383537e-05, "loss": 1.1796, "num_input_tokens_seen": 68765224, "step": 4274 }, { "epoch": 0.2994562504925267, "grad_norm": 5.052615165710449, "learning_rate": 7.008248686514886e-05, "loss": 1.0557, "num_input_tokens_seen": 68780808, "step": 4275 }, { "epoch": 0.299526298738256, "grad_norm": 6.902999401092529, "learning_rate": 7.007548861646235e-05, "loss": 1.0952, "num_input_tokens_seen": 68797064, "step": 4276 }, { "epoch": 0.29959634698398524, "grad_norm": 5.947190761566162, "learning_rate": 7.006849036777584e-05, "loss": 1.1163, "num_input_tokens_seen": 68812904, "step": 4277 }, { "epoch": 0.29966639522971444, "grad_norm": 5.443974018096924, "learning_rate": 7.006149211908931e-05, "loss": 1.146, "num_input_tokens_seen": 68828736, "step": 4278 }, { "epoch": 0.2997364434754437, "grad_norm": 3.9849112033843994, "learning_rate": 7.005449387040282e-05, "loss": 0.9636, "num_input_tokens_seen": 68843928, "step": 4279 }, { "epoch": 0.29980649172117296, "grad_norm": 5.787483215332031, "learning_rate": 7.004749562171629e-05, "loss": 1.194, "num_input_tokens_seen": 68860312, "step": 4280 }, { "epoch": 0.2998765399669022, "grad_norm": 3.8437387943267822, "learning_rate": 7.004049737302977e-05, "loss": 0.9214, "num_input_tokens_seen": 68876696, "step": 4281 }, { "epoch": 0.2999465882126314, "grad_norm": 3.94879150390625, "learning_rate": 7.003349912434325e-05, "loss": 1.1403, "num_input_tokens_seen": 68893080, "step": 4282 }, { "epoch": 0.3000166364583607, "grad_norm": 4.746649265289307, "learning_rate": 7.002650087565676e-05, "loss": 1.077, "num_input_tokens_seen": 68909464, "step": 4283 }, { "epoch": 0.30008668470408995, "grad_norm": 4.1024861335754395, "learning_rate": 7.001950262697023e-05, "loss": 1.0592, "num_input_tokens_seen": 68925352, "step": 4284 }, { "epoch": 0.3001567329498192, "grad_norm": 4.5073699951171875, "learning_rate": 7.001250437828372e-05, "loss": 1.2092, "num_input_tokens_seen": 68941736, "step": 4285 }, { "epoch": 0.3002267811955484, "grad_norm": 4.947534561157227, "learning_rate": 7.000550612959721e-05, "loss": 1.1389, "num_input_tokens_seen": 68958120, "step": 4286 }, { "epoch": 0.30029682944127767, "grad_norm": 3.8399429321289062, "learning_rate": 6.999850788091068e-05, "loss": 1.1268, "num_input_tokens_seen": 68974232, "step": 4287 }, { "epoch": 0.30036687768700693, "grad_norm": 3.9180405139923096, "learning_rate": 6.999150963222416e-05, "loss": 1.1666, "num_input_tokens_seen": 68990616, "step": 4288 }, { "epoch": 0.3004369259327362, "grad_norm": 3.9542794227600098, "learning_rate": 6.998451138353766e-05, "loss": 1.1474, "num_input_tokens_seen": 69006952, "step": 4289 }, { "epoch": 0.3005069741784654, "grad_norm": 3.5275325775146484, "learning_rate": 6.997751313485115e-05, "loss": 1.1239, "num_input_tokens_seen": 69023336, "step": 4290 }, { "epoch": 0.30057702242419465, "grad_norm": 3.9485349655151367, "learning_rate": 6.997051488616462e-05, "loss": 0.9736, "num_input_tokens_seen": 69038392, "step": 4291 }, { "epoch": 0.3006470706699239, "grad_norm": 3.4944114685058594, "learning_rate": 6.996351663747811e-05, "loss": 0.7473, "num_input_tokens_seen": 69054160, "step": 4292 }, { "epoch": 0.30071711891565317, "grad_norm": 3.387148380279541, "learning_rate": 6.99565183887916e-05, "loss": 0.9142, "num_input_tokens_seen": 69070056, "step": 4293 }, { "epoch": 0.30078716716138243, "grad_norm": 3.9591586589813232, "learning_rate": 6.994952014010508e-05, "loss": 1.133, "num_input_tokens_seen": 69086240, "step": 4294 }, { "epoch": 0.30085721540711163, "grad_norm": 8.32682991027832, "learning_rate": 6.994252189141857e-05, "loss": 1.1697, "num_input_tokens_seen": 69102408, "step": 4295 }, { "epoch": 0.3009272636528409, "grad_norm": 3.5885214805603027, "learning_rate": 6.993552364273205e-05, "loss": 1.0626, "num_input_tokens_seen": 69118376, "step": 4296 }, { "epoch": 0.30099731189857015, "grad_norm": 4.784765243530273, "learning_rate": 6.992852539404554e-05, "loss": 0.9771, "num_input_tokens_seen": 69133664, "step": 4297 }, { "epoch": 0.3010673601442994, "grad_norm": 6.456319808959961, "learning_rate": 6.992152714535902e-05, "loss": 1.3836, "num_input_tokens_seen": 69148224, "step": 4298 }, { "epoch": 0.3011374083900286, "grad_norm": 5.820954322814941, "learning_rate": 6.99145288966725e-05, "loss": 0.9987, "num_input_tokens_seen": 69164440, "step": 4299 }, { "epoch": 0.3012074566357579, "grad_norm": 6.690483570098877, "learning_rate": 6.9907530647986e-05, "loss": 1.1583, "num_input_tokens_seen": 69180824, "step": 4300 }, { "epoch": 0.30127750488148713, "grad_norm": 3.8018131256103516, "learning_rate": 6.990053239929947e-05, "loss": 1.1643, "num_input_tokens_seen": 69197016, "step": 4301 }, { "epoch": 0.3013475531272164, "grad_norm": 4.574918746948242, "learning_rate": 6.989353415061296e-05, "loss": 1.168, "num_input_tokens_seen": 69213400, "step": 4302 }, { "epoch": 0.3014176013729456, "grad_norm": 3.3843026161193848, "learning_rate": 6.988653590192646e-05, "loss": 0.9762, "num_input_tokens_seen": 69229784, "step": 4303 }, { "epoch": 0.30148764961867486, "grad_norm": 6.179981708526611, "learning_rate": 6.987953765323994e-05, "loss": 1.173, "num_input_tokens_seen": 69246168, "step": 4304 }, { "epoch": 0.3015576978644041, "grad_norm": 4.759994029998779, "learning_rate": 6.987253940455341e-05, "loss": 0.947, "num_input_tokens_seen": 69262512, "step": 4305 }, { "epoch": 0.3016277461101334, "grad_norm": 3.719902992248535, "learning_rate": 6.986554115586691e-05, "loss": 1.1882, "num_input_tokens_seen": 69278496, "step": 4306 }, { "epoch": 0.3016977943558626, "grad_norm": 3.6757240295410156, "learning_rate": 6.985854290718039e-05, "loss": 1.1506, "num_input_tokens_seen": 69294880, "step": 4307 }, { "epoch": 0.30176784260159184, "grad_norm": 4.316056251525879, "learning_rate": 6.985154465849386e-05, "loss": 0.8921, "num_input_tokens_seen": 69311264, "step": 4308 }, { "epoch": 0.3018378908473211, "grad_norm": 5.248560428619385, "learning_rate": 6.984454640980736e-05, "loss": 1.1127, "num_input_tokens_seen": 69327648, "step": 4309 }, { "epoch": 0.30190793909305036, "grad_norm": 3.601381540298462, "learning_rate": 6.983754816112085e-05, "loss": 1.0002, "num_input_tokens_seen": 69344032, "step": 4310 }, { "epoch": 0.30197798733877956, "grad_norm": 4.555902004241943, "learning_rate": 6.983054991243433e-05, "loss": 1.0674, "num_input_tokens_seen": 69360416, "step": 4311 }, { "epoch": 0.3020480355845088, "grad_norm": 4.615258693695068, "learning_rate": 6.982355166374782e-05, "loss": 1.1759, "num_input_tokens_seen": 69375728, "step": 4312 }, { "epoch": 0.3021180838302381, "grad_norm": 5.953250408172607, "learning_rate": 6.98165534150613e-05, "loss": 1.1161, "num_input_tokens_seen": 69391768, "step": 4313 }, { "epoch": 0.30218813207596734, "grad_norm": 4.049426555633545, "learning_rate": 6.980955516637478e-05, "loss": 1.0466, "num_input_tokens_seen": 69407328, "step": 4314 }, { "epoch": 0.30225818032169655, "grad_norm": 4.012260437011719, "learning_rate": 6.980255691768827e-05, "loss": 1.338, "num_input_tokens_seen": 69423712, "step": 4315 }, { "epoch": 0.3023282285674258, "grad_norm": 3.8932242393493652, "learning_rate": 6.979555866900176e-05, "loss": 1.0836, "num_input_tokens_seen": 69440096, "step": 4316 }, { "epoch": 0.30239827681315506, "grad_norm": 7.58411169052124, "learning_rate": 6.978856042031525e-05, "loss": 1.0849, "num_input_tokens_seen": 69456088, "step": 4317 }, { "epoch": 0.3024683250588843, "grad_norm": 5.275664806365967, "learning_rate": 6.978156217162872e-05, "loss": 0.9773, "num_input_tokens_seen": 69471768, "step": 4318 }, { "epoch": 0.30253837330461353, "grad_norm": 3.6384737491607666, "learning_rate": 6.977456392294221e-05, "loss": 1.1168, "num_input_tokens_seen": 69488152, "step": 4319 }, { "epoch": 0.3026084215503428, "grad_norm": 5.059805870056152, "learning_rate": 6.97675656742557e-05, "loss": 1.1221, "num_input_tokens_seen": 69504536, "step": 4320 }, { "epoch": 0.30267846979607205, "grad_norm": 5.672605037689209, "learning_rate": 6.976056742556917e-05, "loss": 0.8506, "num_input_tokens_seen": 69520920, "step": 4321 }, { "epoch": 0.3027485180418013, "grad_norm": 3.5066421031951904, "learning_rate": 6.975356917688266e-05, "loss": 1.1437, "num_input_tokens_seen": 69537304, "step": 4322 }, { "epoch": 0.3028185662875305, "grad_norm": 4.403011798858643, "learning_rate": 6.974657092819616e-05, "loss": 1.0946, "num_input_tokens_seen": 69553688, "step": 4323 }, { "epoch": 0.30288861453325977, "grad_norm": 3.87226939201355, "learning_rate": 6.973957267950964e-05, "loss": 0.9997, "num_input_tokens_seen": 69570072, "step": 4324 }, { "epoch": 0.30295866277898903, "grad_norm": 4.516434192657471, "learning_rate": 6.973257443082311e-05, "loss": 1.0816, "num_input_tokens_seen": 69585056, "step": 4325 }, { "epoch": 0.3030287110247183, "grad_norm": 4.07093620300293, "learning_rate": 6.97255761821366e-05, "loss": 1.1811, "num_input_tokens_seen": 69601440, "step": 4326 }, { "epoch": 0.3030987592704475, "grad_norm": 3.663632392883301, "learning_rate": 6.971857793345009e-05, "loss": 1.0103, "num_input_tokens_seen": 69617824, "step": 4327 }, { "epoch": 0.30316880751617675, "grad_norm": 3.791191577911377, "learning_rate": 6.971157968476357e-05, "loss": 1.1402, "num_input_tokens_seen": 69634208, "step": 4328 }, { "epoch": 0.303238855761906, "grad_norm": 4.766335964202881, "learning_rate": 6.970458143607707e-05, "loss": 1.057, "num_input_tokens_seen": 69650592, "step": 4329 }, { "epoch": 0.30330890400763527, "grad_norm": 3.6603240966796875, "learning_rate": 6.969758318739056e-05, "loss": 1.0052, "num_input_tokens_seen": 69666976, "step": 4330 }, { "epoch": 0.30337895225336453, "grad_norm": 4.231273174285889, "learning_rate": 6.969058493870403e-05, "loss": 1.1661, "num_input_tokens_seen": 69683360, "step": 4331 }, { "epoch": 0.30344900049909374, "grad_norm": 3.7526698112487793, "learning_rate": 6.968358669001751e-05, "loss": 1.0783, "num_input_tokens_seen": 69699744, "step": 4332 }, { "epoch": 0.303519048744823, "grad_norm": 3.8541617393493652, "learning_rate": 6.967658844133101e-05, "loss": 1.0485, "num_input_tokens_seen": 69715960, "step": 4333 }, { "epoch": 0.30358909699055225, "grad_norm": 3.914926767349243, "learning_rate": 6.966959019264448e-05, "loss": 0.9525, "num_input_tokens_seen": 69732344, "step": 4334 }, { "epoch": 0.3036591452362815, "grad_norm": 4.39329719543457, "learning_rate": 6.966259194395797e-05, "loss": 1.0234, "num_input_tokens_seen": 69748728, "step": 4335 }, { "epoch": 0.3037291934820107, "grad_norm": 3.914006233215332, "learning_rate": 6.965559369527146e-05, "loss": 1.1397, "num_input_tokens_seen": 69765112, "step": 4336 }, { "epoch": 0.30379924172774, "grad_norm": 4.536770343780518, "learning_rate": 6.964859544658495e-05, "loss": 1.1825, "num_input_tokens_seen": 69781496, "step": 4337 }, { "epoch": 0.30386928997346924, "grad_norm": 4.147655010223389, "learning_rate": 6.964159719789843e-05, "loss": 0.7535, "num_input_tokens_seen": 69797880, "step": 4338 }, { "epoch": 0.3039393382191985, "grad_norm": 4.224967956542969, "learning_rate": 6.963459894921191e-05, "loss": 1.1842, "num_input_tokens_seen": 69814264, "step": 4339 }, { "epoch": 0.3040093864649277, "grad_norm": 4.415369033813477, "learning_rate": 6.96276007005254e-05, "loss": 1.1396, "num_input_tokens_seen": 69830648, "step": 4340 }, { "epoch": 0.30407943471065696, "grad_norm": 3.5865182876586914, "learning_rate": 6.962060245183888e-05, "loss": 1.1649, "num_input_tokens_seen": 69846608, "step": 4341 }, { "epoch": 0.3041494829563862, "grad_norm": 6.16670560836792, "learning_rate": 6.961360420315237e-05, "loss": 1.0684, "num_input_tokens_seen": 69862232, "step": 4342 }, { "epoch": 0.3042195312021155, "grad_norm": 7.907288074493408, "learning_rate": 6.960660595446586e-05, "loss": 0.9661, "num_input_tokens_seen": 69878616, "step": 4343 }, { "epoch": 0.3042895794478447, "grad_norm": 3.7910618782043457, "learning_rate": 6.959960770577934e-05, "loss": 1.0852, "num_input_tokens_seen": 69895000, "step": 4344 }, { "epoch": 0.30435962769357394, "grad_norm": 3.4832661151885986, "learning_rate": 6.959260945709282e-05, "loss": 1.0312, "num_input_tokens_seen": 69911384, "step": 4345 }, { "epoch": 0.3044296759393032, "grad_norm": 3.563248872756958, "learning_rate": 6.958561120840631e-05, "loss": 1.1249, "num_input_tokens_seen": 69927768, "step": 4346 }, { "epoch": 0.30449972418503246, "grad_norm": 4.838014602661133, "learning_rate": 6.95786129597198e-05, "loss": 1.2449, "num_input_tokens_seen": 69944152, "step": 4347 }, { "epoch": 0.30456977243076166, "grad_norm": 3.6796975135803223, "learning_rate": 6.957161471103327e-05, "loss": 0.8156, "num_input_tokens_seen": 69959968, "step": 4348 }, { "epoch": 0.3046398206764909, "grad_norm": 4.028040885925293, "learning_rate": 6.956461646234677e-05, "loss": 1.0653, "num_input_tokens_seen": 69975960, "step": 4349 }, { "epoch": 0.3047098689222202, "grad_norm": 4.073189735412598, "learning_rate": 6.955761821366026e-05, "loss": 1.0004, "num_input_tokens_seen": 69991656, "step": 4350 }, { "epoch": 0.30477991716794944, "grad_norm": 5.757152080535889, "learning_rate": 6.955061996497374e-05, "loss": 1.2074, "num_input_tokens_seen": 70008040, "step": 4351 }, { "epoch": 0.30484996541367865, "grad_norm": 5.49181604385376, "learning_rate": 6.954362171628721e-05, "loss": 1.0235, "num_input_tokens_seen": 70024424, "step": 4352 }, { "epoch": 0.3049200136594079, "grad_norm": 5.573401927947998, "learning_rate": 6.95366234676007e-05, "loss": 0.9787, "num_input_tokens_seen": 70040808, "step": 4353 }, { "epoch": 0.30499006190513717, "grad_norm": 3.491823673248291, "learning_rate": 6.952962521891419e-05, "loss": 1.0254, "num_input_tokens_seen": 70057192, "step": 4354 }, { "epoch": 0.3050601101508664, "grad_norm": 6.05043888092041, "learning_rate": 6.952262697022768e-05, "loss": 1.0709, "num_input_tokens_seen": 70073576, "step": 4355 }, { "epoch": 0.30513015839659563, "grad_norm": 3.848910331726074, "learning_rate": 6.951562872154117e-05, "loss": 1.0267, "num_input_tokens_seen": 70089960, "step": 4356 }, { "epoch": 0.3052002066423249, "grad_norm": 4.134339332580566, "learning_rate": 6.950863047285465e-05, "loss": 1.2447, "num_input_tokens_seen": 70106344, "step": 4357 }, { "epoch": 0.30527025488805415, "grad_norm": 3.6560862064361572, "learning_rate": 6.950163222416813e-05, "loss": 1.1018, "num_input_tokens_seen": 70122056, "step": 4358 }, { "epoch": 0.3053403031337834, "grad_norm": 3.813434600830078, "learning_rate": 6.94946339754816e-05, "loss": 1.0149, "num_input_tokens_seen": 70138408, "step": 4359 }, { "epoch": 0.3054103513795126, "grad_norm": 5.002225875854492, "learning_rate": 6.948763572679511e-05, "loss": 1.1563, "num_input_tokens_seen": 70154792, "step": 4360 }, { "epoch": 0.30548039962524187, "grad_norm": 3.8483340740203857, "learning_rate": 6.948063747810858e-05, "loss": 0.9643, "num_input_tokens_seen": 70171176, "step": 4361 }, { "epoch": 0.30555044787097113, "grad_norm": 5.18534517288208, "learning_rate": 6.947363922942207e-05, "loss": 1.1841, "num_input_tokens_seen": 70187336, "step": 4362 }, { "epoch": 0.3056204961167004, "grad_norm": 3.92976713180542, "learning_rate": 6.946664098073556e-05, "loss": 1.0051, "num_input_tokens_seen": 70203720, "step": 4363 }, { "epoch": 0.30569054436242965, "grad_norm": 3.4534151554107666, "learning_rate": 6.945964273204905e-05, "loss": 0.9356, "num_input_tokens_seen": 70220104, "step": 4364 }, { "epoch": 0.30576059260815885, "grad_norm": 3.7937867641448975, "learning_rate": 6.945264448336252e-05, "loss": 1.1694, "num_input_tokens_seen": 70236488, "step": 4365 }, { "epoch": 0.3058306408538881, "grad_norm": 3.9063713550567627, "learning_rate": 6.944564623467601e-05, "loss": 1.0969, "num_input_tokens_seen": 70252872, "step": 4366 }, { "epoch": 0.3059006890996174, "grad_norm": 3.9363296031951904, "learning_rate": 6.94386479859895e-05, "loss": 0.9776, "num_input_tokens_seen": 70269256, "step": 4367 }, { "epoch": 0.30597073734534663, "grad_norm": 4.722838401794434, "learning_rate": 6.943164973730297e-05, "loss": 0.9503, "num_input_tokens_seen": 70285640, "step": 4368 }, { "epoch": 0.30604078559107584, "grad_norm": 4.053229808807373, "learning_rate": 6.942465148861646e-05, "loss": 1.2669, "num_input_tokens_seen": 70301688, "step": 4369 }, { "epoch": 0.3061108338368051, "grad_norm": 3.71604323387146, "learning_rate": 6.941765323992995e-05, "loss": 1.0619, "num_input_tokens_seen": 70318072, "step": 4370 }, { "epoch": 0.30618088208253436, "grad_norm": 3.8376901149749756, "learning_rate": 6.941065499124344e-05, "loss": 1.0007, "num_input_tokens_seen": 70334456, "step": 4371 }, { "epoch": 0.3062509303282636, "grad_norm": 4.157979488372803, "learning_rate": 6.940365674255692e-05, "loss": 1.2379, "num_input_tokens_seen": 70350424, "step": 4372 }, { "epoch": 0.3063209785739928, "grad_norm": 4.173924922943115, "learning_rate": 6.93966584938704e-05, "loss": 1.1111, "num_input_tokens_seen": 70366808, "step": 4373 }, { "epoch": 0.3063910268197221, "grad_norm": 4.114030838012695, "learning_rate": 6.938966024518389e-05, "loss": 1.0932, "num_input_tokens_seen": 70383000, "step": 4374 }, { "epoch": 0.30646107506545134, "grad_norm": 4.31168794631958, "learning_rate": 6.938266199649738e-05, "loss": 1.243, "num_input_tokens_seen": 70399016, "step": 4375 }, { "epoch": 0.3065311233111806, "grad_norm": 6.187852382659912, "learning_rate": 6.937566374781087e-05, "loss": 0.9274, "num_input_tokens_seen": 70413016, "step": 4376 }, { "epoch": 0.3066011715569098, "grad_norm": 4.700244903564453, "learning_rate": 6.936866549912436e-05, "loss": 1.0809, "num_input_tokens_seen": 70427552, "step": 4377 }, { "epoch": 0.30667121980263906, "grad_norm": 4.941024303436279, "learning_rate": 6.936166725043783e-05, "loss": 1.1653, "num_input_tokens_seen": 70443936, "step": 4378 }, { "epoch": 0.3067412680483683, "grad_norm": 3.8171792030334473, "learning_rate": 6.935466900175131e-05, "loss": 1.1128, "num_input_tokens_seen": 70460320, "step": 4379 }, { "epoch": 0.3068113162940976, "grad_norm": 5.006760597229004, "learning_rate": 6.93476707530648e-05, "loss": 1.1674, "num_input_tokens_seen": 70476704, "step": 4380 }, { "epoch": 0.3068813645398268, "grad_norm": 3.8567628860473633, "learning_rate": 6.934067250437829e-05, "loss": 1.1478, "num_input_tokens_seen": 70493016, "step": 4381 }, { "epoch": 0.30695141278555604, "grad_norm": 3.7168126106262207, "learning_rate": 6.933367425569177e-05, "loss": 0.9496, "num_input_tokens_seen": 70509400, "step": 4382 }, { "epoch": 0.3070214610312853, "grad_norm": 4.72265625, "learning_rate": 6.932667600700526e-05, "loss": 1.0319, "num_input_tokens_seen": 70525592, "step": 4383 }, { "epoch": 0.30709150927701456, "grad_norm": 4.502997875213623, "learning_rate": 6.931967775831875e-05, "loss": 1.0556, "num_input_tokens_seen": 70541976, "step": 4384 }, { "epoch": 0.30716155752274377, "grad_norm": 4.090621471405029, "learning_rate": 6.931267950963223e-05, "loss": 1.1441, "num_input_tokens_seen": 70558360, "step": 4385 }, { "epoch": 0.307231605768473, "grad_norm": 3.501185655593872, "learning_rate": 6.93056812609457e-05, "loss": 0.9005, "num_input_tokens_seen": 70574640, "step": 4386 }, { "epoch": 0.3073016540142023, "grad_norm": 3.937352180480957, "learning_rate": 6.92986830122592e-05, "loss": 1.1003, "num_input_tokens_seen": 70591024, "step": 4387 }, { "epoch": 0.30737170225993155, "grad_norm": 8.832700729370117, "learning_rate": 6.929168476357268e-05, "loss": 0.9773, "num_input_tokens_seen": 70606720, "step": 4388 }, { "epoch": 0.30744175050566075, "grad_norm": 3.8081719875335693, "learning_rate": 6.928468651488617e-05, "loss": 0.8348, "num_input_tokens_seen": 70622864, "step": 4389 }, { "epoch": 0.30751179875139, "grad_norm": 3.836366653442383, "learning_rate": 6.927768826619966e-05, "loss": 1.0858, "num_input_tokens_seen": 70639248, "step": 4390 }, { "epoch": 0.30758184699711927, "grad_norm": 5.150767803192139, "learning_rate": 6.927069001751314e-05, "loss": 1.0395, "num_input_tokens_seen": 70655128, "step": 4391 }, { "epoch": 0.3076518952428485, "grad_norm": 5.0762434005737305, "learning_rate": 6.926369176882662e-05, "loss": 0.9503, "num_input_tokens_seen": 70671512, "step": 4392 }, { "epoch": 0.30772194348857773, "grad_norm": 3.7713098526000977, "learning_rate": 6.925669352014011e-05, "loss": 0.9273, "num_input_tokens_seen": 70687896, "step": 4393 }, { "epoch": 0.307791991734307, "grad_norm": 5.246247291564941, "learning_rate": 6.92496952714536e-05, "loss": 1.1295, "num_input_tokens_seen": 70704280, "step": 4394 }, { "epoch": 0.30786203998003625, "grad_norm": 3.5723984241485596, "learning_rate": 6.924269702276707e-05, "loss": 0.9063, "num_input_tokens_seen": 70720664, "step": 4395 }, { "epoch": 0.3079320882257655, "grad_norm": 3.5165982246398926, "learning_rate": 6.923569877408056e-05, "loss": 0.963, "num_input_tokens_seen": 70736968, "step": 4396 }, { "epoch": 0.30800213647149477, "grad_norm": 4.140204429626465, "learning_rate": 6.922870052539405e-05, "loss": 0.9557, "num_input_tokens_seen": 70753352, "step": 4397 }, { "epoch": 0.308072184717224, "grad_norm": 7.949122428894043, "learning_rate": 6.922170227670754e-05, "loss": 0.9429, "num_input_tokens_seen": 70769720, "step": 4398 }, { "epoch": 0.30814223296295323, "grad_norm": 6.45367431640625, "learning_rate": 6.921470402802101e-05, "loss": 1.2214, "num_input_tokens_seen": 70784984, "step": 4399 }, { "epoch": 0.3082122812086825, "grad_norm": 4.139477252960205, "learning_rate": 6.92077057793345e-05, "loss": 1.2376, "num_input_tokens_seen": 70800376, "step": 4400 }, { "epoch": 0.3082122812086825, "eval_loss": 1.1308872699737549, "eval_runtime": 0.2076, "eval_samples_per_second": 4.818, "eval_steps_per_second": 4.818, "num_input_tokens_seen": 70800376, "step": 4400 }, { "epoch": 0.30828232945441175, "grad_norm": 4.095129013061523, "learning_rate": 6.920070753064799e-05, "loss": 1.1774, "num_input_tokens_seen": 70816560, "step": 4401 }, { "epoch": 0.30835237770014096, "grad_norm": 3.6730854511260986, "learning_rate": 6.919370928196148e-05, "loss": 1.0242, "num_input_tokens_seen": 70831848, "step": 4402 }, { "epoch": 0.3084224259458702, "grad_norm": 4.013517379760742, "learning_rate": 6.918671103327497e-05, "loss": 0.9785, "num_input_tokens_seen": 70847408, "step": 4403 }, { "epoch": 0.3084924741915995, "grad_norm": 5.617120742797852, "learning_rate": 6.917971278458846e-05, "loss": 0.9883, "num_input_tokens_seen": 70862080, "step": 4404 }, { "epoch": 0.30856252243732873, "grad_norm": 3.5201385021209717, "learning_rate": 6.917271453590193e-05, "loss": 0.9537, "num_input_tokens_seen": 70878464, "step": 4405 }, { "epoch": 0.30863257068305794, "grad_norm": 5.116230010986328, "learning_rate": 6.91657162872154e-05, "loss": 1.0934, "num_input_tokens_seen": 70894848, "step": 4406 }, { "epoch": 0.3087026189287872, "grad_norm": 3.4510743618011475, "learning_rate": 6.91587180385289e-05, "loss": 1.0857, "num_input_tokens_seen": 70911232, "step": 4407 }, { "epoch": 0.30877266717451646, "grad_norm": 4.719654083251953, "learning_rate": 6.915171978984238e-05, "loss": 1.1565, "num_input_tokens_seen": 70927616, "step": 4408 }, { "epoch": 0.3088427154202457, "grad_norm": 4.52898645401001, "learning_rate": 6.914472154115587e-05, "loss": 0.9418, "num_input_tokens_seen": 70944000, "step": 4409 }, { "epoch": 0.3089127636659749, "grad_norm": 4.237354755401611, "learning_rate": 6.913772329246936e-05, "loss": 1.1614, "num_input_tokens_seen": 70960384, "step": 4410 }, { "epoch": 0.3089828119117042, "grad_norm": 5.489138126373291, "learning_rate": 6.913072504378285e-05, "loss": 0.9871, "num_input_tokens_seen": 70976768, "step": 4411 }, { "epoch": 0.30905286015743344, "grad_norm": 5.482370853424072, "learning_rate": 6.912372679509632e-05, "loss": 0.9962, "num_input_tokens_seen": 70992496, "step": 4412 }, { "epoch": 0.3091229084031627, "grad_norm": 3.8174126148223877, "learning_rate": 6.91167285464098e-05, "loss": 1.0605, "num_input_tokens_seen": 71008880, "step": 4413 }, { "epoch": 0.3091929566488919, "grad_norm": 4.064924716949463, "learning_rate": 6.91097302977233e-05, "loss": 0.8307, "num_input_tokens_seen": 71023912, "step": 4414 }, { "epoch": 0.30926300489462116, "grad_norm": 3.955643653869629, "learning_rate": 6.910273204903678e-05, "loss": 1.2599, "num_input_tokens_seen": 71040296, "step": 4415 }, { "epoch": 0.3093330531403504, "grad_norm": 3.771191358566284, "learning_rate": 6.909573380035026e-05, "loss": 1.0682, "num_input_tokens_seen": 71056680, "step": 4416 }, { "epoch": 0.3094031013860797, "grad_norm": 5.4105963706970215, "learning_rate": 6.908873555166375e-05, "loss": 1.1571, "num_input_tokens_seen": 71072640, "step": 4417 }, { "epoch": 0.3094731496318089, "grad_norm": 4.549078464508057, "learning_rate": 6.908173730297724e-05, "loss": 1.0837, "num_input_tokens_seen": 71087336, "step": 4418 }, { "epoch": 0.30954319787753815, "grad_norm": 3.998065233230591, "learning_rate": 6.907473905429072e-05, "loss": 1.2753, "num_input_tokens_seen": 71102952, "step": 4419 }, { "epoch": 0.3096132461232674, "grad_norm": 3.834508180618286, "learning_rate": 6.90677408056042e-05, "loss": 0.8886, "num_input_tokens_seen": 71119328, "step": 4420 }, { "epoch": 0.30968329436899666, "grad_norm": 3.932875156402588, "learning_rate": 6.90607425569177e-05, "loss": 1.1568, "num_input_tokens_seen": 71134968, "step": 4421 }, { "epoch": 0.30975334261472587, "grad_norm": 3.712484359741211, "learning_rate": 6.905374430823118e-05, "loss": 1.0686, "num_input_tokens_seen": 71150976, "step": 4422 }, { "epoch": 0.30982339086045513, "grad_norm": 3.6733663082122803, "learning_rate": 6.904674605954466e-05, "loss": 0.8884, "num_input_tokens_seen": 71167232, "step": 4423 }, { "epoch": 0.3098934391061844, "grad_norm": 3.9877066612243652, "learning_rate": 6.903974781085815e-05, "loss": 1.0473, "num_input_tokens_seen": 71182704, "step": 4424 }, { "epoch": 0.30996348735191365, "grad_norm": 3.908582925796509, "learning_rate": 6.903274956217163e-05, "loss": 0.9944, "num_input_tokens_seen": 71198920, "step": 4425 }, { "epoch": 0.31003353559764285, "grad_norm": 4.310460090637207, "learning_rate": 6.902575131348511e-05, "loss": 0.9651, "num_input_tokens_seen": 71215256, "step": 4426 }, { "epoch": 0.3101035838433721, "grad_norm": 3.8914272785186768, "learning_rate": 6.90187530647986e-05, "loss": 0.9858, "num_input_tokens_seen": 71231432, "step": 4427 }, { "epoch": 0.31017363208910137, "grad_norm": 5.774794578552246, "learning_rate": 6.901175481611209e-05, "loss": 0.9792, "num_input_tokens_seen": 71246944, "step": 4428 }, { "epoch": 0.31024368033483063, "grad_norm": 6.370543956756592, "learning_rate": 6.900475656742558e-05, "loss": 1.0283, "num_input_tokens_seen": 71263120, "step": 4429 }, { "epoch": 0.31031372858055983, "grad_norm": 3.8334455490112305, "learning_rate": 6.899775831873906e-05, "loss": 1.0173, "num_input_tokens_seen": 71279040, "step": 4430 }, { "epoch": 0.3103837768262891, "grad_norm": 3.624006509780884, "learning_rate": 6.899076007005255e-05, "loss": 0.9908, "num_input_tokens_seen": 71295424, "step": 4431 }, { "epoch": 0.31045382507201835, "grad_norm": 3.8340702056884766, "learning_rate": 6.898376182136603e-05, "loss": 1.1257, "num_input_tokens_seen": 71311808, "step": 4432 }, { "epoch": 0.3105238733177476, "grad_norm": 4.4179277420043945, "learning_rate": 6.89767635726795e-05, "loss": 1.0439, "num_input_tokens_seen": 71327560, "step": 4433 }, { "epoch": 0.31059392156347687, "grad_norm": 5.758373260498047, "learning_rate": 6.896976532399299e-05, "loss": 0.9307, "num_input_tokens_seen": 71342848, "step": 4434 }, { "epoch": 0.3106639698092061, "grad_norm": 3.7063519954681396, "learning_rate": 6.896276707530648e-05, "loss": 1.1769, "num_input_tokens_seen": 71359232, "step": 4435 }, { "epoch": 0.31073401805493533, "grad_norm": 4.19386625289917, "learning_rate": 6.895576882661997e-05, "loss": 1.1185, "num_input_tokens_seen": 71375616, "step": 4436 }, { "epoch": 0.3108040663006646, "grad_norm": 4.116868019104004, "learning_rate": 6.894877057793346e-05, "loss": 0.9571, "num_input_tokens_seen": 71392000, "step": 4437 }, { "epoch": 0.31087411454639385, "grad_norm": 4.810275077819824, "learning_rate": 6.894177232924695e-05, "loss": 1.113, "num_input_tokens_seen": 71407448, "step": 4438 }, { "epoch": 0.31094416279212306, "grad_norm": 4.026486873626709, "learning_rate": 6.893477408056042e-05, "loss": 0.9949, "num_input_tokens_seen": 71423832, "step": 4439 }, { "epoch": 0.3110142110378523, "grad_norm": 4.268560886383057, "learning_rate": 6.89277758318739e-05, "loss": 1.0746, "num_input_tokens_seen": 71440216, "step": 4440 }, { "epoch": 0.3110842592835816, "grad_norm": 3.3299612998962402, "learning_rate": 6.89207775831874e-05, "loss": 0.7408, "num_input_tokens_seen": 71456160, "step": 4441 }, { "epoch": 0.31115430752931084, "grad_norm": 3.678912401199341, "learning_rate": 6.891377933450089e-05, "loss": 0.9508, "num_input_tokens_seen": 71472344, "step": 4442 }, { "epoch": 0.31122435577504004, "grad_norm": 3.3206088542938232, "learning_rate": 6.890678108581436e-05, "loss": 0.9531, "num_input_tokens_seen": 71488728, "step": 4443 }, { "epoch": 0.3112944040207693, "grad_norm": 3.6073081493377686, "learning_rate": 6.889978283712785e-05, "loss": 1.18, "num_input_tokens_seen": 71505112, "step": 4444 }, { "epoch": 0.31136445226649856, "grad_norm": 4.998234748840332, "learning_rate": 6.889278458844134e-05, "loss": 1.0307, "num_input_tokens_seen": 71521496, "step": 4445 }, { "epoch": 0.3114345005122278, "grad_norm": 3.7966136932373047, "learning_rate": 6.888578633975481e-05, "loss": 1.05, "num_input_tokens_seen": 71537880, "step": 4446 }, { "epoch": 0.311504548757957, "grad_norm": 3.7041022777557373, "learning_rate": 6.88787880910683e-05, "loss": 1.0701, "num_input_tokens_seen": 71554136, "step": 4447 }, { "epoch": 0.3115745970036863, "grad_norm": 4.155350208282471, "learning_rate": 6.887178984238179e-05, "loss": 1.0144, "num_input_tokens_seen": 71570520, "step": 4448 }, { "epoch": 0.31164464524941554, "grad_norm": 3.608290195465088, "learning_rate": 6.886479159369528e-05, "loss": 1.0224, "num_input_tokens_seen": 71586904, "step": 4449 }, { "epoch": 0.3117146934951448, "grad_norm": 5.258309841156006, "learning_rate": 6.885779334500875e-05, "loss": 1.0929, "num_input_tokens_seen": 71602280, "step": 4450 }, { "epoch": 0.311784741740874, "grad_norm": 4.176782608032227, "learning_rate": 6.885079509632224e-05, "loss": 1.0301, "num_input_tokens_seen": 71618248, "step": 4451 }, { "epoch": 0.31185478998660326, "grad_norm": 3.219015121459961, "learning_rate": 6.884379684763573e-05, "loss": 0.8699, "num_input_tokens_seen": 71634632, "step": 4452 }, { "epoch": 0.3119248382323325, "grad_norm": 3.485370397567749, "learning_rate": 6.883679859894921e-05, "loss": 0.947, "num_input_tokens_seen": 71651016, "step": 4453 }, { "epoch": 0.3119948864780618, "grad_norm": 4.25452184677124, "learning_rate": 6.88298003502627e-05, "loss": 1.1649, "num_input_tokens_seen": 71667400, "step": 4454 }, { "epoch": 0.312064934723791, "grad_norm": 4.2082133293151855, "learning_rate": 6.882280210157618e-05, "loss": 1.0457, "num_input_tokens_seen": 71682656, "step": 4455 }, { "epoch": 0.31213498296952025, "grad_norm": 3.366642475128174, "learning_rate": 6.881580385288967e-05, "loss": 0.8594, "num_input_tokens_seen": 71699040, "step": 4456 }, { "epoch": 0.3122050312152495, "grad_norm": 3.795114278793335, "learning_rate": 6.880880560420316e-05, "loss": 1.0749, "num_input_tokens_seen": 71715424, "step": 4457 }, { "epoch": 0.31227507946097877, "grad_norm": 7.3179121017456055, "learning_rate": 6.880180735551665e-05, "loss": 1.2978, "num_input_tokens_seen": 71731808, "step": 4458 }, { "epoch": 0.31234512770670797, "grad_norm": 5.0151848793029785, "learning_rate": 6.879480910683012e-05, "loss": 1.1359, "num_input_tokens_seen": 71746688, "step": 4459 }, { "epoch": 0.31241517595243723, "grad_norm": 4.136596202850342, "learning_rate": 6.87878108581436e-05, "loss": 1.141, "num_input_tokens_seen": 71763016, "step": 4460 }, { "epoch": 0.3124852241981665, "grad_norm": 3.6476573944091797, "learning_rate": 6.878081260945709e-05, "loss": 0.9967, "num_input_tokens_seen": 71779400, "step": 4461 }, { "epoch": 0.31255527244389575, "grad_norm": 4.907565593719482, "learning_rate": 6.877381436077059e-05, "loss": 1.0391, "num_input_tokens_seen": 71795784, "step": 4462 }, { "epoch": 0.31262532068962495, "grad_norm": 3.82183575630188, "learning_rate": 6.876681611208407e-05, "loss": 1.2237, "num_input_tokens_seen": 71812168, "step": 4463 }, { "epoch": 0.3126953689353542, "grad_norm": 4.63422966003418, "learning_rate": 6.875981786339755e-05, "loss": 1.1271, "num_input_tokens_seen": 71828296, "step": 4464 }, { "epoch": 0.31276541718108347, "grad_norm": 4.02967643737793, "learning_rate": 6.875281961471104e-05, "loss": 1.105, "num_input_tokens_seen": 71844096, "step": 4465 }, { "epoch": 0.31283546542681273, "grad_norm": 3.477452516555786, "learning_rate": 6.874582136602452e-05, "loss": 1.0503, "num_input_tokens_seen": 71860480, "step": 4466 }, { "epoch": 0.312905513672542, "grad_norm": 4.4327168464660645, "learning_rate": 6.873882311733799e-05, "loss": 1.2305, "num_input_tokens_seen": 71876208, "step": 4467 }, { "epoch": 0.3129755619182712, "grad_norm": 3.8214218616485596, "learning_rate": 6.87318248686515e-05, "loss": 1.0271, "num_input_tokens_seen": 71892592, "step": 4468 }, { "epoch": 0.31304561016400045, "grad_norm": 3.4210402965545654, "learning_rate": 6.872482661996498e-05, "loss": 0.8505, "num_input_tokens_seen": 71908976, "step": 4469 }, { "epoch": 0.3131156584097297, "grad_norm": 3.567034959793091, "learning_rate": 6.871782837127846e-05, "loss": 0.7866, "num_input_tokens_seen": 71925200, "step": 4470 }, { "epoch": 0.313185706655459, "grad_norm": 4.694231986999512, "learning_rate": 6.871083012259195e-05, "loss": 0.9634, "num_input_tokens_seen": 71941584, "step": 4471 }, { "epoch": 0.3132557549011882, "grad_norm": 5.802227973937988, "learning_rate": 6.870383187390544e-05, "loss": 1.1923, "num_input_tokens_seen": 71957968, "step": 4472 }, { "epoch": 0.31332580314691744, "grad_norm": 4.238499641418457, "learning_rate": 6.869683362521891e-05, "loss": 1.3381, "num_input_tokens_seen": 71973376, "step": 4473 }, { "epoch": 0.3133958513926467, "grad_norm": 4.2250213623046875, "learning_rate": 6.86898353765324e-05, "loss": 1.0959, "num_input_tokens_seen": 71988560, "step": 4474 }, { "epoch": 0.31346589963837596, "grad_norm": 4.052889823913574, "learning_rate": 6.868283712784589e-05, "loss": 1.1711, "num_input_tokens_seen": 72004232, "step": 4475 }, { "epoch": 0.31353594788410516, "grad_norm": 3.694481134414673, "learning_rate": 6.867583887915938e-05, "loss": 1.187, "num_input_tokens_seen": 72020616, "step": 4476 }, { "epoch": 0.3136059961298344, "grad_norm": 4.2295074462890625, "learning_rate": 6.866884063047285e-05, "loss": 1.2454, "num_input_tokens_seen": 72036912, "step": 4477 }, { "epoch": 0.3136760443755637, "grad_norm": 3.9813766479492188, "learning_rate": 6.866184238178634e-05, "loss": 1.1627, "num_input_tokens_seen": 72053296, "step": 4478 }, { "epoch": 0.31374609262129294, "grad_norm": 4.473883152008057, "learning_rate": 6.865484413309983e-05, "loss": 1.0522, "num_input_tokens_seen": 72069680, "step": 4479 }, { "epoch": 0.31381614086702214, "grad_norm": 3.7663521766662598, "learning_rate": 6.86478458844133e-05, "loss": 0.937, "num_input_tokens_seen": 72085840, "step": 4480 }, { "epoch": 0.3138861891127514, "grad_norm": 3.9587883949279785, "learning_rate": 6.864084763572679e-05, "loss": 1.1194, "num_input_tokens_seen": 72102224, "step": 4481 }, { "epoch": 0.31395623735848066, "grad_norm": 3.953232526779175, "learning_rate": 6.86338493870403e-05, "loss": 0.9581, "num_input_tokens_seen": 72118608, "step": 4482 }, { "epoch": 0.3140262856042099, "grad_norm": 3.917574882507324, "learning_rate": 6.862685113835377e-05, "loss": 1.1883, "num_input_tokens_seen": 72134504, "step": 4483 }, { "epoch": 0.3140963338499391, "grad_norm": 3.756253242492676, "learning_rate": 6.861985288966726e-05, "loss": 1.1057, "num_input_tokens_seen": 72150368, "step": 4484 }, { "epoch": 0.3141663820956684, "grad_norm": 4.146200656890869, "learning_rate": 6.861285464098075e-05, "loss": 0.98, "num_input_tokens_seen": 72166752, "step": 4485 }, { "epoch": 0.31423643034139764, "grad_norm": 3.98949933052063, "learning_rate": 6.860585639229422e-05, "loss": 1.2088, "num_input_tokens_seen": 72182696, "step": 4486 }, { "epoch": 0.3143064785871269, "grad_norm": 3.99951434135437, "learning_rate": 6.85988581436077e-05, "loss": 1.218, "num_input_tokens_seen": 72199080, "step": 4487 }, { "epoch": 0.3143765268328561, "grad_norm": 4.351415157318115, "learning_rate": 6.85918598949212e-05, "loss": 1.0178, "num_input_tokens_seen": 72215176, "step": 4488 }, { "epoch": 0.31444657507858537, "grad_norm": 4.563141822814941, "learning_rate": 6.858486164623469e-05, "loss": 1.0002, "num_input_tokens_seen": 72231560, "step": 4489 }, { "epoch": 0.3145166233243146, "grad_norm": 4.523083686828613, "learning_rate": 6.857786339754816e-05, "loss": 1.1464, "num_input_tokens_seen": 72246920, "step": 4490 }, { "epoch": 0.3145866715700439, "grad_norm": 4.032657623291016, "learning_rate": 6.857086514886165e-05, "loss": 1.1774, "num_input_tokens_seen": 72263304, "step": 4491 }, { "epoch": 0.3146567198157731, "grad_norm": 4.755338191986084, "learning_rate": 6.856386690017514e-05, "loss": 1.0756, "num_input_tokens_seen": 72279688, "step": 4492 }, { "epoch": 0.31472676806150235, "grad_norm": 4.037180423736572, "learning_rate": 6.855686865148862e-05, "loss": 1.2973, "num_input_tokens_seen": 72296072, "step": 4493 }, { "epoch": 0.3147968163072316, "grad_norm": 3.308746099472046, "learning_rate": 6.85498704028021e-05, "loss": 0.9127, "num_input_tokens_seen": 72312360, "step": 4494 }, { "epoch": 0.31486686455296087, "grad_norm": 4.204549789428711, "learning_rate": 6.854287215411559e-05, "loss": 1.1138, "num_input_tokens_seen": 72328744, "step": 4495 }, { "epoch": 0.31493691279869007, "grad_norm": 4.142894744873047, "learning_rate": 6.853587390542908e-05, "loss": 1.0273, "num_input_tokens_seen": 72344944, "step": 4496 }, { "epoch": 0.31500696104441933, "grad_norm": 5.43609094619751, "learning_rate": 6.852887565674256e-05, "loss": 0.9369, "num_input_tokens_seen": 72360672, "step": 4497 }, { "epoch": 0.3150770092901486, "grad_norm": 4.20035982131958, "learning_rate": 6.852187740805604e-05, "loss": 1.0857, "num_input_tokens_seen": 72376744, "step": 4498 }, { "epoch": 0.31514705753587785, "grad_norm": 3.6777737140655518, "learning_rate": 6.851487915936953e-05, "loss": 1.0489, "num_input_tokens_seen": 72393128, "step": 4499 }, { "epoch": 0.31521710578160705, "grad_norm": 5.047235488891602, "learning_rate": 6.850788091068301e-05, "loss": 1.0644, "num_input_tokens_seen": 72408016, "step": 4500 }, { "epoch": 0.3152871540273363, "grad_norm": 4.095731258392334, "learning_rate": 6.85008826619965e-05, "loss": 1.0881, "num_input_tokens_seen": 72424400, "step": 4501 }, { "epoch": 0.3153572022730656, "grad_norm": 3.6437504291534424, "learning_rate": 6.849388441331e-05, "loss": 1.1428, "num_input_tokens_seen": 72440368, "step": 4502 }, { "epoch": 0.31542725051879483, "grad_norm": 5.345888614654541, "learning_rate": 6.848688616462347e-05, "loss": 1.0143, "num_input_tokens_seen": 72456752, "step": 4503 }, { "epoch": 0.3154972987645241, "grad_norm": 4.471817970275879, "learning_rate": 6.847988791593695e-05, "loss": 1.11, "num_input_tokens_seen": 72472952, "step": 4504 }, { "epoch": 0.3155673470102533, "grad_norm": 3.8012888431549072, "learning_rate": 6.847288966725044e-05, "loss": 1.1961, "num_input_tokens_seen": 72489256, "step": 4505 }, { "epoch": 0.31563739525598256, "grad_norm": 7.531235218048096, "learning_rate": 6.846589141856393e-05, "loss": 1.0254, "num_input_tokens_seen": 72503752, "step": 4506 }, { "epoch": 0.3157074435017118, "grad_norm": 4.075259208679199, "learning_rate": 6.84588931698774e-05, "loss": 1.1834, "num_input_tokens_seen": 72520136, "step": 4507 }, { "epoch": 0.3157774917474411, "grad_norm": 5.203637599945068, "learning_rate": 6.84518949211909e-05, "loss": 1.1198, "num_input_tokens_seen": 72536520, "step": 4508 }, { "epoch": 0.3158475399931703, "grad_norm": 5.733241081237793, "learning_rate": 6.844489667250439e-05, "loss": 1.1981, "num_input_tokens_seen": 72551976, "step": 4509 }, { "epoch": 0.31591758823889954, "grad_norm": 4.182814121246338, "learning_rate": 6.843789842381787e-05, "loss": 1.0879, "num_input_tokens_seen": 72568360, "step": 4510 }, { "epoch": 0.3159876364846288, "grad_norm": 5.769293785095215, "learning_rate": 6.843090017513136e-05, "loss": 0.988, "num_input_tokens_seen": 72584744, "step": 4511 }, { "epoch": 0.31605768473035806, "grad_norm": 5.052547454833984, "learning_rate": 6.842390192644484e-05, "loss": 0.952, "num_input_tokens_seen": 72600608, "step": 4512 }, { "epoch": 0.31612773297608726, "grad_norm": 3.7260072231292725, "learning_rate": 6.841690367775832e-05, "loss": 0.948, "num_input_tokens_seen": 72616720, "step": 4513 }, { "epoch": 0.3161977812218165, "grad_norm": 4.230448246002197, "learning_rate": 6.840990542907181e-05, "loss": 1.3362, "num_input_tokens_seen": 72632896, "step": 4514 }, { "epoch": 0.3162678294675458, "grad_norm": 3.7840049266815186, "learning_rate": 6.84029071803853e-05, "loss": 1.1432, "num_input_tokens_seen": 72649280, "step": 4515 }, { "epoch": 0.31633787771327504, "grad_norm": 3.6891443729400635, "learning_rate": 6.839590893169878e-05, "loss": 0.9276, "num_input_tokens_seen": 72665664, "step": 4516 }, { "epoch": 0.31640792595900424, "grad_norm": 5.132042407989502, "learning_rate": 6.838891068301226e-05, "loss": 0.9418, "num_input_tokens_seen": 72682048, "step": 4517 }, { "epoch": 0.3164779742047335, "grad_norm": 4.329607009887695, "learning_rate": 6.838191243432575e-05, "loss": 1.0247, "num_input_tokens_seen": 72698136, "step": 4518 }, { "epoch": 0.31654802245046276, "grad_norm": 4.269455432891846, "learning_rate": 6.837491418563924e-05, "loss": 1.1186, "num_input_tokens_seen": 72714296, "step": 4519 }, { "epoch": 0.316618070696192, "grad_norm": 3.5963287353515625, "learning_rate": 6.836791593695271e-05, "loss": 0.8834, "num_input_tokens_seen": 72730680, "step": 4520 }, { "epoch": 0.3166881189419212, "grad_norm": 3.9145658016204834, "learning_rate": 6.83609176882662e-05, "loss": 1.1385, "num_input_tokens_seen": 72746296, "step": 4521 }, { "epoch": 0.3167581671876505, "grad_norm": 4.266791820526123, "learning_rate": 6.83539194395797e-05, "loss": 1.0825, "num_input_tokens_seen": 72762680, "step": 4522 }, { "epoch": 0.31682821543337975, "grad_norm": 3.850743532180786, "learning_rate": 6.834692119089318e-05, "loss": 1.0558, "num_input_tokens_seen": 72778816, "step": 4523 }, { "epoch": 0.316898263679109, "grad_norm": 3.8117008209228516, "learning_rate": 6.833992294220665e-05, "loss": 0.9562, "num_input_tokens_seen": 72794576, "step": 4524 }, { "epoch": 0.3169683119248382, "grad_norm": 4.469017028808594, "learning_rate": 6.833292469352014e-05, "loss": 1.2533, "num_input_tokens_seen": 72810960, "step": 4525 }, { "epoch": 0.31703836017056747, "grad_norm": 3.538980007171631, "learning_rate": 6.832592644483363e-05, "loss": 0.9393, "num_input_tokens_seen": 72826480, "step": 4526 }, { "epoch": 0.3171084084162967, "grad_norm": 3.6429643630981445, "learning_rate": 6.83189281961471e-05, "loss": 1.0492, "num_input_tokens_seen": 72842440, "step": 4527 }, { "epoch": 0.317178456662026, "grad_norm": 3.876481056213379, "learning_rate": 6.831192994746061e-05, "loss": 1.0699, "num_input_tokens_seen": 72858424, "step": 4528 }, { "epoch": 0.3172485049077552, "grad_norm": 5.119854927062988, "learning_rate": 6.83049316987741e-05, "loss": 1.1704, "num_input_tokens_seen": 72874808, "step": 4529 }, { "epoch": 0.31731855315348445, "grad_norm": 3.908071994781494, "learning_rate": 6.829793345008757e-05, "loss": 1.0156, "num_input_tokens_seen": 72891192, "step": 4530 }, { "epoch": 0.3173886013992137, "grad_norm": 4.499825954437256, "learning_rate": 6.829093520140105e-05, "loss": 0.9863, "num_input_tokens_seen": 72907576, "step": 4531 }, { "epoch": 0.31745864964494297, "grad_norm": 4.060844421386719, "learning_rate": 6.828393695271453e-05, "loss": 1.0173, "num_input_tokens_seen": 72923960, "step": 4532 }, { "epoch": 0.3175286978906722, "grad_norm": 4.47066068649292, "learning_rate": 6.827693870402802e-05, "loss": 0.859, "num_input_tokens_seen": 72939576, "step": 4533 }, { "epoch": 0.31759874613640143, "grad_norm": 3.6252682209014893, "learning_rate": 6.826994045534151e-05, "loss": 0.996, "num_input_tokens_seen": 72955136, "step": 4534 }, { "epoch": 0.3176687943821307, "grad_norm": 4.25836181640625, "learning_rate": 6.8262942206655e-05, "loss": 1.0267, "num_input_tokens_seen": 72971520, "step": 4535 }, { "epoch": 0.31773884262785995, "grad_norm": 3.6240739822387695, "learning_rate": 6.825594395796849e-05, "loss": 1.0116, "num_input_tokens_seen": 72987440, "step": 4536 }, { "epoch": 0.3178088908735892, "grad_norm": 4.470614910125732, "learning_rate": 6.824894570928196e-05, "loss": 1.1302, "num_input_tokens_seen": 73003824, "step": 4537 }, { "epoch": 0.3178789391193184, "grad_norm": 3.5759263038635254, "learning_rate": 6.824194746059545e-05, "loss": 0.9902, "num_input_tokens_seen": 73020208, "step": 4538 }, { "epoch": 0.3179489873650477, "grad_norm": 4.424665451049805, "learning_rate": 6.823494921190894e-05, "loss": 1.0239, "num_input_tokens_seen": 73036592, "step": 4539 }, { "epoch": 0.31801903561077693, "grad_norm": 3.803205966949463, "learning_rate": 6.822795096322242e-05, "loss": 0.9315, "num_input_tokens_seen": 73052976, "step": 4540 }, { "epoch": 0.3180890838565062, "grad_norm": 4.25760555267334, "learning_rate": 6.82209527145359e-05, "loss": 1.0985, "num_input_tokens_seen": 73069360, "step": 4541 }, { "epoch": 0.3181591321022354, "grad_norm": 4.006928443908691, "learning_rate": 6.82139544658494e-05, "loss": 0.9056, "num_input_tokens_seen": 73084624, "step": 4542 }, { "epoch": 0.31822918034796466, "grad_norm": 3.56350040435791, "learning_rate": 6.820695621716288e-05, "loss": 0.8721, "num_input_tokens_seen": 73100008, "step": 4543 }, { "epoch": 0.3182992285936939, "grad_norm": 3.7276062965393066, "learning_rate": 6.819995796847636e-05, "loss": 1.1001, "num_input_tokens_seen": 73116392, "step": 4544 }, { "epoch": 0.3183692768394232, "grad_norm": 4.955738544464111, "learning_rate": 6.819295971978985e-05, "loss": 1.0459, "num_input_tokens_seen": 73131920, "step": 4545 }, { "epoch": 0.3184393250851524, "grad_norm": 3.5275161266326904, "learning_rate": 6.818596147110333e-05, "loss": 1.1006, "num_input_tokens_seen": 73148304, "step": 4546 }, { "epoch": 0.31850937333088164, "grad_norm": 6.4245924949646, "learning_rate": 6.817896322241681e-05, "loss": 1.2968, "num_input_tokens_seen": 73164688, "step": 4547 }, { "epoch": 0.3185794215766109, "grad_norm": 4.1172966957092285, "learning_rate": 6.81719649737303e-05, "loss": 1.0743, "num_input_tokens_seen": 73181072, "step": 4548 }, { "epoch": 0.31864946982234016, "grad_norm": 3.849090337753296, "learning_rate": 6.81649667250438e-05, "loss": 1.1064, "num_input_tokens_seen": 73197456, "step": 4549 }, { "epoch": 0.31871951806806936, "grad_norm": 6.241509437561035, "learning_rate": 6.815796847635728e-05, "loss": 1.0592, "num_input_tokens_seen": 73213568, "step": 4550 }, { "epoch": 0.3187895663137986, "grad_norm": 4.039997577667236, "learning_rate": 6.815097022767075e-05, "loss": 0.9789, "num_input_tokens_seen": 73229648, "step": 4551 }, { "epoch": 0.3188596145595279, "grad_norm": 3.757549285888672, "learning_rate": 6.814397197898424e-05, "loss": 1.1547, "num_input_tokens_seen": 73245952, "step": 4552 }, { "epoch": 0.31892966280525714, "grad_norm": 4.177220821380615, "learning_rate": 6.813697373029773e-05, "loss": 1.3134, "num_input_tokens_seen": 73262336, "step": 4553 }, { "epoch": 0.31899971105098635, "grad_norm": 3.659167766571045, "learning_rate": 6.812997548161122e-05, "loss": 0.9954, "num_input_tokens_seen": 73278304, "step": 4554 }, { "epoch": 0.3190697592967156, "grad_norm": 4.289649486541748, "learning_rate": 6.81229772329247e-05, "loss": 0.8452, "num_input_tokens_seen": 73294320, "step": 4555 }, { "epoch": 0.31913980754244486, "grad_norm": 4.452631950378418, "learning_rate": 6.811597898423819e-05, "loss": 1.0265, "num_input_tokens_seen": 73310256, "step": 4556 }, { "epoch": 0.3192098557881741, "grad_norm": 3.572444438934326, "learning_rate": 6.810898073555167e-05, "loss": 1.0247, "num_input_tokens_seen": 73326640, "step": 4557 }, { "epoch": 0.31927990403390333, "grad_norm": 4.059347629547119, "learning_rate": 6.810198248686514e-05, "loss": 1.0103, "num_input_tokens_seen": 73342096, "step": 4558 }, { "epoch": 0.3193499522796326, "grad_norm": 5.144520282745361, "learning_rate": 6.809498423817863e-05, "loss": 1.1181, "num_input_tokens_seen": 73358480, "step": 4559 }, { "epoch": 0.31942000052536185, "grad_norm": 4.210456848144531, "learning_rate": 6.808798598949212e-05, "loss": 1.1197, "num_input_tokens_seen": 73374864, "step": 4560 }, { "epoch": 0.3194900487710911, "grad_norm": 5.06007194519043, "learning_rate": 6.808098774080561e-05, "loss": 0.8933, "num_input_tokens_seen": 73391248, "step": 4561 }, { "epoch": 0.3195600970168203, "grad_norm": 4.032425403594971, "learning_rate": 6.80739894921191e-05, "loss": 0.9132, "num_input_tokens_seen": 73406728, "step": 4562 }, { "epoch": 0.31963014526254957, "grad_norm": 4.344507694244385, "learning_rate": 6.806699124343259e-05, "loss": 1.1248, "num_input_tokens_seen": 73423112, "step": 4563 }, { "epoch": 0.31970019350827883, "grad_norm": 3.7113993167877197, "learning_rate": 6.805999299474606e-05, "loss": 0.9122, "num_input_tokens_seen": 73439496, "step": 4564 }, { "epoch": 0.3197702417540081, "grad_norm": 4.160495281219482, "learning_rate": 6.805299474605955e-05, "loss": 1.0425, "num_input_tokens_seen": 73455456, "step": 4565 }, { "epoch": 0.3198402899997373, "grad_norm": 5.51431131362915, "learning_rate": 6.804599649737304e-05, "loss": 0.9416, "num_input_tokens_seen": 73471840, "step": 4566 }, { "epoch": 0.31991033824546655, "grad_norm": 4.145261287689209, "learning_rate": 6.803899824868651e-05, "loss": 0.947, "num_input_tokens_seen": 73487688, "step": 4567 }, { "epoch": 0.3199803864911958, "grad_norm": 3.917922019958496, "learning_rate": 6.8032e-05, "loss": 1.1859, "num_input_tokens_seen": 73504072, "step": 4568 }, { "epoch": 0.32005043473692507, "grad_norm": 3.8644864559173584, "learning_rate": 6.802500175131349e-05, "loss": 0.9176, "num_input_tokens_seen": 73520344, "step": 4569 }, { "epoch": 0.32012048298265433, "grad_norm": 4.043839931488037, "learning_rate": 6.801800350262698e-05, "loss": 1.0045, "num_input_tokens_seen": 73536248, "step": 4570 }, { "epoch": 0.32019053122838353, "grad_norm": 4.793722629547119, "learning_rate": 6.801100525394045e-05, "loss": 1.2245, "num_input_tokens_seen": 73552512, "step": 4571 }, { "epoch": 0.3202605794741128, "grad_norm": 3.632899761199951, "learning_rate": 6.800400700525394e-05, "loss": 0.9899, "num_input_tokens_seen": 73568896, "step": 4572 }, { "epoch": 0.32033062771984205, "grad_norm": 6.236395359039307, "learning_rate": 6.799700875656743e-05, "loss": 1.113, "num_input_tokens_seen": 73585280, "step": 4573 }, { "epoch": 0.3204006759655713, "grad_norm": 4.591775417327881, "learning_rate": 6.799001050788092e-05, "loss": 1.0019, "num_input_tokens_seen": 73600328, "step": 4574 }, { "epoch": 0.3204707242113005, "grad_norm": 3.9546539783477783, "learning_rate": 6.79830122591944e-05, "loss": 1.0444, "num_input_tokens_seen": 73616568, "step": 4575 }, { "epoch": 0.3205407724570298, "grad_norm": 4.425241470336914, "learning_rate": 6.79760140105079e-05, "loss": 1.0112, "num_input_tokens_seen": 73632552, "step": 4576 }, { "epoch": 0.32061082070275904, "grad_norm": 3.999953508377075, "learning_rate": 6.796901576182137e-05, "loss": 1.1854, "num_input_tokens_seen": 73648672, "step": 4577 }, { "epoch": 0.3206808689484883, "grad_norm": 3.6718766689300537, "learning_rate": 6.796201751313485e-05, "loss": 1.0379, "num_input_tokens_seen": 73665056, "step": 4578 }, { "epoch": 0.3207509171942175, "grad_norm": 4.37136173248291, "learning_rate": 6.795501926444834e-05, "loss": 0.9921, "num_input_tokens_seen": 73679680, "step": 4579 }, { "epoch": 0.32082096543994676, "grad_norm": 5.109454154968262, "learning_rate": 6.794802101576182e-05, "loss": 1.0901, "num_input_tokens_seen": 73695768, "step": 4580 }, { "epoch": 0.320891013685676, "grad_norm": 4.277298927307129, "learning_rate": 6.794102276707531e-05, "loss": 1.0651, "num_input_tokens_seen": 73711840, "step": 4581 }, { "epoch": 0.3209610619314053, "grad_norm": 4.598893165588379, "learning_rate": 6.79340245183888e-05, "loss": 1.1733, "num_input_tokens_seen": 73728184, "step": 4582 }, { "epoch": 0.3210311101771345, "grad_norm": 5.124484539031982, "learning_rate": 6.792702626970229e-05, "loss": 0.9399, "num_input_tokens_seen": 73744568, "step": 4583 }, { "epoch": 0.32110115842286374, "grad_norm": 4.426584243774414, "learning_rate": 6.792002802101577e-05, "loss": 0.8339, "num_input_tokens_seen": 73760424, "step": 4584 }, { "epoch": 0.321171206668593, "grad_norm": 3.5181384086608887, "learning_rate": 6.791302977232924e-05, "loss": 0.8025, "num_input_tokens_seen": 73776808, "step": 4585 }, { "epoch": 0.32124125491432226, "grad_norm": 6.614295482635498, "learning_rate": 6.790603152364273e-05, "loss": 1.1392, "num_input_tokens_seen": 73793192, "step": 4586 }, { "epoch": 0.32131130316005146, "grad_norm": 5.212308406829834, "learning_rate": 6.789903327495622e-05, "loss": 1.0909, "num_input_tokens_seen": 73809576, "step": 4587 }, { "epoch": 0.3213813514057807, "grad_norm": 4.7378106117248535, "learning_rate": 6.78920350262697e-05, "loss": 1.112, "num_input_tokens_seen": 73825680, "step": 4588 }, { "epoch": 0.32145139965151, "grad_norm": 5.0195136070251465, "learning_rate": 6.78850367775832e-05, "loss": 1.4437, "num_input_tokens_seen": 73841200, "step": 4589 }, { "epoch": 0.32152144789723924, "grad_norm": 6.186412811279297, "learning_rate": 6.787803852889668e-05, "loss": 1.0715, "num_input_tokens_seen": 73857584, "step": 4590 }, { "epoch": 0.32159149614296845, "grad_norm": 6.835412502288818, "learning_rate": 6.787104028021016e-05, "loss": 0.9454, "num_input_tokens_seen": 73873624, "step": 4591 }, { "epoch": 0.3216615443886977, "grad_norm": 4.3859333992004395, "learning_rate": 6.786404203152365e-05, "loss": 0.9344, "num_input_tokens_seen": 73890008, "step": 4592 }, { "epoch": 0.32173159263442697, "grad_norm": 3.8230555057525635, "learning_rate": 6.785704378283714e-05, "loss": 0.9475, "num_input_tokens_seen": 73906392, "step": 4593 }, { "epoch": 0.3218016408801562, "grad_norm": 4.458274841308594, "learning_rate": 6.785004553415062e-05, "loss": 1.1223, "num_input_tokens_seen": 73922776, "step": 4594 }, { "epoch": 0.32187168912588543, "grad_norm": 4.006426811218262, "learning_rate": 6.78430472854641e-05, "loss": 1.3019, "num_input_tokens_seen": 73938896, "step": 4595 }, { "epoch": 0.3219417373716147, "grad_norm": 4.637386322021484, "learning_rate": 6.783604903677759e-05, "loss": 1.0272, "num_input_tokens_seen": 73955280, "step": 4596 }, { "epoch": 0.32201178561734395, "grad_norm": 5.13168478012085, "learning_rate": 6.782905078809108e-05, "loss": 1.1046, "num_input_tokens_seen": 73971480, "step": 4597 }, { "epoch": 0.3220818338630732, "grad_norm": 3.8248770236968994, "learning_rate": 6.782205253940455e-05, "loss": 1.0467, "num_input_tokens_seen": 73987712, "step": 4598 }, { "epoch": 0.3221518821088024, "grad_norm": 5.167041778564453, "learning_rate": 6.781505429071804e-05, "loss": 1.005, "num_input_tokens_seen": 74004096, "step": 4599 }, { "epoch": 0.32222193035453167, "grad_norm": 3.779311180114746, "learning_rate": 6.780805604203153e-05, "loss": 0.9102, "num_input_tokens_seen": 74020176, "step": 4600 }, { "epoch": 0.32222193035453167, "eval_loss": 1.1318858861923218, "eval_runtime": 0.2027, "eval_samples_per_second": 4.933, "eval_steps_per_second": 4.933, "num_input_tokens_seen": 74020176, "step": 4600 }, { "epoch": 0.32229197860026093, "grad_norm": 3.8468148708343506, "learning_rate": 6.780105779334502e-05, "loss": 0.9602, "num_input_tokens_seen": 74035664, "step": 4601 }, { "epoch": 0.3223620268459902, "grad_norm": 4.28491735458374, "learning_rate": 6.779405954465849e-05, "loss": 1.1125, "num_input_tokens_seen": 74050408, "step": 4602 }, { "epoch": 0.3224320750917194, "grad_norm": 4.872751712799072, "learning_rate": 6.7787061295972e-05, "loss": 0.9746, "num_input_tokens_seen": 74066336, "step": 4603 }, { "epoch": 0.32250212333744865, "grad_norm": 4.060647487640381, "learning_rate": 6.778006304728547e-05, "loss": 1.0575, "num_input_tokens_seen": 74082720, "step": 4604 }, { "epoch": 0.3225721715831779, "grad_norm": 3.607623815536499, "learning_rate": 6.777306479859894e-05, "loss": 0.9797, "num_input_tokens_seen": 74099104, "step": 4605 }, { "epoch": 0.3226422198289072, "grad_norm": 3.719801187515259, "learning_rate": 6.776606654991243e-05, "loss": 1.0249, "num_input_tokens_seen": 74115488, "step": 4606 }, { "epoch": 0.32271226807463643, "grad_norm": 5.072197914123535, "learning_rate": 6.775906830122592e-05, "loss": 1.1264, "num_input_tokens_seen": 74131224, "step": 4607 }, { "epoch": 0.32278231632036564, "grad_norm": 6.052949905395508, "learning_rate": 6.775207005253941e-05, "loss": 1.1319, "num_input_tokens_seen": 74147608, "step": 4608 }, { "epoch": 0.3228523645660949, "grad_norm": 6.214832782745361, "learning_rate": 6.77450718038529e-05, "loss": 0.9861, "num_input_tokens_seen": 74162840, "step": 4609 }, { "epoch": 0.32292241281182416, "grad_norm": 4.279264450073242, "learning_rate": 6.773807355516639e-05, "loss": 0.9855, "num_input_tokens_seen": 74179224, "step": 4610 }, { "epoch": 0.3229924610575534, "grad_norm": 3.8564460277557373, "learning_rate": 6.773107530647986e-05, "loss": 1.0555, "num_input_tokens_seen": 74195608, "step": 4611 }, { "epoch": 0.3230625093032826, "grad_norm": 4.747770309448242, "learning_rate": 6.772407705779334e-05, "loss": 1.0011, "num_input_tokens_seen": 74211664, "step": 4612 }, { "epoch": 0.3231325575490119, "grad_norm": 3.5425655841827393, "learning_rate": 6.771707880910683e-05, "loss": 1.135, "num_input_tokens_seen": 74228048, "step": 4613 }, { "epoch": 0.32320260579474114, "grad_norm": 3.919851303100586, "learning_rate": 6.771008056042033e-05, "loss": 0.9791, "num_input_tokens_seen": 74243424, "step": 4614 }, { "epoch": 0.3232726540404704, "grad_norm": 4.061427593231201, "learning_rate": 6.77030823117338e-05, "loss": 1.2477, "num_input_tokens_seen": 74259696, "step": 4615 }, { "epoch": 0.3233427022861996, "grad_norm": 5.14341926574707, "learning_rate": 6.769608406304729e-05, "loss": 0.9715, "num_input_tokens_seen": 74274968, "step": 4616 }, { "epoch": 0.32341275053192886, "grad_norm": 6.207670211791992, "learning_rate": 6.768908581436078e-05, "loss": 1.0955, "num_input_tokens_seen": 74291352, "step": 4617 }, { "epoch": 0.3234827987776581, "grad_norm": 5.948925971984863, "learning_rate": 6.768208756567426e-05, "loss": 1.1007, "num_input_tokens_seen": 74307000, "step": 4618 }, { "epoch": 0.3235528470233874, "grad_norm": 5.205277442932129, "learning_rate": 6.767508931698774e-05, "loss": 0.9458, "num_input_tokens_seen": 74323384, "step": 4619 }, { "epoch": 0.3236228952691166, "grad_norm": 3.8878557682037354, "learning_rate": 6.766809106830123e-05, "loss": 1.01, "num_input_tokens_seen": 74339768, "step": 4620 }, { "epoch": 0.32369294351484584, "grad_norm": 4.9194111824035645, "learning_rate": 6.766109281961472e-05, "loss": 1.1011, "num_input_tokens_seen": 74355888, "step": 4621 }, { "epoch": 0.3237629917605751, "grad_norm": 3.5212655067443848, "learning_rate": 6.76540945709282e-05, "loss": 1.0886, "num_input_tokens_seen": 74372048, "step": 4622 }, { "epoch": 0.32383304000630436, "grad_norm": 3.6212568283081055, "learning_rate": 6.764709632224168e-05, "loss": 1.0616, "num_input_tokens_seen": 74388432, "step": 4623 }, { "epoch": 0.32390308825203357, "grad_norm": 3.795515298843384, "learning_rate": 6.764009807355517e-05, "loss": 1.1594, "num_input_tokens_seen": 74404584, "step": 4624 }, { "epoch": 0.3239731364977628, "grad_norm": 4.537838935852051, "learning_rate": 6.763309982486865e-05, "loss": 1.1319, "num_input_tokens_seen": 74420304, "step": 4625 }, { "epoch": 0.3240431847434921, "grad_norm": 4.276764392852783, "learning_rate": 6.762610157618214e-05, "loss": 0.9162, "num_input_tokens_seen": 74436688, "step": 4626 }, { "epoch": 0.32411323298922134, "grad_norm": 3.9739227294921875, "learning_rate": 6.761910332749563e-05, "loss": 1.0002, "num_input_tokens_seen": 74451824, "step": 4627 }, { "epoch": 0.32418328123495055, "grad_norm": 4.176823616027832, "learning_rate": 6.761210507880911e-05, "loss": 1.2547, "num_input_tokens_seen": 74467080, "step": 4628 }, { "epoch": 0.3242533294806798, "grad_norm": 4.471405029296875, "learning_rate": 6.760510683012259e-05, "loss": 0.9694, "num_input_tokens_seen": 74483464, "step": 4629 }, { "epoch": 0.32432337772640907, "grad_norm": 3.95442271232605, "learning_rate": 6.759810858143609e-05, "loss": 1.1059, "num_input_tokens_seen": 74499848, "step": 4630 }, { "epoch": 0.3243934259721383, "grad_norm": 5.348501682281494, "learning_rate": 6.759111033274957e-05, "loss": 1.043, "num_input_tokens_seen": 74516232, "step": 4631 }, { "epoch": 0.32446347421786753, "grad_norm": 4.405150413513184, "learning_rate": 6.758411208406304e-05, "loss": 1.0732, "num_input_tokens_seen": 74531120, "step": 4632 }, { "epoch": 0.3245335224635968, "grad_norm": 3.633358955383301, "learning_rate": 6.757711383537653e-05, "loss": 0.9585, "num_input_tokens_seen": 74547504, "step": 4633 }, { "epoch": 0.32460357070932605, "grad_norm": 4.668785095214844, "learning_rate": 6.757011558669003e-05, "loss": 1.2355, "num_input_tokens_seen": 74563888, "step": 4634 }, { "epoch": 0.3246736189550553, "grad_norm": 5.222908020019531, "learning_rate": 6.756311733800351e-05, "loss": 1.094, "num_input_tokens_seen": 74580224, "step": 4635 }, { "epoch": 0.3247436672007845, "grad_norm": 3.812385082244873, "learning_rate": 6.7556119089317e-05, "loss": 1.1326, "num_input_tokens_seen": 74596608, "step": 4636 }, { "epoch": 0.3248137154465138, "grad_norm": 5.080833911895752, "learning_rate": 6.754912084063048e-05, "loss": 1.0665, "num_input_tokens_seen": 74612456, "step": 4637 }, { "epoch": 0.32488376369224303, "grad_norm": 5.309609413146973, "learning_rate": 6.754212259194396e-05, "loss": 1.0206, "num_input_tokens_seen": 74627840, "step": 4638 }, { "epoch": 0.3249538119379723, "grad_norm": 4.46236515045166, "learning_rate": 6.753512434325743e-05, "loss": 1.1093, "num_input_tokens_seen": 74643800, "step": 4639 }, { "epoch": 0.32502386018370155, "grad_norm": 9.981855392456055, "learning_rate": 6.752812609457094e-05, "loss": 1.2777, "num_input_tokens_seen": 74660184, "step": 4640 }, { "epoch": 0.32509390842943076, "grad_norm": 5.075852870941162, "learning_rate": 6.752112784588443e-05, "loss": 0.9977, "num_input_tokens_seen": 74676568, "step": 4641 }, { "epoch": 0.32516395667516, "grad_norm": 3.8985090255737305, "learning_rate": 6.75141295971979e-05, "loss": 1.1299, "num_input_tokens_seen": 74692952, "step": 4642 }, { "epoch": 0.3252340049208893, "grad_norm": 4.9769673347473145, "learning_rate": 6.750713134851139e-05, "loss": 1.3023, "num_input_tokens_seen": 74709216, "step": 4643 }, { "epoch": 0.32530405316661853, "grad_norm": 4.508238315582275, "learning_rate": 6.750013309982488e-05, "loss": 1.014, "num_input_tokens_seen": 74724640, "step": 4644 }, { "epoch": 0.32537410141234774, "grad_norm": 4.214225769042969, "learning_rate": 6.749313485113835e-05, "loss": 1.0864, "num_input_tokens_seen": 74740696, "step": 4645 }, { "epoch": 0.325444149658077, "grad_norm": 4.217604160308838, "learning_rate": 6.748613660245184e-05, "loss": 1.0521, "num_input_tokens_seen": 74756520, "step": 4646 }, { "epoch": 0.32551419790380626, "grad_norm": 3.5975253582000732, "learning_rate": 6.747913835376533e-05, "loss": 0.9642, "num_input_tokens_seen": 74772904, "step": 4647 }, { "epoch": 0.3255842461495355, "grad_norm": 3.5055267810821533, "learning_rate": 6.747214010507882e-05, "loss": 1.0975, "num_input_tokens_seen": 74789288, "step": 4648 }, { "epoch": 0.3256542943952647, "grad_norm": 3.8605833053588867, "learning_rate": 6.746514185639229e-05, "loss": 1.095, "num_input_tokens_seen": 74804768, "step": 4649 }, { "epoch": 0.325724342640994, "grad_norm": 9.446599006652832, "learning_rate": 6.745814360770578e-05, "loss": 1.1894, "num_input_tokens_seen": 74821152, "step": 4650 }, { "epoch": 0.32579439088672324, "grad_norm": 4.161158084869385, "learning_rate": 6.745114535901927e-05, "loss": 0.984, "num_input_tokens_seen": 74836992, "step": 4651 }, { "epoch": 0.3258644391324525, "grad_norm": 3.5690324306488037, "learning_rate": 6.744414711033275e-05, "loss": 1.0186, "num_input_tokens_seen": 74852896, "step": 4652 }, { "epoch": 0.3259344873781817, "grad_norm": 3.5873210430145264, "learning_rate": 6.743714886164623e-05, "loss": 1.0069, "num_input_tokens_seen": 74868472, "step": 4653 }, { "epoch": 0.32600453562391096, "grad_norm": 4.192559719085693, "learning_rate": 6.743015061295972e-05, "loss": 1.0646, "num_input_tokens_seen": 74884856, "step": 4654 }, { "epoch": 0.3260745838696402, "grad_norm": 4.633018493652344, "learning_rate": 6.742315236427321e-05, "loss": 1.1525, "num_input_tokens_seen": 74900848, "step": 4655 }, { "epoch": 0.3261446321153695, "grad_norm": 3.568934440612793, "learning_rate": 6.741615411558669e-05, "loss": 1.061, "num_input_tokens_seen": 74917232, "step": 4656 }, { "epoch": 0.3262146803610987, "grad_norm": 3.6099655628204346, "learning_rate": 6.740915586690019e-05, "loss": 1.0758, "num_input_tokens_seen": 74933616, "step": 4657 }, { "epoch": 0.32628472860682795, "grad_norm": 4.272975921630859, "learning_rate": 6.740215761821366e-05, "loss": 1.1901, "num_input_tokens_seen": 74950000, "step": 4658 }, { "epoch": 0.3263547768525572, "grad_norm": 4.2752251625061035, "learning_rate": 6.739515936952714e-05, "loss": 1.0835, "num_input_tokens_seen": 74966032, "step": 4659 }, { "epoch": 0.32642482509828646, "grad_norm": 5.06410551071167, "learning_rate": 6.738816112084064e-05, "loss": 1.2041, "num_input_tokens_seen": 74981432, "step": 4660 }, { "epoch": 0.32649487334401567, "grad_norm": 6.378856182098389, "learning_rate": 6.738116287215413e-05, "loss": 1.2996, "num_input_tokens_seen": 74997440, "step": 4661 }, { "epoch": 0.3265649215897449, "grad_norm": 5.427485466003418, "learning_rate": 6.73741646234676e-05, "loss": 1.2233, "num_input_tokens_seen": 75013824, "step": 4662 }, { "epoch": 0.3266349698354742, "grad_norm": 4.366839408874512, "learning_rate": 6.736716637478109e-05, "loss": 1.2077, "num_input_tokens_seen": 75030208, "step": 4663 }, { "epoch": 0.32670501808120345, "grad_norm": 5.765005588531494, "learning_rate": 6.736016812609458e-05, "loss": 1.0833, "num_input_tokens_seen": 75046592, "step": 4664 }, { "epoch": 0.32677506632693265, "grad_norm": 3.4886975288391113, "learning_rate": 6.735316987740806e-05, "loss": 0.7976, "num_input_tokens_seen": 75062976, "step": 4665 }, { "epoch": 0.3268451145726619, "grad_norm": 4.1105875968933105, "learning_rate": 6.734617162872154e-05, "loss": 1.011, "num_input_tokens_seen": 75078024, "step": 4666 }, { "epoch": 0.32691516281839117, "grad_norm": 3.8737053871154785, "learning_rate": 6.733917338003503e-05, "loss": 1.0544, "num_input_tokens_seen": 75094408, "step": 4667 }, { "epoch": 0.32698521106412043, "grad_norm": 4.077807426452637, "learning_rate": 6.733217513134852e-05, "loss": 1.1573, "num_input_tokens_seen": 75110792, "step": 4668 }, { "epoch": 0.32705525930984963, "grad_norm": 4.339305400848389, "learning_rate": 6.7325176882662e-05, "loss": 0.7132, "num_input_tokens_seen": 75126240, "step": 4669 }, { "epoch": 0.3271253075555789, "grad_norm": 4.241507053375244, "learning_rate": 6.731817863397549e-05, "loss": 1.1594, "num_input_tokens_seen": 75142144, "step": 4670 }, { "epoch": 0.32719535580130815, "grad_norm": 7.518558979034424, "learning_rate": 6.731118038528897e-05, "loss": 1.0168, "num_input_tokens_seen": 75158528, "step": 4671 }, { "epoch": 0.3272654040470374, "grad_norm": 4.342295169830322, "learning_rate": 6.730418213660245e-05, "loss": 1.2134, "num_input_tokens_seen": 75174912, "step": 4672 }, { "epoch": 0.3273354522927666, "grad_norm": 3.3599188327789307, "learning_rate": 6.729718388791594e-05, "loss": 0.9183, "num_input_tokens_seen": 75190720, "step": 4673 }, { "epoch": 0.3274055005384959, "grad_norm": 4.393617153167725, "learning_rate": 6.729018563922943e-05, "loss": 1.1215, "num_input_tokens_seen": 75207104, "step": 4674 }, { "epoch": 0.32747554878422513, "grad_norm": 3.948538064956665, "learning_rate": 6.728318739054292e-05, "loss": 0.9105, "num_input_tokens_seen": 75222736, "step": 4675 }, { "epoch": 0.3275455970299544, "grad_norm": 5.3323469161987305, "learning_rate": 6.727618914185639e-05, "loss": 0.9977, "num_input_tokens_seen": 75238680, "step": 4676 }, { "epoch": 0.32761564527568365, "grad_norm": 4.943187713623047, "learning_rate": 6.726919089316988e-05, "loss": 0.9327, "num_input_tokens_seen": 75255064, "step": 4677 }, { "epoch": 0.32768569352141286, "grad_norm": 4.083932399749756, "learning_rate": 6.726219264448337e-05, "loss": 1.2085, "num_input_tokens_seen": 75271448, "step": 4678 }, { "epoch": 0.3277557417671421, "grad_norm": 4.682622909545898, "learning_rate": 6.725519439579684e-05, "loss": 1.0105, "num_input_tokens_seen": 75287752, "step": 4679 }, { "epoch": 0.3278257900128714, "grad_norm": 4.544816493988037, "learning_rate": 6.724819614711033e-05, "loss": 1.0422, "num_input_tokens_seen": 75304136, "step": 4680 }, { "epoch": 0.32789583825860064, "grad_norm": 3.859891176223755, "learning_rate": 6.724119789842383e-05, "loss": 0.9317, "num_input_tokens_seen": 75320520, "step": 4681 }, { "epoch": 0.32796588650432984, "grad_norm": 5.739070415496826, "learning_rate": 6.723419964973731e-05, "loss": 1.1315, "num_input_tokens_seen": 75336904, "step": 4682 }, { "epoch": 0.3280359347500591, "grad_norm": 4.289483547210693, "learning_rate": 6.722720140105078e-05, "loss": 1.0576, "num_input_tokens_seen": 75353288, "step": 4683 }, { "epoch": 0.32810598299578836, "grad_norm": 4.03695011138916, "learning_rate": 6.722020315236429e-05, "loss": 1.0129, "num_input_tokens_seen": 75369424, "step": 4684 }, { "epoch": 0.3281760312415176, "grad_norm": 3.8941352367401123, "learning_rate": 6.721320490367776e-05, "loss": 1.015, "num_input_tokens_seen": 75385760, "step": 4685 }, { "epoch": 0.3282460794872468, "grad_norm": 4.345769882202148, "learning_rate": 6.720620665499125e-05, "loss": 0.9842, "num_input_tokens_seen": 75401736, "step": 4686 }, { "epoch": 0.3283161277329761, "grad_norm": 5.759182453155518, "learning_rate": 6.719920840630474e-05, "loss": 1.0937, "num_input_tokens_seen": 75417928, "step": 4687 }, { "epoch": 0.32838617597870534, "grad_norm": 4.947919845581055, "learning_rate": 6.719221015761823e-05, "loss": 1.0346, "num_input_tokens_seen": 75433624, "step": 4688 }, { "epoch": 0.3284562242244346, "grad_norm": 3.936934471130371, "learning_rate": 6.71852119089317e-05, "loss": 1.1684, "num_input_tokens_seen": 75450008, "step": 4689 }, { "epoch": 0.3285262724701638, "grad_norm": 3.7944555282592773, "learning_rate": 6.717821366024519e-05, "loss": 0.9825, "num_input_tokens_seen": 75466392, "step": 4690 }, { "epoch": 0.32859632071589306, "grad_norm": 3.8094451427459717, "learning_rate": 6.717121541155868e-05, "loss": 0.9309, "num_input_tokens_seen": 75482776, "step": 4691 }, { "epoch": 0.3286663689616223, "grad_norm": 4.426685333251953, "learning_rate": 6.716421716287215e-05, "loss": 0.9497, "num_input_tokens_seen": 75497760, "step": 4692 }, { "epoch": 0.3287364172073516, "grad_norm": 4.299224376678467, "learning_rate": 6.715721891418564e-05, "loss": 1.214, "num_input_tokens_seen": 75513024, "step": 4693 }, { "epoch": 0.3288064654530808, "grad_norm": 3.765477418899536, "learning_rate": 6.715022066549913e-05, "loss": 1.2114, "num_input_tokens_seen": 75529304, "step": 4694 }, { "epoch": 0.32887651369881005, "grad_norm": 3.991591453552246, "learning_rate": 6.714322241681262e-05, "loss": 0.8295, "num_input_tokens_seen": 75545264, "step": 4695 }, { "epoch": 0.3289465619445393, "grad_norm": 3.652726888656616, "learning_rate": 6.71362241681261e-05, "loss": 0.953, "num_input_tokens_seen": 75561648, "step": 4696 }, { "epoch": 0.32901661019026857, "grad_norm": 6.083689212799072, "learning_rate": 6.712922591943958e-05, "loss": 1.0838, "num_input_tokens_seen": 75578032, "step": 4697 }, { "epoch": 0.32908665843599777, "grad_norm": 4.732533931732178, "learning_rate": 6.712222767075307e-05, "loss": 0.9885, "num_input_tokens_seen": 75593944, "step": 4698 }, { "epoch": 0.32915670668172703, "grad_norm": 5.024901866912842, "learning_rate": 6.711522942206655e-05, "loss": 0.887, "num_input_tokens_seen": 75610328, "step": 4699 }, { "epoch": 0.3292267549274563, "grad_norm": 4.663429260253906, "learning_rate": 6.710823117338004e-05, "loss": 1.0955, "num_input_tokens_seen": 75626712, "step": 4700 }, { "epoch": 0.32929680317318555, "grad_norm": 4.396904945373535, "learning_rate": 6.710123292469354e-05, "loss": 1.2419, "num_input_tokens_seen": 75643096, "step": 4701 }, { "epoch": 0.32936685141891475, "grad_norm": 3.7963149547576904, "learning_rate": 6.709423467600701e-05, "loss": 1.1536, "num_input_tokens_seen": 75658616, "step": 4702 }, { "epoch": 0.329436899664644, "grad_norm": 4.154513835906982, "learning_rate": 6.708723642732049e-05, "loss": 1.0529, "num_input_tokens_seen": 75675000, "step": 4703 }, { "epoch": 0.32950694791037327, "grad_norm": 3.8939032554626465, "learning_rate": 6.708023817863398e-05, "loss": 1.115, "num_input_tokens_seen": 75690728, "step": 4704 }, { "epoch": 0.32957699615610253, "grad_norm": 4.7678375244140625, "learning_rate": 6.707323992994746e-05, "loss": 0.9747, "num_input_tokens_seen": 75707080, "step": 4705 }, { "epoch": 0.32964704440183173, "grad_norm": 6.56498384475708, "learning_rate": 6.706624168126094e-05, "loss": 1.058, "num_input_tokens_seen": 75723464, "step": 4706 }, { "epoch": 0.329717092647561, "grad_norm": 6.917506694793701, "learning_rate": 6.705924343257444e-05, "loss": 1.0576, "num_input_tokens_seen": 75739848, "step": 4707 }, { "epoch": 0.32978714089329025, "grad_norm": 3.9431846141815186, "learning_rate": 6.705224518388793e-05, "loss": 0.9693, "num_input_tokens_seen": 75756232, "step": 4708 }, { "epoch": 0.3298571891390195, "grad_norm": 4.838469505310059, "learning_rate": 6.70452469352014e-05, "loss": 1.2367, "num_input_tokens_seen": 75772616, "step": 4709 }, { "epoch": 0.3299272373847488, "grad_norm": 4.0371012687683105, "learning_rate": 6.703824868651488e-05, "loss": 1.0494, "num_input_tokens_seen": 75789000, "step": 4710 }, { "epoch": 0.329997285630478, "grad_norm": 3.491875410079956, "learning_rate": 6.703125043782838e-05, "loss": 0.8919, "num_input_tokens_seen": 75805384, "step": 4711 }, { "epoch": 0.33006733387620724, "grad_norm": 3.5304512977600098, "learning_rate": 6.702425218914186e-05, "loss": 0.8896, "num_input_tokens_seen": 75821104, "step": 4712 }, { "epoch": 0.3301373821219365, "grad_norm": 3.642528533935547, "learning_rate": 6.701725394045535e-05, "loss": 0.9843, "num_input_tokens_seen": 75837424, "step": 4713 }, { "epoch": 0.33020743036766576, "grad_norm": 6.536950588226318, "learning_rate": 6.701025569176883e-05, "loss": 0.9545, "num_input_tokens_seen": 75853808, "step": 4714 }, { "epoch": 0.33027747861339496, "grad_norm": 3.376460075378418, "learning_rate": 6.700325744308232e-05, "loss": 0.9607, "num_input_tokens_seen": 75870192, "step": 4715 }, { "epoch": 0.3303475268591242, "grad_norm": 4.988052845001221, "learning_rate": 6.69962591943958e-05, "loss": 1.1392, "num_input_tokens_seen": 75886576, "step": 4716 }, { "epoch": 0.3304175751048535, "grad_norm": 4.724236965179443, "learning_rate": 6.698926094570929e-05, "loss": 1.0015, "num_input_tokens_seen": 75902960, "step": 4717 }, { "epoch": 0.33048762335058274, "grad_norm": 4.877357006072998, "learning_rate": 6.698226269702278e-05, "loss": 0.9892, "num_input_tokens_seen": 75919344, "step": 4718 }, { "epoch": 0.33055767159631194, "grad_norm": 3.981224775314331, "learning_rate": 6.697526444833625e-05, "loss": 0.9356, "num_input_tokens_seen": 75935728, "step": 4719 }, { "epoch": 0.3306277198420412, "grad_norm": 5.456554889678955, "learning_rate": 6.696826619964974e-05, "loss": 0.8373, "num_input_tokens_seen": 75951304, "step": 4720 }, { "epoch": 0.33069776808777046, "grad_norm": 3.9885287284851074, "learning_rate": 6.696126795096323e-05, "loss": 1.2658, "num_input_tokens_seen": 75967688, "step": 4721 }, { "epoch": 0.3307678163334997, "grad_norm": 3.447371482849121, "learning_rate": 6.695426970227672e-05, "loss": 0.9301, "num_input_tokens_seen": 75984072, "step": 4722 }, { "epoch": 0.3308378645792289, "grad_norm": 4.405709743499756, "learning_rate": 6.694727145359019e-05, "loss": 1.2445, "num_input_tokens_seen": 76000456, "step": 4723 }, { "epoch": 0.3309079128249582, "grad_norm": 3.7595372200012207, "learning_rate": 6.694027320490368e-05, "loss": 1.1851, "num_input_tokens_seen": 76016840, "step": 4724 }, { "epoch": 0.33097796107068744, "grad_norm": 5.460091590881348, "learning_rate": 6.693327495621717e-05, "loss": 0.8514, "num_input_tokens_seen": 76032344, "step": 4725 }, { "epoch": 0.3310480093164167, "grad_norm": 7.111250400543213, "learning_rate": 6.692627670753064e-05, "loss": 1.0086, "num_input_tokens_seen": 76048728, "step": 4726 }, { "epoch": 0.3311180575621459, "grad_norm": 4.799232482910156, "learning_rate": 6.691927845884415e-05, "loss": 0.9995, "num_input_tokens_seen": 76063832, "step": 4727 }, { "epoch": 0.33118810580787517, "grad_norm": 4.045900344848633, "learning_rate": 6.691228021015763e-05, "loss": 1.162, "num_input_tokens_seen": 76079792, "step": 4728 }, { "epoch": 0.3312581540536044, "grad_norm": 3.9942305088043213, "learning_rate": 6.690528196147111e-05, "loss": 1.1444, "num_input_tokens_seen": 76095992, "step": 4729 }, { "epoch": 0.3313282022993337, "grad_norm": 4.173962116241455, "learning_rate": 6.689828371278458e-05, "loss": 0.928, "num_input_tokens_seen": 76111760, "step": 4730 }, { "epoch": 0.3313982505450629, "grad_norm": 8.357215881347656, "learning_rate": 6.689128546409807e-05, "loss": 1.1803, "num_input_tokens_seen": 76127184, "step": 4731 }, { "epoch": 0.33146829879079215, "grad_norm": 3.7359249591827393, "learning_rate": 6.688428721541156e-05, "loss": 1.0539, "num_input_tokens_seen": 76143536, "step": 4732 }, { "epoch": 0.3315383470365214, "grad_norm": 4.159603595733643, "learning_rate": 6.687728896672505e-05, "loss": 1.1565, "num_input_tokens_seen": 76159640, "step": 4733 }, { "epoch": 0.33160839528225067, "grad_norm": 4.893441200256348, "learning_rate": 6.687029071803854e-05, "loss": 1.191, "num_input_tokens_seen": 76176024, "step": 4734 }, { "epoch": 0.33167844352797987, "grad_norm": 4.4292426109313965, "learning_rate": 6.686329246935203e-05, "loss": 1.1852, "num_input_tokens_seen": 76192408, "step": 4735 }, { "epoch": 0.33174849177370913, "grad_norm": 3.612821102142334, "learning_rate": 6.68562942206655e-05, "loss": 1.0195, "num_input_tokens_seen": 76208792, "step": 4736 }, { "epoch": 0.3318185400194384, "grad_norm": 3.6046557426452637, "learning_rate": 6.684929597197898e-05, "loss": 1.1402, "num_input_tokens_seen": 76225176, "step": 4737 }, { "epoch": 0.33188858826516765, "grad_norm": 4.637216567993164, "learning_rate": 6.684229772329248e-05, "loss": 0.8202, "num_input_tokens_seen": 76241560, "step": 4738 }, { "epoch": 0.33195863651089685, "grad_norm": 4.83438777923584, "learning_rate": 6.683529947460595e-05, "loss": 0.9085, "num_input_tokens_seen": 76257264, "step": 4739 }, { "epoch": 0.3320286847566261, "grad_norm": 3.903982400894165, "learning_rate": 6.682830122591944e-05, "loss": 1.2306, "num_input_tokens_seen": 76273608, "step": 4740 }, { "epoch": 0.3320987330023554, "grad_norm": 6.24022102355957, "learning_rate": 6.682130297723293e-05, "loss": 0.9706, "num_input_tokens_seen": 76289840, "step": 4741 }, { "epoch": 0.33216878124808463, "grad_norm": 5.286207675933838, "learning_rate": 6.681430472854642e-05, "loss": 1.0803, "num_input_tokens_seen": 76306088, "step": 4742 }, { "epoch": 0.3322388294938139, "grad_norm": 5.145969867706299, "learning_rate": 6.68073064798599e-05, "loss": 1.2303, "num_input_tokens_seen": 76322152, "step": 4743 }, { "epoch": 0.3323088777395431, "grad_norm": 3.6806249618530273, "learning_rate": 6.680030823117338e-05, "loss": 1.0168, "num_input_tokens_seen": 76338424, "step": 4744 }, { "epoch": 0.33237892598527236, "grad_norm": 3.743912696838379, "learning_rate": 6.679330998248687e-05, "loss": 0.9507, "num_input_tokens_seen": 76354808, "step": 4745 }, { "epoch": 0.3324489742310016, "grad_norm": 5.072415828704834, "learning_rate": 6.678631173380035e-05, "loss": 1.014, "num_input_tokens_seen": 76369696, "step": 4746 }, { "epoch": 0.3325190224767309, "grad_norm": 3.366450548171997, "learning_rate": 6.677931348511384e-05, "loss": 0.9201, "num_input_tokens_seen": 76385560, "step": 4747 }, { "epoch": 0.3325890707224601, "grad_norm": 3.8318989276885986, "learning_rate": 6.677231523642732e-05, "loss": 1.0973, "num_input_tokens_seen": 76401168, "step": 4748 }, { "epoch": 0.33265911896818934, "grad_norm": 3.9670164585113525, "learning_rate": 6.676531698774081e-05, "loss": 0.9238, "num_input_tokens_seen": 76417552, "step": 4749 }, { "epoch": 0.3327291672139186, "grad_norm": 4.344585418701172, "learning_rate": 6.675831873905429e-05, "loss": 1.0099, "num_input_tokens_seen": 76433936, "step": 4750 }, { "epoch": 0.33279921545964786, "grad_norm": 7.547675132751465, "learning_rate": 6.675132049036778e-05, "loss": 1.1412, "num_input_tokens_seen": 76450320, "step": 4751 }, { "epoch": 0.33286926370537706, "grad_norm": 7.854677677154541, "learning_rate": 6.674432224168127e-05, "loss": 0.8778, "num_input_tokens_seen": 76465696, "step": 4752 }, { "epoch": 0.3329393119511063, "grad_norm": 4.030972480773926, "learning_rate": 6.673732399299475e-05, "loss": 1.1389, "num_input_tokens_seen": 76482080, "step": 4753 }, { "epoch": 0.3330093601968356, "grad_norm": 5.990024089813232, "learning_rate": 6.673032574430824e-05, "loss": 0.9469, "num_input_tokens_seen": 76498464, "step": 4754 }, { "epoch": 0.33307940844256484, "grad_norm": 3.8437137603759766, "learning_rate": 6.672332749562173e-05, "loss": 1.009, "num_input_tokens_seen": 76514848, "step": 4755 }, { "epoch": 0.33314945668829404, "grad_norm": 3.883882761001587, "learning_rate": 6.67163292469352e-05, "loss": 1.0267, "num_input_tokens_seen": 76531232, "step": 4756 }, { "epoch": 0.3332195049340233, "grad_norm": 4.205630779266357, "learning_rate": 6.670933099824868e-05, "loss": 1.0847, "num_input_tokens_seen": 76547616, "step": 4757 }, { "epoch": 0.33328955317975256, "grad_norm": 6.173430442810059, "learning_rate": 6.670233274956217e-05, "loss": 1.2014, "num_input_tokens_seen": 76564000, "step": 4758 }, { "epoch": 0.3333596014254818, "grad_norm": 3.464181661605835, "learning_rate": 6.669533450087566e-05, "loss": 0.8751, "num_input_tokens_seen": 76579904, "step": 4759 }, { "epoch": 0.333429649671211, "grad_norm": 3.3506994247436523, "learning_rate": 6.668833625218915e-05, "loss": 0.8281, "num_input_tokens_seen": 76596288, "step": 4760 }, { "epoch": 0.3334996979169403, "grad_norm": 7.188508987426758, "learning_rate": 6.668133800350264e-05, "loss": 1.1058, "num_input_tokens_seen": 76611632, "step": 4761 }, { "epoch": 0.33356974616266954, "grad_norm": 3.916689872741699, "learning_rate": 6.667433975481612e-05, "loss": 1.0815, "num_input_tokens_seen": 76626840, "step": 4762 }, { "epoch": 0.3336397944083988, "grad_norm": 3.4827966690063477, "learning_rate": 6.66673415061296e-05, "loss": 0.9103, "num_input_tokens_seen": 76643024, "step": 4763 }, { "epoch": 0.333709842654128, "grad_norm": 4.479428768157959, "learning_rate": 6.666034325744307e-05, "loss": 0.9238, "num_input_tokens_seen": 76659408, "step": 4764 }, { "epoch": 0.33377989089985727, "grad_norm": 6.008899211883545, "learning_rate": 6.665334500875658e-05, "loss": 1.2375, "num_input_tokens_seen": 76675272, "step": 4765 }, { "epoch": 0.3338499391455865, "grad_norm": 4.10992431640625, "learning_rate": 6.664634676007005e-05, "loss": 1.0539, "num_input_tokens_seen": 76691000, "step": 4766 }, { "epoch": 0.3339199873913158, "grad_norm": 3.953507423400879, "learning_rate": 6.663934851138354e-05, "loss": 1.1051, "num_input_tokens_seen": 76707024, "step": 4767 }, { "epoch": 0.333990035637045, "grad_norm": 4.237090587615967, "learning_rate": 6.663235026269703e-05, "loss": 1.1683, "num_input_tokens_seen": 76723408, "step": 4768 }, { "epoch": 0.33406008388277425, "grad_norm": 4.417295932769775, "learning_rate": 6.662535201401052e-05, "loss": 0.923, "num_input_tokens_seen": 76739296, "step": 4769 }, { "epoch": 0.3341301321285035, "grad_norm": 3.664970874786377, "learning_rate": 6.661835376532399e-05, "loss": 0.9556, "num_input_tokens_seen": 76755432, "step": 4770 }, { "epoch": 0.33420018037423277, "grad_norm": 3.702932834625244, "learning_rate": 6.661135551663748e-05, "loss": 0.9457, "num_input_tokens_seen": 76770688, "step": 4771 }, { "epoch": 0.334270228619962, "grad_norm": 3.741722822189331, "learning_rate": 6.660435726795097e-05, "loss": 1.0923, "num_input_tokens_seen": 76787072, "step": 4772 }, { "epoch": 0.33434027686569123, "grad_norm": 3.9605424404144287, "learning_rate": 6.659735901926446e-05, "loss": 1.0823, "num_input_tokens_seen": 76803456, "step": 4773 }, { "epoch": 0.3344103251114205, "grad_norm": 3.9401822090148926, "learning_rate": 6.659036077057793e-05, "loss": 0.996, "num_input_tokens_seen": 76819840, "step": 4774 }, { "epoch": 0.33448037335714975, "grad_norm": 3.8762905597686768, "learning_rate": 6.658336252189142e-05, "loss": 0.9796, "num_input_tokens_seen": 76836224, "step": 4775 }, { "epoch": 0.33455042160287896, "grad_norm": 4.117221832275391, "learning_rate": 6.657636427320491e-05, "loss": 1.2631, "num_input_tokens_seen": 76852608, "step": 4776 }, { "epoch": 0.3346204698486082, "grad_norm": 3.814997434616089, "learning_rate": 6.656936602451839e-05, "loss": 1.0891, "num_input_tokens_seen": 76868400, "step": 4777 }, { "epoch": 0.3346905180943375, "grad_norm": 3.6070499420166016, "learning_rate": 6.656236777583187e-05, "loss": 0.8537, "num_input_tokens_seen": 76884784, "step": 4778 }, { "epoch": 0.33476056634006673, "grad_norm": 6.291281700134277, "learning_rate": 6.655536952714536e-05, "loss": 1.2195, "num_input_tokens_seen": 76901168, "step": 4779 }, { "epoch": 0.334830614585796, "grad_norm": 7.043301105499268, "learning_rate": 6.654837127845885e-05, "loss": 1.1015, "num_input_tokens_seen": 76917552, "step": 4780 }, { "epoch": 0.3349006628315252, "grad_norm": 3.6702778339385986, "learning_rate": 6.654137302977234e-05, "loss": 1.0832, "num_input_tokens_seen": 76933936, "step": 4781 }, { "epoch": 0.33497071107725446, "grad_norm": 4.228512287139893, "learning_rate": 6.653437478108583e-05, "loss": 0.8781, "num_input_tokens_seen": 76950320, "step": 4782 }, { "epoch": 0.3350407593229837, "grad_norm": 3.9304075241088867, "learning_rate": 6.65273765323993e-05, "loss": 1.1066, "num_input_tokens_seen": 76966704, "step": 4783 }, { "epoch": 0.335110807568713, "grad_norm": 3.608708620071411, "learning_rate": 6.652037828371278e-05, "loss": 1.0409, "num_input_tokens_seen": 76983016, "step": 4784 }, { "epoch": 0.3351808558144422, "grad_norm": 4.402626037597656, "learning_rate": 6.651338003502627e-05, "loss": 1.1832, "num_input_tokens_seen": 76999344, "step": 4785 }, { "epoch": 0.33525090406017144, "grad_norm": 4.109679222106934, "learning_rate": 6.650638178633976e-05, "loss": 1.0003, "num_input_tokens_seen": 77014960, "step": 4786 }, { "epoch": 0.3353209523059007, "grad_norm": 3.893702507019043, "learning_rate": 6.649938353765324e-05, "loss": 1.1101, "num_input_tokens_seen": 77031344, "step": 4787 }, { "epoch": 0.33539100055162996, "grad_norm": 4.326907157897949, "learning_rate": 6.649238528896673e-05, "loss": 1.0554, "num_input_tokens_seen": 77047264, "step": 4788 }, { "epoch": 0.33546104879735916, "grad_norm": 4.946060657501221, "learning_rate": 6.648538704028022e-05, "loss": 1.0413, "num_input_tokens_seen": 77063648, "step": 4789 }, { "epoch": 0.3355310970430884, "grad_norm": 3.5379018783569336, "learning_rate": 6.64783887915937e-05, "loss": 1.0981, "num_input_tokens_seen": 77080032, "step": 4790 }, { "epoch": 0.3356011452888177, "grad_norm": 4.117929935455322, "learning_rate": 6.647139054290717e-05, "loss": 1.0624, "num_input_tokens_seen": 77096416, "step": 4791 }, { "epoch": 0.33567119353454694, "grad_norm": 4.293130397796631, "learning_rate": 6.646439229422067e-05, "loss": 1.1938, "num_input_tokens_seen": 77112800, "step": 4792 }, { "epoch": 0.33574124178027615, "grad_norm": 3.8246893882751465, "learning_rate": 6.645739404553416e-05, "loss": 1.0944, "num_input_tokens_seen": 77128272, "step": 4793 }, { "epoch": 0.3358112900260054, "grad_norm": 4.095324993133545, "learning_rate": 6.645039579684764e-05, "loss": 1.0024, "num_input_tokens_seen": 77144008, "step": 4794 }, { "epoch": 0.33588133827173466, "grad_norm": 3.7015397548675537, "learning_rate": 6.644339754816113e-05, "loss": 1.1318, "num_input_tokens_seen": 77160392, "step": 4795 }, { "epoch": 0.3359513865174639, "grad_norm": 3.7702248096466064, "learning_rate": 6.643639929947461e-05, "loss": 1.1815, "num_input_tokens_seen": 77176776, "step": 4796 }, { "epoch": 0.3360214347631931, "grad_norm": 6.468194961547852, "learning_rate": 6.642940105078809e-05, "loss": 1.1144, "num_input_tokens_seen": 77192000, "step": 4797 }, { "epoch": 0.3360914830089224, "grad_norm": 5.211976528167725, "learning_rate": 6.642240280210158e-05, "loss": 1.0252, "num_input_tokens_seen": 77208040, "step": 4798 }, { "epoch": 0.33616153125465165, "grad_norm": 4.3227763175964355, "learning_rate": 6.641540455341507e-05, "loss": 1.1343, "num_input_tokens_seen": 77224016, "step": 4799 }, { "epoch": 0.3362315795003809, "grad_norm": 3.6128039360046387, "learning_rate": 6.640840630472856e-05, "loss": 1.0222, "num_input_tokens_seen": 77240400, "step": 4800 }, { "epoch": 0.3362315795003809, "eval_loss": 1.1314613819122314, "eval_runtime": 0.2053, "eval_samples_per_second": 4.87, "eval_steps_per_second": 4.87, "num_input_tokens_seen": 77240400, "step": 4800 }, { "epoch": 0.3363016277461101, "grad_norm": 3.851407766342163, "learning_rate": 6.640140805604203e-05, "loss": 1.0153, "num_input_tokens_seen": 77255896, "step": 4801 }, { "epoch": 0.33637167599183937, "grad_norm": 3.8287763595581055, "learning_rate": 6.639440980735552e-05, "loss": 1.02, "num_input_tokens_seen": 77272280, "step": 4802 }, { "epoch": 0.33644172423756863, "grad_norm": 4.373470306396484, "learning_rate": 6.638741155866901e-05, "loss": 1.0321, "num_input_tokens_seen": 77288664, "step": 4803 }, { "epoch": 0.3365117724832979, "grad_norm": 5.384084224700928, "learning_rate": 6.638041330998248e-05, "loss": 1.08, "num_input_tokens_seen": 77304544, "step": 4804 }, { "epoch": 0.3365818207290271, "grad_norm": 4.742502212524414, "learning_rate": 6.637341506129597e-05, "loss": 1.0856, "num_input_tokens_seen": 77320928, "step": 4805 }, { "epoch": 0.33665186897475635, "grad_norm": 3.5294950008392334, "learning_rate": 6.636641681260946e-05, "loss": 0.827, "num_input_tokens_seen": 77337072, "step": 4806 }, { "epoch": 0.3367219172204856, "grad_norm": 4.925806999206543, "learning_rate": 6.635941856392295e-05, "loss": 1.1351, "num_input_tokens_seen": 77352312, "step": 4807 }, { "epoch": 0.33679196546621487, "grad_norm": 4.373791694641113, "learning_rate": 6.635242031523644e-05, "loss": 1.055, "num_input_tokens_seen": 77368696, "step": 4808 }, { "epoch": 0.3368620137119441, "grad_norm": 4.921911239624023, "learning_rate": 6.634542206654993e-05, "loss": 1.0054, "num_input_tokens_seen": 77384976, "step": 4809 }, { "epoch": 0.33693206195767333, "grad_norm": 6.203757286071777, "learning_rate": 6.63384238178634e-05, "loss": 1.1801, "num_input_tokens_seen": 77401360, "step": 4810 }, { "epoch": 0.3370021102034026, "grad_norm": 3.675086498260498, "learning_rate": 6.633142556917688e-05, "loss": 1.1148, "num_input_tokens_seen": 77417744, "step": 4811 }, { "epoch": 0.33707215844913185, "grad_norm": 5.169121742248535, "learning_rate": 6.632442732049036e-05, "loss": 0.9541, "num_input_tokens_seen": 77434128, "step": 4812 }, { "epoch": 0.3371422066948611, "grad_norm": 4.036499977111816, "learning_rate": 6.631742907180387e-05, "loss": 0.9914, "num_input_tokens_seen": 77449856, "step": 4813 }, { "epoch": 0.3372122549405903, "grad_norm": 4.040637016296387, "learning_rate": 6.631043082311734e-05, "loss": 1.0704, "num_input_tokens_seen": 77465536, "step": 4814 }, { "epoch": 0.3372823031863196, "grad_norm": 5.76871395111084, "learning_rate": 6.630343257443083e-05, "loss": 1.0127, "num_input_tokens_seen": 77481920, "step": 4815 }, { "epoch": 0.33735235143204884, "grad_norm": 5.222348690032959, "learning_rate": 6.629643432574432e-05, "loss": 0.9411, "num_input_tokens_seen": 77497464, "step": 4816 }, { "epoch": 0.3374223996777781, "grad_norm": 4.099587440490723, "learning_rate": 6.62894360770578e-05, "loss": 1.011, "num_input_tokens_seen": 77513848, "step": 4817 }, { "epoch": 0.3374924479235073, "grad_norm": 4.034639835357666, "learning_rate": 6.628243782837127e-05, "loss": 1.0626, "num_input_tokens_seen": 77530000, "step": 4818 }, { "epoch": 0.33756249616923656, "grad_norm": 5.280242443084717, "learning_rate": 6.627543957968477e-05, "loss": 1.1305, "num_input_tokens_seen": 77546384, "step": 4819 }, { "epoch": 0.3376325444149658, "grad_norm": 4.851918697357178, "learning_rate": 6.626844133099826e-05, "loss": 1.0557, "num_input_tokens_seen": 77561000, "step": 4820 }, { "epoch": 0.3377025926606951, "grad_norm": 3.957601308822632, "learning_rate": 6.626144308231173e-05, "loss": 0.9625, "num_input_tokens_seen": 77576512, "step": 4821 }, { "epoch": 0.3377726409064243, "grad_norm": 3.9369540214538574, "learning_rate": 6.625444483362522e-05, "loss": 1.0611, "num_input_tokens_seen": 77592896, "step": 4822 }, { "epoch": 0.33784268915215354, "grad_norm": 4.397778511047363, "learning_rate": 6.624744658493871e-05, "loss": 1.0173, "num_input_tokens_seen": 77609280, "step": 4823 }, { "epoch": 0.3379127373978828, "grad_norm": 6.919220447540283, "learning_rate": 6.624044833625219e-05, "loss": 0.9992, "num_input_tokens_seen": 77625664, "step": 4824 }, { "epoch": 0.33798278564361206, "grad_norm": 5.501309871673584, "learning_rate": 6.623345008756568e-05, "loss": 1.0321, "num_input_tokens_seen": 77642048, "step": 4825 }, { "epoch": 0.33805283388934126, "grad_norm": 4.240433216094971, "learning_rate": 6.622645183887916e-05, "loss": 0.8771, "num_input_tokens_seen": 77658432, "step": 4826 }, { "epoch": 0.3381228821350705, "grad_norm": 3.9563584327697754, "learning_rate": 6.621945359019265e-05, "loss": 1.1921, "num_input_tokens_seen": 77674816, "step": 4827 }, { "epoch": 0.3381929303807998, "grad_norm": 6.861433982849121, "learning_rate": 6.621245534150613e-05, "loss": 0.9414, "num_input_tokens_seen": 77689712, "step": 4828 }, { "epoch": 0.33826297862652904, "grad_norm": 3.951972246170044, "learning_rate": 6.620545709281962e-05, "loss": 1.2363, "num_input_tokens_seen": 77706096, "step": 4829 }, { "epoch": 0.33833302687225825, "grad_norm": 4.419849395751953, "learning_rate": 6.61984588441331e-05, "loss": 1.1375, "num_input_tokens_seen": 77721168, "step": 4830 }, { "epoch": 0.3384030751179875, "grad_norm": 5.075031280517578, "learning_rate": 6.619146059544658e-05, "loss": 1.1363, "num_input_tokens_seen": 77737552, "step": 4831 }, { "epoch": 0.33847312336371677, "grad_norm": 4.216047763824463, "learning_rate": 6.618446234676007e-05, "loss": 1.0683, "num_input_tokens_seen": 77753936, "step": 4832 }, { "epoch": 0.338543171609446, "grad_norm": 4.175511360168457, "learning_rate": 6.617746409807357e-05, "loss": 1.1294, "num_input_tokens_seen": 77769848, "step": 4833 }, { "epoch": 0.33861321985517523, "grad_norm": 3.894831418991089, "learning_rate": 6.617046584938705e-05, "loss": 0.9527, "num_input_tokens_seen": 77785792, "step": 4834 }, { "epoch": 0.3386832681009045, "grad_norm": 4.06626033782959, "learning_rate": 6.616346760070053e-05, "loss": 1.039, "num_input_tokens_seen": 77801728, "step": 4835 }, { "epoch": 0.33875331634663375, "grad_norm": 6.5680341720581055, "learning_rate": 6.615646935201402e-05, "loss": 1.2627, "num_input_tokens_seen": 77818112, "step": 4836 }, { "epoch": 0.338823364592363, "grad_norm": 4.967332363128662, "learning_rate": 6.61494711033275e-05, "loss": 1.1455, "num_input_tokens_seen": 77833464, "step": 4837 }, { "epoch": 0.3388934128380922, "grad_norm": 4.244988918304443, "learning_rate": 6.614247285464097e-05, "loss": 1.0809, "num_input_tokens_seen": 77849848, "step": 4838 }, { "epoch": 0.33896346108382147, "grad_norm": 4.361011028289795, "learning_rate": 6.613547460595447e-05, "loss": 1.0217, "num_input_tokens_seen": 77866232, "step": 4839 }, { "epoch": 0.33903350932955073, "grad_norm": 6.348353385925293, "learning_rate": 6.612847635726796e-05, "loss": 1.1202, "num_input_tokens_seen": 77882616, "step": 4840 }, { "epoch": 0.33910355757528, "grad_norm": 3.8235714435577393, "learning_rate": 6.612147810858144e-05, "loss": 0.9018, "num_input_tokens_seen": 77899000, "step": 4841 }, { "epoch": 0.3391736058210092, "grad_norm": 4.069207191467285, "learning_rate": 6.611447985989493e-05, "loss": 1.1303, "num_input_tokens_seen": 77915384, "step": 4842 }, { "epoch": 0.33924365406673845, "grad_norm": 3.8036270141601562, "learning_rate": 6.610748161120842e-05, "loss": 1.1935, "num_input_tokens_seen": 77931768, "step": 4843 }, { "epoch": 0.3393137023124677, "grad_norm": 5.776700019836426, "learning_rate": 6.610048336252189e-05, "loss": 1.0031, "num_input_tokens_seen": 77948152, "step": 4844 }, { "epoch": 0.339383750558197, "grad_norm": 5.484714508056641, "learning_rate": 6.609348511383538e-05, "loss": 1.2233, "num_input_tokens_seen": 77964536, "step": 4845 }, { "epoch": 0.3394537988039262, "grad_norm": 4.595640659332275, "learning_rate": 6.608648686514887e-05, "loss": 1.0854, "num_input_tokens_seen": 77980648, "step": 4846 }, { "epoch": 0.33952384704965544, "grad_norm": 5.0377197265625, "learning_rate": 6.607948861646236e-05, "loss": 1.0513, "num_input_tokens_seen": 77995624, "step": 4847 }, { "epoch": 0.3395938952953847, "grad_norm": 3.796713352203369, "learning_rate": 6.607249036777583e-05, "loss": 0.8928, "num_input_tokens_seen": 78012008, "step": 4848 }, { "epoch": 0.33966394354111396, "grad_norm": 5.138030052185059, "learning_rate": 6.606549211908932e-05, "loss": 0.9565, "num_input_tokens_seen": 78027960, "step": 4849 }, { "epoch": 0.3397339917868432, "grad_norm": 5.852467060089111, "learning_rate": 6.605849387040281e-05, "loss": 1.1297, "num_input_tokens_seen": 78044144, "step": 4850 }, { "epoch": 0.3398040400325724, "grad_norm": 3.5677835941314697, "learning_rate": 6.605149562171628e-05, "loss": 1.0115, "num_input_tokens_seen": 78059096, "step": 4851 }, { "epoch": 0.3398740882783017, "grad_norm": 4.033452987670898, "learning_rate": 6.604449737302977e-05, "loss": 1.1311, "num_input_tokens_seen": 78075480, "step": 4852 }, { "epoch": 0.33994413652403094, "grad_norm": 5.06736421585083, "learning_rate": 6.603749912434327e-05, "loss": 1.0456, "num_input_tokens_seen": 78091864, "step": 4853 }, { "epoch": 0.3400141847697602, "grad_norm": 4.851357460021973, "learning_rate": 6.603050087565675e-05, "loss": 0.9985, "num_input_tokens_seen": 78108248, "step": 4854 }, { "epoch": 0.3400842330154894, "grad_norm": 4.659403324127197, "learning_rate": 6.602350262697022e-05, "loss": 1.0618, "num_input_tokens_seen": 78123720, "step": 4855 }, { "epoch": 0.34015428126121866, "grad_norm": 4.0248870849609375, "learning_rate": 6.601650437828371e-05, "loss": 1.0192, "num_input_tokens_seen": 78140104, "step": 4856 }, { "epoch": 0.3402243295069479, "grad_norm": 3.615807056427002, "learning_rate": 6.60095061295972e-05, "loss": 0.9875, "num_input_tokens_seen": 78155768, "step": 4857 }, { "epoch": 0.3402943777526772, "grad_norm": 4.032024383544922, "learning_rate": 6.600250788091068e-05, "loss": 0.8621, "num_input_tokens_seen": 78171688, "step": 4858 }, { "epoch": 0.3403644259984064, "grad_norm": 4.213406085968018, "learning_rate": 6.599550963222418e-05, "loss": 1.1186, "num_input_tokens_seen": 78188072, "step": 4859 }, { "epoch": 0.34043447424413564, "grad_norm": 4.343294620513916, "learning_rate": 6.598851138353767e-05, "loss": 0.9555, "num_input_tokens_seen": 78203328, "step": 4860 }, { "epoch": 0.3405045224898649, "grad_norm": 5.112723350524902, "learning_rate": 6.598151313485114e-05, "loss": 0.8201, "num_input_tokens_seen": 78218480, "step": 4861 }, { "epoch": 0.34057457073559416, "grad_norm": 5.0619215965271, "learning_rate": 6.597451488616463e-05, "loss": 1.1147, "num_input_tokens_seen": 78234864, "step": 4862 }, { "epoch": 0.34064461898132337, "grad_norm": 4.161584377288818, "learning_rate": 6.596751663747812e-05, "loss": 1.3292, "num_input_tokens_seen": 78250664, "step": 4863 }, { "epoch": 0.3407146672270526, "grad_norm": 4.402634143829346, "learning_rate": 6.59605183887916e-05, "loss": 1.2664, "num_input_tokens_seen": 78266976, "step": 4864 }, { "epoch": 0.3407847154727819, "grad_norm": 4.01839017868042, "learning_rate": 6.595352014010508e-05, "loss": 1.1515, "num_input_tokens_seen": 78283360, "step": 4865 }, { "epoch": 0.34085476371851114, "grad_norm": 3.6157965660095215, "learning_rate": 6.594652189141857e-05, "loss": 0.8962, "num_input_tokens_seen": 78299744, "step": 4866 }, { "epoch": 0.34092481196424035, "grad_norm": 4.221523761749268, "learning_rate": 6.593952364273206e-05, "loss": 1.3689, "num_input_tokens_seen": 78314944, "step": 4867 }, { "epoch": 0.3409948602099696, "grad_norm": 5.253129482269287, "learning_rate": 6.593252539404554e-05, "loss": 1.0223, "num_input_tokens_seen": 78331168, "step": 4868 }, { "epoch": 0.34106490845569887, "grad_norm": 4.839991569519043, "learning_rate": 6.592552714535902e-05, "loss": 1.1622, "num_input_tokens_seen": 78347200, "step": 4869 }, { "epoch": 0.3411349567014281, "grad_norm": 5.994297504425049, "learning_rate": 6.591852889667251e-05, "loss": 1.0511, "num_input_tokens_seen": 78363584, "step": 4870 }, { "epoch": 0.34120500494715733, "grad_norm": 4.886160850524902, "learning_rate": 6.591153064798599e-05, "loss": 1.0025, "num_input_tokens_seen": 78379968, "step": 4871 }, { "epoch": 0.3412750531928866, "grad_norm": 8.658349990844727, "learning_rate": 6.590453239929948e-05, "loss": 1.0145, "num_input_tokens_seen": 78395368, "step": 4872 }, { "epoch": 0.34134510143861585, "grad_norm": 5.1440935134887695, "learning_rate": 6.589753415061298e-05, "loss": 0.9584, "num_input_tokens_seen": 78411752, "step": 4873 }, { "epoch": 0.3414151496843451, "grad_norm": 4.83282995223999, "learning_rate": 6.589053590192645e-05, "loss": 1.1825, "num_input_tokens_seen": 78428128, "step": 4874 }, { "epoch": 0.3414851979300743, "grad_norm": 3.603290557861328, "learning_rate": 6.588353765323993e-05, "loss": 1.081, "num_input_tokens_seen": 78444512, "step": 4875 }, { "epoch": 0.3415552461758036, "grad_norm": 3.8035361766815186, "learning_rate": 6.587653940455342e-05, "loss": 1.1571, "num_input_tokens_seen": 78460896, "step": 4876 }, { "epoch": 0.34162529442153283, "grad_norm": 4.02992582321167, "learning_rate": 6.58695411558669e-05, "loss": 1.0974, "num_input_tokens_seen": 78477280, "step": 4877 }, { "epoch": 0.3416953426672621, "grad_norm": 4.898126125335693, "learning_rate": 6.586254290718038e-05, "loss": 1.0632, "num_input_tokens_seen": 78493664, "step": 4878 }, { "epoch": 0.3417653909129913, "grad_norm": 4.779463768005371, "learning_rate": 6.585554465849388e-05, "loss": 1.0473, "num_input_tokens_seen": 78509280, "step": 4879 }, { "epoch": 0.34183543915872056, "grad_norm": 3.7280569076538086, "learning_rate": 6.584854640980737e-05, "loss": 1.1184, "num_input_tokens_seen": 78525664, "step": 4880 }, { "epoch": 0.3419054874044498, "grad_norm": 4.691235542297363, "learning_rate": 6.584154816112085e-05, "loss": 0.9956, "num_input_tokens_seen": 78542048, "step": 4881 }, { "epoch": 0.3419755356501791, "grad_norm": 4.188792705535889, "learning_rate": 6.583454991243432e-05, "loss": 1.1065, "num_input_tokens_seen": 78558432, "step": 4882 }, { "epoch": 0.34204558389590833, "grad_norm": 3.7049522399902344, "learning_rate": 6.582755166374781e-05, "loss": 0.9286, "num_input_tokens_seen": 78574816, "step": 4883 }, { "epoch": 0.34211563214163754, "grad_norm": 5.808310508728027, "learning_rate": 6.58205534150613e-05, "loss": 1.0674, "num_input_tokens_seen": 78590992, "step": 4884 }, { "epoch": 0.3421856803873668, "grad_norm": 3.877638339996338, "learning_rate": 6.581355516637479e-05, "loss": 0.9908, "num_input_tokens_seen": 78607368, "step": 4885 }, { "epoch": 0.34225572863309606, "grad_norm": 3.7855000495910645, "learning_rate": 6.580655691768828e-05, "loss": 1.0697, "num_input_tokens_seen": 78622712, "step": 4886 }, { "epoch": 0.3423257768788253, "grad_norm": 3.9921584129333496, "learning_rate": 6.579955866900176e-05, "loss": 1.0196, "num_input_tokens_seen": 78638840, "step": 4887 }, { "epoch": 0.3423958251245545, "grad_norm": 4.037683486938477, "learning_rate": 6.579256042031524e-05, "loss": 0.9606, "num_input_tokens_seen": 78655224, "step": 4888 }, { "epoch": 0.3424658733702838, "grad_norm": 4.109930515289307, "learning_rate": 6.578556217162873e-05, "loss": 1.1189, "num_input_tokens_seen": 78670984, "step": 4889 }, { "epoch": 0.34253592161601304, "grad_norm": 5.201082229614258, "learning_rate": 6.577856392294222e-05, "loss": 1.0844, "num_input_tokens_seen": 78686856, "step": 4890 }, { "epoch": 0.3426059698617423, "grad_norm": 5.922754764556885, "learning_rate": 6.577156567425569e-05, "loss": 1.2428, "num_input_tokens_seen": 78702688, "step": 4891 }, { "epoch": 0.3426760181074715, "grad_norm": 4.052786350250244, "learning_rate": 6.576456742556918e-05, "loss": 1.0765, "num_input_tokens_seen": 78719072, "step": 4892 }, { "epoch": 0.34274606635320076, "grad_norm": 4.0263671875, "learning_rate": 6.575756917688267e-05, "loss": 1.2076, "num_input_tokens_seen": 78735456, "step": 4893 }, { "epoch": 0.34281611459893, "grad_norm": 3.773024082183838, "learning_rate": 6.575057092819616e-05, "loss": 1.1275, "num_input_tokens_seen": 78751424, "step": 4894 }, { "epoch": 0.3428861628446593, "grad_norm": 3.770413398742676, "learning_rate": 6.574357267950963e-05, "loss": 1.1331, "num_input_tokens_seen": 78767808, "step": 4895 }, { "epoch": 0.3429562110903885, "grad_norm": 6.26648473739624, "learning_rate": 6.573657443082312e-05, "loss": 1.1432, "num_input_tokens_seen": 78783448, "step": 4896 }, { "epoch": 0.34302625933611774, "grad_norm": 4.071943283081055, "learning_rate": 6.572957618213661e-05, "loss": 0.9008, "num_input_tokens_seen": 78798976, "step": 4897 }, { "epoch": 0.343096307581847, "grad_norm": 7.654726505279541, "learning_rate": 6.572257793345008e-05, "loss": 1.0902, "num_input_tokens_seen": 78814664, "step": 4898 }, { "epoch": 0.34316635582757626, "grad_norm": 5.928562641143799, "learning_rate": 6.571557968476357e-05, "loss": 1.0462, "num_input_tokens_seen": 78830792, "step": 4899 }, { "epoch": 0.34323640407330547, "grad_norm": 3.8699424266815186, "learning_rate": 6.570858143607708e-05, "loss": 1.0568, "num_input_tokens_seen": 78847176, "step": 4900 }, { "epoch": 0.3433064523190347, "grad_norm": 4.177735328674316, "learning_rate": 6.570158318739055e-05, "loss": 1.1381, "num_input_tokens_seen": 78863496, "step": 4901 }, { "epoch": 0.343376500564764, "grad_norm": 3.5755650997161865, "learning_rate": 6.569458493870403e-05, "loss": 1.0307, "num_input_tokens_seen": 78879688, "step": 4902 }, { "epoch": 0.34344654881049325, "grad_norm": 5.799609184265137, "learning_rate": 6.568758669001751e-05, "loss": 0.9005, "num_input_tokens_seen": 78894744, "step": 4903 }, { "epoch": 0.34351659705622245, "grad_norm": 3.7705209255218506, "learning_rate": 6.5680588441331e-05, "loss": 1.0557, "num_input_tokens_seen": 78911112, "step": 4904 }, { "epoch": 0.3435866453019517, "grad_norm": 4.713012218475342, "learning_rate": 6.567359019264449e-05, "loss": 1.1005, "num_input_tokens_seen": 78927496, "step": 4905 }, { "epoch": 0.34365669354768097, "grad_norm": 3.8360157012939453, "learning_rate": 6.566659194395798e-05, "loss": 1.1281, "num_input_tokens_seen": 78942712, "step": 4906 }, { "epoch": 0.34372674179341023, "grad_norm": 3.6071383953094482, "learning_rate": 6.565959369527147e-05, "loss": 0.974, "num_input_tokens_seen": 78959016, "step": 4907 }, { "epoch": 0.34379679003913943, "grad_norm": 4.876083850860596, "learning_rate": 6.565259544658494e-05, "loss": 1.1583, "num_input_tokens_seen": 78975400, "step": 4908 }, { "epoch": 0.3438668382848687, "grad_norm": 4.011876583099365, "learning_rate": 6.564559719789842e-05, "loss": 1.0749, "num_input_tokens_seen": 78991784, "step": 4909 }, { "epoch": 0.34393688653059795, "grad_norm": 3.74336576461792, "learning_rate": 6.563859894921191e-05, "loss": 1.0358, "num_input_tokens_seen": 79008168, "step": 4910 }, { "epoch": 0.3440069347763272, "grad_norm": 4.092207908630371, "learning_rate": 6.56316007005254e-05, "loss": 0.9901, "num_input_tokens_seen": 79024200, "step": 4911 }, { "epoch": 0.3440769830220564, "grad_norm": 3.771979331970215, "learning_rate": 6.562460245183888e-05, "loss": 0.9599, "num_input_tokens_seen": 79040584, "step": 4912 }, { "epoch": 0.3441470312677857, "grad_norm": 4.791725158691406, "learning_rate": 6.561760420315237e-05, "loss": 1.0563, "num_input_tokens_seen": 79056552, "step": 4913 }, { "epoch": 0.34421707951351493, "grad_norm": 4.9150519371032715, "learning_rate": 6.561060595446586e-05, "loss": 0.9623, "num_input_tokens_seen": 79072536, "step": 4914 }, { "epoch": 0.3442871277592442, "grad_norm": 4.550070285797119, "learning_rate": 6.560360770577934e-05, "loss": 1.07, "num_input_tokens_seen": 79088376, "step": 4915 }, { "epoch": 0.34435717600497345, "grad_norm": 4.497488975524902, "learning_rate": 6.559660945709283e-05, "loss": 1.0779, "num_input_tokens_seen": 79104760, "step": 4916 }, { "epoch": 0.34442722425070266, "grad_norm": 4.417470455169678, "learning_rate": 6.558961120840631e-05, "loss": 1.0471, "num_input_tokens_seen": 79120296, "step": 4917 }, { "epoch": 0.3444972724964319, "grad_norm": 4.967655658721924, "learning_rate": 6.558261295971979e-05, "loss": 0.9294, "num_input_tokens_seen": 79135936, "step": 4918 }, { "epoch": 0.3445673207421612, "grad_norm": 4.973440647125244, "learning_rate": 6.557561471103328e-05, "loss": 1.1045, "num_input_tokens_seen": 79151632, "step": 4919 }, { "epoch": 0.34463736898789044, "grad_norm": 5.2282609939575195, "learning_rate": 6.556861646234677e-05, "loss": 1.2508, "num_input_tokens_seen": 79167112, "step": 4920 }, { "epoch": 0.34470741723361964, "grad_norm": 4.118466854095459, "learning_rate": 6.556161821366025e-05, "loss": 1.0162, "num_input_tokens_seen": 79183496, "step": 4921 }, { "epoch": 0.3447774654793489, "grad_norm": 4.74249267578125, "learning_rate": 6.555461996497373e-05, "loss": 1.1383, "num_input_tokens_seen": 79199240, "step": 4922 }, { "epoch": 0.34484751372507816, "grad_norm": 4.21056604385376, "learning_rate": 6.554762171628722e-05, "loss": 1.2693, "num_input_tokens_seen": 79215560, "step": 4923 }, { "epoch": 0.3449175619708074, "grad_norm": 3.584332227706909, "learning_rate": 6.55406234676007e-05, "loss": 1.0466, "num_input_tokens_seen": 79231944, "step": 4924 }, { "epoch": 0.3449876102165366, "grad_norm": 7.287233829498291, "learning_rate": 6.55336252189142e-05, "loss": 1.0568, "num_input_tokens_seen": 79248328, "step": 4925 }, { "epoch": 0.3450576584622659, "grad_norm": 6.5669379234313965, "learning_rate": 6.552662697022767e-05, "loss": 0.8538, "num_input_tokens_seen": 79264712, "step": 4926 }, { "epoch": 0.34512770670799514, "grad_norm": 4.086475849151611, "learning_rate": 6.551962872154117e-05, "loss": 1.082, "num_input_tokens_seen": 79281096, "step": 4927 }, { "epoch": 0.3451977549537244, "grad_norm": 5.543658256530762, "learning_rate": 6.551263047285465e-05, "loss": 0.9835, "num_input_tokens_seen": 79297120, "step": 4928 }, { "epoch": 0.3452678031994536, "grad_norm": 6.474762439727783, "learning_rate": 6.550563222416812e-05, "loss": 1.2022, "num_input_tokens_seen": 79313504, "step": 4929 }, { "epoch": 0.34533785144518286, "grad_norm": 3.8226888179779053, "learning_rate": 6.549863397548161e-05, "loss": 0.9796, "num_input_tokens_seen": 79329888, "step": 4930 }, { "epoch": 0.3454078996909121, "grad_norm": 3.8926212787628174, "learning_rate": 6.54916357267951e-05, "loss": 1.0837, "num_input_tokens_seen": 79346272, "step": 4931 }, { "epoch": 0.3454779479366414, "grad_norm": 4.127487659454346, "learning_rate": 6.548463747810859e-05, "loss": 1.1942, "num_input_tokens_seen": 79362656, "step": 4932 }, { "epoch": 0.3455479961823706, "grad_norm": 6.770711421966553, "learning_rate": 6.547763922942208e-05, "loss": 0.9898, "num_input_tokens_seen": 79378544, "step": 4933 }, { "epoch": 0.34561804442809985, "grad_norm": 5.547317028045654, "learning_rate": 6.547064098073557e-05, "loss": 1.0748, "num_input_tokens_seen": 79394896, "step": 4934 }, { "epoch": 0.3456880926738291, "grad_norm": 4.469418048858643, "learning_rate": 6.546364273204904e-05, "loss": 1.1633, "num_input_tokens_seen": 79410480, "step": 4935 }, { "epoch": 0.34575814091955837, "grad_norm": 4.901472091674805, "learning_rate": 6.545664448336252e-05, "loss": 1.0252, "num_input_tokens_seen": 79426864, "step": 4936 }, { "epoch": 0.34582818916528757, "grad_norm": 3.60495662689209, "learning_rate": 6.5449646234676e-05, "loss": 1.007, "num_input_tokens_seen": 79443248, "step": 4937 }, { "epoch": 0.34589823741101683, "grad_norm": 4.513663291931152, "learning_rate": 6.544264798598949e-05, "loss": 1.2239, "num_input_tokens_seen": 79459632, "step": 4938 }, { "epoch": 0.3459682856567461, "grad_norm": 3.6959240436553955, "learning_rate": 6.543564973730298e-05, "loss": 1.0561, "num_input_tokens_seen": 79475320, "step": 4939 }, { "epoch": 0.34603833390247535, "grad_norm": 4.071475505828857, "learning_rate": 6.542865148861647e-05, "loss": 1.1963, "num_input_tokens_seen": 79491704, "step": 4940 }, { "epoch": 0.34610838214820455, "grad_norm": 3.665421962738037, "learning_rate": 6.542165323992996e-05, "loss": 0.9609, "num_input_tokens_seen": 79508088, "step": 4941 }, { "epoch": 0.3461784303939338, "grad_norm": 4.1782941818237305, "learning_rate": 6.541465499124343e-05, "loss": 0.85, "num_input_tokens_seen": 79523936, "step": 4942 }, { "epoch": 0.34624847863966307, "grad_norm": 4.728964328765869, "learning_rate": 6.540765674255691e-05, "loss": 1.0283, "num_input_tokens_seen": 79539848, "step": 4943 }, { "epoch": 0.34631852688539233, "grad_norm": 5.39119815826416, "learning_rate": 6.540065849387041e-05, "loss": 1.185, "num_input_tokens_seen": 79555040, "step": 4944 }, { "epoch": 0.34638857513112153, "grad_norm": 3.8394956588745117, "learning_rate": 6.53936602451839e-05, "loss": 0.7774, "num_input_tokens_seen": 79570504, "step": 4945 }, { "epoch": 0.3464586233768508, "grad_norm": 5.03010368347168, "learning_rate": 6.538666199649737e-05, "loss": 0.8746, "num_input_tokens_seen": 79586888, "step": 4946 }, { "epoch": 0.34652867162258005, "grad_norm": 3.984548807144165, "learning_rate": 6.537966374781086e-05, "loss": 1.0893, "num_input_tokens_seen": 79603128, "step": 4947 }, { "epoch": 0.3465987198683093, "grad_norm": 5.096433162689209, "learning_rate": 6.537266549912435e-05, "loss": 1.0547, "num_input_tokens_seen": 79618624, "step": 4948 }, { "epoch": 0.3466687681140385, "grad_norm": 3.6773791313171387, "learning_rate": 6.536566725043783e-05, "loss": 0.985, "num_input_tokens_seen": 79635008, "step": 4949 }, { "epoch": 0.3467388163597678, "grad_norm": 4.050341606140137, "learning_rate": 6.535866900175132e-05, "loss": 0.9229, "num_input_tokens_seen": 79651392, "step": 4950 }, { "epoch": 0.34680886460549704, "grad_norm": 3.8354263305664062, "learning_rate": 6.53516707530648e-05, "loss": 1.0264, "num_input_tokens_seen": 79667040, "step": 4951 }, { "epoch": 0.3468789128512263, "grad_norm": 4.2188873291015625, "learning_rate": 6.534467250437829e-05, "loss": 1.0297, "num_input_tokens_seen": 79683152, "step": 4952 }, { "epoch": 0.34694896109695555, "grad_norm": 4.75797700881958, "learning_rate": 6.533767425569177e-05, "loss": 1.2475, "num_input_tokens_seen": 79699536, "step": 4953 }, { "epoch": 0.34701900934268476, "grad_norm": 3.494459867477417, "learning_rate": 6.533067600700527e-05, "loss": 0.9534, "num_input_tokens_seen": 79715920, "step": 4954 }, { "epoch": 0.347089057588414, "grad_norm": 3.860872268676758, "learning_rate": 6.532367775831874e-05, "loss": 1.081, "num_input_tokens_seen": 79731832, "step": 4955 }, { "epoch": 0.3471591058341433, "grad_norm": 4.188973426818848, "learning_rate": 6.531667950963222e-05, "loss": 1.0814, "num_input_tokens_seen": 79747592, "step": 4956 }, { "epoch": 0.34722915407987254, "grad_norm": 5.598564624786377, "learning_rate": 6.530968126094571e-05, "loss": 1.0699, "num_input_tokens_seen": 79763048, "step": 4957 }, { "epoch": 0.34729920232560174, "grad_norm": 4.153980255126953, "learning_rate": 6.53026830122592e-05, "loss": 1.1726, "num_input_tokens_seen": 79777928, "step": 4958 }, { "epoch": 0.347369250571331, "grad_norm": 3.875469446182251, "learning_rate": 6.529568476357269e-05, "loss": 1.1449, "num_input_tokens_seen": 79794312, "step": 4959 }, { "epoch": 0.34743929881706026, "grad_norm": 5.391599655151367, "learning_rate": 6.528868651488617e-05, "loss": 1.1748, "num_input_tokens_seen": 79810696, "step": 4960 }, { "epoch": 0.3475093470627895, "grad_norm": 3.3462777137756348, "learning_rate": 6.528168826619966e-05, "loss": 0.8645, "num_input_tokens_seen": 79826208, "step": 4961 }, { "epoch": 0.3475793953085187, "grad_norm": 3.5444939136505127, "learning_rate": 6.527469001751314e-05, "loss": 1.0989, "num_input_tokens_seen": 79842592, "step": 4962 }, { "epoch": 0.347649443554248, "grad_norm": 4.541754722595215, "learning_rate": 6.526769176882661e-05, "loss": 0.9, "num_input_tokens_seen": 79858976, "step": 4963 }, { "epoch": 0.34771949179997724, "grad_norm": 3.728207588195801, "learning_rate": 6.52606935201401e-05, "loss": 1.0493, "num_input_tokens_seen": 79874944, "step": 4964 }, { "epoch": 0.3477895400457065, "grad_norm": 5.615260601043701, "learning_rate": 6.525369527145359e-05, "loss": 1.0588, "num_input_tokens_seen": 79890968, "step": 4965 }, { "epoch": 0.3478595882914357, "grad_norm": 4.863505840301514, "learning_rate": 6.524669702276708e-05, "loss": 0.9896, "num_input_tokens_seen": 79907352, "step": 4966 }, { "epoch": 0.34792963653716497, "grad_norm": 3.6932058334350586, "learning_rate": 6.523969877408057e-05, "loss": 0.9675, "num_input_tokens_seen": 79923736, "step": 4967 }, { "epoch": 0.3479996847828942, "grad_norm": 4.483904838562012, "learning_rate": 6.523270052539406e-05, "loss": 1.222, "num_input_tokens_seen": 79939360, "step": 4968 }, { "epoch": 0.3480697330286235, "grad_norm": 3.540771007537842, "learning_rate": 6.522570227670753e-05, "loss": 0.9759, "num_input_tokens_seen": 79955744, "step": 4969 }, { "epoch": 0.3481397812743527, "grad_norm": 3.980483293533325, "learning_rate": 6.5218704028021e-05, "loss": 1.1637, "num_input_tokens_seen": 79971368, "step": 4970 }, { "epoch": 0.34820982952008195, "grad_norm": 5.302091598510742, "learning_rate": 6.521170577933451e-05, "loss": 1.0568, "num_input_tokens_seen": 79986688, "step": 4971 }, { "epoch": 0.3482798777658112, "grad_norm": 4.176638603210449, "learning_rate": 6.5204707530648e-05, "loss": 1.1928, "num_input_tokens_seen": 80003072, "step": 4972 }, { "epoch": 0.34834992601154047, "grad_norm": 5.939540386199951, "learning_rate": 6.519770928196147e-05, "loss": 1.0465, "num_input_tokens_seen": 80019344, "step": 4973 }, { "epoch": 0.34841997425726967, "grad_norm": 4.681301593780518, "learning_rate": 6.519071103327496e-05, "loss": 1.1121, "num_input_tokens_seen": 80034504, "step": 4974 }, { "epoch": 0.34849002250299893, "grad_norm": 4.993075847625732, "learning_rate": 6.518371278458845e-05, "loss": 0.8792, "num_input_tokens_seen": 80050488, "step": 4975 }, { "epoch": 0.3485600707487282, "grad_norm": 3.87778377532959, "learning_rate": 6.517671453590192e-05, "loss": 0.9458, "num_input_tokens_seen": 80066872, "step": 4976 }, { "epoch": 0.34863011899445745, "grad_norm": 3.652738332748413, "learning_rate": 6.516971628721541e-05, "loss": 0.9912, "num_input_tokens_seen": 80083232, "step": 4977 }, { "epoch": 0.34870016724018665, "grad_norm": 3.9958438873291016, "learning_rate": 6.51627180385289e-05, "loss": 0.8653, "num_input_tokens_seen": 80099616, "step": 4978 }, { "epoch": 0.3487702154859159, "grad_norm": 4.190839767456055, "learning_rate": 6.515571978984239e-05, "loss": 1.2081, "num_input_tokens_seen": 80116000, "step": 4979 }, { "epoch": 0.3488402637316452, "grad_norm": 4.848324298858643, "learning_rate": 6.514872154115586e-05, "loss": 1.197, "num_input_tokens_seen": 80132384, "step": 4980 }, { "epoch": 0.34891031197737443, "grad_norm": 4.863750457763672, "learning_rate": 6.514172329246937e-05, "loss": 1.1181, "num_input_tokens_seen": 80148768, "step": 4981 }, { "epoch": 0.34898036022310364, "grad_norm": 4.555769443511963, "learning_rate": 6.513472504378284e-05, "loss": 0.9769, "num_input_tokens_seen": 80164984, "step": 4982 }, { "epoch": 0.3490504084688329, "grad_norm": 5.041413307189941, "learning_rate": 6.512772679509632e-05, "loss": 1.0183, "num_input_tokens_seen": 80181336, "step": 4983 }, { "epoch": 0.34912045671456216, "grad_norm": 4.58367395401001, "learning_rate": 6.51207285464098e-05, "loss": 1.232, "num_input_tokens_seen": 80197720, "step": 4984 }, { "epoch": 0.3491905049602914, "grad_norm": 3.9667036533355713, "learning_rate": 6.51137302977233e-05, "loss": 1.1363, "num_input_tokens_seen": 80212776, "step": 4985 }, { "epoch": 0.3492605532060207, "grad_norm": 3.474071979522705, "learning_rate": 6.510673204903678e-05, "loss": 0.8978, "num_input_tokens_seen": 80229160, "step": 4986 }, { "epoch": 0.3493306014517499, "grad_norm": 3.912496328353882, "learning_rate": 6.509973380035027e-05, "loss": 0.9695, "num_input_tokens_seen": 80245544, "step": 4987 }, { "epoch": 0.34940064969747914, "grad_norm": 3.760340690612793, "learning_rate": 6.509273555166376e-05, "loss": 0.97, "num_input_tokens_seen": 80261400, "step": 4988 }, { "epoch": 0.3494706979432084, "grad_norm": 4.982266426086426, "learning_rate": 6.508573730297723e-05, "loss": 1.008, "num_input_tokens_seen": 80277784, "step": 4989 }, { "epoch": 0.34954074618893766, "grad_norm": 4.6823530197143555, "learning_rate": 6.507873905429071e-05, "loss": 1.3118, "num_input_tokens_seen": 80294168, "step": 4990 }, { "epoch": 0.34961079443466686, "grad_norm": 3.768439769744873, "learning_rate": 6.50717408056042e-05, "loss": 0.91, "num_input_tokens_seen": 80310552, "step": 4991 }, { "epoch": 0.3496808426803961, "grad_norm": 3.5285451412200928, "learning_rate": 6.50647425569177e-05, "loss": 0.8937, "num_input_tokens_seen": 80326464, "step": 4992 }, { "epoch": 0.3497508909261254, "grad_norm": 3.875992774963379, "learning_rate": 6.505774430823118e-05, "loss": 0.9514, "num_input_tokens_seen": 80342848, "step": 4993 }, { "epoch": 0.34982093917185464, "grad_norm": 4.061910152435303, "learning_rate": 6.505074605954466e-05, "loss": 0.9607, "num_input_tokens_seen": 80359232, "step": 4994 }, { "epoch": 0.34989098741758384, "grad_norm": 4.456427097320557, "learning_rate": 6.504374781085815e-05, "loss": 1.1927, "num_input_tokens_seen": 80375616, "step": 4995 }, { "epoch": 0.3499610356633131, "grad_norm": 4.381276607513428, "learning_rate": 6.503674956217163e-05, "loss": 1.0291, "num_input_tokens_seen": 80392000, "step": 4996 }, { "epoch": 0.35003108390904236, "grad_norm": 6.789033889770508, "learning_rate": 6.50297513134851e-05, "loss": 0.9971, "num_input_tokens_seen": 80407360, "step": 4997 }, { "epoch": 0.3501011321547716, "grad_norm": 3.953124761581421, "learning_rate": 6.50227530647986e-05, "loss": 1.1585, "num_input_tokens_seen": 80423744, "step": 4998 }, { "epoch": 0.3501711804005008, "grad_norm": 3.498389482498169, "learning_rate": 6.50157548161121e-05, "loss": 0.9259, "num_input_tokens_seen": 80440128, "step": 4999 }, { "epoch": 0.3502412286462301, "grad_norm": 5.498814582824707, "learning_rate": 6.500875656742557e-05, "loss": 0.9867, "num_input_tokens_seen": 80456512, "step": 5000 }, { "epoch": 0.3502412286462301, "eval_loss": 1.1277527809143066, "eval_runtime": 0.1909, "eval_samples_per_second": 5.238, "eval_steps_per_second": 5.238, "num_input_tokens_seen": 80456512, "step": 5000 }, { "epoch": 0.35031127689195934, "grad_norm": 3.440230131149292, "learning_rate": 6.500175831873906e-05, "loss": 0.8354, "num_input_tokens_seen": 80472456, "step": 5001 }, { "epoch": 0.3503813251376886, "grad_norm": 5.069565296173096, "learning_rate": 6.499476007005255e-05, "loss": 1.1994, "num_input_tokens_seen": 80488840, "step": 5002 }, { "epoch": 0.3504513733834178, "grad_norm": 4.53994607925415, "learning_rate": 6.498776182136602e-05, "loss": 1.0962, "num_input_tokens_seen": 80504984, "step": 5003 }, { "epoch": 0.35052142162914707, "grad_norm": 4.136146068572998, "learning_rate": 6.498076357267951e-05, "loss": 0.9885, "num_input_tokens_seen": 80520448, "step": 5004 }, { "epoch": 0.3505914698748763, "grad_norm": 5.609417915344238, "learning_rate": 6.4973765323993e-05, "loss": 1.0242, "num_input_tokens_seen": 80536496, "step": 5005 }, { "epoch": 0.3506615181206056, "grad_norm": 4.375439643859863, "learning_rate": 6.496676707530649e-05, "loss": 0.9937, "num_input_tokens_seen": 80551592, "step": 5006 }, { "epoch": 0.3507315663663348, "grad_norm": 3.5269775390625, "learning_rate": 6.495976882661996e-05, "loss": 0.9995, "num_input_tokens_seen": 80567976, "step": 5007 }, { "epoch": 0.35080161461206405, "grad_norm": 3.9541778564453125, "learning_rate": 6.495277057793346e-05, "loss": 1.1451, "num_input_tokens_seen": 80584360, "step": 5008 }, { "epoch": 0.3508716628577933, "grad_norm": 5.544612407684326, "learning_rate": 6.494577232924694e-05, "loss": 1.3493, "num_input_tokens_seen": 80599856, "step": 5009 }, { "epoch": 0.35094171110352257, "grad_norm": 4.189836502075195, "learning_rate": 6.493877408056041e-05, "loss": 1.2096, "num_input_tokens_seen": 80615392, "step": 5010 }, { "epoch": 0.3510117593492518, "grad_norm": 4.8789825439453125, "learning_rate": 6.49317758318739e-05, "loss": 1.0665, "num_input_tokens_seen": 80631776, "step": 5011 }, { "epoch": 0.35108180759498103, "grad_norm": 4.271617412567139, "learning_rate": 6.49247775831874e-05, "loss": 0.9655, "num_input_tokens_seen": 80648160, "step": 5012 }, { "epoch": 0.3511518558407103, "grad_norm": 4.656182765960693, "learning_rate": 6.491777933450088e-05, "loss": 0.9566, "num_input_tokens_seen": 80664424, "step": 5013 }, { "epoch": 0.35122190408643955, "grad_norm": 6.627303600311279, "learning_rate": 6.491078108581437e-05, "loss": 1.2156, "num_input_tokens_seen": 80680128, "step": 5014 }, { "epoch": 0.35129195233216876, "grad_norm": 3.6189517974853516, "learning_rate": 6.490378283712786e-05, "loss": 1.0828, "num_input_tokens_seen": 80695848, "step": 5015 }, { "epoch": 0.351362000577898, "grad_norm": 3.58449387550354, "learning_rate": 6.489678458844133e-05, "loss": 1.0578, "num_input_tokens_seen": 80712232, "step": 5016 }, { "epoch": 0.3514320488236273, "grad_norm": 4.014143466949463, "learning_rate": 6.488978633975481e-05, "loss": 1.1271, "num_input_tokens_seen": 80726480, "step": 5017 }, { "epoch": 0.35150209706935653, "grad_norm": 4.461588382720947, "learning_rate": 6.488278809106831e-05, "loss": 1.1175, "num_input_tokens_seen": 80742776, "step": 5018 }, { "epoch": 0.35157214531508574, "grad_norm": 4.534054279327393, "learning_rate": 6.48757898423818e-05, "loss": 1.1009, "num_input_tokens_seen": 80758024, "step": 5019 }, { "epoch": 0.351642193560815, "grad_norm": 3.502699613571167, "learning_rate": 6.486879159369527e-05, "loss": 1.0564, "num_input_tokens_seen": 80774152, "step": 5020 }, { "epoch": 0.35171224180654426, "grad_norm": 4.463150978088379, "learning_rate": 6.486179334500876e-05, "loss": 0.9945, "num_input_tokens_seen": 80790528, "step": 5021 }, { "epoch": 0.3517822900522735, "grad_norm": 4.1127543449401855, "learning_rate": 6.485479509632225e-05, "loss": 0.9813, "num_input_tokens_seen": 80805400, "step": 5022 }, { "epoch": 0.3518523382980028, "grad_norm": 3.6113109588623047, "learning_rate": 6.484779684763572e-05, "loss": 1.1071, "num_input_tokens_seen": 80821584, "step": 5023 }, { "epoch": 0.351922386543732, "grad_norm": 4.167325019836426, "learning_rate": 6.484079859894921e-05, "loss": 1.0636, "num_input_tokens_seen": 80837968, "step": 5024 }, { "epoch": 0.35199243478946124, "grad_norm": 3.9422924518585205, "learning_rate": 6.48338003502627e-05, "loss": 1.0665, "num_input_tokens_seen": 80854352, "step": 5025 }, { "epoch": 0.3520624830351905, "grad_norm": 4.867110729217529, "learning_rate": 6.482680210157619e-05, "loss": 0.9098, "num_input_tokens_seen": 80870648, "step": 5026 }, { "epoch": 0.35213253128091976, "grad_norm": 4.714593887329102, "learning_rate": 6.481980385288967e-05, "loss": 1.0256, "num_input_tokens_seen": 80886704, "step": 5027 }, { "epoch": 0.35220257952664896, "grad_norm": 3.8926947116851807, "learning_rate": 6.481280560420315e-05, "loss": 0.9577, "num_input_tokens_seen": 80902184, "step": 5028 }, { "epoch": 0.3522726277723782, "grad_norm": 4.510727405548096, "learning_rate": 6.480580735551664e-05, "loss": 1.1543, "num_input_tokens_seen": 80917960, "step": 5029 }, { "epoch": 0.3523426760181075, "grad_norm": 3.6175239086151123, "learning_rate": 6.479880910683012e-05, "loss": 1.0692, "num_input_tokens_seen": 80934344, "step": 5030 }, { "epoch": 0.35241272426383674, "grad_norm": 4.112790584564209, "learning_rate": 6.47918108581436e-05, "loss": 1.1518, "num_input_tokens_seen": 80950336, "step": 5031 }, { "epoch": 0.35248277250956594, "grad_norm": 4.372056007385254, "learning_rate": 6.478481260945711e-05, "loss": 1.0732, "num_input_tokens_seen": 80966272, "step": 5032 }, { "epoch": 0.3525528207552952, "grad_norm": 5.2401204109191895, "learning_rate": 6.477781436077058e-05, "loss": 1.0378, "num_input_tokens_seen": 80981568, "step": 5033 }, { "epoch": 0.35262286900102446, "grad_norm": 4.032891273498535, "learning_rate": 6.477081611208406e-05, "loss": 1.0788, "num_input_tokens_seen": 80997384, "step": 5034 }, { "epoch": 0.3526929172467537, "grad_norm": 5.448423385620117, "learning_rate": 6.476381786339756e-05, "loss": 1.2136, "num_input_tokens_seen": 81013768, "step": 5035 }, { "epoch": 0.3527629654924829, "grad_norm": 3.5669469833374023, "learning_rate": 6.475681961471104e-05, "loss": 1.0039, "num_input_tokens_seen": 81030152, "step": 5036 }, { "epoch": 0.3528330137382122, "grad_norm": 3.4767303466796875, "learning_rate": 6.474982136602451e-05, "loss": 0.9563, "num_input_tokens_seen": 81046536, "step": 5037 }, { "epoch": 0.35290306198394145, "grad_norm": 4.859378814697266, "learning_rate": 6.474282311733801e-05, "loss": 1.2855, "num_input_tokens_seen": 81062528, "step": 5038 }, { "epoch": 0.3529731102296707, "grad_norm": 5.003366470336914, "learning_rate": 6.47358248686515e-05, "loss": 1.1317, "num_input_tokens_seen": 81078912, "step": 5039 }, { "epoch": 0.3530431584753999, "grad_norm": 3.9362549781799316, "learning_rate": 6.472882661996498e-05, "loss": 1.2051, "num_input_tokens_seen": 81095296, "step": 5040 }, { "epoch": 0.35311320672112917, "grad_norm": 3.319826364517212, "learning_rate": 6.472182837127847e-05, "loss": 0.9632, "num_input_tokens_seen": 81111640, "step": 5041 }, { "epoch": 0.35318325496685843, "grad_norm": 3.5816714763641357, "learning_rate": 6.471483012259195e-05, "loss": 0.9576, "num_input_tokens_seen": 81128024, "step": 5042 }, { "epoch": 0.3532533032125877, "grad_norm": 4.352350234985352, "learning_rate": 6.470783187390543e-05, "loss": 1.1754, "num_input_tokens_seen": 81143992, "step": 5043 }, { "epoch": 0.3533233514583169, "grad_norm": 3.4122314453125, "learning_rate": 6.470083362521892e-05, "loss": 1.104, "num_input_tokens_seen": 81160376, "step": 5044 }, { "epoch": 0.35339339970404615, "grad_norm": 4.0952324867248535, "learning_rate": 6.46938353765324e-05, "loss": 0.9727, "num_input_tokens_seen": 81175968, "step": 5045 }, { "epoch": 0.3534634479497754, "grad_norm": 3.9099533557891846, "learning_rate": 6.46868371278459e-05, "loss": 1.0624, "num_input_tokens_seen": 81192352, "step": 5046 }, { "epoch": 0.35353349619550467, "grad_norm": 6.379274845123291, "learning_rate": 6.467983887915937e-05, "loss": 1.0069, "num_input_tokens_seen": 81208648, "step": 5047 }, { "epoch": 0.3536035444412339, "grad_norm": 3.9650473594665527, "learning_rate": 6.467284063047286e-05, "loss": 1.0727, "num_input_tokens_seen": 81224472, "step": 5048 }, { "epoch": 0.35367359268696313, "grad_norm": 3.7729573249816895, "learning_rate": 6.466584238178635e-05, "loss": 1.0097, "num_input_tokens_seen": 81240232, "step": 5049 }, { "epoch": 0.3537436409326924, "grad_norm": 4.012545585632324, "learning_rate": 6.465884413309982e-05, "loss": 1.0527, "num_input_tokens_seen": 81256616, "step": 5050 }, { "epoch": 0.35381368917842165, "grad_norm": 3.679382801055908, "learning_rate": 6.465184588441331e-05, "loss": 1.0033, "num_input_tokens_seen": 81272888, "step": 5051 }, { "epoch": 0.35388373742415086, "grad_norm": 3.897606134414673, "learning_rate": 6.464484763572681e-05, "loss": 0.9513, "num_input_tokens_seen": 81289272, "step": 5052 }, { "epoch": 0.3539537856698801, "grad_norm": 4.988255023956299, "learning_rate": 6.463784938704029e-05, "loss": 0.8484, "num_input_tokens_seen": 81305656, "step": 5053 }, { "epoch": 0.3540238339156094, "grad_norm": 4.226601600646973, "learning_rate": 6.463085113835376e-05, "loss": 1.0048, "num_input_tokens_seen": 81320912, "step": 5054 }, { "epoch": 0.35409388216133864, "grad_norm": 4.0905070304870605, "learning_rate": 6.462385288966725e-05, "loss": 1.2044, "num_input_tokens_seen": 81337296, "step": 5055 }, { "epoch": 0.3541639304070679, "grad_norm": 4.470916748046875, "learning_rate": 6.461685464098074e-05, "loss": 1.1198, "num_input_tokens_seen": 81353680, "step": 5056 }, { "epoch": 0.3542339786527971, "grad_norm": 3.8264098167419434, "learning_rate": 6.460985639229421e-05, "loss": 0.8444, "num_input_tokens_seen": 81370064, "step": 5057 }, { "epoch": 0.35430402689852636, "grad_norm": 5.07196569442749, "learning_rate": 6.460285814360772e-05, "loss": 0.9035, "num_input_tokens_seen": 81386368, "step": 5058 }, { "epoch": 0.3543740751442556, "grad_norm": 4.830010414123535, "learning_rate": 6.45958598949212e-05, "loss": 1.0685, "num_input_tokens_seen": 81402752, "step": 5059 }, { "epoch": 0.3544441233899849, "grad_norm": 3.5972540378570557, "learning_rate": 6.458886164623468e-05, "loss": 0.9466, "num_input_tokens_seen": 81418856, "step": 5060 }, { "epoch": 0.3545141716357141, "grad_norm": 4.840418815612793, "learning_rate": 6.458186339754816e-05, "loss": 1.0174, "num_input_tokens_seen": 81434344, "step": 5061 }, { "epoch": 0.35458421988144334, "grad_norm": 4.891697883605957, "learning_rate": 6.457486514886166e-05, "loss": 1.0537, "num_input_tokens_seen": 81450280, "step": 5062 }, { "epoch": 0.3546542681271726, "grad_norm": 3.7236123085021973, "learning_rate": 6.456786690017513e-05, "loss": 1.0524, "num_input_tokens_seen": 81466664, "step": 5063 }, { "epoch": 0.35472431637290186, "grad_norm": 3.6597838401794434, "learning_rate": 6.456086865148862e-05, "loss": 0.9648, "num_input_tokens_seen": 81483048, "step": 5064 }, { "epoch": 0.35479436461863106, "grad_norm": 4.048685073852539, "learning_rate": 6.455387040280211e-05, "loss": 1.0033, "num_input_tokens_seen": 81499080, "step": 5065 }, { "epoch": 0.3548644128643603, "grad_norm": 3.683549165725708, "learning_rate": 6.45468721541156e-05, "loss": 1.054, "num_input_tokens_seen": 81515464, "step": 5066 }, { "epoch": 0.3549344611100896, "grad_norm": 4.80827522277832, "learning_rate": 6.453987390542907e-05, "loss": 1.0664, "num_input_tokens_seen": 81530672, "step": 5067 }, { "epoch": 0.35500450935581884, "grad_norm": 3.6255602836608887, "learning_rate": 6.453287565674256e-05, "loss": 1.0027, "num_input_tokens_seen": 81546976, "step": 5068 }, { "epoch": 0.35507455760154805, "grad_norm": 3.430290460586548, "learning_rate": 6.452587740805605e-05, "loss": 1.1253, "num_input_tokens_seen": 81562936, "step": 5069 }, { "epoch": 0.3551446058472773, "grad_norm": 5.140942573547363, "learning_rate": 6.451887915936953e-05, "loss": 0.9522, "num_input_tokens_seen": 81579120, "step": 5070 }, { "epoch": 0.35521465409300657, "grad_norm": 4.5443115234375, "learning_rate": 6.451188091068301e-05, "loss": 1.2141, "num_input_tokens_seen": 81595504, "step": 5071 }, { "epoch": 0.3552847023387358, "grad_norm": 4.33146333694458, "learning_rate": 6.45048826619965e-05, "loss": 1.0189, "num_input_tokens_seen": 81611024, "step": 5072 }, { "epoch": 0.35535475058446503, "grad_norm": 4.212037563323975, "learning_rate": 6.449788441330999e-05, "loss": 1.2356, "num_input_tokens_seen": 81627208, "step": 5073 }, { "epoch": 0.3554247988301943, "grad_norm": 3.714611053466797, "learning_rate": 6.449088616462347e-05, "loss": 0.9699, "num_input_tokens_seen": 81642744, "step": 5074 }, { "epoch": 0.35549484707592355, "grad_norm": 3.985471487045288, "learning_rate": 6.448388791593696e-05, "loss": 1.1381, "num_input_tokens_seen": 81659128, "step": 5075 }, { "epoch": 0.3555648953216528, "grad_norm": 4.519073963165283, "learning_rate": 6.447688966725044e-05, "loss": 1.1515, "num_input_tokens_seen": 81675512, "step": 5076 }, { "epoch": 0.355634943567382, "grad_norm": 4.546297550201416, "learning_rate": 6.446989141856392e-05, "loss": 1.324, "num_input_tokens_seen": 81691528, "step": 5077 }, { "epoch": 0.35570499181311127, "grad_norm": 4.023989200592041, "learning_rate": 6.446289316987741e-05, "loss": 1.068, "num_input_tokens_seen": 81707912, "step": 5078 }, { "epoch": 0.35577504005884053, "grad_norm": 4.442357540130615, "learning_rate": 6.445589492119091e-05, "loss": 0.9021, "num_input_tokens_seen": 81724296, "step": 5079 }, { "epoch": 0.3558450883045698, "grad_norm": 3.63273286819458, "learning_rate": 6.444889667250438e-05, "loss": 0.919, "num_input_tokens_seen": 81740112, "step": 5080 }, { "epoch": 0.355915136550299, "grad_norm": 3.8844716548919678, "learning_rate": 6.444189842381786e-05, "loss": 1.1389, "num_input_tokens_seen": 81756024, "step": 5081 }, { "epoch": 0.35598518479602825, "grad_norm": 3.8603484630584717, "learning_rate": 6.443490017513135e-05, "loss": 0.8949, "num_input_tokens_seen": 81772408, "step": 5082 }, { "epoch": 0.3560552330417575, "grad_norm": 4.305675029754639, "learning_rate": 6.442790192644484e-05, "loss": 1.0133, "num_input_tokens_seen": 81787992, "step": 5083 }, { "epoch": 0.3561252812874868, "grad_norm": 5.944203853607178, "learning_rate": 6.442090367775833e-05, "loss": 1.0635, "num_input_tokens_seen": 81804032, "step": 5084 }, { "epoch": 0.356195329533216, "grad_norm": 5.269783020019531, "learning_rate": 6.441390542907181e-05, "loss": 1.0697, "num_input_tokens_seen": 81820416, "step": 5085 }, { "epoch": 0.35626537777894524, "grad_norm": 3.775933027267456, "learning_rate": 6.44069071803853e-05, "loss": 1.0638, "num_input_tokens_seen": 81836712, "step": 5086 }, { "epoch": 0.3563354260246745, "grad_norm": 4.133227825164795, "learning_rate": 6.439990893169878e-05, "loss": 0.9842, "num_input_tokens_seen": 81853096, "step": 5087 }, { "epoch": 0.35640547427040375, "grad_norm": 4.418367862701416, "learning_rate": 6.439291068301225e-05, "loss": 1.1836, "num_input_tokens_seen": 81869480, "step": 5088 }, { "epoch": 0.356475522516133, "grad_norm": 3.584392786026001, "learning_rate": 6.438591243432575e-05, "loss": 1.0805, "num_input_tokens_seen": 81885864, "step": 5089 }, { "epoch": 0.3565455707618622, "grad_norm": 4.216940402984619, "learning_rate": 6.437891418563923e-05, "loss": 0.8602, "num_input_tokens_seen": 81902248, "step": 5090 }, { "epoch": 0.3566156190075915, "grad_norm": 4.383372783660889, "learning_rate": 6.437191593695272e-05, "loss": 0.9763, "num_input_tokens_seen": 81918464, "step": 5091 }, { "epoch": 0.35668566725332074, "grad_norm": 4.06666374206543, "learning_rate": 6.436491768826621e-05, "loss": 0.9784, "num_input_tokens_seen": 81934848, "step": 5092 }, { "epoch": 0.35675571549905, "grad_norm": 5.485066890716553, "learning_rate": 6.43579194395797e-05, "loss": 0.9188, "num_input_tokens_seen": 81950696, "step": 5093 }, { "epoch": 0.3568257637447792, "grad_norm": 6.794841766357422, "learning_rate": 6.435092119089317e-05, "loss": 1.1765, "num_input_tokens_seen": 81967080, "step": 5094 }, { "epoch": 0.35689581199050846, "grad_norm": 3.531291961669922, "learning_rate": 6.434392294220666e-05, "loss": 0.9904, "num_input_tokens_seen": 81983464, "step": 5095 }, { "epoch": 0.3569658602362377, "grad_norm": 3.694018840789795, "learning_rate": 6.433692469352015e-05, "loss": 1.0384, "num_input_tokens_seen": 81999848, "step": 5096 }, { "epoch": 0.357035908481967, "grad_norm": 6.933582305908203, "learning_rate": 6.432992644483362e-05, "loss": 1.0262, "num_input_tokens_seen": 82015304, "step": 5097 }, { "epoch": 0.3571059567276962, "grad_norm": 5.904866695404053, "learning_rate": 6.432292819614711e-05, "loss": 1.0849, "num_input_tokens_seen": 82031688, "step": 5098 }, { "epoch": 0.35717600497342544, "grad_norm": 4.199756145477295, "learning_rate": 6.43159299474606e-05, "loss": 1.1007, "num_input_tokens_seen": 82047336, "step": 5099 }, { "epoch": 0.3572460532191547, "grad_norm": 3.703000783920288, "learning_rate": 6.430893169877409e-05, "loss": 0.8503, "num_input_tokens_seen": 82063720, "step": 5100 }, { "epoch": 0.35731610146488396, "grad_norm": 4.844930171966553, "learning_rate": 6.430193345008756e-05, "loss": 1.0255, "num_input_tokens_seen": 82079632, "step": 5101 }, { "epoch": 0.35738614971061317, "grad_norm": 3.870488166809082, "learning_rate": 6.429493520140105e-05, "loss": 1.0116, "num_input_tokens_seen": 82094864, "step": 5102 }, { "epoch": 0.3574561979563424, "grad_norm": 3.9125707149505615, "learning_rate": 6.428793695271454e-05, "loss": 0.9626, "num_input_tokens_seen": 82111136, "step": 5103 }, { "epoch": 0.3575262462020717, "grad_norm": 4.347132205963135, "learning_rate": 6.428093870402803e-05, "loss": 0.9538, "num_input_tokens_seen": 82127064, "step": 5104 }, { "epoch": 0.35759629444780094, "grad_norm": 3.739053964614868, "learning_rate": 6.42739404553415e-05, "loss": 1.201, "num_input_tokens_seen": 82143448, "step": 5105 }, { "epoch": 0.35766634269353015, "grad_norm": 4.781857967376709, "learning_rate": 6.4266942206655e-05, "loss": 1.0323, "num_input_tokens_seen": 82159832, "step": 5106 }, { "epoch": 0.3577363909392594, "grad_norm": 4.3711700439453125, "learning_rate": 6.425994395796848e-05, "loss": 1.2023, "num_input_tokens_seen": 82175944, "step": 5107 }, { "epoch": 0.35780643918498867, "grad_norm": 3.6916282176971436, "learning_rate": 6.425294570928196e-05, "loss": 0.787, "num_input_tokens_seen": 82192304, "step": 5108 }, { "epoch": 0.3578764874307179, "grad_norm": 4.418915271759033, "learning_rate": 6.424594746059545e-05, "loss": 1.0842, "num_input_tokens_seen": 82208080, "step": 5109 }, { "epoch": 0.35794653567644713, "grad_norm": 3.9138340950012207, "learning_rate": 6.423894921190893e-05, "loss": 1.0261, "num_input_tokens_seen": 82224464, "step": 5110 }, { "epoch": 0.3580165839221764, "grad_norm": 3.99479079246521, "learning_rate": 6.423195096322242e-05, "loss": 1.0562, "num_input_tokens_seen": 82240664, "step": 5111 }, { "epoch": 0.35808663216790565, "grad_norm": 4.260537147521973, "learning_rate": 6.422495271453591e-05, "loss": 1.1133, "num_input_tokens_seen": 82257048, "step": 5112 }, { "epoch": 0.3581566804136349, "grad_norm": 3.5181097984313965, "learning_rate": 6.42179544658494e-05, "loss": 0.98, "num_input_tokens_seen": 82273432, "step": 5113 }, { "epoch": 0.3582267286593641, "grad_norm": 5.96913480758667, "learning_rate": 6.421095621716287e-05, "loss": 0.8867, "num_input_tokens_seen": 82289816, "step": 5114 }, { "epoch": 0.3582967769050934, "grad_norm": 4.628411769866943, "learning_rate": 6.420395796847635e-05, "loss": 1.1363, "num_input_tokens_seen": 82305784, "step": 5115 }, { "epoch": 0.35836682515082263, "grad_norm": 3.5981955528259277, "learning_rate": 6.419695971978985e-05, "loss": 0.9182, "num_input_tokens_seen": 82321384, "step": 5116 }, { "epoch": 0.3584368733965519, "grad_norm": 4.410891056060791, "learning_rate": 6.418996147110333e-05, "loss": 1.1118, "num_input_tokens_seen": 82336184, "step": 5117 }, { "epoch": 0.3585069216422811, "grad_norm": 4.316674709320068, "learning_rate": 6.418296322241682e-05, "loss": 1.1604, "num_input_tokens_seen": 82351520, "step": 5118 }, { "epoch": 0.35857696988801036, "grad_norm": 5.662688255310059, "learning_rate": 6.41759649737303e-05, "loss": 1.1212, "num_input_tokens_seen": 82367904, "step": 5119 }, { "epoch": 0.3586470181337396, "grad_norm": 4.5336151123046875, "learning_rate": 6.416896672504379e-05, "loss": 1.0093, "num_input_tokens_seen": 82384288, "step": 5120 }, { "epoch": 0.3587170663794689, "grad_norm": 6.43854284286499, "learning_rate": 6.416196847635727e-05, "loss": 0.9434, "num_input_tokens_seen": 82400120, "step": 5121 }, { "epoch": 0.3587871146251981, "grad_norm": 3.519869089126587, "learning_rate": 6.415497022767076e-05, "loss": 0.9704, "num_input_tokens_seen": 82416504, "step": 5122 }, { "epoch": 0.35885716287092734, "grad_norm": 4.426568508148193, "learning_rate": 6.414797197898425e-05, "loss": 0.9778, "num_input_tokens_seen": 82431936, "step": 5123 }, { "epoch": 0.3589272111166566, "grad_norm": 10.392409324645996, "learning_rate": 6.414097373029773e-05, "loss": 1.0289, "num_input_tokens_seen": 82447232, "step": 5124 }, { "epoch": 0.35899725936238586, "grad_norm": 4.133431434631348, "learning_rate": 6.413397548161121e-05, "loss": 1.1998, "num_input_tokens_seen": 82462648, "step": 5125 }, { "epoch": 0.3590673076081151, "grad_norm": 5.43566370010376, "learning_rate": 6.41269772329247e-05, "loss": 0.9587, "num_input_tokens_seen": 82478536, "step": 5126 }, { "epoch": 0.3591373558538443, "grad_norm": 4.205079555511475, "learning_rate": 6.411997898423819e-05, "loss": 1.1152, "num_input_tokens_seen": 82494224, "step": 5127 }, { "epoch": 0.3592074040995736, "grad_norm": 4.165416240692139, "learning_rate": 6.411298073555166e-05, "loss": 1.3017, "num_input_tokens_seen": 82510608, "step": 5128 }, { "epoch": 0.35927745234530284, "grad_norm": 3.7855117321014404, "learning_rate": 6.410598248686515e-05, "loss": 0.8362, "num_input_tokens_seen": 82526992, "step": 5129 }, { "epoch": 0.3593475005910321, "grad_norm": 4.406207084655762, "learning_rate": 6.409898423817864e-05, "loss": 1.0353, "num_input_tokens_seen": 82543376, "step": 5130 }, { "epoch": 0.3594175488367613, "grad_norm": 4.228625774383545, "learning_rate": 6.409198598949213e-05, "loss": 0.9788, "num_input_tokens_seen": 82559760, "step": 5131 }, { "epoch": 0.35948759708249056, "grad_norm": 3.6679983139038086, "learning_rate": 6.40849877408056e-05, "loss": 1.072, "num_input_tokens_seen": 82575552, "step": 5132 }, { "epoch": 0.3595576453282198, "grad_norm": 4.011179447174072, "learning_rate": 6.40779894921191e-05, "loss": 1.0443, "num_input_tokens_seen": 82591936, "step": 5133 }, { "epoch": 0.3596276935739491, "grad_norm": 4.861363410949707, "learning_rate": 6.407099124343258e-05, "loss": 1.1077, "num_input_tokens_seen": 82608320, "step": 5134 }, { "epoch": 0.3596977418196783, "grad_norm": 4.128578186035156, "learning_rate": 6.406399299474605e-05, "loss": 1.1903, "num_input_tokens_seen": 82624704, "step": 5135 }, { "epoch": 0.35976779006540754, "grad_norm": 4.036421775817871, "learning_rate": 6.405699474605954e-05, "loss": 1.1624, "num_input_tokens_seen": 82641088, "step": 5136 }, { "epoch": 0.3598378383111368, "grad_norm": 4.536168098449707, "learning_rate": 6.404999649737303e-05, "loss": 0.9512, "num_input_tokens_seen": 82657472, "step": 5137 }, { "epoch": 0.35990788655686606, "grad_norm": 3.665916681289673, "learning_rate": 6.404299824868652e-05, "loss": 1.1718, "num_input_tokens_seen": 82673856, "step": 5138 }, { "epoch": 0.35997793480259527, "grad_norm": 3.798205852508545, "learning_rate": 6.403600000000001e-05, "loss": 1.0625, "num_input_tokens_seen": 82690240, "step": 5139 }, { "epoch": 0.3600479830483245, "grad_norm": 3.9616305828094482, "learning_rate": 6.40290017513135e-05, "loss": 1.1314, "num_input_tokens_seen": 82706624, "step": 5140 }, { "epoch": 0.3601180312940538, "grad_norm": 4.6059489250183105, "learning_rate": 6.402200350262697e-05, "loss": 0.9534, "num_input_tokens_seen": 82723008, "step": 5141 }, { "epoch": 0.36018807953978305, "grad_norm": 4.2935943603515625, "learning_rate": 6.401500525394045e-05, "loss": 0.9653, "num_input_tokens_seen": 82739392, "step": 5142 }, { "epoch": 0.36025812778551225, "grad_norm": 4.02174711227417, "learning_rate": 6.400800700525395e-05, "loss": 1.2037, "num_input_tokens_seen": 82755600, "step": 5143 }, { "epoch": 0.3603281760312415, "grad_norm": 4.0431599617004395, "learning_rate": 6.400100875656744e-05, "loss": 1.0548, "num_input_tokens_seen": 82771592, "step": 5144 }, { "epoch": 0.36039822427697077, "grad_norm": 3.6921310424804688, "learning_rate": 6.399401050788091e-05, "loss": 0.8992, "num_input_tokens_seen": 82787728, "step": 5145 }, { "epoch": 0.36046827252270003, "grad_norm": 4.27170991897583, "learning_rate": 6.39870122591944e-05, "loss": 1.0908, "num_input_tokens_seen": 82803152, "step": 5146 }, { "epoch": 0.36053832076842923, "grad_norm": 4.670827865600586, "learning_rate": 6.398001401050789e-05, "loss": 1.1134, "num_input_tokens_seen": 82819536, "step": 5147 }, { "epoch": 0.3606083690141585, "grad_norm": 3.6219654083251953, "learning_rate": 6.397301576182136e-05, "loss": 0.9576, "num_input_tokens_seen": 82835920, "step": 5148 }, { "epoch": 0.36067841725988775, "grad_norm": 3.53466796875, "learning_rate": 6.396601751313485e-05, "loss": 0.905, "num_input_tokens_seen": 82852304, "step": 5149 }, { "epoch": 0.360748465505617, "grad_norm": 4.027638912200928, "learning_rate": 6.395901926444834e-05, "loss": 1.0661, "num_input_tokens_seen": 82867816, "step": 5150 }, { "epoch": 0.3608185137513462, "grad_norm": 5.701491832733154, "learning_rate": 6.395202101576183e-05, "loss": 1.2476, "num_input_tokens_seen": 82883480, "step": 5151 }, { "epoch": 0.3608885619970755, "grad_norm": 4.156428337097168, "learning_rate": 6.39450227670753e-05, "loss": 1.1507, "num_input_tokens_seen": 82899608, "step": 5152 }, { "epoch": 0.36095861024280473, "grad_norm": 5.278023719787598, "learning_rate": 6.39380245183888e-05, "loss": 1.0583, "num_input_tokens_seen": 82915656, "step": 5153 }, { "epoch": 0.361028658488534, "grad_norm": 3.6892948150634766, "learning_rate": 6.393102626970228e-05, "loss": 1.0063, "num_input_tokens_seen": 82931632, "step": 5154 }, { "epoch": 0.3610987067342632, "grad_norm": 5.179676055908203, "learning_rate": 6.392402802101576e-05, "loss": 1.1701, "num_input_tokens_seen": 82947344, "step": 5155 }, { "epoch": 0.36116875497999246, "grad_norm": 4.948189735412598, "learning_rate": 6.391702977232925e-05, "loss": 1.056, "num_input_tokens_seen": 82963720, "step": 5156 }, { "epoch": 0.3612388032257217, "grad_norm": 4.465184688568115, "learning_rate": 6.391003152364274e-05, "loss": 1.225, "num_input_tokens_seen": 82980048, "step": 5157 }, { "epoch": 0.361308851471451, "grad_norm": 4.053642749786377, "learning_rate": 6.390303327495622e-05, "loss": 1.1481, "num_input_tokens_seen": 82996432, "step": 5158 }, { "epoch": 0.36137889971718024, "grad_norm": 8.422308921813965, "learning_rate": 6.38960350262697e-05, "loss": 1.241, "num_input_tokens_seen": 83012560, "step": 5159 }, { "epoch": 0.36144894796290944, "grad_norm": 3.4304730892181396, "learning_rate": 6.38890367775832e-05, "loss": 1.1008, "num_input_tokens_seen": 83028680, "step": 5160 }, { "epoch": 0.3615189962086387, "grad_norm": 9.87295913696289, "learning_rate": 6.388203852889668e-05, "loss": 1.0512, "num_input_tokens_seen": 83045064, "step": 5161 }, { "epoch": 0.36158904445436796, "grad_norm": 3.7000608444213867, "learning_rate": 6.387504028021015e-05, "loss": 1.0758, "num_input_tokens_seen": 83061448, "step": 5162 }, { "epoch": 0.3616590927000972, "grad_norm": 3.5490283966064453, "learning_rate": 6.386804203152364e-05, "loss": 0.9705, "num_input_tokens_seen": 83077176, "step": 5163 }, { "epoch": 0.3617291409458264, "grad_norm": 3.850770950317383, "learning_rate": 6.386104378283714e-05, "loss": 1.0371, "num_input_tokens_seen": 83093560, "step": 5164 }, { "epoch": 0.3617991891915557, "grad_norm": 5.09017276763916, "learning_rate": 6.385404553415062e-05, "loss": 1.0084, "num_input_tokens_seen": 83109752, "step": 5165 }, { "epoch": 0.36186923743728494, "grad_norm": 4.801665782928467, "learning_rate": 6.38470472854641e-05, "loss": 1.0909, "num_input_tokens_seen": 83125048, "step": 5166 }, { "epoch": 0.3619392856830142, "grad_norm": 3.954345941543579, "learning_rate": 6.38400490367776e-05, "loss": 0.9775, "num_input_tokens_seen": 83140808, "step": 5167 }, { "epoch": 0.3620093339287434, "grad_norm": 4.874080657958984, "learning_rate": 6.383305078809107e-05, "loss": 1.1408, "num_input_tokens_seen": 83157176, "step": 5168 }, { "epoch": 0.36207938217447266, "grad_norm": 4.3997111320495605, "learning_rate": 6.382605253940454e-05, "loss": 1.1489, "num_input_tokens_seen": 83173560, "step": 5169 }, { "epoch": 0.3621494304202019, "grad_norm": 4.431540489196777, "learning_rate": 6.381905429071805e-05, "loss": 1.1138, "num_input_tokens_seen": 83189864, "step": 5170 }, { "epoch": 0.3622194786659312, "grad_norm": 4.48107385635376, "learning_rate": 6.381205604203153e-05, "loss": 1.2451, "num_input_tokens_seen": 83205560, "step": 5171 }, { "epoch": 0.3622895269116604, "grad_norm": 4.369350910186768, "learning_rate": 6.380505779334501e-05, "loss": 1.0877, "num_input_tokens_seen": 83221544, "step": 5172 }, { "epoch": 0.36235957515738965, "grad_norm": 3.8510024547576904, "learning_rate": 6.37980595446585e-05, "loss": 0.8895, "num_input_tokens_seen": 83237928, "step": 5173 }, { "epoch": 0.3624296234031189, "grad_norm": 3.7452402114868164, "learning_rate": 6.379106129597199e-05, "loss": 1.1425, "num_input_tokens_seen": 83254168, "step": 5174 }, { "epoch": 0.36249967164884817, "grad_norm": 4.53076171875, "learning_rate": 6.378406304728546e-05, "loss": 1.1516, "num_input_tokens_seen": 83269568, "step": 5175 }, { "epoch": 0.36256971989457737, "grad_norm": 3.729602813720703, "learning_rate": 6.377706479859895e-05, "loss": 1.2105, "num_input_tokens_seen": 83285952, "step": 5176 }, { "epoch": 0.36263976814030663, "grad_norm": 4.085333824157715, "learning_rate": 6.377006654991244e-05, "loss": 1.0517, "num_input_tokens_seen": 83302200, "step": 5177 }, { "epoch": 0.3627098163860359, "grad_norm": 3.9202303886413574, "learning_rate": 6.376306830122593e-05, "loss": 1.0358, "num_input_tokens_seen": 83318584, "step": 5178 }, { "epoch": 0.36277986463176515, "grad_norm": 4.10648775100708, "learning_rate": 6.37560700525394e-05, "loss": 1.3052, "num_input_tokens_seen": 83334288, "step": 5179 }, { "epoch": 0.36284991287749435, "grad_norm": 3.975217580795288, "learning_rate": 6.374907180385289e-05, "loss": 1.1725, "num_input_tokens_seen": 83350096, "step": 5180 }, { "epoch": 0.3629199611232236, "grad_norm": 4.207096099853516, "learning_rate": 6.374207355516638e-05, "loss": 1.1396, "num_input_tokens_seen": 83366480, "step": 5181 }, { "epoch": 0.36299000936895287, "grad_norm": 3.9960830211639404, "learning_rate": 6.373507530647986e-05, "loss": 1.1971, "num_input_tokens_seen": 83381832, "step": 5182 }, { "epoch": 0.36306005761468213, "grad_norm": 4.142012596130371, "learning_rate": 6.372807705779334e-05, "loss": 1.0829, "num_input_tokens_seen": 83398216, "step": 5183 }, { "epoch": 0.36313010586041133, "grad_norm": 3.8692433834075928, "learning_rate": 6.372107880910685e-05, "loss": 1.0649, "num_input_tokens_seen": 83414600, "step": 5184 }, { "epoch": 0.3632001541061406, "grad_norm": 3.663544178009033, "learning_rate": 6.371408056042032e-05, "loss": 0.8924, "num_input_tokens_seen": 83430984, "step": 5185 }, { "epoch": 0.36327020235186985, "grad_norm": 4.056418418884277, "learning_rate": 6.37070823117338e-05, "loss": 0.9463, "num_input_tokens_seen": 83447368, "step": 5186 }, { "epoch": 0.3633402505975991, "grad_norm": 4.209747314453125, "learning_rate": 6.37000840630473e-05, "loss": 1.0641, "num_input_tokens_seen": 83463752, "step": 5187 }, { "epoch": 0.3634102988433283, "grad_norm": 4.93091344833374, "learning_rate": 6.369308581436077e-05, "loss": 1.2046, "num_input_tokens_seen": 83479424, "step": 5188 }, { "epoch": 0.3634803470890576, "grad_norm": 3.6523993015289307, "learning_rate": 6.368608756567425e-05, "loss": 0.8965, "num_input_tokens_seen": 83495808, "step": 5189 }, { "epoch": 0.36355039533478684, "grad_norm": 4.8949294090271, "learning_rate": 6.367908931698775e-05, "loss": 0.8928, "num_input_tokens_seen": 83511448, "step": 5190 }, { "epoch": 0.3636204435805161, "grad_norm": 5.856332778930664, "learning_rate": 6.367209106830124e-05, "loss": 0.9844, "num_input_tokens_seen": 83526664, "step": 5191 }, { "epoch": 0.3636904918262453, "grad_norm": 3.762014865875244, "learning_rate": 6.366509281961471e-05, "loss": 1.0865, "num_input_tokens_seen": 83542792, "step": 5192 }, { "epoch": 0.36376054007197456, "grad_norm": 4.075290203094482, "learning_rate": 6.36580945709282e-05, "loss": 1.0261, "num_input_tokens_seen": 83558992, "step": 5193 }, { "epoch": 0.3638305883177038, "grad_norm": 4.124780178070068, "learning_rate": 6.365109632224169e-05, "loss": 1.1021, "num_input_tokens_seen": 83575376, "step": 5194 }, { "epoch": 0.3639006365634331, "grad_norm": 6.1159210205078125, "learning_rate": 6.364409807355517e-05, "loss": 0.9209, "num_input_tokens_seen": 83591400, "step": 5195 }, { "epoch": 0.36397068480916234, "grad_norm": 3.8839027881622314, "learning_rate": 6.363709982486865e-05, "loss": 1.0866, "num_input_tokens_seen": 83607784, "step": 5196 }, { "epoch": 0.36404073305489154, "grad_norm": 4.260892391204834, "learning_rate": 6.363010157618214e-05, "loss": 1.0747, "num_input_tokens_seen": 83623944, "step": 5197 }, { "epoch": 0.3641107813006208, "grad_norm": 4.111022472381592, "learning_rate": 6.362310332749563e-05, "loss": 1.2594, "num_input_tokens_seen": 83639408, "step": 5198 }, { "epoch": 0.36418082954635006, "grad_norm": 3.567676305770874, "learning_rate": 6.361610507880911e-05, "loss": 1.0115, "num_input_tokens_seen": 83655496, "step": 5199 }, { "epoch": 0.3642508777920793, "grad_norm": 4.935754299163818, "learning_rate": 6.36091068301226e-05, "loss": 1.2028, "num_input_tokens_seen": 83671016, "step": 5200 }, { "epoch": 0.3642508777920793, "eval_loss": 1.129547119140625, "eval_runtime": 0.1857, "eval_samples_per_second": 5.386, "eval_steps_per_second": 5.386, "num_input_tokens_seen": 83671016, "step": 5200 }, { "epoch": 0.3643209260378085, "grad_norm": 3.8546817302703857, "learning_rate": 6.360210858143608e-05, "loss": 0.9873, "num_input_tokens_seen": 83685736, "step": 5201 }, { "epoch": 0.3643909742835378, "grad_norm": 3.900425910949707, "learning_rate": 6.359511033274956e-05, "loss": 1.0005, "num_input_tokens_seen": 83702120, "step": 5202 }, { "epoch": 0.36446102252926704, "grad_norm": 4.270096302032471, "learning_rate": 6.358811208406305e-05, "loss": 0.9098, "num_input_tokens_seen": 83718504, "step": 5203 }, { "epoch": 0.3645310707749963, "grad_norm": 5.027628421783447, "learning_rate": 6.358111383537655e-05, "loss": 1.1363, "num_input_tokens_seen": 83734888, "step": 5204 }, { "epoch": 0.3646011190207255, "grad_norm": 4.843371868133545, "learning_rate": 6.357411558669002e-05, "loss": 0.9629, "num_input_tokens_seen": 83749488, "step": 5205 }, { "epoch": 0.36467116726645477, "grad_norm": 7.530435562133789, "learning_rate": 6.35671173380035e-05, "loss": 1.0575, "num_input_tokens_seen": 83765872, "step": 5206 }, { "epoch": 0.364741215512184, "grad_norm": 4.028171062469482, "learning_rate": 6.356011908931699e-05, "loss": 1.2011, "num_input_tokens_seen": 83781936, "step": 5207 }, { "epoch": 0.3648112637579133, "grad_norm": 6.744492053985596, "learning_rate": 6.355312084063048e-05, "loss": 1.0464, "num_input_tokens_seen": 83797520, "step": 5208 }, { "epoch": 0.3648813120036425, "grad_norm": 3.9689910411834717, "learning_rate": 6.354612259194395e-05, "loss": 1.0156, "num_input_tokens_seen": 83813872, "step": 5209 }, { "epoch": 0.36495136024937175, "grad_norm": 4.990142345428467, "learning_rate": 6.353912434325745e-05, "loss": 1.2019, "num_input_tokens_seen": 83830256, "step": 5210 }, { "epoch": 0.365021408495101, "grad_norm": 4.547253131866455, "learning_rate": 6.353212609457094e-05, "loss": 1.1825, "num_input_tokens_seen": 83846640, "step": 5211 }, { "epoch": 0.36509145674083027, "grad_norm": 4.108243465423584, "learning_rate": 6.352512784588442e-05, "loss": 1.1827, "num_input_tokens_seen": 83863024, "step": 5212 }, { "epoch": 0.36516150498655947, "grad_norm": 4.540827751159668, "learning_rate": 6.351812959719789e-05, "loss": 1.0034, "num_input_tokens_seen": 83878976, "step": 5213 }, { "epoch": 0.36523155323228873, "grad_norm": 5.3233842849731445, "learning_rate": 6.35111313485114e-05, "loss": 1.2247, "num_input_tokens_seen": 83895360, "step": 5214 }, { "epoch": 0.365301601478018, "grad_norm": 5.161661624908447, "learning_rate": 6.350413309982487e-05, "loss": 1.2067, "num_input_tokens_seen": 83910064, "step": 5215 }, { "epoch": 0.36537164972374725, "grad_norm": 4.908864498138428, "learning_rate": 6.349713485113836e-05, "loss": 1.1748, "num_input_tokens_seen": 83926448, "step": 5216 }, { "epoch": 0.36544169796947645, "grad_norm": 5.954193592071533, "learning_rate": 6.349013660245185e-05, "loss": 0.99, "num_input_tokens_seen": 83942248, "step": 5217 }, { "epoch": 0.3655117462152057, "grad_norm": 3.5276272296905518, "learning_rate": 6.348313835376534e-05, "loss": 0.9637, "num_input_tokens_seen": 83958632, "step": 5218 }, { "epoch": 0.365581794460935, "grad_norm": 3.736661195755005, "learning_rate": 6.347614010507881e-05, "loss": 1.059, "num_input_tokens_seen": 83975016, "step": 5219 }, { "epoch": 0.36565184270666423, "grad_norm": 5.434671401977539, "learning_rate": 6.34691418563923e-05, "loss": 1.0891, "num_input_tokens_seen": 83990424, "step": 5220 }, { "epoch": 0.36572189095239344, "grad_norm": 3.9301772117614746, "learning_rate": 6.346214360770579e-05, "loss": 1.0278, "num_input_tokens_seen": 84006808, "step": 5221 }, { "epoch": 0.3657919391981227, "grad_norm": 5.101827621459961, "learning_rate": 6.345514535901926e-05, "loss": 1.2129, "num_input_tokens_seen": 84022624, "step": 5222 }, { "epoch": 0.36586198744385195, "grad_norm": 4.042179584503174, "learning_rate": 6.344814711033275e-05, "loss": 1.2996, "num_input_tokens_seen": 84038688, "step": 5223 }, { "epoch": 0.3659320356895812, "grad_norm": 4.2309441566467285, "learning_rate": 6.344114886164624e-05, "loss": 1.1113, "num_input_tokens_seen": 84055072, "step": 5224 }, { "epoch": 0.3660020839353104, "grad_norm": 6.73452615737915, "learning_rate": 6.343415061295973e-05, "loss": 1.1523, "num_input_tokens_seen": 84071456, "step": 5225 }, { "epoch": 0.3660721321810397, "grad_norm": 3.684497833251953, "learning_rate": 6.34271523642732e-05, "loss": 1.0967, "num_input_tokens_seen": 84087840, "step": 5226 }, { "epoch": 0.36614218042676894, "grad_norm": 3.7974796295166016, "learning_rate": 6.342015411558669e-05, "loss": 1.0675, "num_input_tokens_seen": 84103456, "step": 5227 }, { "epoch": 0.3662122286724982, "grad_norm": 4.681473255157471, "learning_rate": 6.341315586690018e-05, "loss": 0.9202, "num_input_tokens_seen": 84119840, "step": 5228 }, { "epoch": 0.36628227691822746, "grad_norm": 4.197212219238281, "learning_rate": 6.340615761821366e-05, "loss": 0.9594, "num_input_tokens_seen": 84136224, "step": 5229 }, { "epoch": 0.36635232516395666, "grad_norm": 4.1414794921875, "learning_rate": 6.339915936952716e-05, "loss": 1.1421, "num_input_tokens_seen": 84152552, "step": 5230 }, { "epoch": 0.3664223734096859, "grad_norm": 4.138907432556152, "learning_rate": 6.339216112084065e-05, "loss": 1.0841, "num_input_tokens_seen": 84168936, "step": 5231 }, { "epoch": 0.3664924216554152, "grad_norm": 4.723425388336182, "learning_rate": 6.338516287215412e-05, "loss": 1.1302, "num_input_tokens_seen": 84185320, "step": 5232 }, { "epoch": 0.36656246990114444, "grad_norm": 4.167308330535889, "learning_rate": 6.33781646234676e-05, "loss": 1.2636, "num_input_tokens_seen": 84201704, "step": 5233 }, { "epoch": 0.36663251814687364, "grad_norm": 3.832829236984253, "learning_rate": 6.337116637478109e-05, "loss": 0.9078, "num_input_tokens_seen": 84217216, "step": 5234 }, { "epoch": 0.3667025663926029, "grad_norm": 6.1642842292785645, "learning_rate": 6.336416812609457e-05, "loss": 0.8102, "num_input_tokens_seen": 84232896, "step": 5235 }, { "epoch": 0.36677261463833216, "grad_norm": 3.948350429534912, "learning_rate": 6.335716987740806e-05, "loss": 1.1285, "num_input_tokens_seen": 84248448, "step": 5236 }, { "epoch": 0.3668426628840614, "grad_norm": 3.6216750144958496, "learning_rate": 6.335017162872155e-05, "loss": 1.022, "num_input_tokens_seen": 84264832, "step": 5237 }, { "epoch": 0.3669127111297906, "grad_norm": 5.787931442260742, "learning_rate": 6.334317338003504e-05, "loss": 1.1968, "num_input_tokens_seen": 84281216, "step": 5238 }, { "epoch": 0.3669827593755199, "grad_norm": 4.830391883850098, "learning_rate": 6.333617513134851e-05, "loss": 1.3014, "num_input_tokens_seen": 84297352, "step": 5239 }, { "epoch": 0.36705280762124914, "grad_norm": 3.839425563812256, "learning_rate": 6.332917688266199e-05, "loss": 1.031, "num_input_tokens_seen": 84313608, "step": 5240 }, { "epoch": 0.3671228558669784, "grad_norm": 3.963012456893921, "learning_rate": 6.332217863397549e-05, "loss": 1.0232, "num_input_tokens_seen": 84329680, "step": 5241 }, { "epoch": 0.3671929041127076, "grad_norm": 3.4596047401428223, "learning_rate": 6.331518038528897e-05, "loss": 1.0028, "num_input_tokens_seen": 84346064, "step": 5242 }, { "epoch": 0.36726295235843687, "grad_norm": 5.7928290367126465, "learning_rate": 6.330818213660246e-05, "loss": 1.2292, "num_input_tokens_seen": 84361800, "step": 5243 }, { "epoch": 0.3673330006041661, "grad_norm": 3.5012640953063965, "learning_rate": 6.330118388791594e-05, "loss": 1.0095, "num_input_tokens_seen": 84378184, "step": 5244 }, { "epoch": 0.3674030488498954, "grad_norm": 4.464978218078613, "learning_rate": 6.329418563922943e-05, "loss": 1.2258, "num_input_tokens_seen": 84394568, "step": 5245 }, { "epoch": 0.3674730970956246, "grad_norm": 3.4716012477874756, "learning_rate": 6.328718739054291e-05, "loss": 1.087, "num_input_tokens_seen": 84410584, "step": 5246 }, { "epoch": 0.36754314534135385, "grad_norm": 4.010568618774414, "learning_rate": 6.32801891418564e-05, "loss": 1.0823, "num_input_tokens_seen": 84426968, "step": 5247 }, { "epoch": 0.3676131935870831, "grad_norm": 3.763718605041504, "learning_rate": 6.327319089316989e-05, "loss": 1.079, "num_input_tokens_seen": 84443352, "step": 5248 }, { "epoch": 0.36768324183281237, "grad_norm": 5.381477355957031, "learning_rate": 6.326619264448336e-05, "loss": 1.0387, "num_input_tokens_seen": 84459736, "step": 5249 }, { "epoch": 0.3677532900785416, "grad_norm": 3.6646018028259277, "learning_rate": 6.325919439579685e-05, "loss": 1.026, "num_input_tokens_seen": 84476120, "step": 5250 }, { "epoch": 0.36782333832427083, "grad_norm": 4.005465507507324, "learning_rate": 6.325219614711034e-05, "loss": 1.0341, "num_input_tokens_seen": 84492400, "step": 5251 }, { "epoch": 0.3678933865700001, "grad_norm": 3.4287807941436768, "learning_rate": 6.324519789842383e-05, "loss": 0.9892, "num_input_tokens_seen": 84508720, "step": 5252 }, { "epoch": 0.36796343481572935, "grad_norm": 3.8715076446533203, "learning_rate": 6.32381996497373e-05, "loss": 1.3025, "num_input_tokens_seen": 84524592, "step": 5253 }, { "epoch": 0.36803348306145856, "grad_norm": 3.4789586067199707, "learning_rate": 6.323120140105079e-05, "loss": 0.9109, "num_input_tokens_seen": 84540176, "step": 5254 }, { "epoch": 0.3681035313071878, "grad_norm": 3.992988348007202, "learning_rate": 6.322420315236428e-05, "loss": 1.138, "num_input_tokens_seen": 84556560, "step": 5255 }, { "epoch": 0.3681735795529171, "grad_norm": 4.3957743644714355, "learning_rate": 6.321720490367775e-05, "loss": 1.2542, "num_input_tokens_seen": 84572240, "step": 5256 }, { "epoch": 0.36824362779864633, "grad_norm": 3.7909469604492188, "learning_rate": 6.321020665499126e-05, "loss": 0.9282, "num_input_tokens_seen": 84587400, "step": 5257 }, { "epoch": 0.36831367604437554, "grad_norm": 3.747345209121704, "learning_rate": 6.320320840630474e-05, "loss": 0.9673, "num_input_tokens_seen": 84603240, "step": 5258 }, { "epoch": 0.3683837242901048, "grad_norm": 3.6753249168395996, "learning_rate": 6.319621015761822e-05, "loss": 1.0435, "num_input_tokens_seen": 84619624, "step": 5259 }, { "epoch": 0.36845377253583406, "grad_norm": 3.6952924728393555, "learning_rate": 6.31892119089317e-05, "loss": 1.0577, "num_input_tokens_seen": 84636008, "step": 5260 }, { "epoch": 0.3685238207815633, "grad_norm": 4.606325149536133, "learning_rate": 6.318221366024518e-05, "loss": 1.0212, "num_input_tokens_seen": 84652392, "step": 5261 }, { "epoch": 0.3685938690272925, "grad_norm": 3.749755382537842, "learning_rate": 6.317521541155867e-05, "loss": 1.0378, "num_input_tokens_seen": 84667832, "step": 5262 }, { "epoch": 0.3686639172730218, "grad_norm": 3.7973029613494873, "learning_rate": 6.316821716287216e-05, "loss": 1.1695, "num_input_tokens_seen": 84683904, "step": 5263 }, { "epoch": 0.36873396551875104, "grad_norm": 4.264857769012451, "learning_rate": 6.316121891418565e-05, "loss": 1.0638, "num_input_tokens_seen": 84700288, "step": 5264 }, { "epoch": 0.3688040137644803, "grad_norm": 3.4577653408050537, "learning_rate": 6.315422066549914e-05, "loss": 1.0037, "num_input_tokens_seen": 84716672, "step": 5265 }, { "epoch": 0.36887406201020956, "grad_norm": 4.049471378326416, "learning_rate": 6.314722241681261e-05, "loss": 1.0595, "num_input_tokens_seen": 84732976, "step": 5266 }, { "epoch": 0.36894411025593876, "grad_norm": 4.293907165527344, "learning_rate": 6.314022416812609e-05, "loss": 1.1094, "num_input_tokens_seen": 84747480, "step": 5267 }, { "epoch": 0.369014158501668, "grad_norm": 7.115272045135498, "learning_rate": 6.313322591943959e-05, "loss": 0.9904, "num_input_tokens_seen": 84763864, "step": 5268 }, { "epoch": 0.3690842067473973, "grad_norm": 6.85962438583374, "learning_rate": 6.312622767075306e-05, "loss": 0.9934, "num_input_tokens_seen": 84778648, "step": 5269 }, { "epoch": 0.36915425499312654, "grad_norm": 4.24301290512085, "learning_rate": 6.311922942206655e-05, "loss": 1.0426, "num_input_tokens_seen": 84794440, "step": 5270 }, { "epoch": 0.36922430323885574, "grad_norm": 3.533189535140991, "learning_rate": 6.311223117338004e-05, "loss": 0.9863, "num_input_tokens_seen": 84810824, "step": 5271 }, { "epoch": 0.369294351484585, "grad_norm": 4.706559658050537, "learning_rate": 6.310523292469353e-05, "loss": 1.2352, "num_input_tokens_seen": 84827208, "step": 5272 }, { "epoch": 0.36936439973031426, "grad_norm": 3.492366075515747, "learning_rate": 6.3098234676007e-05, "loss": 0.9802, "num_input_tokens_seen": 84842744, "step": 5273 }, { "epoch": 0.3694344479760435, "grad_norm": 4.733495712280273, "learning_rate": 6.30912364273205e-05, "loss": 1.1914, "num_input_tokens_seen": 84858432, "step": 5274 }, { "epoch": 0.3695044962217727, "grad_norm": 3.6145412921905518, "learning_rate": 6.308423817863398e-05, "loss": 1.0411, "num_input_tokens_seen": 84874496, "step": 5275 }, { "epoch": 0.369574544467502, "grad_norm": 3.764568328857422, "learning_rate": 6.307723992994746e-05, "loss": 1.1604, "num_input_tokens_seen": 84890880, "step": 5276 }, { "epoch": 0.36964459271323125, "grad_norm": 5.0368428230285645, "learning_rate": 6.307024168126095e-05, "loss": 1.0006, "num_input_tokens_seen": 84907264, "step": 5277 }, { "epoch": 0.3697146409589605, "grad_norm": 3.9158520698547363, "learning_rate": 6.306324343257443e-05, "loss": 1.0409, "num_input_tokens_seen": 84923648, "step": 5278 }, { "epoch": 0.3697846892046897, "grad_norm": 4.663973808288574, "learning_rate": 6.305624518388792e-05, "loss": 0.9818, "num_input_tokens_seen": 84939976, "step": 5279 }, { "epoch": 0.36985473745041897, "grad_norm": 4.3741455078125, "learning_rate": 6.30492469352014e-05, "loss": 1.2902, "num_input_tokens_seen": 84956184, "step": 5280 }, { "epoch": 0.36992478569614823, "grad_norm": 5.071192264556885, "learning_rate": 6.304224868651489e-05, "loss": 1.0856, "num_input_tokens_seen": 84972024, "step": 5281 }, { "epoch": 0.3699948339418775, "grad_norm": 3.5479323863983154, "learning_rate": 6.303525043782838e-05, "loss": 1.0809, "num_input_tokens_seen": 84988408, "step": 5282 }, { "epoch": 0.3700648821876067, "grad_norm": 4.6933465003967285, "learning_rate": 6.302825218914186e-05, "loss": 1.1826, "num_input_tokens_seen": 85004720, "step": 5283 }, { "epoch": 0.37013493043333595, "grad_norm": 3.594067096710205, "learning_rate": 6.302125394045535e-05, "loss": 0.8992, "num_input_tokens_seen": 85020456, "step": 5284 }, { "epoch": 0.3702049786790652, "grad_norm": 3.972480535507202, "learning_rate": 6.301425569176884e-05, "loss": 1.09, "num_input_tokens_seen": 85036840, "step": 5285 }, { "epoch": 0.37027502692479447, "grad_norm": 4.674763202667236, "learning_rate": 6.300725744308232e-05, "loss": 1.029, "num_input_tokens_seen": 85053224, "step": 5286 }, { "epoch": 0.3703450751705237, "grad_norm": 4.716235160827637, "learning_rate": 6.300025919439579e-05, "loss": 0.9872, "num_input_tokens_seen": 85068624, "step": 5287 }, { "epoch": 0.37041512341625293, "grad_norm": 5.01246452331543, "learning_rate": 6.299326094570928e-05, "loss": 0.9608, "num_input_tokens_seen": 85085008, "step": 5288 }, { "epoch": 0.3704851716619822, "grad_norm": 5.020605087280273, "learning_rate": 6.298626269702277e-05, "loss": 0.9759, "num_input_tokens_seen": 85101392, "step": 5289 }, { "epoch": 0.37055521990771145, "grad_norm": 5.841190814971924, "learning_rate": 6.297926444833626e-05, "loss": 1.3302, "num_input_tokens_seen": 85117776, "step": 5290 }, { "epoch": 0.37062526815344066, "grad_norm": 4.592007637023926, "learning_rate": 6.297226619964975e-05, "loss": 0.9129, "num_input_tokens_seen": 85134160, "step": 5291 }, { "epoch": 0.3706953163991699, "grad_norm": 3.678398609161377, "learning_rate": 6.296526795096323e-05, "loss": 0.9809, "num_input_tokens_seen": 85150544, "step": 5292 }, { "epoch": 0.3707653646448992, "grad_norm": 3.9148921966552734, "learning_rate": 6.295826970227671e-05, "loss": 1.1459, "num_input_tokens_seen": 85166208, "step": 5293 }, { "epoch": 0.37083541289062844, "grad_norm": 3.83375883102417, "learning_rate": 6.295127145359018e-05, "loss": 1.1273, "num_input_tokens_seen": 85182592, "step": 5294 }, { "epoch": 0.37090546113635764, "grad_norm": 6.339621067047119, "learning_rate": 6.294427320490369e-05, "loss": 1.0995, "num_input_tokens_seen": 85197512, "step": 5295 }, { "epoch": 0.3709755093820869, "grad_norm": 3.931565046310425, "learning_rate": 6.293727495621716e-05, "loss": 0.9326, "num_input_tokens_seen": 85213800, "step": 5296 }, { "epoch": 0.37104555762781616, "grad_norm": 4.46995210647583, "learning_rate": 6.293027670753065e-05, "loss": 1.0782, "num_input_tokens_seen": 85229528, "step": 5297 }, { "epoch": 0.3711156058735454, "grad_norm": 4.4390363693237305, "learning_rate": 6.292327845884414e-05, "loss": 1.1976, "num_input_tokens_seen": 85245912, "step": 5298 }, { "epoch": 0.3711856541192747, "grad_norm": 4.089926719665527, "learning_rate": 6.291628021015763e-05, "loss": 1.037, "num_input_tokens_seen": 85262296, "step": 5299 }, { "epoch": 0.3712557023650039, "grad_norm": 4.190539360046387, "learning_rate": 6.29092819614711e-05, "loss": 1.1928, "num_input_tokens_seen": 85278560, "step": 5300 }, { "epoch": 0.37132575061073314, "grad_norm": 5.1102166175842285, "learning_rate": 6.290228371278459e-05, "loss": 0.8734, "num_input_tokens_seen": 85294944, "step": 5301 }, { "epoch": 0.3713957988564624, "grad_norm": 4.174960136413574, "learning_rate": 6.289528546409808e-05, "loss": 1.0425, "num_input_tokens_seen": 85311328, "step": 5302 }, { "epoch": 0.37146584710219166, "grad_norm": 3.8785698413848877, "learning_rate": 6.288828721541157e-05, "loss": 1.0008, "num_input_tokens_seen": 85326784, "step": 5303 }, { "epoch": 0.37153589534792086, "grad_norm": 3.728626251220703, "learning_rate": 6.288128896672504e-05, "loss": 1.1116, "num_input_tokens_seen": 85343168, "step": 5304 }, { "epoch": 0.3716059435936501, "grad_norm": 5.1877312660217285, "learning_rate": 6.287429071803853e-05, "loss": 1.0917, "num_input_tokens_seen": 85359552, "step": 5305 }, { "epoch": 0.3716759918393794, "grad_norm": 5.751648902893066, "learning_rate": 6.286729246935202e-05, "loss": 1.2662, "num_input_tokens_seen": 85375136, "step": 5306 }, { "epoch": 0.37174604008510864, "grad_norm": 3.7917258739471436, "learning_rate": 6.28602942206655e-05, "loss": 0.8499, "num_input_tokens_seen": 85391520, "step": 5307 }, { "epoch": 0.37181608833083785, "grad_norm": 4.268946647644043, "learning_rate": 6.285329597197898e-05, "loss": 1.0928, "num_input_tokens_seen": 85406848, "step": 5308 }, { "epoch": 0.3718861365765671, "grad_norm": 4.350981712341309, "learning_rate": 6.284629772329247e-05, "loss": 1.1725, "num_input_tokens_seen": 85423232, "step": 5309 }, { "epoch": 0.37195618482229637, "grad_norm": 3.8072032928466797, "learning_rate": 6.283929947460596e-05, "loss": 0.9999, "num_input_tokens_seen": 85439616, "step": 5310 }, { "epoch": 0.3720262330680256, "grad_norm": 4.0531697273254395, "learning_rate": 6.283230122591945e-05, "loss": 0.9389, "num_input_tokens_seen": 85456000, "step": 5311 }, { "epoch": 0.37209628131375483, "grad_norm": 5.18675422668457, "learning_rate": 6.282530297723294e-05, "loss": 1.0504, "num_input_tokens_seen": 85472384, "step": 5312 }, { "epoch": 0.3721663295594841, "grad_norm": 4.675386428833008, "learning_rate": 6.281830472854641e-05, "loss": 0.8373, "num_input_tokens_seen": 85488544, "step": 5313 }, { "epoch": 0.37223637780521335, "grad_norm": 6.522333145141602, "learning_rate": 6.281130647985989e-05, "loss": 0.9685, "num_input_tokens_seen": 85504352, "step": 5314 }, { "epoch": 0.3723064260509426, "grad_norm": 3.9266233444213867, "learning_rate": 6.280430823117338e-05, "loss": 1.0443, "num_input_tokens_seen": 85520688, "step": 5315 }, { "epoch": 0.3723764742966718, "grad_norm": 4.6428093910217285, "learning_rate": 6.279730998248687e-05, "loss": 0.9396, "num_input_tokens_seen": 85537072, "step": 5316 }, { "epoch": 0.37244652254240107, "grad_norm": 3.6043691635131836, "learning_rate": 6.279031173380035e-05, "loss": 0.903, "num_input_tokens_seen": 85553456, "step": 5317 }, { "epoch": 0.37251657078813033, "grad_norm": 3.4878151416778564, "learning_rate": 6.278331348511384e-05, "loss": 1.101, "num_input_tokens_seen": 85569824, "step": 5318 }, { "epoch": 0.3725866190338596, "grad_norm": 4.275106906890869, "learning_rate": 6.277631523642733e-05, "loss": 0.8912, "num_input_tokens_seen": 85586208, "step": 5319 }, { "epoch": 0.3726566672795888, "grad_norm": 7.615388870239258, "learning_rate": 6.27693169877408e-05, "loss": 1.0786, "num_input_tokens_seen": 85600984, "step": 5320 }, { "epoch": 0.37272671552531805, "grad_norm": 4.4750752449035645, "learning_rate": 6.276231873905428e-05, "loss": 1.1369, "num_input_tokens_seen": 85617368, "step": 5321 }, { "epoch": 0.3727967637710473, "grad_norm": 3.7900373935699463, "learning_rate": 6.275532049036778e-05, "loss": 1.0727, "num_input_tokens_seen": 85633304, "step": 5322 }, { "epoch": 0.37286681201677657, "grad_norm": 8.58016300201416, "learning_rate": 6.274832224168127e-05, "loss": 1.0942, "num_input_tokens_seen": 85648592, "step": 5323 }, { "epoch": 0.3729368602625058, "grad_norm": 3.847476005554199, "learning_rate": 6.274132399299475e-05, "loss": 1.2543, "num_input_tokens_seen": 85664976, "step": 5324 }, { "epoch": 0.37300690850823504, "grad_norm": 3.68683123588562, "learning_rate": 6.273432574430824e-05, "loss": 1.1331, "num_input_tokens_seen": 85681360, "step": 5325 }, { "epoch": 0.3730769567539643, "grad_norm": 4.07316255569458, "learning_rate": 6.272732749562172e-05, "loss": 1.1859, "num_input_tokens_seen": 85697744, "step": 5326 }, { "epoch": 0.37314700499969355, "grad_norm": 3.7817749977111816, "learning_rate": 6.27203292469352e-05, "loss": 1.128, "num_input_tokens_seen": 85713680, "step": 5327 }, { "epoch": 0.37321705324542276, "grad_norm": 3.8322465419769287, "learning_rate": 6.271333099824869e-05, "loss": 1.1804, "num_input_tokens_seen": 85730064, "step": 5328 }, { "epoch": 0.373287101491152, "grad_norm": 5.689653396606445, "learning_rate": 6.270633274956218e-05, "loss": 1.0848, "num_input_tokens_seen": 85745904, "step": 5329 }, { "epoch": 0.3733571497368813, "grad_norm": 5.568809509277344, "learning_rate": 6.269933450087566e-05, "loss": 0.9887, "num_input_tokens_seen": 85762288, "step": 5330 }, { "epoch": 0.37342719798261054, "grad_norm": 3.982375383377075, "learning_rate": 6.269233625218914e-05, "loss": 0.9975, "num_input_tokens_seen": 85778672, "step": 5331 }, { "epoch": 0.3734972462283398, "grad_norm": 3.430204391479492, "learning_rate": 6.268533800350263e-05, "loss": 1.0241, "num_input_tokens_seen": 85795056, "step": 5332 }, { "epoch": 0.373567294474069, "grad_norm": 3.465724229812622, "learning_rate": 6.267833975481612e-05, "loss": 0.9229, "num_input_tokens_seen": 85811392, "step": 5333 }, { "epoch": 0.37363734271979826, "grad_norm": 3.837188482284546, "learning_rate": 6.267134150612959e-05, "loss": 1.1354, "num_input_tokens_seen": 85827016, "step": 5334 }, { "epoch": 0.3737073909655275, "grad_norm": 7.360764980316162, "learning_rate": 6.266434325744308e-05, "loss": 1.0209, "num_input_tokens_seen": 85842040, "step": 5335 }, { "epoch": 0.3737774392112568, "grad_norm": 3.567553997039795, "learning_rate": 6.265734500875657e-05, "loss": 1.0502, "num_input_tokens_seen": 85858424, "step": 5336 }, { "epoch": 0.373847487456986, "grad_norm": 4.564986705780029, "learning_rate": 6.265034676007006e-05, "loss": 1.0178, "num_input_tokens_seen": 85874808, "step": 5337 }, { "epoch": 0.37391753570271524, "grad_norm": 3.4568405151367188, "learning_rate": 6.264334851138355e-05, "loss": 0.9245, "num_input_tokens_seen": 85890672, "step": 5338 }, { "epoch": 0.3739875839484445, "grad_norm": 3.723557233810425, "learning_rate": 6.263635026269704e-05, "loss": 1.0175, "num_input_tokens_seen": 85906920, "step": 5339 }, { "epoch": 0.37405763219417376, "grad_norm": 3.5800676345825195, "learning_rate": 6.262935201401051e-05, "loss": 0.9726, "num_input_tokens_seen": 85923304, "step": 5340 }, { "epoch": 0.37412768043990297, "grad_norm": 3.8996667861938477, "learning_rate": 6.262235376532399e-05, "loss": 1.2368, "num_input_tokens_seen": 85938984, "step": 5341 }, { "epoch": 0.3741977286856322, "grad_norm": 3.417182207107544, "learning_rate": 6.261535551663747e-05, "loss": 1.0959, "num_input_tokens_seen": 85955368, "step": 5342 }, { "epoch": 0.3742677769313615, "grad_norm": 4.214803695678711, "learning_rate": 6.260835726795098e-05, "loss": 1.1107, "num_input_tokens_seen": 85971320, "step": 5343 }, { "epoch": 0.37433782517709074, "grad_norm": 3.7782840728759766, "learning_rate": 6.260135901926445e-05, "loss": 0.9455, "num_input_tokens_seen": 85987704, "step": 5344 }, { "epoch": 0.37440787342281995, "grad_norm": 3.6186842918395996, "learning_rate": 6.259436077057794e-05, "loss": 1.0682, "num_input_tokens_seen": 86004088, "step": 5345 }, { "epoch": 0.3744779216685492, "grad_norm": 4.2028913497924805, "learning_rate": 6.258736252189143e-05, "loss": 1.2203, "num_input_tokens_seen": 86020472, "step": 5346 }, { "epoch": 0.37454796991427847, "grad_norm": 4.17422342300415, "learning_rate": 6.25803642732049e-05, "loss": 1.2483, "num_input_tokens_seen": 86036856, "step": 5347 }, { "epoch": 0.3746180181600077, "grad_norm": 3.3578243255615234, "learning_rate": 6.257336602451838e-05, "loss": 1.0315, "num_input_tokens_seen": 86053224, "step": 5348 }, { "epoch": 0.37468806640573693, "grad_norm": 4.105921268463135, "learning_rate": 6.256636777583188e-05, "loss": 1.0552, "num_input_tokens_seen": 86069272, "step": 5349 }, { "epoch": 0.3747581146514662, "grad_norm": 3.7420692443847656, "learning_rate": 6.255936952714537e-05, "loss": 1.0672, "num_input_tokens_seen": 86085656, "step": 5350 }, { "epoch": 0.37482816289719545, "grad_norm": 5.1573872566223145, "learning_rate": 6.255237127845884e-05, "loss": 1.376, "num_input_tokens_seen": 86102040, "step": 5351 }, { "epoch": 0.3748982111429247, "grad_norm": 3.9844436645507812, "learning_rate": 6.254537302977233e-05, "loss": 1.0042, "num_input_tokens_seen": 86117976, "step": 5352 }, { "epoch": 0.3749682593886539, "grad_norm": 3.6582653522491455, "learning_rate": 6.253837478108582e-05, "loss": 0.9786, "num_input_tokens_seen": 86134360, "step": 5353 }, { "epoch": 0.3750383076343832, "grad_norm": 4.814766883850098, "learning_rate": 6.25313765323993e-05, "loss": 1.2574, "num_input_tokens_seen": 86150208, "step": 5354 }, { "epoch": 0.37510835588011243, "grad_norm": 4.7514262199401855, "learning_rate": 6.252437828371278e-05, "loss": 1.071, "num_input_tokens_seen": 86165672, "step": 5355 }, { "epoch": 0.3751784041258417, "grad_norm": 3.9450578689575195, "learning_rate": 6.251738003502627e-05, "loss": 1.1295, "num_input_tokens_seen": 86182056, "step": 5356 }, { "epoch": 0.3752484523715709, "grad_norm": 3.5215647220611572, "learning_rate": 6.251038178633976e-05, "loss": 1.04, "num_input_tokens_seen": 86198440, "step": 5357 }, { "epoch": 0.37531850061730015, "grad_norm": 3.805070161819458, "learning_rate": 6.250338353765324e-05, "loss": 1.036, "num_input_tokens_seen": 86214824, "step": 5358 }, { "epoch": 0.3753885488630294, "grad_norm": 4.033730983734131, "learning_rate": 6.249638528896673e-05, "loss": 1.092, "num_input_tokens_seen": 86231208, "step": 5359 }, { "epoch": 0.3754585971087587, "grad_norm": 3.8157355785369873, "learning_rate": 6.248938704028021e-05, "loss": 1.0032, "num_input_tokens_seen": 86247392, "step": 5360 }, { "epoch": 0.3755286453544879, "grad_norm": 4.832013130187988, "learning_rate": 6.248238879159369e-05, "loss": 1.0711, "num_input_tokens_seen": 86263776, "step": 5361 }, { "epoch": 0.37559869360021714, "grad_norm": 3.753471612930298, "learning_rate": 6.247539054290718e-05, "loss": 1.0532, "num_input_tokens_seen": 86279912, "step": 5362 }, { "epoch": 0.3756687418459464, "grad_norm": 8.569518089294434, "learning_rate": 6.246839229422068e-05, "loss": 1.1073, "num_input_tokens_seen": 86296296, "step": 5363 }, { "epoch": 0.37573879009167566, "grad_norm": 4.399802207946777, "learning_rate": 6.246139404553416e-05, "loss": 1.1484, "num_input_tokens_seen": 86312680, "step": 5364 }, { "epoch": 0.37580883833740486, "grad_norm": 4.230834484100342, "learning_rate": 6.245439579684764e-05, "loss": 1.0905, "num_input_tokens_seen": 86329064, "step": 5365 }, { "epoch": 0.3758788865831341, "grad_norm": 4.750765800476074, "learning_rate": 6.244739754816113e-05, "loss": 1.2126, "num_input_tokens_seen": 86345448, "step": 5366 }, { "epoch": 0.3759489348288634, "grad_norm": 6.567142963409424, "learning_rate": 6.244039929947461e-05, "loss": 1.314, "num_input_tokens_seen": 86361272, "step": 5367 }, { "epoch": 0.37601898307459264, "grad_norm": 3.9668781757354736, "learning_rate": 6.243340105078808e-05, "loss": 1.0427, "num_input_tokens_seen": 86377448, "step": 5368 }, { "epoch": 0.3760890313203219, "grad_norm": 4.619864463806152, "learning_rate": 6.242640280210158e-05, "loss": 1.0687, "num_input_tokens_seen": 86393600, "step": 5369 }, { "epoch": 0.3761590795660511, "grad_norm": 6.837228298187256, "learning_rate": 6.241940455341507e-05, "loss": 0.9225, "num_input_tokens_seen": 86409896, "step": 5370 }, { "epoch": 0.37622912781178036, "grad_norm": 4.634070873260498, "learning_rate": 6.241240630472855e-05, "loss": 1.0147, "num_input_tokens_seen": 86426280, "step": 5371 }, { "epoch": 0.3762991760575096, "grad_norm": 3.944580554962158, "learning_rate": 6.240540805604204e-05, "loss": 1.144, "num_input_tokens_seen": 86442640, "step": 5372 }, { "epoch": 0.3763692243032389, "grad_norm": 7.016427516937256, "learning_rate": 6.239840980735553e-05, "loss": 1.0016, "num_input_tokens_seen": 86459024, "step": 5373 }, { "epoch": 0.3764392725489681, "grad_norm": 3.9997384548187256, "learning_rate": 6.2391411558669e-05, "loss": 0.9382, "num_input_tokens_seen": 86475408, "step": 5374 }, { "epoch": 0.37650932079469734, "grad_norm": 4.016181945800781, "learning_rate": 6.238441330998249e-05, "loss": 1.1728, "num_input_tokens_seen": 86491680, "step": 5375 }, { "epoch": 0.3765793690404266, "grad_norm": 4.19748592376709, "learning_rate": 6.237741506129598e-05, "loss": 1.161, "num_input_tokens_seen": 86507768, "step": 5376 }, { "epoch": 0.37664941728615586, "grad_norm": 4.579540252685547, "learning_rate": 6.237041681260947e-05, "loss": 1.0014, "num_input_tokens_seen": 86524040, "step": 5377 }, { "epoch": 0.37671946553188507, "grad_norm": 3.784952402114868, "learning_rate": 6.236341856392294e-05, "loss": 1.0435, "num_input_tokens_seen": 86540424, "step": 5378 }, { "epoch": 0.3767895137776143, "grad_norm": 5.813356876373291, "learning_rate": 6.235642031523643e-05, "loss": 0.9772, "num_input_tokens_seen": 86556360, "step": 5379 }, { "epoch": 0.3768595620233436, "grad_norm": 4.314088344573975, "learning_rate": 6.234942206654992e-05, "loss": 1.2318, "num_input_tokens_seen": 86572744, "step": 5380 }, { "epoch": 0.37692961026907285, "grad_norm": 3.898298740386963, "learning_rate": 6.23424238178634e-05, "loss": 1.1217, "num_input_tokens_seen": 86588888, "step": 5381 }, { "epoch": 0.37699965851480205, "grad_norm": 3.514692544937134, "learning_rate": 6.233542556917688e-05, "loss": 0.9526, "num_input_tokens_seen": 86605272, "step": 5382 }, { "epoch": 0.3770697067605313, "grad_norm": 3.7073886394500732, "learning_rate": 6.232842732049038e-05, "loss": 1.1199, "num_input_tokens_seen": 86621656, "step": 5383 }, { "epoch": 0.37713975500626057, "grad_norm": 3.9826815128326416, "learning_rate": 6.232142907180386e-05, "loss": 1.1417, "num_input_tokens_seen": 86638040, "step": 5384 }, { "epoch": 0.37720980325198983, "grad_norm": 3.6563196182250977, "learning_rate": 6.231443082311733e-05, "loss": 0.888, "num_input_tokens_seen": 86654424, "step": 5385 }, { "epoch": 0.37727985149771903, "grad_norm": 3.5995571613311768, "learning_rate": 6.230743257443082e-05, "loss": 1.0457, "num_input_tokens_seen": 86670328, "step": 5386 }, { "epoch": 0.3773498997434483, "grad_norm": 4.254338264465332, "learning_rate": 6.230043432574431e-05, "loss": 0.94, "num_input_tokens_seen": 86685960, "step": 5387 }, { "epoch": 0.37741994798917755, "grad_norm": 3.689716100692749, "learning_rate": 6.229343607705779e-05, "loss": 0.81, "num_input_tokens_seen": 86702008, "step": 5388 }, { "epoch": 0.3774899962349068, "grad_norm": 3.4042210578918457, "learning_rate": 6.228643782837129e-05, "loss": 1.0077, "num_input_tokens_seen": 86718392, "step": 5389 }, { "epoch": 0.377560044480636, "grad_norm": 4.607806205749512, "learning_rate": 6.227943957968478e-05, "loss": 1.2891, "num_input_tokens_seen": 86734624, "step": 5390 }, { "epoch": 0.3776300927263653, "grad_norm": 3.951362133026123, "learning_rate": 6.227244133099825e-05, "loss": 1.0501, "num_input_tokens_seen": 86749816, "step": 5391 }, { "epoch": 0.37770014097209453, "grad_norm": 3.535480260848999, "learning_rate": 6.226544308231174e-05, "loss": 0.8942, "num_input_tokens_seen": 86765800, "step": 5392 }, { "epoch": 0.3777701892178238, "grad_norm": 5.398930549621582, "learning_rate": 6.225844483362523e-05, "loss": 1.1322, "num_input_tokens_seen": 86782184, "step": 5393 }, { "epoch": 0.377840237463553, "grad_norm": 4.456240177154541, "learning_rate": 6.22514465849387e-05, "loss": 1.1725, "num_input_tokens_seen": 86798568, "step": 5394 }, { "epoch": 0.37791028570928226, "grad_norm": 3.8764703273773193, "learning_rate": 6.224444833625219e-05, "loss": 1.0041, "num_input_tokens_seen": 86814824, "step": 5395 }, { "epoch": 0.3779803339550115, "grad_norm": 3.8746144771575928, "learning_rate": 6.223745008756568e-05, "loss": 1.066, "num_input_tokens_seen": 86831208, "step": 5396 }, { "epoch": 0.3780503822007408, "grad_norm": 4.3454742431640625, "learning_rate": 6.223045183887917e-05, "loss": 1.1164, "num_input_tokens_seen": 86846872, "step": 5397 }, { "epoch": 0.37812043044647, "grad_norm": 5.006749153137207, "learning_rate": 6.222345359019265e-05, "loss": 0.8317, "num_input_tokens_seen": 86863256, "step": 5398 }, { "epoch": 0.37819047869219924, "grad_norm": 3.7388808727264404, "learning_rate": 6.221645534150613e-05, "loss": 1.1562, "num_input_tokens_seen": 86879640, "step": 5399 }, { "epoch": 0.3782605269379285, "grad_norm": 4.515074253082275, "learning_rate": 6.220945709281962e-05, "loss": 1.0428, "num_input_tokens_seen": 86896024, "step": 5400 }, { "epoch": 0.3782605269379285, "eval_loss": 1.1279726028442383, "eval_runtime": 0.2024, "eval_samples_per_second": 4.94, "eval_steps_per_second": 4.94, "num_input_tokens_seen": 86896024, "step": 5400 }, { "epoch": 0.37833057518365776, "grad_norm": 3.5468356609344482, "learning_rate": 6.22024588441331e-05, "loss": 0.9858, "num_input_tokens_seen": 86912032, "step": 5401 }, { "epoch": 0.378400623429387, "grad_norm": 4.281546115875244, "learning_rate": 6.219546059544659e-05, "loss": 1.2335, "num_input_tokens_seen": 86928080, "step": 5402 }, { "epoch": 0.3784706716751162, "grad_norm": 4.247570037841797, "learning_rate": 6.218846234676009e-05, "loss": 1.0889, "num_input_tokens_seen": 86944424, "step": 5403 }, { "epoch": 0.3785407199208455, "grad_norm": 3.78439998626709, "learning_rate": 6.218146409807356e-05, "loss": 1.0476, "num_input_tokens_seen": 86960808, "step": 5404 }, { "epoch": 0.37861076816657474, "grad_norm": 4.174613952636719, "learning_rate": 6.217446584938704e-05, "loss": 1.2858, "num_input_tokens_seen": 86976472, "step": 5405 }, { "epoch": 0.378680816412304, "grad_norm": 4.759533882141113, "learning_rate": 6.216746760070053e-05, "loss": 0.9813, "num_input_tokens_seen": 86992856, "step": 5406 }, { "epoch": 0.3787508646580332, "grad_norm": 5.2616801261901855, "learning_rate": 6.216046935201402e-05, "loss": 1.1752, "num_input_tokens_seen": 87007936, "step": 5407 }, { "epoch": 0.37882091290376246, "grad_norm": 4.626899719238281, "learning_rate": 6.215347110332749e-05, "loss": 1.0348, "num_input_tokens_seen": 87023888, "step": 5408 }, { "epoch": 0.3788909611494917, "grad_norm": 3.7142221927642822, "learning_rate": 6.214647285464099e-05, "loss": 1.0051, "num_input_tokens_seen": 87040272, "step": 5409 }, { "epoch": 0.378961009395221, "grad_norm": 6.228342056274414, "learning_rate": 6.213947460595448e-05, "loss": 1.0807, "num_input_tokens_seen": 87056656, "step": 5410 }, { "epoch": 0.3790310576409502, "grad_norm": 3.7979259490966797, "learning_rate": 6.213247635726796e-05, "loss": 1.0051, "num_input_tokens_seen": 87073040, "step": 5411 }, { "epoch": 0.37910110588667945, "grad_norm": 3.903106927871704, "learning_rate": 6.212547810858143e-05, "loss": 1.0546, "num_input_tokens_seen": 87089344, "step": 5412 }, { "epoch": 0.3791711541324087, "grad_norm": 3.966651201248169, "learning_rate": 6.211847985989492e-05, "loss": 1.0678, "num_input_tokens_seen": 87105144, "step": 5413 }, { "epoch": 0.37924120237813796, "grad_norm": 4.070274829864502, "learning_rate": 6.211148161120841e-05, "loss": 1.1021, "num_input_tokens_seen": 87121528, "step": 5414 }, { "epoch": 0.37931125062386717, "grad_norm": 3.516997814178467, "learning_rate": 6.21044833625219e-05, "loss": 1.0112, "num_input_tokens_seen": 87137752, "step": 5415 }, { "epoch": 0.37938129886959643, "grad_norm": 4.28290319442749, "learning_rate": 6.209748511383539e-05, "loss": 1.2252, "num_input_tokens_seen": 87154136, "step": 5416 }, { "epoch": 0.3794513471153257, "grad_norm": 4.765808582305908, "learning_rate": 6.209048686514887e-05, "loss": 1.0135, "num_input_tokens_seen": 87170520, "step": 5417 }, { "epoch": 0.37952139536105495, "grad_norm": 3.8507494926452637, "learning_rate": 6.208348861646235e-05, "loss": 1.0304, "num_input_tokens_seen": 87186904, "step": 5418 }, { "epoch": 0.37959144360678415, "grad_norm": 7.46950626373291, "learning_rate": 6.207649036777584e-05, "loss": 1.1376, "num_input_tokens_seen": 87203288, "step": 5419 }, { "epoch": 0.3796614918525134, "grad_norm": 5.770944595336914, "learning_rate": 6.206949211908933e-05, "loss": 1.23, "num_input_tokens_seen": 87219552, "step": 5420 }, { "epoch": 0.37973154009824267, "grad_norm": 3.752936363220215, "learning_rate": 6.20624938704028e-05, "loss": 0.8285, "num_input_tokens_seen": 87235736, "step": 5421 }, { "epoch": 0.37980158834397193, "grad_norm": 3.8336403369903564, "learning_rate": 6.205549562171629e-05, "loss": 0.8416, "num_input_tokens_seen": 87252120, "step": 5422 }, { "epoch": 0.37987163658970113, "grad_norm": 7.380855083465576, "learning_rate": 6.204849737302978e-05, "loss": 1.1149, "num_input_tokens_seen": 87268504, "step": 5423 }, { "epoch": 0.3799416848354304, "grad_norm": 4.780874729156494, "learning_rate": 6.204149912434327e-05, "loss": 0.9103, "num_input_tokens_seen": 87284888, "step": 5424 }, { "epoch": 0.38001173308115965, "grad_norm": 4.691160202026367, "learning_rate": 6.203450087565674e-05, "loss": 1.1994, "num_input_tokens_seen": 87301272, "step": 5425 }, { "epoch": 0.3800817813268889, "grad_norm": 3.592348098754883, "learning_rate": 6.202750262697023e-05, "loss": 1.017, "num_input_tokens_seen": 87317288, "step": 5426 }, { "epoch": 0.3801518295726181, "grad_norm": 4.750811576843262, "learning_rate": 6.202050437828372e-05, "loss": 1.2781, "num_input_tokens_seen": 87332488, "step": 5427 }, { "epoch": 0.3802218778183474, "grad_norm": 4.564239501953125, "learning_rate": 6.20135061295972e-05, "loss": 1.133, "num_input_tokens_seen": 87348264, "step": 5428 }, { "epoch": 0.38029192606407664, "grad_norm": 4.697380065917969, "learning_rate": 6.200650788091068e-05, "loss": 1.0091, "num_input_tokens_seen": 87363920, "step": 5429 }, { "epoch": 0.3803619743098059, "grad_norm": 4.026552677154541, "learning_rate": 6.199950963222419e-05, "loss": 1.2177, "num_input_tokens_seen": 87379920, "step": 5430 }, { "epoch": 0.3804320225555351, "grad_norm": 5.023289203643799, "learning_rate": 6.199251138353766e-05, "loss": 0.9822, "num_input_tokens_seen": 87395848, "step": 5431 }, { "epoch": 0.38050207080126436, "grad_norm": 3.6005523204803467, "learning_rate": 6.198551313485114e-05, "loss": 1.0709, "num_input_tokens_seen": 87411632, "step": 5432 }, { "epoch": 0.3805721190469936, "grad_norm": 4.094357967376709, "learning_rate": 6.197851488616462e-05, "loss": 1.1254, "num_input_tokens_seen": 87426912, "step": 5433 }, { "epoch": 0.3806421672927229, "grad_norm": 4.452909469604492, "learning_rate": 6.197151663747811e-05, "loss": 1.248, "num_input_tokens_seen": 87443296, "step": 5434 }, { "epoch": 0.3807122155384521, "grad_norm": 3.975532054901123, "learning_rate": 6.19645183887916e-05, "loss": 1.0786, "num_input_tokens_seen": 87459680, "step": 5435 }, { "epoch": 0.38078226378418134, "grad_norm": 4.745920181274414, "learning_rate": 6.195752014010509e-05, "loss": 1.2534, "num_input_tokens_seen": 87476064, "step": 5436 }, { "epoch": 0.3808523120299106, "grad_norm": 3.8793790340423584, "learning_rate": 6.195052189141858e-05, "loss": 1.1197, "num_input_tokens_seen": 87492448, "step": 5437 }, { "epoch": 0.38092236027563986, "grad_norm": 4.695518493652344, "learning_rate": 6.194352364273205e-05, "loss": 0.93, "num_input_tokens_seen": 87508832, "step": 5438 }, { "epoch": 0.3809924085213691, "grad_norm": 3.5820047855377197, "learning_rate": 6.193652539404553e-05, "loss": 1.0007, "num_input_tokens_seen": 87524728, "step": 5439 }, { "epoch": 0.3810624567670983, "grad_norm": 5.76292610168457, "learning_rate": 6.192952714535902e-05, "loss": 1.1919, "num_input_tokens_seen": 87540752, "step": 5440 }, { "epoch": 0.3811325050128276, "grad_norm": 4.334653377532959, "learning_rate": 6.19225288966725e-05, "loss": 0.9847, "num_input_tokens_seen": 87556384, "step": 5441 }, { "epoch": 0.38120255325855684, "grad_norm": 3.7438180446624756, "learning_rate": 6.1915530647986e-05, "loss": 1.0083, "num_input_tokens_seen": 87572320, "step": 5442 }, { "epoch": 0.3812726015042861, "grad_norm": 4.082560062408447, "learning_rate": 6.190853239929948e-05, "loss": 0.8908, "num_input_tokens_seen": 87588704, "step": 5443 }, { "epoch": 0.3813426497500153, "grad_norm": 3.9324755668640137, "learning_rate": 6.190153415061297e-05, "loss": 1.0991, "num_input_tokens_seen": 87605088, "step": 5444 }, { "epoch": 0.38141269799574457, "grad_norm": 5.329967498779297, "learning_rate": 6.189453590192645e-05, "loss": 1.1817, "num_input_tokens_seen": 87621472, "step": 5445 }, { "epoch": 0.3814827462414738, "grad_norm": 3.627267837524414, "learning_rate": 6.188753765323993e-05, "loss": 0.9407, "num_input_tokens_seen": 87637856, "step": 5446 }, { "epoch": 0.3815527944872031, "grad_norm": 3.6728835105895996, "learning_rate": 6.188053940455342e-05, "loss": 0.8623, "num_input_tokens_seen": 87653720, "step": 5447 }, { "epoch": 0.3816228427329323, "grad_norm": 3.556185245513916, "learning_rate": 6.18735411558669e-05, "loss": 0.9531, "num_input_tokens_seen": 87670104, "step": 5448 }, { "epoch": 0.38169289097866155, "grad_norm": 4.075231552124023, "learning_rate": 6.186654290718039e-05, "loss": 1.0284, "num_input_tokens_seen": 87686488, "step": 5449 }, { "epoch": 0.3817629392243908, "grad_norm": 3.981752395629883, "learning_rate": 6.185954465849388e-05, "loss": 1.0822, "num_input_tokens_seen": 87702872, "step": 5450 }, { "epoch": 0.38183298747012007, "grad_norm": 4.75683069229126, "learning_rate": 6.185254640980736e-05, "loss": 0.9611, "num_input_tokens_seen": 87718912, "step": 5451 }, { "epoch": 0.38190303571584927, "grad_norm": 6.081716060638428, "learning_rate": 6.184554816112084e-05, "loss": 1.092, "num_input_tokens_seen": 87735160, "step": 5452 }, { "epoch": 0.38197308396157853, "grad_norm": 6.651247978210449, "learning_rate": 6.183854991243433e-05, "loss": 1.0397, "num_input_tokens_seen": 87749232, "step": 5453 }, { "epoch": 0.3820431322073078, "grad_norm": 4.12028694152832, "learning_rate": 6.183155166374782e-05, "loss": 1.094, "num_input_tokens_seen": 87765328, "step": 5454 }, { "epoch": 0.38211318045303705, "grad_norm": 6.3344645500183105, "learning_rate": 6.18245534150613e-05, "loss": 1.0275, "num_input_tokens_seen": 87781712, "step": 5455 }, { "epoch": 0.38218322869876625, "grad_norm": 3.745476007461548, "learning_rate": 6.181755516637478e-05, "loss": 0.9485, "num_input_tokens_seen": 87798032, "step": 5456 }, { "epoch": 0.3822532769444955, "grad_norm": 3.515174388885498, "learning_rate": 6.181055691768828e-05, "loss": 1.1138, "num_input_tokens_seen": 87814416, "step": 5457 }, { "epoch": 0.38232332519022477, "grad_norm": 4.101998329162598, "learning_rate": 6.180355866900176e-05, "loss": 0.9787, "num_input_tokens_seen": 87830504, "step": 5458 }, { "epoch": 0.38239337343595403, "grad_norm": 4.045940399169922, "learning_rate": 6.179656042031523e-05, "loss": 1.1278, "num_input_tokens_seen": 87846264, "step": 5459 }, { "epoch": 0.38246342168168324, "grad_norm": 8.09753131866455, "learning_rate": 6.178956217162872e-05, "loss": 1.131, "num_input_tokens_seen": 87861856, "step": 5460 }, { "epoch": 0.3825334699274125, "grad_norm": 5.395979404449463, "learning_rate": 6.178256392294221e-05, "loss": 1.0364, "num_input_tokens_seen": 87878024, "step": 5461 }, { "epoch": 0.38260351817314175, "grad_norm": 3.452855110168457, "learning_rate": 6.17755656742557e-05, "loss": 0.8875, "num_input_tokens_seen": 87894408, "step": 5462 }, { "epoch": 0.382673566418871, "grad_norm": 3.9877512454986572, "learning_rate": 6.176856742556919e-05, "loss": 1.0349, "num_input_tokens_seen": 87910400, "step": 5463 }, { "epoch": 0.3827436146646002, "grad_norm": 3.9095492362976074, "learning_rate": 6.176156917688268e-05, "loss": 1.063, "num_input_tokens_seen": 87926040, "step": 5464 }, { "epoch": 0.3828136629103295, "grad_norm": 4.558162212371826, "learning_rate": 6.175457092819615e-05, "loss": 1.1126, "num_input_tokens_seen": 87942424, "step": 5465 }, { "epoch": 0.38288371115605874, "grad_norm": 3.786123275756836, "learning_rate": 6.174757267950963e-05, "loss": 1.0414, "num_input_tokens_seen": 87958808, "step": 5466 }, { "epoch": 0.382953759401788, "grad_norm": 4.0291056632995605, "learning_rate": 6.174057443082311e-05, "loss": 1.0603, "num_input_tokens_seen": 87975192, "step": 5467 }, { "epoch": 0.3830238076475172, "grad_norm": 3.698666572570801, "learning_rate": 6.17335761821366e-05, "loss": 0.9187, "num_input_tokens_seen": 87991504, "step": 5468 }, { "epoch": 0.38309385589324646, "grad_norm": 3.7802882194519043, "learning_rate": 6.172657793345009e-05, "loss": 0.9568, "num_input_tokens_seen": 88007888, "step": 5469 }, { "epoch": 0.3831639041389757, "grad_norm": 4.754447937011719, "learning_rate": 6.171957968476358e-05, "loss": 1.1788, "num_input_tokens_seen": 88023832, "step": 5470 }, { "epoch": 0.383233952384705, "grad_norm": 3.502560615539551, "learning_rate": 6.171258143607707e-05, "loss": 0.9993, "num_input_tokens_seen": 88040216, "step": 5471 }, { "epoch": 0.38330400063043424, "grad_norm": 4.379989147186279, "learning_rate": 6.170558318739054e-05, "loss": 1.0609, "num_input_tokens_seen": 88055768, "step": 5472 }, { "epoch": 0.38337404887616344, "grad_norm": 3.3798177242279053, "learning_rate": 6.169858493870403e-05, "loss": 0.6884, "num_input_tokens_seen": 88072152, "step": 5473 }, { "epoch": 0.3834440971218927, "grad_norm": 4.265483856201172, "learning_rate": 6.169158669001752e-05, "loss": 1.0405, "num_input_tokens_seen": 88087816, "step": 5474 }, { "epoch": 0.38351414536762196, "grad_norm": 4.468397617340088, "learning_rate": 6.168458844133101e-05, "loss": 0.885, "num_input_tokens_seen": 88103160, "step": 5475 }, { "epoch": 0.3835841936133512, "grad_norm": 3.888359546661377, "learning_rate": 6.167759019264448e-05, "loss": 1.1768, "num_input_tokens_seen": 88119544, "step": 5476 }, { "epoch": 0.3836542418590804, "grad_norm": 3.7953927516937256, "learning_rate": 6.167059194395797e-05, "loss": 1.1585, "num_input_tokens_seen": 88135928, "step": 5477 }, { "epoch": 0.3837242901048097, "grad_norm": 3.7742021083831787, "learning_rate": 6.166359369527146e-05, "loss": 0.9201, "num_input_tokens_seen": 88151928, "step": 5478 }, { "epoch": 0.38379433835053894, "grad_norm": 3.811535120010376, "learning_rate": 6.165659544658494e-05, "loss": 0.9211, "num_input_tokens_seen": 88168312, "step": 5479 }, { "epoch": 0.3838643865962682, "grad_norm": 5.1758646965026855, "learning_rate": 6.164959719789842e-05, "loss": 1.2087, "num_input_tokens_seen": 88184064, "step": 5480 }, { "epoch": 0.3839344348419974, "grad_norm": 4.529813289642334, "learning_rate": 6.164259894921191e-05, "loss": 0.9142, "num_input_tokens_seen": 88200216, "step": 5481 }, { "epoch": 0.38400448308772667, "grad_norm": 4.426999568939209, "learning_rate": 6.16356007005254e-05, "loss": 1.0046, "num_input_tokens_seen": 88215568, "step": 5482 }, { "epoch": 0.3840745313334559, "grad_norm": 4.73276948928833, "learning_rate": 6.162860245183888e-05, "loss": 1.0082, "num_input_tokens_seen": 88231952, "step": 5483 }, { "epoch": 0.3841445795791852, "grad_norm": 3.6280384063720703, "learning_rate": 6.162160420315238e-05, "loss": 0.9015, "num_input_tokens_seen": 88247728, "step": 5484 }, { "epoch": 0.3842146278249144, "grad_norm": 3.6947717666625977, "learning_rate": 6.161460595446585e-05, "loss": 0.9671, "num_input_tokens_seen": 88263472, "step": 5485 }, { "epoch": 0.38428467607064365, "grad_norm": 3.683591842651367, "learning_rate": 6.160760770577933e-05, "loss": 1.1844, "num_input_tokens_seen": 88279856, "step": 5486 }, { "epoch": 0.3843547243163729, "grad_norm": 6.020013332366943, "learning_rate": 6.160060945709282e-05, "loss": 1.0372, "num_input_tokens_seen": 88295864, "step": 5487 }, { "epoch": 0.38442477256210217, "grad_norm": 8.429437637329102, "learning_rate": 6.15936112084063e-05, "loss": 1.143, "num_input_tokens_seen": 88311752, "step": 5488 }, { "epoch": 0.3844948208078314, "grad_norm": 3.679159164428711, "learning_rate": 6.15866129597198e-05, "loss": 0.9212, "num_input_tokens_seen": 88327784, "step": 5489 }, { "epoch": 0.38456486905356063, "grad_norm": 4.131216526031494, "learning_rate": 6.157961471103328e-05, "loss": 1.0983, "num_input_tokens_seen": 88343480, "step": 5490 }, { "epoch": 0.3846349172992899, "grad_norm": 4.294956684112549, "learning_rate": 6.157261646234677e-05, "loss": 1.224, "num_input_tokens_seen": 88359864, "step": 5491 }, { "epoch": 0.38470496554501915, "grad_norm": 4.683321952819824, "learning_rate": 6.156561821366025e-05, "loss": 1.0377, "num_input_tokens_seen": 88375184, "step": 5492 }, { "epoch": 0.38477501379074835, "grad_norm": 3.932366371154785, "learning_rate": 6.155861996497372e-05, "loss": 1.1341, "num_input_tokens_seen": 88391568, "step": 5493 }, { "epoch": 0.3848450620364776, "grad_norm": 4.191849231719971, "learning_rate": 6.155162171628721e-05, "loss": 1.2247, "num_input_tokens_seen": 88407680, "step": 5494 }, { "epoch": 0.3849151102822069, "grad_norm": 3.983915090560913, "learning_rate": 6.154462346760071e-05, "loss": 1.0115, "num_input_tokens_seen": 88422888, "step": 5495 }, { "epoch": 0.38498515852793613, "grad_norm": 4.163250923156738, "learning_rate": 6.153762521891419e-05, "loss": 1.057, "num_input_tokens_seen": 88439272, "step": 5496 }, { "epoch": 0.38505520677366534, "grad_norm": 6.113068580627441, "learning_rate": 6.153062697022768e-05, "loss": 1.0971, "num_input_tokens_seen": 88455656, "step": 5497 }, { "epoch": 0.3851252550193946, "grad_norm": 5.32371187210083, "learning_rate": 6.152362872154117e-05, "loss": 1.1886, "num_input_tokens_seen": 88472040, "step": 5498 }, { "epoch": 0.38519530326512386, "grad_norm": 6.110095500946045, "learning_rate": 6.151663047285464e-05, "loss": 0.9587, "num_input_tokens_seen": 88487720, "step": 5499 }, { "epoch": 0.3852653515108531, "grad_norm": 3.9656851291656494, "learning_rate": 6.150963222416813e-05, "loss": 0.9635, "num_input_tokens_seen": 88504104, "step": 5500 }, { "epoch": 0.3853353997565823, "grad_norm": 4.601620197296143, "learning_rate": 6.150263397548162e-05, "loss": 1.2542, "num_input_tokens_seen": 88520160, "step": 5501 }, { "epoch": 0.3854054480023116, "grad_norm": 4.273797988891602, "learning_rate": 6.14956357267951e-05, "loss": 1.3405, "num_input_tokens_seen": 88535832, "step": 5502 }, { "epoch": 0.38547549624804084, "grad_norm": 4.023514747619629, "learning_rate": 6.148863747810858e-05, "loss": 1.1248, "num_input_tokens_seen": 88552000, "step": 5503 }, { "epoch": 0.3855455444937701, "grad_norm": 3.7229719161987305, "learning_rate": 6.148163922942207e-05, "loss": 1.1117, "num_input_tokens_seen": 88567600, "step": 5504 }, { "epoch": 0.38561559273949936, "grad_norm": 4.696394920349121, "learning_rate": 6.147464098073556e-05, "loss": 0.8945, "num_input_tokens_seen": 88583224, "step": 5505 }, { "epoch": 0.38568564098522856, "grad_norm": 5.354174613952637, "learning_rate": 6.146764273204903e-05, "loss": 1.079, "num_input_tokens_seen": 88599608, "step": 5506 }, { "epoch": 0.3857556892309578, "grad_norm": 4.717334747314453, "learning_rate": 6.146064448336252e-05, "loss": 1.1293, "num_input_tokens_seen": 88615048, "step": 5507 }, { "epoch": 0.3858257374766871, "grad_norm": 5.373983383178711, "learning_rate": 6.145364623467601e-05, "loss": 0.9625, "num_input_tokens_seen": 88630888, "step": 5508 }, { "epoch": 0.38589578572241634, "grad_norm": 4.338916301727295, "learning_rate": 6.14466479859895e-05, "loss": 1.0884, "num_input_tokens_seen": 88647072, "step": 5509 }, { "epoch": 0.38596583396814554, "grad_norm": 3.898721694946289, "learning_rate": 6.143964973730297e-05, "loss": 1.095, "num_input_tokens_seen": 88663128, "step": 5510 }, { "epoch": 0.3860358822138748, "grad_norm": 4.614948749542236, "learning_rate": 6.143265148861648e-05, "loss": 1.0729, "num_input_tokens_seen": 88679512, "step": 5511 }, { "epoch": 0.38610593045960406, "grad_norm": 5.1157732009887695, "learning_rate": 6.142565323992995e-05, "loss": 1.0776, "num_input_tokens_seen": 88695664, "step": 5512 }, { "epoch": 0.3861759787053333, "grad_norm": 4.29611873626709, "learning_rate": 6.141865499124343e-05, "loss": 1.0838, "num_input_tokens_seen": 88711560, "step": 5513 }, { "epoch": 0.3862460269510625, "grad_norm": 3.9464735984802246, "learning_rate": 6.141165674255692e-05, "loss": 1.1907, "num_input_tokens_seen": 88727464, "step": 5514 }, { "epoch": 0.3863160751967918, "grad_norm": 3.8381590843200684, "learning_rate": 6.140465849387042e-05, "loss": 1.1416, "num_input_tokens_seen": 88743848, "step": 5515 }, { "epoch": 0.38638612344252105, "grad_norm": 3.573434829711914, "learning_rate": 6.139766024518389e-05, "loss": 0.9316, "num_input_tokens_seen": 88759312, "step": 5516 }, { "epoch": 0.3864561716882503, "grad_norm": 4.257131576538086, "learning_rate": 6.139066199649738e-05, "loss": 0.9534, "num_input_tokens_seen": 88775112, "step": 5517 }, { "epoch": 0.3865262199339795, "grad_norm": 4.2985310554504395, "learning_rate": 6.138366374781087e-05, "loss": 0.9387, "num_input_tokens_seen": 88791496, "step": 5518 }, { "epoch": 0.38659626817970877, "grad_norm": 3.7012977600097656, "learning_rate": 6.137666549912434e-05, "loss": 1.0104, "num_input_tokens_seen": 88807880, "step": 5519 }, { "epoch": 0.38666631642543803, "grad_norm": 5.4860453605651855, "learning_rate": 6.136966725043782e-05, "loss": 1.1978, "num_input_tokens_seen": 88823392, "step": 5520 }, { "epoch": 0.3867363646711673, "grad_norm": 4.165813446044922, "learning_rate": 6.136266900175132e-05, "loss": 1.0184, "num_input_tokens_seen": 88839352, "step": 5521 }, { "epoch": 0.3868064129168965, "grad_norm": 3.6253862380981445, "learning_rate": 6.135567075306481e-05, "loss": 0.9544, "num_input_tokens_seen": 88855736, "step": 5522 }, { "epoch": 0.38687646116262575, "grad_norm": 3.834057331085205, "learning_rate": 6.134867250437829e-05, "loss": 1.1863, "num_input_tokens_seen": 88871952, "step": 5523 }, { "epoch": 0.386946509408355, "grad_norm": 4.534783363342285, "learning_rate": 6.134167425569177e-05, "loss": 0.85, "num_input_tokens_seen": 88888336, "step": 5524 }, { "epoch": 0.38701655765408427, "grad_norm": 5.4073381423950195, "learning_rate": 6.133467600700526e-05, "loss": 0.9257, "num_input_tokens_seen": 88904256, "step": 5525 }, { "epoch": 0.3870866058998135, "grad_norm": 3.819841146469116, "learning_rate": 6.132767775831874e-05, "loss": 1.0911, "num_input_tokens_seen": 88920640, "step": 5526 }, { "epoch": 0.38715665414554273, "grad_norm": 3.814857244491577, "learning_rate": 6.132067950963223e-05, "loss": 1.2414, "num_input_tokens_seen": 88937024, "step": 5527 }, { "epoch": 0.387226702391272, "grad_norm": 3.682535171508789, "learning_rate": 6.131368126094571e-05, "loss": 0.9158, "num_input_tokens_seen": 88952712, "step": 5528 }, { "epoch": 0.38729675063700125, "grad_norm": 3.5657262802124023, "learning_rate": 6.13066830122592e-05, "loss": 0.9336, "num_input_tokens_seen": 88969096, "step": 5529 }, { "epoch": 0.38736679888273046, "grad_norm": 3.851977825164795, "learning_rate": 6.129968476357268e-05, "loss": 1.0546, "num_input_tokens_seen": 88985480, "step": 5530 }, { "epoch": 0.3874368471284597, "grad_norm": 4.079189777374268, "learning_rate": 6.129268651488617e-05, "loss": 0.856, "num_input_tokens_seen": 89001104, "step": 5531 }, { "epoch": 0.387506895374189, "grad_norm": 4.388980865478516, "learning_rate": 6.128568826619966e-05, "loss": 1.0785, "num_input_tokens_seen": 89017232, "step": 5532 }, { "epoch": 0.38757694361991823, "grad_norm": 3.6747231483459473, "learning_rate": 6.127869001751313e-05, "loss": 1.1171, "num_input_tokens_seen": 89033576, "step": 5533 }, { "epoch": 0.38764699186564744, "grad_norm": 4.62367057800293, "learning_rate": 6.127169176882662e-05, "loss": 1.138, "num_input_tokens_seen": 89049224, "step": 5534 }, { "epoch": 0.3877170401113767, "grad_norm": 3.8601040840148926, "learning_rate": 6.126469352014011e-05, "loss": 1.0254, "num_input_tokens_seen": 89064968, "step": 5535 }, { "epoch": 0.38778708835710596, "grad_norm": 5.132208347320557, "learning_rate": 6.12576952714536e-05, "loss": 1.0121, "num_input_tokens_seen": 89081352, "step": 5536 }, { "epoch": 0.3878571366028352, "grad_norm": 3.9259984493255615, "learning_rate": 6.125069702276707e-05, "loss": 0.9146, "num_input_tokens_seen": 89097696, "step": 5537 }, { "epoch": 0.3879271848485644, "grad_norm": 3.9004077911376953, "learning_rate": 6.124369877408057e-05, "loss": 1.0059, "num_input_tokens_seen": 89114080, "step": 5538 }, { "epoch": 0.3879972330942937, "grad_norm": 4.657776355743408, "learning_rate": 6.123670052539405e-05, "loss": 1.0612, "num_input_tokens_seen": 89129584, "step": 5539 }, { "epoch": 0.38806728134002294, "grad_norm": 3.4758501052856445, "learning_rate": 6.122970227670752e-05, "loss": 1.0179, "num_input_tokens_seen": 89145968, "step": 5540 }, { "epoch": 0.3881373295857522, "grad_norm": 3.949275255203247, "learning_rate": 6.122270402802101e-05, "loss": 1.0336, "num_input_tokens_seen": 89161904, "step": 5541 }, { "epoch": 0.38820737783148146, "grad_norm": 5.620425224304199, "learning_rate": 6.121570577933451e-05, "loss": 0.9776, "num_input_tokens_seen": 89178032, "step": 5542 }, { "epoch": 0.38827742607721066, "grad_norm": 5.1215643882751465, "learning_rate": 6.120870753064799e-05, "loss": 1.0577, "num_input_tokens_seen": 89193568, "step": 5543 }, { "epoch": 0.3883474743229399, "grad_norm": 3.994556427001953, "learning_rate": 6.120170928196148e-05, "loss": 1.0631, "num_input_tokens_seen": 89209952, "step": 5544 }, { "epoch": 0.3884175225686692, "grad_norm": 6.86944055557251, "learning_rate": 6.119471103327497e-05, "loss": 1.208, "num_input_tokens_seen": 89226336, "step": 5545 }, { "epoch": 0.38848757081439844, "grad_norm": 3.72501540184021, "learning_rate": 6.118771278458844e-05, "loss": 1.0198, "num_input_tokens_seen": 89242720, "step": 5546 }, { "epoch": 0.38855761906012765, "grad_norm": 3.6887834072113037, "learning_rate": 6.118071453590193e-05, "loss": 1.0964, "num_input_tokens_seen": 89258536, "step": 5547 }, { "epoch": 0.3886276673058569, "grad_norm": 5.15130615234375, "learning_rate": 6.117371628721542e-05, "loss": 1.0193, "num_input_tokens_seen": 89274920, "step": 5548 }, { "epoch": 0.38869771555158616, "grad_norm": 3.7503981590270996, "learning_rate": 6.116671803852891e-05, "loss": 0.9457, "num_input_tokens_seen": 89291304, "step": 5549 }, { "epoch": 0.3887677637973154, "grad_norm": 4.851298809051514, "learning_rate": 6.115971978984238e-05, "loss": 1.132, "num_input_tokens_seen": 89307080, "step": 5550 }, { "epoch": 0.38883781204304463, "grad_norm": 3.72981858253479, "learning_rate": 6.115272154115587e-05, "loss": 1.0371, "num_input_tokens_seen": 89323464, "step": 5551 }, { "epoch": 0.3889078602887739, "grad_norm": 4.1301140785217285, "learning_rate": 6.114572329246936e-05, "loss": 0.9746, "num_input_tokens_seen": 89339696, "step": 5552 }, { "epoch": 0.38897790853450315, "grad_norm": 4.225720405578613, "learning_rate": 6.113872504378283e-05, "loss": 1.124, "num_input_tokens_seen": 89356080, "step": 5553 }, { "epoch": 0.3890479567802324, "grad_norm": 3.7197327613830566, "learning_rate": 6.113172679509632e-05, "loss": 1.0739, "num_input_tokens_seen": 89372464, "step": 5554 }, { "epoch": 0.3891180050259616, "grad_norm": 4.626903057098389, "learning_rate": 6.112472854640981e-05, "loss": 0.9896, "num_input_tokens_seen": 89388848, "step": 5555 }, { "epoch": 0.38918805327169087, "grad_norm": 4.229621410369873, "learning_rate": 6.111773029772329e-05, "loss": 0.9925, "num_input_tokens_seen": 89405112, "step": 5556 }, { "epoch": 0.38925810151742013, "grad_norm": 3.5502984523773193, "learning_rate": 6.111073204903678e-05, "loss": 0.9966, "num_input_tokens_seen": 89421496, "step": 5557 }, { "epoch": 0.3893281497631494, "grad_norm": 4.251241207122803, "learning_rate": 6.110373380035026e-05, "loss": 1.0249, "num_input_tokens_seen": 89437880, "step": 5558 }, { "epoch": 0.3893981980088786, "grad_norm": 5.076200485229492, "learning_rate": 6.109673555166375e-05, "loss": 1.0592, "num_input_tokens_seen": 89454264, "step": 5559 }, { "epoch": 0.38946824625460785, "grad_norm": 4.018000602722168, "learning_rate": 6.108973730297723e-05, "loss": 1.197, "num_input_tokens_seen": 89470648, "step": 5560 }, { "epoch": 0.3895382945003371, "grad_norm": 4.3367180824279785, "learning_rate": 6.108273905429072e-05, "loss": 1.0778, "num_input_tokens_seen": 89487032, "step": 5561 }, { "epoch": 0.38960834274606637, "grad_norm": 6.027153015136719, "learning_rate": 6.10757408056042e-05, "loss": 0.7955, "num_input_tokens_seen": 89502064, "step": 5562 }, { "epoch": 0.3896783909917956, "grad_norm": 3.499268054962158, "learning_rate": 6.10687425569177e-05, "loss": 0.935, "num_input_tokens_seen": 89518448, "step": 5563 }, { "epoch": 0.38974843923752484, "grad_norm": 3.3691868782043457, "learning_rate": 6.106174430823117e-05, "loss": 0.9249, "num_input_tokens_seen": 89534640, "step": 5564 }, { "epoch": 0.3898184874832541, "grad_norm": 3.4140114784240723, "learning_rate": 6.105474605954467e-05, "loss": 0.9594, "num_input_tokens_seen": 89551024, "step": 5565 }, { "epoch": 0.38988853572898335, "grad_norm": 4.049834728240967, "learning_rate": 6.104774781085815e-05, "loss": 1.1637, "num_input_tokens_seen": 89567408, "step": 5566 }, { "epoch": 0.38995858397471256, "grad_norm": 4.825027942657471, "learning_rate": 6.104074956217162e-05, "loss": 1.0514, "num_input_tokens_seen": 89583792, "step": 5567 }, { "epoch": 0.3900286322204418, "grad_norm": 5.281174659729004, "learning_rate": 6.103375131348512e-05, "loss": 1.0534, "num_input_tokens_seen": 89600176, "step": 5568 }, { "epoch": 0.3900986804661711, "grad_norm": 3.567270278930664, "learning_rate": 6.1026753064798605e-05, "loss": 0.9217, "num_input_tokens_seen": 89616560, "step": 5569 }, { "epoch": 0.39016872871190034, "grad_norm": 5.449852466583252, "learning_rate": 6.1019754816112086e-05, "loss": 1.0362, "num_input_tokens_seen": 89631968, "step": 5570 }, { "epoch": 0.39023877695762954, "grad_norm": 4.016347885131836, "learning_rate": 6.1012756567425575e-05, "loss": 0.9634, "num_input_tokens_seen": 89646712, "step": 5571 }, { "epoch": 0.3903088252033588, "grad_norm": 3.8826510906219482, "learning_rate": 6.100575831873906e-05, "loss": 1.1645, "num_input_tokens_seen": 89662776, "step": 5572 }, { "epoch": 0.39037887344908806, "grad_norm": 3.80755615234375, "learning_rate": 6.099876007005254e-05, "loss": 1.0404, "num_input_tokens_seen": 89679096, "step": 5573 }, { "epoch": 0.3904489216948173, "grad_norm": 3.7274065017700195, "learning_rate": 6.099176182136602e-05, "loss": 1.1665, "num_input_tokens_seen": 89695048, "step": 5574 }, { "epoch": 0.3905189699405466, "grad_norm": 4.335930824279785, "learning_rate": 6.09847635726795e-05, "loss": 1.0662, "num_input_tokens_seen": 89711432, "step": 5575 }, { "epoch": 0.3905890181862758, "grad_norm": 3.8839964866638184, "learning_rate": 6.0977765323993e-05, "loss": 1.0635, "num_input_tokens_seen": 89727712, "step": 5576 }, { "epoch": 0.39065906643200504, "grad_norm": 4.8028035163879395, "learning_rate": 6.097076707530648e-05, "loss": 1.0906, "num_input_tokens_seen": 89744096, "step": 5577 }, { "epoch": 0.3907291146777343, "grad_norm": 4.042201519012451, "learning_rate": 6.096376882661997e-05, "loss": 0.8609, "num_input_tokens_seen": 89758688, "step": 5578 }, { "epoch": 0.39079916292346356, "grad_norm": 4.1316986083984375, "learning_rate": 6.095677057793345e-05, "loss": 0.9509, "num_input_tokens_seen": 89774720, "step": 5579 }, { "epoch": 0.39086921116919277, "grad_norm": 5.164004802703857, "learning_rate": 6.094977232924693e-05, "loss": 1.1927, "num_input_tokens_seen": 89788480, "step": 5580 }, { "epoch": 0.390939259414922, "grad_norm": 4.125234127044678, "learning_rate": 6.094277408056043e-05, "loss": 0.9237, "num_input_tokens_seen": 89803000, "step": 5581 }, { "epoch": 0.3910093076606513, "grad_norm": 4.798699855804443, "learning_rate": 6.093577583187392e-05, "loss": 1.1069, "num_input_tokens_seen": 89818288, "step": 5582 }, { "epoch": 0.39107935590638054, "grad_norm": 4.383975028991699, "learning_rate": 6.0928777583187404e-05, "loss": 0.9997, "num_input_tokens_seen": 89833616, "step": 5583 }, { "epoch": 0.39114940415210975, "grad_norm": 4.20830774307251, "learning_rate": 6.092177933450087e-05, "loss": 0.8794, "num_input_tokens_seen": 89849200, "step": 5584 }, { "epoch": 0.391219452397839, "grad_norm": 4.470288276672363, "learning_rate": 6.091478108581437e-05, "loss": 1.0974, "num_input_tokens_seen": 89865184, "step": 5585 }, { "epoch": 0.39128950064356827, "grad_norm": 4.8457112312316895, "learning_rate": 6.090778283712785e-05, "loss": 1.1091, "num_input_tokens_seen": 89881184, "step": 5586 }, { "epoch": 0.3913595488892975, "grad_norm": 4.112722873687744, "learning_rate": 6.090078458844133e-05, "loss": 1.0189, "num_input_tokens_seen": 89897568, "step": 5587 }, { "epoch": 0.39142959713502673, "grad_norm": 4.070732116699219, "learning_rate": 6.089378633975482e-05, "loss": 1.0694, "num_input_tokens_seen": 89913952, "step": 5588 }, { "epoch": 0.391499645380756, "grad_norm": 4.092299461364746, "learning_rate": 6.08867880910683e-05, "loss": 1.0922, "num_input_tokens_seen": 89929216, "step": 5589 }, { "epoch": 0.39156969362648525, "grad_norm": 3.8092305660247803, "learning_rate": 6.08797898423818e-05, "loss": 1.0041, "num_input_tokens_seen": 89945376, "step": 5590 }, { "epoch": 0.3916397418722145, "grad_norm": 5.461154937744141, "learning_rate": 6.0872791593695265e-05, "loss": 1.1393, "num_input_tokens_seen": 89960296, "step": 5591 }, { "epoch": 0.3917097901179437, "grad_norm": 5.2103190422058105, "learning_rate": 6.086579334500877e-05, "loss": 1.079, "num_input_tokens_seen": 89975864, "step": 5592 }, { "epoch": 0.39177983836367297, "grad_norm": 3.6308488845825195, "learning_rate": 6.085879509632225e-05, "loss": 1.0344, "num_input_tokens_seen": 89991880, "step": 5593 }, { "epoch": 0.39184988660940223, "grad_norm": 4.632900714874268, "learning_rate": 6.0851796847635724e-05, "loss": 0.8255, "num_input_tokens_seen": 90006560, "step": 5594 }, { "epoch": 0.3919199348551315, "grad_norm": 3.8614165782928467, "learning_rate": 6.084479859894921e-05, "loss": 1.1599, "num_input_tokens_seen": 90022136, "step": 5595 }, { "epoch": 0.3919899831008607, "grad_norm": 3.768287420272827, "learning_rate": 6.0837800350262695e-05, "loss": 1.0694, "num_input_tokens_seen": 90038520, "step": 5596 }, { "epoch": 0.39206003134658995, "grad_norm": 3.355902671813965, "learning_rate": 6.083080210157618e-05, "loss": 0.944, "num_input_tokens_seen": 90054592, "step": 5597 }, { "epoch": 0.3921300795923192, "grad_norm": 3.2001609802246094, "learning_rate": 6.082380385288967e-05, "loss": 0.8642, "num_input_tokens_seen": 90070976, "step": 5598 }, { "epoch": 0.3922001278380485, "grad_norm": 3.74692440032959, "learning_rate": 6.081680560420317e-05, "loss": 0.9807, "num_input_tokens_seen": 90087360, "step": 5599 }, { "epoch": 0.3922701760837777, "grad_norm": 5.602208614349365, "learning_rate": 6.080980735551665e-05, "loss": 1.2451, "num_input_tokens_seen": 90103744, "step": 5600 }, { "epoch": 0.3922701760837777, "eval_loss": 1.127113699913025, "eval_runtime": 0.2033, "eval_samples_per_second": 4.919, "eval_steps_per_second": 4.919, "num_input_tokens_seen": 90103744, "step": 5600 }, { "epoch": 0.39234022432950694, "grad_norm": 3.727559804916382, "learning_rate": 6.080280910683012e-05, "loss": 1.0777, "num_input_tokens_seen": 90119400, "step": 5601 }, { "epoch": 0.3924102725752362, "grad_norm": 4.577515125274658, "learning_rate": 6.0795810858143606e-05, "loss": 1.0032, "num_input_tokens_seen": 90135168, "step": 5602 }, { "epoch": 0.39248032082096546, "grad_norm": 5.225588798522949, "learning_rate": 6.07888126094571e-05, "loss": 1.0964, "num_input_tokens_seen": 90151480, "step": 5603 }, { "epoch": 0.39255036906669466, "grad_norm": 3.6131844520568848, "learning_rate": 6.078181436077057e-05, "loss": 0.9255, "num_input_tokens_seen": 90167864, "step": 5604 }, { "epoch": 0.3926204173124239, "grad_norm": 4.127248287200928, "learning_rate": 6.0774816112084065e-05, "loss": 1.1939, "num_input_tokens_seen": 90184248, "step": 5605 }, { "epoch": 0.3926904655581532, "grad_norm": 4.599911689758301, "learning_rate": 6.076781786339756e-05, "loss": 1.2819, "num_input_tokens_seen": 90199816, "step": 5606 }, { "epoch": 0.39276051380388244, "grad_norm": 3.7179722785949707, "learning_rate": 6.076081961471104e-05, "loss": 0.8559, "num_input_tokens_seen": 90215640, "step": 5607 }, { "epoch": 0.39283056204961164, "grad_norm": 3.872941493988037, "learning_rate": 6.0753821366024524e-05, "loss": 0.9986, "num_input_tokens_seen": 90232024, "step": 5608 }, { "epoch": 0.3929006102953409, "grad_norm": 3.7326548099517822, "learning_rate": 6.074682311733801e-05, "loss": 1.0295, "num_input_tokens_seen": 90247536, "step": 5609 }, { "epoch": 0.39297065854107016, "grad_norm": 4.05418062210083, "learning_rate": 6.0739824868651494e-05, "loss": 1.2199, "num_input_tokens_seen": 90263920, "step": 5610 }, { "epoch": 0.3930407067867994, "grad_norm": 5.326319694519043, "learning_rate": 6.073282661996497e-05, "loss": 1.0705, "num_input_tokens_seen": 90280240, "step": 5611 }, { "epoch": 0.3931107550325287, "grad_norm": 4.132864952087402, "learning_rate": 6.072582837127846e-05, "loss": 1.2377, "num_input_tokens_seen": 90296128, "step": 5612 }, { "epoch": 0.3931808032782579, "grad_norm": 3.7307562828063965, "learning_rate": 6.0718830122591953e-05, "loss": 0.9765, "num_input_tokens_seen": 90312512, "step": 5613 }, { "epoch": 0.39325085152398714, "grad_norm": 6.35123872756958, "learning_rate": 6.0711831873905435e-05, "loss": 0.9049, "num_input_tokens_seen": 90328896, "step": 5614 }, { "epoch": 0.3933208997697164, "grad_norm": 5.536827564239502, "learning_rate": 6.070483362521892e-05, "loss": 0.952, "num_input_tokens_seen": 90344648, "step": 5615 }, { "epoch": 0.39339094801544566, "grad_norm": 3.6394944190979004, "learning_rate": 6.0697835376532406e-05, "loss": 0.9446, "num_input_tokens_seen": 90361032, "step": 5616 }, { "epoch": 0.39346099626117487, "grad_norm": 3.4719443321228027, "learning_rate": 6.069083712784589e-05, "loss": 0.9859, "num_input_tokens_seen": 90377192, "step": 5617 }, { "epoch": 0.3935310445069041, "grad_norm": 6.195781230926514, "learning_rate": 6.068383887915936e-05, "loss": 0.9482, "num_input_tokens_seen": 90393576, "step": 5618 }, { "epoch": 0.3936010927526334, "grad_norm": 3.5569331645965576, "learning_rate": 6.0676840630472865e-05, "loss": 1.0244, "num_input_tokens_seen": 90409960, "step": 5619 }, { "epoch": 0.39367114099836265, "grad_norm": 5.535704135894775, "learning_rate": 6.0669842381786346e-05, "loss": 1.3058, "num_input_tokens_seen": 90426216, "step": 5620 }, { "epoch": 0.39374118924409185, "grad_norm": 3.81278395652771, "learning_rate": 6.0662844133099815e-05, "loss": 0.9343, "num_input_tokens_seen": 90442320, "step": 5621 }, { "epoch": 0.3938112374898211, "grad_norm": 4.093146324157715, "learning_rate": 6.065584588441331e-05, "loss": 1.0698, "num_input_tokens_seen": 90458704, "step": 5622 }, { "epoch": 0.39388128573555037, "grad_norm": 6.061161518096924, "learning_rate": 6.06488476357268e-05, "loss": 0.9046, "num_input_tokens_seen": 90474408, "step": 5623 }, { "epoch": 0.39395133398127963, "grad_norm": 3.763059616088867, "learning_rate": 6.064184938704028e-05, "loss": 1.002, "num_input_tokens_seen": 90490592, "step": 5624 }, { "epoch": 0.39402138222700883, "grad_norm": 5.287941932678223, "learning_rate": 6.063485113835376e-05, "loss": 1.0667, "num_input_tokens_seen": 90506568, "step": 5625 }, { "epoch": 0.3940914304727381, "grad_norm": 3.5837693214416504, "learning_rate": 6.062785288966726e-05, "loss": 1.1136, "num_input_tokens_seen": 90522952, "step": 5626 }, { "epoch": 0.39416147871846735, "grad_norm": 3.5523111820220947, "learning_rate": 6.062085464098074e-05, "loss": 1.0703, "num_input_tokens_seen": 90539336, "step": 5627 }, { "epoch": 0.3942315269641966, "grad_norm": 4.3880934715271, "learning_rate": 6.061385639229421e-05, "loss": 0.993, "num_input_tokens_seen": 90555720, "step": 5628 }, { "epoch": 0.3943015752099258, "grad_norm": 4.26425313949585, "learning_rate": 6.060685814360772e-05, "loss": 1.08, "num_input_tokens_seen": 90572104, "step": 5629 }, { "epoch": 0.3943716234556551, "grad_norm": 3.8837990760803223, "learning_rate": 6.05998598949212e-05, "loss": 0.9414, "num_input_tokens_seen": 90588488, "step": 5630 }, { "epoch": 0.39444167170138433, "grad_norm": 3.704282522201538, "learning_rate": 6.059286164623468e-05, "loss": 1.0037, "num_input_tokens_seen": 90604872, "step": 5631 }, { "epoch": 0.3945117199471136, "grad_norm": 5.933957099914551, "learning_rate": 6.058586339754816e-05, "loss": 1.0753, "num_input_tokens_seen": 90621256, "step": 5632 }, { "epoch": 0.3945817681928428, "grad_norm": 4.185206413269043, "learning_rate": 6.057886514886165e-05, "loss": 1.0044, "num_input_tokens_seen": 90637640, "step": 5633 }, { "epoch": 0.39465181643857206, "grad_norm": 3.97603440284729, "learning_rate": 6.057186690017513e-05, "loss": 1.2243, "num_input_tokens_seen": 90654024, "step": 5634 }, { "epoch": 0.3947218646843013, "grad_norm": 3.394630193710327, "learning_rate": 6.056486865148863e-05, "loss": 0.9702, "num_input_tokens_seen": 90670008, "step": 5635 }, { "epoch": 0.3947919129300306, "grad_norm": 3.810899019241333, "learning_rate": 6.055787040280211e-05, "loss": 0.9998, "num_input_tokens_seen": 90686392, "step": 5636 }, { "epoch": 0.3948619611757598, "grad_norm": 4.237402439117432, "learning_rate": 6.055087215411559e-05, "loss": 1.1013, "num_input_tokens_seen": 90702776, "step": 5637 }, { "epoch": 0.39493200942148904, "grad_norm": 5.481308937072754, "learning_rate": 6.054387390542907e-05, "loss": 1.0064, "num_input_tokens_seen": 90718312, "step": 5638 }, { "epoch": 0.3950020576672183, "grad_norm": 3.582808017730713, "learning_rate": 6.053687565674256e-05, "loss": 0.9305, "num_input_tokens_seen": 90733856, "step": 5639 }, { "epoch": 0.39507210591294756, "grad_norm": 3.9277966022491455, "learning_rate": 6.0529877408056044e-05, "loss": 1.0986, "num_input_tokens_seen": 90750240, "step": 5640 }, { "epoch": 0.39514215415867676, "grad_norm": 4.61000394821167, "learning_rate": 6.0522879159369526e-05, "loss": 1.1705, "num_input_tokens_seen": 90766352, "step": 5641 }, { "epoch": 0.395212202404406, "grad_norm": 4.445149898529053, "learning_rate": 6.051588091068301e-05, "loss": 1.0809, "num_input_tokens_seen": 90782736, "step": 5642 }, { "epoch": 0.3952822506501353, "grad_norm": 4.652968406677246, "learning_rate": 6.0508882661996516e-05, "loss": 0.9761, "num_input_tokens_seen": 90799120, "step": 5643 }, { "epoch": 0.39535229889586454, "grad_norm": 4.172330856323242, "learning_rate": 6.0501884413309985e-05, "loss": 1.0637, "num_input_tokens_seen": 90815504, "step": 5644 }, { "epoch": 0.3954223471415938, "grad_norm": 3.647385358810425, "learning_rate": 6.0494886164623466e-05, "loss": 0.9284, "num_input_tokens_seen": 90831888, "step": 5645 }, { "epoch": 0.395492395387323, "grad_norm": 3.9353525638580322, "learning_rate": 6.0487887915936955e-05, "loss": 1.1498, "num_input_tokens_seen": 90848048, "step": 5646 }, { "epoch": 0.39556244363305226, "grad_norm": 4.216567039489746, "learning_rate": 6.048088966725044e-05, "loss": 1.1247, "num_input_tokens_seen": 90863576, "step": 5647 }, { "epoch": 0.3956324918787815, "grad_norm": 5.031260013580322, "learning_rate": 6.047389141856392e-05, "loss": 1.1314, "num_input_tokens_seen": 90879960, "step": 5648 }, { "epoch": 0.3957025401245108, "grad_norm": 4.927192211151123, "learning_rate": 6.0466893169877414e-05, "loss": 1.0977, "num_input_tokens_seen": 90896344, "step": 5649 }, { "epoch": 0.39577258837024, "grad_norm": 4.589445114135742, "learning_rate": 6.045989492119091e-05, "loss": 1.0233, "num_input_tokens_seen": 90912728, "step": 5650 }, { "epoch": 0.39584263661596925, "grad_norm": 3.5707035064697266, "learning_rate": 6.045289667250438e-05, "loss": 0.9732, "num_input_tokens_seen": 90929112, "step": 5651 }, { "epoch": 0.3959126848616985, "grad_norm": 3.637237787246704, "learning_rate": 6.044589842381787e-05, "loss": 1.2063, "num_input_tokens_seen": 90945376, "step": 5652 }, { "epoch": 0.39598273310742776, "grad_norm": 4.068975448608398, "learning_rate": 6.043890017513136e-05, "loss": 1.0301, "num_input_tokens_seen": 90961448, "step": 5653 }, { "epoch": 0.39605278135315697, "grad_norm": 3.8378570079803467, "learning_rate": 6.0431901926444837e-05, "loss": 0.9195, "num_input_tokens_seen": 90977832, "step": 5654 }, { "epoch": 0.39612282959888623, "grad_norm": 4.5788092613220215, "learning_rate": 6.042490367775832e-05, "loss": 1.1601, "num_input_tokens_seen": 90993296, "step": 5655 }, { "epoch": 0.3961928778446155, "grad_norm": 3.7392847537994385, "learning_rate": 6.041790542907181e-05, "loss": 1.0081, "num_input_tokens_seen": 91009680, "step": 5656 }, { "epoch": 0.39626292609034475, "grad_norm": 5.830812931060791, "learning_rate": 6.04109071803853e-05, "loss": 1.0544, "num_input_tokens_seen": 91025400, "step": 5657 }, { "epoch": 0.39633297433607395, "grad_norm": 3.7372663021087646, "learning_rate": 6.040390893169877e-05, "loss": 1.1403, "num_input_tokens_seen": 91041784, "step": 5658 }, { "epoch": 0.3964030225818032, "grad_norm": 3.756762981414795, "learning_rate": 6.0396910683012266e-05, "loss": 1.0175, "num_input_tokens_seen": 91058168, "step": 5659 }, { "epoch": 0.39647307082753247, "grad_norm": 3.659280776977539, "learning_rate": 6.0389912434325755e-05, "loss": 1.0396, "num_input_tokens_seen": 91074288, "step": 5660 }, { "epoch": 0.39654311907326173, "grad_norm": 4.339829921722412, "learning_rate": 6.038291418563923e-05, "loss": 1.2101, "num_input_tokens_seen": 91090024, "step": 5661 }, { "epoch": 0.39661316731899093, "grad_norm": 4.062867641448975, "learning_rate": 6.037591593695272e-05, "loss": 0.9874, "num_input_tokens_seen": 91106408, "step": 5662 }, { "epoch": 0.3966832155647202, "grad_norm": 4.45166015625, "learning_rate": 6.0368917688266214e-05, "loss": 0.9504, "num_input_tokens_seen": 91122648, "step": 5663 }, { "epoch": 0.39675326381044945, "grad_norm": 3.4350759983062744, "learning_rate": 6.036191943957968e-05, "loss": 0.8876, "num_input_tokens_seen": 91138200, "step": 5664 }, { "epoch": 0.3968233120561787, "grad_norm": 3.5637154579162598, "learning_rate": 6.0354921190893164e-05, "loss": 1.0616, "num_input_tokens_seen": 91154584, "step": 5665 }, { "epoch": 0.3968933603019079, "grad_norm": 3.8793985843658447, "learning_rate": 6.034792294220666e-05, "loss": 1.0933, "num_input_tokens_seen": 91170968, "step": 5666 }, { "epoch": 0.3969634085476372, "grad_norm": 4.1613545417785645, "learning_rate": 6.0340924693520154e-05, "loss": 1.0895, "num_input_tokens_seen": 91185856, "step": 5667 }, { "epoch": 0.39703345679336643, "grad_norm": 8.646449089050293, "learning_rate": 6.033392644483362e-05, "loss": 1.1391, "num_input_tokens_seen": 91201088, "step": 5668 }, { "epoch": 0.3971035050390957, "grad_norm": 4.862243175506592, "learning_rate": 6.032692819614711e-05, "loss": 1.1871, "num_input_tokens_seen": 91217472, "step": 5669 }, { "epoch": 0.3971735532848249, "grad_norm": 4.013809680938721, "learning_rate": 6.0319929947460607e-05, "loss": 1.0307, "num_input_tokens_seen": 91233760, "step": 5670 }, { "epoch": 0.39724360153055416, "grad_norm": 4.664083480834961, "learning_rate": 6.0312931698774075e-05, "loss": 1.1507, "num_input_tokens_seen": 91250144, "step": 5671 }, { "epoch": 0.3973136497762834, "grad_norm": 4.330606937408447, "learning_rate": 6.030593345008756e-05, "loss": 1.1741, "num_input_tokens_seen": 91264592, "step": 5672 }, { "epoch": 0.3973836980220127, "grad_norm": 4.158743381500244, "learning_rate": 6.0298935201401066e-05, "loss": 0.8608, "num_input_tokens_seen": 91280520, "step": 5673 }, { "epoch": 0.3974537462677419, "grad_norm": 3.800955057144165, "learning_rate": 6.029193695271455e-05, "loss": 0.9451, "num_input_tokens_seen": 91296472, "step": 5674 }, { "epoch": 0.39752379451347114, "grad_norm": 4.307434558868408, "learning_rate": 6.028493870402803e-05, "loss": 1.0936, "num_input_tokens_seen": 91312856, "step": 5675 }, { "epoch": 0.3975938427592004, "grad_norm": 4.052398204803467, "learning_rate": 6.027794045534151e-05, "loss": 1.006, "num_input_tokens_seen": 91329216, "step": 5676 }, { "epoch": 0.39766389100492966, "grad_norm": 4.665764331817627, "learning_rate": 6.0270942206655e-05, "loss": 1.1573, "num_input_tokens_seen": 91345184, "step": 5677 }, { "epoch": 0.3977339392506589, "grad_norm": 4.070000648498535, "learning_rate": 6.0263943957968475e-05, "loss": 1.1296, "num_input_tokens_seen": 91361568, "step": 5678 }, { "epoch": 0.3978039874963881, "grad_norm": 4.304214954376221, "learning_rate": 6.025694570928198e-05, "loss": 0.8688, "num_input_tokens_seen": 91376656, "step": 5679 }, { "epoch": 0.3978740357421174, "grad_norm": 4.051540374755859, "learning_rate": 6.024994746059546e-05, "loss": 1.0862, "num_input_tokens_seen": 91393040, "step": 5680 }, { "epoch": 0.39794408398784664, "grad_norm": 3.986542224884033, "learning_rate": 6.024294921190894e-05, "loss": 1.1477, "num_input_tokens_seen": 91408208, "step": 5681 }, { "epoch": 0.3980141322335759, "grad_norm": 4.302114963531494, "learning_rate": 6.023595096322242e-05, "loss": 0.8569, "num_input_tokens_seen": 91424592, "step": 5682 }, { "epoch": 0.3980841804793051, "grad_norm": 6.019785404205322, "learning_rate": 6.0228952714535904e-05, "loss": 1.2166, "num_input_tokens_seen": 91440976, "step": 5683 }, { "epoch": 0.39815422872503436, "grad_norm": 3.667469024658203, "learning_rate": 6.022195446584939e-05, "loss": 1.0587, "num_input_tokens_seen": 91457360, "step": 5684 }, { "epoch": 0.3982242769707636, "grad_norm": 4.30043363571167, "learning_rate": 6.0214956217162874e-05, "loss": 1.0923, "num_input_tokens_seen": 91473744, "step": 5685 }, { "epoch": 0.3982943252164929, "grad_norm": 3.7446558475494385, "learning_rate": 6.0207957968476356e-05, "loss": 1.1302, "num_input_tokens_seen": 91489544, "step": 5686 }, { "epoch": 0.3983643734622221, "grad_norm": 3.722567081451416, "learning_rate": 6.020095971978985e-05, "loss": 0.9913, "num_input_tokens_seen": 91505584, "step": 5687 }, { "epoch": 0.39843442170795135, "grad_norm": 4.311237812042236, "learning_rate": 6.019396147110332e-05, "loss": 1.0352, "num_input_tokens_seen": 91520656, "step": 5688 }, { "epoch": 0.3985044699536806, "grad_norm": 5.650984764099121, "learning_rate": 6.018696322241683e-05, "loss": 1.2826, "num_input_tokens_seen": 91537040, "step": 5689 }, { "epoch": 0.39857451819940987, "grad_norm": 4.106716632843018, "learning_rate": 6.017996497373031e-05, "loss": 1.0234, "num_input_tokens_seen": 91553424, "step": 5690 }, { "epoch": 0.39864456644513907, "grad_norm": 3.893007516860962, "learning_rate": 6.0172966725043786e-05, "loss": 1.1175, "num_input_tokens_seen": 91569184, "step": 5691 }, { "epoch": 0.39871461469086833, "grad_norm": 3.6435177326202393, "learning_rate": 6.016596847635727e-05, "loss": 0.9446, "num_input_tokens_seen": 91584832, "step": 5692 }, { "epoch": 0.3987846629365976, "grad_norm": 3.639324188232422, "learning_rate": 6.015897022767075e-05, "loss": 1.0866, "num_input_tokens_seen": 91601216, "step": 5693 }, { "epoch": 0.39885471118232685, "grad_norm": 3.680997848510742, "learning_rate": 6.0151971978984245e-05, "loss": 1.1352, "num_input_tokens_seen": 91617600, "step": 5694 }, { "epoch": 0.39892475942805605, "grad_norm": 5.37217903137207, "learning_rate": 6.0144973730297726e-05, "loss": 1.1507, "num_input_tokens_seen": 91633760, "step": 5695 }, { "epoch": 0.3989948076737853, "grad_norm": 3.6297101974487305, "learning_rate": 6.013797548161122e-05, "loss": 0.8412, "num_input_tokens_seen": 91649536, "step": 5696 }, { "epoch": 0.39906485591951457, "grad_norm": 8.193422317504883, "learning_rate": 6.0130977232924704e-05, "loss": 1.1533, "num_input_tokens_seen": 91665920, "step": 5697 }, { "epoch": 0.39913490416524383, "grad_norm": 3.6126644611358643, "learning_rate": 6.0123978984238185e-05, "loss": 0.8261, "num_input_tokens_seen": 91682272, "step": 5698 }, { "epoch": 0.39920495241097304, "grad_norm": 4.277047634124756, "learning_rate": 6.011698073555167e-05, "loss": 1.0904, "num_input_tokens_seen": 91698656, "step": 5699 }, { "epoch": 0.3992750006567023, "grad_norm": 4.661556720733643, "learning_rate": 6.0109982486865156e-05, "loss": 1.1948, "num_input_tokens_seen": 91715040, "step": 5700 }, { "epoch": 0.39934504890243155, "grad_norm": 4.143563270568848, "learning_rate": 6.010298423817864e-05, "loss": 1.0421, "num_input_tokens_seen": 91731424, "step": 5701 }, { "epoch": 0.3994150971481608, "grad_norm": 5.750835418701172, "learning_rate": 6.009598598949212e-05, "loss": 1.0594, "num_input_tokens_seen": 91747808, "step": 5702 }, { "epoch": 0.39948514539389, "grad_norm": 4.048924446105957, "learning_rate": 6.0088987740805615e-05, "loss": 1.0488, "num_input_tokens_seen": 91764192, "step": 5703 }, { "epoch": 0.3995551936396193, "grad_norm": 3.7284796237945557, "learning_rate": 6.00819894921191e-05, "loss": 0.9723, "num_input_tokens_seen": 91780576, "step": 5704 }, { "epoch": 0.39962524188534854, "grad_norm": 4.041873931884766, "learning_rate": 6.007499124343258e-05, "loss": 0.9354, "num_input_tokens_seen": 91796960, "step": 5705 }, { "epoch": 0.3996952901310778, "grad_norm": 4.37992000579834, "learning_rate": 6.006799299474607e-05, "loss": 1.3032, "num_input_tokens_seen": 91813344, "step": 5706 }, { "epoch": 0.399765338376807, "grad_norm": 4.49924373626709, "learning_rate": 6.006099474605955e-05, "loss": 0.9709, "num_input_tokens_seen": 91829728, "step": 5707 }, { "epoch": 0.39983538662253626, "grad_norm": 4.609983444213867, "learning_rate": 6.005399649737303e-05, "loss": 1.0591, "num_input_tokens_seen": 91844968, "step": 5708 }, { "epoch": 0.3999054348682655, "grad_norm": 4.1750006675720215, "learning_rate": 6.004699824868651e-05, "loss": 1.1413, "num_input_tokens_seen": 91860464, "step": 5709 }, { "epoch": 0.3999754831139948, "grad_norm": 4.009062767028809, "learning_rate": 6.0039999999999994e-05, "loss": 1.0244, "num_input_tokens_seen": 91876848, "step": 5710 }, { "epoch": 0.400045531359724, "grad_norm": 4.0535078048706055, "learning_rate": 6.003300175131349e-05, "loss": 1.0851, "num_input_tokens_seen": 91892192, "step": 5711 }, { "epoch": 0.40011557960545324, "grad_norm": 4.592657089233398, "learning_rate": 6.002600350262697e-05, "loss": 0.8823, "num_input_tokens_seen": 91907040, "step": 5712 }, { "epoch": 0.4001856278511825, "grad_norm": 3.87369966506958, "learning_rate": 6.001900525394046e-05, "loss": 1.153, "num_input_tokens_seen": 91922712, "step": 5713 }, { "epoch": 0.40025567609691176, "grad_norm": 3.93766713142395, "learning_rate": 6.001200700525394e-05, "loss": 1.0225, "num_input_tokens_seen": 91939096, "step": 5714 }, { "epoch": 0.400325724342641, "grad_norm": 3.823153018951416, "learning_rate": 6.0005008756567424e-05, "loss": 0.9229, "num_input_tokens_seen": 91955016, "step": 5715 }, { "epoch": 0.4003957725883702, "grad_norm": 3.5592081546783447, "learning_rate": 5.999801050788092e-05, "loss": 0.9163, "num_input_tokens_seen": 91971400, "step": 5716 }, { "epoch": 0.4004658208340995, "grad_norm": 3.8749887943267822, "learning_rate": 5.9991012259194414e-05, "loss": 1.0194, "num_input_tokens_seen": 91987784, "step": 5717 }, { "epoch": 0.40053586907982874, "grad_norm": 4.749402046203613, "learning_rate": 5.998401401050788e-05, "loss": 0.9442, "num_input_tokens_seen": 92003592, "step": 5718 }, { "epoch": 0.400605917325558, "grad_norm": 4.114437580108643, "learning_rate": 5.9977015761821365e-05, "loss": 1.0576, "num_input_tokens_seen": 92019640, "step": 5719 }, { "epoch": 0.4006759655712872, "grad_norm": 3.3297617435455322, "learning_rate": 5.997001751313486e-05, "loss": 0.8776, "num_input_tokens_seen": 92035544, "step": 5720 }, { "epoch": 0.40074601381701647, "grad_norm": 4.204908847808838, "learning_rate": 5.996301926444834e-05, "loss": 1.141, "num_input_tokens_seen": 92051840, "step": 5721 }, { "epoch": 0.4008160620627457, "grad_norm": 4.198369979858398, "learning_rate": 5.9956021015761824e-05, "loss": 1.2653, "num_input_tokens_seen": 92068224, "step": 5722 }, { "epoch": 0.400886110308475, "grad_norm": 4.46641206741333, "learning_rate": 5.994902276707531e-05, "loss": 1.0866, "num_input_tokens_seen": 92083656, "step": 5723 }, { "epoch": 0.4009561585542042, "grad_norm": 4.2217535972595215, "learning_rate": 5.994202451838881e-05, "loss": 1.066, "num_input_tokens_seen": 92100040, "step": 5724 }, { "epoch": 0.40102620679993345, "grad_norm": 4.484360218048096, "learning_rate": 5.993502626970229e-05, "loss": 1.1031, "num_input_tokens_seen": 92115592, "step": 5725 }, { "epoch": 0.4010962550456627, "grad_norm": 4.69040060043335, "learning_rate": 5.992802802101576e-05, "loss": 1.1487, "num_input_tokens_seen": 92131280, "step": 5726 }, { "epoch": 0.40116630329139197, "grad_norm": 3.8119077682495117, "learning_rate": 5.992102977232926e-05, "loss": 1.1336, "num_input_tokens_seen": 92147664, "step": 5727 }, { "epoch": 0.40123635153712117, "grad_norm": 4.186896800994873, "learning_rate": 5.9914031523642735e-05, "loss": 0.9449, "num_input_tokens_seen": 92164048, "step": 5728 }, { "epoch": 0.40130639978285043, "grad_norm": 4.658702850341797, "learning_rate": 5.9907033274956217e-05, "loss": 1.1733, "num_input_tokens_seen": 92180432, "step": 5729 }, { "epoch": 0.4013764480285797, "grad_norm": 3.8305857181549072, "learning_rate": 5.9900035026269705e-05, "loss": 1.0041, "num_input_tokens_seen": 92196816, "step": 5730 }, { "epoch": 0.40144649627430895, "grad_norm": 4.063295364379883, "learning_rate": 5.989303677758319e-05, "loss": 1.1743, "num_input_tokens_seen": 92212928, "step": 5731 }, { "epoch": 0.40151654452003815, "grad_norm": 6.850064277648926, "learning_rate": 5.988603852889667e-05, "loss": 1.0967, "num_input_tokens_seen": 92229312, "step": 5732 }, { "epoch": 0.4015865927657674, "grad_norm": 5.065973281860352, "learning_rate": 5.9879040280210164e-05, "loss": 1.0513, "num_input_tokens_seen": 92245696, "step": 5733 }, { "epoch": 0.4016566410114967, "grad_norm": 4.02882719039917, "learning_rate": 5.987204203152366e-05, "loss": 0.934, "num_input_tokens_seen": 92261936, "step": 5734 }, { "epoch": 0.40172668925722593, "grad_norm": 3.9505250453948975, "learning_rate": 5.986504378283714e-05, "loss": 0.9871, "num_input_tokens_seen": 92278320, "step": 5735 }, { "epoch": 0.40179673750295514, "grad_norm": 4.949488162994385, "learning_rate": 5.985804553415061e-05, "loss": 0.9759, "num_input_tokens_seen": 92294664, "step": 5736 }, { "epoch": 0.4018667857486844, "grad_norm": 3.614008903503418, "learning_rate": 5.98510472854641e-05, "loss": 1.006, "num_input_tokens_seen": 92311048, "step": 5737 }, { "epoch": 0.40193683399441366, "grad_norm": 3.739224433898926, "learning_rate": 5.984404903677758e-05, "loss": 0.8997, "num_input_tokens_seen": 92327432, "step": 5738 }, { "epoch": 0.4020068822401429, "grad_norm": 3.6126298904418945, "learning_rate": 5.983705078809106e-05, "loss": 1.0062, "num_input_tokens_seen": 92343816, "step": 5739 }, { "epoch": 0.4020769304858721, "grad_norm": 4.304609298706055, "learning_rate": 5.983005253940456e-05, "loss": 1.0691, "num_input_tokens_seen": 92358872, "step": 5740 }, { "epoch": 0.4021469787316014, "grad_norm": 4.121729850769043, "learning_rate": 5.982305429071805e-05, "loss": 1.1047, "num_input_tokens_seen": 92374960, "step": 5741 }, { "epoch": 0.40221702697733064, "grad_norm": 4.137178421020508, "learning_rate": 5.9816056042031534e-05, "loss": 1.0809, "num_input_tokens_seen": 92391344, "step": 5742 }, { "epoch": 0.4022870752230599, "grad_norm": 8.537243843078613, "learning_rate": 5.9809057793345016e-05, "loss": 1.1571, "num_input_tokens_seen": 92406096, "step": 5743 }, { "epoch": 0.4023571234687891, "grad_norm": 4.613489627838135, "learning_rate": 5.9802059544658505e-05, "loss": 1.126, "num_input_tokens_seen": 92422480, "step": 5744 }, { "epoch": 0.40242717171451836, "grad_norm": 4.812812805175781, "learning_rate": 5.9795061295971987e-05, "loss": 1.2987, "num_input_tokens_seen": 92437472, "step": 5745 }, { "epoch": 0.4024972199602476, "grad_norm": 4.785153865814209, "learning_rate": 5.978806304728546e-05, "loss": 0.9517, "num_input_tokens_seen": 92452560, "step": 5746 }, { "epoch": 0.4025672682059769, "grad_norm": 4.450865268707275, "learning_rate": 5.978106479859895e-05, "loss": 0.9144, "num_input_tokens_seen": 92468312, "step": 5747 }, { "epoch": 0.40263731645170614, "grad_norm": 4.854867935180664, "learning_rate": 5.9774066549912446e-05, "loss": 1.1128, "num_input_tokens_seen": 92484616, "step": 5748 }, { "epoch": 0.40270736469743534, "grad_norm": 4.00141716003418, "learning_rate": 5.976706830122593e-05, "loss": 1.1298, "num_input_tokens_seen": 92500568, "step": 5749 }, { "epoch": 0.4027774129431646, "grad_norm": 3.819101333618164, "learning_rate": 5.976007005253941e-05, "loss": 0.9113, "num_input_tokens_seen": 92515952, "step": 5750 }, { "epoch": 0.40284746118889386, "grad_norm": 3.7954423427581787, "learning_rate": 5.97530718038529e-05, "loss": 1.1873, "num_input_tokens_seen": 92532336, "step": 5751 }, { "epoch": 0.4029175094346231, "grad_norm": 4.081971645355225, "learning_rate": 5.974607355516638e-05, "loss": 1.1075, "num_input_tokens_seen": 92548720, "step": 5752 }, { "epoch": 0.4029875576803523, "grad_norm": 3.834063768386841, "learning_rate": 5.9739075306479855e-05, "loss": 0.9963, "num_input_tokens_seen": 92564648, "step": 5753 }, { "epoch": 0.4030576059260816, "grad_norm": 3.6766366958618164, "learning_rate": 5.973207705779336e-05, "loss": 1.1064, "num_input_tokens_seen": 92581032, "step": 5754 }, { "epoch": 0.40312765417181085, "grad_norm": 4.423589706420898, "learning_rate": 5.972507880910684e-05, "loss": 1.0285, "num_input_tokens_seen": 92596520, "step": 5755 }, { "epoch": 0.4031977024175401, "grad_norm": 4.0820207595825195, "learning_rate": 5.971808056042031e-05, "loss": 1.0288, "num_input_tokens_seen": 92612448, "step": 5756 }, { "epoch": 0.4032677506632693, "grad_norm": 6.634023189544678, "learning_rate": 5.97110823117338e-05, "loss": 1.1607, "num_input_tokens_seen": 92627736, "step": 5757 }, { "epoch": 0.40333779890899857, "grad_norm": 3.517611026763916, "learning_rate": 5.970408406304729e-05, "loss": 0.9292, "num_input_tokens_seen": 92643216, "step": 5758 }, { "epoch": 0.4034078471547278, "grad_norm": 3.7658562660217285, "learning_rate": 5.969708581436077e-05, "loss": 1.052, "num_input_tokens_seen": 92659248, "step": 5759 }, { "epoch": 0.4034778954004571, "grad_norm": 4.128793716430664, "learning_rate": 5.9690087565674254e-05, "loss": 1.1067, "num_input_tokens_seen": 92675632, "step": 5760 }, { "epoch": 0.4035479436461863, "grad_norm": 4.200130939483643, "learning_rate": 5.968308931698775e-05, "loss": 0.9264, "num_input_tokens_seen": 92691408, "step": 5761 }, { "epoch": 0.40361799189191555, "grad_norm": 4.131740093231201, "learning_rate": 5.967609106830123e-05, "loss": 1.0687, "num_input_tokens_seen": 92706568, "step": 5762 }, { "epoch": 0.4036880401376448, "grad_norm": 4.074241638183594, "learning_rate": 5.96690928196147e-05, "loss": 1.0122, "num_input_tokens_seen": 92722952, "step": 5763 }, { "epoch": 0.40375808838337407, "grad_norm": 4.351722717285156, "learning_rate": 5.9662094570928195e-05, "loss": 0.8518, "num_input_tokens_seen": 92739248, "step": 5764 }, { "epoch": 0.4038281366291033, "grad_norm": 4.0495734214782715, "learning_rate": 5.965509632224169e-05, "loss": 1.1392, "num_input_tokens_seen": 92755040, "step": 5765 }, { "epoch": 0.40389818487483253, "grad_norm": 3.881098747253418, "learning_rate": 5.964809807355517e-05, "loss": 0.9899, "num_input_tokens_seen": 92771424, "step": 5766 }, { "epoch": 0.4039682331205618, "grad_norm": 3.964268922805786, "learning_rate": 5.9641099824868654e-05, "loss": 1.1636, "num_input_tokens_seen": 92787808, "step": 5767 }, { "epoch": 0.40403828136629105, "grad_norm": 4.126365661621094, "learning_rate": 5.963410157618214e-05, "loss": 1.273, "num_input_tokens_seen": 92804192, "step": 5768 }, { "epoch": 0.40410832961202026, "grad_norm": 4.142693519592285, "learning_rate": 5.9627103327495625e-05, "loss": 1.2161, "num_input_tokens_seen": 92819920, "step": 5769 }, { "epoch": 0.4041783778577495, "grad_norm": 4.906876087188721, "learning_rate": 5.962010507880912e-05, "loss": 0.9985, "num_input_tokens_seen": 92836304, "step": 5770 }, { "epoch": 0.4042484261034788, "grad_norm": 4.597287654876709, "learning_rate": 5.96131068301226e-05, "loss": 1.13, "num_input_tokens_seen": 92852144, "step": 5771 }, { "epoch": 0.40431847434920803, "grad_norm": 3.525669813156128, "learning_rate": 5.9606108581436084e-05, "loss": 1.0209, "num_input_tokens_seen": 92867944, "step": 5772 }, { "epoch": 0.40438852259493724, "grad_norm": 4.908353328704834, "learning_rate": 5.9599110332749565e-05, "loss": 1.2495, "num_input_tokens_seen": 92884328, "step": 5773 }, { "epoch": 0.4044585708406665, "grad_norm": 4.31436824798584, "learning_rate": 5.959211208406305e-05, "loss": 1.0821, "num_input_tokens_seen": 92900152, "step": 5774 }, { "epoch": 0.40452861908639576, "grad_norm": 3.652494430541992, "learning_rate": 5.9585113835376536e-05, "loss": 1.0086, "num_input_tokens_seen": 92916416, "step": 5775 }, { "epoch": 0.404598667332125, "grad_norm": 3.9569268226623535, "learning_rate": 5.957811558669002e-05, "loss": 1.0284, "num_input_tokens_seen": 92932088, "step": 5776 }, { "epoch": 0.4046687155778542, "grad_norm": 4.301011562347412, "learning_rate": 5.95711173380035e-05, "loss": 1.0994, "num_input_tokens_seen": 92948472, "step": 5777 }, { "epoch": 0.4047387638235835, "grad_norm": 4.0318474769592285, "learning_rate": 5.9564119089316995e-05, "loss": 1.0636, "num_input_tokens_seen": 92964856, "step": 5778 }, { "epoch": 0.40480881206931274, "grad_norm": 4.05795955657959, "learning_rate": 5.955712084063048e-05, "loss": 0.9947, "num_input_tokens_seen": 92980040, "step": 5779 }, { "epoch": 0.404878860315042, "grad_norm": 5.828601837158203, "learning_rate": 5.9550122591943945e-05, "loss": 1.1361, "num_input_tokens_seen": 92996424, "step": 5780 }, { "epoch": 0.4049489085607712, "grad_norm": 3.6617836952209473, "learning_rate": 5.954312434325745e-05, "loss": 0.9282, "num_input_tokens_seen": 93012400, "step": 5781 }, { "epoch": 0.40501895680650046, "grad_norm": 3.8668923377990723, "learning_rate": 5.953612609457093e-05, "loss": 1.2368, "num_input_tokens_seen": 93028176, "step": 5782 }, { "epoch": 0.4050890050522297, "grad_norm": 6.206475257873535, "learning_rate": 5.952912784588441e-05, "loss": 1.133, "num_input_tokens_seen": 93044560, "step": 5783 }, { "epoch": 0.405159053297959, "grad_norm": 3.6768481731414795, "learning_rate": 5.952212959719789e-05, "loss": 1.0141, "num_input_tokens_seen": 93060944, "step": 5784 }, { "epoch": 0.40522910154368824, "grad_norm": 3.8317768573760986, "learning_rate": 5.95151313485114e-05, "loss": 1.1, "num_input_tokens_seen": 93077328, "step": 5785 }, { "epoch": 0.40529914978941745, "grad_norm": 5.8504252433776855, "learning_rate": 5.950813309982487e-05, "loss": 1.1581, "num_input_tokens_seen": 93092016, "step": 5786 }, { "epoch": 0.4053691980351467, "grad_norm": 3.6718640327453613, "learning_rate": 5.9501134851138365e-05, "loss": 0.9614, "num_input_tokens_seen": 93108400, "step": 5787 }, { "epoch": 0.40543924628087596, "grad_norm": 4.16236686706543, "learning_rate": 5.949413660245185e-05, "loss": 1.1426, "num_input_tokens_seen": 93124784, "step": 5788 }, { "epoch": 0.4055092945266052, "grad_norm": 4.1938958168029785, "learning_rate": 5.948713835376533e-05, "loss": 1.241, "num_input_tokens_seen": 93141168, "step": 5789 }, { "epoch": 0.40557934277233443, "grad_norm": 3.7515947818756104, "learning_rate": 5.948014010507881e-05, "loss": 1.0706, "num_input_tokens_seen": 93157552, "step": 5790 }, { "epoch": 0.4056493910180637, "grad_norm": 3.710805654525757, "learning_rate": 5.94731418563923e-05, "loss": 0.9877, "num_input_tokens_seen": 93173936, "step": 5791 }, { "epoch": 0.40571943926379295, "grad_norm": 3.4491820335388184, "learning_rate": 5.9466143607705794e-05, "loss": 1.0282, "num_input_tokens_seen": 93190320, "step": 5792 }, { "epoch": 0.4057894875095222, "grad_norm": 3.8709781169891357, "learning_rate": 5.945914535901926e-05, "loss": 0.8852, "num_input_tokens_seen": 93206648, "step": 5793 }, { "epoch": 0.4058595357552514, "grad_norm": 5.104569435119629, "learning_rate": 5.945214711033276e-05, "loss": 0.9954, "num_input_tokens_seen": 93223032, "step": 5794 }, { "epoch": 0.40592958400098067, "grad_norm": 4.294493675231934, "learning_rate": 5.944514886164625e-05, "loss": 1.0673, "num_input_tokens_seen": 93239344, "step": 5795 }, { "epoch": 0.40599963224670993, "grad_norm": 4.654513835906982, "learning_rate": 5.943815061295972e-05, "loss": 1.0239, "num_input_tokens_seen": 93255104, "step": 5796 }, { "epoch": 0.4060696804924392, "grad_norm": 4.339935779571533, "learning_rate": 5.9431152364273204e-05, "loss": 0.982, "num_input_tokens_seen": 93270448, "step": 5797 }, { "epoch": 0.4061397287381684, "grad_norm": 3.6498191356658936, "learning_rate": 5.942415411558669e-05, "loss": 0.8815, "num_input_tokens_seen": 93286672, "step": 5798 }, { "epoch": 0.40620977698389765, "grad_norm": 3.9488580226898193, "learning_rate": 5.9417155866900174e-05, "loss": 1.2938, "num_input_tokens_seen": 93302256, "step": 5799 }, { "epoch": 0.4062798252296269, "grad_norm": 3.9446182250976562, "learning_rate": 5.9410157618213656e-05, "loss": 0.8862, "num_input_tokens_seen": 93318640, "step": 5800 }, { "epoch": 0.4062798252296269, "eval_loss": 1.1256848573684692, "eval_runtime": 0.2106, "eval_samples_per_second": 4.748, "eval_steps_per_second": 4.748, "num_input_tokens_seen": 93318640, "step": 5800 }, { "epoch": 0.40634987347535617, "grad_norm": 3.986703872680664, "learning_rate": 5.940315936952715e-05, "loss": 1.1783, "num_input_tokens_seen": 93335024, "step": 5801 }, { "epoch": 0.4064199217210854, "grad_norm": 3.56948184967041, "learning_rate": 5.9396161120840647e-05, "loss": 1.035, "num_input_tokens_seen": 93351408, "step": 5802 }, { "epoch": 0.40648996996681463, "grad_norm": 3.9857194423675537, "learning_rate": 5.9389162872154115e-05, "loss": 1.0596, "num_input_tokens_seen": 93367728, "step": 5803 }, { "epoch": 0.4065600182125439, "grad_norm": 5.165848731994629, "learning_rate": 5.93821646234676e-05, "loss": 0.9764, "num_input_tokens_seen": 93384112, "step": 5804 }, { "epoch": 0.40663006645827315, "grad_norm": 3.742520809173584, "learning_rate": 5.9375166374781085e-05, "loss": 1.0802, "num_input_tokens_seen": 93400072, "step": 5805 }, { "epoch": 0.40670011470400236, "grad_norm": 4.13803768157959, "learning_rate": 5.936816812609457e-05, "loss": 1.1845, "num_input_tokens_seen": 93416016, "step": 5806 }, { "epoch": 0.4067701629497316, "grad_norm": 4.530385494232178, "learning_rate": 5.936116987740805e-05, "loss": 1.1034, "num_input_tokens_seen": 93432400, "step": 5807 }, { "epoch": 0.4068402111954609, "grad_norm": 4.162608623504639, "learning_rate": 5.935417162872156e-05, "loss": 1.0239, "num_input_tokens_seen": 93448360, "step": 5808 }, { "epoch": 0.40691025944119014, "grad_norm": 3.5075366497039795, "learning_rate": 5.934717338003504e-05, "loss": 0.8568, "num_input_tokens_seen": 93464744, "step": 5809 }, { "epoch": 0.40698030768691934, "grad_norm": 4.963081359863281, "learning_rate": 5.934017513134851e-05, "loss": 1.028, "num_input_tokens_seen": 93480448, "step": 5810 }, { "epoch": 0.4070503559326486, "grad_norm": 3.83306622505188, "learning_rate": 5.9333176882662e-05, "loss": 1.1558, "num_input_tokens_seen": 93496832, "step": 5811 }, { "epoch": 0.40712040417837786, "grad_norm": 3.878345489501953, "learning_rate": 5.932617863397549e-05, "loss": 0.9784, "num_input_tokens_seen": 93513216, "step": 5812 }, { "epoch": 0.4071904524241071, "grad_norm": 4.1416192054748535, "learning_rate": 5.931918038528897e-05, "loss": 1.0167, "num_input_tokens_seen": 93529504, "step": 5813 }, { "epoch": 0.4072605006698363, "grad_norm": 5.459712028503418, "learning_rate": 5.9312182136602455e-05, "loss": 1.0816, "num_input_tokens_seen": 93545888, "step": 5814 }, { "epoch": 0.4073305489155656, "grad_norm": 5.8356852531433105, "learning_rate": 5.930518388791595e-05, "loss": 1.1543, "num_input_tokens_seen": 93562272, "step": 5815 }, { "epoch": 0.40740059716129484, "grad_norm": 6.23671817779541, "learning_rate": 5.929818563922943e-05, "loss": 1.1929, "num_input_tokens_seen": 93578656, "step": 5816 }, { "epoch": 0.4074706454070241, "grad_norm": 3.6252057552337646, "learning_rate": 5.9291187390542914e-05, "loss": 1.1491, "num_input_tokens_seen": 93594816, "step": 5817 }, { "epoch": 0.40754069365275336, "grad_norm": 4.245891571044922, "learning_rate": 5.9284189141856396e-05, "loss": 1.1624, "num_input_tokens_seen": 93611200, "step": 5818 }, { "epoch": 0.40761074189848256, "grad_norm": 4.052443504333496, "learning_rate": 5.9277190893169885e-05, "loss": 0.9608, "num_input_tokens_seen": 93627080, "step": 5819 }, { "epoch": 0.4076807901442118, "grad_norm": 4.427778720855713, "learning_rate": 5.927019264448336e-05, "loss": 0.9268, "num_input_tokens_seen": 93643464, "step": 5820 }, { "epoch": 0.4077508383899411, "grad_norm": 4.1961541175842285, "learning_rate": 5.926319439579685e-05, "loss": 1.0374, "num_input_tokens_seen": 93659088, "step": 5821 }, { "epoch": 0.40782088663567034, "grad_norm": 4.001824378967285, "learning_rate": 5.9256196147110344e-05, "loss": 1.1674, "num_input_tokens_seen": 93674928, "step": 5822 }, { "epoch": 0.40789093488139955, "grad_norm": 4.010315895080566, "learning_rate": 5.924919789842381e-05, "loss": 0.9092, "num_input_tokens_seen": 93690840, "step": 5823 }, { "epoch": 0.4079609831271288, "grad_norm": 3.771390438079834, "learning_rate": 5.9242199649737294e-05, "loss": 1.1374, "num_input_tokens_seen": 93707224, "step": 5824 }, { "epoch": 0.40803103137285807, "grad_norm": 3.920438051223755, "learning_rate": 5.92352014010508e-05, "loss": 0.9206, "num_input_tokens_seen": 93722536, "step": 5825 }, { "epoch": 0.4081010796185873, "grad_norm": 4.679770469665527, "learning_rate": 5.922820315236428e-05, "loss": 1.0143, "num_input_tokens_seen": 93738736, "step": 5826 }, { "epoch": 0.40817112786431653, "grad_norm": 4.082173824310303, "learning_rate": 5.922120490367776e-05, "loss": 0.9896, "num_input_tokens_seen": 93755120, "step": 5827 }, { "epoch": 0.4082411761100458, "grad_norm": 4.076204776763916, "learning_rate": 5.921420665499124e-05, "loss": 0.9632, "num_input_tokens_seen": 93771504, "step": 5828 }, { "epoch": 0.40831122435577505, "grad_norm": 4.721165180206299, "learning_rate": 5.920720840630474e-05, "loss": 1.237, "num_input_tokens_seen": 93787888, "step": 5829 }, { "epoch": 0.4083812726015043, "grad_norm": 4.313892841339111, "learning_rate": 5.9200210157618205e-05, "loss": 1.1815, "num_input_tokens_seen": 93804272, "step": 5830 }, { "epoch": 0.4084513208472335, "grad_norm": 4.506958961486816, "learning_rate": 5.91932119089317e-05, "loss": 0.8849, "num_input_tokens_seen": 93820416, "step": 5831 }, { "epoch": 0.40852136909296277, "grad_norm": 4.6436991691589355, "learning_rate": 5.9186213660245196e-05, "loss": 0.8856, "num_input_tokens_seen": 93836648, "step": 5832 }, { "epoch": 0.40859141733869203, "grad_norm": 3.9535446166992188, "learning_rate": 5.917921541155868e-05, "loss": 1.1285, "num_input_tokens_seen": 93853032, "step": 5833 }, { "epoch": 0.4086614655844213, "grad_norm": 6.974640846252441, "learning_rate": 5.917221716287216e-05, "loss": 1.0669, "num_input_tokens_seen": 93868640, "step": 5834 }, { "epoch": 0.4087315138301505, "grad_norm": 4.3199262619018555, "learning_rate": 5.916521891418565e-05, "loss": 1.1921, "num_input_tokens_seen": 93885024, "step": 5835 }, { "epoch": 0.40880156207587975, "grad_norm": 3.892812967300415, "learning_rate": 5.915822066549913e-05, "loss": 1.0978, "num_input_tokens_seen": 93901408, "step": 5836 }, { "epoch": 0.408871610321609, "grad_norm": 4.434093952178955, "learning_rate": 5.915122241681261e-05, "loss": 1.1484, "num_input_tokens_seen": 93917632, "step": 5837 }, { "epoch": 0.4089416585673383, "grad_norm": 3.960766553878784, "learning_rate": 5.914422416812611e-05, "loss": 0.994, "num_input_tokens_seen": 93934016, "step": 5838 }, { "epoch": 0.4090117068130675, "grad_norm": 6.450897693634033, "learning_rate": 5.913722591943959e-05, "loss": 1.1364, "num_input_tokens_seen": 93950328, "step": 5839 }, { "epoch": 0.40908175505879674, "grad_norm": 4.3362956047058105, "learning_rate": 5.913022767075307e-05, "loss": 1.0023, "num_input_tokens_seen": 93966712, "step": 5840 }, { "epoch": 0.409151803304526, "grad_norm": 4.250185966491699, "learning_rate": 5.912322942206655e-05, "loss": 0.9535, "num_input_tokens_seen": 93982840, "step": 5841 }, { "epoch": 0.40922185155025526, "grad_norm": 5.399633407592773, "learning_rate": 5.911623117338004e-05, "loss": 1.0504, "num_input_tokens_seen": 93998192, "step": 5842 }, { "epoch": 0.40929189979598446, "grad_norm": 3.926515579223633, "learning_rate": 5.910923292469352e-05, "loss": 1.1711, "num_input_tokens_seen": 94014296, "step": 5843 }, { "epoch": 0.4093619480417137, "grad_norm": 3.9847402572631836, "learning_rate": 5.9102234676007005e-05, "loss": 1.0845, "num_input_tokens_seen": 94030520, "step": 5844 }, { "epoch": 0.409431996287443, "grad_norm": 4.106837272644043, "learning_rate": 5.9095236427320486e-05, "loss": 1.1529, "num_input_tokens_seen": 94046512, "step": 5845 }, { "epoch": 0.40950204453317224, "grad_norm": 5.044075965881348, "learning_rate": 5.908823817863398e-05, "loss": 0.8541, "num_input_tokens_seen": 94061536, "step": 5846 }, { "epoch": 0.40957209277890144, "grad_norm": 7.444840908050537, "learning_rate": 5.908123992994745e-05, "loss": 0.9811, "num_input_tokens_seen": 94077920, "step": 5847 }, { "epoch": 0.4096421410246307, "grad_norm": 4.676487445831299, "learning_rate": 5.907424168126095e-05, "loss": 0.9323, "num_input_tokens_seen": 94094040, "step": 5848 }, { "epoch": 0.40971218927035996, "grad_norm": 3.945162057876587, "learning_rate": 5.9067243432574434e-05, "loss": 1.0101, "num_input_tokens_seen": 94110424, "step": 5849 }, { "epoch": 0.4097822375160892, "grad_norm": 3.9881343841552734, "learning_rate": 5.9060245183887916e-05, "loss": 1.2385, "num_input_tokens_seen": 94126808, "step": 5850 }, { "epoch": 0.4098522857618185, "grad_norm": 3.5962657928466797, "learning_rate": 5.90532469352014e-05, "loss": 0.988, "num_input_tokens_seen": 94143192, "step": 5851 }, { "epoch": 0.4099223340075477, "grad_norm": 5.755387783050537, "learning_rate": 5.904624868651491e-05, "loss": 1.1936, "num_input_tokens_seen": 94158728, "step": 5852 }, { "epoch": 0.40999238225327694, "grad_norm": 4.999855995178223, "learning_rate": 5.9039250437828375e-05, "loss": 1.1235, "num_input_tokens_seen": 94174736, "step": 5853 }, { "epoch": 0.4100624304990062, "grad_norm": 5.452749729156494, "learning_rate": 5.903225218914186e-05, "loss": 1.1477, "num_input_tokens_seen": 94191120, "step": 5854 }, { "epoch": 0.41013247874473546, "grad_norm": 4.211399078369141, "learning_rate": 5.902525394045535e-05, "loss": 0.9808, "num_input_tokens_seen": 94207504, "step": 5855 }, { "epoch": 0.41020252699046467, "grad_norm": 3.3774921894073486, "learning_rate": 5.9018255691768834e-05, "loss": 1.0297, "num_input_tokens_seen": 94223888, "step": 5856 }, { "epoch": 0.4102725752361939, "grad_norm": 6.850539207458496, "learning_rate": 5.9011257443082316e-05, "loss": 1.0313, "num_input_tokens_seen": 94240120, "step": 5857 }, { "epoch": 0.4103426234819232, "grad_norm": 3.2469308376312256, "learning_rate": 5.9004259194395804e-05, "loss": 0.9631, "num_input_tokens_seen": 94256488, "step": 5858 }, { "epoch": 0.41041267172765244, "grad_norm": 5.0675201416015625, "learning_rate": 5.89972609457093e-05, "loss": 0.8961, "num_input_tokens_seen": 94272048, "step": 5859 }, { "epoch": 0.41048271997338165, "grad_norm": 3.8971400260925293, "learning_rate": 5.899026269702277e-05, "loss": 1.2323, "num_input_tokens_seen": 94287984, "step": 5860 }, { "epoch": 0.4105527682191109, "grad_norm": 3.8453164100646973, "learning_rate": 5.898326444833625e-05, "loss": 1.1039, "num_input_tokens_seen": 94303976, "step": 5861 }, { "epoch": 0.41062281646484017, "grad_norm": 3.7564680576324463, "learning_rate": 5.897626619964975e-05, "loss": 1.0977, "num_input_tokens_seen": 94319552, "step": 5862 }, { "epoch": 0.4106928647105694, "grad_norm": 4.541357517242432, "learning_rate": 5.896926795096323e-05, "loss": 1.2374, "num_input_tokens_seen": 94335936, "step": 5863 }, { "epoch": 0.41076291295629863, "grad_norm": 4.841330051422119, "learning_rate": 5.896226970227671e-05, "loss": 0.8555, "num_input_tokens_seen": 94351784, "step": 5864 }, { "epoch": 0.4108329612020279, "grad_norm": 3.9247653484344482, "learning_rate": 5.89552714535902e-05, "loss": 1.0169, "num_input_tokens_seen": 94368080, "step": 5865 }, { "epoch": 0.41090300944775715, "grad_norm": 4.763803958892822, "learning_rate": 5.894827320490368e-05, "loss": 0.9812, "num_input_tokens_seen": 94384056, "step": 5866 }, { "epoch": 0.4109730576934864, "grad_norm": 5.141749858856201, "learning_rate": 5.894127495621716e-05, "loss": 1.0001, "num_input_tokens_seen": 94400440, "step": 5867 }, { "epoch": 0.4110431059392156, "grad_norm": 3.360656261444092, "learning_rate": 5.8934276707530656e-05, "loss": 0.988, "num_input_tokens_seen": 94416824, "step": 5868 }, { "epoch": 0.4111131541849449, "grad_norm": 4.226006507873535, "learning_rate": 5.892727845884415e-05, "loss": 1.0762, "num_input_tokens_seen": 94433208, "step": 5869 }, { "epoch": 0.41118320243067413, "grad_norm": 6.924234390258789, "learning_rate": 5.892028021015762e-05, "loss": 1.1498, "num_input_tokens_seen": 94448848, "step": 5870 }, { "epoch": 0.4112532506764034, "grad_norm": 3.643950939178467, "learning_rate": 5.89132819614711e-05, "loss": 0.9977, "num_input_tokens_seen": 94464720, "step": 5871 }, { "epoch": 0.4113232989221326, "grad_norm": 3.3077268600463867, "learning_rate": 5.890628371278459e-05, "loss": 0.9421, "num_input_tokens_seen": 94481104, "step": 5872 }, { "epoch": 0.41139334716786186, "grad_norm": 3.9556264877319336, "learning_rate": 5.889928546409807e-05, "loss": 0.9313, "num_input_tokens_seen": 94496944, "step": 5873 }, { "epoch": 0.4114633954135911, "grad_norm": 7.0952606201171875, "learning_rate": 5.8892287215411554e-05, "loss": 1.3029, "num_input_tokens_seen": 94511336, "step": 5874 }, { "epoch": 0.4115334436593204, "grad_norm": 3.946803569793701, "learning_rate": 5.888528896672505e-05, "loss": 0.9865, "num_input_tokens_seen": 94526904, "step": 5875 }, { "epoch": 0.4116034919050496, "grad_norm": 4.165796756744385, "learning_rate": 5.8878290718038545e-05, "loss": 1.2249, "num_input_tokens_seen": 94543024, "step": 5876 }, { "epoch": 0.41167354015077884, "grad_norm": 3.9492764472961426, "learning_rate": 5.887129246935201e-05, "loss": 1.0304, "num_input_tokens_seen": 94558744, "step": 5877 }, { "epoch": 0.4117435883965081, "grad_norm": 4.144934177398682, "learning_rate": 5.8864294220665495e-05, "loss": 1.0468, "num_input_tokens_seen": 94574288, "step": 5878 }, { "epoch": 0.41181363664223736, "grad_norm": 3.7620224952697754, "learning_rate": 5.8857295971979e-05, "loss": 0.929, "num_input_tokens_seen": 94590672, "step": 5879 }, { "epoch": 0.41188368488796656, "grad_norm": 4.788266181945801, "learning_rate": 5.885029772329247e-05, "loss": 1.0106, "num_input_tokens_seen": 94606792, "step": 5880 }, { "epoch": 0.4119537331336958, "grad_norm": 4.383455276489258, "learning_rate": 5.8843299474605954e-05, "loss": 0.9611, "num_input_tokens_seen": 94623176, "step": 5881 }, { "epoch": 0.4120237813794251, "grad_norm": 7.097373962402344, "learning_rate": 5.883630122591944e-05, "loss": 1.1559, "num_input_tokens_seen": 94638952, "step": 5882 }, { "epoch": 0.41209382962515434, "grad_norm": 5.4228901863098145, "learning_rate": 5.882930297723294e-05, "loss": 1.2569, "num_input_tokens_seen": 94655224, "step": 5883 }, { "epoch": 0.41216387787088354, "grad_norm": 3.792999267578125, "learning_rate": 5.882230472854642e-05, "loss": 0.8853, "num_input_tokens_seen": 94671608, "step": 5884 }, { "epoch": 0.4122339261166128, "grad_norm": 5.401544094085693, "learning_rate": 5.88153064798599e-05, "loss": 1.0763, "num_input_tokens_seen": 94687048, "step": 5885 }, { "epoch": 0.41230397436234206, "grad_norm": 3.2229812145233154, "learning_rate": 5.880830823117339e-05, "loss": 0.9375, "num_input_tokens_seen": 94703432, "step": 5886 }, { "epoch": 0.4123740226080713, "grad_norm": 4.5977277755737305, "learning_rate": 5.8801309982486865e-05, "loss": 1.1245, "num_input_tokens_seen": 94719152, "step": 5887 }, { "epoch": 0.4124440708538006, "grad_norm": 3.436765670776367, "learning_rate": 5.879431173380035e-05, "loss": 0.8353, "num_input_tokens_seen": 94735536, "step": 5888 }, { "epoch": 0.4125141190995298, "grad_norm": 3.4720351696014404, "learning_rate": 5.878731348511385e-05, "loss": 1.0329, "num_input_tokens_seen": 94751840, "step": 5889 }, { "epoch": 0.41258416734525905, "grad_norm": 3.621783971786499, "learning_rate": 5.878031523642732e-05, "loss": 1.0355, "num_input_tokens_seen": 94768224, "step": 5890 }, { "epoch": 0.4126542155909883, "grad_norm": 4.453585624694824, "learning_rate": 5.87733169877408e-05, "loss": 1.1171, "num_input_tokens_seen": 94783672, "step": 5891 }, { "epoch": 0.41272426383671756, "grad_norm": 4.191892147064209, "learning_rate": 5.8766318739054294e-05, "loss": 0.9505, "num_input_tokens_seen": 94800056, "step": 5892 }, { "epoch": 0.41279431208244677, "grad_norm": 3.5963308811187744, "learning_rate": 5.875932049036778e-05, "loss": 1.0223, "num_input_tokens_seen": 94816440, "step": 5893 }, { "epoch": 0.412864360328176, "grad_norm": 3.904618978500366, "learning_rate": 5.8752322241681265e-05, "loss": 1.0205, "num_input_tokens_seen": 94832824, "step": 5894 }, { "epoch": 0.4129344085739053, "grad_norm": 3.650961399078369, "learning_rate": 5.8745323992994747e-05, "loss": 0.9613, "num_input_tokens_seen": 94849208, "step": 5895 }, { "epoch": 0.41300445681963455, "grad_norm": 4.438238143920898, "learning_rate": 5.873832574430824e-05, "loss": 0.9865, "num_input_tokens_seen": 94865592, "step": 5896 }, { "epoch": 0.41307450506536375, "grad_norm": 3.6906070709228516, "learning_rate": 5.873132749562171e-05, "loss": 1.1228, "num_input_tokens_seen": 94881976, "step": 5897 }, { "epoch": 0.413144553311093, "grad_norm": 5.111722469329834, "learning_rate": 5.872432924693519e-05, "loss": 1.2678, "num_input_tokens_seen": 94898360, "step": 5898 }, { "epoch": 0.41321460155682227, "grad_norm": 3.7190804481506348, "learning_rate": 5.871733099824869e-05, "loss": 1.152, "num_input_tokens_seen": 94914512, "step": 5899 }, { "epoch": 0.41328464980255153, "grad_norm": 3.779918670654297, "learning_rate": 5.871033274956218e-05, "loss": 0.905, "num_input_tokens_seen": 94930816, "step": 5900 }, { "epoch": 0.41335469804828073, "grad_norm": 3.5921852588653564, "learning_rate": 5.8703334500875665e-05, "loss": 0.9119, "num_input_tokens_seen": 94947200, "step": 5901 }, { "epoch": 0.41342474629401, "grad_norm": 4.024271011352539, "learning_rate": 5.8696336252189146e-05, "loss": 1.048, "num_input_tokens_seen": 94963584, "step": 5902 }, { "epoch": 0.41349479453973925, "grad_norm": 4.798417091369629, "learning_rate": 5.8689338003502635e-05, "loss": 1.1399, "num_input_tokens_seen": 94979968, "step": 5903 }, { "epoch": 0.4135648427854685, "grad_norm": 3.5821495056152344, "learning_rate": 5.868233975481612e-05, "loss": 1.015, "num_input_tokens_seen": 94996320, "step": 5904 }, { "epoch": 0.4136348910311977, "grad_norm": 3.998082399368286, "learning_rate": 5.867534150612959e-05, "loss": 0.9569, "num_input_tokens_seen": 95012704, "step": 5905 }, { "epoch": 0.413704939276927, "grad_norm": 3.6389498710632324, "learning_rate": 5.8668343257443094e-05, "loss": 1.1562, "num_input_tokens_seen": 95029088, "step": 5906 }, { "epoch": 0.41377498752265623, "grad_norm": 4.137228012084961, "learning_rate": 5.8661345008756576e-05, "loss": 1.1683, "num_input_tokens_seen": 95045472, "step": 5907 }, { "epoch": 0.4138450357683855, "grad_norm": 4.181145668029785, "learning_rate": 5.865434676007006e-05, "loss": 0.993, "num_input_tokens_seen": 95061856, "step": 5908 }, { "epoch": 0.4139150840141147, "grad_norm": 3.759474754333496, "learning_rate": 5.864734851138354e-05, "loss": 1.0144, "num_input_tokens_seen": 95077904, "step": 5909 }, { "epoch": 0.41398513225984396, "grad_norm": 4.474549293518066, "learning_rate": 5.864035026269703e-05, "loss": 1.1008, "num_input_tokens_seen": 95094288, "step": 5910 }, { "epoch": 0.4140551805055732, "grad_norm": 7.399059295654297, "learning_rate": 5.863335201401051e-05, "loss": 1.2234, "num_input_tokens_seen": 95107872, "step": 5911 }, { "epoch": 0.4141252287513025, "grad_norm": 4.018132209777832, "learning_rate": 5.862635376532399e-05, "loss": 0.9736, "num_input_tokens_seen": 95124256, "step": 5912 }, { "epoch": 0.4141952769970317, "grad_norm": 3.825305223464966, "learning_rate": 5.861935551663749e-05, "loss": 0.9833, "num_input_tokens_seen": 95139784, "step": 5913 }, { "epoch": 0.41426532524276094, "grad_norm": 3.7942214012145996, "learning_rate": 5.861235726795097e-05, "loss": 1.0999, "num_input_tokens_seen": 95155136, "step": 5914 }, { "epoch": 0.4143353734884902, "grad_norm": 3.9006733894348145, "learning_rate": 5.860535901926444e-05, "loss": 1.0608, "num_input_tokens_seen": 95171520, "step": 5915 }, { "epoch": 0.41440542173421946, "grad_norm": 3.688754081726074, "learning_rate": 5.859836077057794e-05, "loss": 1.1338, "num_input_tokens_seen": 95187752, "step": 5916 }, { "epoch": 0.41447546997994866, "grad_norm": 3.954989433288574, "learning_rate": 5.859136252189142e-05, "loss": 0.8965, "num_input_tokens_seen": 95203864, "step": 5917 }, { "epoch": 0.4145455182256779, "grad_norm": 3.733185052871704, "learning_rate": 5.85843642732049e-05, "loss": 0.9872, "num_input_tokens_seen": 95220248, "step": 5918 }, { "epoch": 0.4146155664714072, "grad_norm": 4.39019775390625, "learning_rate": 5.8577366024518385e-05, "loss": 1.0452, "num_input_tokens_seen": 95236632, "step": 5919 }, { "epoch": 0.41468561471713644, "grad_norm": 3.716066837310791, "learning_rate": 5.857036777583188e-05, "loss": 0.9231, "num_input_tokens_seen": 95251888, "step": 5920 }, { "epoch": 0.4147556629628657, "grad_norm": 3.7525405883789062, "learning_rate": 5.856336952714536e-05, "loss": 1.1147, "num_input_tokens_seen": 95268272, "step": 5921 }, { "epoch": 0.4148257112085949, "grad_norm": 3.605818033218384, "learning_rate": 5.855637127845886e-05, "loss": 0.9874, "num_input_tokens_seen": 95283880, "step": 5922 }, { "epoch": 0.41489575945432416, "grad_norm": 3.878814458847046, "learning_rate": 5.854937302977234e-05, "loss": 1.1556, "num_input_tokens_seen": 95300264, "step": 5923 }, { "epoch": 0.4149658077000534, "grad_norm": 4.94001579284668, "learning_rate": 5.854237478108582e-05, "loss": 1.0014, "num_input_tokens_seen": 95316648, "step": 5924 }, { "epoch": 0.4150358559457827, "grad_norm": 4.213568210601807, "learning_rate": 5.85353765323993e-05, "loss": 1.2315, "num_input_tokens_seen": 95332880, "step": 5925 }, { "epoch": 0.4151059041915119, "grad_norm": 4.2475996017456055, "learning_rate": 5.8528378283712784e-05, "loss": 1.0717, "num_input_tokens_seen": 95349176, "step": 5926 }, { "epoch": 0.41517595243724115, "grad_norm": 4.237911224365234, "learning_rate": 5.852138003502627e-05, "loss": 0.93, "num_input_tokens_seen": 95365560, "step": 5927 }, { "epoch": 0.4152460006829704, "grad_norm": 3.695140838623047, "learning_rate": 5.8514381786339755e-05, "loss": 0.9561, "num_input_tokens_seen": 95381280, "step": 5928 }, { "epoch": 0.41531604892869967, "grad_norm": 4.457770347595215, "learning_rate": 5.850738353765325e-05, "loss": 1.0541, "num_input_tokens_seen": 95397664, "step": 5929 }, { "epoch": 0.41538609717442887, "grad_norm": 4.237982273101807, "learning_rate": 5.850038528896673e-05, "loss": 1.0123, "num_input_tokens_seen": 95414048, "step": 5930 }, { "epoch": 0.41545614542015813, "grad_norm": 3.5690579414367676, "learning_rate": 5.8493387040280214e-05, "loss": 0.9788, "num_input_tokens_seen": 95430432, "step": 5931 }, { "epoch": 0.4155261936658874, "grad_norm": 4.665618419647217, "learning_rate": 5.8486388791593696e-05, "loss": 1.0138, "num_input_tokens_seen": 95446816, "step": 5932 }, { "epoch": 0.41559624191161665, "grad_norm": 3.76755952835083, "learning_rate": 5.8479390542907184e-05, "loss": 0.9853, "num_input_tokens_seen": 95463200, "step": 5933 }, { "epoch": 0.41566629015734585, "grad_norm": 4.855432033538818, "learning_rate": 5.8472392294220666e-05, "loss": 1.1747, "num_input_tokens_seen": 95478704, "step": 5934 }, { "epoch": 0.4157363384030751, "grad_norm": 4.860055446624756, "learning_rate": 5.846539404553415e-05, "loss": 0.9747, "num_input_tokens_seen": 95495088, "step": 5935 }, { "epoch": 0.41580638664880437, "grad_norm": 4.268356800079346, "learning_rate": 5.845839579684763e-05, "loss": 1.1564, "num_input_tokens_seen": 95511472, "step": 5936 }, { "epoch": 0.41587643489453363, "grad_norm": 7.423181533813477, "learning_rate": 5.8451397548161125e-05, "loss": 1.0369, "num_input_tokens_seen": 95527008, "step": 5937 }, { "epoch": 0.41594648314026283, "grad_norm": 5.740126609802246, "learning_rate": 5.844439929947461e-05, "loss": 1.1188, "num_input_tokens_seen": 95542536, "step": 5938 }, { "epoch": 0.4160165313859921, "grad_norm": 5.135944366455078, "learning_rate": 5.8437401050788096e-05, "loss": 1.1815, "num_input_tokens_seen": 95558688, "step": 5939 }, { "epoch": 0.41608657963172135, "grad_norm": 3.879530191421509, "learning_rate": 5.843040280210158e-05, "loss": 1.162, "num_input_tokens_seen": 95575064, "step": 5940 }, { "epoch": 0.4161566278774506, "grad_norm": 4.096410274505615, "learning_rate": 5.842340455341506e-05, "loss": 1.0883, "num_input_tokens_seen": 95591272, "step": 5941 }, { "epoch": 0.4162266761231798, "grad_norm": 4.095829486846924, "learning_rate": 5.841640630472854e-05, "loss": 1.0516, "num_input_tokens_seen": 95607656, "step": 5942 }, { "epoch": 0.4162967243689091, "grad_norm": 4.076023101806641, "learning_rate": 5.840940805604205e-05, "loss": 1.0361, "num_input_tokens_seen": 95623384, "step": 5943 }, { "epoch": 0.41636677261463834, "grad_norm": 4.08365535736084, "learning_rate": 5.840240980735553e-05, "loss": 1.1102, "num_input_tokens_seen": 95639768, "step": 5944 }, { "epoch": 0.4164368208603676, "grad_norm": 4.182791233062744, "learning_rate": 5.8395411558669e-05, "loss": 1.0212, "num_input_tokens_seen": 95655928, "step": 5945 }, { "epoch": 0.4165068691060968, "grad_norm": 4.3107428550720215, "learning_rate": 5.8388413309982495e-05, "loss": 0.8607, "num_input_tokens_seen": 95672312, "step": 5946 }, { "epoch": 0.41657691735182606, "grad_norm": 3.7357101440429688, "learning_rate": 5.838141506129598e-05, "loss": 1.0563, "num_input_tokens_seen": 95688696, "step": 5947 }, { "epoch": 0.4166469655975553, "grad_norm": 3.9959046840667725, "learning_rate": 5.837441681260946e-05, "loss": 1.1043, "num_input_tokens_seen": 95704304, "step": 5948 }, { "epoch": 0.4167170138432846, "grad_norm": 4.395400524139404, "learning_rate": 5.836741856392295e-05, "loss": 1.1087, "num_input_tokens_seen": 95720688, "step": 5949 }, { "epoch": 0.4167870620890138, "grad_norm": 4.998651027679443, "learning_rate": 5.836042031523644e-05, "loss": 1.1903, "num_input_tokens_seen": 95737072, "step": 5950 }, { "epoch": 0.41685711033474304, "grad_norm": 4.060539245605469, "learning_rate": 5.8353422066549925e-05, "loss": 1.033, "num_input_tokens_seen": 95753456, "step": 5951 }, { "epoch": 0.4169271585804723, "grad_norm": 5.286706924438477, "learning_rate": 5.834642381786339e-05, "loss": 1.0968, "num_input_tokens_seen": 95769840, "step": 5952 }, { "epoch": 0.41699720682620156, "grad_norm": 4.501932144165039, "learning_rate": 5.833942556917689e-05, "loss": 1.2626, "num_input_tokens_seen": 95786224, "step": 5953 }, { "epoch": 0.41706725507193076, "grad_norm": 5.144174575805664, "learning_rate": 5.833242732049038e-05, "loss": 0.8148, "num_input_tokens_seen": 95802608, "step": 5954 }, { "epoch": 0.41713730331766, "grad_norm": 3.6604678630828857, "learning_rate": 5.832542907180385e-05, "loss": 0.9467, "num_input_tokens_seen": 95818992, "step": 5955 }, { "epoch": 0.4172073515633893, "grad_norm": 5.387998104095459, "learning_rate": 5.831843082311734e-05, "loss": 1.181, "num_input_tokens_seen": 95835376, "step": 5956 }, { "epoch": 0.41727739980911854, "grad_norm": 4.576782703399658, "learning_rate": 5.831143257443082e-05, "loss": 1.0577, "num_input_tokens_seen": 95851760, "step": 5957 }, { "epoch": 0.4173474480548478, "grad_norm": 5.737542629241943, "learning_rate": 5.8304434325744304e-05, "loss": 1.039, "num_input_tokens_seen": 95868144, "step": 5958 }, { "epoch": 0.417417496300577, "grad_norm": 3.515028238296509, "learning_rate": 5.8297436077057786e-05, "loss": 0.9503, "num_input_tokens_seen": 95884528, "step": 5959 }, { "epoch": 0.41748754454630627, "grad_norm": 3.9339003562927246, "learning_rate": 5.8290437828371295e-05, "loss": 0.928, "num_input_tokens_seen": 95900688, "step": 5960 }, { "epoch": 0.4175575927920355, "grad_norm": 3.896474838256836, "learning_rate": 5.828343957968477e-05, "loss": 1.1249, "num_input_tokens_seen": 95917072, "step": 5961 }, { "epoch": 0.4176276410377648, "grad_norm": 5.101248264312744, "learning_rate": 5.8276441330998245e-05, "loss": 1.0097, "num_input_tokens_seen": 95931976, "step": 5962 }, { "epoch": 0.417697689283494, "grad_norm": 7.063873291015625, "learning_rate": 5.8269443082311734e-05, "loss": 1.1686, "num_input_tokens_seen": 95948360, "step": 5963 }, { "epoch": 0.41776773752922325, "grad_norm": 3.564887762069702, "learning_rate": 5.826244483362523e-05, "loss": 1.1375, "num_input_tokens_seen": 95964744, "step": 5964 }, { "epoch": 0.4178377857749525, "grad_norm": 3.821101427078247, "learning_rate": 5.82554465849387e-05, "loss": 1.252, "num_input_tokens_seen": 95980760, "step": 5965 }, { "epoch": 0.41790783402068177, "grad_norm": 3.609252691268921, "learning_rate": 5.824844833625219e-05, "loss": 1.0633, "num_input_tokens_seen": 95997144, "step": 5966 }, { "epoch": 0.41797788226641097, "grad_norm": 4.1750874519348145, "learning_rate": 5.824145008756569e-05, "loss": 1.119, "num_input_tokens_seen": 96012872, "step": 5967 }, { "epoch": 0.41804793051214023, "grad_norm": 3.776747465133667, "learning_rate": 5.823445183887917e-05, "loss": 0.9792, "num_input_tokens_seen": 96029168, "step": 5968 }, { "epoch": 0.4181179787578695, "grad_norm": 8.143741607666016, "learning_rate": 5.822745359019264e-05, "loss": 1.0686, "num_input_tokens_seen": 96045376, "step": 5969 }, { "epoch": 0.41818802700359875, "grad_norm": 4.336330890655518, "learning_rate": 5.822045534150614e-05, "loss": 1.0597, "num_input_tokens_seen": 96060904, "step": 5970 }, { "epoch": 0.41825807524932795, "grad_norm": 3.731605052947998, "learning_rate": 5.821345709281962e-05, "loss": 1.1302, "num_input_tokens_seen": 96076824, "step": 5971 }, { "epoch": 0.4183281234950572, "grad_norm": 3.8380699157714844, "learning_rate": 5.82064588441331e-05, "loss": 1.1711, "num_input_tokens_seen": 96092616, "step": 5972 }, { "epoch": 0.4183981717407865, "grad_norm": 3.9088358879089355, "learning_rate": 5.8199460595446586e-05, "loss": 1.1622, "num_input_tokens_seen": 96109000, "step": 5973 }, { "epoch": 0.41846821998651573, "grad_norm": 4.0047783851623535, "learning_rate": 5.819246234676008e-05, "loss": 1.0949, "num_input_tokens_seen": 96125344, "step": 5974 }, { "epoch": 0.41853826823224494, "grad_norm": 3.936495542526245, "learning_rate": 5.818546409807356e-05, "loss": 1.1566, "num_input_tokens_seen": 96141536, "step": 5975 }, { "epoch": 0.4186083164779742, "grad_norm": 3.8510451316833496, "learning_rate": 5.8178465849387045e-05, "loss": 1.0454, "num_input_tokens_seen": 96157920, "step": 5976 }, { "epoch": 0.41867836472370346, "grad_norm": 3.5825259685516357, "learning_rate": 5.817146760070053e-05, "loss": 1.0491, "num_input_tokens_seen": 96173848, "step": 5977 }, { "epoch": 0.4187484129694327, "grad_norm": 4.0553717613220215, "learning_rate": 5.8164469352014015e-05, "loss": 1.0669, "num_input_tokens_seen": 96190232, "step": 5978 }, { "epoch": 0.4188184612151619, "grad_norm": 4.085362434387207, "learning_rate": 5.815747110332749e-05, "loss": 1.2832, "num_input_tokens_seen": 96205896, "step": 5979 }, { "epoch": 0.4188885094608912, "grad_norm": 6.552733421325684, "learning_rate": 5.815047285464098e-05, "loss": 1.1183, "num_input_tokens_seen": 96221784, "step": 5980 }, { "epoch": 0.41895855770662044, "grad_norm": 4.052005290985107, "learning_rate": 5.8143474605954474e-05, "loss": 1.0309, "num_input_tokens_seen": 96237472, "step": 5981 }, { "epoch": 0.4190286059523497, "grad_norm": 3.9679994583129883, "learning_rate": 5.813647635726794e-05, "loss": 1.0298, "num_input_tokens_seen": 96253616, "step": 5982 }, { "epoch": 0.4190986541980789, "grad_norm": 4.879584312438965, "learning_rate": 5.812947810858144e-05, "loss": 0.9334, "num_input_tokens_seen": 96270000, "step": 5983 }, { "epoch": 0.41916870244380816, "grad_norm": 4.894060134887695, "learning_rate": 5.8122479859894926e-05, "loss": 0.9829, "num_input_tokens_seen": 96286384, "step": 5984 }, { "epoch": 0.4192387506895374, "grad_norm": 3.9925336837768555, "learning_rate": 5.811548161120841e-05, "loss": 1.0285, "num_input_tokens_seen": 96302200, "step": 5985 }, { "epoch": 0.4193087989352667, "grad_norm": 4.043905258178711, "learning_rate": 5.810848336252189e-05, "loss": 1.0217, "num_input_tokens_seen": 96318584, "step": 5986 }, { "epoch": 0.4193788471809959, "grad_norm": 4.216322422027588, "learning_rate": 5.8101485113835385e-05, "loss": 0.9483, "num_input_tokens_seen": 96334720, "step": 5987 }, { "epoch": 0.41944889542672514, "grad_norm": 3.772749900817871, "learning_rate": 5.809448686514887e-05, "loss": 1.272, "num_input_tokens_seen": 96351104, "step": 5988 }, { "epoch": 0.4195189436724544, "grad_norm": 3.6716036796569824, "learning_rate": 5.8087488616462335e-05, "loss": 1.1796, "num_input_tokens_seen": 96367488, "step": 5989 }, { "epoch": 0.41958899191818366, "grad_norm": 3.9748408794403076, "learning_rate": 5.8080490367775844e-05, "loss": 1.0994, "num_input_tokens_seen": 96383872, "step": 5990 }, { "epoch": 0.4196590401639129, "grad_norm": 5.4619269371032715, "learning_rate": 5.8073492119089326e-05, "loss": 1.0553, "num_input_tokens_seen": 96400256, "step": 5991 }, { "epoch": 0.4197290884096421, "grad_norm": 3.4772391319274902, "learning_rate": 5.806649387040281e-05, "loss": 1.0543, "num_input_tokens_seen": 96416640, "step": 5992 }, { "epoch": 0.4197991366553714, "grad_norm": 4.003359794616699, "learning_rate": 5.805949562171629e-05, "loss": 0.9306, "num_input_tokens_seen": 96432872, "step": 5993 }, { "epoch": 0.41986918490110064, "grad_norm": 3.433760166168213, "learning_rate": 5.805249737302979e-05, "loss": 1.0197, "num_input_tokens_seen": 96449256, "step": 5994 }, { "epoch": 0.4199392331468299, "grad_norm": 4.519425868988037, "learning_rate": 5.804549912434326e-05, "loss": 1.0303, "num_input_tokens_seen": 96465456, "step": 5995 }, { "epoch": 0.4200092813925591, "grad_norm": 3.8798038959503174, "learning_rate": 5.803850087565674e-05, "loss": 1.1426, "num_input_tokens_seen": 96481840, "step": 5996 }, { "epoch": 0.42007932963828837, "grad_norm": 7.4741058349609375, "learning_rate": 5.803150262697024e-05, "loss": 1.0759, "num_input_tokens_seen": 96497160, "step": 5997 }, { "epoch": 0.4201493778840176, "grad_norm": 3.6269989013671875, "learning_rate": 5.802450437828372e-05, "loss": 1.0676, "num_input_tokens_seen": 96512672, "step": 5998 }, { "epoch": 0.4202194261297469, "grad_norm": 3.6369056701660156, "learning_rate": 5.80175061295972e-05, "loss": 0.9351, "num_input_tokens_seen": 96529056, "step": 5999 }, { "epoch": 0.4202894743754761, "grad_norm": 6.0609564781188965, "learning_rate": 5.801050788091069e-05, "loss": 1.3902, "num_input_tokens_seen": 96545320, "step": 6000 }, { "epoch": 0.4202894743754761, "eval_loss": 1.1260257959365845, "eval_runtime": 0.1972, "eval_samples_per_second": 5.072, "eval_steps_per_second": 5.072, "num_input_tokens_seen": 96545320, "step": 6000 }, { "epoch": 0.42035952262120535, "grad_norm": 3.939091682434082, "learning_rate": 5.800350963222417e-05, "loss": 1.2935, "num_input_tokens_seen": 96561704, "step": 6001 }, { "epoch": 0.4204295708669346, "grad_norm": 4.907895565032959, "learning_rate": 5.799651138353765e-05, "loss": 0.9923, "num_input_tokens_seen": 96578088, "step": 6002 }, { "epoch": 0.42049961911266387, "grad_norm": 4.598423480987549, "learning_rate": 5.7989513134851135e-05, "loss": 0.9852, "num_input_tokens_seen": 96594392, "step": 6003 }, { "epoch": 0.4205696673583931, "grad_norm": 4.8221540451049805, "learning_rate": 5.7982514886164644e-05, "loss": 1.2558, "num_input_tokens_seen": 96609688, "step": 6004 }, { "epoch": 0.42063971560412233, "grad_norm": 6.331230163574219, "learning_rate": 5.797551663747811e-05, "loss": 0.943, "num_input_tokens_seen": 96625480, "step": 6005 }, { "epoch": 0.4207097638498516, "grad_norm": 4.262217044830322, "learning_rate": 5.7968518388791594e-05, "loss": 1.1607, "num_input_tokens_seen": 96641040, "step": 6006 }, { "epoch": 0.42077981209558085, "grad_norm": 4.552499294281006, "learning_rate": 5.796152014010508e-05, "loss": 0.9532, "num_input_tokens_seen": 96657424, "step": 6007 }, { "epoch": 0.42084986034131006, "grad_norm": 3.414970874786377, "learning_rate": 5.7954521891418564e-05, "loss": 0.8991, "num_input_tokens_seen": 96673792, "step": 6008 }, { "epoch": 0.4209199085870393, "grad_norm": 3.70623517036438, "learning_rate": 5.7947523642732046e-05, "loss": 1.167, "num_input_tokens_seen": 96690176, "step": 6009 }, { "epoch": 0.4209899568327686, "grad_norm": 4.370288848876953, "learning_rate": 5.794052539404554e-05, "loss": 1.0194, "num_input_tokens_seen": 96706272, "step": 6010 }, { "epoch": 0.42106000507849783, "grad_norm": 3.4775140285491943, "learning_rate": 5.793352714535904e-05, "loss": 0.9383, "num_input_tokens_seen": 96722312, "step": 6011 }, { "epoch": 0.42113005332422704, "grad_norm": 3.9860763549804688, "learning_rate": 5.7926528896672505e-05, "loss": 1.0999, "num_input_tokens_seen": 96737040, "step": 6012 }, { "epoch": 0.4212001015699563, "grad_norm": 5.0287933349609375, "learning_rate": 5.791953064798599e-05, "loss": 0.9892, "num_input_tokens_seen": 96753424, "step": 6013 }, { "epoch": 0.42127014981568556, "grad_norm": 3.821143627166748, "learning_rate": 5.791253239929949e-05, "loss": 1.0388, "num_input_tokens_seen": 96769552, "step": 6014 }, { "epoch": 0.4213401980614148, "grad_norm": 4.180905818939209, "learning_rate": 5.7905534150612964e-05, "loss": 1.1118, "num_input_tokens_seen": 96785936, "step": 6015 }, { "epoch": 0.421410246307144, "grad_norm": 4.334224224090576, "learning_rate": 5.7898535901926446e-05, "loss": 1.3425, "num_input_tokens_seen": 96802320, "step": 6016 }, { "epoch": 0.4214802945528733, "grad_norm": 4.317337989807129, "learning_rate": 5.7891537653239934e-05, "loss": 1.0218, "num_input_tokens_seen": 96818360, "step": 6017 }, { "epoch": 0.42155034279860254, "grad_norm": 3.789919376373291, "learning_rate": 5.788453940455343e-05, "loss": 1.0815, "num_input_tokens_seen": 96833928, "step": 6018 }, { "epoch": 0.4216203910443318, "grad_norm": 4.240170001983643, "learning_rate": 5.78775411558669e-05, "loss": 1.0818, "num_input_tokens_seen": 96850312, "step": 6019 }, { "epoch": 0.421690439290061, "grad_norm": 5.163384914398193, "learning_rate": 5.7870542907180393e-05, "loss": 1.061, "num_input_tokens_seen": 96866696, "step": 6020 }, { "epoch": 0.42176048753579026, "grad_norm": 3.653265953063965, "learning_rate": 5.786354465849388e-05, "loss": 0.8955, "num_input_tokens_seen": 96883080, "step": 6021 }, { "epoch": 0.4218305357815195, "grad_norm": 3.4269649982452393, "learning_rate": 5.785654640980736e-05, "loss": 1.009, "num_input_tokens_seen": 96899176, "step": 6022 }, { "epoch": 0.4219005840272488, "grad_norm": 5.8838276863098145, "learning_rate": 5.784954816112084e-05, "loss": 1.0385, "num_input_tokens_seen": 96914576, "step": 6023 }, { "epoch": 0.42197063227297804, "grad_norm": 4.201550006866455, "learning_rate": 5.784254991243433e-05, "loss": 1.0398, "num_input_tokens_seen": 96930808, "step": 6024 }, { "epoch": 0.42204068051870725, "grad_norm": 3.961399793624878, "learning_rate": 5.783555166374781e-05, "loss": 1.0815, "num_input_tokens_seen": 96947192, "step": 6025 }, { "epoch": 0.4221107287644365, "grad_norm": 4.811456680297852, "learning_rate": 5.782855341506129e-05, "loss": 1.0576, "num_input_tokens_seen": 96961896, "step": 6026 }, { "epoch": 0.42218077701016576, "grad_norm": 3.6154356002807617, "learning_rate": 5.7821555166374787e-05, "loss": 0.9678, "num_input_tokens_seen": 96977656, "step": 6027 }, { "epoch": 0.422250825255895, "grad_norm": 3.787724256515503, "learning_rate": 5.7814556917688275e-05, "loss": 1.0813, "num_input_tokens_seen": 96993936, "step": 6028 }, { "epoch": 0.4223208735016242, "grad_norm": 4.215615272521973, "learning_rate": 5.780755866900175e-05, "loss": 1.2758, "num_input_tokens_seen": 97010320, "step": 6029 }, { "epoch": 0.4223909217473535, "grad_norm": 3.9257047176361084, "learning_rate": 5.780056042031524e-05, "loss": 0.9753, "num_input_tokens_seen": 97026704, "step": 6030 }, { "epoch": 0.42246096999308275, "grad_norm": 3.5415945053100586, "learning_rate": 5.7793562171628734e-05, "loss": 1.0718, "num_input_tokens_seen": 97043088, "step": 6031 }, { "epoch": 0.422531018238812, "grad_norm": 4.213465213775635, "learning_rate": 5.77865639229422e-05, "loss": 1.0011, "num_input_tokens_seen": 97059472, "step": 6032 }, { "epoch": 0.4226010664845412, "grad_norm": 3.8070178031921387, "learning_rate": 5.7779565674255684e-05, "loss": 0.881, "num_input_tokens_seen": 97074712, "step": 6033 }, { "epoch": 0.42267111473027047, "grad_norm": 3.8083109855651855, "learning_rate": 5.777256742556918e-05, "loss": 1.0003, "num_input_tokens_seen": 97091096, "step": 6034 }, { "epoch": 0.42274116297599973, "grad_norm": 3.491002082824707, "learning_rate": 5.7765569176882675e-05, "loss": 1.0276, "num_input_tokens_seen": 97107304, "step": 6035 }, { "epoch": 0.422811211221729, "grad_norm": 4.1060919761657715, "learning_rate": 5.775857092819616e-05, "loss": 0.979, "num_input_tokens_seen": 97123688, "step": 6036 }, { "epoch": 0.4228812594674582, "grad_norm": 3.8975484371185303, "learning_rate": 5.775157267950964e-05, "loss": 1.0906, "num_input_tokens_seen": 97140008, "step": 6037 }, { "epoch": 0.42295130771318745, "grad_norm": 4.4457197189331055, "learning_rate": 5.774457443082313e-05, "loss": 1.0763, "num_input_tokens_seen": 97156392, "step": 6038 }, { "epoch": 0.4230213559589167, "grad_norm": 3.5186471939086914, "learning_rate": 5.77375761821366e-05, "loss": 1.0242, "num_input_tokens_seen": 97172776, "step": 6039 }, { "epoch": 0.42309140420464597, "grad_norm": 3.729041814804077, "learning_rate": 5.7730577933450084e-05, "loss": 1.0272, "num_input_tokens_seen": 97189160, "step": 6040 }, { "epoch": 0.4231614524503752, "grad_norm": 4.501081466674805, "learning_rate": 5.7723579684763586e-05, "loss": 0.9879, "num_input_tokens_seen": 97205544, "step": 6041 }, { "epoch": 0.42323150069610443, "grad_norm": 5.922353744506836, "learning_rate": 5.771658143607707e-05, "loss": 1.1519, "num_input_tokens_seen": 97221928, "step": 6042 }, { "epoch": 0.4233015489418337, "grad_norm": 3.649948835372925, "learning_rate": 5.770958318739055e-05, "loss": 0.9467, "num_input_tokens_seen": 97238048, "step": 6043 }, { "epoch": 0.42337159718756295, "grad_norm": 4.660130977630615, "learning_rate": 5.770258493870403e-05, "loss": 1.0903, "num_input_tokens_seen": 97254272, "step": 6044 }, { "epoch": 0.42344164543329216, "grad_norm": 4.064535140991211, "learning_rate": 5.769558669001752e-05, "loss": 1.2646, "num_input_tokens_seen": 97270656, "step": 6045 }, { "epoch": 0.4235116936790214, "grad_norm": 3.931034803390503, "learning_rate": 5.7688588441331e-05, "loss": 0.9511, "num_input_tokens_seen": 97287040, "step": 6046 }, { "epoch": 0.4235817419247507, "grad_norm": 3.920013427734375, "learning_rate": 5.7681590192644484e-05, "loss": 1.1886, "num_input_tokens_seen": 97302784, "step": 6047 }, { "epoch": 0.42365179017047994, "grad_norm": 3.356661319732666, "learning_rate": 5.767459194395798e-05, "loss": 0.9283, "num_input_tokens_seen": 97319168, "step": 6048 }, { "epoch": 0.42372183841620914, "grad_norm": 4.33698034286499, "learning_rate": 5.766759369527145e-05, "loss": 1.0689, "num_input_tokens_seen": 97335552, "step": 6049 }, { "epoch": 0.4237918866619384, "grad_norm": 6.201281547546387, "learning_rate": 5.766059544658493e-05, "loss": 0.9756, "num_input_tokens_seen": 97350720, "step": 6050 }, { "epoch": 0.42386193490766766, "grad_norm": 4.005791664123535, "learning_rate": 5.7653597197898425e-05, "loss": 0.9559, "num_input_tokens_seen": 97367104, "step": 6051 }, { "epoch": 0.4239319831533969, "grad_norm": 4.238742828369141, "learning_rate": 5.764659894921191e-05, "loss": 1.0348, "num_input_tokens_seen": 97383488, "step": 6052 }, { "epoch": 0.4240020313991261, "grad_norm": 4.139926433563232, "learning_rate": 5.7639600700525395e-05, "loss": 1.0571, "num_input_tokens_seen": 97398864, "step": 6053 }, { "epoch": 0.4240720796448554, "grad_norm": 3.538890838623047, "learning_rate": 5.763260245183888e-05, "loss": 0.9162, "num_input_tokens_seen": 97414416, "step": 6054 }, { "epoch": 0.42414212789058464, "grad_norm": 3.888108253479004, "learning_rate": 5.762560420315237e-05, "loss": 1.0937, "num_input_tokens_seen": 97429616, "step": 6055 }, { "epoch": 0.4242121761363139, "grad_norm": 4.287962436676025, "learning_rate": 5.7618605954465854e-05, "loss": 0.9786, "num_input_tokens_seen": 97444784, "step": 6056 }, { "epoch": 0.4242822243820431, "grad_norm": 3.5160460472106934, "learning_rate": 5.761160770577935e-05, "loss": 1.0405, "num_input_tokens_seen": 97461104, "step": 6057 }, { "epoch": 0.42435227262777236, "grad_norm": 4.076432704925537, "learning_rate": 5.760460945709283e-05, "loss": 1.1768, "num_input_tokens_seen": 97477488, "step": 6058 }, { "epoch": 0.4244223208735016, "grad_norm": 3.4506590366363525, "learning_rate": 5.759761120840631e-05, "loss": 0.9435, "num_input_tokens_seen": 97493872, "step": 6059 }, { "epoch": 0.4244923691192309, "grad_norm": 4.196661472320557, "learning_rate": 5.7590612959719795e-05, "loss": 1.0714, "num_input_tokens_seen": 97509088, "step": 6060 }, { "epoch": 0.42456241736496014, "grad_norm": 4.412662506103516, "learning_rate": 5.758361471103328e-05, "loss": 1.1809, "num_input_tokens_seen": 97525472, "step": 6061 }, { "epoch": 0.42463246561068935, "grad_norm": 3.4199881553649902, "learning_rate": 5.7576616462346765e-05, "loss": 1.0078, "num_input_tokens_seen": 97541856, "step": 6062 }, { "epoch": 0.4247025138564186, "grad_norm": 4.215256214141846, "learning_rate": 5.756961821366025e-05, "loss": 0.9772, "num_input_tokens_seen": 97558240, "step": 6063 }, { "epoch": 0.42477256210214787, "grad_norm": 4.764070510864258, "learning_rate": 5.756261996497374e-05, "loss": 1.1994, "num_input_tokens_seen": 97574624, "step": 6064 }, { "epoch": 0.4248426103478771, "grad_norm": 3.8896613121032715, "learning_rate": 5.7555621716287224e-05, "loss": 1.1135, "num_input_tokens_seen": 97591008, "step": 6065 }, { "epoch": 0.42491265859360633, "grad_norm": 5.101664066314697, "learning_rate": 5.7548623467600706e-05, "loss": 1.0647, "num_input_tokens_seen": 97607392, "step": 6066 }, { "epoch": 0.4249827068393356, "grad_norm": 4.464064121246338, "learning_rate": 5.754162521891419e-05, "loss": 1.0992, "num_input_tokens_seen": 97623776, "step": 6067 }, { "epoch": 0.42505275508506485, "grad_norm": 3.9882681369781494, "learning_rate": 5.7534626970227676e-05, "loss": 1.0344, "num_input_tokens_seen": 97639296, "step": 6068 }, { "epoch": 0.4251228033307941, "grad_norm": 5.5437331199646, "learning_rate": 5.752762872154116e-05, "loss": 1.0782, "num_input_tokens_seen": 97655592, "step": 6069 }, { "epoch": 0.4251928515765233, "grad_norm": 4.157887935638428, "learning_rate": 5.752063047285464e-05, "loss": 1.2531, "num_input_tokens_seen": 97671976, "step": 6070 }, { "epoch": 0.42526289982225257, "grad_norm": 4.455500602722168, "learning_rate": 5.751363222416812e-05, "loss": 1.0738, "num_input_tokens_seen": 97688360, "step": 6071 }, { "epoch": 0.42533294806798183, "grad_norm": 5.3056254386901855, "learning_rate": 5.750663397548162e-05, "loss": 1.2483, "num_input_tokens_seen": 97704008, "step": 6072 }, { "epoch": 0.4254029963137111, "grad_norm": 3.5183193683624268, "learning_rate": 5.74996357267951e-05, "loss": 0.9862, "num_input_tokens_seen": 97720392, "step": 6073 }, { "epoch": 0.4254730445594403, "grad_norm": 4.44768762588501, "learning_rate": 5.749263747810859e-05, "loss": 1.2951, "num_input_tokens_seen": 97736584, "step": 6074 }, { "epoch": 0.42554309280516955, "grad_norm": 3.6957905292510986, "learning_rate": 5.748563922942207e-05, "loss": 1.2134, "num_input_tokens_seen": 97752968, "step": 6075 }, { "epoch": 0.4256131410508988, "grad_norm": 3.6841094493865967, "learning_rate": 5.747864098073555e-05, "loss": 0.9744, "num_input_tokens_seen": 97769352, "step": 6076 }, { "epoch": 0.4256831892966281, "grad_norm": 6.541488170623779, "learning_rate": 5.747164273204903e-05, "loss": 1.1638, "num_input_tokens_seen": 97785736, "step": 6077 }, { "epoch": 0.4257532375423573, "grad_norm": 4.056735515594482, "learning_rate": 5.7464644483362515e-05, "loss": 0.9758, "num_input_tokens_seen": 97801624, "step": 6078 }, { "epoch": 0.42582328578808654, "grad_norm": 3.5294058322906494, "learning_rate": 5.745764623467601e-05, "loss": 0.9682, "num_input_tokens_seen": 97817544, "step": 6079 }, { "epoch": 0.4258933340338158, "grad_norm": 3.851330280303955, "learning_rate": 5.745064798598949e-05, "loss": 1.0858, "num_input_tokens_seen": 97833600, "step": 6080 }, { "epoch": 0.42596338227954506, "grad_norm": 3.6939046382904053, "learning_rate": 5.744364973730299e-05, "loss": 0.9469, "num_input_tokens_seen": 97849984, "step": 6081 }, { "epoch": 0.42603343052527426, "grad_norm": 3.7894139289855957, "learning_rate": 5.743665148861647e-05, "loss": 1.1953, "num_input_tokens_seen": 97866368, "step": 6082 }, { "epoch": 0.4261034787710035, "grad_norm": 3.377105712890625, "learning_rate": 5.742965323992995e-05, "loss": 1.0573, "num_input_tokens_seen": 97882752, "step": 6083 }, { "epoch": 0.4261735270167328, "grad_norm": 4.0349440574646, "learning_rate": 5.742265499124344e-05, "loss": 1.1328, "num_input_tokens_seen": 97899136, "step": 6084 }, { "epoch": 0.42624357526246204, "grad_norm": 3.9353208541870117, "learning_rate": 5.7415656742556935e-05, "loss": 0.9787, "num_input_tokens_seen": 97915520, "step": 6085 }, { "epoch": 0.42631362350819124, "grad_norm": 3.6593427658081055, "learning_rate": 5.740865849387042e-05, "loss": 1.0417, "num_input_tokens_seen": 97931904, "step": 6086 }, { "epoch": 0.4263836717539205, "grad_norm": 3.543994665145874, "learning_rate": 5.7401660245183885e-05, "loss": 0.9268, "num_input_tokens_seen": 97948288, "step": 6087 }, { "epoch": 0.42645371999964976, "grad_norm": 3.925420045852661, "learning_rate": 5.739466199649738e-05, "loss": 1.1635, "num_input_tokens_seen": 97964672, "step": 6088 }, { "epoch": 0.426523768245379, "grad_norm": 4.6036224365234375, "learning_rate": 5.738766374781086e-05, "loss": 1.1229, "num_input_tokens_seen": 97981056, "step": 6089 }, { "epoch": 0.4265938164911082, "grad_norm": 6.555153846740723, "learning_rate": 5.7380665499124344e-05, "loss": 1.1401, "num_input_tokens_seen": 97997440, "step": 6090 }, { "epoch": 0.4266638647368375, "grad_norm": 3.7414231300354004, "learning_rate": 5.737366725043783e-05, "loss": 1.0223, "num_input_tokens_seen": 98013264, "step": 6091 }, { "epoch": 0.42673391298256674, "grad_norm": 4.380615234375, "learning_rate": 5.7366669001751314e-05, "loss": 1.1524, "num_input_tokens_seen": 98029176, "step": 6092 }, { "epoch": 0.426803961228296, "grad_norm": 4.624136924743652, "learning_rate": 5.7359670753064796e-05, "loss": 1.1277, "num_input_tokens_seen": 98044384, "step": 6093 }, { "epoch": 0.42687400947402526, "grad_norm": 4.984564781188965, "learning_rate": 5.735267250437828e-05, "loss": 1.1115, "num_input_tokens_seen": 98060768, "step": 6094 }, { "epoch": 0.42694405771975447, "grad_norm": 5.481975078582764, "learning_rate": 5.734567425569178e-05, "loss": 1.167, "num_input_tokens_seen": 98077152, "step": 6095 }, { "epoch": 0.4270141059654837, "grad_norm": 3.3822808265686035, "learning_rate": 5.733867600700526e-05, "loss": 0.9442, "num_input_tokens_seen": 98093224, "step": 6096 }, { "epoch": 0.427084154211213, "grad_norm": 3.8090853691101074, "learning_rate": 5.733167775831874e-05, "loss": 1.0478, "num_input_tokens_seen": 98109608, "step": 6097 }, { "epoch": 0.42715420245694224, "grad_norm": 4.279370307922363, "learning_rate": 5.7324679509632226e-05, "loss": 0.918, "num_input_tokens_seen": 98125992, "step": 6098 }, { "epoch": 0.42722425070267145, "grad_norm": 5.998210430145264, "learning_rate": 5.731768126094571e-05, "loss": 0.9409, "num_input_tokens_seen": 98142376, "step": 6099 }, { "epoch": 0.4272942989484007, "grad_norm": 4.388184070587158, "learning_rate": 5.731068301225919e-05, "loss": 1.0364, "num_input_tokens_seen": 98158760, "step": 6100 }, { "epoch": 0.42736434719412997, "grad_norm": 4.937825679779053, "learning_rate": 5.7303684763572685e-05, "loss": 1.1185, "num_input_tokens_seen": 98175144, "step": 6101 }, { "epoch": 0.4274343954398592, "grad_norm": 3.800776720046997, "learning_rate": 5.729668651488618e-05, "loss": 1.1608, "num_input_tokens_seen": 98191184, "step": 6102 }, { "epoch": 0.42750444368558843, "grad_norm": 3.857093334197998, "learning_rate": 5.728968826619966e-05, "loss": 0.8588, "num_input_tokens_seen": 98207568, "step": 6103 }, { "epoch": 0.4275744919313177, "grad_norm": 3.562218189239502, "learning_rate": 5.728269001751313e-05, "loss": 1.1002, "num_input_tokens_seen": 98223952, "step": 6104 }, { "epoch": 0.42764454017704695, "grad_norm": 3.826802968978882, "learning_rate": 5.7275691768826626e-05, "loss": 0.7401, "num_input_tokens_seen": 98239576, "step": 6105 }, { "epoch": 0.4277145884227762, "grad_norm": 4.127960205078125, "learning_rate": 5.7268693520140114e-05, "loss": 1.0163, "num_input_tokens_seen": 98255960, "step": 6106 }, { "epoch": 0.4277846366685054, "grad_norm": 4.270632743835449, "learning_rate": 5.726169527145359e-05, "loss": 1.2359, "num_input_tokens_seen": 98272080, "step": 6107 }, { "epoch": 0.4278546849142347, "grad_norm": 4.543783187866211, "learning_rate": 5.725469702276708e-05, "loss": 1.1117, "num_input_tokens_seen": 98288464, "step": 6108 }, { "epoch": 0.42792473315996393, "grad_norm": 3.993234634399414, "learning_rate": 5.724769877408057e-05, "loss": 1.0059, "num_input_tokens_seen": 98304424, "step": 6109 }, { "epoch": 0.4279947814056932, "grad_norm": 4.11693000793457, "learning_rate": 5.7240700525394055e-05, "loss": 1.0718, "num_input_tokens_seen": 98320808, "step": 6110 }, { "epoch": 0.4280648296514224, "grad_norm": 4.000871658325195, "learning_rate": 5.723370227670754e-05, "loss": 0.9777, "num_input_tokens_seen": 98337192, "step": 6111 }, { "epoch": 0.42813487789715166, "grad_norm": 3.642763614654541, "learning_rate": 5.7226704028021025e-05, "loss": 0.9108, "num_input_tokens_seen": 98353320, "step": 6112 }, { "epoch": 0.4282049261428809, "grad_norm": 4.22330379486084, "learning_rate": 5.721970577933451e-05, "loss": 1.0968, "num_input_tokens_seen": 98369704, "step": 6113 }, { "epoch": 0.4282749743886102, "grad_norm": 3.7961175441741943, "learning_rate": 5.721270753064798e-05, "loss": 0.8756, "num_input_tokens_seen": 98385544, "step": 6114 }, { "epoch": 0.4283450226343394, "grad_norm": 3.771034002304077, "learning_rate": 5.720570928196147e-05, "loss": 1.1139, "num_input_tokens_seen": 98401928, "step": 6115 }, { "epoch": 0.42841507088006864, "grad_norm": 3.8084332942962646, "learning_rate": 5.719871103327495e-05, "loss": 1.1042, "num_input_tokens_seen": 98418136, "step": 6116 }, { "epoch": 0.4284851191257979, "grad_norm": 3.890608549118042, "learning_rate": 5.7191712784588434e-05, "loss": 0.9865, "num_input_tokens_seen": 98433656, "step": 6117 }, { "epoch": 0.42855516737152716, "grad_norm": 6.781351089477539, "learning_rate": 5.718471453590193e-05, "loss": 0.8032, "num_input_tokens_seen": 98448776, "step": 6118 }, { "epoch": 0.42862521561725636, "grad_norm": 3.941107749938965, "learning_rate": 5.717771628721542e-05, "loss": 1.104, "num_input_tokens_seen": 98465160, "step": 6119 }, { "epoch": 0.4286952638629856, "grad_norm": 4.457616329193115, "learning_rate": 5.71707180385289e-05, "loss": 1.1159, "num_input_tokens_seen": 98481184, "step": 6120 }, { "epoch": 0.4287653121087149, "grad_norm": 3.889111042022705, "learning_rate": 5.7163719789842375e-05, "loss": 1.0685, "num_input_tokens_seen": 98497568, "step": 6121 }, { "epoch": 0.42883536035444414, "grad_norm": 3.7574422359466553, "learning_rate": 5.715672154115588e-05, "loss": 0.9091, "num_input_tokens_seen": 98513920, "step": 6122 }, { "epoch": 0.42890540860017334, "grad_norm": 3.578437089920044, "learning_rate": 5.714972329246936e-05, "loss": 0.9449, "num_input_tokens_seen": 98529664, "step": 6123 }, { "epoch": 0.4289754568459026, "grad_norm": 5.0676398277282715, "learning_rate": 5.714272504378283e-05, "loss": 1.0768, "num_input_tokens_seen": 98544936, "step": 6124 }, { "epoch": 0.42904550509163186, "grad_norm": 4.475335121154785, "learning_rate": 5.713572679509632e-05, "loss": 0.9347, "num_input_tokens_seen": 98560520, "step": 6125 }, { "epoch": 0.4291155533373611, "grad_norm": 6.345788955688477, "learning_rate": 5.712872854640982e-05, "loss": 1.1897, "num_input_tokens_seen": 98576320, "step": 6126 }, { "epoch": 0.4291856015830903, "grad_norm": 3.775374174118042, "learning_rate": 5.71217302977233e-05, "loss": 0.9803, "num_input_tokens_seen": 98592704, "step": 6127 }, { "epoch": 0.4292556498288196, "grad_norm": 4.224292278289795, "learning_rate": 5.711473204903678e-05, "loss": 1.2253, "num_input_tokens_seen": 98607664, "step": 6128 }, { "epoch": 0.42932569807454884, "grad_norm": 4.470034122467041, "learning_rate": 5.710773380035027e-05, "loss": 0.9915, "num_input_tokens_seen": 98624048, "step": 6129 }, { "epoch": 0.4293957463202781, "grad_norm": 6.22687292098999, "learning_rate": 5.710073555166375e-05, "loss": 1.2048, "num_input_tokens_seen": 98640432, "step": 6130 }, { "epoch": 0.42946579456600736, "grad_norm": 3.9434430599212646, "learning_rate": 5.709373730297722e-05, "loss": 1.0306, "num_input_tokens_seen": 98656672, "step": 6131 }, { "epoch": 0.42953584281173657, "grad_norm": 3.7640228271484375, "learning_rate": 5.7086739054290716e-05, "loss": 1.061, "num_input_tokens_seen": 98673056, "step": 6132 }, { "epoch": 0.4296058910574658, "grad_norm": 5.742674827575684, "learning_rate": 5.707974080560421e-05, "loss": 1.0773, "num_input_tokens_seen": 98688400, "step": 6133 }, { "epoch": 0.4296759393031951, "grad_norm": 4.938521862030029, "learning_rate": 5.707274255691769e-05, "loss": 0.9877, "num_input_tokens_seen": 98703304, "step": 6134 }, { "epoch": 0.42974598754892435, "grad_norm": 3.7322773933410645, "learning_rate": 5.7065744308231175e-05, "loss": 0.9787, "num_input_tokens_seen": 98717536, "step": 6135 }, { "epoch": 0.42981603579465355, "grad_norm": 3.741265296936035, "learning_rate": 5.7058746059544663e-05, "loss": 1.1105, "num_input_tokens_seen": 98733632, "step": 6136 }, { "epoch": 0.4298860840403828, "grad_norm": 3.9021074771881104, "learning_rate": 5.7051747810858145e-05, "loss": 0.9721, "num_input_tokens_seen": 98749088, "step": 6137 }, { "epoch": 0.42995613228611207, "grad_norm": 4.327329635620117, "learning_rate": 5.704474956217163e-05, "loss": 1.0862, "num_input_tokens_seen": 98765328, "step": 6138 }, { "epoch": 0.43002618053184133, "grad_norm": 4.335643768310547, "learning_rate": 5.703775131348512e-05, "loss": 1.2386, "num_input_tokens_seen": 98780744, "step": 6139 }, { "epoch": 0.43009622877757053, "grad_norm": 4.66419792175293, "learning_rate": 5.7030753064798604e-05, "loss": 1.0696, "num_input_tokens_seen": 98797128, "step": 6140 }, { "epoch": 0.4301662770232998, "grad_norm": 4.208861351013184, "learning_rate": 5.702375481611207e-05, "loss": 0.9743, "num_input_tokens_seen": 98812776, "step": 6141 }, { "epoch": 0.43023632526902905, "grad_norm": 4.90700626373291, "learning_rate": 5.701675656742557e-05, "loss": 0.9744, "num_input_tokens_seen": 98829160, "step": 6142 }, { "epoch": 0.4303063735147583, "grad_norm": 3.942166805267334, "learning_rate": 5.7009758318739056e-05, "loss": 1.0032, "num_input_tokens_seen": 98845544, "step": 6143 }, { "epoch": 0.4303764217604875, "grad_norm": 4.919578552246094, "learning_rate": 5.700276007005254e-05, "loss": 1.0218, "num_input_tokens_seen": 98861928, "step": 6144 }, { "epoch": 0.4304464700062168, "grad_norm": 3.6429073810577393, "learning_rate": 5.699576182136602e-05, "loss": 1.1052, "num_input_tokens_seen": 98878288, "step": 6145 }, { "epoch": 0.43051651825194603, "grad_norm": 4.227152347564697, "learning_rate": 5.6988763572679515e-05, "loss": 1.059, "num_input_tokens_seen": 98893816, "step": 6146 }, { "epoch": 0.4305865664976753, "grad_norm": 4.016188144683838, "learning_rate": 5.6981765323993e-05, "loss": 0.9898, "num_input_tokens_seen": 98909968, "step": 6147 }, { "epoch": 0.4306566147434045, "grad_norm": 4.0402069091796875, "learning_rate": 5.697476707530648e-05, "loss": 1.1366, "num_input_tokens_seen": 98926352, "step": 6148 }, { "epoch": 0.43072666298913376, "grad_norm": 5.771969318389893, "learning_rate": 5.6967768826619974e-05, "loss": 0.9641, "num_input_tokens_seen": 98941512, "step": 6149 }, { "epoch": 0.430796711234863, "grad_norm": 4.444697856903076, "learning_rate": 5.6960770577933456e-05, "loss": 1.1114, "num_input_tokens_seen": 98957624, "step": 6150 }, { "epoch": 0.4308667594805923, "grad_norm": 3.386268377304077, "learning_rate": 5.695377232924694e-05, "loss": 0.9552, "num_input_tokens_seen": 98974008, "step": 6151 }, { "epoch": 0.4309368077263215, "grad_norm": 3.950138807296753, "learning_rate": 5.694677408056042e-05, "loss": 1.0048, "num_input_tokens_seen": 98990392, "step": 6152 }, { "epoch": 0.43100685597205074, "grad_norm": 3.7290585041046143, "learning_rate": 5.693977583187392e-05, "loss": 1.103, "num_input_tokens_seen": 99006776, "step": 6153 }, { "epoch": 0.43107690421778, "grad_norm": 3.3678364753723145, "learning_rate": 5.693277758318739e-05, "loss": 0.7396, "num_input_tokens_seen": 99022912, "step": 6154 }, { "epoch": 0.43114695246350926, "grad_norm": 5.882314682006836, "learning_rate": 5.6925779334500886e-05, "loss": 1.0949, "num_input_tokens_seen": 99038208, "step": 6155 }, { "epoch": 0.43121700070923846, "grad_norm": 4.231525421142578, "learning_rate": 5.691878108581437e-05, "loss": 1.0437, "num_input_tokens_seen": 99053496, "step": 6156 }, { "epoch": 0.4312870489549677, "grad_norm": 4.864506721496582, "learning_rate": 5.691178283712785e-05, "loss": 1.0978, "num_input_tokens_seen": 99069600, "step": 6157 }, { "epoch": 0.431357097200697, "grad_norm": 6.483276844024658, "learning_rate": 5.690478458844133e-05, "loss": 0.9262, "num_input_tokens_seen": 99085456, "step": 6158 }, { "epoch": 0.43142714544642624, "grad_norm": 3.830292224884033, "learning_rate": 5.689778633975482e-05, "loss": 1.1837, "num_input_tokens_seen": 99101840, "step": 6159 }, { "epoch": 0.43149719369215545, "grad_norm": 4.078514099121094, "learning_rate": 5.68907880910683e-05, "loss": 0.9916, "num_input_tokens_seen": 99118224, "step": 6160 }, { "epoch": 0.4315672419378847, "grad_norm": 4.1833648681640625, "learning_rate": 5.688378984238178e-05, "loss": 1.2243, "num_input_tokens_seen": 99134608, "step": 6161 }, { "epoch": 0.43163729018361396, "grad_norm": 4.761826515197754, "learning_rate": 5.6876791593695265e-05, "loss": 1.1017, "num_input_tokens_seen": 99150992, "step": 6162 }, { "epoch": 0.4317073384293432, "grad_norm": 4.992908954620361, "learning_rate": 5.686979334500877e-05, "loss": 0.9658, "num_input_tokens_seen": 99167320, "step": 6163 }, { "epoch": 0.4317773866750725, "grad_norm": 3.8283936977386475, "learning_rate": 5.686279509632224e-05, "loss": 1.0521, "num_input_tokens_seen": 99183512, "step": 6164 }, { "epoch": 0.4318474349208017, "grad_norm": 3.4508893489837646, "learning_rate": 5.685579684763573e-05, "loss": 0.9632, "num_input_tokens_seen": 99199896, "step": 6165 }, { "epoch": 0.43191748316653095, "grad_norm": 4.4798431396484375, "learning_rate": 5.684879859894921e-05, "loss": 0.8848, "num_input_tokens_seen": 99216280, "step": 6166 }, { "epoch": 0.4319875314122602, "grad_norm": 5.227555751800537, "learning_rate": 5.6841800350262694e-05, "loss": 0.9621, "num_input_tokens_seen": 99230656, "step": 6167 }, { "epoch": 0.43205757965798947, "grad_norm": 5.358756065368652, "learning_rate": 5.6834802101576176e-05, "loss": 1.0361, "num_input_tokens_seen": 99246864, "step": 6168 }, { "epoch": 0.43212762790371867, "grad_norm": 4.224287986755371, "learning_rate": 5.682780385288967e-05, "loss": 1.0804, "num_input_tokens_seen": 99263248, "step": 6169 }, { "epoch": 0.43219767614944793, "grad_norm": 5.748126983642578, "learning_rate": 5.682080560420317e-05, "loss": 1.0353, "num_input_tokens_seen": 99279632, "step": 6170 }, { "epoch": 0.4322677243951772, "grad_norm": 4.036735534667969, "learning_rate": 5.6813807355516635e-05, "loss": 1.0776, "num_input_tokens_seen": 99296016, "step": 6171 }, { "epoch": 0.43233777264090645, "grad_norm": 4.133121013641357, "learning_rate": 5.680680910683013e-05, "loss": 1.0796, "num_input_tokens_seen": 99312400, "step": 6172 }, { "epoch": 0.43240782088663565, "grad_norm": 4.70187520980835, "learning_rate": 5.679981085814362e-05, "loss": 1.1069, "num_input_tokens_seen": 99328504, "step": 6173 }, { "epoch": 0.4324778691323649, "grad_norm": 3.515967845916748, "learning_rate": 5.6792812609457094e-05, "loss": 1.0462, "num_input_tokens_seen": 99344424, "step": 6174 }, { "epoch": 0.43254791737809417, "grad_norm": 5.408679962158203, "learning_rate": 5.6785814360770576e-05, "loss": 1.0638, "num_input_tokens_seen": 99360056, "step": 6175 }, { "epoch": 0.43261796562382343, "grad_norm": 3.7438695430755615, "learning_rate": 5.677881611208408e-05, "loss": 1.0115, "num_input_tokens_seen": 99375920, "step": 6176 }, { "epoch": 0.43268801386955263, "grad_norm": 4.337923526763916, "learning_rate": 5.677181786339756e-05, "loss": 1.1368, "num_input_tokens_seen": 99392040, "step": 6177 }, { "epoch": 0.4327580621152819, "grad_norm": 6.04982852935791, "learning_rate": 5.676481961471103e-05, "loss": 1.1253, "num_input_tokens_seen": 99407592, "step": 6178 }, { "epoch": 0.43282811036101115, "grad_norm": 3.9268686771392822, "learning_rate": 5.6757821366024524e-05, "loss": 1.1957, "num_input_tokens_seen": 99423976, "step": 6179 }, { "epoch": 0.4328981586067404, "grad_norm": 4.466431140899658, "learning_rate": 5.675082311733801e-05, "loss": 0.8999, "num_input_tokens_seen": 99440360, "step": 6180 }, { "epoch": 0.4329682068524696, "grad_norm": 4.166913032531738, "learning_rate": 5.674382486865149e-05, "loss": 1.0157, "num_input_tokens_seen": 99456744, "step": 6181 }, { "epoch": 0.4330382550981989, "grad_norm": 3.525611400604248, "learning_rate": 5.6736826619964976e-05, "loss": 1.1182, "num_input_tokens_seen": 99473080, "step": 6182 }, { "epoch": 0.43310830334392814, "grad_norm": 6.099409103393555, "learning_rate": 5.672982837127847e-05, "loss": 0.9801, "num_input_tokens_seen": 99489088, "step": 6183 }, { "epoch": 0.4331783515896574, "grad_norm": 3.5886685848236084, "learning_rate": 5.672283012259194e-05, "loss": 1.0214, "num_input_tokens_seen": 99505248, "step": 6184 }, { "epoch": 0.4332483998353866, "grad_norm": 3.5279197692871094, "learning_rate": 5.671583187390542e-05, "loss": 0.9724, "num_input_tokens_seen": 99521632, "step": 6185 }, { "epoch": 0.43331844808111586, "grad_norm": 4.606603622436523, "learning_rate": 5.670883362521892e-05, "loss": 0.8623, "num_input_tokens_seen": 99537336, "step": 6186 }, { "epoch": 0.4333884963268451, "grad_norm": 3.5966908931732178, "learning_rate": 5.6701835376532405e-05, "loss": 0.9549, "num_input_tokens_seen": 99553720, "step": 6187 }, { "epoch": 0.4334585445725744, "grad_norm": 3.981893301010132, "learning_rate": 5.669483712784588e-05, "loss": 0.923, "num_input_tokens_seen": 99569384, "step": 6188 }, { "epoch": 0.4335285928183036, "grad_norm": 4.06168270111084, "learning_rate": 5.668783887915937e-05, "loss": 1.12, "num_input_tokens_seen": 99585768, "step": 6189 }, { "epoch": 0.43359864106403284, "grad_norm": 4.476738929748535, "learning_rate": 5.6680840630472864e-05, "loss": 1.0997, "num_input_tokens_seen": 99602152, "step": 6190 }, { "epoch": 0.4336686893097621, "grad_norm": 7.592894554138184, "learning_rate": 5.667384238178633e-05, "loss": 1.1001, "num_input_tokens_seen": 99618536, "step": 6191 }, { "epoch": 0.43373873755549136, "grad_norm": 3.4367337226867676, "learning_rate": 5.666684413309983e-05, "loss": 0.8883, "num_input_tokens_seen": 99634920, "step": 6192 }, { "epoch": 0.43380878580122056, "grad_norm": 3.8736166954040527, "learning_rate": 5.665984588441332e-05, "loss": 1.1062, "num_input_tokens_seen": 99651304, "step": 6193 }, { "epoch": 0.4338788340469498, "grad_norm": 4.0018463134765625, "learning_rate": 5.6652847635726805e-05, "loss": 1.15, "num_input_tokens_seen": 99667688, "step": 6194 }, { "epoch": 0.4339488822926791, "grad_norm": 4.243009090423584, "learning_rate": 5.664584938704029e-05, "loss": 1.0738, "num_input_tokens_seen": 99684072, "step": 6195 }, { "epoch": 0.43401893053840834, "grad_norm": 5.533624172210693, "learning_rate": 5.663885113835377e-05, "loss": 0.9901, "num_input_tokens_seen": 99698440, "step": 6196 }, { "epoch": 0.4340889787841376, "grad_norm": 3.9158618450164795, "learning_rate": 5.663185288966726e-05, "loss": 0.9884, "num_input_tokens_seen": 99714824, "step": 6197 }, { "epoch": 0.4341590270298668, "grad_norm": 6.666274070739746, "learning_rate": 5.662485464098073e-05, "loss": 1.3152, "num_input_tokens_seen": 99731208, "step": 6198 }, { "epoch": 0.43422907527559607, "grad_norm": 4.02492618560791, "learning_rate": 5.6617856392294235e-05, "loss": 1.014, "num_input_tokens_seen": 99747592, "step": 6199 }, { "epoch": 0.4342991235213253, "grad_norm": 3.4257941246032715, "learning_rate": 5.6610858143607716e-05, "loss": 0.9081, "num_input_tokens_seen": 99763656, "step": 6200 }, { "epoch": 0.4342991235213253, "eval_loss": 1.126607060432434, "eval_runtime": 0.1953, "eval_samples_per_second": 5.119, "eval_steps_per_second": 5.119, "num_input_tokens_seen": 99763656, "step": 6200 }, { "epoch": 0.4343691717670546, "grad_norm": 3.7283267974853516, "learning_rate": 5.66038598949212e-05, "loss": 0.8829, "num_input_tokens_seen": 99779416, "step": 6201 }, { "epoch": 0.4344392200127838, "grad_norm": 5.657198429107666, "learning_rate": 5.659686164623468e-05, "loss": 1.1111, "num_input_tokens_seen": 99795376, "step": 6202 }, { "epoch": 0.43450926825851305, "grad_norm": 4.102888107299805, "learning_rate": 5.658986339754817e-05, "loss": 1.1068, "num_input_tokens_seen": 99811760, "step": 6203 }, { "epoch": 0.4345793165042423, "grad_norm": 5.449219226837158, "learning_rate": 5.658286514886165e-05, "loss": 0.8439, "num_input_tokens_seen": 99827264, "step": 6204 }, { "epoch": 0.43464936474997157, "grad_norm": 4.1982197761535645, "learning_rate": 5.657586690017513e-05, "loss": 1.077, "num_input_tokens_seen": 99843648, "step": 6205 }, { "epoch": 0.43471941299570077, "grad_norm": 4.246870517730713, "learning_rate": 5.6568868651488614e-05, "loss": 1.0653, "num_input_tokens_seen": 99859800, "step": 6206 }, { "epoch": 0.43478946124143003, "grad_norm": 7.575351238250732, "learning_rate": 5.656187040280211e-05, "loss": 1.231, "num_input_tokens_seen": 99875240, "step": 6207 }, { "epoch": 0.4348595094871593, "grad_norm": 4.253138065338135, "learning_rate": 5.655487215411558e-05, "loss": 1.1178, "num_input_tokens_seen": 99891624, "step": 6208 }, { "epoch": 0.43492955773288855, "grad_norm": 3.5073490142822266, "learning_rate": 5.654787390542908e-05, "loss": 1.0251, "num_input_tokens_seen": 99908008, "step": 6209 }, { "epoch": 0.43499960597861775, "grad_norm": 3.669361114501953, "learning_rate": 5.654087565674256e-05, "loss": 1.0233, "num_input_tokens_seen": 99924392, "step": 6210 }, { "epoch": 0.435069654224347, "grad_norm": 4.25203800201416, "learning_rate": 5.6533877408056043e-05, "loss": 0.972, "num_input_tokens_seen": 99940776, "step": 6211 }, { "epoch": 0.4351397024700763, "grad_norm": 3.7570602893829346, "learning_rate": 5.6526879159369525e-05, "loss": 1.086, "num_input_tokens_seen": 99956488, "step": 6212 }, { "epoch": 0.43520975071580553, "grad_norm": 3.47245454788208, "learning_rate": 5.651988091068301e-05, "loss": 0.8526, "num_input_tokens_seen": 99972792, "step": 6213 }, { "epoch": 0.43527979896153474, "grad_norm": 4.902298927307129, "learning_rate": 5.65128826619965e-05, "loss": 1.1583, "num_input_tokens_seen": 99988144, "step": 6214 }, { "epoch": 0.435349847207264, "grad_norm": 3.796644926071167, "learning_rate": 5.6505884413309984e-05, "loss": 1.0739, "num_input_tokens_seen": 100004528, "step": 6215 }, { "epoch": 0.43541989545299326, "grad_norm": 4.4678425788879395, "learning_rate": 5.649888616462348e-05, "loss": 1.0843, "num_input_tokens_seen": 100020448, "step": 6216 }, { "epoch": 0.4354899436987225, "grad_norm": 5.181003570556641, "learning_rate": 5.649188791593696e-05, "loss": 0.9784, "num_input_tokens_seen": 100036832, "step": 6217 }, { "epoch": 0.4355599919444517, "grad_norm": 3.4864094257354736, "learning_rate": 5.648488966725044e-05, "loss": 1.1016, "num_input_tokens_seen": 100052656, "step": 6218 }, { "epoch": 0.435630040190181, "grad_norm": 3.6500463485717773, "learning_rate": 5.647789141856393e-05, "loss": 0.9406, "num_input_tokens_seen": 100069040, "step": 6219 }, { "epoch": 0.43570008843591024, "grad_norm": 4.463146686553955, "learning_rate": 5.647089316987743e-05, "loss": 1.0597, "num_input_tokens_seen": 100085248, "step": 6220 }, { "epoch": 0.4357701366816395, "grad_norm": 4.013953685760498, "learning_rate": 5.6463894921190895e-05, "loss": 0.9562, "num_input_tokens_seen": 100100600, "step": 6221 }, { "epoch": 0.4358401849273687, "grad_norm": 4.549919128417969, "learning_rate": 5.645689667250438e-05, "loss": 0.9911, "num_input_tokens_seen": 100116368, "step": 6222 }, { "epoch": 0.43591023317309796, "grad_norm": 4.227685451507568, "learning_rate": 5.644989842381787e-05, "loss": 1.0951, "num_input_tokens_seen": 100132752, "step": 6223 }, { "epoch": 0.4359802814188272, "grad_norm": 3.699406147003174, "learning_rate": 5.6442900175131354e-05, "loss": 0.9597, "num_input_tokens_seen": 100149048, "step": 6224 }, { "epoch": 0.4360503296645565, "grad_norm": 4.6700944900512695, "learning_rate": 5.6435901926444836e-05, "loss": 1.1303, "num_input_tokens_seen": 100165432, "step": 6225 }, { "epoch": 0.4361203779102857, "grad_norm": 3.524517774581909, "learning_rate": 5.6428903677758325e-05, "loss": 1.002, "num_input_tokens_seen": 100181672, "step": 6226 }, { "epoch": 0.43619042615601494, "grad_norm": 3.6368181705474854, "learning_rate": 5.642190542907181e-05, "loss": 0.9204, "num_input_tokens_seen": 100198056, "step": 6227 }, { "epoch": 0.4362604744017442, "grad_norm": 4.629672050476074, "learning_rate": 5.641490718038529e-05, "loss": 1.0703, "num_input_tokens_seen": 100214440, "step": 6228 }, { "epoch": 0.43633052264747346, "grad_norm": 4.120620250701904, "learning_rate": 5.640790893169877e-05, "loss": 0.9761, "num_input_tokens_seen": 100230824, "step": 6229 }, { "epoch": 0.43640057089320267, "grad_norm": 3.8496460914611816, "learning_rate": 5.640091068301227e-05, "loss": 1.0973, "num_input_tokens_seen": 100245920, "step": 6230 }, { "epoch": 0.4364706191389319, "grad_norm": 3.8419101238250732, "learning_rate": 5.639391243432575e-05, "loss": 1.1615, "num_input_tokens_seen": 100262184, "step": 6231 }, { "epoch": 0.4365406673846612, "grad_norm": 4.283138751983643, "learning_rate": 5.638691418563923e-05, "loss": 0.999, "num_input_tokens_seen": 100278568, "step": 6232 }, { "epoch": 0.43661071563039044, "grad_norm": 3.7390506267547607, "learning_rate": 5.637991593695272e-05, "loss": 0.9675, "num_input_tokens_seen": 100294952, "step": 6233 }, { "epoch": 0.4366807638761197, "grad_norm": 3.894780158996582, "learning_rate": 5.63729176882662e-05, "loss": 0.996, "num_input_tokens_seen": 100310536, "step": 6234 }, { "epoch": 0.4367508121218489, "grad_norm": 5.446288108825684, "learning_rate": 5.636591943957968e-05, "loss": 0.973, "num_input_tokens_seen": 100326336, "step": 6235 }, { "epoch": 0.43682086036757817, "grad_norm": 5.487906455993652, "learning_rate": 5.635892119089318e-05, "loss": 0.9008, "num_input_tokens_seen": 100342720, "step": 6236 }, { "epoch": 0.4368909086133074, "grad_norm": 4.296425819396973, "learning_rate": 5.635192294220667e-05, "loss": 1.0326, "num_input_tokens_seen": 100358912, "step": 6237 }, { "epoch": 0.4369609568590367, "grad_norm": 4.339141845703125, "learning_rate": 5.634492469352014e-05, "loss": 1.1232, "num_input_tokens_seen": 100374952, "step": 6238 }, { "epoch": 0.4370310051047659, "grad_norm": 4.520789623260498, "learning_rate": 5.633792644483362e-05, "loss": 1.1701, "num_input_tokens_seen": 100390712, "step": 6239 }, { "epoch": 0.43710105335049515, "grad_norm": 3.7790653705596924, "learning_rate": 5.633092819614712e-05, "loss": 0.9953, "num_input_tokens_seen": 100406936, "step": 6240 }, { "epoch": 0.4371711015962244, "grad_norm": 3.7649457454681396, "learning_rate": 5.63239299474606e-05, "loss": 1.0837, "num_input_tokens_seen": 100423320, "step": 6241 }, { "epoch": 0.43724114984195367, "grad_norm": 5.27927827835083, "learning_rate": 5.631693169877408e-05, "loss": 1.1157, "num_input_tokens_seen": 100439704, "step": 6242 }, { "epoch": 0.4373111980876829, "grad_norm": 4.266254901885986, "learning_rate": 5.630993345008757e-05, "loss": 1.1607, "num_input_tokens_seen": 100455008, "step": 6243 }, { "epoch": 0.43738124633341213, "grad_norm": 4.195004940032959, "learning_rate": 5.6302935201401065e-05, "loss": 1.0163, "num_input_tokens_seen": 100471392, "step": 6244 }, { "epoch": 0.4374512945791414, "grad_norm": 4.85727596282959, "learning_rate": 5.629593695271455e-05, "loss": 1.0097, "num_input_tokens_seen": 100486832, "step": 6245 }, { "epoch": 0.43752134282487065, "grad_norm": 3.865466594696045, "learning_rate": 5.628893870402803e-05, "loss": 1.1528, "num_input_tokens_seen": 100502496, "step": 6246 }, { "epoch": 0.43759139107059986, "grad_norm": 4.107895851135254, "learning_rate": 5.628194045534152e-05, "loss": 1.0452, "num_input_tokens_seen": 100518816, "step": 6247 }, { "epoch": 0.4376614393163291, "grad_norm": 5.402096271514893, "learning_rate": 5.627494220665499e-05, "loss": 1.0368, "num_input_tokens_seen": 100535200, "step": 6248 }, { "epoch": 0.4377314875620584, "grad_norm": 4.255467414855957, "learning_rate": 5.6267943957968474e-05, "loss": 1.268, "num_input_tokens_seen": 100551456, "step": 6249 }, { "epoch": 0.43780153580778763, "grad_norm": 3.4338836669921875, "learning_rate": 5.626094570928196e-05, "loss": 1.0149, "num_input_tokens_seen": 100567840, "step": 6250 }, { "epoch": 0.43787158405351684, "grad_norm": 4.445374488830566, "learning_rate": 5.6253947460595445e-05, "loss": 1.2527, "num_input_tokens_seen": 100583848, "step": 6251 }, { "epoch": 0.4379416322992461, "grad_norm": 4.0756072998046875, "learning_rate": 5.6246949211908927e-05, "loss": 0.8848, "num_input_tokens_seen": 100600000, "step": 6252 }, { "epoch": 0.43801168054497536, "grad_norm": 3.910945177078247, "learning_rate": 5.623995096322242e-05, "loss": 0.9661, "num_input_tokens_seen": 100615424, "step": 6253 }, { "epoch": 0.4380817287907046, "grad_norm": 3.878586769104004, "learning_rate": 5.623295271453591e-05, "loss": 1.0631, "num_input_tokens_seen": 100631808, "step": 6254 }, { "epoch": 0.4381517770364338, "grad_norm": 4.295658111572266, "learning_rate": 5.622595446584939e-05, "loss": 1.2368, "num_input_tokens_seen": 100647344, "step": 6255 }, { "epoch": 0.4382218252821631, "grad_norm": 3.88688063621521, "learning_rate": 5.621895621716287e-05, "loss": 1.0371, "num_input_tokens_seen": 100663464, "step": 6256 }, { "epoch": 0.43829187352789234, "grad_norm": 3.6060731410980225, "learning_rate": 5.621195796847637e-05, "loss": 1.1189, "num_input_tokens_seen": 100679848, "step": 6257 }, { "epoch": 0.4383619217736216, "grad_norm": 4.274289608001709, "learning_rate": 5.620495971978984e-05, "loss": 0.9809, "num_input_tokens_seen": 100695760, "step": 6258 }, { "epoch": 0.4384319700193508, "grad_norm": 4.854022979736328, "learning_rate": 5.619796147110332e-05, "loss": 0.8043, "num_input_tokens_seen": 100711144, "step": 6259 }, { "epoch": 0.43850201826508006, "grad_norm": 3.9589812755584717, "learning_rate": 5.6190963222416815e-05, "loss": 1.202, "num_input_tokens_seen": 100727088, "step": 6260 }, { "epoch": 0.4385720665108093, "grad_norm": 5.07575798034668, "learning_rate": 5.618396497373031e-05, "loss": 1.0708, "num_input_tokens_seen": 100743232, "step": 6261 }, { "epoch": 0.4386421147565386, "grad_norm": 3.557736396789551, "learning_rate": 5.617696672504379e-05, "loss": 1.085, "num_input_tokens_seen": 100759616, "step": 6262 }, { "epoch": 0.4387121630022678, "grad_norm": 4.200889587402344, "learning_rate": 5.6169968476357274e-05, "loss": 0.9893, "num_input_tokens_seen": 100775176, "step": 6263 }, { "epoch": 0.43878221124799704, "grad_norm": 4.214064121246338, "learning_rate": 5.616297022767076e-05, "loss": 1.2024, "num_input_tokens_seen": 100791232, "step": 6264 }, { "epoch": 0.4388522594937263, "grad_norm": 5.372243881225586, "learning_rate": 5.6155971978984244e-05, "loss": 1.0551, "num_input_tokens_seen": 100807496, "step": 6265 }, { "epoch": 0.43892230773945556, "grad_norm": 4.754215717315674, "learning_rate": 5.614897373029771e-05, "loss": 1.1895, "num_input_tokens_seen": 100823880, "step": 6266 }, { "epoch": 0.4389923559851848, "grad_norm": 3.3892760276794434, "learning_rate": 5.614197548161121e-05, "loss": 1.0086, "num_input_tokens_seen": 100839928, "step": 6267 }, { "epoch": 0.439062404230914, "grad_norm": 4.554326057434082, "learning_rate": 5.61349772329247e-05, "loss": 0.993, "num_input_tokens_seen": 100856312, "step": 6268 }, { "epoch": 0.4391324524766433, "grad_norm": 4.118383407592773, "learning_rate": 5.6127978984238185e-05, "loss": 0.7906, "num_input_tokens_seen": 100872696, "step": 6269 }, { "epoch": 0.43920250072237255, "grad_norm": 4.403461456298828, "learning_rate": 5.612098073555167e-05, "loss": 1.1391, "num_input_tokens_seen": 100888808, "step": 6270 }, { "epoch": 0.4392725489681018, "grad_norm": 3.841547966003418, "learning_rate": 5.6113982486865156e-05, "loss": 1.0572, "num_input_tokens_seen": 100905192, "step": 6271 }, { "epoch": 0.439342597213831, "grad_norm": 4.147423267364502, "learning_rate": 5.610698423817864e-05, "loss": 1.1632, "num_input_tokens_seen": 100920552, "step": 6272 }, { "epoch": 0.43941264545956027, "grad_norm": 4.717578887939453, "learning_rate": 5.609998598949212e-05, "loss": 1.1097, "num_input_tokens_seen": 100936656, "step": 6273 }, { "epoch": 0.43948269370528953, "grad_norm": 5.503146171569824, "learning_rate": 5.6092987740805615e-05, "loss": 1.0768, "num_input_tokens_seen": 100952952, "step": 6274 }, { "epoch": 0.4395527419510188, "grad_norm": 3.6871585845947266, "learning_rate": 5.6085989492119096e-05, "loss": 0.9386, "num_input_tokens_seen": 100969152, "step": 6275 }, { "epoch": 0.439622790196748, "grad_norm": 3.751429796218872, "learning_rate": 5.6078991243432565e-05, "loss": 1.0478, "num_input_tokens_seen": 100985520, "step": 6276 }, { "epoch": 0.43969283844247725, "grad_norm": 4.053867340087891, "learning_rate": 5.607199299474606e-05, "loss": 1.1341, "num_input_tokens_seen": 101001336, "step": 6277 }, { "epoch": 0.4397628866882065, "grad_norm": 3.786154270172119, "learning_rate": 5.606499474605955e-05, "loss": 1.077, "num_input_tokens_seen": 101017240, "step": 6278 }, { "epoch": 0.43983293493393577, "grad_norm": 3.516772747039795, "learning_rate": 5.605799649737303e-05, "loss": 0.9854, "num_input_tokens_seen": 101033264, "step": 6279 }, { "epoch": 0.439902983179665, "grad_norm": 4.568872928619385, "learning_rate": 5.605099824868651e-05, "loss": 1.1444, "num_input_tokens_seen": 101049648, "step": 6280 }, { "epoch": 0.43997303142539423, "grad_norm": 4.430622577667236, "learning_rate": 5.604400000000001e-05, "loss": 1.0675, "num_input_tokens_seen": 101066032, "step": 6281 }, { "epoch": 0.4400430796711235, "grad_norm": 5.061071872711182, "learning_rate": 5.603700175131349e-05, "loss": 1.0891, "num_input_tokens_seen": 101082416, "step": 6282 }, { "epoch": 0.44011312791685275, "grad_norm": 3.696657180786133, "learning_rate": 5.603000350262696e-05, "loss": 0.9781, "num_input_tokens_seen": 101098800, "step": 6283 }, { "epoch": 0.44018317616258196, "grad_norm": 4.1430840492248535, "learning_rate": 5.6023005253940467e-05, "loss": 1.1299, "num_input_tokens_seen": 101114672, "step": 6284 }, { "epoch": 0.4402532244083112, "grad_norm": 4.862906455993652, "learning_rate": 5.601600700525395e-05, "loss": 0.8972, "num_input_tokens_seen": 101130632, "step": 6285 }, { "epoch": 0.4403232726540405, "grad_norm": 4.017249584197998, "learning_rate": 5.600900875656743e-05, "loss": 0.9911, "num_input_tokens_seen": 101145920, "step": 6286 }, { "epoch": 0.44039332089976974, "grad_norm": 4.797904014587402, "learning_rate": 5.600201050788091e-05, "loss": 1.106, "num_input_tokens_seen": 101162296, "step": 6287 }, { "epoch": 0.44046336914549894, "grad_norm": 3.685084342956543, "learning_rate": 5.59950122591944e-05, "loss": 1.0434, "num_input_tokens_seen": 101178048, "step": 6288 }, { "epoch": 0.4405334173912282, "grad_norm": 4.259701728820801, "learning_rate": 5.598801401050788e-05, "loss": 1.0203, "num_input_tokens_seen": 101194080, "step": 6289 }, { "epoch": 0.44060346563695746, "grad_norm": 3.961292266845703, "learning_rate": 5.598101576182138e-05, "loss": 0.9682, "num_input_tokens_seen": 101209000, "step": 6290 }, { "epoch": 0.4406735138826867, "grad_norm": 3.863640308380127, "learning_rate": 5.597401751313486e-05, "loss": 1.0203, "num_input_tokens_seen": 101224184, "step": 6291 }, { "epoch": 0.4407435621284159, "grad_norm": 6.002960681915283, "learning_rate": 5.596701926444834e-05, "loss": 1.3855, "num_input_tokens_seen": 101240568, "step": 6292 }, { "epoch": 0.4408136103741452, "grad_norm": 3.870892286300659, "learning_rate": 5.596002101576182e-05, "loss": 1.0754, "num_input_tokens_seen": 101256952, "step": 6293 }, { "epoch": 0.44088365861987444, "grad_norm": 3.654907703399658, "learning_rate": 5.5953022767075305e-05, "loss": 1.1244, "num_input_tokens_seen": 101273192, "step": 6294 }, { "epoch": 0.4409537068656037, "grad_norm": 3.2243661880493164, "learning_rate": 5.5946024518388794e-05, "loss": 0.9084, "num_input_tokens_seen": 101289488, "step": 6295 }, { "epoch": 0.4410237551113329, "grad_norm": 3.947880983352661, "learning_rate": 5.5939026269702275e-05, "loss": 1.1628, "num_input_tokens_seen": 101305064, "step": 6296 }, { "epoch": 0.44109380335706216, "grad_norm": 3.546065092086792, "learning_rate": 5.593202802101576e-05, "loss": 0.9669, "num_input_tokens_seen": 101321448, "step": 6297 }, { "epoch": 0.4411638516027914, "grad_norm": 4.489794731140137, "learning_rate": 5.592502977232925e-05, "loss": 1.1958, "num_input_tokens_seen": 101337832, "step": 6298 }, { "epoch": 0.4412338998485207, "grad_norm": 3.9517438411712646, "learning_rate": 5.5918031523642734e-05, "loss": 1.1256, "num_input_tokens_seen": 101354216, "step": 6299 }, { "epoch": 0.4413039480942499, "grad_norm": 4.599244594573975, "learning_rate": 5.591103327495622e-05, "loss": 0.9712, "num_input_tokens_seen": 101370088, "step": 6300 }, { "epoch": 0.44137399633997915, "grad_norm": 3.753528356552124, "learning_rate": 5.5904035026269705e-05, "loss": 0.9998, "num_input_tokens_seen": 101385432, "step": 6301 }, { "epoch": 0.4414440445857084, "grad_norm": 4.569333553314209, "learning_rate": 5.589703677758319e-05, "loss": 1.1676, "num_input_tokens_seen": 101401816, "step": 6302 }, { "epoch": 0.44151409283143767, "grad_norm": 4.010447978973389, "learning_rate": 5.589003852889667e-05, "loss": 1.0655, "num_input_tokens_seen": 101417272, "step": 6303 }, { "epoch": 0.4415841410771669, "grad_norm": 5.169422626495361, "learning_rate": 5.588304028021015e-05, "loss": 1.3555, "num_input_tokens_seen": 101433656, "step": 6304 }, { "epoch": 0.44165418932289613, "grad_norm": 4.6301069259643555, "learning_rate": 5.587604203152366e-05, "loss": 0.929, "num_input_tokens_seen": 101450040, "step": 6305 }, { "epoch": 0.4417242375686254, "grad_norm": 4.814012050628662, "learning_rate": 5.586904378283713e-05, "loss": 1.102, "num_input_tokens_seen": 101466320, "step": 6306 }, { "epoch": 0.44179428581435465, "grad_norm": 4.340104579925537, "learning_rate": 5.586204553415062e-05, "loss": 1.1179, "num_input_tokens_seen": 101482592, "step": 6307 }, { "epoch": 0.4418643340600839, "grad_norm": 3.807495355606079, "learning_rate": 5.5855047285464105e-05, "loss": 1.1407, "num_input_tokens_seen": 101498920, "step": 6308 }, { "epoch": 0.4419343823058131, "grad_norm": 3.97273325920105, "learning_rate": 5.5848049036777586e-05, "loss": 1.1377, "num_input_tokens_seen": 101515304, "step": 6309 }, { "epoch": 0.44200443055154237, "grad_norm": 6.926362037658691, "learning_rate": 5.584105078809107e-05, "loss": 0.9045, "num_input_tokens_seen": 101531688, "step": 6310 }, { "epoch": 0.44207447879727163, "grad_norm": 4.482272624969482, "learning_rate": 5.583405253940457e-05, "loss": 1.1431, "num_input_tokens_seen": 101547912, "step": 6311 }, { "epoch": 0.4421445270430009, "grad_norm": 3.726999044418335, "learning_rate": 5.582705429071805e-05, "loss": 1.0609, "num_input_tokens_seen": 101563640, "step": 6312 }, { "epoch": 0.4422145752887301, "grad_norm": 4.305807113647461, "learning_rate": 5.582005604203152e-05, "loss": 1.0612, "num_input_tokens_seen": 101580024, "step": 6313 }, { "epoch": 0.44228462353445935, "grad_norm": 5.402091979980469, "learning_rate": 5.5813057793345016e-05, "loss": 0.9018, "num_input_tokens_seen": 101596408, "step": 6314 }, { "epoch": 0.4423546717801886, "grad_norm": 3.658170700073242, "learning_rate": 5.5806059544658504e-05, "loss": 1.1726, "num_input_tokens_seen": 101612792, "step": 6315 }, { "epoch": 0.4424247200259179, "grad_norm": 3.91109561920166, "learning_rate": 5.579906129597198e-05, "loss": 0.9991, "num_input_tokens_seen": 101628408, "step": 6316 }, { "epoch": 0.4424947682716471, "grad_norm": 3.9523725509643555, "learning_rate": 5.579206304728547e-05, "loss": 1.1404, "num_input_tokens_seen": 101644616, "step": 6317 }, { "epoch": 0.44256481651737634, "grad_norm": 4.591569423675537, "learning_rate": 5.578506479859895e-05, "loss": 0.9778, "num_input_tokens_seen": 101660536, "step": 6318 }, { "epoch": 0.4426348647631056, "grad_norm": 3.7487003803253174, "learning_rate": 5.577806654991243e-05, "loss": 1.1305, "num_input_tokens_seen": 101676920, "step": 6319 }, { "epoch": 0.44270491300883485, "grad_norm": 4.111825942993164, "learning_rate": 5.5771068301225913e-05, "loss": 1.177, "num_input_tokens_seen": 101692904, "step": 6320 }, { "epoch": 0.44277496125456406, "grad_norm": 3.7022197246551514, "learning_rate": 5.576407005253941e-05, "loss": 1.0351, "num_input_tokens_seen": 101709288, "step": 6321 }, { "epoch": 0.4428450095002933, "grad_norm": 5.004938125610352, "learning_rate": 5.57570718038529e-05, "loss": 1.1042, "num_input_tokens_seen": 101725176, "step": 6322 }, { "epoch": 0.4429150577460226, "grad_norm": 3.728410005569458, "learning_rate": 5.575007355516637e-05, "loss": 0.9879, "num_input_tokens_seen": 101741160, "step": 6323 }, { "epoch": 0.44298510599175184, "grad_norm": 4.526604175567627, "learning_rate": 5.574307530647986e-05, "loss": 1.1465, "num_input_tokens_seen": 101756848, "step": 6324 }, { "epoch": 0.44305515423748104, "grad_norm": 3.4281585216522217, "learning_rate": 5.5736077057793356e-05, "loss": 1.0865, "num_input_tokens_seen": 101773232, "step": 6325 }, { "epoch": 0.4431252024832103, "grad_norm": 5.678319931030273, "learning_rate": 5.5729078809106825e-05, "loss": 1.0443, "num_input_tokens_seen": 101789064, "step": 6326 }, { "epoch": 0.44319525072893956, "grad_norm": 4.231290817260742, "learning_rate": 5.572208056042032e-05, "loss": 1.0336, "num_input_tokens_seen": 101805360, "step": 6327 }, { "epoch": 0.4432652989746688, "grad_norm": 3.9336435794830322, "learning_rate": 5.5715082311733815e-05, "loss": 1.0592, "num_input_tokens_seen": 101821744, "step": 6328 }, { "epoch": 0.443335347220398, "grad_norm": 3.6775193214416504, "learning_rate": 5.57080840630473e-05, "loss": 1.0551, "num_input_tokens_seen": 101838128, "step": 6329 }, { "epoch": 0.4434053954661273, "grad_norm": 3.7788445949554443, "learning_rate": 5.5701085814360766e-05, "loss": 0.9174, "num_input_tokens_seen": 101854216, "step": 6330 }, { "epoch": 0.44347544371185654, "grad_norm": 5.49542236328125, "learning_rate": 5.569408756567426e-05, "loss": 1.0043, "num_input_tokens_seen": 101870600, "step": 6331 }, { "epoch": 0.4435454919575858, "grad_norm": 4.486842155456543, "learning_rate": 5.568708931698775e-05, "loss": 1.2571, "num_input_tokens_seen": 101886984, "step": 6332 }, { "epoch": 0.443615540203315, "grad_norm": 4.949841499328613, "learning_rate": 5.5680091068301225e-05, "loss": 0.9723, "num_input_tokens_seen": 101903368, "step": 6333 }, { "epoch": 0.44368558844904427, "grad_norm": 3.6375255584716797, "learning_rate": 5.567309281961471e-05, "loss": 1.0938, "num_input_tokens_seen": 101919568, "step": 6334 }, { "epoch": 0.4437556366947735, "grad_norm": 4.649466037750244, "learning_rate": 5.566609457092821e-05, "loss": 1.0182, "num_input_tokens_seen": 101935952, "step": 6335 }, { "epoch": 0.4438256849405028, "grad_norm": 3.971482276916504, "learning_rate": 5.565909632224169e-05, "loss": 1.2222, "num_input_tokens_seen": 101952336, "step": 6336 }, { "epoch": 0.44389573318623204, "grad_norm": 4.605628967285156, "learning_rate": 5.565209807355517e-05, "loss": 1.1634, "num_input_tokens_seen": 101967752, "step": 6337 }, { "epoch": 0.44396578143196125, "grad_norm": 6.878963947296143, "learning_rate": 5.564509982486866e-05, "loss": 0.831, "num_input_tokens_seen": 101982648, "step": 6338 }, { "epoch": 0.4440358296776905, "grad_norm": 4.339694976806641, "learning_rate": 5.563810157618214e-05, "loss": 1.206, "num_input_tokens_seen": 101998912, "step": 6339 }, { "epoch": 0.44410587792341977, "grad_norm": 3.5509302616119385, "learning_rate": 5.563110332749562e-05, "loss": 0.969, "num_input_tokens_seen": 102015296, "step": 6340 }, { "epoch": 0.444175926169149, "grad_norm": 6.927268981933594, "learning_rate": 5.5624105078809106e-05, "loss": 1.0158, "num_input_tokens_seen": 102031136, "step": 6341 }, { "epoch": 0.44424597441487823, "grad_norm": 4.596194267272949, "learning_rate": 5.56171068301226e-05, "loss": 1.0602, "num_input_tokens_seen": 102047016, "step": 6342 }, { "epoch": 0.4443160226606075, "grad_norm": 3.8641550540924072, "learning_rate": 5.561010858143607e-05, "loss": 1.007, "num_input_tokens_seen": 102062344, "step": 6343 }, { "epoch": 0.44438607090633675, "grad_norm": 5.471240997314453, "learning_rate": 5.5603110332749565e-05, "loss": 0.9372, "num_input_tokens_seen": 102078392, "step": 6344 }, { "epoch": 0.444456119152066, "grad_norm": 3.971010208129883, "learning_rate": 5.5596112084063054e-05, "loss": 1.1335, "num_input_tokens_seen": 102092872, "step": 6345 }, { "epoch": 0.4445261673977952, "grad_norm": 4.037472724914551, "learning_rate": 5.5589113835376536e-05, "loss": 0.959, "num_input_tokens_seen": 102109256, "step": 6346 }, { "epoch": 0.4445962156435245, "grad_norm": 3.834984302520752, "learning_rate": 5.558211558669002e-05, "loss": 1.129, "num_input_tokens_seen": 102125424, "step": 6347 }, { "epoch": 0.44466626388925373, "grad_norm": 5.131717205047607, "learning_rate": 5.55751173380035e-05, "loss": 1.0966, "num_input_tokens_seen": 102141808, "step": 6348 }, { "epoch": 0.444736312134983, "grad_norm": 4.5308837890625, "learning_rate": 5.5568119089316995e-05, "loss": 1.1752, "num_input_tokens_seen": 102158192, "step": 6349 }, { "epoch": 0.4448063603807122, "grad_norm": 5.088570594787598, "learning_rate": 5.556112084063046e-05, "loss": 1.0215, "num_input_tokens_seen": 102174576, "step": 6350 }, { "epoch": 0.44487640862644146, "grad_norm": 3.588543176651001, "learning_rate": 5.555412259194397e-05, "loss": 1.0452, "num_input_tokens_seen": 102190928, "step": 6351 }, { "epoch": 0.4449464568721707, "grad_norm": 3.5698747634887695, "learning_rate": 5.5547124343257454e-05, "loss": 1.0472, "num_input_tokens_seen": 102207072, "step": 6352 }, { "epoch": 0.4450165051179, "grad_norm": 3.4842190742492676, "learning_rate": 5.5540126094570935e-05, "loss": 0.9213, "num_input_tokens_seen": 102222664, "step": 6353 }, { "epoch": 0.4450865533636292, "grad_norm": 3.742471218109131, "learning_rate": 5.553312784588442e-05, "loss": 1.1803, "num_input_tokens_seen": 102239048, "step": 6354 }, { "epoch": 0.44515660160935844, "grad_norm": 4.108808994293213, "learning_rate": 5.552612959719792e-05, "loss": 1.2055, "num_input_tokens_seen": 102255432, "step": 6355 }, { "epoch": 0.4452266498550877, "grad_norm": 5.498636722564697, "learning_rate": 5.551913134851139e-05, "loss": 1.0159, "num_input_tokens_seen": 102270760, "step": 6356 }, { "epoch": 0.44529669810081696, "grad_norm": 3.639392137527466, "learning_rate": 5.551213309982487e-05, "loss": 0.9927, "num_input_tokens_seen": 102285560, "step": 6357 }, { "epoch": 0.44536674634654616, "grad_norm": 4.534916400909424, "learning_rate": 5.5505134851138365e-05, "loss": 1.0912, "num_input_tokens_seen": 102301456, "step": 6358 }, { "epoch": 0.4454367945922754, "grad_norm": 3.961845636367798, "learning_rate": 5.5498136602451847e-05, "loss": 0.8966, "num_input_tokens_seen": 102317840, "step": 6359 }, { "epoch": 0.4455068428380047, "grad_norm": 3.712111473083496, "learning_rate": 5.549113835376533e-05, "loss": 1.0297, "num_input_tokens_seen": 102334104, "step": 6360 }, { "epoch": 0.44557689108373394, "grad_norm": 4.441688537597656, "learning_rate": 5.548414010507882e-05, "loss": 1.0116, "num_input_tokens_seen": 102350488, "step": 6361 }, { "epoch": 0.44564693932946314, "grad_norm": 5.881339073181152, "learning_rate": 5.54771418563923e-05, "loss": 0.8988, "num_input_tokens_seen": 102366872, "step": 6362 }, { "epoch": 0.4457169875751924, "grad_norm": 3.9736666679382324, "learning_rate": 5.547014360770578e-05, "loss": 1.1402, "num_input_tokens_seen": 102383256, "step": 6363 }, { "epoch": 0.44578703582092166, "grad_norm": 4.064074516296387, "learning_rate": 5.546314535901926e-05, "loss": 1.0196, "num_input_tokens_seen": 102399024, "step": 6364 }, { "epoch": 0.4458570840666509, "grad_norm": 4.238128662109375, "learning_rate": 5.5456147110332765e-05, "loss": 0.8919, "num_input_tokens_seen": 102415408, "step": 6365 }, { "epoch": 0.4459271323123801, "grad_norm": 3.8058905601501465, "learning_rate": 5.544914886164624e-05, "loss": 1.0117, "num_input_tokens_seen": 102430616, "step": 6366 }, { "epoch": 0.4459971805581094, "grad_norm": 5.049830436706543, "learning_rate": 5.544215061295972e-05, "loss": 1.0986, "num_input_tokens_seen": 102446448, "step": 6367 }, { "epoch": 0.44606722880383864, "grad_norm": 4.0549116134643555, "learning_rate": 5.543515236427321e-05, "loss": 1.1462, "num_input_tokens_seen": 102462832, "step": 6368 }, { "epoch": 0.4461372770495679, "grad_norm": 4.005105495452881, "learning_rate": 5.542815411558669e-05, "loss": 0.8846, "num_input_tokens_seen": 102478688, "step": 6369 }, { "epoch": 0.44620732529529716, "grad_norm": 4.298024654388428, "learning_rate": 5.5421155866900174e-05, "loss": 1.0249, "num_input_tokens_seen": 102495072, "step": 6370 }, { "epoch": 0.44627737354102637, "grad_norm": 4.816470623016357, "learning_rate": 5.541415761821367e-05, "loss": 1.0492, "num_input_tokens_seen": 102511456, "step": 6371 }, { "epoch": 0.4463474217867556, "grad_norm": 3.89819598197937, "learning_rate": 5.5407159369527164e-05, "loss": 0.9172, "num_input_tokens_seen": 102527840, "step": 6372 }, { "epoch": 0.4464174700324849, "grad_norm": 3.883650541305542, "learning_rate": 5.540016112084063e-05, "loss": 0.9475, "num_input_tokens_seen": 102543008, "step": 6373 }, { "epoch": 0.44648751827821415, "grad_norm": 3.8635551929473877, "learning_rate": 5.5393162872154114e-05, "loss": 1.2026, "num_input_tokens_seen": 102558600, "step": 6374 }, { "epoch": 0.44655756652394335, "grad_norm": 4.465150356292725, "learning_rate": 5.538616462346761e-05, "loss": 1.0031, "num_input_tokens_seen": 102574984, "step": 6375 }, { "epoch": 0.4466276147696726, "grad_norm": 3.8807246685028076, "learning_rate": 5.537916637478109e-05, "loss": 1.2355, "num_input_tokens_seen": 102591368, "step": 6376 }, { "epoch": 0.44669766301540187, "grad_norm": 4.526896953582764, "learning_rate": 5.5372168126094573e-05, "loss": 1.4136, "num_input_tokens_seen": 102607520, "step": 6377 }, { "epoch": 0.44676771126113113, "grad_norm": 4.905179023742676, "learning_rate": 5.536516987740806e-05, "loss": 1.1221, "num_input_tokens_seen": 102622928, "step": 6378 }, { "epoch": 0.44683775950686033, "grad_norm": 3.658268690109253, "learning_rate": 5.535817162872156e-05, "loss": 0.9257, "num_input_tokens_seen": 102639312, "step": 6379 }, { "epoch": 0.4469078077525896, "grad_norm": 4.125054359436035, "learning_rate": 5.5351173380035026e-05, "loss": 1.0569, "num_input_tokens_seen": 102655104, "step": 6380 }, { "epoch": 0.44697785599831885, "grad_norm": 3.6429343223571777, "learning_rate": 5.534417513134852e-05, "loss": 0.9446, "num_input_tokens_seen": 102671488, "step": 6381 }, { "epoch": 0.4470479042440481, "grad_norm": 4.249630928039551, "learning_rate": 5.533717688266201e-05, "loss": 1.248, "num_input_tokens_seen": 102687872, "step": 6382 }, { "epoch": 0.4471179524897773, "grad_norm": 4.6079277992248535, "learning_rate": 5.5330178633975485e-05, "loss": 1.1774, "num_input_tokens_seen": 102704256, "step": 6383 }, { "epoch": 0.4471880007355066, "grad_norm": 4.406512260437012, "learning_rate": 5.5323180385288966e-05, "loss": 1.0111, "num_input_tokens_seen": 102720640, "step": 6384 }, { "epoch": 0.44725804898123583, "grad_norm": 5.140946865081787, "learning_rate": 5.5316182136602455e-05, "loss": 1.0323, "num_input_tokens_seen": 102735608, "step": 6385 }, { "epoch": 0.4473280972269651, "grad_norm": 5.1392903327941895, "learning_rate": 5.530918388791594e-05, "loss": 1.1535, "num_input_tokens_seen": 102751992, "step": 6386 }, { "epoch": 0.4473981454726943, "grad_norm": 5.466961860656738, "learning_rate": 5.530218563922942e-05, "loss": 1.2679, "num_input_tokens_seen": 102768376, "step": 6387 }, { "epoch": 0.44746819371842356, "grad_norm": 3.7714486122131348, "learning_rate": 5.5295187390542914e-05, "loss": 0.9847, "num_input_tokens_seen": 102784328, "step": 6388 }, { "epoch": 0.4475382419641528, "grad_norm": 3.665609836578369, "learning_rate": 5.52881891418564e-05, "loss": 1.0716, "num_input_tokens_seen": 102800032, "step": 6389 }, { "epoch": 0.4476082902098821, "grad_norm": 6.100143909454346, "learning_rate": 5.528119089316988e-05, "loss": 0.8844, "num_input_tokens_seen": 102816416, "step": 6390 }, { "epoch": 0.4476783384556113, "grad_norm": 3.8393003940582275, "learning_rate": 5.527419264448336e-05, "loss": 1.1134, "num_input_tokens_seen": 102832800, "step": 6391 }, { "epoch": 0.44774838670134054, "grad_norm": 3.862710952758789, "learning_rate": 5.526719439579686e-05, "loss": 1.0571, "num_input_tokens_seen": 102849128, "step": 6392 }, { "epoch": 0.4478184349470698, "grad_norm": 4.032309055328369, "learning_rate": 5.526019614711033e-05, "loss": 1.0123, "num_input_tokens_seen": 102865512, "step": 6393 }, { "epoch": 0.44788848319279906, "grad_norm": 4.250918388366699, "learning_rate": 5.525319789842381e-05, "loss": 0.8542, "num_input_tokens_seen": 102881896, "step": 6394 }, { "epoch": 0.44795853143852826, "grad_norm": 3.8701565265655518, "learning_rate": 5.524619964973731e-05, "loss": 1.0307, "num_input_tokens_seen": 102898192, "step": 6395 }, { "epoch": 0.4480285796842575, "grad_norm": 4.2415571212768555, "learning_rate": 5.52392014010508e-05, "loss": 1.0586, "num_input_tokens_seen": 102914192, "step": 6396 }, { "epoch": 0.4480986279299868, "grad_norm": 3.937345027923584, "learning_rate": 5.5232203152364284e-05, "loss": 1.175, "num_input_tokens_seen": 102929976, "step": 6397 }, { "epoch": 0.44816867617571604, "grad_norm": 4.339337348937988, "learning_rate": 5.5225204903677766e-05, "loss": 1.2494, "num_input_tokens_seen": 102946360, "step": 6398 }, { "epoch": 0.44823872442144524, "grad_norm": 4.744752883911133, "learning_rate": 5.5218206654991255e-05, "loss": 1.1717, "num_input_tokens_seen": 102962744, "step": 6399 }, { "epoch": 0.4483087726671745, "grad_norm": 3.6590077877044678, "learning_rate": 5.521120840630473e-05, "loss": 1.014, "num_input_tokens_seen": 102978456, "step": 6400 }, { "epoch": 0.4483087726671745, "eval_loss": 1.1259907484054565, "eval_runtime": 0.159, "eval_samples_per_second": 6.289, "eval_steps_per_second": 6.289, "num_input_tokens_seen": 102978456, "step": 6400 }, { "epoch": 0.44837882091290376, "grad_norm": 4.073358535766602, "learning_rate": 5.5204210157618205e-05, "loss": 1.1584, "num_input_tokens_seen": 102994280, "step": 6401 }, { "epoch": 0.448448869158633, "grad_norm": 6.3949480056762695, "learning_rate": 5.51972119089317e-05, "loss": 1.0338, "num_input_tokens_seen": 103010664, "step": 6402 }, { "epoch": 0.4485189174043622, "grad_norm": 4.09867525100708, "learning_rate": 5.5190213660245195e-05, "loss": 1.175, "num_input_tokens_seen": 103027048, "step": 6403 }, { "epoch": 0.4485889656500915, "grad_norm": 3.672407865524292, "learning_rate": 5.518321541155868e-05, "loss": 1.1726, "num_input_tokens_seen": 103043432, "step": 6404 }, { "epoch": 0.44865901389582075, "grad_norm": 3.5733370780944824, "learning_rate": 5.517621716287216e-05, "loss": 0.9414, "num_input_tokens_seen": 103059816, "step": 6405 }, { "epoch": 0.44872906214155, "grad_norm": 5.21142578125, "learning_rate": 5.516921891418565e-05, "loss": 1.1563, "num_input_tokens_seen": 103076200, "step": 6406 }, { "epoch": 0.44879911038727927, "grad_norm": 3.4936230182647705, "learning_rate": 5.516222066549913e-05, "loss": 0.9876, "num_input_tokens_seen": 103092040, "step": 6407 }, { "epoch": 0.44886915863300847, "grad_norm": 4.558346271514893, "learning_rate": 5.515522241681261e-05, "loss": 0.9841, "num_input_tokens_seen": 103108424, "step": 6408 }, { "epoch": 0.44893920687873773, "grad_norm": 5.485194206237793, "learning_rate": 5.514822416812611e-05, "loss": 1.0012, "num_input_tokens_seen": 103123544, "step": 6409 }, { "epoch": 0.449009255124467, "grad_norm": 4.365593433380127, "learning_rate": 5.5141225919439575e-05, "loss": 0.9441, "num_input_tokens_seen": 103139928, "step": 6410 }, { "epoch": 0.44907930337019625, "grad_norm": 6.034286022186279, "learning_rate": 5.513422767075306e-05, "loss": 1.1408, "num_input_tokens_seen": 103154960, "step": 6411 }, { "epoch": 0.44914935161592545, "grad_norm": 3.88476300239563, "learning_rate": 5.512722942206655e-05, "loss": 0.8513, "num_input_tokens_seen": 103169984, "step": 6412 }, { "epoch": 0.4492193998616547, "grad_norm": 3.760528087615967, "learning_rate": 5.512023117338004e-05, "loss": 1.07, "num_input_tokens_seen": 103186296, "step": 6413 }, { "epoch": 0.44928944810738397, "grad_norm": 3.779690980911255, "learning_rate": 5.511323292469352e-05, "loss": 0.9531, "num_input_tokens_seen": 103202680, "step": 6414 }, { "epoch": 0.44935949635311323, "grad_norm": 3.6536929607391357, "learning_rate": 5.5106234676007004e-05, "loss": 1.0908, "num_input_tokens_seen": 103218656, "step": 6415 }, { "epoch": 0.44942954459884243, "grad_norm": 3.9258713722229004, "learning_rate": 5.50992364273205e-05, "loss": 1.0175, "num_input_tokens_seen": 103234872, "step": 6416 }, { "epoch": 0.4494995928445717, "grad_norm": 4.860123634338379, "learning_rate": 5.509223817863397e-05, "loss": 1.1094, "num_input_tokens_seen": 103251000, "step": 6417 }, { "epoch": 0.44956964109030095, "grad_norm": 4.924446105957031, "learning_rate": 5.508523992994745e-05, "loss": 1.1265, "num_input_tokens_seen": 103267296, "step": 6418 }, { "epoch": 0.4496396893360302, "grad_norm": 4.334608554840088, "learning_rate": 5.507824168126096e-05, "loss": 0.9163, "num_input_tokens_seen": 103283440, "step": 6419 }, { "epoch": 0.4497097375817594, "grad_norm": 4.686522483825684, "learning_rate": 5.507124343257444e-05, "loss": 1.0374, "num_input_tokens_seen": 103299760, "step": 6420 }, { "epoch": 0.4497797858274887, "grad_norm": 4.797657012939453, "learning_rate": 5.506424518388792e-05, "loss": 1.1277, "num_input_tokens_seen": 103316144, "step": 6421 }, { "epoch": 0.44984983407321794, "grad_norm": 3.443018674850464, "learning_rate": 5.5057246935201404e-05, "loss": 0.943, "num_input_tokens_seen": 103331712, "step": 6422 }, { "epoch": 0.4499198823189472, "grad_norm": 4.118574619293213, "learning_rate": 5.505024868651489e-05, "loss": 1.0539, "num_input_tokens_seen": 103348096, "step": 6423 }, { "epoch": 0.4499899305646764, "grad_norm": 4.0539937019348145, "learning_rate": 5.5043250437828375e-05, "loss": 0.8437, "num_input_tokens_seen": 103364480, "step": 6424 }, { "epoch": 0.45005997881040566, "grad_norm": 4.269721031188965, "learning_rate": 5.503625218914187e-05, "loss": 1.0896, "num_input_tokens_seen": 103380120, "step": 6425 }, { "epoch": 0.4501300270561349, "grad_norm": 4.6834516525268555, "learning_rate": 5.502925394045535e-05, "loss": 1.1162, "num_input_tokens_seen": 103396504, "step": 6426 }, { "epoch": 0.4502000753018642, "grad_norm": 4.42267370223999, "learning_rate": 5.5022255691768834e-05, "loss": 1.0416, "num_input_tokens_seen": 103412632, "step": 6427 }, { "epoch": 0.4502701235475934, "grad_norm": 4.8119797706604, "learning_rate": 5.5015257443082315e-05, "loss": 1.2585, "num_input_tokens_seen": 103428128, "step": 6428 }, { "epoch": 0.45034017179332264, "grad_norm": 4.170595169067383, "learning_rate": 5.50082591943958e-05, "loss": 0.9985, "num_input_tokens_seen": 103444088, "step": 6429 }, { "epoch": 0.4504102200390519, "grad_norm": 3.7060906887054443, "learning_rate": 5.5001260945709286e-05, "loss": 1.0852, "num_input_tokens_seen": 103460456, "step": 6430 }, { "epoch": 0.45048026828478116, "grad_norm": 4.4231977462768555, "learning_rate": 5.499426269702277e-05, "loss": 1.043, "num_input_tokens_seen": 103476840, "step": 6431 }, { "epoch": 0.45055031653051036, "grad_norm": 4.086833477020264, "learning_rate": 5.498726444833625e-05, "loss": 1.2797, "num_input_tokens_seen": 103492808, "step": 6432 }, { "epoch": 0.4506203647762396, "grad_norm": 3.912932872772217, "learning_rate": 5.4980266199649745e-05, "loss": 0.9846, "num_input_tokens_seen": 103508672, "step": 6433 }, { "epoch": 0.4506904130219689, "grad_norm": 3.6088106632232666, "learning_rate": 5.4973267950963227e-05, "loss": 1.0097, "num_input_tokens_seen": 103525056, "step": 6434 }, { "epoch": 0.45076046126769814, "grad_norm": 4.725728511810303, "learning_rate": 5.4966269702276715e-05, "loss": 1.1345, "num_input_tokens_seen": 103541440, "step": 6435 }, { "epoch": 0.45083050951342735, "grad_norm": 6.745354175567627, "learning_rate": 5.49592714535902e-05, "loss": 0.9549, "num_input_tokens_seen": 103556264, "step": 6436 }, { "epoch": 0.4509005577591566, "grad_norm": 4.462937355041504, "learning_rate": 5.495227320490368e-05, "loss": 1.0289, "num_input_tokens_seen": 103571576, "step": 6437 }, { "epoch": 0.45097060600488587, "grad_norm": 4.77189826965332, "learning_rate": 5.494527495621716e-05, "loss": 1.2534, "num_input_tokens_seen": 103587360, "step": 6438 }, { "epoch": 0.4510406542506151, "grad_norm": 5.734838962554932, "learning_rate": 5.493827670753064e-05, "loss": 1.0698, "num_input_tokens_seen": 103603744, "step": 6439 }, { "epoch": 0.4511107024963444, "grad_norm": 4.221588134765625, "learning_rate": 5.493127845884414e-05, "loss": 1.1712, "num_input_tokens_seen": 103620128, "step": 6440 }, { "epoch": 0.4511807507420736, "grad_norm": 3.894184112548828, "learning_rate": 5.492428021015762e-05, "loss": 1.1204, "num_input_tokens_seen": 103636512, "step": 6441 }, { "epoch": 0.45125079898780285, "grad_norm": 4.208652496337891, "learning_rate": 5.4917281961471115e-05, "loss": 1.1809, "num_input_tokens_seen": 103652824, "step": 6442 }, { "epoch": 0.4513208472335321, "grad_norm": 3.4426159858703613, "learning_rate": 5.49102837127846e-05, "loss": 1.0433, "num_input_tokens_seen": 103669000, "step": 6443 }, { "epoch": 0.45139089547926137, "grad_norm": 3.628229856491089, "learning_rate": 5.490328546409808e-05, "loss": 0.918, "num_input_tokens_seen": 103684792, "step": 6444 }, { "epoch": 0.45146094372499057, "grad_norm": 6.008549690246582, "learning_rate": 5.489628721541156e-05, "loss": 1.2477, "num_input_tokens_seen": 103701176, "step": 6445 }, { "epoch": 0.45153099197071983, "grad_norm": 4.023336887359619, "learning_rate": 5.488928896672506e-05, "loss": 1.2522, "num_input_tokens_seen": 103716488, "step": 6446 }, { "epoch": 0.4516010402164491, "grad_norm": 3.6931705474853516, "learning_rate": 5.488229071803853e-05, "loss": 0.9123, "num_input_tokens_seen": 103732560, "step": 6447 }, { "epoch": 0.45167108846217835, "grad_norm": 3.7356324195861816, "learning_rate": 5.487529246935201e-05, "loss": 1.0913, "num_input_tokens_seen": 103748512, "step": 6448 }, { "epoch": 0.45174113670790755, "grad_norm": 5.176403045654297, "learning_rate": 5.486829422066551e-05, "loss": 1.0758, "num_input_tokens_seen": 103764896, "step": 6449 }, { "epoch": 0.4518111849536368, "grad_norm": 4.492616176605225, "learning_rate": 5.486129597197899e-05, "loss": 1.2558, "num_input_tokens_seen": 103781280, "step": 6450 }, { "epoch": 0.4518812331993661, "grad_norm": 4.058090686798096, "learning_rate": 5.485429772329247e-05, "loss": 1.1313, "num_input_tokens_seen": 103797664, "step": 6451 }, { "epoch": 0.45195128144509533, "grad_norm": 3.6828136444091797, "learning_rate": 5.484729947460596e-05, "loss": 1.0972, "num_input_tokens_seen": 103813912, "step": 6452 }, { "epoch": 0.45202132969082454, "grad_norm": 3.4555649757385254, "learning_rate": 5.484030122591944e-05, "loss": 1.0052, "num_input_tokens_seen": 103830296, "step": 6453 }, { "epoch": 0.4520913779365538, "grad_norm": 4.12479305267334, "learning_rate": 5.4833302977232924e-05, "loss": 0.919, "num_input_tokens_seen": 103846272, "step": 6454 }, { "epoch": 0.45216142618228305, "grad_norm": 4.1249237060546875, "learning_rate": 5.4826304728546406e-05, "loss": 0.9013, "num_input_tokens_seen": 103862408, "step": 6455 }, { "epoch": 0.4522314744280123, "grad_norm": 4.026651859283447, "learning_rate": 5.481930647985989e-05, "loss": 1.0143, "num_input_tokens_seen": 103878592, "step": 6456 }, { "epoch": 0.4523015226737415, "grad_norm": 4.157918453216553, "learning_rate": 5.481230823117338e-05, "loss": 1.0132, "num_input_tokens_seen": 103894512, "step": 6457 }, { "epoch": 0.4523715709194708, "grad_norm": 3.4476771354675293, "learning_rate": 5.4805309982486865e-05, "loss": 0.941, "num_input_tokens_seen": 103910728, "step": 6458 }, { "epoch": 0.45244161916520004, "grad_norm": 5.755035877227783, "learning_rate": 5.479831173380035e-05, "loss": 1.1735, "num_input_tokens_seen": 103927112, "step": 6459 }, { "epoch": 0.4525116674109293, "grad_norm": 3.774343967437744, "learning_rate": 5.4791313485113835e-05, "loss": 1.1775, "num_input_tokens_seen": 103943496, "step": 6460 }, { "epoch": 0.4525817156566585, "grad_norm": 3.8584420680999756, "learning_rate": 5.478431523642732e-05, "loss": 1.0433, "num_input_tokens_seen": 103959880, "step": 6461 }, { "epoch": 0.45265176390238776, "grad_norm": 3.545832872390747, "learning_rate": 5.477731698774081e-05, "loss": 1.0117, "num_input_tokens_seen": 103976264, "step": 6462 }, { "epoch": 0.452721812148117, "grad_norm": 4.018779277801514, "learning_rate": 5.477031873905431e-05, "loss": 1.0711, "num_input_tokens_seen": 103991720, "step": 6463 }, { "epoch": 0.4527918603938463, "grad_norm": 3.966514825820923, "learning_rate": 5.476332049036779e-05, "loss": 1.1632, "num_input_tokens_seen": 104007488, "step": 6464 }, { "epoch": 0.4528619086395755, "grad_norm": 3.8280792236328125, "learning_rate": 5.475632224168126e-05, "loss": 0.9702, "num_input_tokens_seen": 104023096, "step": 6465 }, { "epoch": 0.45293195688530474, "grad_norm": 6.540561676025391, "learning_rate": 5.474932399299475e-05, "loss": 1.2517, "num_input_tokens_seen": 104038808, "step": 6466 }, { "epoch": 0.453002005131034, "grad_norm": 4.703604221343994, "learning_rate": 5.4742325744308235e-05, "loss": 1.1562, "num_input_tokens_seen": 104053944, "step": 6467 }, { "epoch": 0.45307205337676326, "grad_norm": 3.950582504272461, "learning_rate": 5.473532749562171e-05, "loss": 0.9822, "num_input_tokens_seen": 104070304, "step": 6468 }, { "epoch": 0.45314210162249247, "grad_norm": 5.277374744415283, "learning_rate": 5.4728329246935205e-05, "loss": 1.1024, "num_input_tokens_seen": 104086088, "step": 6469 }, { "epoch": 0.4532121498682217, "grad_norm": 4.449152946472168, "learning_rate": 5.47213309982487e-05, "loss": 1.2031, "num_input_tokens_seen": 104102472, "step": 6470 }, { "epoch": 0.453282198113951, "grad_norm": 3.780017852783203, "learning_rate": 5.471433274956218e-05, "loss": 1.0398, "num_input_tokens_seen": 104117552, "step": 6471 }, { "epoch": 0.45335224635968024, "grad_norm": 3.502319574356079, "learning_rate": 5.470733450087565e-05, "loss": 1.0531, "num_input_tokens_seen": 104133936, "step": 6472 }, { "epoch": 0.45342229460540945, "grad_norm": 4.8112311363220215, "learning_rate": 5.470033625218915e-05, "loss": 1.3107, "num_input_tokens_seen": 104150320, "step": 6473 }, { "epoch": 0.4534923428511387, "grad_norm": 3.640571355819702, "learning_rate": 5.4693338003502635e-05, "loss": 0.8591, "num_input_tokens_seen": 104165544, "step": 6474 }, { "epoch": 0.45356239109686797, "grad_norm": 3.796278953552246, "learning_rate": 5.468633975481611e-05, "loss": 0.9476, "num_input_tokens_seen": 104181928, "step": 6475 }, { "epoch": 0.4536324393425972, "grad_norm": 4.749582767486572, "learning_rate": 5.46793415061296e-05, "loss": 1.0821, "num_input_tokens_seen": 104197168, "step": 6476 }, { "epoch": 0.4537024875883265, "grad_norm": 4.0797271728515625, "learning_rate": 5.467234325744308e-05, "loss": 0.9838, "num_input_tokens_seen": 104213000, "step": 6477 }, { "epoch": 0.4537725358340557, "grad_norm": 4.250664710998535, "learning_rate": 5.466534500875656e-05, "loss": 1.1786, "num_input_tokens_seen": 104229136, "step": 6478 }, { "epoch": 0.45384258407978495, "grad_norm": 3.8380579948425293, "learning_rate": 5.465834676007006e-05, "loss": 1.0362, "num_input_tokens_seen": 104245520, "step": 6479 }, { "epoch": 0.4539126323255142, "grad_norm": 3.7329459190368652, "learning_rate": 5.4651348511383546e-05, "loss": 1.0818, "num_input_tokens_seen": 104261904, "step": 6480 }, { "epoch": 0.45398268057124347, "grad_norm": 4.495264530181885, "learning_rate": 5.464435026269703e-05, "loss": 1.1216, "num_input_tokens_seen": 104278288, "step": 6481 }, { "epoch": 0.4540527288169727, "grad_norm": 3.7195420265197754, "learning_rate": 5.46373520140105e-05, "loss": 1.1697, "num_input_tokens_seen": 104294192, "step": 6482 }, { "epoch": 0.45412277706270193, "grad_norm": 5.255592346191406, "learning_rate": 5.463035376532399e-05, "loss": 1.2848, "num_input_tokens_seen": 104308704, "step": 6483 }, { "epoch": 0.4541928253084312, "grad_norm": 4.61810302734375, "learning_rate": 5.462335551663749e-05, "loss": 1.0923, "num_input_tokens_seen": 104325088, "step": 6484 }, { "epoch": 0.45426287355416045, "grad_norm": 7.175589561462402, "learning_rate": 5.4616357267950955e-05, "loss": 1.1434, "num_input_tokens_seen": 104341224, "step": 6485 }, { "epoch": 0.45433292179988966, "grad_norm": 3.756762742996216, "learning_rate": 5.460935901926445e-05, "loss": 0.9934, "num_input_tokens_seen": 104356392, "step": 6486 }, { "epoch": 0.4544029700456189, "grad_norm": 3.979435920715332, "learning_rate": 5.4602360770577946e-05, "loss": 0.777, "num_input_tokens_seen": 104372776, "step": 6487 }, { "epoch": 0.4544730182913482, "grad_norm": 3.761296272277832, "learning_rate": 5.459536252189143e-05, "loss": 1.113, "num_input_tokens_seen": 104389160, "step": 6488 }, { "epoch": 0.45454306653707743, "grad_norm": 5.02775239944458, "learning_rate": 5.458836427320491e-05, "loss": 0.9833, "num_input_tokens_seen": 104404768, "step": 6489 }, { "epoch": 0.45461311478280664, "grad_norm": 4.161303997039795, "learning_rate": 5.45813660245184e-05, "loss": 0.9746, "num_input_tokens_seen": 104420152, "step": 6490 }, { "epoch": 0.4546831630285359, "grad_norm": 3.7053780555725098, "learning_rate": 5.457436777583188e-05, "loss": 0.7889, "num_input_tokens_seen": 104435512, "step": 6491 }, { "epoch": 0.45475321127426516, "grad_norm": 4.103651523590088, "learning_rate": 5.456736952714535e-05, "loss": 1.0379, "num_input_tokens_seen": 104451896, "step": 6492 }, { "epoch": 0.4548232595199944, "grad_norm": 4.212504863739014, "learning_rate": 5.456037127845884e-05, "loss": 1.0417, "num_input_tokens_seen": 104468280, "step": 6493 }, { "epoch": 0.4548933077657236, "grad_norm": 6.549145221710205, "learning_rate": 5.455337302977234e-05, "loss": 1.0627, "num_input_tokens_seen": 104484520, "step": 6494 }, { "epoch": 0.4549633560114529, "grad_norm": 3.777740478515625, "learning_rate": 5.454637478108582e-05, "loss": 1.2379, "num_input_tokens_seen": 104500536, "step": 6495 }, { "epoch": 0.45503340425718214, "grad_norm": 3.827119827270508, "learning_rate": 5.45393765323993e-05, "loss": 0.9852, "num_input_tokens_seen": 104516712, "step": 6496 }, { "epoch": 0.4551034525029114, "grad_norm": 4.231398105621338, "learning_rate": 5.453237828371279e-05, "loss": 1.0009, "num_input_tokens_seen": 104532792, "step": 6497 }, { "epoch": 0.4551735007486406, "grad_norm": 5.237041473388672, "learning_rate": 5.452538003502627e-05, "loss": 1.0285, "num_input_tokens_seen": 104549136, "step": 6498 }, { "epoch": 0.45524354899436986, "grad_norm": 4.457448482513428, "learning_rate": 5.451838178633975e-05, "loss": 1.2198, "num_input_tokens_seen": 104565320, "step": 6499 }, { "epoch": 0.4553135972400991, "grad_norm": 3.7427215576171875, "learning_rate": 5.451138353765325e-05, "loss": 1.1668, "num_input_tokens_seen": 104580744, "step": 6500 }, { "epoch": 0.4553836454858284, "grad_norm": 4.094877243041992, "learning_rate": 5.450438528896673e-05, "loss": 1.1735, "num_input_tokens_seen": 104596576, "step": 6501 }, { "epoch": 0.4554536937315576, "grad_norm": 4.290172576904297, "learning_rate": 5.4497387040280214e-05, "loss": 1.2692, "num_input_tokens_seen": 104612960, "step": 6502 }, { "epoch": 0.45552374197728684, "grad_norm": 3.5206210613250732, "learning_rate": 5.4490388791593695e-05, "loss": 0.9754, "num_input_tokens_seen": 104627664, "step": 6503 }, { "epoch": 0.4555937902230161, "grad_norm": 3.7847232818603516, "learning_rate": 5.4483390542907184e-05, "loss": 1.1653, "num_input_tokens_seen": 104644048, "step": 6504 }, { "epoch": 0.45566383846874536, "grad_norm": 4.193985939025879, "learning_rate": 5.4476392294220666e-05, "loss": 1.1462, "num_input_tokens_seen": 104659664, "step": 6505 }, { "epoch": 0.45573388671447457, "grad_norm": 3.819429874420166, "learning_rate": 5.446939404553415e-05, "loss": 1.0596, "num_input_tokens_seen": 104676048, "step": 6506 }, { "epoch": 0.4558039349602038, "grad_norm": 3.7949306964874268, "learning_rate": 5.446239579684764e-05, "loss": 1.016, "num_input_tokens_seen": 104692432, "step": 6507 }, { "epoch": 0.4558739832059331, "grad_norm": 3.880740165710449, "learning_rate": 5.4455397548161125e-05, "loss": 1.2764, "num_input_tokens_seen": 104708616, "step": 6508 }, { "epoch": 0.45594403145166235, "grad_norm": 5.389898777008057, "learning_rate": 5.444839929947459e-05, "loss": 1.1917, "num_input_tokens_seen": 104724984, "step": 6509 }, { "epoch": 0.4560140796973916, "grad_norm": 5.053036689758301, "learning_rate": 5.44414010507881e-05, "loss": 1.1091, "num_input_tokens_seen": 104741368, "step": 6510 }, { "epoch": 0.4560841279431208, "grad_norm": 4.7108330726623535, "learning_rate": 5.4434402802101584e-05, "loss": 1.1051, "num_input_tokens_seen": 104757752, "step": 6511 }, { "epoch": 0.45615417618885007, "grad_norm": 3.8108251094818115, "learning_rate": 5.4427404553415066e-05, "loss": 0.9524, "num_input_tokens_seen": 104774136, "step": 6512 }, { "epoch": 0.45622422443457933, "grad_norm": 3.8631815910339355, "learning_rate": 5.442040630472854e-05, "loss": 1.1041, "num_input_tokens_seen": 104790288, "step": 6513 }, { "epoch": 0.4562942726803086, "grad_norm": 3.745565176010132, "learning_rate": 5.441340805604205e-05, "loss": 1.086, "num_input_tokens_seen": 104806672, "step": 6514 }, { "epoch": 0.4563643209260378, "grad_norm": 3.6682205200195312, "learning_rate": 5.440640980735552e-05, "loss": 1.0626, "num_input_tokens_seen": 104822824, "step": 6515 }, { "epoch": 0.45643436917176705, "grad_norm": 4.422383785247803, "learning_rate": 5.439941155866901e-05, "loss": 1.059, "num_input_tokens_seen": 104839208, "step": 6516 }, { "epoch": 0.4565044174174963, "grad_norm": 5.5291242599487305, "learning_rate": 5.4392413309982495e-05, "loss": 0.9887, "num_input_tokens_seen": 104855296, "step": 6517 }, { "epoch": 0.45657446566322557, "grad_norm": 7.490913391113281, "learning_rate": 5.438541506129598e-05, "loss": 1.2157, "num_input_tokens_seen": 104871440, "step": 6518 }, { "epoch": 0.4566445139089548, "grad_norm": 5.1885528564453125, "learning_rate": 5.437841681260946e-05, "loss": 1.1799, "num_input_tokens_seen": 104887824, "step": 6519 }, { "epoch": 0.45671456215468403, "grad_norm": 4.4618096351623535, "learning_rate": 5.437141856392295e-05, "loss": 1.34, "num_input_tokens_seen": 104904208, "step": 6520 }, { "epoch": 0.4567846104004133, "grad_norm": 3.8809101581573486, "learning_rate": 5.436442031523643e-05, "loss": 1.0538, "num_input_tokens_seen": 104920592, "step": 6521 }, { "epoch": 0.45685465864614255, "grad_norm": 3.429588794708252, "learning_rate": 5.435742206654991e-05, "loss": 0.975, "num_input_tokens_seen": 104936976, "step": 6522 }, { "epoch": 0.45692470689187176, "grad_norm": 3.714005947113037, "learning_rate": 5.435042381786339e-05, "loss": 0.9339, "num_input_tokens_seen": 104953136, "step": 6523 }, { "epoch": 0.456994755137601, "grad_norm": 4.082497596740723, "learning_rate": 5.4343425569176895e-05, "loss": 1.1525, "num_input_tokens_seen": 104969520, "step": 6524 }, { "epoch": 0.4570648033833303, "grad_norm": 5.983520030975342, "learning_rate": 5.433642732049037e-05, "loss": 1.1925, "num_input_tokens_seen": 104984096, "step": 6525 }, { "epoch": 0.45713485162905954, "grad_norm": 4.282527446746826, "learning_rate": 5.432942907180385e-05, "loss": 1.0791, "num_input_tokens_seen": 105000120, "step": 6526 }, { "epoch": 0.45720489987478874, "grad_norm": 4.0138726234436035, "learning_rate": 5.432243082311734e-05, "loss": 1.212, "num_input_tokens_seen": 105016504, "step": 6527 }, { "epoch": 0.457274948120518, "grad_norm": 7.047135829925537, "learning_rate": 5.431543257443082e-05, "loss": 1.1071, "num_input_tokens_seen": 105031360, "step": 6528 }, { "epoch": 0.45734499636624726, "grad_norm": 4.966803550720215, "learning_rate": 5.4308434325744304e-05, "loss": 1.084, "num_input_tokens_seen": 105047744, "step": 6529 }, { "epoch": 0.4574150446119765, "grad_norm": 3.5288639068603516, "learning_rate": 5.43014360770578e-05, "loss": 1.0007, "num_input_tokens_seen": 105064128, "step": 6530 }, { "epoch": 0.4574850928577057, "grad_norm": 5.45017147064209, "learning_rate": 5.4294437828371295e-05, "loss": 1.0761, "num_input_tokens_seen": 105080512, "step": 6531 }, { "epoch": 0.457555141103435, "grad_norm": 4.392576694488525, "learning_rate": 5.428743957968476e-05, "loss": 1.1192, "num_input_tokens_seen": 105096896, "step": 6532 }, { "epoch": 0.45762518934916424, "grad_norm": 3.4714255332946777, "learning_rate": 5.428044133099826e-05, "loss": 0.94, "num_input_tokens_seen": 105113280, "step": 6533 }, { "epoch": 0.4576952375948935, "grad_norm": 4.999673366546631, "learning_rate": 5.427344308231175e-05, "loss": 0.8589, "num_input_tokens_seen": 105128640, "step": 6534 }, { "epoch": 0.4577652858406227, "grad_norm": 3.9281249046325684, "learning_rate": 5.426644483362522e-05, "loss": 1.2802, "num_input_tokens_seen": 105144176, "step": 6535 }, { "epoch": 0.45783533408635196, "grad_norm": 4.223507881164551, "learning_rate": 5.42594465849387e-05, "loss": 0.9767, "num_input_tokens_seen": 105160560, "step": 6536 }, { "epoch": 0.4579053823320812, "grad_norm": 3.7774858474731445, "learning_rate": 5.425244833625219e-05, "loss": 0.8401, "num_input_tokens_seen": 105176696, "step": 6537 }, { "epoch": 0.4579754305778105, "grad_norm": 3.666398048400879, "learning_rate": 5.424545008756569e-05, "loss": 1.0199, "num_input_tokens_seen": 105192376, "step": 6538 }, { "epoch": 0.4580454788235397, "grad_norm": 4.442626476287842, "learning_rate": 5.4238451838879156e-05, "loss": 1.2166, "num_input_tokens_seen": 105208192, "step": 6539 }, { "epoch": 0.45811552706926895, "grad_norm": 3.8626255989074707, "learning_rate": 5.423145359019265e-05, "loss": 1.2679, "num_input_tokens_seen": 105224576, "step": 6540 }, { "epoch": 0.4581855753149982, "grad_norm": 3.713498830795288, "learning_rate": 5.422445534150614e-05, "loss": 1.0785, "num_input_tokens_seen": 105240960, "step": 6541 }, { "epoch": 0.45825562356072747, "grad_norm": 5.06941032409668, "learning_rate": 5.4217457092819615e-05, "loss": 1.1006, "num_input_tokens_seen": 105257344, "step": 6542 }, { "epoch": 0.4583256718064567, "grad_norm": 3.6487746238708496, "learning_rate": 5.4210458844133103e-05, "loss": 0.9838, "num_input_tokens_seen": 105273336, "step": 6543 }, { "epoch": 0.45839572005218593, "grad_norm": 3.70211124420166, "learning_rate": 5.42034605954466e-05, "loss": 1.0334, "num_input_tokens_seen": 105289720, "step": 6544 }, { "epoch": 0.4584657682979152, "grad_norm": 5.169928073883057, "learning_rate": 5.419646234676007e-05, "loss": 0.9696, "num_input_tokens_seen": 105306104, "step": 6545 }, { "epoch": 0.45853581654364445, "grad_norm": 4.101007461547852, "learning_rate": 5.418946409807355e-05, "loss": 1.1545, "num_input_tokens_seen": 105322488, "step": 6546 }, { "epoch": 0.4586058647893737, "grad_norm": 4.077839374542236, "learning_rate": 5.4182465849387044e-05, "loss": 0.9885, "num_input_tokens_seen": 105338872, "step": 6547 }, { "epoch": 0.4586759130351029, "grad_norm": 4.46600341796875, "learning_rate": 5.417546760070053e-05, "loss": 1.0451, "num_input_tokens_seen": 105355256, "step": 6548 }, { "epoch": 0.45874596128083217, "grad_norm": 3.765453577041626, "learning_rate": 5.416846935201401e-05, "loss": 0.9365, "num_input_tokens_seen": 105370928, "step": 6549 }, { "epoch": 0.45881600952656143, "grad_norm": 3.913649559020996, "learning_rate": 5.4161471103327496e-05, "loss": 1.0764, "num_input_tokens_seen": 105387312, "step": 6550 }, { "epoch": 0.4588860577722907, "grad_norm": 5.323554992675781, "learning_rate": 5.415447285464099e-05, "loss": 1.0202, "num_input_tokens_seen": 105403408, "step": 6551 }, { "epoch": 0.4589561060180199, "grad_norm": 3.8482306003570557, "learning_rate": 5.414747460595446e-05, "loss": 1.011, "num_input_tokens_seen": 105419792, "step": 6552 }, { "epoch": 0.45902615426374915, "grad_norm": 4.369050025939941, "learning_rate": 5.414047635726794e-05, "loss": 0.9605, "num_input_tokens_seen": 105435240, "step": 6553 }, { "epoch": 0.4590962025094784, "grad_norm": 3.4255287647247314, "learning_rate": 5.413347810858145e-05, "loss": 0.9871, "num_input_tokens_seen": 105451568, "step": 6554 }, { "epoch": 0.45916625075520767, "grad_norm": 4.246303081512451, "learning_rate": 5.412647985989493e-05, "loss": 1.0404, "num_input_tokens_seen": 105467952, "step": 6555 }, { "epoch": 0.4592362990009369, "grad_norm": 3.785661220550537, "learning_rate": 5.4119481611208414e-05, "loss": 0.9251, "num_input_tokens_seen": 105484336, "step": 6556 }, { "epoch": 0.45930634724666614, "grad_norm": 3.661653757095337, "learning_rate": 5.4112483362521896e-05, "loss": 0.9834, "num_input_tokens_seen": 105500720, "step": 6557 }, { "epoch": 0.4593763954923954, "grad_norm": 4.362829685211182, "learning_rate": 5.4105485113835385e-05, "loss": 1.084, "num_input_tokens_seen": 105516488, "step": 6558 }, { "epoch": 0.45944644373812465, "grad_norm": 3.867062568664551, "learning_rate": 5.409848686514885e-05, "loss": 1.048, "num_input_tokens_seen": 105532736, "step": 6559 }, { "epoch": 0.45951649198385386, "grad_norm": 3.8351078033447266, "learning_rate": 5.409148861646236e-05, "loss": 1.0162, "num_input_tokens_seen": 105549120, "step": 6560 }, { "epoch": 0.4595865402295831, "grad_norm": 4.525234699249268, "learning_rate": 5.4084490367775844e-05, "loss": 1.0798, "num_input_tokens_seen": 105564776, "step": 6561 }, { "epoch": 0.4596565884753124, "grad_norm": 3.8182532787323, "learning_rate": 5.4077492119089326e-05, "loss": 1.021, "num_input_tokens_seen": 105581080, "step": 6562 }, { "epoch": 0.45972663672104164, "grad_norm": 3.82145619392395, "learning_rate": 5.407049387040281e-05, "loss": 1.0482, "num_input_tokens_seen": 105596496, "step": 6563 }, { "epoch": 0.45979668496677084, "grad_norm": 4.378223419189453, "learning_rate": 5.406349562171629e-05, "loss": 1.0795, "num_input_tokens_seen": 105612200, "step": 6564 }, { "epoch": 0.4598667332125001, "grad_norm": 4.628854274749756, "learning_rate": 5.405649737302978e-05, "loss": 1.1581, "num_input_tokens_seen": 105628584, "step": 6565 }, { "epoch": 0.45993678145822936, "grad_norm": 5.091843128204346, "learning_rate": 5.404949912434326e-05, "loss": 1.203, "num_input_tokens_seen": 105644968, "step": 6566 }, { "epoch": 0.4600068297039586, "grad_norm": 4.4174580574035645, "learning_rate": 5.404250087565674e-05, "loss": 1.1291, "num_input_tokens_seen": 105661352, "step": 6567 }, { "epoch": 0.4600768779496878, "grad_norm": 4.136083602905273, "learning_rate": 5.403550262697024e-05, "loss": 1.183, "num_input_tokens_seen": 105677200, "step": 6568 }, { "epoch": 0.4601469261954171, "grad_norm": 4.388592720031738, "learning_rate": 5.4028504378283705e-05, "loss": 1.2882, "num_input_tokens_seen": 105693584, "step": 6569 }, { "epoch": 0.46021697444114634, "grad_norm": 5.642048358917236, "learning_rate": 5.402150612959721e-05, "loss": 1.1693, "num_input_tokens_seen": 105709688, "step": 6570 }, { "epoch": 0.4602870226868756, "grad_norm": 4.107602596282959, "learning_rate": 5.401450788091069e-05, "loss": 1.2681, "num_input_tokens_seen": 105725608, "step": 6571 }, { "epoch": 0.4603570709326048, "grad_norm": 6.477549076080322, "learning_rate": 5.400750963222417e-05, "loss": 1.0574, "num_input_tokens_seen": 105741992, "step": 6572 }, { "epoch": 0.46042711917833407, "grad_norm": 4.7183380126953125, "learning_rate": 5.400051138353765e-05, "loss": 1.087, "num_input_tokens_seen": 105758376, "step": 6573 }, { "epoch": 0.4604971674240633, "grad_norm": 4.523158073425293, "learning_rate": 5.3993513134851135e-05, "loss": 1.0595, "num_input_tokens_seen": 105774760, "step": 6574 }, { "epoch": 0.4605672156697926, "grad_norm": 3.631554126739502, "learning_rate": 5.398651488616463e-05, "loss": 1.0407, "num_input_tokens_seen": 105791144, "step": 6575 }, { "epoch": 0.4606372639155218, "grad_norm": 8.034467697143555, "learning_rate": 5.397951663747811e-05, "loss": 1.0565, "num_input_tokens_seen": 105807528, "step": 6576 }, { "epoch": 0.46070731216125105, "grad_norm": 3.780055522918701, "learning_rate": 5.397251838879161e-05, "loss": 1.0622, "num_input_tokens_seen": 105823592, "step": 6577 }, { "epoch": 0.4607773604069803, "grad_norm": 3.975475549697876, "learning_rate": 5.396552014010509e-05, "loss": 1.0651, "num_input_tokens_seen": 105839976, "step": 6578 }, { "epoch": 0.46084740865270957, "grad_norm": 3.4668362140655518, "learning_rate": 5.395852189141857e-05, "loss": 0.9884, "num_input_tokens_seen": 105856360, "step": 6579 }, { "epoch": 0.4609174568984388, "grad_norm": 3.7928245067596436, "learning_rate": 5.395152364273205e-05, "loss": 1.0635, "num_input_tokens_seen": 105872744, "step": 6580 }, { "epoch": 0.46098750514416803, "grad_norm": 3.8289833068847656, "learning_rate": 5.3944525394045555e-05, "loss": 0.8981, "num_input_tokens_seen": 105888528, "step": 6581 }, { "epoch": 0.4610575533898973, "grad_norm": 6.435444355010986, "learning_rate": 5.393752714535902e-05, "loss": 1.0163, "num_input_tokens_seen": 105903592, "step": 6582 }, { "epoch": 0.46112760163562655, "grad_norm": 4.274429798126221, "learning_rate": 5.3930528896672505e-05, "loss": 1.3321, "num_input_tokens_seen": 105919904, "step": 6583 }, { "epoch": 0.4611976498813558, "grad_norm": 3.619840145111084, "learning_rate": 5.3923530647986e-05, "loss": 0.8481, "num_input_tokens_seen": 105936288, "step": 6584 }, { "epoch": 0.461267698127085, "grad_norm": 3.643489122390747, "learning_rate": 5.391653239929948e-05, "loss": 1.2037, "num_input_tokens_seen": 105952624, "step": 6585 }, { "epoch": 0.4613377463728143, "grad_norm": 3.5494256019592285, "learning_rate": 5.3909534150612964e-05, "loss": 1.0352, "num_input_tokens_seen": 105968568, "step": 6586 }, { "epoch": 0.46140779461854353, "grad_norm": 5.754514694213867, "learning_rate": 5.390253590192645e-05, "loss": 0.8431, "num_input_tokens_seen": 105984952, "step": 6587 }, { "epoch": 0.4614778428642728, "grad_norm": 3.9911015033721924, "learning_rate": 5.3895537653239934e-05, "loss": 0.9876, "num_input_tokens_seen": 106001336, "step": 6588 }, { "epoch": 0.461547891110002, "grad_norm": 4.1558756828308105, "learning_rate": 5.3888539404553416e-05, "loss": 1.0537, "num_input_tokens_seen": 106016736, "step": 6589 }, { "epoch": 0.46161793935573125, "grad_norm": 4.300850868225098, "learning_rate": 5.38815411558669e-05, "loss": 1.0303, "num_input_tokens_seen": 106033120, "step": 6590 }, { "epoch": 0.4616879876014605, "grad_norm": 6.03284215927124, "learning_rate": 5.387454290718038e-05, "loss": 1.0919, "num_input_tokens_seen": 106049504, "step": 6591 }, { "epoch": 0.4617580358471898, "grad_norm": 4.091002941131592, "learning_rate": 5.3867544658493875e-05, "loss": 1.205, "num_input_tokens_seen": 106065632, "step": 6592 }, { "epoch": 0.461828084092919, "grad_norm": 3.7395520210266113, "learning_rate": 5.386054640980736e-05, "loss": 0.9516, "num_input_tokens_seen": 106081632, "step": 6593 }, { "epoch": 0.46189813233864824, "grad_norm": 4.021444797515869, "learning_rate": 5.3853548161120845e-05, "loss": 1.0859, "num_input_tokens_seen": 106097376, "step": 6594 }, { "epoch": 0.4619681805843775, "grad_norm": 5.202040672302246, "learning_rate": 5.384654991243433e-05, "loss": 0.935, "num_input_tokens_seen": 106113096, "step": 6595 }, { "epoch": 0.46203822883010676, "grad_norm": 8.020401000976562, "learning_rate": 5.383955166374781e-05, "loss": 1.0943, "num_input_tokens_seen": 106129480, "step": 6596 }, { "epoch": 0.46210827707583596, "grad_norm": 4.892960548400879, "learning_rate": 5.3832553415061304e-05, "loss": 1.0347, "num_input_tokens_seen": 106145864, "step": 6597 }, { "epoch": 0.4621783253215652, "grad_norm": 3.963135004043579, "learning_rate": 5.38255551663748e-05, "loss": 1.0275, "num_input_tokens_seen": 106162248, "step": 6598 }, { "epoch": 0.4622483735672945, "grad_norm": 5.362968444824219, "learning_rate": 5.381855691768827e-05, "loss": 0.9192, "num_input_tokens_seen": 106177616, "step": 6599 }, { "epoch": 0.46231842181302374, "grad_norm": 5.272266864776611, "learning_rate": 5.381155866900175e-05, "loss": 1.3188, "num_input_tokens_seen": 106194000, "step": 6600 }, { "epoch": 0.46231842181302374, "eval_loss": 1.1238571405410767, "eval_runtime": 0.1703, "eval_samples_per_second": 5.872, "eval_steps_per_second": 5.872, "num_input_tokens_seen": 106194000, "step": 6600 }, { "epoch": 0.46238847005875294, "grad_norm": 6.129757881164551, "learning_rate": 5.3804560420315245e-05, "loss": 0.9134, "num_input_tokens_seen": 106210384, "step": 6601 }, { "epoch": 0.4624585183044822, "grad_norm": 4.237639904022217, "learning_rate": 5.379756217162873e-05, "loss": 1.013, "num_input_tokens_seen": 106226240, "step": 6602 }, { "epoch": 0.46252856655021146, "grad_norm": 3.4758036136627197, "learning_rate": 5.37905639229422e-05, "loss": 0.9542, "num_input_tokens_seen": 106242624, "step": 6603 }, { "epoch": 0.4625986147959407, "grad_norm": 4.031625270843506, "learning_rate": 5.37835656742557e-05, "loss": 0.9543, "num_input_tokens_seen": 106259008, "step": 6604 }, { "epoch": 0.4626686630416699, "grad_norm": 3.9605302810668945, "learning_rate": 5.377656742556919e-05, "loss": 1.1185, "num_input_tokens_seen": 106274960, "step": 6605 }, { "epoch": 0.4627387112873992, "grad_norm": 3.5777320861816406, "learning_rate": 5.3769569176882675e-05, "loss": 1.1453, "num_input_tokens_seen": 106291344, "step": 6606 }, { "epoch": 0.46280875953312844, "grad_norm": 3.553462505340576, "learning_rate": 5.376257092819614e-05, "loss": 0.9831, "num_input_tokens_seen": 106306184, "step": 6607 }, { "epoch": 0.4628788077788577, "grad_norm": 3.745340347290039, "learning_rate": 5.3755572679509645e-05, "loss": 0.8471, "num_input_tokens_seen": 106320360, "step": 6608 }, { "epoch": 0.4629488560245869, "grad_norm": 3.7483649253845215, "learning_rate": 5.374857443082312e-05, "loss": 1.0482, "num_input_tokens_seen": 106336664, "step": 6609 }, { "epoch": 0.46301890427031617, "grad_norm": 3.675184726715088, "learning_rate": 5.37415761821366e-05, "loss": 1.0679, "num_input_tokens_seen": 106353048, "step": 6610 }, { "epoch": 0.4630889525160454, "grad_norm": 4.733851432800293, "learning_rate": 5.373457793345009e-05, "loss": 1.0269, "num_input_tokens_seen": 106369432, "step": 6611 }, { "epoch": 0.4631590007617747, "grad_norm": 3.9618589878082275, "learning_rate": 5.372757968476357e-05, "loss": 1.0667, "num_input_tokens_seen": 106385592, "step": 6612 }, { "epoch": 0.46322904900750395, "grad_norm": 3.95268177986145, "learning_rate": 5.3720581436077054e-05, "loss": 1.1044, "num_input_tokens_seen": 106401976, "step": 6613 }, { "epoch": 0.46329909725323315, "grad_norm": 4.600008010864258, "learning_rate": 5.371358318739055e-05, "loss": 1.0004, "num_input_tokens_seen": 106418360, "step": 6614 }, { "epoch": 0.4633691454989624, "grad_norm": 3.6651558876037598, "learning_rate": 5.370658493870404e-05, "loss": 0.9977, "num_input_tokens_seen": 106434488, "step": 6615 }, { "epoch": 0.46343919374469167, "grad_norm": 4.116913318634033, "learning_rate": 5.369958669001752e-05, "loss": 1.0409, "num_input_tokens_seen": 106450872, "step": 6616 }, { "epoch": 0.46350924199042093, "grad_norm": 4.44846773147583, "learning_rate": 5.3692588441330995e-05, "loss": 0.8553, "num_input_tokens_seen": 106467256, "step": 6617 }, { "epoch": 0.46357929023615013, "grad_norm": 5.590776443481445, "learning_rate": 5.3685590192644483e-05, "loss": 0.9049, "num_input_tokens_seen": 106483640, "step": 6618 }, { "epoch": 0.4636493384818794, "grad_norm": 5.505274772644043, "learning_rate": 5.3678591943957965e-05, "loss": 1.2003, "num_input_tokens_seen": 106500024, "step": 6619 }, { "epoch": 0.46371938672760865, "grad_norm": 3.8726046085357666, "learning_rate": 5.367159369527145e-05, "loss": 0.953, "num_input_tokens_seen": 106516408, "step": 6620 }, { "epoch": 0.4637894349733379, "grad_norm": 3.9251434803009033, "learning_rate": 5.366459544658494e-05, "loss": 1.1335, "num_input_tokens_seen": 106531504, "step": 6621 }, { "epoch": 0.4638594832190671, "grad_norm": 3.9294116497039795, "learning_rate": 5.365759719789844e-05, "loss": 0.9266, "num_input_tokens_seen": 106547888, "step": 6622 }, { "epoch": 0.4639295314647964, "grad_norm": 4.324211120605469, "learning_rate": 5.365059894921192e-05, "loss": 1.0962, "num_input_tokens_seen": 106564272, "step": 6623 }, { "epoch": 0.46399957971052563, "grad_norm": 3.5331010818481445, "learning_rate": 5.364360070052539e-05, "loss": 0.9043, "num_input_tokens_seen": 106580608, "step": 6624 }, { "epoch": 0.4640696279562549, "grad_norm": 3.642073392868042, "learning_rate": 5.363660245183889e-05, "loss": 1.0047, "num_input_tokens_seen": 106596760, "step": 6625 }, { "epoch": 0.4641396762019841, "grad_norm": 4.356872081756592, "learning_rate": 5.362960420315237e-05, "loss": 1.0147, "num_input_tokens_seen": 106613144, "step": 6626 }, { "epoch": 0.46420972444771336, "grad_norm": 3.66884446144104, "learning_rate": 5.362260595446584e-05, "loss": 0.9679, "num_input_tokens_seen": 106629528, "step": 6627 }, { "epoch": 0.4642797726934426, "grad_norm": 4.089823246002197, "learning_rate": 5.3615607705779335e-05, "loss": 0.9901, "num_input_tokens_seen": 106645864, "step": 6628 }, { "epoch": 0.4643498209391719, "grad_norm": 4.411832332611084, "learning_rate": 5.360860945709283e-05, "loss": 1.0351, "num_input_tokens_seen": 106662248, "step": 6629 }, { "epoch": 0.4644198691849011, "grad_norm": 3.7563977241516113, "learning_rate": 5.360161120840631e-05, "loss": 1.0096, "num_input_tokens_seen": 106678632, "step": 6630 }, { "epoch": 0.46448991743063034, "grad_norm": 3.6493430137634277, "learning_rate": 5.3594612959719794e-05, "loss": 0.966, "num_input_tokens_seen": 106694552, "step": 6631 }, { "epoch": 0.4645599656763596, "grad_norm": 3.6459546089172363, "learning_rate": 5.358761471103328e-05, "loss": 1.0288, "num_input_tokens_seen": 106710544, "step": 6632 }, { "epoch": 0.46463001392208886, "grad_norm": 4.07296085357666, "learning_rate": 5.3580616462346765e-05, "loss": 1.0409, "num_input_tokens_seen": 106726928, "step": 6633 }, { "epoch": 0.46470006216781806, "grad_norm": 3.623961925506592, "learning_rate": 5.357361821366024e-05, "loss": 0.8566, "num_input_tokens_seen": 106742048, "step": 6634 }, { "epoch": 0.4647701104135473, "grad_norm": 3.8658370971679688, "learning_rate": 5.356661996497374e-05, "loss": 1.1116, "num_input_tokens_seen": 106758432, "step": 6635 }, { "epoch": 0.4648401586592766, "grad_norm": 7.479616641998291, "learning_rate": 5.3559621716287224e-05, "loss": 1.0146, "num_input_tokens_seen": 106774816, "step": 6636 }, { "epoch": 0.46491020690500584, "grad_norm": 5.282004356384277, "learning_rate": 5.3552623467600706e-05, "loss": 1.0662, "num_input_tokens_seen": 106791200, "step": 6637 }, { "epoch": 0.46498025515073504, "grad_norm": 5.323639392852783, "learning_rate": 5.354562521891419e-05, "loss": 1.159, "num_input_tokens_seen": 106806400, "step": 6638 }, { "epoch": 0.4650503033964643, "grad_norm": 3.709852933883667, "learning_rate": 5.3538626970227676e-05, "loss": 1.0024, "num_input_tokens_seen": 106822224, "step": 6639 }, { "epoch": 0.46512035164219356, "grad_norm": 3.583138942718506, "learning_rate": 5.353162872154116e-05, "loss": 1.1467, "num_input_tokens_seen": 106838608, "step": 6640 }, { "epoch": 0.4651903998879228, "grad_norm": 4.027291297912598, "learning_rate": 5.352463047285464e-05, "loss": 1.1139, "num_input_tokens_seen": 106854992, "step": 6641 }, { "epoch": 0.465260448133652, "grad_norm": 3.9708850383758545, "learning_rate": 5.3517632224168135e-05, "loss": 1.0602, "num_input_tokens_seen": 106871376, "step": 6642 }, { "epoch": 0.4653304963793813, "grad_norm": 5.148803234100342, "learning_rate": 5.351063397548162e-05, "loss": 1.0852, "num_input_tokens_seen": 106887760, "step": 6643 }, { "epoch": 0.46540054462511055, "grad_norm": 4.076368808746338, "learning_rate": 5.3503635726795085e-05, "loss": 1.1273, "num_input_tokens_seen": 106904112, "step": 6644 }, { "epoch": 0.4654705928708398, "grad_norm": 4.920746803283691, "learning_rate": 5.349663747810858e-05, "loss": 1.091, "num_input_tokens_seen": 106919960, "step": 6645 }, { "epoch": 0.465540641116569, "grad_norm": 3.8127434253692627, "learning_rate": 5.3489639229422076e-05, "loss": 0.9896, "num_input_tokens_seen": 106935928, "step": 6646 }, { "epoch": 0.46561068936229827, "grad_norm": 3.9216270446777344, "learning_rate": 5.348264098073556e-05, "loss": 1.0585, "num_input_tokens_seen": 106952168, "step": 6647 }, { "epoch": 0.46568073760802753, "grad_norm": 3.5133566856384277, "learning_rate": 5.347564273204903e-05, "loss": 0.8579, "num_input_tokens_seen": 106968080, "step": 6648 }, { "epoch": 0.4657507858537568, "grad_norm": 3.634164333343506, "learning_rate": 5.346864448336253e-05, "loss": 1.0907, "num_input_tokens_seen": 106984464, "step": 6649 }, { "epoch": 0.46582083409948605, "grad_norm": 3.7191765308380127, "learning_rate": 5.346164623467601e-05, "loss": 1.0374, "num_input_tokens_seen": 107000848, "step": 6650 }, { "epoch": 0.46589088234521525, "grad_norm": 3.767498254776001, "learning_rate": 5.345464798598948e-05, "loss": 1.0061, "num_input_tokens_seen": 107017232, "step": 6651 }, { "epoch": 0.4659609305909445, "grad_norm": 3.8340818881988525, "learning_rate": 5.344764973730299e-05, "loss": 1.075, "num_input_tokens_seen": 107033616, "step": 6652 }, { "epoch": 0.46603097883667377, "grad_norm": 9.20552921295166, "learning_rate": 5.344065148861647e-05, "loss": 1.1089, "num_input_tokens_seen": 107049120, "step": 6653 }, { "epoch": 0.46610102708240303, "grad_norm": 4.367069721221924, "learning_rate": 5.343365323992995e-05, "loss": 1.2988, "num_input_tokens_seen": 107063432, "step": 6654 }, { "epoch": 0.46617107532813223, "grad_norm": 3.6735596656799316, "learning_rate": 5.342665499124343e-05, "loss": 0.9416, "num_input_tokens_seen": 107079712, "step": 6655 }, { "epoch": 0.4662411235738615, "grad_norm": 4.066924095153809, "learning_rate": 5.341965674255692e-05, "loss": 1.1776, "num_input_tokens_seen": 107096096, "step": 6656 }, { "epoch": 0.46631117181959075, "grad_norm": 3.7454941272735596, "learning_rate": 5.34126584938704e-05, "loss": 1.0325, "num_input_tokens_seen": 107112480, "step": 6657 }, { "epoch": 0.46638122006532, "grad_norm": 3.738274574279785, "learning_rate": 5.3405660245183885e-05, "loss": 1.0846, "num_input_tokens_seen": 107128864, "step": 6658 }, { "epoch": 0.4664512683110492, "grad_norm": 8.665736198425293, "learning_rate": 5.339866199649738e-05, "loss": 1.1532, "num_input_tokens_seen": 107144536, "step": 6659 }, { "epoch": 0.4665213165567785, "grad_norm": 3.8733510971069336, "learning_rate": 5.339166374781086e-05, "loss": 0.7974, "num_input_tokens_seen": 107160664, "step": 6660 }, { "epoch": 0.46659136480250774, "grad_norm": 4.000319957733154, "learning_rate": 5.3384665499124344e-05, "loss": 1.0197, "num_input_tokens_seen": 107177048, "step": 6661 }, { "epoch": 0.466661413048237, "grad_norm": 3.8049557209014893, "learning_rate": 5.337766725043783e-05, "loss": 1.1256, "num_input_tokens_seen": 107192496, "step": 6662 }, { "epoch": 0.4667314612939662, "grad_norm": 4.009215354919434, "learning_rate": 5.3370669001751314e-05, "loss": 1.1667, "num_input_tokens_seen": 107207912, "step": 6663 }, { "epoch": 0.46680150953969546, "grad_norm": 6.3007378578186035, "learning_rate": 5.3363670753064796e-05, "loss": 1.3413, "num_input_tokens_seen": 107223504, "step": 6664 }, { "epoch": 0.4668715577854247, "grad_norm": 3.5798394680023193, "learning_rate": 5.335667250437828e-05, "loss": 1.0625, "num_input_tokens_seen": 107239824, "step": 6665 }, { "epoch": 0.466941606031154, "grad_norm": 4.701604843139648, "learning_rate": 5.334967425569179e-05, "loss": 1.1597, "num_input_tokens_seen": 107256208, "step": 6666 }, { "epoch": 0.4670116542768832, "grad_norm": 3.462380886077881, "learning_rate": 5.3342676007005255e-05, "loss": 0.976, "num_input_tokens_seen": 107272592, "step": 6667 }, { "epoch": 0.46708170252261244, "grad_norm": 5.546586513519287, "learning_rate": 5.333567775831875e-05, "loss": 1.3386, "num_input_tokens_seen": 107287520, "step": 6668 }, { "epoch": 0.4671517507683417, "grad_norm": 4.677948474884033, "learning_rate": 5.332867950963223e-05, "loss": 1.1263, "num_input_tokens_seen": 107303608, "step": 6669 }, { "epoch": 0.46722179901407096, "grad_norm": 3.95694899559021, "learning_rate": 5.3321681260945714e-05, "loss": 1.1536, "num_input_tokens_seen": 107319992, "step": 6670 }, { "epoch": 0.46729184725980016, "grad_norm": 4.037060737609863, "learning_rate": 5.331468301225919e-05, "loss": 1.0444, "num_input_tokens_seen": 107336376, "step": 6671 }, { "epoch": 0.4673618955055294, "grad_norm": 3.5486528873443604, "learning_rate": 5.3307684763572684e-05, "loss": 0.9887, "num_input_tokens_seen": 107352760, "step": 6672 }, { "epoch": 0.4674319437512587, "grad_norm": 3.868568181991577, "learning_rate": 5.330068651488618e-05, "loss": 0.8593, "num_input_tokens_seen": 107369144, "step": 6673 }, { "epoch": 0.46750199199698794, "grad_norm": 7.702548980712891, "learning_rate": 5.329368826619965e-05, "loss": 1.188, "num_input_tokens_seen": 107385528, "step": 6674 }, { "epoch": 0.46757204024271715, "grad_norm": 4.390200614929199, "learning_rate": 5.328669001751314e-05, "loss": 1.1342, "num_input_tokens_seen": 107401304, "step": 6675 }, { "epoch": 0.4676420884884464, "grad_norm": 3.7440412044525146, "learning_rate": 5.327969176882663e-05, "loss": 0.8969, "num_input_tokens_seen": 107417688, "step": 6676 }, { "epoch": 0.46771213673417567, "grad_norm": 4.894672870635986, "learning_rate": 5.327269352014011e-05, "loss": 1.0542, "num_input_tokens_seen": 107433816, "step": 6677 }, { "epoch": 0.4677821849799049, "grad_norm": 4.762908458709717, "learning_rate": 5.326569527145359e-05, "loss": 1.1926, "num_input_tokens_seen": 107450200, "step": 6678 }, { "epoch": 0.46785223322563413, "grad_norm": 4.3587870597839355, "learning_rate": 5.325869702276708e-05, "loss": 1.2127, "num_input_tokens_seen": 107466584, "step": 6679 }, { "epoch": 0.4679222814713634, "grad_norm": 4.166892051696777, "learning_rate": 5.325169877408056e-05, "loss": 1.0259, "num_input_tokens_seen": 107482968, "step": 6680 }, { "epoch": 0.46799232971709265, "grad_norm": 4.266642093658447, "learning_rate": 5.324470052539404e-05, "loss": 0.9715, "num_input_tokens_seen": 107498192, "step": 6681 }, { "epoch": 0.4680623779628219, "grad_norm": 3.3419625759124756, "learning_rate": 5.3237702276707536e-05, "loss": 1.0262, "num_input_tokens_seen": 107514576, "step": 6682 }, { "epoch": 0.46813242620855117, "grad_norm": 3.903163433074951, "learning_rate": 5.3230704028021025e-05, "loss": 1.0785, "num_input_tokens_seen": 107530536, "step": 6683 }, { "epoch": 0.46820247445428037, "grad_norm": 5.467947959899902, "learning_rate": 5.32237057793345e-05, "loss": 1.246, "num_input_tokens_seen": 107546248, "step": 6684 }, { "epoch": 0.46827252270000963, "grad_norm": 3.9213547706604004, "learning_rate": 5.321670753064799e-05, "loss": 1.1432, "num_input_tokens_seen": 107561992, "step": 6685 }, { "epoch": 0.4683425709457389, "grad_norm": 5.265954971313477, "learning_rate": 5.320970928196147e-05, "loss": 0.9934, "num_input_tokens_seen": 107578376, "step": 6686 }, { "epoch": 0.46841261919146815, "grad_norm": 3.9765655994415283, "learning_rate": 5.320271103327495e-05, "loss": 1.0219, "num_input_tokens_seen": 107594680, "step": 6687 }, { "epoch": 0.46848266743719735, "grad_norm": 4.261830806732178, "learning_rate": 5.3195712784588434e-05, "loss": 1.0328, "num_input_tokens_seen": 107611064, "step": 6688 }, { "epoch": 0.4685527156829266, "grad_norm": 7.026014804840088, "learning_rate": 5.318871453590194e-05, "loss": 0.9985, "num_input_tokens_seen": 107626112, "step": 6689 }, { "epoch": 0.46862276392865587, "grad_norm": 4.726694107055664, "learning_rate": 5.3181716287215425e-05, "loss": 1.0744, "num_input_tokens_seen": 107642496, "step": 6690 }, { "epoch": 0.46869281217438513, "grad_norm": 3.6380646228790283, "learning_rate": 5.317471803852889e-05, "loss": 1.1708, "num_input_tokens_seen": 107658880, "step": 6691 }, { "epoch": 0.46876286042011434, "grad_norm": 3.5807487964630127, "learning_rate": 5.316771978984239e-05, "loss": 1.1403, "num_input_tokens_seen": 107675256, "step": 6692 }, { "epoch": 0.4688329086658436, "grad_norm": 3.9915847778320312, "learning_rate": 5.316072154115588e-05, "loss": 1.0826, "num_input_tokens_seen": 107691016, "step": 6693 }, { "epoch": 0.46890295691157285, "grad_norm": 4.012253284454346, "learning_rate": 5.3153723292469345e-05, "loss": 1.0194, "num_input_tokens_seen": 107707064, "step": 6694 }, { "epoch": 0.4689730051573021, "grad_norm": 3.9562582969665527, "learning_rate": 5.314672504378284e-05, "loss": 1.0017, "num_input_tokens_seen": 107723152, "step": 6695 }, { "epoch": 0.4690430534030313, "grad_norm": 4.575549125671387, "learning_rate": 5.3139726795096336e-05, "loss": 1.0722, "num_input_tokens_seen": 107739536, "step": 6696 }, { "epoch": 0.4691131016487606, "grad_norm": 3.8225462436676025, "learning_rate": 5.313272854640982e-05, "loss": 1.0149, "num_input_tokens_seen": 107755920, "step": 6697 }, { "epoch": 0.46918314989448984, "grad_norm": 3.951275587081909, "learning_rate": 5.31257302977233e-05, "loss": 0.9675, "num_input_tokens_seen": 107772296, "step": 6698 }, { "epoch": 0.4692531981402191, "grad_norm": 3.5939061641693115, "learning_rate": 5.311873204903678e-05, "loss": 0.9912, "num_input_tokens_seen": 107788480, "step": 6699 }, { "epoch": 0.4693232463859483, "grad_norm": 7.109866619110107, "learning_rate": 5.311173380035027e-05, "loss": 0.8939, "num_input_tokens_seen": 107804768, "step": 6700 }, { "epoch": 0.46939329463167756, "grad_norm": 3.6135330200195312, "learning_rate": 5.3104735551663745e-05, "loss": 1.1388, "num_input_tokens_seen": 107820632, "step": 6701 }, { "epoch": 0.4694633428774068, "grad_norm": 4.7758331298828125, "learning_rate": 5.3097737302977234e-05, "loss": 0.9504, "num_input_tokens_seen": 107837016, "step": 6702 }, { "epoch": 0.4695333911231361, "grad_norm": 3.7631545066833496, "learning_rate": 5.309073905429073e-05, "loss": 1.021, "num_input_tokens_seen": 107853400, "step": 6703 }, { "epoch": 0.4696034393688653, "grad_norm": 5.737015247344971, "learning_rate": 5.30837408056042e-05, "loss": 1.053, "num_input_tokens_seen": 107869784, "step": 6704 }, { "epoch": 0.46967348761459454, "grad_norm": 3.845569610595703, "learning_rate": 5.307674255691769e-05, "loss": 1.0225, "num_input_tokens_seen": 107885760, "step": 6705 }, { "epoch": 0.4697435358603238, "grad_norm": 7.402350902557373, "learning_rate": 5.306974430823118e-05, "loss": 1.0404, "num_input_tokens_seen": 107902144, "step": 6706 }, { "epoch": 0.46981358410605306, "grad_norm": 4.036012649536133, "learning_rate": 5.306274605954466e-05, "loss": 1.0646, "num_input_tokens_seen": 107918528, "step": 6707 }, { "epoch": 0.46988363235178227, "grad_norm": 5.720461845397949, "learning_rate": 5.3055747810858145e-05, "loss": 0.9158, "num_input_tokens_seen": 107934912, "step": 6708 }, { "epoch": 0.4699536805975115, "grad_norm": 4.842574119567871, "learning_rate": 5.304874956217163e-05, "loss": 1.0039, "num_input_tokens_seen": 107950800, "step": 6709 }, { "epoch": 0.4700237288432408, "grad_norm": 3.787020444869995, "learning_rate": 5.304175131348512e-05, "loss": 0.8436, "num_input_tokens_seen": 107967184, "step": 6710 }, { "epoch": 0.47009377708897004, "grad_norm": 4.2691192626953125, "learning_rate": 5.303475306479859e-05, "loss": 1.0335, "num_input_tokens_seen": 107983568, "step": 6711 }, { "epoch": 0.47016382533469925, "grad_norm": 5.233339786529541, "learning_rate": 5.3027754816112086e-05, "loss": 1.0575, "num_input_tokens_seen": 107999952, "step": 6712 }, { "epoch": 0.4702338735804285, "grad_norm": 3.421193838119507, "learning_rate": 5.302075656742558e-05, "loss": 1.022, "num_input_tokens_seen": 108016336, "step": 6713 }, { "epoch": 0.47030392182615777, "grad_norm": 4.561410427093506, "learning_rate": 5.301375831873906e-05, "loss": 1.173, "num_input_tokens_seen": 108032720, "step": 6714 }, { "epoch": 0.470373970071887, "grad_norm": 4.749919891357422, "learning_rate": 5.3006760070052545e-05, "loss": 1.05, "num_input_tokens_seen": 108049104, "step": 6715 }, { "epoch": 0.47044401831761623, "grad_norm": 4.774212837219238, "learning_rate": 5.299976182136603e-05, "loss": 1.0433, "num_input_tokens_seen": 108065488, "step": 6716 }, { "epoch": 0.4705140665633455, "grad_norm": 3.6954824924468994, "learning_rate": 5.2992763572679515e-05, "loss": 0.9831, "num_input_tokens_seen": 108081224, "step": 6717 }, { "epoch": 0.47058411480907475, "grad_norm": 5.202620983123779, "learning_rate": 5.2985765323993e-05, "loss": 1.0583, "num_input_tokens_seen": 108097608, "step": 6718 }, { "epoch": 0.470654163054804, "grad_norm": 3.7043261528015137, "learning_rate": 5.297876707530649e-05, "loss": 0.9753, "num_input_tokens_seen": 108113992, "step": 6719 }, { "epoch": 0.47072421130053327, "grad_norm": 4.06228494644165, "learning_rate": 5.2971768826619974e-05, "loss": 1.0117, "num_input_tokens_seen": 108130376, "step": 6720 }, { "epoch": 0.4707942595462625, "grad_norm": 3.4427239894866943, "learning_rate": 5.2964770577933456e-05, "loss": 0.9207, "num_input_tokens_seen": 108146760, "step": 6721 }, { "epoch": 0.47086430779199173, "grad_norm": 6.617749214172363, "learning_rate": 5.295777232924694e-05, "loss": 1.1472, "num_input_tokens_seen": 108163144, "step": 6722 }, { "epoch": 0.470934356037721, "grad_norm": 3.744797706604004, "learning_rate": 5.2950774080560426e-05, "loss": 1.0143, "num_input_tokens_seen": 108179528, "step": 6723 }, { "epoch": 0.47100440428345025, "grad_norm": 5.034976005554199, "learning_rate": 5.294377583187391e-05, "loss": 1.0061, "num_input_tokens_seen": 108195248, "step": 6724 }, { "epoch": 0.47107445252917945, "grad_norm": 3.9690632820129395, "learning_rate": 5.293677758318739e-05, "loss": 1.2634, "num_input_tokens_seen": 108210920, "step": 6725 }, { "epoch": 0.4711445007749087, "grad_norm": 3.351450204849243, "learning_rate": 5.292977933450087e-05, "loss": 0.876, "num_input_tokens_seen": 108227304, "step": 6726 }, { "epoch": 0.471214549020638, "grad_norm": 3.7437920570373535, "learning_rate": 5.292278108581437e-05, "loss": 0.9549, "num_input_tokens_seen": 108243688, "step": 6727 }, { "epoch": 0.47128459726636723, "grad_norm": 6.022392272949219, "learning_rate": 5.291578283712785e-05, "loss": 1.0568, "num_input_tokens_seen": 108260072, "step": 6728 }, { "epoch": 0.47135464551209644, "grad_norm": 4.407289505004883, "learning_rate": 5.290878458844134e-05, "loss": 1.0511, "num_input_tokens_seen": 108276456, "step": 6729 }, { "epoch": 0.4714246937578257, "grad_norm": 3.9509878158569336, "learning_rate": 5.290178633975482e-05, "loss": 1.1648, "num_input_tokens_seen": 108291632, "step": 6730 }, { "epoch": 0.47149474200355496, "grad_norm": 4.2412285804748535, "learning_rate": 5.28947880910683e-05, "loss": 1.1903, "num_input_tokens_seen": 108308016, "step": 6731 }, { "epoch": 0.4715647902492842, "grad_norm": 4.234686374664307, "learning_rate": 5.288778984238178e-05, "loss": 1.1111, "num_input_tokens_seen": 108323984, "step": 6732 }, { "epoch": 0.4716348384950134, "grad_norm": 4.565019130706787, "learning_rate": 5.288079159369529e-05, "loss": 1.096, "num_input_tokens_seen": 108340368, "step": 6733 }, { "epoch": 0.4717048867407427, "grad_norm": 4.805628299713135, "learning_rate": 5.287379334500876e-05, "loss": 1.1239, "num_input_tokens_seen": 108356752, "step": 6734 }, { "epoch": 0.47177493498647194, "grad_norm": 3.9647700786590576, "learning_rate": 5.286679509632224e-05, "loss": 1.1555, "num_input_tokens_seen": 108372216, "step": 6735 }, { "epoch": 0.4718449832322012, "grad_norm": 3.811239004135132, "learning_rate": 5.285979684763574e-05, "loss": 0.9169, "num_input_tokens_seen": 108387696, "step": 6736 }, { "epoch": 0.4719150314779304, "grad_norm": 4.559319496154785, "learning_rate": 5.285279859894922e-05, "loss": 0.918, "num_input_tokens_seen": 108403944, "step": 6737 }, { "epoch": 0.47198507972365966, "grad_norm": 4.727875232696533, "learning_rate": 5.2845800350262694e-05, "loss": 1.1424, "num_input_tokens_seen": 108420328, "step": 6738 }, { "epoch": 0.4720551279693889, "grad_norm": 3.8609120845794678, "learning_rate": 5.283880210157619e-05, "loss": 1.0053, "num_input_tokens_seen": 108436712, "step": 6739 }, { "epoch": 0.4721251762151182, "grad_norm": 3.804370164871216, "learning_rate": 5.2831803852889685e-05, "loss": 1.0733, "num_input_tokens_seen": 108453040, "step": 6740 }, { "epoch": 0.4721952244608474, "grad_norm": 3.939620018005371, "learning_rate": 5.282480560420315e-05, "loss": 1.0229, "num_input_tokens_seen": 108468880, "step": 6741 }, { "epoch": 0.47226527270657664, "grad_norm": 4.376893043518066, "learning_rate": 5.2817807355516635e-05, "loss": 1.131, "num_input_tokens_seen": 108485248, "step": 6742 }, { "epoch": 0.4723353209523059, "grad_norm": 5.025060653686523, "learning_rate": 5.281080910683014e-05, "loss": 1.1299, "num_input_tokens_seen": 108501040, "step": 6743 }, { "epoch": 0.47240536919803516, "grad_norm": 3.524656057357788, "learning_rate": 5.280381085814361e-05, "loss": 0.9653, "num_input_tokens_seen": 108516624, "step": 6744 }, { "epoch": 0.47247541744376437, "grad_norm": 3.8542211055755615, "learning_rate": 5.2796812609457094e-05, "loss": 1.0025, "num_input_tokens_seen": 108533008, "step": 6745 }, { "epoch": 0.4725454656894936, "grad_norm": 3.8751041889190674, "learning_rate": 5.278981436077058e-05, "loss": 1.1803, "num_input_tokens_seen": 108549112, "step": 6746 }, { "epoch": 0.4726155139352229, "grad_norm": 4.343238353729248, "learning_rate": 5.2782816112084064e-05, "loss": 0.9759, "num_input_tokens_seen": 108564328, "step": 6747 }, { "epoch": 0.47268556218095215, "grad_norm": 3.695493698120117, "learning_rate": 5.2775817863397546e-05, "loss": 0.8834, "num_input_tokens_seen": 108580112, "step": 6748 }, { "epoch": 0.47275561042668135, "grad_norm": 3.8947877883911133, "learning_rate": 5.276881961471104e-05, "loss": 1.0522, "num_input_tokens_seen": 108596136, "step": 6749 }, { "epoch": 0.4728256586724106, "grad_norm": 4.2317633628845215, "learning_rate": 5.276182136602453e-05, "loss": 0.9472, "num_input_tokens_seen": 108612520, "step": 6750 }, { "epoch": 0.47289570691813987, "grad_norm": 3.608283281326294, "learning_rate": 5.2754823117338005e-05, "loss": 1.0748, "num_input_tokens_seen": 108628904, "step": 6751 }, { "epoch": 0.47296575516386913, "grad_norm": 4.512143611907959, "learning_rate": 5.274782486865149e-05, "loss": 1.0156, "num_input_tokens_seen": 108644248, "step": 6752 }, { "epoch": 0.4730358034095984, "grad_norm": 3.81160044670105, "learning_rate": 5.2740826619964976e-05, "loss": 1.0496, "num_input_tokens_seen": 108660488, "step": 6753 }, { "epoch": 0.4731058516553276, "grad_norm": 3.760336399078369, "learning_rate": 5.273382837127846e-05, "loss": 1.0335, "num_input_tokens_seen": 108676872, "step": 6754 }, { "epoch": 0.47317589990105685, "grad_norm": 3.969651222229004, "learning_rate": 5.272683012259194e-05, "loss": 1.2213, "num_input_tokens_seen": 108693256, "step": 6755 }, { "epoch": 0.4732459481467861, "grad_norm": 4.55695915222168, "learning_rate": 5.2719831873905435e-05, "loss": 0.9125, "num_input_tokens_seen": 108709576, "step": 6756 }, { "epoch": 0.47331599639251537, "grad_norm": 4.36952018737793, "learning_rate": 5.271283362521893e-05, "loss": 1.0403, "num_input_tokens_seen": 108725520, "step": 6757 }, { "epoch": 0.4733860446382446, "grad_norm": 4.689207553863525, "learning_rate": 5.27058353765324e-05, "loss": 1.0875, "num_input_tokens_seen": 108741744, "step": 6758 }, { "epoch": 0.47345609288397383, "grad_norm": 3.5912058353424072, "learning_rate": 5.269883712784588e-05, "loss": 1.1125, "num_input_tokens_seen": 108757952, "step": 6759 }, { "epoch": 0.4735261411297031, "grad_norm": 4.725868225097656, "learning_rate": 5.269183887915938e-05, "loss": 0.9312, "num_input_tokens_seen": 108774088, "step": 6760 }, { "epoch": 0.47359618937543235, "grad_norm": 4.213376045227051, "learning_rate": 5.268484063047285e-05, "loss": 1.0752, "num_input_tokens_seen": 108790472, "step": 6761 }, { "epoch": 0.47366623762116156, "grad_norm": 4.116434574127197, "learning_rate": 5.267784238178633e-05, "loss": 1.0481, "num_input_tokens_seen": 108806776, "step": 6762 }, { "epoch": 0.4737362858668908, "grad_norm": 3.8367996215820312, "learning_rate": 5.267084413309983e-05, "loss": 1.0882, "num_input_tokens_seen": 108822416, "step": 6763 }, { "epoch": 0.4738063341126201, "grad_norm": 3.609545946121216, "learning_rate": 5.266384588441332e-05, "loss": 0.929, "num_input_tokens_seen": 108838208, "step": 6764 }, { "epoch": 0.47387638235834934, "grad_norm": 4.108180522918701, "learning_rate": 5.2656847635726805e-05, "loss": 0.9622, "num_input_tokens_seen": 108854592, "step": 6765 }, { "epoch": 0.47394643060407854, "grad_norm": 4.884720325469971, "learning_rate": 5.2649849387040287e-05, "loss": 1.1246, "num_input_tokens_seen": 108870976, "step": 6766 }, { "epoch": 0.4740164788498078, "grad_norm": 4.856875896453857, "learning_rate": 5.2642851138353775e-05, "loss": 1.2403, "num_input_tokens_seen": 108885688, "step": 6767 }, { "epoch": 0.47408652709553706, "grad_norm": 3.5622432231903076, "learning_rate": 5.263585288966725e-05, "loss": 0.9572, "num_input_tokens_seen": 108902072, "step": 6768 }, { "epoch": 0.4741565753412663, "grad_norm": 5.305510997772217, "learning_rate": 5.262885464098073e-05, "loss": 1.229, "num_input_tokens_seen": 108917848, "step": 6769 }, { "epoch": 0.4742266235869955, "grad_norm": 3.729074478149414, "learning_rate": 5.2621856392294234e-05, "loss": 0.9361, "num_input_tokens_seen": 108934232, "step": 6770 }, { "epoch": 0.4742966718327248, "grad_norm": 4.5915937423706055, "learning_rate": 5.26148581436077e-05, "loss": 1.0442, "num_input_tokens_seen": 108949696, "step": 6771 }, { "epoch": 0.47436672007845404, "grad_norm": 3.977216958999634, "learning_rate": 5.26078598949212e-05, "loss": 1.2395, "num_input_tokens_seen": 108965848, "step": 6772 }, { "epoch": 0.4744367683241833, "grad_norm": 4.012653827667236, "learning_rate": 5.260086164623468e-05, "loss": 0.9013, "num_input_tokens_seen": 108982232, "step": 6773 }, { "epoch": 0.4745068165699125, "grad_norm": 4.10910701751709, "learning_rate": 5.259386339754817e-05, "loss": 0.9896, "num_input_tokens_seen": 108997800, "step": 6774 }, { "epoch": 0.47457686481564176, "grad_norm": 5.1765336990356445, "learning_rate": 5.258686514886165e-05, "loss": 1.1068, "num_input_tokens_seen": 109013664, "step": 6775 }, { "epoch": 0.474646913061371, "grad_norm": 5.6664228439331055, "learning_rate": 5.257986690017513e-05, "loss": 1.345, "num_input_tokens_seen": 109029208, "step": 6776 }, { "epoch": 0.4747169613071003, "grad_norm": 6.2354817390441895, "learning_rate": 5.257286865148863e-05, "loss": 1.1819, "num_input_tokens_seen": 109044528, "step": 6777 }, { "epoch": 0.4747870095528295, "grad_norm": 3.8308510780334473, "learning_rate": 5.2565870402802095e-05, "loss": 0.9639, "num_input_tokens_seen": 109060912, "step": 6778 }, { "epoch": 0.47485705779855875, "grad_norm": 4.019093990325928, "learning_rate": 5.255887215411558e-05, "loss": 0.9385, "num_input_tokens_seen": 109077296, "step": 6779 }, { "epoch": 0.474927106044288, "grad_norm": 6.938348293304443, "learning_rate": 5.255187390542907e-05, "loss": 0.9974, "num_input_tokens_seen": 109093680, "step": 6780 }, { "epoch": 0.47499715429001726, "grad_norm": 4.200627326965332, "learning_rate": 5.254487565674257e-05, "loss": 1.0353, "num_input_tokens_seen": 109110008, "step": 6781 }, { "epoch": 0.47506720253574647, "grad_norm": 4.06279993057251, "learning_rate": 5.253787740805605e-05, "loss": 0.9581, "num_input_tokens_seen": 109126392, "step": 6782 }, { "epoch": 0.47513725078147573, "grad_norm": 4.124377250671387, "learning_rate": 5.2530879159369525e-05, "loss": 1.2065, "num_input_tokens_seen": 109142680, "step": 6783 }, { "epoch": 0.475207299027205, "grad_norm": 4.182784557342529, "learning_rate": 5.252388091068302e-05, "loss": 1.073, "num_input_tokens_seen": 109158768, "step": 6784 }, { "epoch": 0.47527734727293425, "grad_norm": 4.513407230377197, "learning_rate": 5.25168826619965e-05, "loss": 1.0392, "num_input_tokens_seen": 109175152, "step": 6785 }, { "epoch": 0.4753473955186635, "grad_norm": 3.251490354537964, "learning_rate": 5.250988441330997e-05, "loss": 0.8611, "num_input_tokens_seen": 109191056, "step": 6786 }, { "epoch": 0.4754174437643927, "grad_norm": 8.621055603027344, "learning_rate": 5.250288616462348e-05, "loss": 1.0699, "num_input_tokens_seen": 109206888, "step": 6787 }, { "epoch": 0.47548749201012197, "grad_norm": 4.264245986938477, "learning_rate": 5.249588791593696e-05, "loss": 0.9409, "num_input_tokens_seen": 109223272, "step": 6788 }, { "epoch": 0.47555754025585123, "grad_norm": 3.6648037433624268, "learning_rate": 5.248888966725044e-05, "loss": 1.0575, "num_input_tokens_seen": 109239544, "step": 6789 }, { "epoch": 0.4756275885015805, "grad_norm": 4.528952598571777, "learning_rate": 5.2481891418563925e-05, "loss": 1.1251, "num_input_tokens_seen": 109255816, "step": 6790 }, { "epoch": 0.4756976367473097, "grad_norm": 4.45644998550415, "learning_rate": 5.247489316987741e-05, "loss": 1.095, "num_input_tokens_seen": 109272200, "step": 6791 }, { "epoch": 0.47576768499303895, "grad_norm": 3.8969879150390625, "learning_rate": 5.2467894921190895e-05, "loss": 0.9836, "num_input_tokens_seen": 109288520, "step": 6792 }, { "epoch": 0.4758377332387682, "grad_norm": 3.627748727798462, "learning_rate": 5.246089667250438e-05, "loss": 1.1656, "num_input_tokens_seen": 109304624, "step": 6793 }, { "epoch": 0.47590778148449747, "grad_norm": 4.493330478668213, "learning_rate": 5.245389842381787e-05, "loss": 1.2352, "num_input_tokens_seen": 109319976, "step": 6794 }, { "epoch": 0.4759778297302267, "grad_norm": 3.5947048664093018, "learning_rate": 5.2446900175131354e-05, "loss": 1.035, "num_input_tokens_seen": 109336360, "step": 6795 }, { "epoch": 0.47604787797595594, "grad_norm": 4.194823741912842, "learning_rate": 5.2439901926444836e-05, "loss": 0.9851, "num_input_tokens_seen": 109351624, "step": 6796 }, { "epoch": 0.4761179262216852, "grad_norm": 3.9734160900115967, "learning_rate": 5.2432903677758324e-05, "loss": 1.2019, "num_input_tokens_seen": 109367072, "step": 6797 }, { "epoch": 0.47618797446741445, "grad_norm": 4.142136096954346, "learning_rate": 5.2425905429071806e-05, "loss": 1.1178, "num_input_tokens_seen": 109383408, "step": 6798 }, { "epoch": 0.47625802271314366, "grad_norm": 4.315369129180908, "learning_rate": 5.241890718038529e-05, "loss": 1.2254, "num_input_tokens_seen": 109398616, "step": 6799 }, { "epoch": 0.4763280709588729, "grad_norm": 4.77875280380249, "learning_rate": 5.241190893169877e-05, "loss": 1.1018, "num_input_tokens_seen": 109414592, "step": 6800 }, { "epoch": 0.4763280709588729, "eval_loss": 1.1252864599227905, "eval_runtime": 0.1585, "eval_samples_per_second": 6.311, "eval_steps_per_second": 6.311, "num_input_tokens_seen": 109414592, "step": 6800 }, { "epoch": 0.4763981192046022, "grad_norm": 4.054019927978516, "learning_rate": 5.2404910683012265e-05, "loss": 1.1978, "num_input_tokens_seen": 109430896, "step": 6801 }, { "epoch": 0.47646816745033144, "grad_norm": 4.0688276290893555, "learning_rate": 5.239791243432575e-05, "loss": 1.015, "num_input_tokens_seen": 109447008, "step": 6802 }, { "epoch": 0.47653821569606064, "grad_norm": 4.081553936004639, "learning_rate": 5.239091418563924e-05, "loss": 1.2566, "num_input_tokens_seen": 109463392, "step": 6803 }, { "epoch": 0.4766082639417899, "grad_norm": 4.719587326049805, "learning_rate": 5.2383915936952724e-05, "loss": 1.0577, "num_input_tokens_seen": 109478768, "step": 6804 }, { "epoch": 0.47667831218751916, "grad_norm": 3.7197132110595703, "learning_rate": 5.2376917688266206e-05, "loss": 1.0442, "num_input_tokens_seen": 109494808, "step": 6805 }, { "epoch": 0.4767483604332484, "grad_norm": 5.000951290130615, "learning_rate": 5.236991943957968e-05, "loss": 1.0497, "num_input_tokens_seen": 109511192, "step": 6806 }, { "epoch": 0.4768184086789776, "grad_norm": 3.9910333156585693, "learning_rate": 5.236292119089316e-05, "loss": 1.2905, "num_input_tokens_seen": 109527576, "step": 6807 }, { "epoch": 0.4768884569247069, "grad_norm": 4.522314548492432, "learning_rate": 5.235592294220666e-05, "loss": 0.9959, "num_input_tokens_seen": 109543960, "step": 6808 }, { "epoch": 0.47695850517043614, "grad_norm": 3.7235898971557617, "learning_rate": 5.234892469352014e-05, "loss": 0.9931, "num_input_tokens_seen": 109560344, "step": 6809 }, { "epoch": 0.4770285534161654, "grad_norm": 3.643763303756714, "learning_rate": 5.2341926444833635e-05, "loss": 0.9588, "num_input_tokens_seen": 109576728, "step": 6810 }, { "epoch": 0.4770986016618946, "grad_norm": 5.52113151550293, "learning_rate": 5.233492819614712e-05, "loss": 1.2022, "num_input_tokens_seen": 109592584, "step": 6811 }, { "epoch": 0.47716864990762387, "grad_norm": 4.9974188804626465, "learning_rate": 5.23279299474606e-05, "loss": 1.1755, "num_input_tokens_seen": 109608960, "step": 6812 }, { "epoch": 0.4772386981533531, "grad_norm": 5.266491889953613, "learning_rate": 5.232093169877408e-05, "loss": 1.1099, "num_input_tokens_seen": 109625104, "step": 6813 }, { "epoch": 0.4773087463990824, "grad_norm": 3.9919018745422363, "learning_rate": 5.231393345008757e-05, "loss": 1.0423, "num_input_tokens_seen": 109641488, "step": 6814 }, { "epoch": 0.4773787946448116, "grad_norm": 5.361277103424072, "learning_rate": 5.230693520140105e-05, "loss": 1.215, "num_input_tokens_seen": 109657872, "step": 6815 }, { "epoch": 0.47744884289054085, "grad_norm": 4.024937629699707, "learning_rate": 5.229993695271453e-05, "loss": 1.2601, "num_input_tokens_seen": 109674256, "step": 6816 }, { "epoch": 0.4775188911362701, "grad_norm": 3.7742490768432617, "learning_rate": 5.229293870402803e-05, "loss": 1.0789, "num_input_tokens_seen": 109690576, "step": 6817 }, { "epoch": 0.47758893938199937, "grad_norm": 3.622018814086914, "learning_rate": 5.228594045534151e-05, "loss": 0.8893, "num_input_tokens_seen": 109706592, "step": 6818 }, { "epoch": 0.47765898762772857, "grad_norm": 4.550981044769287, "learning_rate": 5.227894220665499e-05, "loss": 1.31, "num_input_tokens_seen": 109722976, "step": 6819 }, { "epoch": 0.47772903587345783, "grad_norm": 3.8553786277770996, "learning_rate": 5.227194395796848e-05, "loss": 0.9512, "num_input_tokens_seen": 109738920, "step": 6820 }, { "epoch": 0.4777990841191871, "grad_norm": 3.7159841060638428, "learning_rate": 5.226494570928196e-05, "loss": 0.9445, "num_input_tokens_seen": 109755304, "step": 6821 }, { "epoch": 0.47786913236491635, "grad_norm": 5.884495258331299, "learning_rate": 5.2257947460595444e-05, "loss": 0.9789, "num_input_tokens_seen": 109771576, "step": 6822 }, { "epoch": 0.4779391806106456, "grad_norm": 3.7047083377838135, "learning_rate": 5.2250949211908926e-05, "loss": 1.0297, "num_input_tokens_seen": 109787872, "step": 6823 }, { "epoch": 0.4780092288563748, "grad_norm": 3.485847234725952, "learning_rate": 5.224395096322241e-05, "loss": 0.9269, "num_input_tokens_seen": 109803384, "step": 6824 }, { "epoch": 0.47807927710210407, "grad_norm": 3.9222450256347656, "learning_rate": 5.223695271453592e-05, "loss": 1.0749, "num_input_tokens_seen": 109818704, "step": 6825 }, { "epoch": 0.47814932534783333, "grad_norm": 4.232855796813965, "learning_rate": 5.2229954465849385e-05, "loss": 1.1773, "num_input_tokens_seen": 109835088, "step": 6826 }, { "epoch": 0.4782193735935626, "grad_norm": 3.5413403511047363, "learning_rate": 5.222295621716288e-05, "loss": 0.9407, "num_input_tokens_seen": 109851472, "step": 6827 }, { "epoch": 0.4782894218392918, "grad_norm": 4.55118989944458, "learning_rate": 5.2215957968476356e-05, "loss": 1.1199, "num_input_tokens_seen": 109867856, "step": 6828 }, { "epoch": 0.47835947008502105, "grad_norm": 3.691756010055542, "learning_rate": 5.220895971978984e-05, "loss": 0.9721, "num_input_tokens_seen": 109884240, "step": 6829 }, { "epoch": 0.4784295183307503, "grad_norm": 3.588829755783081, "learning_rate": 5.220196147110333e-05, "loss": 1.0665, "num_input_tokens_seen": 109900624, "step": 6830 }, { "epoch": 0.4784995665764796, "grad_norm": 4.766005516052246, "learning_rate": 5.219496322241683e-05, "loss": 1.0467, "num_input_tokens_seen": 109917008, "step": 6831 }, { "epoch": 0.4785696148222088, "grad_norm": 3.7234201431274414, "learning_rate": 5.218796497373031e-05, "loss": 1.0377, "num_input_tokens_seen": 109933392, "step": 6832 }, { "epoch": 0.47863966306793804, "grad_norm": 3.434387683868408, "learning_rate": 5.218096672504378e-05, "loss": 0.874, "num_input_tokens_seen": 109949776, "step": 6833 }, { "epoch": 0.4787097113136673, "grad_norm": 3.7484259605407715, "learning_rate": 5.2173968476357274e-05, "loss": 0.9365, "num_input_tokens_seen": 109966016, "step": 6834 }, { "epoch": 0.47877975955939656, "grad_norm": 5.821316719055176, "learning_rate": 5.216697022767076e-05, "loss": 0.9894, "num_input_tokens_seen": 109981168, "step": 6835 }, { "epoch": 0.47884980780512576, "grad_norm": 5.2646484375, "learning_rate": 5.215997197898424e-05, "loss": 1.0894, "num_input_tokens_seen": 109996648, "step": 6836 }, { "epoch": 0.478919856050855, "grad_norm": 5.125279426574707, "learning_rate": 5.2152973730297726e-05, "loss": 0.9451, "num_input_tokens_seen": 110013032, "step": 6837 }, { "epoch": 0.4789899042965843, "grad_norm": 4.917844295501709, "learning_rate": 5.214597548161121e-05, "loss": 1.1573, "num_input_tokens_seen": 110029040, "step": 6838 }, { "epoch": 0.47905995254231354, "grad_norm": 3.6937522888183594, "learning_rate": 5.21389772329247e-05, "loss": 1.0922, "num_input_tokens_seen": 110045032, "step": 6839 }, { "epoch": 0.47913000078804274, "grad_norm": 4.9768757820129395, "learning_rate": 5.213197898423817e-05, "loss": 1.3347, "num_input_tokens_seen": 110061416, "step": 6840 }, { "epoch": 0.479200049033772, "grad_norm": 5.775148391723633, "learning_rate": 5.212498073555167e-05, "loss": 1.1443, "num_input_tokens_seen": 110077800, "step": 6841 }, { "epoch": 0.47927009727950126, "grad_norm": 4.3342766761779785, "learning_rate": 5.2117982486865155e-05, "loss": 1.0604, "num_input_tokens_seen": 110093848, "step": 6842 }, { "epoch": 0.4793401455252305, "grad_norm": 3.6098031997680664, "learning_rate": 5.211098423817863e-05, "loss": 1.0893, "num_input_tokens_seen": 110110232, "step": 6843 }, { "epoch": 0.4794101937709597, "grad_norm": 3.7780818939208984, "learning_rate": 5.210398598949212e-05, "loss": 0.9852, "num_input_tokens_seen": 110126584, "step": 6844 }, { "epoch": 0.479480242016689, "grad_norm": 3.732302188873291, "learning_rate": 5.2096987740805614e-05, "loss": 0.9158, "num_input_tokens_seen": 110142968, "step": 6845 }, { "epoch": 0.47955029026241824, "grad_norm": 4.920741558074951, "learning_rate": 5.208998949211908e-05, "loss": 0.9931, "num_input_tokens_seen": 110159352, "step": 6846 }, { "epoch": 0.4796203385081475, "grad_norm": 3.847682476043701, "learning_rate": 5.208299124343258e-05, "loss": 1.1485, "num_input_tokens_seen": 110175736, "step": 6847 }, { "epoch": 0.4796903867538767, "grad_norm": 3.8941121101379395, "learning_rate": 5.207599299474607e-05, "loss": 1.0896, "num_input_tokens_seen": 110192040, "step": 6848 }, { "epoch": 0.47976043499960597, "grad_norm": 4.254310131072998, "learning_rate": 5.2068994746059555e-05, "loss": 1.1701, "num_input_tokens_seen": 110208304, "step": 6849 }, { "epoch": 0.4798304832453352, "grad_norm": 3.85739803314209, "learning_rate": 5.206199649737302e-05, "loss": 0.9785, "num_input_tokens_seen": 110224688, "step": 6850 }, { "epoch": 0.4799005314910645, "grad_norm": 4.137633323669434, "learning_rate": 5.205499824868651e-05, "loss": 1.2111, "num_input_tokens_seen": 110240160, "step": 6851 }, { "epoch": 0.4799705797367937, "grad_norm": 3.827974557876587, "learning_rate": 5.204800000000001e-05, "loss": 0.9639, "num_input_tokens_seen": 110255952, "step": 6852 }, { "epoch": 0.48004062798252295, "grad_norm": 4.506080627441406, "learning_rate": 5.2041001751313475e-05, "loss": 1.0435, "num_input_tokens_seen": 110272336, "step": 6853 }, { "epoch": 0.4801106762282522, "grad_norm": 3.4824750423431396, "learning_rate": 5.203400350262697e-05, "loss": 0.8792, "num_input_tokens_seen": 110288720, "step": 6854 }, { "epoch": 0.48018072447398147, "grad_norm": 3.319546937942505, "learning_rate": 5.2027005253940466e-05, "loss": 0.9861, "num_input_tokens_seen": 110304984, "step": 6855 }, { "epoch": 0.48025077271971073, "grad_norm": 5.543242454528809, "learning_rate": 5.202000700525395e-05, "loss": 1.0694, "num_input_tokens_seen": 110320488, "step": 6856 }, { "epoch": 0.48032082096543993, "grad_norm": 6.7765069007873535, "learning_rate": 5.201300875656743e-05, "loss": 1.0751, "num_input_tokens_seen": 110336872, "step": 6857 }, { "epoch": 0.4803908692111692, "grad_norm": 3.5764353275299072, "learning_rate": 5.200601050788092e-05, "loss": 1.0798, "num_input_tokens_seen": 110353160, "step": 6858 }, { "epoch": 0.48046091745689845, "grad_norm": 4.938530921936035, "learning_rate": 5.19990122591944e-05, "loss": 0.9155, "num_input_tokens_seen": 110369544, "step": 6859 }, { "epoch": 0.4805309657026277, "grad_norm": 3.5447168350219727, "learning_rate": 5.1992014010507875e-05, "loss": 0.9738, "num_input_tokens_seen": 110385928, "step": 6860 }, { "epoch": 0.4806010139483569, "grad_norm": 4.1170220375061035, "learning_rate": 5.1985015761821364e-05, "loss": 1.156, "num_input_tokens_seen": 110402224, "step": 6861 }, { "epoch": 0.4806710621940862, "grad_norm": 3.6147382259368896, "learning_rate": 5.197801751313486e-05, "loss": 1.0212, "num_input_tokens_seen": 110418608, "step": 6862 }, { "epoch": 0.48074111043981543, "grad_norm": 3.745072841644287, "learning_rate": 5.197101926444834e-05, "loss": 1.1518, "num_input_tokens_seen": 110434792, "step": 6863 }, { "epoch": 0.4808111586855447, "grad_norm": 4.3973517417907715, "learning_rate": 5.196402101576182e-05, "loss": 1.0627, "num_input_tokens_seen": 110450376, "step": 6864 }, { "epoch": 0.4808812069312739, "grad_norm": 4.029878616333008, "learning_rate": 5.195702276707531e-05, "loss": 1.051, "num_input_tokens_seen": 110466760, "step": 6865 }, { "epoch": 0.48095125517700316, "grad_norm": 3.5051989555358887, "learning_rate": 5.195002451838879e-05, "loss": 1.1163, "num_input_tokens_seen": 110483144, "step": 6866 }, { "epoch": 0.4810213034227324, "grad_norm": 3.8468475341796875, "learning_rate": 5.1943026269702275e-05, "loss": 1.0515, "num_input_tokens_seen": 110499528, "step": 6867 }, { "epoch": 0.4810913516684617, "grad_norm": 3.4679362773895264, "learning_rate": 5.193602802101577e-05, "loss": 1.0516, "num_input_tokens_seen": 110515448, "step": 6868 }, { "epoch": 0.4811613999141909, "grad_norm": 3.540043830871582, "learning_rate": 5.192902977232925e-05, "loss": 1.0163, "num_input_tokens_seen": 110531832, "step": 6869 }, { "epoch": 0.48123144815992014, "grad_norm": 4.2961835861206055, "learning_rate": 5.192203152364272e-05, "loss": 0.9839, "num_input_tokens_seen": 110548216, "step": 6870 }, { "epoch": 0.4813014964056494, "grad_norm": 4.718245029449463, "learning_rate": 5.191503327495623e-05, "loss": 1.0214, "num_input_tokens_seen": 110564600, "step": 6871 }, { "epoch": 0.48137154465137866, "grad_norm": 4.846748352050781, "learning_rate": 5.190803502626971e-05, "loss": 1.1448, "num_input_tokens_seen": 110579952, "step": 6872 }, { "epoch": 0.48144159289710786, "grad_norm": 3.5760273933410645, "learning_rate": 5.1901036777583186e-05, "loss": 1.0028, "num_input_tokens_seen": 110595984, "step": 6873 }, { "epoch": 0.4815116411428371, "grad_norm": 6.386372089385986, "learning_rate": 5.189403852889667e-05, "loss": 1.1695, "num_input_tokens_seen": 110612368, "step": 6874 }, { "epoch": 0.4815816893885664, "grad_norm": 5.007279872894287, "learning_rate": 5.188704028021018e-05, "loss": 1.0406, "num_input_tokens_seen": 110628752, "step": 6875 }, { "epoch": 0.48165173763429564, "grad_norm": 4.01614236831665, "learning_rate": 5.1880042031523645e-05, "loss": 1.075, "num_input_tokens_seen": 110645136, "step": 6876 }, { "epoch": 0.48172178588002484, "grad_norm": 4.7416486740112305, "learning_rate": 5.187304378283713e-05, "loss": 1.3402, "num_input_tokens_seen": 110661400, "step": 6877 }, { "epoch": 0.4817918341257541, "grad_norm": 4.886537551879883, "learning_rate": 5.186604553415062e-05, "loss": 0.8621, "num_input_tokens_seen": 110677784, "step": 6878 }, { "epoch": 0.48186188237148336, "grad_norm": 4.033387660980225, "learning_rate": 5.1859047285464104e-05, "loss": 1.3515, "num_input_tokens_seen": 110694168, "step": 6879 }, { "epoch": 0.4819319306172126, "grad_norm": 3.7201569080352783, "learning_rate": 5.1852049036777586e-05, "loss": 1.163, "num_input_tokens_seen": 110710552, "step": 6880 }, { "epoch": 0.4820019788629418, "grad_norm": 3.73651123046875, "learning_rate": 5.1845050788091075e-05, "loss": 1.0389, "num_input_tokens_seen": 110726440, "step": 6881 }, { "epoch": 0.4820720271086711, "grad_norm": 4.395266532897949, "learning_rate": 5.1838052539404556e-05, "loss": 0.9636, "num_input_tokens_seen": 110742200, "step": 6882 }, { "epoch": 0.48214207535440035, "grad_norm": 3.70263409614563, "learning_rate": 5.183105429071804e-05, "loss": 1.0229, "num_input_tokens_seen": 110758584, "step": 6883 }, { "epoch": 0.4822121236001296, "grad_norm": 4.863175868988037, "learning_rate": 5.1824056042031534e-05, "loss": 1.1663, "num_input_tokens_seen": 110774040, "step": 6884 }, { "epoch": 0.4822821718458588, "grad_norm": 3.668220043182373, "learning_rate": 5.181705779334502e-05, "loss": 1.2351, "num_input_tokens_seen": 110790352, "step": 6885 }, { "epoch": 0.48235222009158807, "grad_norm": 4.210755825042725, "learning_rate": 5.18100595446585e-05, "loss": 1.0517, "num_input_tokens_seen": 110805912, "step": 6886 }, { "epoch": 0.48242226833731733, "grad_norm": 3.62275767326355, "learning_rate": 5.180306129597198e-05, "loss": 1.0383, "num_input_tokens_seen": 110822296, "step": 6887 }, { "epoch": 0.4824923165830466, "grad_norm": 3.498563051223755, "learning_rate": 5.179606304728547e-05, "loss": 0.9063, "num_input_tokens_seen": 110838680, "step": 6888 }, { "epoch": 0.4825623648287758, "grad_norm": 6.4097161293029785, "learning_rate": 5.178906479859895e-05, "loss": 0.9482, "num_input_tokens_seen": 110855064, "step": 6889 }, { "epoch": 0.48263241307450505, "grad_norm": 4.8159565925598145, "learning_rate": 5.178206654991243e-05, "loss": 1.2248, "num_input_tokens_seen": 110871328, "step": 6890 }, { "epoch": 0.4827024613202343, "grad_norm": 3.976828098297119, "learning_rate": 5.177506830122593e-05, "loss": 1.1521, "num_input_tokens_seen": 110886600, "step": 6891 }, { "epoch": 0.48277250956596357, "grad_norm": 3.6857738494873047, "learning_rate": 5.176807005253942e-05, "loss": 1.1149, "num_input_tokens_seen": 110902984, "step": 6892 }, { "epoch": 0.48284255781169283, "grad_norm": 4.129028797149658, "learning_rate": 5.176107180385289e-05, "loss": 1.1404, "num_input_tokens_seen": 110918808, "step": 6893 }, { "epoch": 0.48291260605742203, "grad_norm": 4.203270435333252, "learning_rate": 5.175407355516637e-05, "loss": 1.1844, "num_input_tokens_seen": 110935192, "step": 6894 }, { "epoch": 0.4829826543031513, "grad_norm": 3.7045552730560303, "learning_rate": 5.1747075306479874e-05, "loss": 0.9193, "num_input_tokens_seen": 110951168, "step": 6895 }, { "epoch": 0.48305270254888055, "grad_norm": 4.2172112464904785, "learning_rate": 5.174007705779334e-05, "loss": 0.8905, "num_input_tokens_seen": 110967552, "step": 6896 }, { "epoch": 0.4831227507946098, "grad_norm": 3.395329236984253, "learning_rate": 5.1733078809106824e-05, "loss": 0.9696, "num_input_tokens_seen": 110983736, "step": 6897 }, { "epoch": 0.483192799040339, "grad_norm": 6.649857044219971, "learning_rate": 5.172608056042032e-05, "loss": 1.2299, "num_input_tokens_seen": 111000120, "step": 6898 }, { "epoch": 0.4832628472860683, "grad_norm": 5.114965438842773, "learning_rate": 5.1719082311733815e-05, "loss": 1.196, "num_input_tokens_seen": 111016504, "step": 6899 }, { "epoch": 0.48333289553179754, "grad_norm": 4.1728410720825195, "learning_rate": 5.171208406304728e-05, "loss": 1.1445, "num_input_tokens_seen": 111032232, "step": 6900 }, { "epoch": 0.4834029437775268, "grad_norm": 3.674546241760254, "learning_rate": 5.170508581436078e-05, "loss": 1.0889, "num_input_tokens_seen": 111047576, "step": 6901 }, { "epoch": 0.483472992023256, "grad_norm": 3.4895896911621094, "learning_rate": 5.169808756567427e-05, "loss": 0.9618, "num_input_tokens_seen": 111063792, "step": 6902 }, { "epoch": 0.48354304026898526, "grad_norm": 8.447297096252441, "learning_rate": 5.169108931698774e-05, "loss": 0.911, "num_input_tokens_seen": 111079136, "step": 6903 }, { "epoch": 0.4836130885147145, "grad_norm": 4.854581356048584, "learning_rate": 5.1684091068301224e-05, "loss": 0.9725, "num_input_tokens_seen": 111093808, "step": 6904 }, { "epoch": 0.4836831367604438, "grad_norm": 3.4015259742736816, "learning_rate": 5.167709281961471e-05, "loss": 0.9395, "num_input_tokens_seen": 111110192, "step": 6905 }, { "epoch": 0.483753185006173, "grad_norm": 3.979801654815674, "learning_rate": 5.1670094570928195e-05, "loss": 1.181, "num_input_tokens_seen": 111126576, "step": 6906 }, { "epoch": 0.48382323325190224, "grad_norm": 3.655245542526245, "learning_rate": 5.166309632224169e-05, "loss": 0.9631, "num_input_tokens_seen": 111142960, "step": 6907 }, { "epoch": 0.4838932814976315, "grad_norm": 3.820819616317749, "learning_rate": 5.165609807355517e-05, "loss": 1.0845, "num_input_tokens_seen": 111159344, "step": 6908 }, { "epoch": 0.48396332974336076, "grad_norm": 3.6869490146636963, "learning_rate": 5.164909982486866e-05, "loss": 0.7909, "num_input_tokens_seen": 111175608, "step": 6909 }, { "epoch": 0.48403337798908996, "grad_norm": 3.644277334213257, "learning_rate": 5.1642101576182135e-05, "loss": 1.0442, "num_input_tokens_seen": 111191992, "step": 6910 }, { "epoch": 0.4841034262348192, "grad_norm": 3.794215202331543, "learning_rate": 5.1635103327495624e-05, "loss": 1.1105, "num_input_tokens_seen": 111207248, "step": 6911 }, { "epoch": 0.4841734744805485, "grad_norm": 4.5081987380981445, "learning_rate": 5.162810507880912e-05, "loss": 1.3952, "num_input_tokens_seen": 111223632, "step": 6912 }, { "epoch": 0.48424352272627774, "grad_norm": 3.632924795150757, "learning_rate": 5.162110683012259e-05, "loss": 1.0862, "num_input_tokens_seen": 111240016, "step": 6913 }, { "epoch": 0.48431357097200695, "grad_norm": 3.522996425628662, "learning_rate": 5.161410858143607e-05, "loss": 0.9521, "num_input_tokens_seen": 111255840, "step": 6914 }, { "epoch": 0.4843836192177362, "grad_norm": 4.495186805725098, "learning_rate": 5.1607110332749565e-05, "loss": 1.0066, "num_input_tokens_seen": 111272224, "step": 6915 }, { "epoch": 0.48445366746346546, "grad_norm": 3.6315512657165527, "learning_rate": 5.160011208406306e-05, "loss": 1.0991, "num_input_tokens_seen": 111287920, "step": 6916 }, { "epoch": 0.4845237157091947, "grad_norm": 3.4649548530578613, "learning_rate": 5.159311383537654e-05, "loss": 1.024, "num_input_tokens_seen": 111304304, "step": 6917 }, { "epoch": 0.48459376395492393, "grad_norm": 4.057675838470459, "learning_rate": 5.158611558669002e-05, "loss": 1.0403, "num_input_tokens_seen": 111320688, "step": 6918 }, { "epoch": 0.4846638122006532, "grad_norm": 4.989962100982666, "learning_rate": 5.157911733800351e-05, "loss": 1.0446, "num_input_tokens_seen": 111337072, "step": 6919 }, { "epoch": 0.48473386044638245, "grad_norm": 4.090515613555908, "learning_rate": 5.157211908931698e-05, "loss": 0.9324, "num_input_tokens_seen": 111353456, "step": 6920 }, { "epoch": 0.4848039086921117, "grad_norm": 4.017073154449463, "learning_rate": 5.156512084063046e-05, "loss": 1.0938, "num_input_tokens_seen": 111369840, "step": 6921 }, { "epoch": 0.4848739569378409, "grad_norm": 4.227852821350098, "learning_rate": 5.155812259194397e-05, "loss": 1.0553, "num_input_tokens_seen": 111386096, "step": 6922 }, { "epoch": 0.48494400518357017, "grad_norm": 5.356720447540283, "learning_rate": 5.155112434325745e-05, "loss": 1.1807, "num_input_tokens_seen": 111402192, "step": 6923 }, { "epoch": 0.48501405342929943, "grad_norm": 3.714996814727783, "learning_rate": 5.1544126094570935e-05, "loss": 0.9851, "num_input_tokens_seen": 111418120, "step": 6924 }, { "epoch": 0.4850841016750287, "grad_norm": 3.814669609069824, "learning_rate": 5.153712784588442e-05, "loss": 1.1195, "num_input_tokens_seen": 111434408, "step": 6925 }, { "epoch": 0.48515414992075795, "grad_norm": 4.38773250579834, "learning_rate": 5.1530129597197905e-05, "loss": 0.9939, "num_input_tokens_seen": 111450384, "step": 6926 }, { "epoch": 0.48522419816648715, "grad_norm": 5.492570877075195, "learning_rate": 5.152313134851139e-05, "loss": 1.0629, "num_input_tokens_seen": 111466768, "step": 6927 }, { "epoch": 0.4852942464122164, "grad_norm": 4.867751598358154, "learning_rate": 5.151613309982487e-05, "loss": 1.0787, "num_input_tokens_seen": 111481680, "step": 6928 }, { "epoch": 0.48536429465794567, "grad_norm": 3.6009931564331055, "learning_rate": 5.1509134851138364e-05, "loss": 1.1068, "num_input_tokens_seen": 111497784, "step": 6929 }, { "epoch": 0.48543434290367493, "grad_norm": 3.451188564300537, "learning_rate": 5.1502136602451846e-05, "loss": 0.9131, "num_input_tokens_seen": 111513856, "step": 6930 }, { "epoch": 0.48550439114940414, "grad_norm": 4.886107444763184, "learning_rate": 5.149513835376533e-05, "loss": 0.9234, "num_input_tokens_seen": 111530240, "step": 6931 }, { "epoch": 0.4855744393951334, "grad_norm": 4.033775806427002, "learning_rate": 5.148814010507881e-05, "loss": 1.0094, "num_input_tokens_seen": 111546160, "step": 6932 }, { "epoch": 0.48564448764086265, "grad_norm": 4.718981742858887, "learning_rate": 5.14811418563923e-05, "loss": 0.9965, "num_input_tokens_seen": 111562432, "step": 6933 }, { "epoch": 0.4857145358865919, "grad_norm": 3.7174808979034424, "learning_rate": 5.147414360770578e-05, "loss": 1.1065, "num_input_tokens_seen": 111578816, "step": 6934 }, { "epoch": 0.4857845841323211, "grad_norm": 4.0880208015441895, "learning_rate": 5.146714535901926e-05, "loss": 1.0742, "num_input_tokens_seen": 111593928, "step": 6935 }, { "epoch": 0.4858546323780504, "grad_norm": 3.3873400688171387, "learning_rate": 5.146014711033276e-05, "loss": 0.9752, "num_input_tokens_seen": 111610312, "step": 6936 }, { "epoch": 0.48592468062377964, "grad_norm": 3.6071503162384033, "learning_rate": 5.145314886164624e-05, "loss": 0.9917, "num_input_tokens_seen": 111626696, "step": 6937 }, { "epoch": 0.4859947288695089, "grad_norm": 3.502610445022583, "learning_rate": 5.1446150612959735e-05, "loss": 0.8912, "num_input_tokens_seen": 111643080, "step": 6938 }, { "epoch": 0.4860647771152381, "grad_norm": 3.5743067264556885, "learning_rate": 5.1439152364273216e-05, "loss": 1.1493, "num_input_tokens_seen": 111659048, "step": 6939 }, { "epoch": 0.48613482536096736, "grad_norm": 3.9423654079437256, "learning_rate": 5.14321541155867e-05, "loss": 1.1328, "num_input_tokens_seen": 111675432, "step": 6940 }, { "epoch": 0.4862048736066966, "grad_norm": 4.670028209686279, "learning_rate": 5.142515586690017e-05, "loss": 0.9023, "num_input_tokens_seen": 111691816, "step": 6941 }, { "epoch": 0.4862749218524259, "grad_norm": 3.8914809226989746, "learning_rate": 5.1418157618213655e-05, "loss": 1.0373, "num_input_tokens_seen": 111708200, "step": 6942 }, { "epoch": 0.4863449700981551, "grad_norm": 3.864323139190674, "learning_rate": 5.141115936952715e-05, "loss": 0.9064, "num_input_tokens_seen": 111724488, "step": 6943 }, { "epoch": 0.48641501834388434, "grad_norm": 3.700681447982788, "learning_rate": 5.140416112084063e-05, "loss": 1.092, "num_input_tokens_seen": 111740368, "step": 6944 }, { "epoch": 0.4864850665896136, "grad_norm": 3.7225606441497803, "learning_rate": 5.139716287215413e-05, "loss": 0.971, "num_input_tokens_seen": 111755936, "step": 6945 }, { "epoch": 0.48655511483534286, "grad_norm": 4.638529300689697, "learning_rate": 5.139016462346761e-05, "loss": 0.9367, "num_input_tokens_seen": 111772152, "step": 6946 }, { "epoch": 0.48662516308107207, "grad_norm": 5.287013053894043, "learning_rate": 5.138316637478109e-05, "loss": 0.9463, "num_input_tokens_seen": 111787144, "step": 6947 }, { "epoch": 0.4866952113268013, "grad_norm": 3.991861343383789, "learning_rate": 5.137616812609457e-05, "loss": 0.8265, "num_input_tokens_seen": 111803528, "step": 6948 }, { "epoch": 0.4867652595725306, "grad_norm": 4.166889190673828, "learning_rate": 5.136916987740806e-05, "loss": 1.1667, "num_input_tokens_seen": 111819376, "step": 6949 }, { "epoch": 0.48683530781825984, "grad_norm": 4.159299373626709, "learning_rate": 5.1362171628721543e-05, "loss": 1.1422, "num_input_tokens_seen": 111835760, "step": 6950 }, { "epoch": 0.48690535606398905, "grad_norm": 5.612180709838867, "learning_rate": 5.1355173380035025e-05, "loss": 0.9889, "num_input_tokens_seen": 111851744, "step": 6951 }, { "epoch": 0.4869754043097183, "grad_norm": 5.82523775100708, "learning_rate": 5.134817513134852e-05, "loss": 0.9702, "num_input_tokens_seen": 111868128, "step": 6952 }, { "epoch": 0.48704545255544757, "grad_norm": 3.5110416412353516, "learning_rate": 5.1341176882662e-05, "loss": 0.9538, "num_input_tokens_seen": 111884512, "step": 6953 }, { "epoch": 0.4871155008011768, "grad_norm": 4.108850479125977, "learning_rate": 5.1334178633975484e-05, "loss": 1.0866, "num_input_tokens_seen": 111899544, "step": 6954 }, { "epoch": 0.48718554904690603, "grad_norm": 3.9000258445739746, "learning_rate": 5.132718038528897e-05, "loss": 0.9456, "num_input_tokens_seen": 111915928, "step": 6955 }, { "epoch": 0.4872555972926353, "grad_norm": 4.503340244293213, "learning_rate": 5.1320182136602455e-05, "loss": 1.004, "num_input_tokens_seen": 111932288, "step": 6956 }, { "epoch": 0.48732564553836455, "grad_norm": 4.052606582641602, "learning_rate": 5.1313183887915936e-05, "loss": 1.1662, "num_input_tokens_seen": 111948672, "step": 6957 }, { "epoch": 0.4873956937840938, "grad_norm": 3.4959487915039062, "learning_rate": 5.130618563922942e-05, "loss": 1.0242, "num_input_tokens_seen": 111964272, "step": 6958 }, { "epoch": 0.48746574202982307, "grad_norm": 4.654433250427246, "learning_rate": 5.12991873905429e-05, "loss": 1.1307, "num_input_tokens_seen": 111980656, "step": 6959 }, { "epoch": 0.48753579027555227, "grad_norm": 4.125091552734375, "learning_rate": 5.1292189141856395e-05, "loss": 1.1137, "num_input_tokens_seen": 111996584, "step": 6960 }, { "epoch": 0.48760583852128153, "grad_norm": 4.574272155761719, "learning_rate": 5.128519089316988e-05, "loss": 0.9312, "num_input_tokens_seen": 112011528, "step": 6961 }, { "epoch": 0.4876758867670108, "grad_norm": 4.110400676727295, "learning_rate": 5.127819264448337e-05, "loss": 0.9669, "num_input_tokens_seen": 112026256, "step": 6962 }, { "epoch": 0.48774593501274005, "grad_norm": 3.4572913646698, "learning_rate": 5.127119439579685e-05, "loss": 0.9956, "num_input_tokens_seen": 112042288, "step": 6963 }, { "epoch": 0.48781598325846925, "grad_norm": 4.498427391052246, "learning_rate": 5.126419614711033e-05, "loss": 1.1088, "num_input_tokens_seen": 112058672, "step": 6964 }, { "epoch": 0.4878860315041985, "grad_norm": 5.4692301750183105, "learning_rate": 5.1257197898423825e-05, "loss": 0.9582, "num_input_tokens_seen": 112073536, "step": 6965 }, { "epoch": 0.4879560797499278, "grad_norm": 3.8990654945373535, "learning_rate": 5.125019964973732e-05, "loss": 1.2729, "num_input_tokens_seen": 112089344, "step": 6966 }, { "epoch": 0.48802612799565703, "grad_norm": 3.5601627826690674, "learning_rate": 5.12432014010508e-05, "loss": 1.0519, "num_input_tokens_seen": 112105296, "step": 6967 }, { "epoch": 0.48809617624138624, "grad_norm": 3.91282057762146, "learning_rate": 5.123620315236427e-05, "loss": 1.2046, "num_input_tokens_seen": 112120600, "step": 6968 }, { "epoch": 0.4881662244871155, "grad_norm": 5.9246602058410645, "learning_rate": 5.1229204903677766e-05, "loss": 1.0982, "num_input_tokens_seen": 112136472, "step": 6969 }, { "epoch": 0.48823627273284476, "grad_norm": 8.849782943725586, "learning_rate": 5.122220665499125e-05, "loss": 1.0671, "num_input_tokens_seen": 112152792, "step": 6970 }, { "epoch": 0.488306320978574, "grad_norm": 4.184106349945068, "learning_rate": 5.121520840630473e-05, "loss": 1.2696, "num_input_tokens_seen": 112169176, "step": 6971 }, { "epoch": 0.4883763692243032, "grad_norm": 4.250857830047607, "learning_rate": 5.120821015761822e-05, "loss": 0.9724, "num_input_tokens_seen": 112184784, "step": 6972 }, { "epoch": 0.4884464174700325, "grad_norm": 4.522305011749268, "learning_rate": 5.12012119089317e-05, "loss": 0.8251, "num_input_tokens_seen": 112200960, "step": 6973 }, { "epoch": 0.48851646571576174, "grad_norm": 3.5135490894317627, "learning_rate": 5.1194213660245195e-05, "loss": 1.0713, "num_input_tokens_seen": 112217080, "step": 6974 }, { "epoch": 0.488586513961491, "grad_norm": 5.541810989379883, "learning_rate": 5.118721541155866e-05, "loss": 1.1885, "num_input_tokens_seen": 112233464, "step": 6975 }, { "epoch": 0.4886565622072202, "grad_norm": 3.7535064220428467, "learning_rate": 5.1180217162872165e-05, "loss": 1.0558, "num_input_tokens_seen": 112249848, "step": 6976 }, { "epoch": 0.48872661045294946, "grad_norm": 4.454082012176514, "learning_rate": 5.117321891418565e-05, "loss": 1.0892, "num_input_tokens_seen": 112265560, "step": 6977 }, { "epoch": 0.4887966586986787, "grad_norm": 3.770138740539551, "learning_rate": 5.116622066549912e-05, "loss": 1.082, "num_input_tokens_seen": 112281944, "step": 6978 }, { "epoch": 0.488866706944408, "grad_norm": 5.923669815063477, "learning_rate": 5.115922241681261e-05, "loss": 1.0662, "num_input_tokens_seen": 112298264, "step": 6979 }, { "epoch": 0.4889367551901372, "grad_norm": 3.9768123626708984, "learning_rate": 5.115222416812609e-05, "loss": 1.1816, "num_input_tokens_seen": 112314608, "step": 6980 }, { "epoch": 0.48900680343586644, "grad_norm": 5.525039196014404, "learning_rate": 5.1145225919439575e-05, "loss": 1.0287, "num_input_tokens_seen": 112330400, "step": 6981 }, { "epoch": 0.4890768516815957, "grad_norm": 3.8725640773773193, "learning_rate": 5.113822767075307e-05, "loss": 0.9666, "num_input_tokens_seen": 112345384, "step": 6982 }, { "epoch": 0.48914689992732496, "grad_norm": 4.746465682983398, "learning_rate": 5.1131229422066565e-05, "loss": 1.1162, "num_input_tokens_seen": 112361768, "step": 6983 }, { "epoch": 0.48921694817305417, "grad_norm": 3.774049997329712, "learning_rate": 5.112423117338005e-05, "loss": 1.0898, "num_input_tokens_seen": 112377432, "step": 6984 }, { "epoch": 0.4892869964187834, "grad_norm": 3.686307191848755, "learning_rate": 5.1117232924693515e-05, "loss": 1.0459, "num_input_tokens_seen": 112393672, "step": 6985 }, { "epoch": 0.4893570446645127, "grad_norm": 4.177459239959717, "learning_rate": 5.1110234676007004e-05, "loss": 1.0504, "num_input_tokens_seen": 112409600, "step": 6986 }, { "epoch": 0.48942709291024195, "grad_norm": 3.8517558574676514, "learning_rate": 5.1103236427320486e-05, "loss": 1.0947, "num_input_tokens_seen": 112425880, "step": 6987 }, { "epoch": 0.48949714115597115, "grad_norm": 3.3155159950256348, "learning_rate": 5.109623817863397e-05, "loss": 0.9201, "num_input_tokens_seen": 112442264, "step": 6988 }, { "epoch": 0.4895671894017004, "grad_norm": 4.027132987976074, "learning_rate": 5.108923992994746e-05, "loss": 1.0882, "num_input_tokens_seen": 112458504, "step": 6989 }, { "epoch": 0.48963723764742967, "grad_norm": 3.622421979904175, "learning_rate": 5.108224168126096e-05, "loss": 0.9098, "num_input_tokens_seen": 112474888, "step": 6990 }, { "epoch": 0.48970728589315893, "grad_norm": 4.16541051864624, "learning_rate": 5.107524343257444e-05, "loss": 0.967, "num_input_tokens_seen": 112491272, "step": 6991 }, { "epoch": 0.48977733413888813, "grad_norm": 4.473822593688965, "learning_rate": 5.106824518388792e-05, "loss": 1.026, "num_input_tokens_seen": 112506632, "step": 6992 }, { "epoch": 0.4898473823846174, "grad_norm": 5.10452127456665, "learning_rate": 5.106124693520141e-05, "loss": 1.003, "num_input_tokens_seen": 112521696, "step": 6993 }, { "epoch": 0.48991743063034665, "grad_norm": 4.185652732849121, "learning_rate": 5.105424868651489e-05, "loss": 0.9275, "num_input_tokens_seen": 112537432, "step": 6994 }, { "epoch": 0.4899874788760759, "grad_norm": 4.864262580871582, "learning_rate": 5.104725043782837e-05, "loss": 1.033, "num_input_tokens_seen": 112553816, "step": 6995 }, { "epoch": 0.49005752712180517, "grad_norm": 3.859199047088623, "learning_rate": 5.1040252189141856e-05, "loss": 1.115, "num_input_tokens_seen": 112570200, "step": 6996 }, { "epoch": 0.4901275753675344, "grad_norm": 3.49395751953125, "learning_rate": 5.103325394045535e-05, "loss": 0.9285, "num_input_tokens_seen": 112586584, "step": 6997 }, { "epoch": 0.49019762361326363, "grad_norm": 4.164735317230225, "learning_rate": 5.102625569176883e-05, "loss": 0.8565, "num_input_tokens_seen": 112602568, "step": 6998 }, { "epoch": 0.4902676718589929, "grad_norm": 6.273041725158691, "learning_rate": 5.1019257443082315e-05, "loss": 1.0252, "num_input_tokens_seen": 112618952, "step": 6999 }, { "epoch": 0.49033772010472215, "grad_norm": 3.8460848331451416, "learning_rate": 5.1012259194395804e-05, "loss": 0.9854, "num_input_tokens_seen": 112635336, "step": 7000 }, { "epoch": 0.49033772010472215, "eval_loss": 1.1226829290390015, "eval_runtime": 0.157, "eval_samples_per_second": 6.371, "eval_steps_per_second": 6.371, "num_input_tokens_seen": 112635336, "step": 7000 }, { "epoch": 0.49040776835045136, "grad_norm": 4.498118877410889, "learning_rate": 5.1005260945709285e-05, "loss": 1.2191, "num_input_tokens_seen": 112651248, "step": 7001 }, { "epoch": 0.4904778165961806, "grad_norm": 3.9830660820007324, "learning_rate": 5.099826269702276e-05, "loss": 0.9127, "num_input_tokens_seen": 112667632, "step": 7002 }, { "epoch": 0.4905478648419099, "grad_norm": 3.439422130584717, "learning_rate": 5.099126444833626e-05, "loss": 0.9423, "num_input_tokens_seen": 112684016, "step": 7003 }, { "epoch": 0.49061791308763913, "grad_norm": 3.6636171340942383, "learning_rate": 5.0984266199649744e-05, "loss": 1.0124, "num_input_tokens_seen": 112700360, "step": 7004 }, { "epoch": 0.49068796133336834, "grad_norm": 4.3784589767456055, "learning_rate": 5.097726795096321e-05, "loss": 1.1782, "num_input_tokens_seen": 112715624, "step": 7005 }, { "epoch": 0.4907580095790976, "grad_norm": 3.716031789779663, "learning_rate": 5.097026970227671e-05, "loss": 1.0312, "num_input_tokens_seen": 112732008, "step": 7006 }, { "epoch": 0.49082805782482686, "grad_norm": 4.289496898651123, "learning_rate": 5.09632714535902e-05, "loss": 1.103, "num_input_tokens_seen": 112748392, "step": 7007 }, { "epoch": 0.4908981060705561, "grad_norm": 4.012343883514404, "learning_rate": 5.095627320490368e-05, "loss": 1.0623, "num_input_tokens_seen": 112764432, "step": 7008 }, { "epoch": 0.4909681543162853, "grad_norm": 3.7599732875823975, "learning_rate": 5.094927495621716e-05, "loss": 1.0666, "num_input_tokens_seen": 112780544, "step": 7009 }, { "epoch": 0.4910382025620146, "grad_norm": 3.398778200149536, "learning_rate": 5.0942276707530656e-05, "loss": 1.0517, "num_input_tokens_seen": 112796928, "step": 7010 }, { "epoch": 0.49110825080774384, "grad_norm": 7.299741268157959, "learning_rate": 5.093527845884414e-05, "loss": 1.0963, "num_input_tokens_seen": 112812576, "step": 7011 }, { "epoch": 0.4911782990534731, "grad_norm": 4.2506866455078125, "learning_rate": 5.0928280210157606e-05, "loss": 1.11, "num_input_tokens_seen": 112828184, "step": 7012 }, { "epoch": 0.4912483472992023, "grad_norm": 4.033505916595459, "learning_rate": 5.09212819614711e-05, "loss": 1.1648, "num_input_tokens_seen": 112844000, "step": 7013 }, { "epoch": 0.49131839554493156, "grad_norm": 3.9474592208862305, "learning_rate": 5.0914283712784596e-05, "loss": 1.081, "num_input_tokens_seen": 112860384, "step": 7014 }, { "epoch": 0.4913884437906608, "grad_norm": 5.549149036407471, "learning_rate": 5.090728546409808e-05, "loss": 0.8816, "num_input_tokens_seen": 112875512, "step": 7015 }, { "epoch": 0.4914584920363901, "grad_norm": 5.086400985717773, "learning_rate": 5.090028721541156e-05, "loss": 1.2013, "num_input_tokens_seen": 112890840, "step": 7016 }, { "epoch": 0.4915285402821193, "grad_norm": 3.8358511924743652, "learning_rate": 5.089328896672505e-05, "loss": 0.9351, "num_input_tokens_seen": 112907048, "step": 7017 }, { "epoch": 0.49159858852784855, "grad_norm": 3.902388095855713, "learning_rate": 5.088629071803853e-05, "loss": 0.9513, "num_input_tokens_seen": 112923152, "step": 7018 }, { "epoch": 0.4916686367735778, "grad_norm": 4.3525166511535645, "learning_rate": 5.0879292469352026e-05, "loss": 0.9481, "num_input_tokens_seen": 112939536, "step": 7019 }, { "epoch": 0.49173868501930706, "grad_norm": 4.519508361816406, "learning_rate": 5.087229422066551e-05, "loss": 0.9127, "num_input_tokens_seen": 112955920, "step": 7020 }, { "epoch": 0.49180873326503627, "grad_norm": 4.366591930389404, "learning_rate": 5.086529597197899e-05, "loss": 1.1246, "num_input_tokens_seen": 112971824, "step": 7021 }, { "epoch": 0.49187878151076553, "grad_norm": 3.9145777225494385, "learning_rate": 5.085829772329247e-05, "loss": 1.085, "num_input_tokens_seen": 112988208, "step": 7022 }, { "epoch": 0.4919488297564948, "grad_norm": 3.9565582275390625, "learning_rate": 5.085129947460595e-05, "loss": 1.0299, "num_input_tokens_seen": 113004592, "step": 7023 }, { "epoch": 0.49201887800222405, "grad_norm": 4.051690101623535, "learning_rate": 5.084430122591944e-05, "loss": 1.0569, "num_input_tokens_seen": 113020552, "step": 7024 }, { "epoch": 0.49208892624795325, "grad_norm": 4.020756244659424, "learning_rate": 5.0837302977232923e-05, "loss": 0.9238, "num_input_tokens_seen": 113036840, "step": 7025 }, { "epoch": 0.4921589744936825, "grad_norm": 4.177811622619629, "learning_rate": 5.0830304728546405e-05, "loss": 1.147, "num_input_tokens_seen": 113051816, "step": 7026 }, { "epoch": 0.49222902273941177, "grad_norm": 3.997945785522461, "learning_rate": 5.08233064798599e-05, "loss": 0.9788, "num_input_tokens_seen": 113067968, "step": 7027 }, { "epoch": 0.49229907098514103, "grad_norm": 4.968790531158447, "learning_rate": 5.081630823117338e-05, "loss": 1.1171, "num_input_tokens_seen": 113084352, "step": 7028 }, { "epoch": 0.4923691192308703, "grad_norm": 3.9024367332458496, "learning_rate": 5.0809309982486864e-05, "loss": 0.9673, "num_input_tokens_seen": 113100672, "step": 7029 }, { "epoch": 0.4924391674765995, "grad_norm": 4.58246374130249, "learning_rate": 5.080231173380036e-05, "loss": 1.0995, "num_input_tokens_seen": 113115856, "step": 7030 }, { "epoch": 0.49250921572232875, "grad_norm": 4.944141387939453, "learning_rate": 5.0795313485113835e-05, "loss": 1.0369, "num_input_tokens_seen": 113132240, "step": 7031 }, { "epoch": 0.492579263968058, "grad_norm": 5.382607460021973, "learning_rate": 5.0788315236427316e-05, "loss": 1.1601, "num_input_tokens_seen": 113148624, "step": 7032 }, { "epoch": 0.49264931221378727, "grad_norm": 4.664593696594238, "learning_rate": 5.07813169877408e-05, "loss": 1.2741, "num_input_tokens_seen": 113164608, "step": 7033 }, { "epoch": 0.4927193604595165, "grad_norm": 3.8908638954162598, "learning_rate": 5.077431873905431e-05, "loss": 1.0828, "num_input_tokens_seen": 113180712, "step": 7034 }, { "epoch": 0.49278940870524574, "grad_norm": 3.93803334236145, "learning_rate": 5.0767320490367775e-05, "loss": 1.0555, "num_input_tokens_seen": 113196328, "step": 7035 }, { "epoch": 0.492859456950975, "grad_norm": 5.349659442901611, "learning_rate": 5.076032224168127e-05, "loss": 1.0171, "num_input_tokens_seen": 113212712, "step": 7036 }, { "epoch": 0.49292950519670425, "grad_norm": 5.471059322357178, "learning_rate": 5.075332399299475e-05, "loss": 1.0165, "num_input_tokens_seen": 113229096, "step": 7037 }, { "epoch": 0.49299955344243346, "grad_norm": 5.430948734283447, "learning_rate": 5.0746325744308234e-05, "loss": 1.1832, "num_input_tokens_seen": 113244752, "step": 7038 }, { "epoch": 0.4930696016881627, "grad_norm": 3.4861812591552734, "learning_rate": 5.0739327495621716e-05, "loss": 0.9185, "num_input_tokens_seen": 113261136, "step": 7039 }, { "epoch": 0.493139649933892, "grad_norm": 3.9184775352478027, "learning_rate": 5.0732329246935205e-05, "loss": 1.0508, "num_input_tokens_seen": 113277520, "step": 7040 }, { "epoch": 0.49320969817962124, "grad_norm": 3.6723365783691406, "learning_rate": 5.072533099824869e-05, "loss": 1.0803, "num_input_tokens_seen": 113293208, "step": 7041 }, { "epoch": 0.49327974642535044, "grad_norm": 4.273809432983398, "learning_rate": 5.071833274956217e-05, "loss": 1.2621, "num_input_tokens_seen": 113309592, "step": 7042 }, { "epoch": 0.4933497946710797, "grad_norm": 3.448326349258423, "learning_rate": 5.0711334500875664e-05, "loss": 1.0201, "num_input_tokens_seen": 113325976, "step": 7043 }, { "epoch": 0.49341984291680896, "grad_norm": 4.427138805389404, "learning_rate": 5.070433625218915e-05, "loss": 1.1113, "num_input_tokens_seen": 113341896, "step": 7044 }, { "epoch": 0.4934898911625382, "grad_norm": 3.837282180786133, "learning_rate": 5.069733800350263e-05, "loss": 1.0454, "num_input_tokens_seen": 113358280, "step": 7045 }, { "epoch": 0.4935599394082674, "grad_norm": 4.842933654785156, "learning_rate": 5.0690339754816116e-05, "loss": 1.3572, "num_input_tokens_seen": 113374664, "step": 7046 }, { "epoch": 0.4936299876539967, "grad_norm": 5.275210857391357, "learning_rate": 5.06833415061296e-05, "loss": 1.0208, "num_input_tokens_seen": 113391048, "step": 7047 }, { "epoch": 0.49370003589972594, "grad_norm": 4.120177268981934, "learning_rate": 5.067634325744308e-05, "loss": 1.1957, "num_input_tokens_seen": 113407432, "step": 7048 }, { "epoch": 0.4937700841454552, "grad_norm": 4.254641056060791, "learning_rate": 5.066934500875656e-05, "loss": 1.1381, "num_input_tokens_seen": 113423392, "step": 7049 }, { "epoch": 0.4938401323911844, "grad_norm": 4.007355690002441, "learning_rate": 5.066234676007006e-05, "loss": 0.9828, "num_input_tokens_seen": 113439776, "step": 7050 }, { "epoch": 0.49391018063691366, "grad_norm": 3.8774940967559814, "learning_rate": 5.065534851138355e-05, "loss": 0.8772, "num_input_tokens_seen": 113456160, "step": 7051 }, { "epoch": 0.4939802288826429, "grad_norm": 3.6799323558807373, "learning_rate": 5.064835026269702e-05, "loss": 1.0428, "num_input_tokens_seen": 113472408, "step": 7052 }, { "epoch": 0.4940502771283722, "grad_norm": 3.662111282348633, "learning_rate": 5.064135201401051e-05, "loss": 1.0102, "num_input_tokens_seen": 113488792, "step": 7053 }, { "epoch": 0.4941203253741014, "grad_norm": 4.95071268081665, "learning_rate": 5.0634353765324004e-05, "loss": 0.9709, "num_input_tokens_seen": 113505176, "step": 7054 }, { "epoch": 0.49419037361983065, "grad_norm": 4.512982368469238, "learning_rate": 5.062735551663747e-05, "loss": 1.0955, "num_input_tokens_seen": 113520640, "step": 7055 }, { "epoch": 0.4942604218655599, "grad_norm": 3.6250205039978027, "learning_rate": 5.0620357267950955e-05, "loss": 0.9812, "num_input_tokens_seen": 113535616, "step": 7056 }, { "epoch": 0.49433047011128917, "grad_norm": 3.856593132019043, "learning_rate": 5.0613359019264463e-05, "loss": 0.9007, "num_input_tokens_seen": 113552000, "step": 7057 }, { "epoch": 0.49440051835701837, "grad_norm": 3.655444860458374, "learning_rate": 5.0606360770577945e-05, "loss": 1.1066, "num_input_tokens_seen": 113568200, "step": 7058 }, { "epoch": 0.49447056660274763, "grad_norm": 4.795759677886963, "learning_rate": 5.0599362521891414e-05, "loss": 0.9739, "num_input_tokens_seen": 113584584, "step": 7059 }, { "epoch": 0.4945406148484769, "grad_norm": 4.4534783363342285, "learning_rate": 5.059236427320491e-05, "loss": 1.0207, "num_input_tokens_seen": 113600968, "step": 7060 }, { "epoch": 0.49461066309420615, "grad_norm": 4.681578159332275, "learning_rate": 5.05853660245184e-05, "loss": 1.3153, "num_input_tokens_seen": 113617352, "step": 7061 }, { "epoch": 0.49468071133993535, "grad_norm": 3.6734678745269775, "learning_rate": 5.057836777583187e-05, "loss": 1.0005, "num_input_tokens_seen": 113633736, "step": 7062 }, { "epoch": 0.4947507595856646, "grad_norm": 3.7208728790283203, "learning_rate": 5.057136952714536e-05, "loss": 1.0641, "num_input_tokens_seen": 113649224, "step": 7063 }, { "epoch": 0.49482080783139387, "grad_norm": 4.036924362182617, "learning_rate": 5.0564371278458856e-05, "loss": 1.0842, "num_input_tokens_seen": 113665552, "step": 7064 }, { "epoch": 0.49489085607712313, "grad_norm": 6.462393760681152, "learning_rate": 5.055737302977234e-05, "loss": 1.0341, "num_input_tokens_seen": 113681192, "step": 7065 }, { "epoch": 0.4949609043228524, "grad_norm": 4.203556537628174, "learning_rate": 5.055037478108582e-05, "loss": 1.3181, "num_input_tokens_seen": 113697168, "step": 7066 }, { "epoch": 0.4950309525685816, "grad_norm": 3.798896551132202, "learning_rate": 5.05433765323993e-05, "loss": 1.1016, "num_input_tokens_seen": 113713504, "step": 7067 }, { "epoch": 0.49510100081431085, "grad_norm": 4.175333499908447, "learning_rate": 5.053637828371279e-05, "loss": 1.0572, "num_input_tokens_seen": 113729872, "step": 7068 }, { "epoch": 0.4951710490600401, "grad_norm": 3.563164234161377, "learning_rate": 5.0529380035026266e-05, "loss": 0.849, "num_input_tokens_seen": 113745736, "step": 7069 }, { "epoch": 0.4952410973057694, "grad_norm": 3.605379104614258, "learning_rate": 5.0522381786339754e-05, "loss": 1.0291, "num_input_tokens_seen": 113760816, "step": 7070 }, { "epoch": 0.4953111455514986, "grad_norm": 3.849106550216675, "learning_rate": 5.051538353765325e-05, "loss": 0.8297, "num_input_tokens_seen": 113776600, "step": 7071 }, { "epoch": 0.49538119379722784, "grad_norm": 4.046478748321533, "learning_rate": 5.050838528896672e-05, "loss": 0.9624, "num_input_tokens_seen": 113792984, "step": 7072 }, { "epoch": 0.4954512420429571, "grad_norm": 4.66940450668335, "learning_rate": 5.050138704028021e-05, "loss": 1.1873, "num_input_tokens_seen": 113809072, "step": 7073 }, { "epoch": 0.49552129028868636, "grad_norm": 5.5129075050354, "learning_rate": 5.049438879159371e-05, "loss": 1.0224, "num_input_tokens_seen": 113825456, "step": 7074 }, { "epoch": 0.49559133853441556, "grad_norm": 4.045241355895996, "learning_rate": 5.048739054290719e-05, "loss": 1.0762, "num_input_tokens_seen": 113841840, "step": 7075 }, { "epoch": 0.4956613867801448, "grad_norm": 3.9198641777038574, "learning_rate": 5.0480392294220665e-05, "loss": 1.1765, "num_input_tokens_seen": 113857624, "step": 7076 }, { "epoch": 0.4957314350258741, "grad_norm": 3.836678981781006, "learning_rate": 5.047339404553415e-05, "loss": 1.1377, "num_input_tokens_seen": 113874008, "step": 7077 }, { "epoch": 0.49580148327160334, "grad_norm": 3.593061923980713, "learning_rate": 5.046639579684764e-05, "loss": 1.0115, "num_input_tokens_seen": 113889992, "step": 7078 }, { "epoch": 0.49587153151733254, "grad_norm": 5.477400302886963, "learning_rate": 5.045939754816111e-05, "loss": 1.1446, "num_input_tokens_seen": 113906376, "step": 7079 }, { "epoch": 0.4959415797630618, "grad_norm": 5.204897880554199, "learning_rate": 5.045239929947462e-05, "loss": 1.2356, "num_input_tokens_seen": 113921792, "step": 7080 }, { "epoch": 0.49601162800879106, "grad_norm": 6.132393836975098, "learning_rate": 5.04454010507881e-05, "loss": 1.0126, "num_input_tokens_seen": 113937720, "step": 7081 }, { "epoch": 0.4960816762545203, "grad_norm": 3.651715040206909, "learning_rate": 5.043840280210158e-05, "loss": 0.948, "num_input_tokens_seen": 113953016, "step": 7082 }, { "epoch": 0.4961517245002495, "grad_norm": 4.28763484954834, "learning_rate": 5.0431404553415065e-05, "loss": 0.9985, "num_input_tokens_seen": 113969400, "step": 7083 }, { "epoch": 0.4962217727459788, "grad_norm": 7.6505208015441895, "learning_rate": 5.0424406304728554e-05, "loss": 1.2593, "num_input_tokens_seen": 113985784, "step": 7084 }, { "epoch": 0.49629182099170804, "grad_norm": 4.85219144821167, "learning_rate": 5.0417408056042036e-05, "loss": 0.9904, "num_input_tokens_seen": 114001592, "step": 7085 }, { "epoch": 0.4963618692374373, "grad_norm": 3.414391040802002, "learning_rate": 5.041040980735552e-05, "loss": 1.088, "num_input_tokens_seen": 114017976, "step": 7086 }, { "epoch": 0.4964319174831665, "grad_norm": 4.361126899719238, "learning_rate": 5.040341155866901e-05, "loss": 1.0559, "num_input_tokens_seen": 114034360, "step": 7087 }, { "epoch": 0.49650196572889577, "grad_norm": 3.459439754486084, "learning_rate": 5.0396413309982495e-05, "loss": 0.8409, "num_input_tokens_seen": 114050744, "step": 7088 }, { "epoch": 0.496572013974625, "grad_norm": 4.241810321807861, "learning_rate": 5.0389415061295976e-05, "loss": 1.1019, "num_input_tokens_seen": 114066776, "step": 7089 }, { "epoch": 0.4966420622203543, "grad_norm": 4.012382984161377, "learning_rate": 5.0382416812609465e-05, "loss": 1.1166, "num_input_tokens_seen": 114082888, "step": 7090 }, { "epoch": 0.4967121104660835, "grad_norm": 3.8776516914367676, "learning_rate": 5.037541856392295e-05, "loss": 1.0333, "num_input_tokens_seen": 114099072, "step": 7091 }, { "epoch": 0.49678215871181275, "grad_norm": 4.0513014793396, "learning_rate": 5.036842031523643e-05, "loss": 0.9747, "num_input_tokens_seen": 114115296, "step": 7092 }, { "epoch": 0.496852206957542, "grad_norm": 3.7338500022888184, "learning_rate": 5.036142206654991e-05, "loss": 1.1254, "num_input_tokens_seen": 114131680, "step": 7093 }, { "epoch": 0.49692225520327127, "grad_norm": 5.892488956451416, "learning_rate": 5.035442381786339e-05, "loss": 1.0316, "num_input_tokens_seen": 114146560, "step": 7094 }, { "epoch": 0.49699230344900047, "grad_norm": 5.1975507736206055, "learning_rate": 5.034742556917689e-05, "loss": 1.2128, "num_input_tokens_seen": 114162944, "step": 7095 }, { "epoch": 0.49706235169472973, "grad_norm": 4.196847438812256, "learning_rate": 5.034042732049037e-05, "loss": 1.0168, "num_input_tokens_seen": 114178040, "step": 7096 }, { "epoch": 0.497132399940459, "grad_norm": 4.342573642730713, "learning_rate": 5.0333429071803865e-05, "loss": 0.9601, "num_input_tokens_seen": 114194424, "step": 7097 }, { "epoch": 0.49720244818618825, "grad_norm": 4.113316059112549, "learning_rate": 5.032643082311734e-05, "loss": 1.0902, "num_input_tokens_seen": 114210808, "step": 7098 }, { "epoch": 0.4972724964319175, "grad_norm": 4.835622787475586, "learning_rate": 5.031943257443082e-05, "loss": 1.1025, "num_input_tokens_seen": 114225248, "step": 7099 }, { "epoch": 0.4973425446776467, "grad_norm": 4.603962421417236, "learning_rate": 5.031243432574432e-05, "loss": 1.1335, "num_input_tokens_seen": 114241632, "step": 7100 }, { "epoch": 0.497412592923376, "grad_norm": 4.17899227142334, "learning_rate": 5.030543607705781e-05, "loss": 1.0766, "num_input_tokens_seen": 114256688, "step": 7101 }, { "epoch": 0.49748264116910523, "grad_norm": 3.890780448913574, "learning_rate": 5.029843782837128e-05, "loss": 1.089, "num_input_tokens_seen": 114273072, "step": 7102 }, { "epoch": 0.4975526894148345, "grad_norm": 4.290158748626709, "learning_rate": 5.029143957968476e-05, "loss": 0.9706, "num_input_tokens_seen": 114288936, "step": 7103 }, { "epoch": 0.4976227376605637, "grad_norm": 5.222672462463379, "learning_rate": 5.028444133099826e-05, "loss": 1.0974, "num_input_tokens_seen": 114305320, "step": 7104 }, { "epoch": 0.49769278590629296, "grad_norm": 3.383232355117798, "learning_rate": 5.027744308231174e-05, "loss": 1.0279, "num_input_tokens_seen": 114321704, "step": 7105 }, { "epoch": 0.4977628341520222, "grad_norm": 3.8526852130889893, "learning_rate": 5.027044483362522e-05, "loss": 1.0064, "num_input_tokens_seen": 114337920, "step": 7106 }, { "epoch": 0.4978328823977515, "grad_norm": 3.699127674102783, "learning_rate": 5.026344658493871e-05, "loss": 1.1549, "num_input_tokens_seen": 114354304, "step": 7107 }, { "epoch": 0.4979029306434807, "grad_norm": 3.3088033199310303, "learning_rate": 5.025644833625219e-05, "loss": 0.9093, "num_input_tokens_seen": 114370688, "step": 7108 }, { "epoch": 0.49797297888920994, "grad_norm": 4.435497760772705, "learning_rate": 5.0249450087565674e-05, "loss": 1.0656, "num_input_tokens_seen": 114386664, "step": 7109 }, { "epoch": 0.4980430271349392, "grad_norm": 4.2929840087890625, "learning_rate": 5.0242451838879155e-05, "loss": 0.9749, "num_input_tokens_seen": 114401816, "step": 7110 }, { "epoch": 0.49811307538066846, "grad_norm": 4.264016628265381, "learning_rate": 5.023545359019266e-05, "loss": 1.008, "num_input_tokens_seen": 114418200, "step": 7111 }, { "epoch": 0.49818312362639766, "grad_norm": 3.5081541538238525, "learning_rate": 5.022845534150613e-05, "loss": 0.8752, "num_input_tokens_seen": 114434424, "step": 7112 }, { "epoch": 0.4982531718721269, "grad_norm": 5.671893119812012, "learning_rate": 5.0221457092819614e-05, "loss": 1.0707, "num_input_tokens_seen": 114449560, "step": 7113 }, { "epoch": 0.4983232201178562, "grad_norm": 4.350570201873779, "learning_rate": 5.02144588441331e-05, "loss": 1.105, "num_input_tokens_seen": 114465504, "step": 7114 }, { "epoch": 0.49839326836358544, "grad_norm": 3.650238513946533, "learning_rate": 5.0207460595446585e-05, "loss": 1.1202, "num_input_tokens_seen": 114481760, "step": 7115 }, { "epoch": 0.49846331660931464, "grad_norm": 4.211227893829346, "learning_rate": 5.020046234676007e-05, "loss": 1.228, "num_input_tokens_seen": 114496696, "step": 7116 }, { "epoch": 0.4985333648550439, "grad_norm": 3.561427354812622, "learning_rate": 5.019346409807356e-05, "loss": 0.8941, "num_input_tokens_seen": 114512616, "step": 7117 }, { "epoch": 0.49860341310077316, "grad_norm": 4.558845520019531, "learning_rate": 5.018646584938706e-05, "loss": 1.1921, "num_input_tokens_seen": 114527600, "step": 7118 }, { "epoch": 0.4986734613465024, "grad_norm": 3.419285297393799, "learning_rate": 5.0179467600700526e-05, "loss": 1.044, "num_input_tokens_seen": 114543984, "step": 7119 }, { "epoch": 0.4987435095922316, "grad_norm": 3.844834566116333, "learning_rate": 5.017246935201401e-05, "loss": 1.2369, "num_input_tokens_seen": 114559728, "step": 7120 }, { "epoch": 0.4988135578379609, "grad_norm": 4.457134246826172, "learning_rate": 5.0165471103327496e-05, "loss": 1.061, "num_input_tokens_seen": 114575688, "step": 7121 }, { "epoch": 0.49888360608369015, "grad_norm": 4.241283893585205, "learning_rate": 5.015847285464098e-05, "loss": 1.1434, "num_input_tokens_seen": 114590680, "step": 7122 }, { "epoch": 0.4989536543294194, "grad_norm": 3.7781248092651367, "learning_rate": 5.015147460595446e-05, "loss": 0.9872, "num_input_tokens_seen": 114606960, "step": 7123 }, { "epoch": 0.4990237025751486, "grad_norm": 5.492437839508057, "learning_rate": 5.0144476357267955e-05, "loss": 1.0772, "num_input_tokens_seen": 114623344, "step": 7124 }, { "epoch": 0.49909375082087787, "grad_norm": 5.001891613006592, "learning_rate": 5.013747810858145e-05, "loss": 1.1999, "num_input_tokens_seen": 114639728, "step": 7125 }, { "epoch": 0.49916379906660713, "grad_norm": 3.78376841545105, "learning_rate": 5.013047985989493e-05, "loss": 1.1275, "num_input_tokens_seen": 114655984, "step": 7126 }, { "epoch": 0.4992338473123364, "grad_norm": 5.250494956970215, "learning_rate": 5.0123481611208414e-05, "loss": 1.1063, "num_input_tokens_seen": 114670912, "step": 7127 }, { "epoch": 0.4993038955580656, "grad_norm": 3.8290820121765137, "learning_rate": 5.01164833625219e-05, "loss": 0.9789, "num_input_tokens_seen": 114687240, "step": 7128 }, { "epoch": 0.49937394380379485, "grad_norm": 5.523165225982666, "learning_rate": 5.010948511383538e-05, "loss": 0.977, "num_input_tokens_seen": 114703616, "step": 7129 }, { "epoch": 0.4994439920495241, "grad_norm": 3.838224172592163, "learning_rate": 5.010248686514886e-05, "loss": 0.9461, "num_input_tokens_seen": 114720000, "step": 7130 }, { "epoch": 0.49951404029525337, "grad_norm": 3.751004457473755, "learning_rate": 5.009548861646235e-05, "loss": 1.0345, "num_input_tokens_seen": 114735624, "step": 7131 }, { "epoch": 0.49958408854098263, "grad_norm": 4.485782146453857, "learning_rate": 5.0088490367775843e-05, "loss": 1.2378, "num_input_tokens_seen": 114751040, "step": 7132 }, { "epoch": 0.49965413678671183, "grad_norm": 4.896092891693115, "learning_rate": 5.0081492119089325e-05, "loss": 1.1213, "num_input_tokens_seen": 114766096, "step": 7133 }, { "epoch": 0.4997241850324411, "grad_norm": 4.27908182144165, "learning_rate": 5.007449387040281e-05, "loss": 0.9422, "num_input_tokens_seen": 114782480, "step": 7134 }, { "epoch": 0.49979423327817035, "grad_norm": 5.309985160827637, "learning_rate": 5.0067495621716296e-05, "loss": 1.1707, "num_input_tokens_seen": 114798864, "step": 7135 }, { "epoch": 0.4998642815238996, "grad_norm": 3.838355302810669, "learning_rate": 5.006049737302978e-05, "loss": 1.243, "num_input_tokens_seen": 114814680, "step": 7136 }, { "epoch": 0.4999343297696288, "grad_norm": 3.9620189666748047, "learning_rate": 5.005349912434325e-05, "loss": 1.025, "num_input_tokens_seen": 114831064, "step": 7137 }, { "epoch": 0.5000043780153581, "grad_norm": 3.4240174293518066, "learning_rate": 5.004650087565674e-05, "loss": 1.0205, "num_input_tokens_seen": 114847448, "step": 7138 }, { "epoch": 0.5000744262610873, "grad_norm": 3.603026866912842, "learning_rate": 5.003950262697022e-05, "loss": 1.0196, "num_input_tokens_seen": 114863832, "step": 7139 }, { "epoch": 0.5001444745068165, "grad_norm": 4.349592208862305, "learning_rate": 5.0032504378283705e-05, "loss": 1.1059, "num_input_tokens_seen": 114879200, "step": 7140 }, { "epoch": 0.5002145227525459, "grad_norm": 5.716104984283447, "learning_rate": 5.00255061295972e-05, "loss": 1.0465, "num_input_tokens_seen": 114894880, "step": 7141 }, { "epoch": 0.5002845709982751, "grad_norm": 3.857797384262085, "learning_rate": 5.0018507880910695e-05, "loss": 1.0438, "num_input_tokens_seen": 114911264, "step": 7142 }, { "epoch": 0.5003546192440043, "grad_norm": 3.7292556762695312, "learning_rate": 5.001150963222417e-05, "loss": 1.0803, "num_input_tokens_seen": 114926792, "step": 7143 }, { "epoch": 0.5004246674897336, "grad_norm": 4.02719783782959, "learning_rate": 5.000451138353765e-05, "loss": 0.9635, "num_input_tokens_seen": 114942944, "step": 7144 }, { "epoch": 0.5004947157354628, "grad_norm": 5.39168119430542, "learning_rate": 4.9997513134851134e-05, "loss": 1.0898, "num_input_tokens_seen": 114958800, "step": 7145 }, { "epoch": 0.5005647639811921, "grad_norm": 4.773622512817383, "learning_rate": 4.999051488616463e-05, "loss": 1.1112, "num_input_tokens_seen": 114974664, "step": 7146 }, { "epoch": 0.5006348122269213, "grad_norm": 3.635557174682617, "learning_rate": 4.998351663747812e-05, "loss": 0.9355, "num_input_tokens_seen": 114990728, "step": 7147 }, { "epoch": 0.5007048604726505, "grad_norm": 4.165726661682129, "learning_rate": 4.997651838879159e-05, "loss": 1.1044, "num_input_tokens_seen": 115006552, "step": 7148 }, { "epoch": 0.5007749087183798, "grad_norm": 4.2835001945495605, "learning_rate": 4.996952014010508e-05, "loss": 0.937, "num_input_tokens_seen": 115022936, "step": 7149 }, { "epoch": 0.500844956964109, "grad_norm": 3.7588231563568115, "learning_rate": 4.9962521891418564e-05, "loss": 1.0578, "num_input_tokens_seen": 115038832, "step": 7150 }, { "epoch": 0.5009150052098382, "grad_norm": 4.017446041107178, "learning_rate": 4.995552364273205e-05, "loss": 1.0177, "num_input_tokens_seen": 115055216, "step": 7151 }, { "epoch": 0.5009850534555675, "grad_norm": 4.145601749420166, "learning_rate": 4.994852539404554e-05, "loss": 1.1925, "num_input_tokens_seen": 115071600, "step": 7152 }, { "epoch": 0.5010551017012967, "grad_norm": 4.027134895324707, "learning_rate": 4.994152714535902e-05, "loss": 1.0115, "num_input_tokens_seen": 115087504, "step": 7153 }, { "epoch": 0.5011251499470261, "grad_norm": 4.185591697692871, "learning_rate": 4.993452889667251e-05, "loss": 1.012, "num_input_tokens_seen": 115102936, "step": 7154 }, { "epoch": 0.5011951981927553, "grad_norm": 3.262739658355713, "learning_rate": 4.9927530647985986e-05, "loss": 0.9424, "num_input_tokens_seen": 115119320, "step": 7155 }, { "epoch": 0.5012652464384845, "grad_norm": 3.514493465423584, "learning_rate": 4.992053239929948e-05, "loss": 0.9523, "num_input_tokens_seen": 115135704, "step": 7156 }, { "epoch": 0.5013352946842138, "grad_norm": 3.2577719688415527, "learning_rate": 4.991353415061297e-05, "loss": 0.9603, "num_input_tokens_seen": 115152088, "step": 7157 }, { "epoch": 0.501405342929943, "grad_norm": 4.475879669189453, "learning_rate": 4.9906535901926445e-05, "loss": 1.1393, "num_input_tokens_seen": 115168472, "step": 7158 }, { "epoch": 0.5014753911756722, "grad_norm": 4.558653354644775, "learning_rate": 4.9899537653239934e-05, "loss": 0.9628, "num_input_tokens_seen": 115184136, "step": 7159 }, { "epoch": 0.5015454394214015, "grad_norm": 4.034858703613281, "learning_rate": 4.9892539404553416e-05, "loss": 1.2669, "num_input_tokens_seen": 115200520, "step": 7160 }, { "epoch": 0.5016154876671307, "grad_norm": 4.190174579620361, "learning_rate": 4.9885541155866904e-05, "loss": 1.0478, "num_input_tokens_seen": 115216904, "step": 7161 }, { "epoch": 0.50168553591286, "grad_norm": 4.808748245239258, "learning_rate": 4.987854290718039e-05, "loss": 1.2661, "num_input_tokens_seen": 115232864, "step": 7162 }, { "epoch": 0.5017555841585892, "grad_norm": 3.7075023651123047, "learning_rate": 4.9871544658493875e-05, "loss": 1.1328, "num_input_tokens_seen": 115248544, "step": 7163 }, { "epoch": 0.5018256324043184, "grad_norm": 3.6593689918518066, "learning_rate": 4.986454640980736e-05, "loss": 1.0653, "num_input_tokens_seen": 115264616, "step": 7164 }, { "epoch": 0.5018956806500477, "grad_norm": 3.959949493408203, "learning_rate": 4.985754816112084e-05, "loss": 1.0708, "num_input_tokens_seen": 115281000, "step": 7165 }, { "epoch": 0.501965728895777, "grad_norm": 3.6724140644073486, "learning_rate": 4.9850549912434334e-05, "loss": 1.1306, "num_input_tokens_seen": 115297384, "step": 7166 }, { "epoch": 0.5020357771415063, "grad_norm": 3.9350247383117676, "learning_rate": 4.984355166374781e-05, "loss": 1.1785, "num_input_tokens_seen": 115312760, "step": 7167 }, { "epoch": 0.5021058253872355, "grad_norm": 3.8056607246398926, "learning_rate": 4.98365534150613e-05, "loss": 0.8915, "num_input_tokens_seen": 115328336, "step": 7168 }, { "epoch": 0.5021758736329647, "grad_norm": 3.995048761367798, "learning_rate": 4.9829555166374786e-05, "loss": 1.0687, "num_input_tokens_seen": 115344016, "step": 7169 }, { "epoch": 0.502245921878694, "grad_norm": 4.534327983856201, "learning_rate": 4.982255691768827e-05, "loss": 0.9757, "num_input_tokens_seen": 115359976, "step": 7170 }, { "epoch": 0.5023159701244232, "grad_norm": 5.29775333404541, "learning_rate": 4.9815558669001756e-05, "loss": 0.9172, "num_input_tokens_seen": 115375480, "step": 7171 }, { "epoch": 0.5023860183701524, "grad_norm": 3.8773534297943115, "learning_rate": 4.980856042031524e-05, "loss": 1.0997, "num_input_tokens_seen": 115391632, "step": 7172 }, { "epoch": 0.5024560666158817, "grad_norm": 4.249567985534668, "learning_rate": 4.9801562171628727e-05, "loss": 1.1145, "num_input_tokens_seen": 115408016, "step": 7173 }, { "epoch": 0.5025261148616109, "grad_norm": 4.293243408203125, "learning_rate": 4.9794563922942215e-05, "loss": 1.4234, "num_input_tokens_seen": 115423808, "step": 7174 }, { "epoch": 0.5025961631073402, "grad_norm": 4.535524845123291, "learning_rate": 4.978756567425569e-05, "loss": 0.9403, "num_input_tokens_seen": 115440192, "step": 7175 }, { "epoch": 0.5026662113530694, "grad_norm": 4.3390631675720215, "learning_rate": 4.9780567425569186e-05, "loss": 1.3728, "num_input_tokens_seen": 115455416, "step": 7176 }, { "epoch": 0.5027362595987986, "grad_norm": 3.630815267562866, "learning_rate": 4.977356917688266e-05, "loss": 1.0493, "num_input_tokens_seen": 115471800, "step": 7177 }, { "epoch": 0.502806307844528, "grad_norm": 3.9146728515625, "learning_rate": 4.976657092819615e-05, "loss": 1.0058, "num_input_tokens_seen": 115488184, "step": 7178 }, { "epoch": 0.5028763560902572, "grad_norm": 4.978190898895264, "learning_rate": 4.975957267950964e-05, "loss": 1.0014, "num_input_tokens_seen": 115503792, "step": 7179 }, { "epoch": 0.5029464043359864, "grad_norm": 3.8975963592529297, "learning_rate": 4.975257443082312e-05, "loss": 1.1317, "num_input_tokens_seen": 115519576, "step": 7180 }, { "epoch": 0.5030164525817157, "grad_norm": 4.439699649810791, "learning_rate": 4.974557618213661e-05, "loss": 1.0718, "num_input_tokens_seen": 115535512, "step": 7181 }, { "epoch": 0.5030865008274449, "grad_norm": 4.4080610275268555, "learning_rate": 4.973857793345009e-05, "loss": 1.3639, "num_input_tokens_seen": 115551880, "step": 7182 }, { "epoch": 0.5031565490731742, "grad_norm": 3.8968825340270996, "learning_rate": 4.973157968476358e-05, "loss": 1.1686, "num_input_tokens_seen": 115568136, "step": 7183 }, { "epoch": 0.5032265973189034, "grad_norm": 4.030379295349121, "learning_rate": 4.972458143607707e-05, "loss": 0.9829, "num_input_tokens_seen": 115583928, "step": 7184 }, { "epoch": 0.5032966455646326, "grad_norm": 4.46726131439209, "learning_rate": 4.971758318739054e-05, "loss": 1.1789, "num_input_tokens_seen": 115600016, "step": 7185 }, { "epoch": 0.5033666938103619, "grad_norm": 4.490327835083008, "learning_rate": 4.971058493870404e-05, "loss": 0.968, "num_input_tokens_seen": 115616400, "step": 7186 }, { "epoch": 0.5034367420560911, "grad_norm": 5.678159713745117, "learning_rate": 4.970358669001751e-05, "loss": 1.2517, "num_input_tokens_seen": 115632424, "step": 7187 }, { "epoch": 0.5035067903018203, "grad_norm": 4.695899963378906, "learning_rate": 4.9696588441331e-05, "loss": 1.2681, "num_input_tokens_seen": 115648808, "step": 7188 }, { "epoch": 0.5035768385475496, "grad_norm": 3.6823155879974365, "learning_rate": 4.968959019264449e-05, "loss": 1.0642, "num_input_tokens_seen": 115665192, "step": 7189 }, { "epoch": 0.5036468867932788, "grad_norm": 5.105508804321289, "learning_rate": 4.968259194395797e-05, "loss": 1.1646, "num_input_tokens_seen": 115681472, "step": 7190 }, { "epoch": 0.5037169350390082, "grad_norm": 4.0591607093811035, "learning_rate": 4.967559369527146e-05, "loss": 1.1231, "num_input_tokens_seen": 115697248, "step": 7191 }, { "epoch": 0.5037869832847374, "grad_norm": 4.097674369812012, "learning_rate": 4.966859544658494e-05, "loss": 0.9121, "num_input_tokens_seen": 115713632, "step": 7192 }, { "epoch": 0.5038570315304666, "grad_norm": 3.711235523223877, "learning_rate": 4.966159719789843e-05, "loss": 0.9402, "num_input_tokens_seen": 115730016, "step": 7193 }, { "epoch": 0.5039270797761959, "grad_norm": 3.9073588848114014, "learning_rate": 4.9654598949211906e-05, "loss": 1.0806, "num_input_tokens_seen": 115745872, "step": 7194 }, { "epoch": 0.5039971280219251, "grad_norm": 3.230870008468628, "learning_rate": 4.9647600700525394e-05, "loss": 0.8976, "num_input_tokens_seen": 115762256, "step": 7195 }, { "epoch": 0.5040671762676543, "grad_norm": 4.253819942474365, "learning_rate": 4.964060245183889e-05, "loss": 1.1024, "num_input_tokens_seen": 115778640, "step": 7196 }, { "epoch": 0.5041372245133836, "grad_norm": 3.6932590007781982, "learning_rate": 4.9633604203152365e-05, "loss": 1.1522, "num_input_tokens_seen": 115795024, "step": 7197 }, { "epoch": 0.5042072727591128, "grad_norm": 4.178073883056641, "learning_rate": 4.962660595446585e-05, "loss": 1.1629, "num_input_tokens_seen": 115811408, "step": 7198 }, { "epoch": 0.5042773210048421, "grad_norm": 3.4744091033935547, "learning_rate": 4.9619607705779335e-05, "loss": 0.9078, "num_input_tokens_seen": 115827792, "step": 7199 }, { "epoch": 0.5043473692505713, "grad_norm": 5.810272216796875, "learning_rate": 4.9612609457092824e-05, "loss": 1.0187, "num_input_tokens_seen": 115843352, "step": 7200 }, { "epoch": 0.5043473692505713, "eval_loss": 1.1233556270599365, "eval_runtime": 0.159, "eval_samples_per_second": 6.288, "eval_steps_per_second": 6.288, "num_input_tokens_seen": 115843352, "step": 7200 }, { "epoch": 0.5044174174963005, "grad_norm": 4.0738205909729, "learning_rate": 4.960561120840631e-05, "loss": 1.2362, "num_input_tokens_seen": 115859216, "step": 7201 }, { "epoch": 0.5044874657420299, "grad_norm": 4.0072102546691895, "learning_rate": 4.9598612959719794e-05, "loss": 1.1624, "num_input_tokens_seen": 115875600, "step": 7202 }, { "epoch": 0.504557513987759, "grad_norm": 5.232552528381348, "learning_rate": 4.959161471103328e-05, "loss": 0.9148, "num_input_tokens_seen": 115890976, "step": 7203 }, { "epoch": 0.5046275622334884, "grad_norm": 4.6930623054504395, "learning_rate": 4.958461646234676e-05, "loss": 1.0314, "num_input_tokens_seen": 115907360, "step": 7204 }, { "epoch": 0.5046976104792176, "grad_norm": 5.217222690582275, "learning_rate": 4.9577618213660246e-05, "loss": 1.1316, "num_input_tokens_seen": 115923320, "step": 7205 }, { "epoch": 0.5047676587249468, "grad_norm": 3.999408006668091, "learning_rate": 4.957061996497374e-05, "loss": 0.9769, "num_input_tokens_seen": 115939704, "step": 7206 }, { "epoch": 0.5048377069706761, "grad_norm": 4.267052173614502, "learning_rate": 4.956362171628722e-05, "loss": 1.006, "num_input_tokens_seen": 115955592, "step": 7207 }, { "epoch": 0.5049077552164053, "grad_norm": 4.446041584014893, "learning_rate": 4.9556623467600705e-05, "loss": 1.2351, "num_input_tokens_seen": 115971976, "step": 7208 }, { "epoch": 0.5049778034621345, "grad_norm": 3.8210396766662598, "learning_rate": 4.954962521891419e-05, "loss": 1.1836, "num_input_tokens_seen": 115987528, "step": 7209 }, { "epoch": 0.5050478517078638, "grad_norm": 5.992397785186768, "learning_rate": 4.9542626970227676e-05, "loss": 1.0428, "num_input_tokens_seen": 116003912, "step": 7210 }, { "epoch": 0.505117899953593, "grad_norm": 3.934375524520874, "learning_rate": 4.9535628721541164e-05, "loss": 0.9858, "num_input_tokens_seen": 116020296, "step": 7211 }, { "epoch": 0.5051879481993223, "grad_norm": 3.936866521835327, "learning_rate": 4.9528630472854646e-05, "loss": 1.1011, "num_input_tokens_seen": 116036600, "step": 7212 }, { "epoch": 0.5052579964450515, "grad_norm": 4.908316135406494, "learning_rate": 4.9521632224168135e-05, "loss": 0.8953, "num_input_tokens_seen": 116052984, "step": 7213 }, { "epoch": 0.5053280446907807, "grad_norm": 4.035202503204346, "learning_rate": 4.951463397548161e-05, "loss": 1.1475, "num_input_tokens_seen": 116068768, "step": 7214 }, { "epoch": 0.5053980929365101, "grad_norm": 3.7488014698028564, "learning_rate": 4.95076357267951e-05, "loss": 0.9506, "num_input_tokens_seen": 116085152, "step": 7215 }, { "epoch": 0.5054681411822393, "grad_norm": 5.226819038391113, "learning_rate": 4.9500637478108594e-05, "loss": 0.9878, "num_input_tokens_seen": 116100176, "step": 7216 }, { "epoch": 0.5055381894279685, "grad_norm": 4.0122857093811035, "learning_rate": 4.949363922942207e-05, "loss": 1.1275, "num_input_tokens_seen": 116116560, "step": 7217 }, { "epoch": 0.5056082376736978, "grad_norm": 4.160411834716797, "learning_rate": 4.948664098073556e-05, "loss": 0.9622, "num_input_tokens_seen": 116132464, "step": 7218 }, { "epoch": 0.505678285919427, "grad_norm": 4.860180377960205, "learning_rate": 4.947964273204904e-05, "loss": 0.9896, "num_input_tokens_seen": 116148848, "step": 7219 }, { "epoch": 0.5057483341651563, "grad_norm": 4.549893856048584, "learning_rate": 4.947264448336253e-05, "loss": 1.0893, "num_input_tokens_seen": 116165128, "step": 7220 }, { "epoch": 0.5058183824108855, "grad_norm": 3.4614131450653076, "learning_rate": 4.9465646234676e-05, "loss": 0.7666, "num_input_tokens_seen": 116181152, "step": 7221 }, { "epoch": 0.5058884306566147, "grad_norm": 4.7237043380737305, "learning_rate": 4.94586479859895e-05, "loss": 0.949, "num_input_tokens_seen": 116197032, "step": 7222 }, { "epoch": 0.505958478902344, "grad_norm": 4.4195098876953125, "learning_rate": 4.945164973730299e-05, "loss": 0.985, "num_input_tokens_seen": 116212752, "step": 7223 }, { "epoch": 0.5060285271480732, "grad_norm": 3.6815669536590576, "learning_rate": 4.944465148861646e-05, "loss": 0.9769, "num_input_tokens_seen": 116229024, "step": 7224 }, { "epoch": 0.5060985753938024, "grad_norm": 3.776644229888916, "learning_rate": 4.943765323992995e-05, "loss": 0.9953, "num_input_tokens_seen": 116245408, "step": 7225 }, { "epoch": 0.5061686236395317, "grad_norm": 4.3324761390686035, "learning_rate": 4.943065499124343e-05, "loss": 1.058, "num_input_tokens_seen": 116261192, "step": 7226 }, { "epoch": 0.506238671885261, "grad_norm": 3.499302387237549, "learning_rate": 4.942365674255692e-05, "loss": 1.1726, "num_input_tokens_seen": 116277576, "step": 7227 }, { "epoch": 0.5063087201309903, "grad_norm": 3.5195088386535645, "learning_rate": 4.941665849387041e-05, "loss": 0.8901, "num_input_tokens_seen": 116293960, "step": 7228 }, { "epoch": 0.5063787683767195, "grad_norm": 4.266250133514404, "learning_rate": 4.940966024518389e-05, "loss": 1.0067, "num_input_tokens_seen": 116310344, "step": 7229 }, { "epoch": 0.5064488166224487, "grad_norm": 4.53155517578125, "learning_rate": 4.940266199649738e-05, "loss": 1.1207, "num_input_tokens_seen": 116326536, "step": 7230 }, { "epoch": 0.506518864868178, "grad_norm": 5.224839210510254, "learning_rate": 4.9395663747810855e-05, "loss": 1.1932, "num_input_tokens_seen": 116342792, "step": 7231 }, { "epoch": 0.5065889131139072, "grad_norm": 4.072076797485352, "learning_rate": 4.938866549912435e-05, "loss": 1.1041, "num_input_tokens_seen": 116359144, "step": 7232 }, { "epoch": 0.5066589613596364, "grad_norm": 4.286440372467041, "learning_rate": 4.938166725043784e-05, "loss": 1.1439, "num_input_tokens_seen": 116375128, "step": 7233 }, { "epoch": 0.5067290096053657, "grad_norm": 3.684030055999756, "learning_rate": 4.9374669001751314e-05, "loss": 0.8753, "num_input_tokens_seen": 116391448, "step": 7234 }, { "epoch": 0.5067990578510949, "grad_norm": 3.721698045730591, "learning_rate": 4.93676707530648e-05, "loss": 1.0196, "num_input_tokens_seen": 116407832, "step": 7235 }, { "epoch": 0.5068691060968242, "grad_norm": 3.5029869079589844, "learning_rate": 4.9360672504378284e-05, "loss": 1.0157, "num_input_tokens_seen": 116424216, "step": 7236 }, { "epoch": 0.5069391543425534, "grad_norm": 3.960109233856201, "learning_rate": 4.935367425569177e-05, "loss": 1.2421, "num_input_tokens_seen": 116440600, "step": 7237 }, { "epoch": 0.5070092025882826, "grad_norm": 3.7146995067596436, "learning_rate": 4.934667600700526e-05, "loss": 1.0466, "num_input_tokens_seen": 116455776, "step": 7238 }, { "epoch": 0.507079250834012, "grad_norm": 4.000344753265381, "learning_rate": 4.933967775831874e-05, "loss": 1.0173, "num_input_tokens_seen": 116472160, "step": 7239 }, { "epoch": 0.5071492990797412, "grad_norm": 4.015896320343018, "learning_rate": 4.933267950963223e-05, "loss": 1.0865, "num_input_tokens_seen": 116488440, "step": 7240 }, { "epoch": 0.5072193473254705, "grad_norm": 4.240390777587891, "learning_rate": 4.932568126094571e-05, "loss": 0.9658, "num_input_tokens_seen": 116504112, "step": 7241 }, { "epoch": 0.5072893955711997, "grad_norm": 4.051314353942871, "learning_rate": 4.93186830122592e-05, "loss": 1.1068, "num_input_tokens_seen": 116519976, "step": 7242 }, { "epoch": 0.5073594438169289, "grad_norm": 4.370121955871582, "learning_rate": 4.931168476357269e-05, "loss": 1.1409, "num_input_tokens_seen": 116536360, "step": 7243 }, { "epoch": 0.5074294920626582, "grad_norm": 3.7158761024475098, "learning_rate": 4.9304686514886166e-05, "loss": 1.0234, "num_input_tokens_seen": 116552744, "step": 7244 }, { "epoch": 0.5074995403083874, "grad_norm": 3.6040024757385254, "learning_rate": 4.9297688266199654e-05, "loss": 0.8008, "num_input_tokens_seen": 116568088, "step": 7245 }, { "epoch": 0.5075695885541166, "grad_norm": 5.175736904144287, "learning_rate": 4.9290690017513136e-05, "loss": 1.09, "num_input_tokens_seen": 116584152, "step": 7246 }, { "epoch": 0.5076396367998459, "grad_norm": 4.735289573669434, "learning_rate": 4.9283691768826625e-05, "loss": 1.093, "num_input_tokens_seen": 116599848, "step": 7247 }, { "epoch": 0.5077096850455751, "grad_norm": 5.659826278686523, "learning_rate": 4.9276693520140107e-05, "loss": 1.1248, "num_input_tokens_seen": 116615216, "step": 7248 }, { "epoch": 0.5077797332913044, "grad_norm": 4.524930000305176, "learning_rate": 4.9269695271453595e-05, "loss": 1.0244, "num_input_tokens_seen": 116631328, "step": 7249 }, { "epoch": 0.5078497815370336, "grad_norm": 3.6031768321990967, "learning_rate": 4.9262697022767084e-05, "loss": 1.0042, "num_input_tokens_seen": 116647016, "step": 7250 }, { "epoch": 0.5079198297827628, "grad_norm": 3.953381299972534, "learning_rate": 4.925569877408056e-05, "loss": 0.9817, "num_input_tokens_seen": 116663400, "step": 7251 }, { "epoch": 0.5079898780284922, "grad_norm": 7.162896633148193, "learning_rate": 4.9248700525394054e-05, "loss": 1.2361, "num_input_tokens_seen": 116679456, "step": 7252 }, { "epoch": 0.5080599262742214, "grad_norm": 3.416929006576538, "learning_rate": 4.924170227670753e-05, "loss": 1.0067, "num_input_tokens_seen": 116695840, "step": 7253 }, { "epoch": 0.5081299745199506, "grad_norm": 3.542628049850464, "learning_rate": 4.923470402802102e-05, "loss": 0.8451, "num_input_tokens_seen": 116712224, "step": 7254 }, { "epoch": 0.5082000227656799, "grad_norm": 5.850252151489258, "learning_rate": 4.9227705779334506e-05, "loss": 1.0186, "num_input_tokens_seen": 116728608, "step": 7255 }, { "epoch": 0.5082700710114091, "grad_norm": 4.921962261199951, "learning_rate": 4.922070753064799e-05, "loss": 1.0944, "num_input_tokens_seen": 116744992, "step": 7256 }, { "epoch": 0.5083401192571384, "grad_norm": 5.621464252471924, "learning_rate": 4.921370928196148e-05, "loss": 1.0273, "num_input_tokens_seen": 116761376, "step": 7257 }, { "epoch": 0.5084101675028676, "grad_norm": 3.8369336128234863, "learning_rate": 4.920671103327496e-05, "loss": 1.0421, "num_input_tokens_seen": 116777656, "step": 7258 }, { "epoch": 0.5084802157485968, "grad_norm": 4.033676624298096, "learning_rate": 4.919971278458845e-05, "loss": 1.0248, "num_input_tokens_seen": 116793432, "step": 7259 }, { "epoch": 0.5085502639943261, "grad_norm": 5.80480432510376, "learning_rate": 4.9192714535901936e-05, "loss": 0.9104, "num_input_tokens_seen": 116809816, "step": 7260 }, { "epoch": 0.5086203122400553, "grad_norm": 4.646456241607666, "learning_rate": 4.918571628721541e-05, "loss": 1.1542, "num_input_tokens_seen": 116826024, "step": 7261 }, { "epoch": 0.5086903604857845, "grad_norm": 5.681286811828613, "learning_rate": 4.9178718038528906e-05, "loss": 1.0206, "num_input_tokens_seen": 116841176, "step": 7262 }, { "epoch": 0.5087604087315138, "grad_norm": 4.391019821166992, "learning_rate": 4.917171978984238e-05, "loss": 0.9668, "num_input_tokens_seen": 116857560, "step": 7263 }, { "epoch": 0.508830456977243, "grad_norm": 3.69963002204895, "learning_rate": 4.916472154115587e-05, "loss": 1.1773, "num_input_tokens_seen": 116873872, "step": 7264 }, { "epoch": 0.5089005052229724, "grad_norm": 4.748251914978027, "learning_rate": 4.915772329246936e-05, "loss": 1.1401, "num_input_tokens_seen": 116890256, "step": 7265 }, { "epoch": 0.5089705534687016, "grad_norm": 4.31103515625, "learning_rate": 4.915072504378284e-05, "loss": 1.0757, "num_input_tokens_seen": 116906640, "step": 7266 }, { "epoch": 0.5090406017144308, "grad_norm": 4.616788864135742, "learning_rate": 4.914372679509633e-05, "loss": 1.0779, "num_input_tokens_seen": 116923024, "step": 7267 }, { "epoch": 0.5091106499601601, "grad_norm": 5.435046672821045, "learning_rate": 4.913672854640981e-05, "loss": 1.1371, "num_input_tokens_seen": 116939264, "step": 7268 }, { "epoch": 0.5091806982058893, "grad_norm": 3.534294843673706, "learning_rate": 4.91297302977233e-05, "loss": 0.9129, "num_input_tokens_seen": 116955648, "step": 7269 }, { "epoch": 0.5092507464516186, "grad_norm": 4.072021961212158, "learning_rate": 4.912273204903679e-05, "loss": 1.0451, "num_input_tokens_seen": 116970936, "step": 7270 }, { "epoch": 0.5093207946973478, "grad_norm": 3.853341817855835, "learning_rate": 4.911573380035026e-05, "loss": 1.0749, "num_input_tokens_seen": 116987064, "step": 7271 }, { "epoch": 0.509390842943077, "grad_norm": 5.853321552276611, "learning_rate": 4.910873555166375e-05, "loss": 1.0245, "num_input_tokens_seen": 117002752, "step": 7272 }, { "epoch": 0.5094608911888063, "grad_norm": 3.7305798530578613, "learning_rate": 4.910173730297723e-05, "loss": 1.0011, "num_input_tokens_seen": 117019136, "step": 7273 }, { "epoch": 0.5095309394345355, "grad_norm": 3.9224064350128174, "learning_rate": 4.909473905429072e-05, "loss": 1.0884, "num_input_tokens_seen": 117035504, "step": 7274 }, { "epoch": 0.5096009876802647, "grad_norm": 3.633242130279541, "learning_rate": 4.9087740805604204e-05, "loss": 1.0027, "num_input_tokens_seen": 117051888, "step": 7275 }, { "epoch": 0.509671035925994, "grad_norm": 3.7328341007232666, "learning_rate": 4.908074255691769e-05, "loss": 1.1986, "num_input_tokens_seen": 117068272, "step": 7276 }, { "epoch": 0.5097410841717233, "grad_norm": 3.492896556854248, "learning_rate": 4.907374430823118e-05, "loss": 0.8862, "num_input_tokens_seen": 117084656, "step": 7277 }, { "epoch": 0.5098111324174526, "grad_norm": 3.8437516689300537, "learning_rate": 4.906674605954466e-05, "loss": 1.1177, "num_input_tokens_seen": 117100432, "step": 7278 }, { "epoch": 0.5098811806631818, "grad_norm": 3.537297487258911, "learning_rate": 4.905974781085815e-05, "loss": 1.1694, "num_input_tokens_seen": 117116544, "step": 7279 }, { "epoch": 0.509951228908911, "grad_norm": 3.6758127212524414, "learning_rate": 4.9052749562171626e-05, "loss": 0.8982, "num_input_tokens_seen": 117132376, "step": 7280 }, { "epoch": 0.5100212771546403, "grad_norm": 3.7280797958374023, "learning_rate": 4.9045751313485115e-05, "loss": 1.013, "num_input_tokens_seen": 117148760, "step": 7281 }, { "epoch": 0.5100913254003695, "grad_norm": 4.579721927642822, "learning_rate": 4.9038753064798603e-05, "loss": 1.1065, "num_input_tokens_seen": 117165144, "step": 7282 }, { "epoch": 0.5101613736460987, "grad_norm": 4.146833896636963, "learning_rate": 4.9031754816112085e-05, "loss": 1.1848, "num_input_tokens_seen": 117180472, "step": 7283 }, { "epoch": 0.510231421891828, "grad_norm": 3.7897355556488037, "learning_rate": 4.9024756567425574e-05, "loss": 1.0082, "num_input_tokens_seen": 117196856, "step": 7284 }, { "epoch": 0.5103014701375572, "grad_norm": 3.821641206741333, "learning_rate": 4.9017758318739056e-05, "loss": 1.0827, "num_input_tokens_seen": 117213240, "step": 7285 }, { "epoch": 0.5103715183832865, "grad_norm": 4.439133644104004, "learning_rate": 4.9010760070052544e-05, "loss": 1.0325, "num_input_tokens_seen": 117229240, "step": 7286 }, { "epoch": 0.5104415666290157, "grad_norm": 4.781843185424805, "learning_rate": 4.900376182136603e-05, "loss": 1.0871, "num_input_tokens_seen": 117245624, "step": 7287 }, { "epoch": 0.510511614874745, "grad_norm": 3.928457736968994, "learning_rate": 4.8996763572679515e-05, "loss": 1.0049, "num_input_tokens_seen": 117261496, "step": 7288 }, { "epoch": 0.5105816631204743, "grad_norm": 3.8278815746307373, "learning_rate": 4.8989765323993e-05, "loss": 0.8442, "num_input_tokens_seen": 117277448, "step": 7289 }, { "epoch": 0.5106517113662035, "grad_norm": 3.8238277435302734, "learning_rate": 4.898276707530648e-05, "loss": 1.1188, "num_input_tokens_seen": 117293832, "step": 7290 }, { "epoch": 0.5107217596119327, "grad_norm": 3.835528612136841, "learning_rate": 4.897576882661997e-05, "loss": 0.9828, "num_input_tokens_seen": 117310216, "step": 7291 }, { "epoch": 0.510791807857662, "grad_norm": 3.516911029815674, "learning_rate": 4.8968770577933455e-05, "loss": 1.0539, "num_input_tokens_seen": 117326600, "step": 7292 }, { "epoch": 0.5108618561033912, "grad_norm": 4.772302150726318, "learning_rate": 4.896177232924694e-05, "loss": 1.1043, "num_input_tokens_seen": 117342984, "step": 7293 }, { "epoch": 0.5109319043491205, "grad_norm": 3.933194875717163, "learning_rate": 4.8954774080560426e-05, "loss": 1.1596, "num_input_tokens_seen": 117359024, "step": 7294 }, { "epoch": 0.5110019525948497, "grad_norm": 5.703490734100342, "learning_rate": 4.894777583187391e-05, "loss": 1.0191, "num_input_tokens_seen": 117375184, "step": 7295 }, { "epoch": 0.5110720008405789, "grad_norm": 5.454290866851807, "learning_rate": 4.8940777583187396e-05, "loss": 1.1642, "num_input_tokens_seen": 117391480, "step": 7296 }, { "epoch": 0.5111420490863082, "grad_norm": 4.424625396728516, "learning_rate": 4.8933779334500885e-05, "loss": 1.0596, "num_input_tokens_seen": 117407864, "step": 7297 }, { "epoch": 0.5112120973320374, "grad_norm": 3.7400286197662354, "learning_rate": 4.892678108581436e-05, "loss": 0.9681, "num_input_tokens_seen": 117424248, "step": 7298 }, { "epoch": 0.5112821455777666, "grad_norm": 4.604375839233398, "learning_rate": 4.8919782837127855e-05, "loss": 1.1543, "num_input_tokens_seen": 117440632, "step": 7299 }, { "epoch": 0.511352193823496, "grad_norm": 4.090991497039795, "learning_rate": 4.891278458844133e-05, "loss": 0.9973, "num_input_tokens_seen": 117457016, "step": 7300 }, { "epoch": 0.5114222420692252, "grad_norm": 4.32021427154541, "learning_rate": 4.890578633975482e-05, "loss": 1.1599, "num_input_tokens_seen": 117473400, "step": 7301 }, { "epoch": 0.5114922903149545, "grad_norm": 5.156586170196533, "learning_rate": 4.88987880910683e-05, "loss": 1.0992, "num_input_tokens_seen": 117489520, "step": 7302 }, { "epoch": 0.5115623385606837, "grad_norm": 3.4068989753723145, "learning_rate": 4.889178984238179e-05, "loss": 0.7612, "num_input_tokens_seen": 117505904, "step": 7303 }, { "epoch": 0.5116323868064129, "grad_norm": 4.153528690338135, "learning_rate": 4.888479159369528e-05, "loss": 1.0344, "num_input_tokens_seen": 117522056, "step": 7304 }, { "epoch": 0.5117024350521422, "grad_norm": 4.036544322967529, "learning_rate": 4.887779334500876e-05, "loss": 0.9494, "num_input_tokens_seen": 117538392, "step": 7305 }, { "epoch": 0.5117724832978714, "grad_norm": 3.522869110107422, "learning_rate": 4.887079509632225e-05, "loss": 1.0601, "num_input_tokens_seen": 117554776, "step": 7306 }, { "epoch": 0.5118425315436007, "grad_norm": 5.0436530113220215, "learning_rate": 4.886379684763572e-05, "loss": 1.3193, "num_input_tokens_seen": 117571160, "step": 7307 }, { "epoch": 0.5119125797893299, "grad_norm": 3.9105610847473145, "learning_rate": 4.885679859894921e-05, "loss": 1.087, "num_input_tokens_seen": 117587544, "step": 7308 }, { "epoch": 0.5119826280350591, "grad_norm": 3.6909308433532715, "learning_rate": 4.884980035026271e-05, "loss": 1.2365, "num_input_tokens_seen": 117603512, "step": 7309 }, { "epoch": 0.5120526762807884, "grad_norm": 4.3869404792785645, "learning_rate": 4.884280210157618e-05, "loss": 1.0075, "num_input_tokens_seen": 117619896, "step": 7310 }, { "epoch": 0.5121227245265176, "grad_norm": 4.207435607910156, "learning_rate": 4.883580385288967e-05, "loss": 1.2587, "num_input_tokens_seen": 117636280, "step": 7311 }, { "epoch": 0.5121927727722468, "grad_norm": 3.8486809730529785, "learning_rate": 4.882880560420315e-05, "loss": 0.9452, "num_input_tokens_seen": 117652664, "step": 7312 }, { "epoch": 0.5122628210179762, "grad_norm": 4.338857173919678, "learning_rate": 4.882180735551664e-05, "loss": 1.0159, "num_input_tokens_seen": 117669048, "step": 7313 }, { "epoch": 0.5123328692637054, "grad_norm": 4.424511432647705, "learning_rate": 4.881480910683013e-05, "loss": 1.1418, "num_input_tokens_seen": 117684464, "step": 7314 }, { "epoch": 0.5124029175094347, "grad_norm": 3.560134172439575, "learning_rate": 4.880781085814361e-05, "loss": 0.8744, "num_input_tokens_seen": 117700848, "step": 7315 }, { "epoch": 0.5124729657551639, "grad_norm": 3.687147378921509, "learning_rate": 4.88008126094571e-05, "loss": 1.0157, "num_input_tokens_seen": 117717232, "step": 7316 }, { "epoch": 0.5125430140008931, "grad_norm": 3.469698429107666, "learning_rate": 4.8793814360770575e-05, "loss": 1.0037, "num_input_tokens_seen": 117733616, "step": 7317 }, { "epoch": 0.5126130622466224, "grad_norm": 4.417529582977295, "learning_rate": 4.8786816112084064e-05, "loss": 1.0446, "num_input_tokens_seen": 117749912, "step": 7318 }, { "epoch": 0.5126831104923516, "grad_norm": 4.738529205322266, "learning_rate": 4.877981786339756e-05, "loss": 1.1557, "num_input_tokens_seen": 117766088, "step": 7319 }, { "epoch": 0.5127531587380808, "grad_norm": 3.9368979930877686, "learning_rate": 4.8772819614711034e-05, "loss": 0.9631, "num_input_tokens_seen": 117781896, "step": 7320 }, { "epoch": 0.5128232069838101, "grad_norm": 4.166690826416016, "learning_rate": 4.876582136602452e-05, "loss": 1.1633, "num_input_tokens_seen": 117798280, "step": 7321 }, { "epoch": 0.5128932552295393, "grad_norm": 5.667482376098633, "learning_rate": 4.8758823117338005e-05, "loss": 1.0399, "num_input_tokens_seen": 117812376, "step": 7322 }, { "epoch": 0.5129633034752686, "grad_norm": 4.0888776779174805, "learning_rate": 4.875182486865149e-05, "loss": 1.1515, "num_input_tokens_seen": 117828760, "step": 7323 }, { "epoch": 0.5130333517209978, "grad_norm": 3.7693824768066406, "learning_rate": 4.874482661996498e-05, "loss": 1.003, "num_input_tokens_seen": 117844312, "step": 7324 }, { "epoch": 0.513103399966727, "grad_norm": 4.041450500488281, "learning_rate": 4.8737828371278464e-05, "loss": 1.0271, "num_input_tokens_seen": 117860456, "step": 7325 }, { "epoch": 0.5131734482124564, "grad_norm": 3.697910785675049, "learning_rate": 4.873083012259195e-05, "loss": 1.0592, "num_input_tokens_seen": 117876840, "step": 7326 }, { "epoch": 0.5132434964581856, "grad_norm": 5.30276346206665, "learning_rate": 4.872383187390543e-05, "loss": 1.0356, "num_input_tokens_seen": 117891376, "step": 7327 }, { "epoch": 0.5133135447039148, "grad_norm": 4.165642261505127, "learning_rate": 4.8716833625218916e-05, "loss": 1.095, "num_input_tokens_seen": 117907080, "step": 7328 }, { "epoch": 0.5133835929496441, "grad_norm": 3.727694272994995, "learning_rate": 4.87098353765324e-05, "loss": 1.0567, "num_input_tokens_seen": 117923464, "step": 7329 }, { "epoch": 0.5134536411953733, "grad_norm": 3.4425323009490967, "learning_rate": 4.8702837127845886e-05, "loss": 1.002, "num_input_tokens_seen": 117939848, "step": 7330 }, { "epoch": 0.5135236894411026, "grad_norm": 5.108044147491455, "learning_rate": 4.8695838879159375e-05, "loss": 1.3818, "num_input_tokens_seen": 117956232, "step": 7331 }, { "epoch": 0.5135937376868318, "grad_norm": 3.598155975341797, "learning_rate": 4.868884063047286e-05, "loss": 0.8586, "num_input_tokens_seen": 117972616, "step": 7332 }, { "epoch": 0.513663785932561, "grad_norm": 5.529993057250977, "learning_rate": 4.8681842381786345e-05, "loss": 1.0782, "num_input_tokens_seen": 117988616, "step": 7333 }, { "epoch": 0.5137338341782903, "grad_norm": 3.80139422416687, "learning_rate": 4.867484413309982e-05, "loss": 0.9837, "num_input_tokens_seen": 118004720, "step": 7334 }, { "epoch": 0.5138038824240195, "grad_norm": 3.5874598026275635, "learning_rate": 4.8667845884413316e-05, "loss": 0.965, "num_input_tokens_seen": 118021008, "step": 7335 }, { "epoch": 0.5138739306697487, "grad_norm": 4.101084232330322, "learning_rate": 4.8660847635726804e-05, "loss": 1.1482, "num_input_tokens_seen": 118036976, "step": 7336 }, { "epoch": 0.513943978915478, "grad_norm": 4.202828407287598, "learning_rate": 4.865384938704028e-05, "loss": 0.9336, "num_input_tokens_seen": 118052632, "step": 7337 }, { "epoch": 0.5140140271612073, "grad_norm": 3.6700570583343506, "learning_rate": 4.864685113835377e-05, "loss": 1.0664, "num_input_tokens_seen": 118069016, "step": 7338 }, { "epoch": 0.5140840754069366, "grad_norm": 5.313467502593994, "learning_rate": 4.863985288966725e-05, "loss": 1.1133, "num_input_tokens_seen": 118085400, "step": 7339 }, { "epoch": 0.5141541236526658, "grad_norm": 3.78403639793396, "learning_rate": 4.863285464098074e-05, "loss": 1.1592, "num_input_tokens_seen": 118101784, "step": 7340 }, { "epoch": 0.514224171898395, "grad_norm": 5.284808158874512, "learning_rate": 4.862585639229423e-05, "loss": 1.0568, "num_input_tokens_seen": 118116584, "step": 7341 }, { "epoch": 0.5142942201441243, "grad_norm": 3.6075503826141357, "learning_rate": 4.861885814360771e-05, "loss": 1.0451, "num_input_tokens_seen": 118132968, "step": 7342 }, { "epoch": 0.5143642683898535, "grad_norm": 3.514549493789673, "learning_rate": 4.86118598949212e-05, "loss": 0.8536, "num_input_tokens_seen": 118148576, "step": 7343 }, { "epoch": 0.5144343166355828, "grad_norm": 4.394196510314941, "learning_rate": 4.860486164623467e-05, "loss": 0.9135, "num_input_tokens_seen": 118164168, "step": 7344 }, { "epoch": 0.514504364881312, "grad_norm": 4.1175737380981445, "learning_rate": 4.859786339754817e-05, "loss": 1.1392, "num_input_tokens_seen": 118180552, "step": 7345 }, { "epoch": 0.5145744131270412, "grad_norm": 4.6415581703186035, "learning_rate": 4.8590865148861656e-05, "loss": 1.1932, "num_input_tokens_seen": 118196936, "step": 7346 }, { "epoch": 0.5146444613727705, "grad_norm": 4.681972980499268, "learning_rate": 4.858386690017513e-05, "loss": 1.0245, "num_input_tokens_seen": 118213320, "step": 7347 }, { "epoch": 0.5147145096184997, "grad_norm": 6.289412021636963, "learning_rate": 4.857686865148862e-05, "loss": 1.0609, "num_input_tokens_seen": 118228648, "step": 7348 }, { "epoch": 0.5147845578642289, "grad_norm": 4.726010799407959, "learning_rate": 4.85698704028021e-05, "loss": 1.1635, "num_input_tokens_seen": 118244552, "step": 7349 }, { "epoch": 0.5148546061099583, "grad_norm": 3.7650058269500732, "learning_rate": 4.856287215411559e-05, "loss": 0.8969, "num_input_tokens_seen": 118260416, "step": 7350 }, { "epoch": 0.5149246543556875, "grad_norm": 5.283148765563965, "learning_rate": 4.855587390542907e-05, "loss": 1.1027, "num_input_tokens_seen": 118276800, "step": 7351 }, { "epoch": 0.5149947026014168, "grad_norm": 4.060834884643555, "learning_rate": 4.854887565674256e-05, "loss": 0.9387, "num_input_tokens_seen": 118293128, "step": 7352 }, { "epoch": 0.515064750847146, "grad_norm": 3.5670249462127686, "learning_rate": 4.854187740805605e-05, "loss": 0.9658, "num_input_tokens_seen": 118309512, "step": 7353 }, { "epoch": 0.5151347990928752, "grad_norm": 4.051480770111084, "learning_rate": 4.8534879159369524e-05, "loss": 1.1616, "num_input_tokens_seen": 118325640, "step": 7354 }, { "epoch": 0.5152048473386045, "grad_norm": 4.1467461585998535, "learning_rate": 4.852788091068302e-05, "loss": 1.0501, "num_input_tokens_seen": 118342024, "step": 7355 }, { "epoch": 0.5152748955843337, "grad_norm": 4.226484775543213, "learning_rate": 4.8520882661996495e-05, "loss": 1.3255, "num_input_tokens_seen": 118358408, "step": 7356 }, { "epoch": 0.5153449438300629, "grad_norm": 3.81172513961792, "learning_rate": 4.8513884413309983e-05, "loss": 0.9909, "num_input_tokens_seen": 118373000, "step": 7357 }, { "epoch": 0.5154149920757922, "grad_norm": 5.178154945373535, "learning_rate": 4.850688616462347e-05, "loss": 1.1296, "num_input_tokens_seen": 118389208, "step": 7358 }, { "epoch": 0.5154850403215214, "grad_norm": 3.971707582473755, "learning_rate": 4.8499887915936954e-05, "loss": 1.0011, "num_input_tokens_seen": 118405592, "step": 7359 }, { "epoch": 0.5155550885672507, "grad_norm": 3.4853761196136475, "learning_rate": 4.849288966725044e-05, "loss": 0.9744, "num_input_tokens_seen": 118421976, "step": 7360 }, { "epoch": 0.51562513681298, "grad_norm": 3.309765577316284, "learning_rate": 4.8485891418563924e-05, "loss": 0.9776, "num_input_tokens_seen": 118438232, "step": 7361 }, { "epoch": 0.5156951850587091, "grad_norm": 5.875191688537598, "learning_rate": 4.847889316987741e-05, "loss": 0.9788, "num_input_tokens_seen": 118453368, "step": 7362 }, { "epoch": 0.5157652333044385, "grad_norm": 3.633922815322876, "learning_rate": 4.84718949211909e-05, "loss": 0.9949, "num_input_tokens_seen": 118469752, "step": 7363 }, { "epoch": 0.5158352815501677, "grad_norm": 3.7608840465545654, "learning_rate": 4.8464896672504376e-05, "loss": 0.9275, "num_input_tokens_seen": 118485760, "step": 7364 }, { "epoch": 0.5159053297958969, "grad_norm": 4.625830173492432, "learning_rate": 4.845789842381787e-05, "loss": 1.0056, "num_input_tokens_seen": 118501944, "step": 7365 }, { "epoch": 0.5159753780416262, "grad_norm": 5.04378604888916, "learning_rate": 4.845090017513135e-05, "loss": 1.1256, "num_input_tokens_seen": 118518328, "step": 7366 }, { "epoch": 0.5160454262873554, "grad_norm": 3.658432960510254, "learning_rate": 4.8443901926444835e-05, "loss": 1.0818, "num_input_tokens_seen": 118534528, "step": 7367 }, { "epoch": 0.5161154745330847, "grad_norm": 4.299334526062012, "learning_rate": 4.8436903677758324e-05, "loss": 1.0318, "num_input_tokens_seen": 118550912, "step": 7368 }, { "epoch": 0.5161855227788139, "grad_norm": 5.608176231384277, "learning_rate": 4.8429905429071806e-05, "loss": 1.0708, "num_input_tokens_seen": 118566464, "step": 7369 }, { "epoch": 0.5162555710245431, "grad_norm": 4.76702356338501, "learning_rate": 4.8422907180385294e-05, "loss": 1.035, "num_input_tokens_seen": 118582848, "step": 7370 }, { "epoch": 0.5163256192702724, "grad_norm": 4.01913595199585, "learning_rate": 4.8415908931698776e-05, "loss": 1.2714, "num_input_tokens_seen": 118599232, "step": 7371 }, { "epoch": 0.5163956675160016, "grad_norm": 4.569849967956543, "learning_rate": 4.8408910683012265e-05, "loss": 0.9726, "num_input_tokens_seen": 118615616, "step": 7372 }, { "epoch": 0.5164657157617308, "grad_norm": 4.713064670562744, "learning_rate": 4.8401912434325753e-05, "loss": 0.9028, "num_input_tokens_seen": 118632000, "step": 7373 }, { "epoch": 0.5165357640074602, "grad_norm": 4.8917317390441895, "learning_rate": 4.839491418563923e-05, "loss": 0.9373, "num_input_tokens_seen": 118646960, "step": 7374 }, { "epoch": 0.5166058122531894, "grad_norm": 4.5929460525512695, "learning_rate": 4.8387915936952724e-05, "loss": 1.0066, "num_input_tokens_seen": 118663344, "step": 7375 }, { "epoch": 0.5166758604989187, "grad_norm": 4.328620433807373, "learning_rate": 4.83809176882662e-05, "loss": 0.9591, "num_input_tokens_seen": 118679728, "step": 7376 }, { "epoch": 0.5167459087446479, "grad_norm": 4.046355724334717, "learning_rate": 4.837391943957969e-05, "loss": 1.1514, "num_input_tokens_seen": 118694912, "step": 7377 }, { "epoch": 0.5168159569903771, "grad_norm": 3.612791061401367, "learning_rate": 4.836692119089317e-05, "loss": 1.0936, "num_input_tokens_seen": 118711296, "step": 7378 }, { "epoch": 0.5168860052361064, "grad_norm": 3.2623980045318604, "learning_rate": 4.835992294220666e-05, "loss": 0.9072, "num_input_tokens_seen": 118727680, "step": 7379 }, { "epoch": 0.5169560534818356, "grad_norm": 3.7941362857818604, "learning_rate": 4.8352924693520147e-05, "loss": 0.8598, "num_input_tokens_seen": 118744064, "step": 7380 }, { "epoch": 0.5170261017275649, "grad_norm": 3.7529819011688232, "learning_rate": 4.834592644483363e-05, "loss": 0.9265, "num_input_tokens_seen": 118759616, "step": 7381 }, { "epoch": 0.5170961499732941, "grad_norm": 3.552791118621826, "learning_rate": 4.833892819614712e-05, "loss": 1.1577, "num_input_tokens_seen": 118776000, "step": 7382 }, { "epoch": 0.5171661982190233, "grad_norm": 3.719827651977539, "learning_rate": 4.833192994746059e-05, "loss": 0.852, "num_input_tokens_seen": 118792360, "step": 7383 }, { "epoch": 0.5172362464647526, "grad_norm": 6.634743690490723, "learning_rate": 4.832493169877408e-05, "loss": 1.3676, "num_input_tokens_seen": 118808112, "step": 7384 }, { "epoch": 0.5173062947104818, "grad_norm": 4.388521194458008, "learning_rate": 4.8317933450087576e-05, "loss": 1.2442, "num_input_tokens_seen": 118824384, "step": 7385 }, { "epoch": 0.517376342956211, "grad_norm": 4.161240577697754, "learning_rate": 4.831093520140105e-05, "loss": 1.0573, "num_input_tokens_seen": 118840280, "step": 7386 }, { "epoch": 0.5174463912019404, "grad_norm": 4.272933006286621, "learning_rate": 4.830393695271454e-05, "loss": 1.1301, "num_input_tokens_seen": 118855176, "step": 7387 }, { "epoch": 0.5175164394476696, "grad_norm": 4.388091087341309, "learning_rate": 4.829693870402802e-05, "loss": 1.0048, "num_input_tokens_seen": 118869528, "step": 7388 }, { "epoch": 0.5175864876933989, "grad_norm": 3.623302698135376, "learning_rate": 4.828994045534151e-05, "loss": 1.0096, "num_input_tokens_seen": 118885272, "step": 7389 }, { "epoch": 0.5176565359391281, "grad_norm": 6.347753047943115, "learning_rate": 4.8282942206655e-05, "loss": 0.9923, "num_input_tokens_seen": 118901384, "step": 7390 }, { "epoch": 0.5177265841848573, "grad_norm": 5.049704551696777, "learning_rate": 4.827594395796848e-05, "loss": 1.1742, "num_input_tokens_seen": 118916392, "step": 7391 }, { "epoch": 0.5177966324305866, "grad_norm": 4.022581100463867, "learning_rate": 4.826894570928197e-05, "loss": 0.8257, "num_input_tokens_seen": 118932000, "step": 7392 }, { "epoch": 0.5178666806763158, "grad_norm": 4.277614593505859, "learning_rate": 4.8261947460595444e-05, "loss": 1.1972, "num_input_tokens_seen": 118947752, "step": 7393 }, { "epoch": 0.517936728922045, "grad_norm": 3.5294747352600098, "learning_rate": 4.825494921190893e-05, "loss": 1.0023, "num_input_tokens_seen": 118963312, "step": 7394 }, { "epoch": 0.5180067771677743, "grad_norm": 4.206392765045166, "learning_rate": 4.824795096322243e-05, "loss": 1.1942, "num_input_tokens_seen": 118979696, "step": 7395 }, { "epoch": 0.5180768254135035, "grad_norm": 5.907151222229004, "learning_rate": 4.82409527145359e-05, "loss": 1.1046, "num_input_tokens_seen": 118996080, "step": 7396 }, { "epoch": 0.5181468736592328, "grad_norm": 3.4217193126678467, "learning_rate": 4.823395446584939e-05, "loss": 0.9469, "num_input_tokens_seen": 119012064, "step": 7397 }, { "epoch": 0.518216921904962, "grad_norm": 3.4076719284057617, "learning_rate": 4.822695621716287e-05, "loss": 0.9599, "num_input_tokens_seen": 119028448, "step": 7398 }, { "epoch": 0.5182869701506913, "grad_norm": 3.6392581462860107, "learning_rate": 4.821995796847636e-05, "loss": 1.0481, "num_input_tokens_seen": 119044832, "step": 7399 }, { "epoch": 0.5183570183964206, "grad_norm": 4.3391947746276855, "learning_rate": 4.821295971978985e-05, "loss": 1.0365, "num_input_tokens_seen": 119061216, "step": 7400 }, { "epoch": 0.5183570183964206, "eval_loss": 1.1227985620498657, "eval_runtime": 0.1572, "eval_samples_per_second": 6.362, "eval_steps_per_second": 6.362, "num_input_tokens_seen": 119061216, "step": 7400 }, { "epoch": 0.5184270666421498, "grad_norm": 4.152843952178955, "learning_rate": 4.820596147110333e-05, "loss": 1.0123, "num_input_tokens_seen": 119077600, "step": 7401 }, { "epoch": 0.518497114887879, "grad_norm": 4.2847371101379395, "learning_rate": 4.819896322241682e-05, "loss": 1.0532, "num_input_tokens_seen": 119093984, "step": 7402 }, { "epoch": 0.5185671631336083, "grad_norm": 3.9743080139160156, "learning_rate": 4.8191964973730296e-05, "loss": 1.0229, "num_input_tokens_seen": 119109984, "step": 7403 }, { "epoch": 0.5186372113793375, "grad_norm": 3.8699886798858643, "learning_rate": 4.8184966725043785e-05, "loss": 1.0008, "num_input_tokens_seen": 119126040, "step": 7404 }, { "epoch": 0.5187072596250668, "grad_norm": 4.810290336608887, "learning_rate": 4.8177968476357266e-05, "loss": 1.1959, "num_input_tokens_seen": 119141512, "step": 7405 }, { "epoch": 0.518777307870796, "grad_norm": 5.068106174468994, "learning_rate": 4.8170970227670755e-05, "loss": 0.9215, "num_input_tokens_seen": 119156112, "step": 7406 }, { "epoch": 0.5188473561165252, "grad_norm": 5.044201374053955, "learning_rate": 4.8163971978984244e-05, "loss": 1.2169, "num_input_tokens_seen": 119172496, "step": 7407 }, { "epoch": 0.5189174043622545, "grad_norm": 5.986364364624023, "learning_rate": 4.8156973730297725e-05, "loss": 1.1855, "num_input_tokens_seen": 119188880, "step": 7408 }, { "epoch": 0.5189874526079837, "grad_norm": 4.029194355010986, "learning_rate": 4.8149975481611214e-05, "loss": 0.9322, "num_input_tokens_seen": 119205264, "step": 7409 }, { "epoch": 0.519057500853713, "grad_norm": 6.960198879241943, "learning_rate": 4.814297723292469e-05, "loss": 0.9012, "num_input_tokens_seen": 119220560, "step": 7410 }, { "epoch": 0.5191275490994423, "grad_norm": 3.8074264526367188, "learning_rate": 4.8135978984238184e-05, "loss": 1.0376, "num_input_tokens_seen": 119236856, "step": 7411 }, { "epoch": 0.5191975973451715, "grad_norm": 4.192657947540283, "learning_rate": 4.812898073555167e-05, "loss": 1.2593, "num_input_tokens_seen": 119253240, "step": 7412 }, { "epoch": 0.5192676455909008, "grad_norm": 4.494477272033691, "learning_rate": 4.812198248686515e-05, "loss": 1.1815, "num_input_tokens_seen": 119269264, "step": 7413 }, { "epoch": 0.51933769383663, "grad_norm": 3.808358907699585, "learning_rate": 4.8114984238178637e-05, "loss": 1.0039, "num_input_tokens_seen": 119285384, "step": 7414 }, { "epoch": 0.5194077420823592, "grad_norm": 4.097054481506348, "learning_rate": 4.810798598949212e-05, "loss": 1.2953, "num_input_tokens_seen": 119301768, "step": 7415 }, { "epoch": 0.5194777903280885, "grad_norm": 4.17371129989624, "learning_rate": 4.810098774080561e-05, "loss": 1.3124, "num_input_tokens_seen": 119318152, "step": 7416 }, { "epoch": 0.5195478385738177, "grad_norm": 3.7520711421966553, "learning_rate": 4.8093989492119096e-05, "loss": 0.9264, "num_input_tokens_seen": 119334536, "step": 7417 }, { "epoch": 0.519617886819547, "grad_norm": 3.5394372940063477, "learning_rate": 4.808699124343258e-05, "loss": 1.1861, "num_input_tokens_seen": 119350888, "step": 7418 }, { "epoch": 0.5196879350652762, "grad_norm": 3.490539312362671, "learning_rate": 4.8079992994746066e-05, "loss": 0.9393, "num_input_tokens_seen": 119367272, "step": 7419 }, { "epoch": 0.5197579833110054, "grad_norm": 3.7124273777008057, "learning_rate": 4.807299474605954e-05, "loss": 0.9956, "num_input_tokens_seen": 119383656, "step": 7420 }, { "epoch": 0.5198280315567347, "grad_norm": 4.070093631744385, "learning_rate": 4.8065996497373036e-05, "loss": 1.0084, "num_input_tokens_seen": 119399440, "step": 7421 }, { "epoch": 0.5198980798024639, "grad_norm": 7.456560134887695, "learning_rate": 4.8058998248686525e-05, "loss": 0.9437, "num_input_tokens_seen": 119415232, "step": 7422 }, { "epoch": 0.5199681280481931, "grad_norm": 3.812809705734253, "learning_rate": 4.8052e-05, "loss": 0.9875, "num_input_tokens_seen": 119431032, "step": 7423 }, { "epoch": 0.5200381762939225, "grad_norm": 4.641679763793945, "learning_rate": 4.804500175131349e-05, "loss": 0.8798, "num_input_tokens_seen": 119447416, "step": 7424 }, { "epoch": 0.5201082245396517, "grad_norm": 3.7425315380096436, "learning_rate": 4.803800350262697e-05, "loss": 0.922, "num_input_tokens_seen": 119463400, "step": 7425 }, { "epoch": 0.520178272785381, "grad_norm": 3.5548949241638184, "learning_rate": 4.803100525394046e-05, "loss": 0.8365, "num_input_tokens_seen": 119479784, "step": 7426 }, { "epoch": 0.5202483210311102, "grad_norm": 3.335888385772705, "learning_rate": 4.802400700525395e-05, "loss": 0.9923, "num_input_tokens_seen": 119495928, "step": 7427 }, { "epoch": 0.5203183692768394, "grad_norm": 3.8208446502685547, "learning_rate": 4.801700875656743e-05, "loss": 0.9784, "num_input_tokens_seen": 119511408, "step": 7428 }, { "epoch": 0.5203884175225687, "grad_norm": 4.35474967956543, "learning_rate": 4.801001050788092e-05, "loss": 1.1477, "num_input_tokens_seen": 119526656, "step": 7429 }, { "epoch": 0.5204584657682979, "grad_norm": 4.081501007080078, "learning_rate": 4.800301225919439e-05, "loss": 1.0881, "num_input_tokens_seen": 119543040, "step": 7430 }, { "epoch": 0.5205285140140271, "grad_norm": 4.1007866859436035, "learning_rate": 4.799601401050789e-05, "loss": 0.9335, "num_input_tokens_seen": 119559040, "step": 7431 }, { "epoch": 0.5205985622597564, "grad_norm": 3.955095052719116, "learning_rate": 4.7989015761821363e-05, "loss": 1.0092, "num_input_tokens_seen": 119574832, "step": 7432 }, { "epoch": 0.5206686105054856, "grad_norm": 5.475005626678467, "learning_rate": 4.798201751313485e-05, "loss": 1.1933, "num_input_tokens_seen": 119590688, "step": 7433 }, { "epoch": 0.520738658751215, "grad_norm": 4.125513553619385, "learning_rate": 4.797501926444834e-05, "loss": 0.9113, "num_input_tokens_seen": 119606592, "step": 7434 }, { "epoch": 0.5208087069969441, "grad_norm": 3.608366012573242, "learning_rate": 4.796802101576182e-05, "loss": 1.0557, "num_input_tokens_seen": 119622976, "step": 7435 }, { "epoch": 0.5208787552426734, "grad_norm": 5.827488899230957, "learning_rate": 4.796102276707531e-05, "loss": 1.044, "num_input_tokens_seen": 119639360, "step": 7436 }, { "epoch": 0.5209488034884027, "grad_norm": 4.732996463775635, "learning_rate": 4.795402451838879e-05, "loss": 0.7811, "num_input_tokens_seen": 119654448, "step": 7437 }, { "epoch": 0.5210188517341319, "grad_norm": 3.9436864852905273, "learning_rate": 4.794702626970228e-05, "loss": 0.9169, "num_input_tokens_seen": 119670448, "step": 7438 }, { "epoch": 0.5210888999798611, "grad_norm": 3.787825345993042, "learning_rate": 4.794002802101577e-05, "loss": 1.1469, "num_input_tokens_seen": 119686832, "step": 7439 }, { "epoch": 0.5211589482255904, "grad_norm": 3.7498011589050293, "learning_rate": 4.7933029772329245e-05, "loss": 0.944, "num_input_tokens_seen": 119702776, "step": 7440 }, { "epoch": 0.5212289964713196, "grad_norm": 4.7387285232543945, "learning_rate": 4.792603152364274e-05, "loss": 1.0428, "num_input_tokens_seen": 119719160, "step": 7441 }, { "epoch": 0.5212990447170489, "grad_norm": 4.916442394256592, "learning_rate": 4.7919033274956215e-05, "loss": 1.0036, "num_input_tokens_seen": 119735208, "step": 7442 }, { "epoch": 0.5213690929627781, "grad_norm": 7.069667339324951, "learning_rate": 4.7912035026269704e-05, "loss": 1.0664, "num_input_tokens_seen": 119750720, "step": 7443 }, { "epoch": 0.5214391412085073, "grad_norm": 4.178729057312012, "learning_rate": 4.790503677758319e-05, "loss": 0.946, "num_input_tokens_seen": 119767104, "step": 7444 }, { "epoch": 0.5215091894542366, "grad_norm": 4.8164801597595215, "learning_rate": 4.7898038528896674e-05, "loss": 1.2099, "num_input_tokens_seen": 119782680, "step": 7445 }, { "epoch": 0.5215792376999658, "grad_norm": 4.30715799331665, "learning_rate": 4.789104028021016e-05, "loss": 1.0225, "num_input_tokens_seen": 119798392, "step": 7446 }, { "epoch": 0.5216492859456952, "grad_norm": 4.270611763000488, "learning_rate": 4.7884042031523645e-05, "loss": 1.3242, "num_input_tokens_seen": 119814776, "step": 7447 }, { "epoch": 0.5217193341914244, "grad_norm": 3.3384652137756348, "learning_rate": 4.7877043782837133e-05, "loss": 0.9353, "num_input_tokens_seen": 119831160, "step": 7448 }, { "epoch": 0.5217893824371536, "grad_norm": 3.343555450439453, "learning_rate": 4.787004553415062e-05, "loss": 1.0108, "num_input_tokens_seen": 119847544, "step": 7449 }, { "epoch": 0.5218594306828829, "grad_norm": 4.26312255859375, "learning_rate": 4.78630472854641e-05, "loss": 1.1169, "num_input_tokens_seen": 119863928, "step": 7450 }, { "epoch": 0.5219294789286121, "grad_norm": 3.9083523750305176, "learning_rate": 4.785604903677759e-05, "loss": 1.01, "num_input_tokens_seen": 119880312, "step": 7451 }, { "epoch": 0.5219995271743413, "grad_norm": 4.212127208709717, "learning_rate": 4.784905078809107e-05, "loss": 1.1562, "num_input_tokens_seen": 119895464, "step": 7452 }, { "epoch": 0.5220695754200706, "grad_norm": 6.6233038902282715, "learning_rate": 4.7842052539404556e-05, "loss": 1.0112, "num_input_tokens_seen": 119911848, "step": 7453 }, { "epoch": 0.5221396236657998, "grad_norm": 3.770444631576538, "learning_rate": 4.7835054290718045e-05, "loss": 0.9608, "num_input_tokens_seen": 119928232, "step": 7454 }, { "epoch": 0.5222096719115291, "grad_norm": 3.3227531909942627, "learning_rate": 4.7828056042031527e-05, "loss": 0.9593, "num_input_tokens_seen": 119944296, "step": 7455 }, { "epoch": 0.5222797201572583, "grad_norm": 4.297872066497803, "learning_rate": 4.7821057793345015e-05, "loss": 1.1381, "num_input_tokens_seen": 119960680, "step": 7456 }, { "epoch": 0.5223497684029875, "grad_norm": 3.6508405208587646, "learning_rate": 4.78140595446585e-05, "loss": 0.9298, "num_input_tokens_seen": 119977064, "step": 7457 }, { "epoch": 0.5224198166487168, "grad_norm": 3.5275723934173584, "learning_rate": 4.7807061295971986e-05, "loss": 1.0049, "num_input_tokens_seen": 119993448, "step": 7458 }, { "epoch": 0.522489864894446, "grad_norm": 3.983844757080078, "learning_rate": 4.780006304728546e-05, "loss": 1.0362, "num_input_tokens_seen": 120009832, "step": 7459 }, { "epoch": 0.5225599131401752, "grad_norm": 3.6752452850341797, "learning_rate": 4.779306479859895e-05, "loss": 1.0864, "num_input_tokens_seen": 120026216, "step": 7460 }, { "epoch": 0.5226299613859046, "grad_norm": 3.3501710891723633, "learning_rate": 4.7786066549912445e-05, "loss": 0.8639, "num_input_tokens_seen": 120042456, "step": 7461 }, { "epoch": 0.5227000096316338, "grad_norm": 3.448544979095459, "learning_rate": 4.777906830122592e-05, "loss": 0.953, "num_input_tokens_seen": 120058392, "step": 7462 }, { "epoch": 0.5227700578773631, "grad_norm": 3.7892913818359375, "learning_rate": 4.777207005253941e-05, "loss": 0.9391, "num_input_tokens_seen": 120074776, "step": 7463 }, { "epoch": 0.5228401061230923, "grad_norm": 6.972007751464844, "learning_rate": 4.776507180385289e-05, "loss": 0.944, "num_input_tokens_seen": 120089776, "step": 7464 }, { "epoch": 0.5229101543688215, "grad_norm": 3.6271073818206787, "learning_rate": 4.775807355516638e-05, "loss": 1.1703, "num_input_tokens_seen": 120106160, "step": 7465 }, { "epoch": 0.5229802026145508, "grad_norm": 4.735227108001709, "learning_rate": 4.775107530647987e-05, "loss": 1.2245, "num_input_tokens_seen": 120122144, "step": 7466 }, { "epoch": 0.52305025086028, "grad_norm": 4.848446846008301, "learning_rate": 4.774407705779335e-05, "loss": 1.0117, "num_input_tokens_seen": 120138528, "step": 7467 }, { "epoch": 0.5231202991060092, "grad_norm": 3.5538604259490967, "learning_rate": 4.773707880910684e-05, "loss": 1.0153, "num_input_tokens_seen": 120154912, "step": 7468 }, { "epoch": 0.5231903473517385, "grad_norm": 4.481129169464111, "learning_rate": 4.773008056042031e-05, "loss": 1.1685, "num_input_tokens_seen": 120171296, "step": 7469 }, { "epoch": 0.5232603955974677, "grad_norm": 4.581019401550293, "learning_rate": 4.77230823117338e-05, "loss": 1.0292, "num_input_tokens_seen": 120187680, "step": 7470 }, { "epoch": 0.523330443843197, "grad_norm": 5.147364139556885, "learning_rate": 4.7716084063047297e-05, "loss": 0.9574, "num_input_tokens_seen": 120204008, "step": 7471 }, { "epoch": 0.5234004920889263, "grad_norm": 3.716172218322754, "learning_rate": 4.770908581436077e-05, "loss": 0.9937, "num_input_tokens_seen": 120220392, "step": 7472 }, { "epoch": 0.5234705403346555, "grad_norm": 5.15359354019165, "learning_rate": 4.770208756567426e-05, "loss": 1.0412, "num_input_tokens_seen": 120236504, "step": 7473 }, { "epoch": 0.5235405885803848, "grad_norm": 3.9450008869171143, "learning_rate": 4.769508931698774e-05, "loss": 1.2084, "num_input_tokens_seen": 120252568, "step": 7474 }, { "epoch": 0.523610636826114, "grad_norm": 7.154159069061279, "learning_rate": 4.768809106830123e-05, "loss": 1.0837, "num_input_tokens_seen": 120268952, "step": 7475 }, { "epoch": 0.5236806850718432, "grad_norm": 3.838291883468628, "learning_rate": 4.768109281961472e-05, "loss": 0.9896, "num_input_tokens_seen": 120285336, "step": 7476 }, { "epoch": 0.5237507333175725, "grad_norm": 4.197968482971191, "learning_rate": 4.76740945709282e-05, "loss": 1.0913, "num_input_tokens_seen": 120301720, "step": 7477 }, { "epoch": 0.5238207815633017, "grad_norm": 4.732188701629639, "learning_rate": 4.766709632224169e-05, "loss": 1.2257, "num_input_tokens_seen": 120316968, "step": 7478 }, { "epoch": 0.523890829809031, "grad_norm": 4.164662837982178, "learning_rate": 4.7660098073555165e-05, "loss": 1.0551, "num_input_tokens_seen": 120333352, "step": 7479 }, { "epoch": 0.5239608780547602, "grad_norm": 3.6521334648132324, "learning_rate": 4.765309982486865e-05, "loss": 0.9889, "num_input_tokens_seen": 120349736, "step": 7480 }, { "epoch": 0.5240309263004894, "grad_norm": 3.6896276473999023, "learning_rate": 4.764610157618215e-05, "loss": 0.8846, "num_input_tokens_seen": 120366120, "step": 7481 }, { "epoch": 0.5241009745462187, "grad_norm": 4.352004051208496, "learning_rate": 4.7639103327495624e-05, "loss": 1.0483, "num_input_tokens_seen": 120382360, "step": 7482 }, { "epoch": 0.5241710227919479, "grad_norm": 6.433780670166016, "learning_rate": 4.763210507880911e-05, "loss": 1.1227, "num_input_tokens_seen": 120397904, "step": 7483 }, { "epoch": 0.5242410710376773, "grad_norm": 5.6717329025268555, "learning_rate": 4.7625106830122594e-05, "loss": 1.0687, "num_input_tokens_seen": 120414248, "step": 7484 }, { "epoch": 0.5243111192834065, "grad_norm": 3.4725184440612793, "learning_rate": 4.761810858143608e-05, "loss": 0.8943, "num_input_tokens_seen": 120430632, "step": 7485 }, { "epoch": 0.5243811675291357, "grad_norm": 3.803506374359131, "learning_rate": 4.761111033274956e-05, "loss": 1.0, "num_input_tokens_seen": 120447016, "step": 7486 }, { "epoch": 0.524451215774865, "grad_norm": 5.165005683898926, "learning_rate": 4.760411208406305e-05, "loss": 1.0664, "num_input_tokens_seen": 120463400, "step": 7487 }, { "epoch": 0.5245212640205942, "grad_norm": 6.133605480194092, "learning_rate": 4.759711383537654e-05, "loss": 1.0436, "num_input_tokens_seen": 120478432, "step": 7488 }, { "epoch": 0.5245913122663234, "grad_norm": 4.061281681060791, "learning_rate": 4.7590115586690017e-05, "loss": 1.1429, "num_input_tokens_seen": 120494816, "step": 7489 }, { "epoch": 0.5246613605120527, "grad_norm": 3.2192203998565674, "learning_rate": 4.7583117338003505e-05, "loss": 0.9059, "num_input_tokens_seen": 120511200, "step": 7490 }, { "epoch": 0.5247314087577819, "grad_norm": 3.718182325363159, "learning_rate": 4.757611908931699e-05, "loss": 0.9153, "num_input_tokens_seen": 120527584, "step": 7491 }, { "epoch": 0.5248014570035112, "grad_norm": 3.742267370223999, "learning_rate": 4.7569120840630476e-05, "loss": 0.9785, "num_input_tokens_seen": 120543968, "step": 7492 }, { "epoch": 0.5248715052492404, "grad_norm": 5.5869951248168945, "learning_rate": 4.7562122591943964e-05, "loss": 1.009, "num_input_tokens_seen": 120559120, "step": 7493 }, { "epoch": 0.5249415534949696, "grad_norm": 3.9366302490234375, "learning_rate": 4.7555124343257446e-05, "loss": 1.1197, "num_input_tokens_seen": 120575504, "step": 7494 }, { "epoch": 0.5250116017406989, "grad_norm": 5.102993488311768, "learning_rate": 4.7548126094570935e-05, "loss": 1.0065, "num_input_tokens_seen": 120591592, "step": 7495 }, { "epoch": 0.5250816499864281, "grad_norm": 3.392009735107422, "learning_rate": 4.754112784588441e-05, "loss": 0.9957, "num_input_tokens_seen": 120607976, "step": 7496 }, { "epoch": 0.5251516982321573, "grad_norm": 5.089282512664795, "learning_rate": 4.7534129597197905e-05, "loss": 0.9714, "num_input_tokens_seen": 120624360, "step": 7497 }, { "epoch": 0.5252217464778867, "grad_norm": 4.311940670013428, "learning_rate": 4.7527131348511394e-05, "loss": 0.9564, "num_input_tokens_seen": 120640744, "step": 7498 }, { "epoch": 0.5252917947236159, "grad_norm": 3.8907923698425293, "learning_rate": 4.752013309982487e-05, "loss": 1.1852, "num_input_tokens_seen": 120656992, "step": 7499 }, { "epoch": 0.5253618429693452, "grad_norm": 3.856172800064087, "learning_rate": 4.751313485113836e-05, "loss": 1.2247, "num_input_tokens_seen": 120673376, "step": 7500 }, { "epoch": 0.5254318912150744, "grad_norm": 3.661641836166382, "learning_rate": 4.750613660245184e-05, "loss": 1.0106, "num_input_tokens_seen": 120689760, "step": 7501 }, { "epoch": 0.5255019394608036, "grad_norm": 3.8533976078033447, "learning_rate": 4.749913835376533e-05, "loss": 0.9485, "num_input_tokens_seen": 120706144, "step": 7502 }, { "epoch": 0.5255719877065329, "grad_norm": 4.209764003753662, "learning_rate": 4.7492140105078816e-05, "loss": 1.0623, "num_input_tokens_seen": 120722528, "step": 7503 }, { "epoch": 0.5256420359522621, "grad_norm": 4.190296649932861, "learning_rate": 4.74851418563923e-05, "loss": 1.0135, "num_input_tokens_seen": 120738912, "step": 7504 }, { "epoch": 0.5257120841979913, "grad_norm": 3.971188545227051, "learning_rate": 4.747814360770579e-05, "loss": 0.8885, "num_input_tokens_seen": 120754464, "step": 7505 }, { "epoch": 0.5257821324437206, "grad_norm": 3.8005099296569824, "learning_rate": 4.747114535901926e-05, "loss": 0.8758, "num_input_tokens_seen": 120770848, "step": 7506 }, { "epoch": 0.5258521806894498, "grad_norm": 4.144433498382568, "learning_rate": 4.746414711033276e-05, "loss": 1.2437, "num_input_tokens_seen": 120787232, "step": 7507 }, { "epoch": 0.5259222289351791, "grad_norm": 4.963449001312256, "learning_rate": 4.7457148861646246e-05, "loss": 1.2806, "num_input_tokens_seen": 120802792, "step": 7508 }, { "epoch": 0.5259922771809084, "grad_norm": 6.420035362243652, "learning_rate": 4.745015061295972e-05, "loss": 1.1617, "num_input_tokens_seen": 120819176, "step": 7509 }, { "epoch": 0.5260623254266376, "grad_norm": 3.493263006210327, "learning_rate": 4.744315236427321e-05, "loss": 0.9335, "num_input_tokens_seen": 120835560, "step": 7510 }, { "epoch": 0.5261323736723669, "grad_norm": 5.055440425872803, "learning_rate": 4.743615411558669e-05, "loss": 0.9479, "num_input_tokens_seen": 120851752, "step": 7511 }, { "epoch": 0.5262024219180961, "grad_norm": 3.632171869277954, "learning_rate": 4.742915586690018e-05, "loss": 1.0991, "num_input_tokens_seen": 120867568, "step": 7512 }, { "epoch": 0.5262724701638254, "grad_norm": 4.197231769561768, "learning_rate": 4.742215761821366e-05, "loss": 1.1136, "num_input_tokens_seen": 120883952, "step": 7513 }, { "epoch": 0.5263425184095546, "grad_norm": 5.476736068725586, "learning_rate": 4.741515936952715e-05, "loss": 0.9664, "num_input_tokens_seen": 120899584, "step": 7514 }, { "epoch": 0.5264125666552838, "grad_norm": 3.98996901512146, "learning_rate": 4.740816112084064e-05, "loss": 1.0942, "num_input_tokens_seen": 120915968, "step": 7515 }, { "epoch": 0.5264826149010131, "grad_norm": 3.881070375442505, "learning_rate": 4.7401162872154114e-05, "loss": 1.3225, "num_input_tokens_seen": 120932352, "step": 7516 }, { "epoch": 0.5265526631467423, "grad_norm": 4.582496643066406, "learning_rate": 4.739416462346761e-05, "loss": 1.1441, "num_input_tokens_seen": 120948736, "step": 7517 }, { "epoch": 0.5266227113924715, "grad_norm": 3.413455009460449, "learning_rate": 4.7387166374781084e-05, "loss": 0.8994, "num_input_tokens_seen": 120964928, "step": 7518 }, { "epoch": 0.5266927596382008, "grad_norm": 3.806334972381592, "learning_rate": 4.738016812609457e-05, "loss": 1.1016, "num_input_tokens_seen": 120981312, "step": 7519 }, { "epoch": 0.52676280788393, "grad_norm": 4.136782169342041, "learning_rate": 4.737316987740806e-05, "loss": 1.3558, "num_input_tokens_seen": 120997696, "step": 7520 }, { "epoch": 0.5268328561296594, "grad_norm": 4.850030899047852, "learning_rate": 4.736617162872154e-05, "loss": 0.9764, "num_input_tokens_seen": 121013624, "step": 7521 }, { "epoch": 0.5269029043753886, "grad_norm": 3.9870705604553223, "learning_rate": 4.735917338003503e-05, "loss": 1.122, "num_input_tokens_seen": 121030008, "step": 7522 }, { "epoch": 0.5269729526211178, "grad_norm": 3.778407573699951, "learning_rate": 4.7352175131348513e-05, "loss": 1.0158, "num_input_tokens_seen": 121046168, "step": 7523 }, { "epoch": 0.5270430008668471, "grad_norm": 4.619656085968018, "learning_rate": 4.7345176882662e-05, "loss": 1.0316, "num_input_tokens_seen": 121061544, "step": 7524 }, { "epoch": 0.5271130491125763, "grad_norm": 3.794745683670044, "learning_rate": 4.733817863397549e-05, "loss": 0.9998, "num_input_tokens_seen": 121077928, "step": 7525 }, { "epoch": 0.5271830973583055, "grad_norm": 5.526957988739014, "learning_rate": 4.7331180385288966e-05, "loss": 0.9366, "num_input_tokens_seen": 121094312, "step": 7526 }, { "epoch": 0.5272531456040348, "grad_norm": 4.695877552032471, "learning_rate": 4.732418213660246e-05, "loss": 0.9802, "num_input_tokens_seen": 121110696, "step": 7527 }, { "epoch": 0.527323193849764, "grad_norm": 4.297723293304443, "learning_rate": 4.7317183887915936e-05, "loss": 0.9783, "num_input_tokens_seen": 121126832, "step": 7528 }, { "epoch": 0.5273932420954933, "grad_norm": 3.9494822025299072, "learning_rate": 4.7310185639229425e-05, "loss": 1.0655, "num_input_tokens_seen": 121143192, "step": 7529 }, { "epoch": 0.5274632903412225, "grad_norm": 3.572096347808838, "learning_rate": 4.730318739054291e-05, "loss": 0.9569, "num_input_tokens_seen": 121159552, "step": 7530 }, { "epoch": 0.5275333385869517, "grad_norm": 4.53004789352417, "learning_rate": 4.7296189141856395e-05, "loss": 0.9207, "num_input_tokens_seen": 121175744, "step": 7531 }, { "epoch": 0.527603386832681, "grad_norm": 4.6228203773498535, "learning_rate": 4.7289190893169884e-05, "loss": 1.0224, "num_input_tokens_seen": 121190304, "step": 7532 }, { "epoch": 0.5276734350784102, "grad_norm": 3.7169201374053955, "learning_rate": 4.7282192644483366e-05, "loss": 1.0556, "num_input_tokens_seen": 121206688, "step": 7533 }, { "epoch": 0.5277434833241395, "grad_norm": 4.113621234893799, "learning_rate": 4.7275194395796854e-05, "loss": 0.8171, "num_input_tokens_seen": 121223072, "step": 7534 }, { "epoch": 0.5278135315698688, "grad_norm": 3.70991849899292, "learning_rate": 4.726819614711034e-05, "loss": 1.0469, "num_input_tokens_seen": 121239456, "step": 7535 }, { "epoch": 0.527883579815598, "grad_norm": 4.051577091217041, "learning_rate": 4.726119789842382e-05, "loss": 1.192, "num_input_tokens_seen": 121255840, "step": 7536 }, { "epoch": 0.5279536280613273, "grad_norm": 5.249835968017578, "learning_rate": 4.725419964973731e-05, "loss": 1.1282, "num_input_tokens_seen": 121271664, "step": 7537 }, { "epoch": 0.5280236763070565, "grad_norm": 3.727388381958008, "learning_rate": 4.724720140105079e-05, "loss": 1.1146, "num_input_tokens_seen": 121288048, "step": 7538 }, { "epoch": 0.5280937245527857, "grad_norm": 3.91412615776062, "learning_rate": 4.724020315236428e-05, "loss": 1.002, "num_input_tokens_seen": 121304432, "step": 7539 }, { "epoch": 0.528163772798515, "grad_norm": 4.474177360534668, "learning_rate": 4.723320490367776e-05, "loss": 1.1398, "num_input_tokens_seen": 121320816, "step": 7540 }, { "epoch": 0.5282338210442442, "grad_norm": 3.860063314437866, "learning_rate": 4.722620665499125e-05, "loss": 0.9238, "num_input_tokens_seen": 121337200, "step": 7541 }, { "epoch": 0.5283038692899734, "grad_norm": 3.4258196353912354, "learning_rate": 4.7219208406304736e-05, "loss": 0.8809, "num_input_tokens_seen": 121353584, "step": 7542 }, { "epoch": 0.5283739175357027, "grad_norm": 3.8684587478637695, "learning_rate": 4.721221015761822e-05, "loss": 1.1276, "num_input_tokens_seen": 121369968, "step": 7543 }, { "epoch": 0.5284439657814319, "grad_norm": 3.713254451751709, "learning_rate": 4.7205211908931706e-05, "loss": 0.9036, "num_input_tokens_seen": 121386352, "step": 7544 }, { "epoch": 0.5285140140271612, "grad_norm": 4.29981803894043, "learning_rate": 4.719821366024518e-05, "loss": 0.9864, "num_input_tokens_seen": 121402632, "step": 7545 }, { "epoch": 0.5285840622728905, "grad_norm": 3.9872469902038574, "learning_rate": 4.719121541155867e-05, "loss": 1.1266, "num_input_tokens_seen": 121419016, "step": 7546 }, { "epoch": 0.5286541105186197, "grad_norm": 5.582516193389893, "learning_rate": 4.7184217162872165e-05, "loss": 1.0141, "num_input_tokens_seen": 121434168, "step": 7547 }, { "epoch": 0.528724158764349, "grad_norm": 3.623431921005249, "learning_rate": 4.717721891418564e-05, "loss": 1.021, "num_input_tokens_seen": 121450552, "step": 7548 }, { "epoch": 0.5287942070100782, "grad_norm": 4.299187660217285, "learning_rate": 4.717022066549913e-05, "loss": 0.9786, "num_input_tokens_seen": 121466488, "step": 7549 }, { "epoch": 0.5288642552558075, "grad_norm": 3.768704652786255, "learning_rate": 4.716322241681261e-05, "loss": 1.3006, "num_input_tokens_seen": 121482872, "step": 7550 }, { "epoch": 0.5289343035015367, "grad_norm": 4.494194507598877, "learning_rate": 4.71562241681261e-05, "loss": 1.2086, "num_input_tokens_seen": 121499256, "step": 7551 }, { "epoch": 0.5290043517472659, "grad_norm": 3.676561117172241, "learning_rate": 4.714922591943959e-05, "loss": 0.9644, "num_input_tokens_seen": 121515272, "step": 7552 }, { "epoch": 0.5290743999929952, "grad_norm": 5.0016961097717285, "learning_rate": 4.714222767075307e-05, "loss": 0.8883, "num_input_tokens_seen": 121531544, "step": 7553 }, { "epoch": 0.5291444482387244, "grad_norm": 3.8496031761169434, "learning_rate": 4.713522942206656e-05, "loss": 1.1284, "num_input_tokens_seen": 121547928, "step": 7554 }, { "epoch": 0.5292144964844536, "grad_norm": 5.569375514984131, "learning_rate": 4.712823117338003e-05, "loss": 1.227, "num_input_tokens_seen": 121564312, "step": 7555 }, { "epoch": 0.5292845447301829, "grad_norm": 3.6076838970184326, "learning_rate": 4.712123292469352e-05, "loss": 1.0773, "num_input_tokens_seen": 121580688, "step": 7556 }, { "epoch": 0.5293545929759121, "grad_norm": 3.5435140132904053, "learning_rate": 4.711423467600701e-05, "loss": 0.9505, "num_input_tokens_seen": 121596496, "step": 7557 }, { "epoch": 0.5294246412216415, "grad_norm": 3.233835458755493, "learning_rate": 4.710723642732049e-05, "loss": 0.9657, "num_input_tokens_seen": 121612880, "step": 7558 }, { "epoch": 0.5294946894673707, "grad_norm": 5.496852874755859, "learning_rate": 4.710023817863398e-05, "loss": 1.0778, "num_input_tokens_seen": 121629264, "step": 7559 }, { "epoch": 0.5295647377130999, "grad_norm": 5.445659637451172, "learning_rate": 4.709323992994746e-05, "loss": 1.0318, "num_input_tokens_seen": 121645128, "step": 7560 }, { "epoch": 0.5296347859588292, "grad_norm": 3.8428354263305664, "learning_rate": 4.708624168126095e-05, "loss": 1.0299, "num_input_tokens_seen": 121661504, "step": 7561 }, { "epoch": 0.5297048342045584, "grad_norm": 3.609997272491455, "learning_rate": 4.707924343257444e-05, "loss": 1.0052, "num_input_tokens_seen": 121677888, "step": 7562 }, { "epoch": 0.5297748824502876, "grad_norm": 4.154750823974609, "learning_rate": 4.707224518388792e-05, "loss": 1.0849, "num_input_tokens_seen": 121693328, "step": 7563 }, { "epoch": 0.5298449306960169, "grad_norm": 4.595134735107422, "learning_rate": 4.706524693520141e-05, "loss": 0.9786, "num_input_tokens_seen": 121709512, "step": 7564 }, { "epoch": 0.5299149789417461, "grad_norm": 3.6196346282958984, "learning_rate": 4.7058248686514885e-05, "loss": 1.0239, "num_input_tokens_seen": 121725272, "step": 7565 }, { "epoch": 0.5299850271874754, "grad_norm": 3.8893964290618896, "learning_rate": 4.7051250437828374e-05, "loss": 0.9623, "num_input_tokens_seen": 121741656, "step": 7566 }, { "epoch": 0.5300550754332046, "grad_norm": 3.8492813110351562, "learning_rate": 4.7044252189141856e-05, "loss": 1.0309, "num_input_tokens_seen": 121758040, "step": 7567 }, { "epoch": 0.5301251236789338, "grad_norm": 3.920822858810425, "learning_rate": 4.7037253940455344e-05, "loss": 1.0483, "num_input_tokens_seen": 121773736, "step": 7568 }, { "epoch": 0.5301951719246631, "grad_norm": 5.269485950469971, "learning_rate": 4.703025569176883e-05, "loss": 1.0504, "num_input_tokens_seen": 121790120, "step": 7569 }, { "epoch": 0.5302652201703923, "grad_norm": 4.920991897583008, "learning_rate": 4.7023257443082315e-05, "loss": 0.8157, "num_input_tokens_seen": 121806504, "step": 7570 }, { "epoch": 0.5303352684161216, "grad_norm": 3.8457534313201904, "learning_rate": 4.70162591943958e-05, "loss": 1.0428, "num_input_tokens_seen": 121822256, "step": 7571 }, { "epoch": 0.5304053166618509, "grad_norm": 3.80702543258667, "learning_rate": 4.700926094570928e-05, "loss": 1.1466, "num_input_tokens_seen": 121838640, "step": 7572 }, { "epoch": 0.5304753649075801, "grad_norm": 3.8683180809020996, "learning_rate": 4.7002262697022774e-05, "loss": 0.8946, "num_input_tokens_seen": 121854704, "step": 7573 }, { "epoch": 0.5305454131533094, "grad_norm": 6.011785507202148, "learning_rate": 4.699526444833626e-05, "loss": 1.0269, "num_input_tokens_seen": 121871088, "step": 7574 }, { "epoch": 0.5306154613990386, "grad_norm": 4.970396995544434, "learning_rate": 4.698826619964974e-05, "loss": 0.9285, "num_input_tokens_seen": 121887240, "step": 7575 }, { "epoch": 0.5306855096447678, "grad_norm": 4.267600059509277, "learning_rate": 4.6981267950963226e-05, "loss": 1.2008, "num_input_tokens_seen": 121903624, "step": 7576 }, { "epoch": 0.5307555578904971, "grad_norm": 4.618432998657227, "learning_rate": 4.697426970227671e-05, "loss": 1.0197, "num_input_tokens_seen": 121919656, "step": 7577 }, { "epoch": 0.5308256061362263, "grad_norm": 3.775972366333008, "learning_rate": 4.6967271453590196e-05, "loss": 0.9733, "num_input_tokens_seen": 121935992, "step": 7578 }, { "epoch": 0.5308956543819555, "grad_norm": 4.6172356605529785, "learning_rate": 4.6960273204903685e-05, "loss": 1.1194, "num_input_tokens_seen": 121952376, "step": 7579 }, { "epoch": 0.5309657026276848, "grad_norm": 4.867498874664307, "learning_rate": 4.695327495621717e-05, "loss": 1.1188, "num_input_tokens_seen": 121968760, "step": 7580 }, { "epoch": 0.531035750873414, "grad_norm": 4.13311767578125, "learning_rate": 4.6946276707530655e-05, "loss": 1.0245, "num_input_tokens_seen": 121984304, "step": 7581 }, { "epoch": 0.5311057991191434, "grad_norm": 3.957585573196411, "learning_rate": 4.693927845884413e-05, "loss": 0.8689, "num_input_tokens_seen": 122000656, "step": 7582 }, { "epoch": 0.5311758473648726, "grad_norm": 4.368579387664795, "learning_rate": 4.693228021015762e-05, "loss": 0.996, "num_input_tokens_seen": 122016576, "step": 7583 }, { "epoch": 0.5312458956106018, "grad_norm": 3.653543710708618, "learning_rate": 4.6925281961471114e-05, "loss": 1.034, "num_input_tokens_seen": 122032784, "step": 7584 }, { "epoch": 0.5313159438563311, "grad_norm": 3.5882821083068848, "learning_rate": 4.691828371278459e-05, "loss": 0.8917, "num_input_tokens_seen": 122049120, "step": 7585 }, { "epoch": 0.5313859921020603, "grad_norm": 3.252802848815918, "learning_rate": 4.691128546409808e-05, "loss": 0.9816, "num_input_tokens_seen": 122065504, "step": 7586 }, { "epoch": 0.5314560403477896, "grad_norm": 3.5019781589508057, "learning_rate": 4.690428721541156e-05, "loss": 0.912, "num_input_tokens_seen": 122080800, "step": 7587 }, { "epoch": 0.5315260885935188, "grad_norm": 3.470921277999878, "learning_rate": 4.689728896672505e-05, "loss": 1.0494, "num_input_tokens_seen": 122097184, "step": 7588 }, { "epoch": 0.531596136839248, "grad_norm": 3.5450100898742676, "learning_rate": 4.689029071803854e-05, "loss": 0.9291, "num_input_tokens_seen": 122113568, "step": 7589 }, { "epoch": 0.5316661850849773, "grad_norm": 4.1188578605651855, "learning_rate": 4.688329246935202e-05, "loss": 1.0393, "num_input_tokens_seen": 122129800, "step": 7590 }, { "epoch": 0.5317362333307065, "grad_norm": 4.097812175750732, "learning_rate": 4.687629422066551e-05, "loss": 1.0745, "num_input_tokens_seen": 122145784, "step": 7591 }, { "epoch": 0.5318062815764357, "grad_norm": 3.929668664932251, "learning_rate": 4.686929597197898e-05, "loss": 1.0934, "num_input_tokens_seen": 122162168, "step": 7592 }, { "epoch": 0.531876329822165, "grad_norm": 5.706707954406738, "learning_rate": 4.686229772329247e-05, "loss": 1.007, "num_input_tokens_seen": 122178552, "step": 7593 }, { "epoch": 0.5319463780678942, "grad_norm": 4.066921234130859, "learning_rate": 4.685529947460595e-05, "loss": 0.9495, "num_input_tokens_seen": 122194936, "step": 7594 }, { "epoch": 0.5320164263136236, "grad_norm": 3.8690404891967773, "learning_rate": 4.684830122591944e-05, "loss": 1.2008, "num_input_tokens_seen": 122211184, "step": 7595 }, { "epoch": 0.5320864745593528, "grad_norm": 5.118635654449463, "learning_rate": 4.684130297723293e-05, "loss": 0.9476, "num_input_tokens_seen": 122227152, "step": 7596 }, { "epoch": 0.532156522805082, "grad_norm": 3.9654159545898438, "learning_rate": 4.683430472854641e-05, "loss": 1.1422, "num_input_tokens_seen": 122242952, "step": 7597 }, { "epoch": 0.5322265710508113, "grad_norm": 5.006682872772217, "learning_rate": 4.68273064798599e-05, "loss": 1.2078, "num_input_tokens_seen": 122259104, "step": 7598 }, { "epoch": 0.5322966192965405, "grad_norm": 4.747622013092041, "learning_rate": 4.6820308231173375e-05, "loss": 1.3191, "num_input_tokens_seen": 122274320, "step": 7599 }, { "epoch": 0.5323666675422697, "grad_norm": 4.334118843078613, "learning_rate": 4.681330998248687e-05, "loss": 0.9368, "num_input_tokens_seen": 122290376, "step": 7600 }, { "epoch": 0.5323666675422697, "eval_loss": 1.120592474937439, "eval_runtime": 0.4292, "eval_samples_per_second": 2.33, "eval_steps_per_second": 2.33, "num_input_tokens_seen": 122290376, "step": 7600 }, { "epoch": 0.532436715787999, "grad_norm": 3.4493050575256348, "learning_rate": 4.680631173380036e-05, "loss": 0.9147, "num_input_tokens_seen": 122306760, "step": 7601 }, { "epoch": 0.5325067640337282, "grad_norm": 3.933213949203491, "learning_rate": 4.6799313485113834e-05, "loss": 1.2942, "num_input_tokens_seen": 122322784, "step": 7602 }, { "epoch": 0.5325768122794575, "grad_norm": 3.874788284301758, "learning_rate": 4.679231523642732e-05, "loss": 1.2438, "num_input_tokens_seen": 122339168, "step": 7603 }, { "epoch": 0.5326468605251867, "grad_norm": 4.443728446960449, "learning_rate": 4.6785316987740805e-05, "loss": 1.138, "num_input_tokens_seen": 122355552, "step": 7604 }, { "epoch": 0.5327169087709159, "grad_norm": 3.7982730865478516, "learning_rate": 4.677831873905429e-05, "loss": 1.105, "num_input_tokens_seen": 122371936, "step": 7605 }, { "epoch": 0.5327869570166452, "grad_norm": 3.4133870601654053, "learning_rate": 4.677132049036778e-05, "loss": 1.0804, "num_input_tokens_seen": 122388192, "step": 7606 }, { "epoch": 0.5328570052623745, "grad_norm": 5.178568363189697, "learning_rate": 4.6764322241681264e-05, "loss": 0.9356, "num_input_tokens_seen": 122404576, "step": 7607 }, { "epoch": 0.5329270535081037, "grad_norm": 3.9049737453460693, "learning_rate": 4.675732399299475e-05, "loss": 0.9998, "num_input_tokens_seen": 122420960, "step": 7608 }, { "epoch": 0.532997101753833, "grad_norm": 3.8209729194641113, "learning_rate": 4.675032574430823e-05, "loss": 0.9225, "num_input_tokens_seen": 122436808, "step": 7609 }, { "epoch": 0.5330671499995622, "grad_norm": 5.2931389808654785, "learning_rate": 4.674332749562172e-05, "loss": 1.225, "num_input_tokens_seen": 122453192, "step": 7610 }, { "epoch": 0.5331371982452915, "grad_norm": 3.608839273452759, "learning_rate": 4.673632924693521e-05, "loss": 1.0053, "num_input_tokens_seen": 122469576, "step": 7611 }, { "epoch": 0.5332072464910207, "grad_norm": 3.75544810295105, "learning_rate": 4.6729330998248686e-05, "loss": 1.0932, "num_input_tokens_seen": 122485960, "step": 7612 }, { "epoch": 0.5332772947367499, "grad_norm": 4.498108863830566, "learning_rate": 4.6722332749562175e-05, "loss": 1.1613, "num_input_tokens_seen": 122502344, "step": 7613 }, { "epoch": 0.5333473429824792, "grad_norm": 7.5673909187316895, "learning_rate": 4.671533450087566e-05, "loss": 1.1696, "num_input_tokens_seen": 122518728, "step": 7614 }, { "epoch": 0.5334173912282084, "grad_norm": 6.213915824890137, "learning_rate": 4.6708336252189145e-05, "loss": 0.9305, "num_input_tokens_seen": 122534992, "step": 7615 }, { "epoch": 0.5334874394739376, "grad_norm": 3.739473342895508, "learning_rate": 4.6701338003502634e-05, "loss": 1.0961, "num_input_tokens_seen": 122551376, "step": 7616 }, { "epoch": 0.5335574877196669, "grad_norm": 4.1125617027282715, "learning_rate": 4.6694339754816116e-05, "loss": 1.0056, "num_input_tokens_seen": 122567760, "step": 7617 }, { "epoch": 0.5336275359653961, "grad_norm": 3.5337769985198975, "learning_rate": 4.6687341506129604e-05, "loss": 0.9145, "num_input_tokens_seen": 122583992, "step": 7618 }, { "epoch": 0.5336975842111255, "grad_norm": 3.6304452419281006, "learning_rate": 4.668034325744308e-05, "loss": 1.0398, "num_input_tokens_seen": 122600376, "step": 7619 }, { "epoch": 0.5337676324568547, "grad_norm": 4.323266983032227, "learning_rate": 4.6673345008756575e-05, "loss": 1.1597, "num_input_tokens_seen": 122616760, "step": 7620 }, { "epoch": 0.5338376807025839, "grad_norm": 3.598428249359131, "learning_rate": 4.666634676007005e-05, "loss": 1.0098, "num_input_tokens_seen": 122633144, "step": 7621 }, { "epoch": 0.5339077289483132, "grad_norm": 4.279503345489502, "learning_rate": 4.665934851138354e-05, "loss": 1.1214, "num_input_tokens_seen": 122649528, "step": 7622 }, { "epoch": 0.5339777771940424, "grad_norm": 5.056297302246094, "learning_rate": 4.665235026269703e-05, "loss": 1.0469, "num_input_tokens_seen": 122665912, "step": 7623 }, { "epoch": 0.5340478254397717, "grad_norm": 7.37079381942749, "learning_rate": 4.664535201401051e-05, "loss": 1.168, "num_input_tokens_seen": 122681968, "step": 7624 }, { "epoch": 0.5341178736855009, "grad_norm": 4.478328227996826, "learning_rate": 4.6638353765324e-05, "loss": 1.2127, "num_input_tokens_seen": 122698352, "step": 7625 }, { "epoch": 0.5341879219312301, "grad_norm": 5.164111614227295, "learning_rate": 4.663135551663748e-05, "loss": 1.0905, "num_input_tokens_seen": 122714736, "step": 7626 }, { "epoch": 0.5342579701769594, "grad_norm": 6.489926815032959, "learning_rate": 4.662435726795097e-05, "loss": 0.95, "num_input_tokens_seen": 122731120, "step": 7627 }, { "epoch": 0.5343280184226886, "grad_norm": 5.209092140197754, "learning_rate": 4.6617359019264456e-05, "loss": 1.117, "num_input_tokens_seen": 122747504, "step": 7628 }, { "epoch": 0.5343980666684178, "grad_norm": 4.007065773010254, "learning_rate": 4.661036077057793e-05, "loss": 0.9271, "num_input_tokens_seen": 122763864, "step": 7629 }, { "epoch": 0.5344681149141471, "grad_norm": 4.077016830444336, "learning_rate": 4.660336252189143e-05, "loss": 1.1052, "num_input_tokens_seen": 122779808, "step": 7630 }, { "epoch": 0.5345381631598763, "grad_norm": 3.764261245727539, "learning_rate": 4.65963642732049e-05, "loss": 1.1111, "num_input_tokens_seen": 122796192, "step": 7631 }, { "epoch": 0.5346082114056057, "grad_norm": 3.8028204441070557, "learning_rate": 4.658936602451839e-05, "loss": 0.9435, "num_input_tokens_seen": 122812576, "step": 7632 }, { "epoch": 0.5346782596513349, "grad_norm": 4.397709846496582, "learning_rate": 4.658236777583188e-05, "loss": 0.9155, "num_input_tokens_seen": 122828048, "step": 7633 }, { "epoch": 0.5347483078970641, "grad_norm": 4.138652324676514, "learning_rate": 4.657536952714536e-05, "loss": 0.9613, "num_input_tokens_seen": 122842976, "step": 7634 }, { "epoch": 0.5348183561427934, "grad_norm": 3.9423744678497314, "learning_rate": 4.656837127845885e-05, "loss": 1.0464, "num_input_tokens_seen": 122859360, "step": 7635 }, { "epoch": 0.5348884043885226, "grad_norm": 4.257975101470947, "learning_rate": 4.656137302977233e-05, "loss": 1.1539, "num_input_tokens_seen": 122874888, "step": 7636 }, { "epoch": 0.5349584526342518, "grad_norm": 4.364237308502197, "learning_rate": 4.655437478108582e-05, "loss": 1.0326, "num_input_tokens_seen": 122891272, "step": 7637 }, { "epoch": 0.5350285008799811, "grad_norm": 4.238529205322266, "learning_rate": 4.654737653239931e-05, "loss": 1.2226, "num_input_tokens_seen": 122906408, "step": 7638 }, { "epoch": 0.5350985491257103, "grad_norm": 3.609213352203369, "learning_rate": 4.654037828371278e-05, "loss": 0.9925, "num_input_tokens_seen": 122922792, "step": 7639 }, { "epoch": 0.5351685973714396, "grad_norm": 3.5299439430236816, "learning_rate": 4.653338003502628e-05, "loss": 0.9586, "num_input_tokens_seen": 122939120, "step": 7640 }, { "epoch": 0.5352386456171688, "grad_norm": 3.879683494567871, "learning_rate": 4.6526381786339754e-05, "loss": 1.1882, "num_input_tokens_seen": 122955504, "step": 7641 }, { "epoch": 0.535308693862898, "grad_norm": 3.414780616760254, "learning_rate": 4.651938353765324e-05, "loss": 1.0047, "num_input_tokens_seen": 122971848, "step": 7642 }, { "epoch": 0.5353787421086273, "grad_norm": 3.307396173477173, "learning_rate": 4.651238528896673e-05, "loss": 0.8771, "num_input_tokens_seen": 122987840, "step": 7643 }, { "epoch": 0.5354487903543566, "grad_norm": 3.41166353225708, "learning_rate": 4.650538704028021e-05, "loss": 0.8481, "num_input_tokens_seen": 123004224, "step": 7644 }, { "epoch": 0.5355188386000858, "grad_norm": 4.273513317108154, "learning_rate": 4.64983887915937e-05, "loss": 1.2176, "num_input_tokens_seen": 123020608, "step": 7645 }, { "epoch": 0.5355888868458151, "grad_norm": 6.728183269500732, "learning_rate": 4.649139054290718e-05, "loss": 1.0303, "num_input_tokens_seen": 123035928, "step": 7646 }, { "epoch": 0.5356589350915443, "grad_norm": 3.6993179321289062, "learning_rate": 4.648439229422067e-05, "loss": 0.9471, "num_input_tokens_seen": 123051784, "step": 7647 }, { "epoch": 0.5357289833372736, "grad_norm": 4.173643112182617, "learning_rate": 4.647739404553415e-05, "loss": 1.0996, "num_input_tokens_seen": 123068080, "step": 7648 }, { "epoch": 0.5357990315830028, "grad_norm": 4.235645294189453, "learning_rate": 4.6470395796847635e-05, "loss": 1.0112, "num_input_tokens_seen": 123084464, "step": 7649 }, { "epoch": 0.535869079828732, "grad_norm": 4.8372344970703125, "learning_rate": 4.646339754816113e-05, "loss": 0.9532, "num_input_tokens_seen": 123099480, "step": 7650 }, { "epoch": 0.5359391280744613, "grad_norm": 3.768519878387451, "learning_rate": 4.6456399299474606e-05, "loss": 1.0942, "num_input_tokens_seen": 123115592, "step": 7651 }, { "epoch": 0.5360091763201905, "grad_norm": 5.204262733459473, "learning_rate": 4.6449401050788094e-05, "loss": 0.954, "num_input_tokens_seen": 123131976, "step": 7652 }, { "epoch": 0.5360792245659198, "grad_norm": 3.371913194656372, "learning_rate": 4.6442402802101576e-05, "loss": 0.8998, "num_input_tokens_seen": 123148360, "step": 7653 }, { "epoch": 0.536149272811649, "grad_norm": 6.089724540710449, "learning_rate": 4.6435404553415065e-05, "loss": 0.9717, "num_input_tokens_seen": 123164744, "step": 7654 }, { "epoch": 0.5362193210573782, "grad_norm": 4.112463474273682, "learning_rate": 4.6428406304728553e-05, "loss": 1.1899, "num_input_tokens_seen": 123181128, "step": 7655 }, { "epoch": 0.5362893693031076, "grad_norm": 3.9375081062316895, "learning_rate": 4.6421408056042035e-05, "loss": 1.0524, "num_input_tokens_seen": 123197512, "step": 7656 }, { "epoch": 0.5363594175488368, "grad_norm": 4.459086894989014, "learning_rate": 4.6414409807355524e-05, "loss": 1.206, "num_input_tokens_seen": 123212816, "step": 7657 }, { "epoch": 0.536429465794566, "grad_norm": 4.026162147521973, "learning_rate": 4.6407411558669e-05, "loss": 0.8556, "num_input_tokens_seen": 123229200, "step": 7658 }, { "epoch": 0.5364995140402953, "grad_norm": 3.6091065406799316, "learning_rate": 4.640041330998249e-05, "loss": 1.0515, "num_input_tokens_seen": 123245360, "step": 7659 }, { "epoch": 0.5365695622860245, "grad_norm": 4.105917453765869, "learning_rate": 4.639341506129598e-05, "loss": 1.3174, "num_input_tokens_seen": 123261072, "step": 7660 }, { "epoch": 0.5366396105317538, "grad_norm": 4.645833492279053, "learning_rate": 4.638641681260946e-05, "loss": 1.0443, "num_input_tokens_seen": 123277216, "step": 7661 }, { "epoch": 0.536709658777483, "grad_norm": 4.012742519378662, "learning_rate": 4.6379418563922946e-05, "loss": 1.0339, "num_input_tokens_seen": 123293336, "step": 7662 }, { "epoch": 0.5367797070232122, "grad_norm": 3.9238698482513428, "learning_rate": 4.637242031523643e-05, "loss": 1.0772, "num_input_tokens_seen": 123309720, "step": 7663 }, { "epoch": 0.5368497552689415, "grad_norm": 3.3745267391204834, "learning_rate": 4.636542206654992e-05, "loss": 1.0389, "num_input_tokens_seen": 123326104, "step": 7664 }, { "epoch": 0.5369198035146707, "grad_norm": 4.080345630645752, "learning_rate": 4.6358423817863405e-05, "loss": 1.0941, "num_input_tokens_seen": 123342280, "step": 7665 }, { "epoch": 0.5369898517603999, "grad_norm": 4.973495006561279, "learning_rate": 4.635142556917689e-05, "loss": 0.9355, "num_input_tokens_seen": 123358664, "step": 7666 }, { "epoch": 0.5370599000061292, "grad_norm": 3.8065357208251953, "learning_rate": 4.6344427320490376e-05, "loss": 1.0071, "num_input_tokens_seen": 123374584, "step": 7667 }, { "epoch": 0.5371299482518584, "grad_norm": 4.49127721786499, "learning_rate": 4.633742907180385e-05, "loss": 1.0562, "num_input_tokens_seen": 123390208, "step": 7668 }, { "epoch": 0.5371999964975878, "grad_norm": 4.231927394866943, "learning_rate": 4.633043082311734e-05, "loss": 1.0988, "num_input_tokens_seen": 123406592, "step": 7669 }, { "epoch": 0.537270044743317, "grad_norm": 3.7635555267333984, "learning_rate": 4.6323432574430835e-05, "loss": 1.1342, "num_input_tokens_seen": 123422976, "step": 7670 }, { "epoch": 0.5373400929890462, "grad_norm": 3.9398446083068848, "learning_rate": 4.631643432574431e-05, "loss": 1.1557, "num_input_tokens_seen": 123439184, "step": 7671 }, { "epoch": 0.5374101412347755, "grad_norm": 3.7720675468444824, "learning_rate": 4.63094360770578e-05, "loss": 1.1092, "num_input_tokens_seen": 123455568, "step": 7672 }, { "epoch": 0.5374801894805047, "grad_norm": 5.23007869720459, "learning_rate": 4.630243782837128e-05, "loss": 1.0874, "num_input_tokens_seen": 123471952, "step": 7673 }, { "epoch": 0.5375502377262339, "grad_norm": 4.356583118438721, "learning_rate": 4.629543957968477e-05, "loss": 1.1307, "num_input_tokens_seen": 123487528, "step": 7674 }, { "epoch": 0.5376202859719632, "grad_norm": 3.71581768989563, "learning_rate": 4.6288441330998244e-05, "loss": 0.9518, "num_input_tokens_seen": 123503216, "step": 7675 }, { "epoch": 0.5376903342176924, "grad_norm": 3.9850363731384277, "learning_rate": 4.628144308231174e-05, "loss": 1.0477, "num_input_tokens_seen": 123519600, "step": 7676 }, { "epoch": 0.5377603824634217, "grad_norm": 3.9007675647735596, "learning_rate": 4.627444483362523e-05, "loss": 1.1008, "num_input_tokens_seen": 123535952, "step": 7677 }, { "epoch": 0.5378304307091509, "grad_norm": 4.420581340789795, "learning_rate": 4.62674465849387e-05, "loss": 1.1284, "num_input_tokens_seen": 123552336, "step": 7678 }, { "epoch": 0.5379004789548801, "grad_norm": 3.788006067276001, "learning_rate": 4.626044833625219e-05, "loss": 1.0635, "num_input_tokens_seen": 123568232, "step": 7679 }, { "epoch": 0.5379705272006094, "grad_norm": 3.8466997146606445, "learning_rate": 4.625345008756567e-05, "loss": 1.0154, "num_input_tokens_seen": 123583680, "step": 7680 }, { "epoch": 0.5380405754463387, "grad_norm": 4.214776515960693, "learning_rate": 4.624645183887916e-05, "loss": 0.9577, "num_input_tokens_seen": 123600064, "step": 7681 }, { "epoch": 0.5381106236920679, "grad_norm": 4.797380447387695, "learning_rate": 4.623945359019265e-05, "loss": 0.9281, "num_input_tokens_seen": 123615880, "step": 7682 }, { "epoch": 0.5381806719377972, "grad_norm": 3.3792150020599365, "learning_rate": 4.623245534150613e-05, "loss": 0.97, "num_input_tokens_seen": 123632264, "step": 7683 }, { "epoch": 0.5382507201835264, "grad_norm": 4.794241428375244, "learning_rate": 4.622545709281962e-05, "loss": 0.9202, "num_input_tokens_seen": 123647104, "step": 7684 }, { "epoch": 0.5383207684292557, "grad_norm": 3.86734676361084, "learning_rate": 4.6218458844133096e-05, "loss": 1.2309, "num_input_tokens_seen": 123662976, "step": 7685 }, { "epoch": 0.5383908166749849, "grad_norm": 4.570960998535156, "learning_rate": 4.621146059544659e-05, "loss": 1.1882, "num_input_tokens_seen": 123678648, "step": 7686 }, { "epoch": 0.5384608649207141, "grad_norm": 4.53627347946167, "learning_rate": 4.620446234676008e-05, "loss": 0.996, "num_input_tokens_seen": 123695032, "step": 7687 }, { "epoch": 0.5385309131664434, "grad_norm": 3.517305612564087, "learning_rate": 4.6197464098073555e-05, "loss": 0.9094, "num_input_tokens_seen": 123711416, "step": 7688 }, { "epoch": 0.5386009614121726, "grad_norm": 3.955936908721924, "learning_rate": 4.6190465849387043e-05, "loss": 1.0099, "num_input_tokens_seen": 123727800, "step": 7689 }, { "epoch": 0.5386710096579019, "grad_norm": 3.3592917919158936, "learning_rate": 4.6183467600700525e-05, "loss": 1.0007, "num_input_tokens_seen": 123744184, "step": 7690 }, { "epoch": 0.5387410579036311, "grad_norm": 3.4240005016326904, "learning_rate": 4.6176469352014014e-05, "loss": 0.8842, "num_input_tokens_seen": 123760544, "step": 7691 }, { "epoch": 0.5388111061493603, "grad_norm": 4.404487609863281, "learning_rate": 4.61694711033275e-05, "loss": 1.2803, "num_input_tokens_seen": 123776928, "step": 7692 }, { "epoch": 0.5388811543950897, "grad_norm": 3.729642868041992, "learning_rate": 4.6162472854640984e-05, "loss": 1.0132, "num_input_tokens_seen": 123793312, "step": 7693 }, { "epoch": 0.5389512026408189, "grad_norm": 4.076940536499023, "learning_rate": 4.615547460595447e-05, "loss": 0.9274, "num_input_tokens_seen": 123809696, "step": 7694 }, { "epoch": 0.5390212508865481, "grad_norm": 3.436220407485962, "learning_rate": 4.614847635726795e-05, "loss": 1.0113, "num_input_tokens_seen": 123826080, "step": 7695 }, { "epoch": 0.5390912991322774, "grad_norm": 3.4559690952301025, "learning_rate": 4.614147810858144e-05, "loss": 1.0622, "num_input_tokens_seen": 123841856, "step": 7696 }, { "epoch": 0.5391613473780066, "grad_norm": 4.085431098937988, "learning_rate": 4.613447985989493e-05, "loss": 1.0832, "num_input_tokens_seen": 123858240, "step": 7697 }, { "epoch": 0.5392313956237359, "grad_norm": 4.373138427734375, "learning_rate": 4.612748161120841e-05, "loss": 1.0914, "num_input_tokens_seen": 123874168, "step": 7698 }, { "epoch": 0.5393014438694651, "grad_norm": 4.127585411071777, "learning_rate": 4.6120483362521896e-05, "loss": 1.1582, "num_input_tokens_seen": 123890552, "step": 7699 }, { "epoch": 0.5393714921151943, "grad_norm": 3.53139328956604, "learning_rate": 4.611348511383538e-05, "loss": 1.0876, "num_input_tokens_seen": 123906936, "step": 7700 }, { "epoch": 0.5394415403609236, "grad_norm": 3.8704042434692383, "learning_rate": 4.6106486865148866e-05, "loss": 0.9507, "num_input_tokens_seen": 123922640, "step": 7701 }, { "epoch": 0.5395115886066528, "grad_norm": 5.1702189445495605, "learning_rate": 4.609948861646235e-05, "loss": 1.1272, "num_input_tokens_seen": 123938520, "step": 7702 }, { "epoch": 0.539581636852382, "grad_norm": 4.382580280303955, "learning_rate": 4.6092490367775836e-05, "loss": 1.1366, "num_input_tokens_seen": 123953728, "step": 7703 }, { "epoch": 0.5396516850981113, "grad_norm": 4.136844635009766, "learning_rate": 4.6085492119089325e-05, "loss": 0.9004, "num_input_tokens_seen": 123970112, "step": 7704 }, { "epoch": 0.5397217333438405, "grad_norm": 3.9517366886138916, "learning_rate": 4.60784938704028e-05, "loss": 1.097, "num_input_tokens_seen": 123986104, "step": 7705 }, { "epoch": 0.5397917815895699, "grad_norm": 3.5815629959106445, "learning_rate": 4.6071495621716295e-05, "loss": 1.1862, "num_input_tokens_seen": 124002488, "step": 7706 }, { "epoch": 0.5398618298352991, "grad_norm": 3.8689863681793213, "learning_rate": 4.606449737302977e-05, "loss": 1.0321, "num_input_tokens_seen": 124018184, "step": 7707 }, { "epoch": 0.5399318780810283, "grad_norm": 3.911912679672241, "learning_rate": 4.605749912434326e-05, "loss": 1.2491, "num_input_tokens_seen": 124034568, "step": 7708 }, { "epoch": 0.5400019263267576, "grad_norm": 4.168681621551514, "learning_rate": 4.605050087565675e-05, "loss": 0.953, "num_input_tokens_seen": 124050952, "step": 7709 }, { "epoch": 0.5400719745724868, "grad_norm": 3.850926160812378, "learning_rate": 4.604350262697023e-05, "loss": 0.9641, "num_input_tokens_seen": 124066688, "step": 7710 }, { "epoch": 0.540142022818216, "grad_norm": 4.872866630554199, "learning_rate": 4.603650437828372e-05, "loss": 1.0421, "num_input_tokens_seen": 124083072, "step": 7711 }, { "epoch": 0.5402120710639453, "grad_norm": 3.4104743003845215, "learning_rate": 4.60295061295972e-05, "loss": 0.9064, "num_input_tokens_seen": 124099144, "step": 7712 }, { "epoch": 0.5402821193096745, "grad_norm": 4.460788249969482, "learning_rate": 4.602250788091069e-05, "loss": 1.258, "num_input_tokens_seen": 124115528, "step": 7713 }, { "epoch": 0.5403521675554038, "grad_norm": 4.264237880706787, "learning_rate": 4.601550963222418e-05, "loss": 0.9638, "num_input_tokens_seen": 124131080, "step": 7714 }, { "epoch": 0.540422215801133, "grad_norm": 3.436184883117676, "learning_rate": 4.600851138353765e-05, "loss": 1.0876, "num_input_tokens_seen": 124147464, "step": 7715 }, { "epoch": 0.5404922640468622, "grad_norm": 5.596844673156738, "learning_rate": 4.600151313485115e-05, "loss": 0.8719, "num_input_tokens_seen": 124163848, "step": 7716 }, { "epoch": 0.5405623122925916, "grad_norm": 3.6911396980285645, "learning_rate": 4.599451488616462e-05, "loss": 1.0812, "num_input_tokens_seen": 124180000, "step": 7717 }, { "epoch": 0.5406323605383208, "grad_norm": 5.705863952636719, "learning_rate": 4.598751663747811e-05, "loss": 1.0574, "num_input_tokens_seen": 124196384, "step": 7718 }, { "epoch": 0.54070240878405, "grad_norm": 3.699828863143921, "learning_rate": 4.59805183887916e-05, "loss": 1.0859, "num_input_tokens_seen": 124212768, "step": 7719 }, { "epoch": 0.5407724570297793, "grad_norm": 4.548577785491943, "learning_rate": 4.597352014010508e-05, "loss": 1.1576, "num_input_tokens_seen": 124228520, "step": 7720 }, { "epoch": 0.5408425052755085, "grad_norm": 3.9667632579803467, "learning_rate": 4.596652189141857e-05, "loss": 0.8426, "num_input_tokens_seen": 124244312, "step": 7721 }, { "epoch": 0.5409125535212378, "grad_norm": 5.471269130706787, "learning_rate": 4.595952364273205e-05, "loss": 1.154, "num_input_tokens_seen": 124260448, "step": 7722 }, { "epoch": 0.540982601766967, "grad_norm": 3.6126010417938232, "learning_rate": 4.595252539404554e-05, "loss": 0.9946, "num_input_tokens_seen": 124276776, "step": 7723 }, { "epoch": 0.5410526500126962, "grad_norm": 3.4256107807159424, "learning_rate": 4.594552714535903e-05, "loss": 0.8039, "num_input_tokens_seen": 124293160, "step": 7724 }, { "epoch": 0.5411226982584255, "grad_norm": 4.028780937194824, "learning_rate": 4.5938528896672504e-05, "loss": 0.986, "num_input_tokens_seen": 124308648, "step": 7725 }, { "epoch": 0.5411927465041547, "grad_norm": 4.520470142364502, "learning_rate": 4.5931530647986e-05, "loss": 1.2179, "num_input_tokens_seen": 124324512, "step": 7726 }, { "epoch": 0.541262794749884, "grad_norm": 6.498549938201904, "learning_rate": 4.5924532399299474e-05, "loss": 0.9941, "num_input_tokens_seen": 124340888, "step": 7727 }, { "epoch": 0.5413328429956132, "grad_norm": 6.183528900146484, "learning_rate": 4.591753415061296e-05, "loss": 0.9104, "num_input_tokens_seen": 124355024, "step": 7728 }, { "epoch": 0.5414028912413424, "grad_norm": 4.973779201507568, "learning_rate": 4.5910535901926445e-05, "loss": 1.0977, "num_input_tokens_seen": 124370424, "step": 7729 }, { "epoch": 0.5414729394870718, "grad_norm": 11.01496410369873, "learning_rate": 4.5903537653239933e-05, "loss": 1.2337, "num_input_tokens_seen": 124386808, "step": 7730 }, { "epoch": 0.541542987732801, "grad_norm": 5.617726802825928, "learning_rate": 4.589653940455342e-05, "loss": 1.1204, "num_input_tokens_seen": 124403192, "step": 7731 }, { "epoch": 0.5416130359785302, "grad_norm": 4.480281352996826, "learning_rate": 4.5889541155866904e-05, "loss": 1.1171, "num_input_tokens_seen": 124419576, "step": 7732 }, { "epoch": 0.5416830842242595, "grad_norm": 4.884644985198975, "learning_rate": 4.588254290718039e-05, "loss": 0.9479, "num_input_tokens_seen": 124435816, "step": 7733 }, { "epoch": 0.5417531324699887, "grad_norm": 3.540273666381836, "learning_rate": 4.587554465849387e-05, "loss": 1.1185, "num_input_tokens_seen": 124452200, "step": 7734 }, { "epoch": 0.541823180715718, "grad_norm": 3.7844882011413574, "learning_rate": 4.5868546409807356e-05, "loss": 0.8428, "num_input_tokens_seen": 124468584, "step": 7735 }, { "epoch": 0.5418932289614472, "grad_norm": 3.699333906173706, "learning_rate": 4.586154816112085e-05, "loss": 1.087, "num_input_tokens_seen": 124484968, "step": 7736 }, { "epoch": 0.5419632772071764, "grad_norm": 4.426324844360352, "learning_rate": 4.5854549912434326e-05, "loss": 1.0829, "num_input_tokens_seen": 124501352, "step": 7737 }, { "epoch": 0.5420333254529057, "grad_norm": 3.796420097351074, "learning_rate": 4.5847551663747815e-05, "loss": 0.8883, "num_input_tokens_seen": 124517328, "step": 7738 }, { "epoch": 0.5421033736986349, "grad_norm": 4.042966842651367, "learning_rate": 4.58405534150613e-05, "loss": 1.0737, "num_input_tokens_seen": 124533408, "step": 7739 }, { "epoch": 0.5421734219443641, "grad_norm": 4.4333977699279785, "learning_rate": 4.5833555166374785e-05, "loss": 1.0786, "num_input_tokens_seen": 124549504, "step": 7740 }, { "epoch": 0.5422434701900934, "grad_norm": 3.791276216506958, "learning_rate": 4.5826556917688274e-05, "loss": 0.9739, "num_input_tokens_seen": 124564864, "step": 7741 }, { "epoch": 0.5423135184358227, "grad_norm": 3.6679089069366455, "learning_rate": 4.5819558669001756e-05, "loss": 0.9717, "num_input_tokens_seen": 124581248, "step": 7742 }, { "epoch": 0.542383566681552, "grad_norm": 4.028548717498779, "learning_rate": 4.5812560420315244e-05, "loss": 1.2381, "num_input_tokens_seen": 124597632, "step": 7743 }, { "epoch": 0.5424536149272812, "grad_norm": 4.555594444274902, "learning_rate": 4.580556217162872e-05, "loss": 0.9839, "num_input_tokens_seen": 124614016, "step": 7744 }, { "epoch": 0.5425236631730104, "grad_norm": 4.0034589767456055, "learning_rate": 4.579856392294221e-05, "loss": 1.1126, "num_input_tokens_seen": 124629696, "step": 7745 }, { "epoch": 0.5425937114187397, "grad_norm": 5.23121452331543, "learning_rate": 4.5791565674255703e-05, "loss": 1.2308, "num_input_tokens_seen": 124646080, "step": 7746 }, { "epoch": 0.5426637596644689, "grad_norm": 3.759575605392456, "learning_rate": 4.578456742556918e-05, "loss": 0.9166, "num_input_tokens_seen": 124662464, "step": 7747 }, { "epoch": 0.5427338079101981, "grad_norm": 3.4041309356689453, "learning_rate": 4.577756917688267e-05, "loss": 0.9725, "num_input_tokens_seen": 124678848, "step": 7748 }, { "epoch": 0.5428038561559274, "grad_norm": 4.347851276397705, "learning_rate": 4.577057092819615e-05, "loss": 0.9738, "num_input_tokens_seen": 124695232, "step": 7749 }, { "epoch": 0.5428739044016566, "grad_norm": 3.457156181335449, "learning_rate": 4.576357267950964e-05, "loss": 0.9348, "num_input_tokens_seen": 124711368, "step": 7750 }, { "epoch": 0.5429439526473859, "grad_norm": 4.432048320770264, "learning_rate": 4.5756574430823126e-05, "loss": 1.1717, "num_input_tokens_seen": 124727752, "step": 7751 }, { "epoch": 0.5430140008931151, "grad_norm": 3.337639570236206, "learning_rate": 4.574957618213661e-05, "loss": 0.7409, "num_input_tokens_seen": 124744136, "step": 7752 }, { "epoch": 0.5430840491388443, "grad_norm": 5.203801155090332, "learning_rate": 4.5742577933450096e-05, "loss": 1.111, "num_input_tokens_seen": 124760264, "step": 7753 }, { "epoch": 0.5431540973845737, "grad_norm": 4.642807960510254, "learning_rate": 4.573557968476357e-05, "loss": 1.0197, "num_input_tokens_seen": 124776456, "step": 7754 }, { "epoch": 0.5432241456303029, "grad_norm": 4.211435317993164, "learning_rate": 4.572858143607706e-05, "loss": 1.1239, "num_input_tokens_seen": 124792320, "step": 7755 }, { "epoch": 0.5432941938760322, "grad_norm": 4.980574607849121, "learning_rate": 4.572158318739054e-05, "loss": 1.3394, "num_input_tokens_seen": 124808704, "step": 7756 }, { "epoch": 0.5433642421217614, "grad_norm": 3.556262493133545, "learning_rate": 4.571458493870403e-05, "loss": 1.047, "num_input_tokens_seen": 124825088, "step": 7757 }, { "epoch": 0.5434342903674906, "grad_norm": 4.556125164031982, "learning_rate": 4.570758669001752e-05, "loss": 0.9926, "num_input_tokens_seen": 124841032, "step": 7758 }, { "epoch": 0.5435043386132199, "grad_norm": 5.515524864196777, "learning_rate": 4.5700588441331e-05, "loss": 1.0712, "num_input_tokens_seen": 124857416, "step": 7759 }, { "epoch": 0.5435743868589491, "grad_norm": 3.912358283996582, "learning_rate": 4.569359019264449e-05, "loss": 0.9536, "num_input_tokens_seen": 124873800, "step": 7760 }, { "epoch": 0.5436444351046783, "grad_norm": 3.7982399463653564, "learning_rate": 4.5686591943957965e-05, "loss": 1.009, "num_input_tokens_seen": 124890184, "step": 7761 }, { "epoch": 0.5437144833504076, "grad_norm": 3.991724967956543, "learning_rate": 4.567959369527146e-05, "loss": 0.9333, "num_input_tokens_seen": 124906568, "step": 7762 }, { "epoch": 0.5437845315961368, "grad_norm": 5.313719272613525, "learning_rate": 4.567259544658495e-05, "loss": 1.0243, "num_input_tokens_seen": 124922352, "step": 7763 }, { "epoch": 0.5438545798418661, "grad_norm": 3.796652317047119, "learning_rate": 4.5665597197898423e-05, "loss": 0.905, "num_input_tokens_seen": 124938472, "step": 7764 }, { "epoch": 0.5439246280875953, "grad_norm": 4.874033451080322, "learning_rate": 4.565859894921191e-05, "loss": 1.0194, "num_input_tokens_seen": 124954848, "step": 7765 }, { "epoch": 0.5439946763333245, "grad_norm": 3.8010215759277344, "learning_rate": 4.5651600700525394e-05, "loss": 1.092, "num_input_tokens_seen": 124970488, "step": 7766 }, { "epoch": 0.5440647245790539, "grad_norm": 4.711667060852051, "learning_rate": 4.564460245183888e-05, "loss": 1.0575, "num_input_tokens_seen": 124986656, "step": 7767 }, { "epoch": 0.5441347728247831, "grad_norm": 5.9820356369018555, "learning_rate": 4.563760420315237e-05, "loss": 1.0015, "num_input_tokens_seen": 125003040, "step": 7768 }, { "epoch": 0.5442048210705123, "grad_norm": 4.217742443084717, "learning_rate": 4.563060595446585e-05, "loss": 0.907, "num_input_tokens_seen": 125019008, "step": 7769 }, { "epoch": 0.5442748693162416, "grad_norm": 3.3076283931732178, "learning_rate": 4.562360770577934e-05, "loss": 0.7289, "num_input_tokens_seen": 125035272, "step": 7770 }, { "epoch": 0.5443449175619708, "grad_norm": 3.409607172012329, "learning_rate": 4.5616609457092817e-05, "loss": 0.8898, "num_input_tokens_seen": 125051656, "step": 7771 }, { "epoch": 0.5444149658077001, "grad_norm": 5.260388374328613, "learning_rate": 4.560961120840631e-05, "loss": 0.9778, "num_input_tokens_seen": 125068040, "step": 7772 }, { "epoch": 0.5444850140534293, "grad_norm": 4.905508518218994, "learning_rate": 4.56026129597198e-05, "loss": 1.2631, "num_input_tokens_seen": 125084192, "step": 7773 }, { "epoch": 0.5445550622991585, "grad_norm": 4.701261043548584, "learning_rate": 4.5595614711033276e-05, "loss": 1.1305, "num_input_tokens_seen": 125098976, "step": 7774 }, { "epoch": 0.5446251105448878, "grad_norm": 4.822204113006592, "learning_rate": 4.5588616462346764e-05, "loss": 1.0407, "num_input_tokens_seen": 125114624, "step": 7775 }, { "epoch": 0.544695158790617, "grad_norm": 3.7025883197784424, "learning_rate": 4.5581618213660246e-05, "loss": 1.0469, "num_input_tokens_seen": 125130560, "step": 7776 }, { "epoch": 0.5447652070363462, "grad_norm": 4.998040676116943, "learning_rate": 4.5574619964973735e-05, "loss": 1.0568, "num_input_tokens_seen": 125146944, "step": 7777 }, { "epoch": 0.5448352552820755, "grad_norm": 3.457750082015991, "learning_rate": 4.5567621716287216e-05, "loss": 0.8956, "num_input_tokens_seen": 125162520, "step": 7778 }, { "epoch": 0.5449053035278048, "grad_norm": 3.417926788330078, "learning_rate": 4.5560623467600705e-05, "loss": 0.9547, "num_input_tokens_seen": 125178656, "step": 7779 }, { "epoch": 0.5449753517735341, "grad_norm": 4.075389385223389, "learning_rate": 4.5553625218914194e-05, "loss": 0.9655, "num_input_tokens_seen": 125195040, "step": 7780 }, { "epoch": 0.5450454000192633, "grad_norm": 4.12037992477417, "learning_rate": 4.554662697022767e-05, "loss": 1.3969, "num_input_tokens_seen": 125211424, "step": 7781 }, { "epoch": 0.5451154482649925, "grad_norm": 4.0456671714782715, "learning_rate": 4.5539628721541164e-05, "loss": 0.9851, "num_input_tokens_seen": 125227752, "step": 7782 }, { "epoch": 0.5451854965107218, "grad_norm": 4.972954273223877, "learning_rate": 4.553263047285464e-05, "loss": 1.0509, "num_input_tokens_seen": 125242792, "step": 7783 }, { "epoch": 0.545255544756451, "grad_norm": 3.667360544204712, "learning_rate": 4.552563222416813e-05, "loss": 1.194, "num_input_tokens_seen": 125259176, "step": 7784 }, { "epoch": 0.5453255930021802, "grad_norm": 3.54160737991333, "learning_rate": 4.5518633975481616e-05, "loss": 0.9758, "num_input_tokens_seen": 125275560, "step": 7785 }, { "epoch": 0.5453956412479095, "grad_norm": 3.7189040184020996, "learning_rate": 4.55116357267951e-05, "loss": 0.8118, "num_input_tokens_seen": 125291944, "step": 7786 }, { "epoch": 0.5454656894936387, "grad_norm": 3.435598611831665, "learning_rate": 4.5504637478108587e-05, "loss": 0.9343, "num_input_tokens_seen": 125307528, "step": 7787 }, { "epoch": 0.545535737739368, "grad_norm": 3.7623162269592285, "learning_rate": 4.549763922942207e-05, "loss": 1.2752, "num_input_tokens_seen": 125323816, "step": 7788 }, { "epoch": 0.5456057859850972, "grad_norm": 4.6416239738464355, "learning_rate": 4.549064098073556e-05, "loss": 1.0901, "num_input_tokens_seen": 125340200, "step": 7789 }, { "epoch": 0.5456758342308264, "grad_norm": 4.615113258361816, "learning_rate": 4.5483642732049046e-05, "loss": 0.977, "num_input_tokens_seen": 125356584, "step": 7790 }, { "epoch": 0.5457458824765558, "grad_norm": 3.8960089683532715, "learning_rate": 4.547664448336252e-05, "loss": 0.9974, "num_input_tokens_seen": 125372968, "step": 7791 }, { "epoch": 0.545815930722285, "grad_norm": 3.9642269611358643, "learning_rate": 4.5469646234676016e-05, "loss": 0.9801, "num_input_tokens_seen": 125389352, "step": 7792 }, { "epoch": 0.5458859789680143, "grad_norm": 5.444625377655029, "learning_rate": 4.546264798598949e-05, "loss": 1.1028, "num_input_tokens_seen": 125404624, "step": 7793 }, { "epoch": 0.5459560272137435, "grad_norm": 4.1974053382873535, "learning_rate": 4.545564973730298e-05, "loss": 0.9788, "num_input_tokens_seen": 125420944, "step": 7794 }, { "epoch": 0.5460260754594727, "grad_norm": 5.193080425262451, "learning_rate": 4.544865148861647e-05, "loss": 1.0504, "num_input_tokens_seen": 125437328, "step": 7795 }, { "epoch": 0.546096123705202, "grad_norm": 5.049325942993164, "learning_rate": 4.544165323992995e-05, "loss": 0.9977, "num_input_tokens_seen": 125452656, "step": 7796 }, { "epoch": 0.5461661719509312, "grad_norm": 4.1581549644470215, "learning_rate": 4.543465499124344e-05, "loss": 0.873, "num_input_tokens_seen": 125469040, "step": 7797 }, { "epoch": 0.5462362201966604, "grad_norm": 4.484875202178955, "learning_rate": 4.542765674255692e-05, "loss": 1.1117, "num_input_tokens_seen": 125485168, "step": 7798 }, { "epoch": 0.5463062684423897, "grad_norm": 4.153511047363281, "learning_rate": 4.542065849387041e-05, "loss": 1.2482, "num_input_tokens_seen": 125501336, "step": 7799 }, { "epoch": 0.5463763166881189, "grad_norm": 4.060786724090576, "learning_rate": 4.54136602451839e-05, "loss": 1.0232, "num_input_tokens_seen": 125517688, "step": 7800 }, { "epoch": 0.5463763166881189, "eval_loss": 1.121619462966919, "eval_runtime": 0.2054, "eval_samples_per_second": 4.868, "eval_steps_per_second": 4.868, "num_input_tokens_seen": 125517688, "step": 7800 }, { "epoch": 0.5464463649338482, "grad_norm": 5.799534797668457, "learning_rate": 4.540666199649737e-05, "loss": 1.2131, "num_input_tokens_seen": 125533408, "step": 7801 }, { "epoch": 0.5465164131795774, "grad_norm": 3.7412962913513184, "learning_rate": 4.539966374781087e-05, "loss": 1.0203, "num_input_tokens_seen": 125549792, "step": 7802 }, { "epoch": 0.5465864614253066, "grad_norm": 3.978907346725464, "learning_rate": 4.539266549912434e-05, "loss": 1.0106, "num_input_tokens_seen": 125565352, "step": 7803 }, { "epoch": 0.546656509671036, "grad_norm": 4.388980865478516, "learning_rate": 4.538566725043783e-05, "loss": 1.0467, "num_input_tokens_seen": 125580864, "step": 7804 }, { "epoch": 0.5467265579167652, "grad_norm": 3.432842969894409, "learning_rate": 4.5378669001751313e-05, "loss": 1.0141, "num_input_tokens_seen": 125597248, "step": 7805 }, { "epoch": 0.5467966061624944, "grad_norm": 4.419676303863525, "learning_rate": 4.53716707530648e-05, "loss": 1.053, "num_input_tokens_seen": 125613448, "step": 7806 }, { "epoch": 0.5468666544082237, "grad_norm": 4.717494964599609, "learning_rate": 4.536467250437829e-05, "loss": 0.9455, "num_input_tokens_seen": 125629728, "step": 7807 }, { "epoch": 0.5469367026539529, "grad_norm": 3.8088278770446777, "learning_rate": 4.535767425569177e-05, "loss": 1.0265, "num_input_tokens_seen": 125646112, "step": 7808 }, { "epoch": 0.5470067508996822, "grad_norm": 4.486949443817139, "learning_rate": 4.535067600700526e-05, "loss": 1.0894, "num_input_tokens_seen": 125662496, "step": 7809 }, { "epoch": 0.5470767991454114, "grad_norm": 4.220696926116943, "learning_rate": 4.5343677758318736e-05, "loss": 1.1683, "num_input_tokens_seen": 125678192, "step": 7810 }, { "epoch": 0.5471468473911406, "grad_norm": 3.5514204502105713, "learning_rate": 4.5336679509632225e-05, "loss": 0.934, "num_input_tokens_seen": 125693824, "step": 7811 }, { "epoch": 0.5472168956368699, "grad_norm": 4.971661567687988, "learning_rate": 4.532968126094572e-05, "loss": 1.0883, "num_input_tokens_seen": 125709184, "step": 7812 }, { "epoch": 0.5472869438825991, "grad_norm": 4.215356349945068, "learning_rate": 4.5322683012259195e-05, "loss": 1.1258, "num_input_tokens_seen": 125725568, "step": 7813 }, { "epoch": 0.5473569921283283, "grad_norm": 3.7598018646240234, "learning_rate": 4.5315684763572684e-05, "loss": 0.9901, "num_input_tokens_seen": 125741952, "step": 7814 }, { "epoch": 0.5474270403740576, "grad_norm": 4.145439147949219, "learning_rate": 4.5308686514886165e-05, "loss": 1.1545, "num_input_tokens_seen": 125757792, "step": 7815 }, { "epoch": 0.5474970886197869, "grad_norm": 4.645499229431152, "learning_rate": 4.5301688266199654e-05, "loss": 0.964, "num_input_tokens_seen": 125774176, "step": 7816 }, { "epoch": 0.5475671368655162, "grad_norm": 3.526381015777588, "learning_rate": 4.529469001751314e-05, "loss": 1.098, "num_input_tokens_seen": 125790264, "step": 7817 }, { "epoch": 0.5476371851112454, "grad_norm": 4.389588356018066, "learning_rate": 4.5287691768826624e-05, "loss": 1.2777, "num_input_tokens_seen": 125806648, "step": 7818 }, { "epoch": 0.5477072333569746, "grad_norm": 3.5831832885742188, "learning_rate": 4.528069352014011e-05, "loss": 1.1427, "num_input_tokens_seen": 125822848, "step": 7819 }, { "epoch": 0.5477772816027039, "grad_norm": 5.075149059295654, "learning_rate": 4.527369527145359e-05, "loss": 0.962, "num_input_tokens_seen": 125839232, "step": 7820 }, { "epoch": 0.5478473298484331, "grad_norm": 3.7986133098602295, "learning_rate": 4.526669702276708e-05, "loss": 1.0552, "num_input_tokens_seen": 125855616, "step": 7821 }, { "epoch": 0.5479173780941623, "grad_norm": 3.6170661449432373, "learning_rate": 4.525969877408057e-05, "loss": 0.9207, "num_input_tokens_seen": 125872000, "step": 7822 }, { "epoch": 0.5479874263398916, "grad_norm": 3.752514123916626, "learning_rate": 4.525270052539405e-05, "loss": 0.9744, "num_input_tokens_seen": 125888384, "step": 7823 }, { "epoch": 0.5480574745856208, "grad_norm": 3.609358072280884, "learning_rate": 4.5245702276707536e-05, "loss": 1.0688, "num_input_tokens_seen": 125904768, "step": 7824 }, { "epoch": 0.5481275228313501, "grad_norm": 5.24570369720459, "learning_rate": 4.523870402802102e-05, "loss": 1.0225, "num_input_tokens_seen": 125920592, "step": 7825 }, { "epoch": 0.5481975710770793, "grad_norm": 5.084728240966797, "learning_rate": 4.5231705779334506e-05, "loss": 1.0211, "num_input_tokens_seen": 125936976, "step": 7826 }, { "epoch": 0.5482676193228085, "grad_norm": 4.076999664306641, "learning_rate": 4.5224707530647995e-05, "loss": 1.0143, "num_input_tokens_seen": 125953048, "step": 7827 }, { "epoch": 0.5483376675685379, "grad_norm": 3.8320982456207275, "learning_rate": 4.5217709281961476e-05, "loss": 0.9229, "num_input_tokens_seen": 125969432, "step": 7828 }, { "epoch": 0.5484077158142671, "grad_norm": 5.069493293762207, "learning_rate": 4.5210711033274965e-05, "loss": 0.8938, "num_input_tokens_seen": 125985688, "step": 7829 }, { "epoch": 0.5484777640599964, "grad_norm": 3.8555328845977783, "learning_rate": 4.520371278458844e-05, "loss": 1.0394, "num_input_tokens_seen": 126002072, "step": 7830 }, { "epoch": 0.5485478123057256, "grad_norm": 3.5486679077148438, "learning_rate": 4.519671453590193e-05, "loss": 1.0865, "num_input_tokens_seen": 126018456, "step": 7831 }, { "epoch": 0.5486178605514548, "grad_norm": 4.256968021392822, "learning_rate": 4.518971628721541e-05, "loss": 1.275, "num_input_tokens_seen": 126034272, "step": 7832 }, { "epoch": 0.5486879087971841, "grad_norm": 3.594381332397461, "learning_rate": 4.51827180385289e-05, "loss": 1.0079, "num_input_tokens_seen": 126050656, "step": 7833 }, { "epoch": 0.5487579570429133, "grad_norm": 3.4965176582336426, "learning_rate": 4.517571978984239e-05, "loss": 1.0182, "num_input_tokens_seen": 126067040, "step": 7834 }, { "epoch": 0.5488280052886425, "grad_norm": 3.762791395187378, "learning_rate": 4.516872154115587e-05, "loss": 1.048, "num_input_tokens_seen": 126083384, "step": 7835 }, { "epoch": 0.5488980535343718, "grad_norm": 4.816859245300293, "learning_rate": 4.516172329246936e-05, "loss": 1.1516, "num_input_tokens_seen": 126099768, "step": 7836 }, { "epoch": 0.548968101780101, "grad_norm": 4.410999774932861, "learning_rate": 4.515472504378283e-05, "loss": 1.1552, "num_input_tokens_seen": 126114976, "step": 7837 }, { "epoch": 0.5490381500258303, "grad_norm": 3.48974609375, "learning_rate": 4.514772679509633e-05, "loss": 0.8017, "num_input_tokens_seen": 126131360, "step": 7838 }, { "epoch": 0.5491081982715595, "grad_norm": 4.172264575958252, "learning_rate": 4.514072854640982e-05, "loss": 0.9508, "num_input_tokens_seen": 126147080, "step": 7839 }, { "epoch": 0.5491782465172887, "grad_norm": 4.311397075653076, "learning_rate": 4.513373029772329e-05, "loss": 1.1269, "num_input_tokens_seen": 126163464, "step": 7840 }, { "epoch": 0.5492482947630181, "grad_norm": 3.950122833251953, "learning_rate": 4.512673204903678e-05, "loss": 1.0341, "num_input_tokens_seen": 126179848, "step": 7841 }, { "epoch": 0.5493183430087473, "grad_norm": 3.5279722213745117, "learning_rate": 4.511973380035026e-05, "loss": 0.8949, "num_input_tokens_seen": 126196232, "step": 7842 }, { "epoch": 0.5493883912544765, "grad_norm": 3.958651065826416, "learning_rate": 4.511273555166375e-05, "loss": 1.2114, "num_input_tokens_seen": 126212616, "step": 7843 }, { "epoch": 0.5494584395002058, "grad_norm": 4.567907810211182, "learning_rate": 4.510573730297724e-05, "loss": 1.0151, "num_input_tokens_seen": 126229000, "step": 7844 }, { "epoch": 0.549528487745935, "grad_norm": 5.533442497253418, "learning_rate": 4.509873905429072e-05, "loss": 1.1584, "num_input_tokens_seen": 126243808, "step": 7845 }, { "epoch": 0.5495985359916643, "grad_norm": 3.7990267276763916, "learning_rate": 4.509174080560421e-05, "loss": 1.023, "num_input_tokens_seen": 126260136, "step": 7846 }, { "epoch": 0.5496685842373935, "grad_norm": 3.648163318634033, "learning_rate": 4.5084742556917685e-05, "loss": 1.1331, "num_input_tokens_seen": 126276488, "step": 7847 }, { "epoch": 0.5497386324831227, "grad_norm": 3.247767686843872, "learning_rate": 4.507774430823118e-05, "loss": 0.8663, "num_input_tokens_seen": 126292872, "step": 7848 }, { "epoch": 0.549808680728852, "grad_norm": 3.6284050941467285, "learning_rate": 4.507074605954467e-05, "loss": 1.2207, "num_input_tokens_seen": 126309256, "step": 7849 }, { "epoch": 0.5498787289745812, "grad_norm": 3.714810848236084, "learning_rate": 4.5063747810858144e-05, "loss": 0.8713, "num_input_tokens_seen": 126324880, "step": 7850 }, { "epoch": 0.5499487772203104, "grad_norm": 4.455381870269775, "learning_rate": 4.505674956217163e-05, "loss": 0.9736, "num_input_tokens_seen": 126341264, "step": 7851 }, { "epoch": 0.5500188254660398, "grad_norm": 6.539924144744873, "learning_rate": 4.5049751313485115e-05, "loss": 1.1598, "num_input_tokens_seen": 126357648, "step": 7852 }, { "epoch": 0.550088873711769, "grad_norm": 4.244390487670898, "learning_rate": 4.50427530647986e-05, "loss": 1.2593, "num_input_tokens_seen": 126373888, "step": 7853 }, { "epoch": 0.5501589219574983, "grad_norm": 4.415054798126221, "learning_rate": 4.503575481611209e-05, "loss": 1.2661, "num_input_tokens_seen": 126389976, "step": 7854 }, { "epoch": 0.5502289702032275, "grad_norm": 5.152358531951904, "learning_rate": 4.5028756567425574e-05, "loss": 1.0433, "num_input_tokens_seen": 126406000, "step": 7855 }, { "epoch": 0.5502990184489567, "grad_norm": 4.243201732635498, "learning_rate": 4.502175831873906e-05, "loss": 1.0326, "num_input_tokens_seen": 126422384, "step": 7856 }, { "epoch": 0.550369066694686, "grad_norm": 4.7400898933410645, "learning_rate": 4.501476007005254e-05, "loss": 1.1462, "num_input_tokens_seen": 126438552, "step": 7857 }, { "epoch": 0.5504391149404152, "grad_norm": 3.5386011600494385, "learning_rate": 4.500776182136603e-05, "loss": 0.9501, "num_input_tokens_seen": 126454936, "step": 7858 }, { "epoch": 0.5505091631861445, "grad_norm": 3.939976692199707, "learning_rate": 4.500076357267951e-05, "loss": 1.081, "num_input_tokens_seen": 126471320, "step": 7859 }, { "epoch": 0.5505792114318737, "grad_norm": 3.7520668506622314, "learning_rate": 4.4993765323992996e-05, "loss": 1.1897, "num_input_tokens_seen": 126487144, "step": 7860 }, { "epoch": 0.5506492596776029, "grad_norm": 3.8446950912475586, "learning_rate": 4.4986767075306485e-05, "loss": 1.1295, "num_input_tokens_seen": 126503208, "step": 7861 }, { "epoch": 0.5507193079233322, "grad_norm": 5.0048909187316895, "learning_rate": 4.4979768826619967e-05, "loss": 1.1618, "num_input_tokens_seen": 126519592, "step": 7862 }, { "epoch": 0.5507893561690614, "grad_norm": 3.4074764251708984, "learning_rate": 4.4972770577933455e-05, "loss": 0.9419, "num_input_tokens_seen": 126535928, "step": 7863 }, { "epoch": 0.5508594044147906, "grad_norm": 4.959311485290527, "learning_rate": 4.496577232924694e-05, "loss": 0.9927, "num_input_tokens_seen": 126551768, "step": 7864 }, { "epoch": 0.55092945266052, "grad_norm": 4.260848045349121, "learning_rate": 4.4958774080560426e-05, "loss": 1.1245, "num_input_tokens_seen": 126566968, "step": 7865 }, { "epoch": 0.5509995009062492, "grad_norm": 4.374674320220947, "learning_rate": 4.4951775831873914e-05, "loss": 1.1858, "num_input_tokens_seen": 126582432, "step": 7866 }, { "epoch": 0.5510695491519785, "grad_norm": 3.808882236480713, "learning_rate": 4.494477758318739e-05, "loss": 0.8445, "num_input_tokens_seen": 126598272, "step": 7867 }, { "epoch": 0.5511395973977077, "grad_norm": 4.428232192993164, "learning_rate": 4.493777933450088e-05, "loss": 1.0271, "num_input_tokens_seen": 126614656, "step": 7868 }, { "epoch": 0.5512096456434369, "grad_norm": 3.7343485355377197, "learning_rate": 4.493078108581436e-05, "loss": 0.9685, "num_input_tokens_seen": 126630920, "step": 7869 }, { "epoch": 0.5512796938891662, "grad_norm": 3.8342700004577637, "learning_rate": 4.492378283712785e-05, "loss": 1.145, "num_input_tokens_seen": 126647304, "step": 7870 }, { "epoch": 0.5513497421348954, "grad_norm": 3.969151496887207, "learning_rate": 4.491678458844134e-05, "loss": 1.1645, "num_input_tokens_seen": 126663688, "step": 7871 }, { "epoch": 0.5514197903806246, "grad_norm": 4.096588134765625, "learning_rate": 4.490978633975482e-05, "loss": 1.2369, "num_input_tokens_seen": 126680072, "step": 7872 }, { "epoch": 0.5514898386263539, "grad_norm": 3.6431949138641357, "learning_rate": 4.490278809106831e-05, "loss": 0.8899, "num_input_tokens_seen": 126696456, "step": 7873 }, { "epoch": 0.5515598868720831, "grad_norm": 3.6036217212677, "learning_rate": 4.489578984238179e-05, "loss": 0.9629, "num_input_tokens_seen": 126712464, "step": 7874 }, { "epoch": 0.5516299351178124, "grad_norm": 4.925845146179199, "learning_rate": 4.488879159369528e-05, "loss": 1.0951, "num_input_tokens_seen": 126727664, "step": 7875 }, { "epoch": 0.5516999833635416, "grad_norm": 3.822420120239258, "learning_rate": 4.4881793345008766e-05, "loss": 0.9803, "num_input_tokens_seen": 126742280, "step": 7876 }, { "epoch": 0.5517700316092709, "grad_norm": 4.6314005851745605, "learning_rate": 4.487479509632224e-05, "loss": 0.852, "num_input_tokens_seen": 126758664, "step": 7877 }, { "epoch": 0.5518400798550002, "grad_norm": 3.5483834743499756, "learning_rate": 4.486779684763573e-05, "loss": 0.9601, "num_input_tokens_seen": 126774160, "step": 7878 }, { "epoch": 0.5519101281007294, "grad_norm": 3.582298755645752, "learning_rate": 4.486079859894921e-05, "loss": 1.0404, "num_input_tokens_seen": 126790344, "step": 7879 }, { "epoch": 0.5519801763464586, "grad_norm": 6.529607772827148, "learning_rate": 4.48538003502627e-05, "loss": 0.8033, "num_input_tokens_seen": 126806728, "step": 7880 }, { "epoch": 0.5520502245921879, "grad_norm": 4.167466640472412, "learning_rate": 4.484680210157619e-05, "loss": 1.1219, "num_input_tokens_seen": 126822792, "step": 7881 }, { "epoch": 0.5521202728379171, "grad_norm": 4.332005500793457, "learning_rate": 4.483980385288967e-05, "loss": 1.093, "num_input_tokens_seen": 126838952, "step": 7882 }, { "epoch": 0.5521903210836464, "grad_norm": 7.588165283203125, "learning_rate": 4.483280560420316e-05, "loss": 1.075, "num_input_tokens_seen": 126855336, "step": 7883 }, { "epoch": 0.5522603693293756, "grad_norm": 5.086427688598633, "learning_rate": 4.4825807355516634e-05, "loss": 0.8955, "num_input_tokens_seen": 126871448, "step": 7884 }, { "epoch": 0.5523304175751048, "grad_norm": 5.5746355056762695, "learning_rate": 4.481880910683013e-05, "loss": 0.8979, "num_input_tokens_seen": 126886424, "step": 7885 }, { "epoch": 0.5524004658208341, "grad_norm": 3.833796262741089, "learning_rate": 4.4811810858143605e-05, "loss": 0.9682, "num_input_tokens_seen": 126902808, "step": 7886 }, { "epoch": 0.5524705140665633, "grad_norm": 3.436943769454956, "learning_rate": 4.480481260945709e-05, "loss": 0.8928, "num_input_tokens_seen": 126918960, "step": 7887 }, { "epoch": 0.5525405623122925, "grad_norm": 3.8391594886779785, "learning_rate": 4.479781436077058e-05, "loss": 1.0474, "num_input_tokens_seen": 126934432, "step": 7888 }, { "epoch": 0.5526106105580219, "grad_norm": 5.476257801055908, "learning_rate": 4.4790816112084064e-05, "loss": 1.0429, "num_input_tokens_seen": 126949912, "step": 7889 }, { "epoch": 0.5526806588037511, "grad_norm": 3.628587245941162, "learning_rate": 4.478381786339755e-05, "loss": 1.013, "num_input_tokens_seen": 126966296, "step": 7890 }, { "epoch": 0.5527507070494804, "grad_norm": 4.087871551513672, "learning_rate": 4.4776819614711034e-05, "loss": 1.0916, "num_input_tokens_seen": 126982680, "step": 7891 }, { "epoch": 0.5528207552952096, "grad_norm": 4.438427448272705, "learning_rate": 4.476982136602452e-05, "loss": 1.1005, "num_input_tokens_seen": 126998072, "step": 7892 }, { "epoch": 0.5528908035409388, "grad_norm": 3.5243074893951416, "learning_rate": 4.476282311733801e-05, "loss": 0.7855, "num_input_tokens_seen": 127014104, "step": 7893 }, { "epoch": 0.5529608517866681, "grad_norm": 10.454376220703125, "learning_rate": 4.4755824868651486e-05, "loss": 0.9935, "num_input_tokens_seen": 127030488, "step": 7894 }, { "epoch": 0.5530309000323973, "grad_norm": 3.5278947353363037, "learning_rate": 4.474882661996498e-05, "loss": 1.1274, "num_input_tokens_seen": 127046536, "step": 7895 }, { "epoch": 0.5531009482781266, "grad_norm": 3.8515357971191406, "learning_rate": 4.474182837127846e-05, "loss": 1.0061, "num_input_tokens_seen": 127062200, "step": 7896 }, { "epoch": 0.5531709965238558, "grad_norm": 4.094476699829102, "learning_rate": 4.4734830122591945e-05, "loss": 0.9606, "num_input_tokens_seen": 127077656, "step": 7897 }, { "epoch": 0.553241044769585, "grad_norm": 3.9371705055236816, "learning_rate": 4.4727831873905434e-05, "loss": 1.1099, "num_input_tokens_seen": 127093464, "step": 7898 }, { "epoch": 0.5533110930153143, "grad_norm": 5.384475231170654, "learning_rate": 4.4720833625218916e-05, "loss": 1.0207, "num_input_tokens_seen": 127109848, "step": 7899 }, { "epoch": 0.5533811412610435, "grad_norm": 4.609706878662109, "learning_rate": 4.4713835376532404e-05, "loss": 0.9562, "num_input_tokens_seen": 127126232, "step": 7900 }, { "epoch": 0.5534511895067727, "grad_norm": 5.60554838180542, "learning_rate": 4.4706837127845886e-05, "loss": 1.0052, "num_input_tokens_seen": 127142616, "step": 7901 }, { "epoch": 0.5535212377525021, "grad_norm": 4.736166477203369, "learning_rate": 4.4699838879159375e-05, "loss": 0.9599, "num_input_tokens_seen": 127159000, "step": 7902 }, { "epoch": 0.5535912859982313, "grad_norm": 3.9779133796691895, "learning_rate": 4.469284063047286e-05, "loss": 1.0068, "num_input_tokens_seen": 127175384, "step": 7903 }, { "epoch": 0.5536613342439606, "grad_norm": 4.673941612243652, "learning_rate": 4.468584238178634e-05, "loss": 1.2606, "num_input_tokens_seen": 127191752, "step": 7904 }, { "epoch": 0.5537313824896898, "grad_norm": 3.722839117050171, "learning_rate": 4.4678844133099834e-05, "loss": 1.0554, "num_input_tokens_seen": 127207408, "step": 7905 }, { "epoch": 0.553801430735419, "grad_norm": 4.31935453414917, "learning_rate": 4.467184588441331e-05, "loss": 1.0523, "num_input_tokens_seen": 127223792, "step": 7906 }, { "epoch": 0.5538714789811483, "grad_norm": 3.686140775680542, "learning_rate": 4.46648476357268e-05, "loss": 1.083, "num_input_tokens_seen": 127240176, "step": 7907 }, { "epoch": 0.5539415272268775, "grad_norm": 4.997079372406006, "learning_rate": 4.4657849387040286e-05, "loss": 1.2024, "num_input_tokens_seen": 127255560, "step": 7908 }, { "epoch": 0.5540115754726067, "grad_norm": 4.612823963165283, "learning_rate": 4.465085113835377e-05, "loss": 1.0067, "num_input_tokens_seen": 127271336, "step": 7909 }, { "epoch": 0.554081623718336, "grad_norm": 3.857698678970337, "learning_rate": 4.4643852889667256e-05, "loss": 1.2211, "num_input_tokens_seen": 127287720, "step": 7910 }, { "epoch": 0.5541516719640652, "grad_norm": 3.8385705947875977, "learning_rate": 4.463685464098074e-05, "loss": 1.2123, "num_input_tokens_seen": 127304104, "step": 7911 }, { "epoch": 0.5542217202097945, "grad_norm": 3.910621166229248, "learning_rate": 4.462985639229423e-05, "loss": 1.071, "num_input_tokens_seen": 127319512, "step": 7912 }, { "epoch": 0.5542917684555237, "grad_norm": 4.685849666595459, "learning_rate": 4.46228581436077e-05, "loss": 0.9889, "num_input_tokens_seen": 127335736, "step": 7913 }, { "epoch": 0.554361816701253, "grad_norm": 3.6313934326171875, "learning_rate": 4.461585989492119e-05, "loss": 0.9355, "num_input_tokens_seen": 127351896, "step": 7914 }, { "epoch": 0.5544318649469823, "grad_norm": 3.3593709468841553, "learning_rate": 4.4608861646234686e-05, "loss": 0.8789, "num_input_tokens_seen": 127368032, "step": 7915 }, { "epoch": 0.5545019131927115, "grad_norm": 3.924467086791992, "learning_rate": 4.460186339754816e-05, "loss": 1.0038, "num_input_tokens_seen": 127384416, "step": 7916 }, { "epoch": 0.5545719614384407, "grad_norm": 4.749776363372803, "learning_rate": 4.459486514886165e-05, "loss": 1.2994, "num_input_tokens_seen": 127400800, "step": 7917 }, { "epoch": 0.55464200968417, "grad_norm": 4.43412446975708, "learning_rate": 4.458786690017513e-05, "loss": 1.1877, "num_input_tokens_seen": 127416184, "step": 7918 }, { "epoch": 0.5547120579298992, "grad_norm": 3.626112699508667, "learning_rate": 4.458086865148862e-05, "loss": 0.9214, "num_input_tokens_seen": 127432568, "step": 7919 }, { "epoch": 0.5547821061756285, "grad_norm": 4.031342506408691, "learning_rate": 4.457387040280211e-05, "loss": 1.0058, "num_input_tokens_seen": 127448656, "step": 7920 }, { "epoch": 0.5548521544213577, "grad_norm": 4.554370403289795, "learning_rate": 4.456687215411559e-05, "loss": 1.101, "num_input_tokens_seen": 127462584, "step": 7921 }, { "epoch": 0.5549222026670869, "grad_norm": 4.46144437789917, "learning_rate": 4.455987390542908e-05, "loss": 1.1095, "num_input_tokens_seen": 127478968, "step": 7922 }, { "epoch": 0.5549922509128162, "grad_norm": 3.759772539138794, "learning_rate": 4.4552875656742554e-05, "loss": 1.1723, "num_input_tokens_seen": 127494856, "step": 7923 }, { "epoch": 0.5550622991585454, "grad_norm": 6.424376010894775, "learning_rate": 4.454587740805604e-05, "loss": 1.1691, "num_input_tokens_seen": 127511240, "step": 7924 }, { "epoch": 0.5551323474042746, "grad_norm": 4.5455708503723145, "learning_rate": 4.453887915936954e-05, "loss": 1.0578, "num_input_tokens_seen": 127527624, "step": 7925 }, { "epoch": 0.555202395650004, "grad_norm": 3.673654794692993, "learning_rate": 4.453188091068301e-05, "loss": 1.0097, "num_input_tokens_seen": 127543200, "step": 7926 }, { "epoch": 0.5552724438957332, "grad_norm": 5.912845134735107, "learning_rate": 4.45248826619965e-05, "loss": 1.1171, "num_input_tokens_seen": 127558704, "step": 7927 }, { "epoch": 0.5553424921414625, "grad_norm": 4.603636741638184, "learning_rate": 4.451788441330998e-05, "loss": 1.0536, "num_input_tokens_seen": 127575088, "step": 7928 }, { "epoch": 0.5554125403871917, "grad_norm": 4.067967414855957, "learning_rate": 4.451088616462347e-05, "loss": 1.3762, "num_input_tokens_seen": 127590584, "step": 7929 }, { "epoch": 0.5554825886329209, "grad_norm": 4.261424541473389, "learning_rate": 4.450388791593696e-05, "loss": 0.993, "num_input_tokens_seen": 127606320, "step": 7930 }, { "epoch": 0.5555526368786502, "grad_norm": 3.4832777976989746, "learning_rate": 4.449688966725044e-05, "loss": 1.0043, "num_input_tokens_seen": 127622704, "step": 7931 }, { "epoch": 0.5556226851243794, "grad_norm": 3.5916316509246826, "learning_rate": 4.448989141856393e-05, "loss": 1.014, "num_input_tokens_seen": 127639088, "step": 7932 }, { "epoch": 0.5556927333701087, "grad_norm": 4.151430130004883, "learning_rate": 4.4482893169877406e-05, "loss": 1.0182, "num_input_tokens_seen": 127655240, "step": 7933 }, { "epoch": 0.5557627816158379, "grad_norm": 4.024296283721924, "learning_rate": 4.4475894921190894e-05, "loss": 0.9699, "num_input_tokens_seen": 127671624, "step": 7934 }, { "epoch": 0.5558328298615671, "grad_norm": 3.977968454360962, "learning_rate": 4.446889667250439e-05, "loss": 1.1159, "num_input_tokens_seen": 127688008, "step": 7935 }, { "epoch": 0.5559028781072964, "grad_norm": 3.4853708744049072, "learning_rate": 4.4461898423817865e-05, "loss": 0.9936, "num_input_tokens_seen": 127704352, "step": 7936 }, { "epoch": 0.5559729263530256, "grad_norm": 3.8649024963378906, "learning_rate": 4.445490017513135e-05, "loss": 0.9168, "num_input_tokens_seen": 127720488, "step": 7937 }, { "epoch": 0.5560429745987548, "grad_norm": 3.8322534561157227, "learning_rate": 4.4447901926444835e-05, "loss": 1.0689, "num_input_tokens_seen": 127736280, "step": 7938 }, { "epoch": 0.5561130228444842, "grad_norm": 3.2827885150909424, "learning_rate": 4.4440903677758324e-05, "loss": 0.8871, "num_input_tokens_seen": 127752664, "step": 7939 }, { "epoch": 0.5561830710902134, "grad_norm": 3.632049560546875, "learning_rate": 4.44339054290718e-05, "loss": 0.9965, "num_input_tokens_seen": 127769048, "step": 7940 }, { "epoch": 0.5562531193359427, "grad_norm": 3.9161887168884277, "learning_rate": 4.4426907180385294e-05, "loss": 1.3189, "num_input_tokens_seen": 127784576, "step": 7941 }, { "epoch": 0.5563231675816719, "grad_norm": 3.6720736026763916, "learning_rate": 4.441990893169878e-05, "loss": 1.2914, "num_input_tokens_seen": 127800960, "step": 7942 }, { "epoch": 0.5563932158274011, "grad_norm": 4.027480602264404, "learning_rate": 4.441291068301226e-05, "loss": 1.0616, "num_input_tokens_seen": 127816800, "step": 7943 }, { "epoch": 0.5564632640731304, "grad_norm": 5.149120807647705, "learning_rate": 4.4405912434325746e-05, "loss": 0.9716, "num_input_tokens_seen": 127832760, "step": 7944 }, { "epoch": 0.5565333123188596, "grad_norm": 4.779052734375, "learning_rate": 4.439891418563923e-05, "loss": 1.0751, "num_input_tokens_seen": 127848888, "step": 7945 }, { "epoch": 0.5566033605645888, "grad_norm": 4.37583065032959, "learning_rate": 4.439191593695272e-05, "loss": 1.1268, "num_input_tokens_seen": 127865216, "step": 7946 }, { "epoch": 0.5566734088103181, "grad_norm": 4.8968963623046875, "learning_rate": 4.4384917688266205e-05, "loss": 1.139, "num_input_tokens_seen": 127881600, "step": 7947 }, { "epoch": 0.5567434570560473, "grad_norm": 3.862400770187378, "learning_rate": 4.437791943957969e-05, "loss": 1.0398, "num_input_tokens_seen": 127897424, "step": 7948 }, { "epoch": 0.5568135053017766, "grad_norm": 3.653930902481079, "learning_rate": 4.4370921190893176e-05, "loss": 1.0289, "num_input_tokens_seen": 127913808, "step": 7949 }, { "epoch": 0.5568835535475058, "grad_norm": 4.557390213012695, "learning_rate": 4.436392294220665e-05, "loss": 1.1048, "num_input_tokens_seen": 127930192, "step": 7950 }, { "epoch": 0.556953601793235, "grad_norm": 6.2530364990234375, "learning_rate": 4.4356924693520146e-05, "loss": 1.2491, "num_input_tokens_seen": 127946576, "step": 7951 }, { "epoch": 0.5570236500389644, "grad_norm": 3.9652791023254395, "learning_rate": 4.4349926444833635e-05, "loss": 0.9109, "num_input_tokens_seen": 127962960, "step": 7952 }, { "epoch": 0.5570936982846936, "grad_norm": 5.276636600494385, "learning_rate": 4.434292819614711e-05, "loss": 1.0179, "num_input_tokens_seen": 127979016, "step": 7953 }, { "epoch": 0.5571637465304228, "grad_norm": 3.445674180984497, "learning_rate": 4.43359299474606e-05, "loss": 1.033, "num_input_tokens_seen": 127995400, "step": 7954 }, { "epoch": 0.5572337947761521, "grad_norm": 4.23106575012207, "learning_rate": 4.432893169877408e-05, "loss": 1.1025, "num_input_tokens_seen": 128011784, "step": 7955 }, { "epoch": 0.5573038430218813, "grad_norm": 3.795984983444214, "learning_rate": 4.432193345008757e-05, "loss": 0.8922, "num_input_tokens_seen": 128027504, "step": 7956 }, { "epoch": 0.5573738912676106, "grad_norm": 4.286993980407715, "learning_rate": 4.431493520140106e-05, "loss": 1.0134, "num_input_tokens_seen": 128043832, "step": 7957 }, { "epoch": 0.5574439395133398, "grad_norm": 5.160306930541992, "learning_rate": 4.430793695271454e-05, "loss": 1.1087, "num_input_tokens_seen": 128059152, "step": 7958 }, { "epoch": 0.557513987759069, "grad_norm": 5.083519458770752, "learning_rate": 4.430093870402803e-05, "loss": 1.173, "num_input_tokens_seen": 128074664, "step": 7959 }, { "epoch": 0.5575840360047983, "grad_norm": 3.746807336807251, "learning_rate": 4.42939404553415e-05, "loss": 1.0455, "num_input_tokens_seen": 128090984, "step": 7960 }, { "epoch": 0.5576540842505275, "grad_norm": 5.527798175811768, "learning_rate": 4.4286942206655e-05, "loss": 0.9927, "num_input_tokens_seen": 128107368, "step": 7961 }, { "epoch": 0.5577241324962567, "grad_norm": 5.1027116775512695, "learning_rate": 4.427994395796849e-05, "loss": 1.116, "num_input_tokens_seen": 128123752, "step": 7962 }, { "epoch": 0.5577941807419861, "grad_norm": 3.7059028148651123, "learning_rate": 4.427294570928196e-05, "loss": 1.2798, "num_input_tokens_seen": 128140136, "step": 7963 }, { "epoch": 0.5578642289877153, "grad_norm": 3.936305522918701, "learning_rate": 4.426594746059545e-05, "loss": 0.88, "num_input_tokens_seen": 128156520, "step": 7964 }, { "epoch": 0.5579342772334446, "grad_norm": 4.8260369300842285, "learning_rate": 4.425894921190893e-05, "loss": 0.8951, "num_input_tokens_seen": 128172904, "step": 7965 }, { "epoch": 0.5580043254791738, "grad_norm": 4.908670425415039, "learning_rate": 4.425195096322242e-05, "loss": 0.9753, "num_input_tokens_seen": 128188672, "step": 7966 }, { "epoch": 0.558074373724903, "grad_norm": 4.453614711761475, "learning_rate": 4.42449527145359e-05, "loss": 1.1764, "num_input_tokens_seen": 128205056, "step": 7967 }, { "epoch": 0.5581444219706323, "grad_norm": 3.9060006141662598, "learning_rate": 4.423795446584939e-05, "loss": 1.1539, "num_input_tokens_seen": 128221440, "step": 7968 }, { "epoch": 0.5582144702163615, "grad_norm": 4.674314498901367, "learning_rate": 4.423095621716288e-05, "loss": 1.0709, "num_input_tokens_seen": 128237824, "step": 7969 }, { "epoch": 0.5582845184620908, "grad_norm": 5.411530017852783, "learning_rate": 4.4223957968476355e-05, "loss": 1.1191, "num_input_tokens_seen": 128253488, "step": 7970 }, { "epoch": 0.55835456670782, "grad_norm": 3.7647812366485596, "learning_rate": 4.421695971978985e-05, "loss": 1.0836, "num_input_tokens_seen": 128269600, "step": 7971 }, { "epoch": 0.5584246149535492, "grad_norm": 5.809626579284668, "learning_rate": 4.4209961471103325e-05, "loss": 1.2228, "num_input_tokens_seen": 128285592, "step": 7972 }, { "epoch": 0.5584946631992785, "grad_norm": 4.234646320343018, "learning_rate": 4.4202963222416814e-05, "loss": 1.2126, "num_input_tokens_seen": 128301976, "step": 7973 }, { "epoch": 0.5585647114450077, "grad_norm": 3.390713930130005, "learning_rate": 4.41959649737303e-05, "loss": 1.0575, "num_input_tokens_seen": 128318360, "step": 7974 }, { "epoch": 0.558634759690737, "grad_norm": 3.582789182662964, "learning_rate": 4.4188966725043784e-05, "loss": 1.0472, "num_input_tokens_seen": 128334744, "step": 7975 }, { "epoch": 0.5587048079364663, "grad_norm": 4.052979946136475, "learning_rate": 4.418196847635727e-05, "loss": 1.0512, "num_input_tokens_seen": 128350920, "step": 7976 }, { "epoch": 0.5587748561821955, "grad_norm": 4.101496696472168, "learning_rate": 4.4174970227670755e-05, "loss": 1.0432, "num_input_tokens_seen": 128367144, "step": 7977 }, { "epoch": 0.5588449044279248, "grad_norm": 4.708171367645264, "learning_rate": 4.416797197898424e-05, "loss": 0.9654, "num_input_tokens_seen": 128381272, "step": 7978 }, { "epoch": 0.558914952673654, "grad_norm": 4.02223539352417, "learning_rate": 4.416097373029773e-05, "loss": 1.1984, "num_input_tokens_seen": 128397160, "step": 7979 }, { "epoch": 0.5589850009193832, "grad_norm": 4.594113349914551, "learning_rate": 4.415397548161121e-05, "loss": 1.1523, "num_input_tokens_seen": 128413248, "step": 7980 }, { "epoch": 0.5590550491651125, "grad_norm": 4.255110263824463, "learning_rate": 4.41469772329247e-05, "loss": 0.9418, "num_input_tokens_seen": 128429632, "step": 7981 }, { "epoch": 0.5591250974108417, "grad_norm": 4.862103462219238, "learning_rate": 4.413997898423818e-05, "loss": 0.8901, "num_input_tokens_seen": 128446016, "step": 7982 }, { "epoch": 0.5591951456565709, "grad_norm": 4.902801036834717, "learning_rate": 4.4132980735551666e-05, "loss": 1.1788, "num_input_tokens_seen": 128461344, "step": 7983 }, { "epoch": 0.5592651939023002, "grad_norm": 3.762756824493408, "learning_rate": 4.4125982486865154e-05, "loss": 1.2135, "num_input_tokens_seen": 128477080, "step": 7984 }, { "epoch": 0.5593352421480294, "grad_norm": 4.081719398498535, "learning_rate": 4.4118984238178636e-05, "loss": 0.9034, "num_input_tokens_seen": 128493112, "step": 7985 }, { "epoch": 0.5594052903937587, "grad_norm": 4.764441013336182, "learning_rate": 4.4111985989492125e-05, "loss": 1.1384, "num_input_tokens_seen": 128508816, "step": 7986 }, { "epoch": 0.559475338639488, "grad_norm": 3.709524631500244, "learning_rate": 4.410498774080561e-05, "loss": 1.0331, "num_input_tokens_seen": 128525200, "step": 7987 }, { "epoch": 0.5595453868852172, "grad_norm": 3.648374557495117, "learning_rate": 4.4097989492119095e-05, "loss": 1.072, "num_input_tokens_seen": 128541584, "step": 7988 }, { "epoch": 0.5596154351309465, "grad_norm": 6.713209629058838, "learning_rate": 4.4090991243432584e-05, "loss": 1.0908, "num_input_tokens_seen": 128557048, "step": 7989 }, { "epoch": 0.5596854833766757, "grad_norm": 5.627123832702637, "learning_rate": 4.408399299474606e-05, "loss": 0.8656, "num_input_tokens_seen": 128572232, "step": 7990 }, { "epoch": 0.5597555316224049, "grad_norm": 3.928884267807007, "learning_rate": 4.4076994746059554e-05, "loss": 0.8768, "num_input_tokens_seen": 128588616, "step": 7991 }, { "epoch": 0.5598255798681342, "grad_norm": 4.144782543182373, "learning_rate": 4.406999649737303e-05, "loss": 1.2362, "num_input_tokens_seen": 128604824, "step": 7992 }, { "epoch": 0.5598956281138634, "grad_norm": 3.7674214839935303, "learning_rate": 4.406299824868652e-05, "loss": 1.0108, "num_input_tokens_seen": 128621208, "step": 7993 }, { "epoch": 0.5599656763595927, "grad_norm": 3.4347217082977295, "learning_rate": 4.4056e-05, "loss": 0.8531, "num_input_tokens_seen": 128637592, "step": 7994 }, { "epoch": 0.5600357246053219, "grad_norm": 4.309778690338135, "learning_rate": 4.404900175131349e-05, "loss": 1.1468, "num_input_tokens_seen": 128653976, "step": 7995 }, { "epoch": 0.5601057728510511, "grad_norm": 3.567622423171997, "learning_rate": 4.404200350262698e-05, "loss": 0.9106, "num_input_tokens_seen": 128670360, "step": 7996 }, { "epoch": 0.5601758210967804, "grad_norm": 5.184731483459473, "learning_rate": 4.403500525394046e-05, "loss": 0.8277, "num_input_tokens_seen": 128684744, "step": 7997 }, { "epoch": 0.5602458693425096, "grad_norm": 4.807730674743652, "learning_rate": 4.402800700525395e-05, "loss": 1.028, "num_input_tokens_seen": 128701128, "step": 7998 }, { "epoch": 0.560315917588239, "grad_norm": 6.368520736694336, "learning_rate": 4.402100875656742e-05, "loss": 1.3107, "num_input_tokens_seen": 128717288, "step": 7999 }, { "epoch": 0.5603859658339682, "grad_norm": 3.997955083847046, "learning_rate": 4.401401050788091e-05, "loss": 0.9302, "num_input_tokens_seen": 128733328, "step": 8000 }, { "epoch": 0.5603859658339682, "eval_loss": 1.1226634979248047, "eval_runtime": 0.2085, "eval_samples_per_second": 4.797, "eval_steps_per_second": 4.797, "num_input_tokens_seen": 128733328, "step": 8000 }, { "epoch": 0.5604560140796974, "grad_norm": 7.1005635261535645, "learning_rate": 4.4007012259194406e-05, "loss": 1.0116, "num_input_tokens_seen": 128749712, "step": 8001 }, { "epoch": 0.5605260623254267, "grad_norm": 3.9482359886169434, "learning_rate": 4.400001401050788e-05, "loss": 0.8402, "num_input_tokens_seen": 128765408, "step": 8002 }, { "epoch": 0.5605961105711559, "grad_norm": 5.755213260650635, "learning_rate": 4.399301576182137e-05, "loss": 1.0476, "num_input_tokens_seen": 128781256, "step": 8003 }, { "epoch": 0.5606661588168851, "grad_norm": 4.391782760620117, "learning_rate": 4.398601751313485e-05, "loss": 0.9619, "num_input_tokens_seen": 128796952, "step": 8004 }, { "epoch": 0.5607362070626144, "grad_norm": 4.010289192199707, "learning_rate": 4.397901926444834e-05, "loss": 0.963, "num_input_tokens_seen": 128813160, "step": 8005 }, { "epoch": 0.5608062553083436, "grad_norm": 4.009171485900879, "learning_rate": 4.397202101576183e-05, "loss": 1.0096, "num_input_tokens_seen": 128828712, "step": 8006 }, { "epoch": 0.5608763035540729, "grad_norm": 3.7310292720794678, "learning_rate": 4.396502276707531e-05, "loss": 1.0376, "num_input_tokens_seen": 128845096, "step": 8007 }, { "epoch": 0.5609463517998021, "grad_norm": 4.350787162780762, "learning_rate": 4.39580245183888e-05, "loss": 1.1524, "num_input_tokens_seen": 128861480, "step": 8008 }, { "epoch": 0.5610164000455313, "grad_norm": 3.7701098918914795, "learning_rate": 4.3951026269702274e-05, "loss": 1.0272, "num_input_tokens_seen": 128877160, "step": 8009 }, { "epoch": 0.5610864482912606, "grad_norm": 3.878599166870117, "learning_rate": 4.394402802101576e-05, "loss": 0.9247, "num_input_tokens_seen": 128893544, "step": 8010 }, { "epoch": 0.5611564965369898, "grad_norm": 4.865396499633789, "learning_rate": 4.393702977232926e-05, "loss": 1.2614, "num_input_tokens_seen": 128909496, "step": 8011 }, { "epoch": 0.561226544782719, "grad_norm": 3.691734552383423, "learning_rate": 4.393003152364273e-05, "loss": 1.1862, "num_input_tokens_seen": 128925440, "step": 8012 }, { "epoch": 0.5612965930284484, "grad_norm": 3.890925884246826, "learning_rate": 4.392303327495622e-05, "loss": 0.8807, "num_input_tokens_seen": 128941824, "step": 8013 }, { "epoch": 0.5613666412741776, "grad_norm": 5.752060413360596, "learning_rate": 4.3916035026269704e-05, "loss": 1.1932, "num_input_tokens_seen": 128955672, "step": 8014 }, { "epoch": 0.5614366895199069, "grad_norm": 3.931739091873169, "learning_rate": 4.390903677758319e-05, "loss": 0.9507, "num_input_tokens_seen": 128972056, "step": 8015 }, { "epoch": 0.5615067377656361, "grad_norm": 5.384754657745361, "learning_rate": 4.390203852889668e-05, "loss": 1.0261, "num_input_tokens_seen": 128988440, "step": 8016 }, { "epoch": 0.5615767860113653, "grad_norm": 4.624659061431885, "learning_rate": 4.389504028021016e-05, "loss": 0.8866, "num_input_tokens_seen": 129004528, "step": 8017 }, { "epoch": 0.5616468342570946, "grad_norm": 3.9661507606506348, "learning_rate": 4.388804203152365e-05, "loss": 1.1405, "num_input_tokens_seen": 129020912, "step": 8018 }, { "epoch": 0.5617168825028238, "grad_norm": 3.5319230556488037, "learning_rate": 4.3881043782837126e-05, "loss": 1.0024, "num_input_tokens_seen": 129037296, "step": 8019 }, { "epoch": 0.561786930748553, "grad_norm": 4.236444473266602, "learning_rate": 4.3874045534150615e-05, "loss": 1.0596, "num_input_tokens_seen": 129053680, "step": 8020 }, { "epoch": 0.5618569789942823, "grad_norm": 3.69328236579895, "learning_rate": 4.38670472854641e-05, "loss": 1.0373, "num_input_tokens_seen": 129070064, "step": 8021 }, { "epoch": 0.5619270272400115, "grad_norm": 5.341061115264893, "learning_rate": 4.3860049036777585e-05, "loss": 0.9455, "num_input_tokens_seen": 129086224, "step": 8022 }, { "epoch": 0.5619970754857408, "grad_norm": 4.005245208740234, "learning_rate": 4.3853050788091074e-05, "loss": 0.9985, "num_input_tokens_seen": 129102608, "step": 8023 }, { "epoch": 0.56206712373147, "grad_norm": 4.377266883850098, "learning_rate": 4.3846052539404556e-05, "loss": 1.1311, "num_input_tokens_seen": 129118992, "step": 8024 }, { "epoch": 0.5621371719771993, "grad_norm": 3.7815823554992676, "learning_rate": 4.3839054290718044e-05, "loss": 1.192, "num_input_tokens_seen": 129135376, "step": 8025 }, { "epoch": 0.5622072202229286, "grad_norm": 4.8340744972229, "learning_rate": 4.383205604203152e-05, "loss": 1.0839, "num_input_tokens_seen": 129151352, "step": 8026 }, { "epoch": 0.5622772684686578, "grad_norm": 5.007735252380371, "learning_rate": 4.3825057793345015e-05, "loss": 1.094, "num_input_tokens_seen": 129166992, "step": 8027 }, { "epoch": 0.562347316714387, "grad_norm": 5.447760105133057, "learning_rate": 4.38180595446585e-05, "loss": 1.2057, "num_input_tokens_seen": 129183376, "step": 8028 }, { "epoch": 0.5624173649601163, "grad_norm": 4.995473861694336, "learning_rate": 4.381106129597198e-05, "loss": 0.9403, "num_input_tokens_seen": 129198816, "step": 8029 }, { "epoch": 0.5624874132058455, "grad_norm": 4.708920955657959, "learning_rate": 4.380406304728547e-05, "loss": 0.9984, "num_input_tokens_seen": 129215200, "step": 8030 }, { "epoch": 0.5625574614515748, "grad_norm": 3.8864386081695557, "learning_rate": 4.379706479859895e-05, "loss": 0.734, "num_input_tokens_seen": 129231504, "step": 8031 }, { "epoch": 0.562627509697304, "grad_norm": 3.467696189880371, "learning_rate": 4.379006654991244e-05, "loss": 0.9214, "num_input_tokens_seen": 129247624, "step": 8032 }, { "epoch": 0.5626975579430332, "grad_norm": 4.07413387298584, "learning_rate": 4.3783068301225926e-05, "loss": 1.0158, "num_input_tokens_seen": 129264008, "step": 8033 }, { "epoch": 0.5627676061887625, "grad_norm": 4.135556697845459, "learning_rate": 4.377607005253941e-05, "loss": 1.1483, "num_input_tokens_seen": 129279136, "step": 8034 }, { "epoch": 0.5628376544344917, "grad_norm": 4.153659820556641, "learning_rate": 4.3769071803852896e-05, "loss": 0.9118, "num_input_tokens_seen": 129294192, "step": 8035 }, { "epoch": 0.5629077026802211, "grad_norm": 3.6717982292175293, "learning_rate": 4.376207355516637e-05, "loss": 1.0144, "num_input_tokens_seen": 129310312, "step": 8036 }, { "epoch": 0.5629777509259503, "grad_norm": 4.013020038604736, "learning_rate": 4.375507530647987e-05, "loss": 1.1211, "num_input_tokens_seen": 129326696, "step": 8037 }, { "epoch": 0.5630477991716795, "grad_norm": 4.093492031097412, "learning_rate": 4.3748077057793355e-05, "loss": 1.0406, "num_input_tokens_seen": 129343080, "step": 8038 }, { "epoch": 0.5631178474174088, "grad_norm": 5.1094512939453125, "learning_rate": 4.374107880910683e-05, "loss": 1.1122, "num_input_tokens_seen": 129359232, "step": 8039 }, { "epoch": 0.563187895663138, "grad_norm": 4.3890275955200195, "learning_rate": 4.373408056042032e-05, "loss": 1.2691, "num_input_tokens_seen": 129374992, "step": 8040 }, { "epoch": 0.5632579439088672, "grad_norm": 3.4645395278930664, "learning_rate": 4.37270823117338e-05, "loss": 0.945, "num_input_tokens_seen": 129390728, "step": 8041 }, { "epoch": 0.5633279921545965, "grad_norm": 3.5011980533599854, "learning_rate": 4.372008406304729e-05, "loss": 0.9252, "num_input_tokens_seen": 129406616, "step": 8042 }, { "epoch": 0.5633980404003257, "grad_norm": 3.694739580154419, "learning_rate": 4.371308581436078e-05, "loss": 1.0331, "num_input_tokens_seen": 129423000, "step": 8043 }, { "epoch": 0.563468088646055, "grad_norm": 4.7144293785095215, "learning_rate": 4.370608756567426e-05, "loss": 0.9954, "num_input_tokens_seen": 129438048, "step": 8044 }, { "epoch": 0.5635381368917842, "grad_norm": 3.9834823608398438, "learning_rate": 4.369908931698775e-05, "loss": 1.1571, "num_input_tokens_seen": 129454432, "step": 8045 }, { "epoch": 0.5636081851375134, "grad_norm": 5.334307670593262, "learning_rate": 4.3692091068301223e-05, "loss": 1.1283, "num_input_tokens_seen": 129470816, "step": 8046 }, { "epoch": 0.5636782333832427, "grad_norm": 4.415291786193848, "learning_rate": 4.368509281961472e-05, "loss": 1.2554, "num_input_tokens_seen": 129487200, "step": 8047 }, { "epoch": 0.563748281628972, "grad_norm": 5.6152262687683105, "learning_rate": 4.3678094570928194e-05, "loss": 1.0074, "num_input_tokens_seen": 129503584, "step": 8048 }, { "epoch": 0.5638183298747012, "grad_norm": 5.171108245849609, "learning_rate": 4.367109632224168e-05, "loss": 0.9731, "num_input_tokens_seen": 129519968, "step": 8049 }, { "epoch": 0.5638883781204305, "grad_norm": 4.773746013641357, "learning_rate": 4.366409807355517e-05, "loss": 1.0941, "num_input_tokens_seen": 129535432, "step": 8050 }, { "epoch": 0.5639584263661597, "grad_norm": 4.129162311553955, "learning_rate": 4.365709982486865e-05, "loss": 0.9929, "num_input_tokens_seen": 129550880, "step": 8051 }, { "epoch": 0.564028474611889, "grad_norm": 4.851277828216553, "learning_rate": 4.365010157618214e-05, "loss": 0.9914, "num_input_tokens_seen": 129565592, "step": 8052 }, { "epoch": 0.5640985228576182, "grad_norm": 4.487455368041992, "learning_rate": 4.364310332749562e-05, "loss": 1.2058, "num_input_tokens_seen": 129581976, "step": 8053 }, { "epoch": 0.5641685711033474, "grad_norm": 4.226871013641357, "learning_rate": 4.363610507880911e-05, "loss": 1.1413, "num_input_tokens_seen": 129598360, "step": 8054 }, { "epoch": 0.5642386193490767, "grad_norm": 4.204067230224609, "learning_rate": 4.36291068301226e-05, "loss": 1.1353, "num_input_tokens_seen": 129614232, "step": 8055 }, { "epoch": 0.5643086675948059, "grad_norm": 3.792025566101074, "learning_rate": 4.3622108581436075e-05, "loss": 1.2165, "num_input_tokens_seen": 129630616, "step": 8056 }, { "epoch": 0.5643787158405351, "grad_norm": 3.966076135635376, "learning_rate": 4.361511033274957e-05, "loss": 1.1306, "num_input_tokens_seen": 129647000, "step": 8057 }, { "epoch": 0.5644487640862644, "grad_norm": 5.704814910888672, "learning_rate": 4.3608112084063046e-05, "loss": 1.0289, "num_input_tokens_seen": 129663384, "step": 8058 }, { "epoch": 0.5645188123319936, "grad_norm": 4.2781081199646, "learning_rate": 4.3601113835376534e-05, "loss": 1.173, "num_input_tokens_seen": 129679768, "step": 8059 }, { "epoch": 0.564588860577723, "grad_norm": 3.806344985961914, "learning_rate": 4.359411558669002e-05, "loss": 1.1536, "num_input_tokens_seen": 129696152, "step": 8060 }, { "epoch": 0.5646589088234522, "grad_norm": 4.551416873931885, "learning_rate": 4.3587117338003505e-05, "loss": 1.078, "num_input_tokens_seen": 129711184, "step": 8061 }, { "epoch": 0.5647289570691814, "grad_norm": 3.6475703716278076, "learning_rate": 4.3580119089316993e-05, "loss": 1.0125, "num_input_tokens_seen": 129727568, "step": 8062 }, { "epoch": 0.5647990053149107, "grad_norm": 4.113941192626953, "learning_rate": 4.3573120840630475e-05, "loss": 1.0104, "num_input_tokens_seen": 129743952, "step": 8063 }, { "epoch": 0.5648690535606399, "grad_norm": 3.9017693996429443, "learning_rate": 4.3566122591943964e-05, "loss": 0.9712, "num_input_tokens_seen": 129760336, "step": 8064 }, { "epoch": 0.5649391018063691, "grad_norm": 4.13060188293457, "learning_rate": 4.355912434325745e-05, "loss": 0.9723, "num_input_tokens_seen": 129776144, "step": 8065 }, { "epoch": 0.5650091500520984, "grad_norm": 4.518004417419434, "learning_rate": 4.355212609457093e-05, "loss": 0.9606, "num_input_tokens_seen": 129792000, "step": 8066 }, { "epoch": 0.5650791982978276, "grad_norm": 4.141806602478027, "learning_rate": 4.354512784588442e-05, "loss": 1.0458, "num_input_tokens_seen": 129808384, "step": 8067 }, { "epoch": 0.5651492465435569, "grad_norm": 4.177087783813477, "learning_rate": 4.35381295971979e-05, "loss": 1.1853, "num_input_tokens_seen": 129824256, "step": 8068 }, { "epoch": 0.5652192947892861, "grad_norm": 3.939929485321045, "learning_rate": 4.3531131348511386e-05, "loss": 0.995, "num_input_tokens_seen": 129840640, "step": 8069 }, { "epoch": 0.5652893430350153, "grad_norm": 4.335320472717285, "learning_rate": 4.3524133099824875e-05, "loss": 1.048, "num_input_tokens_seen": 129856952, "step": 8070 }, { "epoch": 0.5653593912807446, "grad_norm": 6.816842079162598, "learning_rate": 4.351713485113836e-05, "loss": 1.0009, "num_input_tokens_seen": 129872552, "step": 8071 }, { "epoch": 0.5654294395264738, "grad_norm": 4.088340759277344, "learning_rate": 4.3510136602451845e-05, "loss": 1.0941, "num_input_tokens_seen": 129888936, "step": 8072 }, { "epoch": 0.5654994877722032, "grad_norm": 3.65846586227417, "learning_rate": 4.350313835376533e-05, "loss": 1.1355, "num_input_tokens_seen": 129905320, "step": 8073 }, { "epoch": 0.5655695360179324, "grad_norm": 3.5550310611724854, "learning_rate": 4.3496140105078816e-05, "loss": 1.0121, "num_input_tokens_seen": 129921704, "step": 8074 }, { "epoch": 0.5656395842636616, "grad_norm": 4.011558532714844, "learning_rate": 4.348914185639229e-05, "loss": 1.0614, "num_input_tokens_seen": 129937968, "step": 8075 }, { "epoch": 0.5657096325093909, "grad_norm": 3.63883638381958, "learning_rate": 4.348214360770578e-05, "loss": 1.162, "num_input_tokens_seen": 129954352, "step": 8076 }, { "epoch": 0.5657796807551201, "grad_norm": 3.9487674236297607, "learning_rate": 4.3475145359019275e-05, "loss": 1.1023, "num_input_tokens_seen": 129970528, "step": 8077 }, { "epoch": 0.5658497290008493, "grad_norm": 4.0052266120910645, "learning_rate": 4.346814711033275e-05, "loss": 1.0462, "num_input_tokens_seen": 129985568, "step": 8078 }, { "epoch": 0.5659197772465786, "grad_norm": 4.147900581359863, "learning_rate": 4.346114886164624e-05, "loss": 1.0128, "num_input_tokens_seen": 130000832, "step": 8079 }, { "epoch": 0.5659898254923078, "grad_norm": 3.9152534008026123, "learning_rate": 4.345415061295972e-05, "loss": 1.1202, "num_input_tokens_seen": 130017216, "step": 8080 }, { "epoch": 0.5660598737380371, "grad_norm": 5.270138263702393, "learning_rate": 4.344715236427321e-05, "loss": 1.1368, "num_input_tokens_seen": 130033600, "step": 8081 }, { "epoch": 0.5661299219837663, "grad_norm": 3.7850892543792725, "learning_rate": 4.34401541155867e-05, "loss": 1.0064, "num_input_tokens_seen": 130049576, "step": 8082 }, { "epoch": 0.5661999702294955, "grad_norm": 3.8164186477661133, "learning_rate": 4.343315586690018e-05, "loss": 0.9948, "num_input_tokens_seen": 130065960, "step": 8083 }, { "epoch": 0.5662700184752248, "grad_norm": 4.439040660858154, "learning_rate": 4.342615761821367e-05, "loss": 0.8882, "num_input_tokens_seen": 130082344, "step": 8084 }, { "epoch": 0.566340066720954, "grad_norm": 4.9009599685668945, "learning_rate": 4.341915936952714e-05, "loss": 1.2654, "num_input_tokens_seen": 130098432, "step": 8085 }, { "epoch": 0.5664101149666833, "grad_norm": 3.7141706943511963, "learning_rate": 4.341216112084063e-05, "loss": 0.9878, "num_input_tokens_seen": 130114816, "step": 8086 }, { "epoch": 0.5664801632124126, "grad_norm": 3.661966323852539, "learning_rate": 4.340516287215413e-05, "loss": 0.9867, "num_input_tokens_seen": 130131112, "step": 8087 }, { "epoch": 0.5665502114581418, "grad_norm": 5.044811725616455, "learning_rate": 4.33981646234676e-05, "loss": 0.9719, "num_input_tokens_seen": 130147496, "step": 8088 }, { "epoch": 0.5666202597038711, "grad_norm": 5.159415245056152, "learning_rate": 4.339116637478109e-05, "loss": 1.1301, "num_input_tokens_seen": 130163880, "step": 8089 }, { "epoch": 0.5666903079496003, "grad_norm": 3.651123523712158, "learning_rate": 4.338416812609457e-05, "loss": 1.0788, "num_input_tokens_seen": 130179888, "step": 8090 }, { "epoch": 0.5667603561953295, "grad_norm": 4.067645072937012, "learning_rate": 4.337716987740806e-05, "loss": 1.0071, "num_input_tokens_seen": 130196272, "step": 8091 }, { "epoch": 0.5668304044410588, "grad_norm": 3.646764039993286, "learning_rate": 4.337017162872155e-05, "loss": 0.9172, "num_input_tokens_seen": 130212656, "step": 8092 }, { "epoch": 0.566900452686788, "grad_norm": 5.098438739776611, "learning_rate": 4.336317338003503e-05, "loss": 0.9031, "num_input_tokens_seen": 130228608, "step": 8093 }, { "epoch": 0.5669705009325172, "grad_norm": 3.536712408065796, "learning_rate": 4.335617513134852e-05, "loss": 0.9787, "num_input_tokens_seen": 130244768, "step": 8094 }, { "epoch": 0.5670405491782465, "grad_norm": 4.888411998748779, "learning_rate": 4.3349176882661995e-05, "loss": 1.1445, "num_input_tokens_seen": 130260936, "step": 8095 }, { "epoch": 0.5671105974239757, "grad_norm": 4.441121578216553, "learning_rate": 4.3342178633975484e-05, "loss": 1.1489, "num_input_tokens_seen": 130277320, "step": 8096 }, { "epoch": 0.567180645669705, "grad_norm": 4.9977545738220215, "learning_rate": 4.333518038528898e-05, "loss": 1.0849, "num_input_tokens_seen": 130292984, "step": 8097 }, { "epoch": 0.5672506939154343, "grad_norm": 3.602999448776245, "learning_rate": 4.3328182136602454e-05, "loss": 0.8972, "num_input_tokens_seen": 130308376, "step": 8098 }, { "epoch": 0.5673207421611635, "grad_norm": 4.618570804595947, "learning_rate": 4.332118388791594e-05, "loss": 1.1175, "num_input_tokens_seen": 130324440, "step": 8099 }, { "epoch": 0.5673907904068928, "grad_norm": 4.875861644744873, "learning_rate": 4.3314185639229424e-05, "loss": 1.036, "num_input_tokens_seen": 130340160, "step": 8100 }, { "epoch": 0.567460838652622, "grad_norm": 3.3949697017669678, "learning_rate": 4.330718739054291e-05, "loss": 0.9029, "num_input_tokens_seen": 130355984, "step": 8101 }, { "epoch": 0.5675308868983513, "grad_norm": 5.05001163482666, "learning_rate": 4.330018914185639e-05, "loss": 1.095, "num_input_tokens_seen": 130372368, "step": 8102 }, { "epoch": 0.5676009351440805, "grad_norm": 3.913780689239502, "learning_rate": 4.329319089316988e-05, "loss": 1.1253, "num_input_tokens_seen": 130388752, "step": 8103 }, { "epoch": 0.5676709833898097, "grad_norm": 3.778477668762207, "learning_rate": 4.328619264448337e-05, "loss": 1.0927, "num_input_tokens_seen": 130405136, "step": 8104 }, { "epoch": 0.567741031635539, "grad_norm": 4.065064430236816, "learning_rate": 4.327919439579685e-05, "loss": 1.1245, "num_input_tokens_seen": 130421384, "step": 8105 }, { "epoch": 0.5678110798812682, "grad_norm": 3.609297037124634, "learning_rate": 4.3272196147110336e-05, "loss": 1.0434, "num_input_tokens_seen": 130437768, "step": 8106 }, { "epoch": 0.5678811281269974, "grad_norm": 4.521945476531982, "learning_rate": 4.326519789842382e-05, "loss": 1.0285, "num_input_tokens_seen": 130452640, "step": 8107 }, { "epoch": 0.5679511763727267, "grad_norm": 4.647178649902344, "learning_rate": 4.3258199649737306e-05, "loss": 1.1492, "num_input_tokens_seen": 130469024, "step": 8108 }, { "epoch": 0.5680212246184559, "grad_norm": 3.594043254852295, "learning_rate": 4.3251201401050795e-05, "loss": 0.8569, "num_input_tokens_seen": 130484648, "step": 8109 }, { "epoch": 0.5680912728641853, "grad_norm": 4.731708526611328, "learning_rate": 4.3244203152364276e-05, "loss": 1.3175, "num_input_tokens_seen": 130500024, "step": 8110 }, { "epoch": 0.5681613211099145, "grad_norm": 4.488846302032471, "learning_rate": 4.3237204903677765e-05, "loss": 1.0714, "num_input_tokens_seen": 130516408, "step": 8111 }, { "epoch": 0.5682313693556437, "grad_norm": 4.828577995300293, "learning_rate": 4.323020665499124e-05, "loss": 1.1976, "num_input_tokens_seen": 130532752, "step": 8112 }, { "epoch": 0.568301417601373, "grad_norm": 5.491752624511719, "learning_rate": 4.3223208406304735e-05, "loss": 1.249, "num_input_tokens_seen": 130549136, "step": 8113 }, { "epoch": 0.5683714658471022, "grad_norm": 3.4913344383239746, "learning_rate": 4.3216210157618224e-05, "loss": 1.0322, "num_input_tokens_seen": 130565448, "step": 8114 }, { "epoch": 0.5684415140928314, "grad_norm": 3.519788980484009, "learning_rate": 4.32092119089317e-05, "loss": 0.8711, "num_input_tokens_seen": 130581832, "step": 8115 }, { "epoch": 0.5685115623385607, "grad_norm": 4.691020488739014, "learning_rate": 4.320221366024519e-05, "loss": 1.1926, "num_input_tokens_seen": 130597608, "step": 8116 }, { "epoch": 0.5685816105842899, "grad_norm": 4.807389736175537, "learning_rate": 4.319521541155867e-05, "loss": 1.0702, "num_input_tokens_seen": 130613992, "step": 8117 }, { "epoch": 0.5686516588300192, "grad_norm": 4.060730457305908, "learning_rate": 4.318821716287216e-05, "loss": 1.2357, "num_input_tokens_seen": 130630376, "step": 8118 }, { "epoch": 0.5687217070757484, "grad_norm": 4.58277702331543, "learning_rate": 4.3181218914185647e-05, "loss": 1.0893, "num_input_tokens_seen": 130646112, "step": 8119 }, { "epoch": 0.5687917553214776, "grad_norm": 4.290525913238525, "learning_rate": 4.317422066549913e-05, "loss": 1.177, "num_input_tokens_seen": 130662496, "step": 8120 }, { "epoch": 0.568861803567207, "grad_norm": 4.339941501617432, "learning_rate": 4.316722241681262e-05, "loss": 0.9364, "num_input_tokens_seen": 130678880, "step": 8121 }, { "epoch": 0.5689318518129362, "grad_norm": 7.363986492156982, "learning_rate": 4.316022416812609e-05, "loss": 1.0718, "num_input_tokens_seen": 130693176, "step": 8122 }, { "epoch": 0.5690019000586654, "grad_norm": 4.054739475250244, "learning_rate": 4.315322591943959e-05, "loss": 1.147, "num_input_tokens_seen": 130709152, "step": 8123 }, { "epoch": 0.5690719483043947, "grad_norm": 3.5982446670532227, "learning_rate": 4.3146227670753076e-05, "loss": 0.9578, "num_input_tokens_seen": 130725424, "step": 8124 }, { "epoch": 0.5691419965501239, "grad_norm": 4.437469005584717, "learning_rate": 4.313922942206655e-05, "loss": 1.0498, "num_input_tokens_seen": 130740184, "step": 8125 }, { "epoch": 0.5692120447958532, "grad_norm": 3.761885643005371, "learning_rate": 4.313223117338004e-05, "loss": 1.0214, "num_input_tokens_seen": 130756568, "step": 8126 }, { "epoch": 0.5692820930415824, "grad_norm": 4.0984416007995605, "learning_rate": 4.312523292469352e-05, "loss": 0.8935, "num_input_tokens_seen": 130772248, "step": 8127 }, { "epoch": 0.5693521412873116, "grad_norm": 3.3904314041137695, "learning_rate": 4.311823467600701e-05, "loss": 0.9048, "num_input_tokens_seen": 130788632, "step": 8128 }, { "epoch": 0.5694221895330409, "grad_norm": 4.1015214920043945, "learning_rate": 4.311123642732049e-05, "loss": 1.0574, "num_input_tokens_seen": 130805016, "step": 8129 }, { "epoch": 0.5694922377787701, "grad_norm": 4.696201801300049, "learning_rate": 4.310423817863398e-05, "loss": 1.0341, "num_input_tokens_seen": 130821160, "step": 8130 }, { "epoch": 0.5695622860244993, "grad_norm": 4.361652374267578, "learning_rate": 4.309723992994747e-05, "loss": 0.9995, "num_input_tokens_seen": 130835640, "step": 8131 }, { "epoch": 0.5696323342702286, "grad_norm": 3.435509204864502, "learning_rate": 4.3090241681260944e-05, "loss": 0.8642, "num_input_tokens_seen": 130851176, "step": 8132 }, { "epoch": 0.5697023825159578, "grad_norm": 5.87947416305542, "learning_rate": 4.308324343257444e-05, "loss": 1.0376, "num_input_tokens_seen": 130866568, "step": 8133 }, { "epoch": 0.5697724307616872, "grad_norm": 3.353489398956299, "learning_rate": 4.3076245183887914e-05, "loss": 0.8508, "num_input_tokens_seen": 130882536, "step": 8134 }, { "epoch": 0.5698424790074164, "grad_norm": 3.4409422874450684, "learning_rate": 4.30692469352014e-05, "loss": 0.86, "num_input_tokens_seen": 130898848, "step": 8135 }, { "epoch": 0.5699125272531456, "grad_norm": 4.050583362579346, "learning_rate": 4.306224868651489e-05, "loss": 1.0886, "num_input_tokens_seen": 130915224, "step": 8136 }, { "epoch": 0.5699825754988749, "grad_norm": 3.630647659301758, "learning_rate": 4.3055250437828373e-05, "loss": 0.9504, "num_input_tokens_seen": 130931608, "step": 8137 }, { "epoch": 0.5700526237446041, "grad_norm": 3.3654730319976807, "learning_rate": 4.304825218914186e-05, "loss": 0.9361, "num_input_tokens_seen": 130947992, "step": 8138 }, { "epoch": 0.5701226719903334, "grad_norm": 4.898937702178955, "learning_rate": 4.3041253940455344e-05, "loss": 1.0703, "num_input_tokens_seen": 130963560, "step": 8139 }, { "epoch": 0.5701927202360626, "grad_norm": 3.9583377838134766, "learning_rate": 4.303425569176883e-05, "loss": 0.9444, "num_input_tokens_seen": 130978024, "step": 8140 }, { "epoch": 0.5702627684817918, "grad_norm": 3.5538089275360107, "learning_rate": 4.302725744308232e-05, "loss": 1.0337, "num_input_tokens_seen": 130993672, "step": 8141 }, { "epoch": 0.5703328167275211, "grad_norm": 3.731968879699707, "learning_rate": 4.3020259194395796e-05, "loss": 1.0219, "num_input_tokens_seen": 131010056, "step": 8142 }, { "epoch": 0.5704028649732503, "grad_norm": 4.178092002868652, "learning_rate": 4.301326094570929e-05, "loss": 1.0606, "num_input_tokens_seen": 131026440, "step": 8143 }, { "epoch": 0.5704729132189795, "grad_norm": 4.041304588317871, "learning_rate": 4.3006262697022766e-05, "loss": 1.0485, "num_input_tokens_seen": 131041816, "step": 8144 }, { "epoch": 0.5705429614647088, "grad_norm": 3.405008316040039, "learning_rate": 4.2999264448336255e-05, "loss": 0.9009, "num_input_tokens_seen": 131058200, "step": 8145 }, { "epoch": 0.570613009710438, "grad_norm": 4.503276348114014, "learning_rate": 4.2992266199649744e-05, "loss": 0.8806, "num_input_tokens_seen": 131074584, "step": 8146 }, { "epoch": 0.5706830579561674, "grad_norm": 3.5301196575164795, "learning_rate": 4.2985267950963225e-05, "loss": 0.918, "num_input_tokens_seen": 131090968, "step": 8147 }, { "epoch": 0.5707531062018966, "grad_norm": 3.807494878768921, "learning_rate": 4.2978269702276714e-05, "loss": 1.1168, "num_input_tokens_seen": 131106840, "step": 8148 }, { "epoch": 0.5708231544476258, "grad_norm": 3.9565365314483643, "learning_rate": 4.2971271453590196e-05, "loss": 1.01, "num_input_tokens_seen": 131123224, "step": 8149 }, { "epoch": 0.5708932026933551, "grad_norm": 3.4169907569885254, "learning_rate": 4.2964273204903684e-05, "loss": 0.9627, "num_input_tokens_seen": 131139608, "step": 8150 }, { "epoch": 0.5709632509390843, "grad_norm": 3.7201099395751953, "learning_rate": 4.295727495621717e-05, "loss": 1.0652, "num_input_tokens_seen": 131155896, "step": 8151 }, { "epoch": 0.5710332991848135, "grad_norm": 4.128871440887451, "learning_rate": 4.295027670753065e-05, "loss": 1.1233, "num_input_tokens_seen": 131171824, "step": 8152 }, { "epoch": 0.5711033474305428, "grad_norm": 4.046292304992676, "learning_rate": 4.294327845884414e-05, "loss": 1.0088, "num_input_tokens_seen": 131188208, "step": 8153 }, { "epoch": 0.571173395676272, "grad_norm": 4.006393909454346, "learning_rate": 4.293628021015762e-05, "loss": 1.0339, "num_input_tokens_seen": 131204592, "step": 8154 }, { "epoch": 0.5712434439220013, "grad_norm": 3.7372918128967285, "learning_rate": 4.292928196147111e-05, "loss": 0.8374, "num_input_tokens_seen": 131220552, "step": 8155 }, { "epoch": 0.5713134921677305, "grad_norm": 3.778796672821045, "learning_rate": 4.292228371278459e-05, "loss": 1.193, "num_input_tokens_seen": 131236936, "step": 8156 }, { "epoch": 0.5713835404134597, "grad_norm": 5.442931652069092, "learning_rate": 4.291528546409808e-05, "loss": 1.3328, "num_input_tokens_seen": 131253320, "step": 8157 }, { "epoch": 0.571453588659189, "grad_norm": 3.9510107040405273, "learning_rate": 4.2908287215411566e-05, "loss": 1.1776, "num_input_tokens_seen": 131269704, "step": 8158 }, { "epoch": 0.5715236369049183, "grad_norm": 3.787173271179199, "learning_rate": 4.290128896672505e-05, "loss": 0.9507, "num_input_tokens_seen": 131286088, "step": 8159 }, { "epoch": 0.5715936851506475, "grad_norm": 4.486179828643799, "learning_rate": 4.2894290718038536e-05, "loss": 0.8241, "num_input_tokens_seen": 131302472, "step": 8160 }, { "epoch": 0.5716637333963768, "grad_norm": 5.035008907318115, "learning_rate": 4.288729246935201e-05, "loss": 1.0102, "num_input_tokens_seen": 131318520, "step": 8161 }, { "epoch": 0.571733781642106, "grad_norm": 3.7599833011627197, "learning_rate": 4.28802942206655e-05, "loss": 0.8591, "num_input_tokens_seen": 131334904, "step": 8162 }, { "epoch": 0.5718038298878353, "grad_norm": 4.14766263961792, "learning_rate": 4.287329597197899e-05, "loss": 1.1655, "num_input_tokens_seen": 131351288, "step": 8163 }, { "epoch": 0.5718738781335645, "grad_norm": 3.914499044418335, "learning_rate": 4.286629772329247e-05, "loss": 1.0247, "num_input_tokens_seen": 131367672, "step": 8164 }, { "epoch": 0.5719439263792937, "grad_norm": 3.5725250244140625, "learning_rate": 4.285929947460596e-05, "loss": 1.0252, "num_input_tokens_seen": 131383968, "step": 8165 }, { "epoch": 0.572013974625023, "grad_norm": 3.5275251865386963, "learning_rate": 4.285230122591944e-05, "loss": 1.0562, "num_input_tokens_seen": 131400352, "step": 8166 }, { "epoch": 0.5720840228707522, "grad_norm": 5.734055042266846, "learning_rate": 4.284530297723293e-05, "loss": 0.8822, "num_input_tokens_seen": 131416168, "step": 8167 }, { "epoch": 0.5721540711164814, "grad_norm": 6.255549907684326, "learning_rate": 4.283830472854642e-05, "loss": 1.0778, "num_input_tokens_seen": 131432072, "step": 8168 }, { "epoch": 0.5722241193622107, "grad_norm": 5.589477062225342, "learning_rate": 4.283130647985989e-05, "loss": 0.9766, "num_input_tokens_seen": 131448456, "step": 8169 }, { "epoch": 0.5722941676079399, "grad_norm": 4.749863147735596, "learning_rate": 4.282430823117339e-05, "loss": 1.122, "num_input_tokens_seen": 131464840, "step": 8170 }, { "epoch": 0.5723642158536693, "grad_norm": 4.904588222503662, "learning_rate": 4.2817309982486864e-05, "loss": 1.1841, "num_input_tokens_seen": 131481224, "step": 8171 }, { "epoch": 0.5724342640993985, "grad_norm": 3.5946271419525146, "learning_rate": 4.281031173380035e-05, "loss": 0.8804, "num_input_tokens_seen": 131497608, "step": 8172 }, { "epoch": 0.5725043123451277, "grad_norm": 4.187933921813965, "learning_rate": 4.280331348511384e-05, "loss": 0.8924, "num_input_tokens_seen": 131513776, "step": 8173 }, { "epoch": 0.572574360590857, "grad_norm": 4.336525917053223, "learning_rate": 4.279631523642732e-05, "loss": 1.0626, "num_input_tokens_seen": 131529544, "step": 8174 }, { "epoch": 0.5726444088365862, "grad_norm": 4.156813621520996, "learning_rate": 4.278931698774081e-05, "loss": 0.9311, "num_input_tokens_seen": 131544880, "step": 8175 }, { "epoch": 0.5727144570823155, "grad_norm": 4.064977169036865, "learning_rate": 4.278231873905429e-05, "loss": 1.2461, "num_input_tokens_seen": 131560600, "step": 8176 }, { "epoch": 0.5727845053280447, "grad_norm": 3.8093082904815674, "learning_rate": 4.277532049036778e-05, "loss": 1.0079, "num_input_tokens_seen": 131576984, "step": 8177 }, { "epoch": 0.5728545535737739, "grad_norm": 3.8263261318206787, "learning_rate": 4.276832224168127e-05, "loss": 1.0441, "num_input_tokens_seen": 131592848, "step": 8178 }, { "epoch": 0.5729246018195032, "grad_norm": 4.268642425537109, "learning_rate": 4.2761323992994745e-05, "loss": 1.0139, "num_input_tokens_seen": 131609232, "step": 8179 }, { "epoch": 0.5729946500652324, "grad_norm": 3.6199145317077637, "learning_rate": 4.275432574430824e-05, "loss": 1.0993, "num_input_tokens_seen": 131625160, "step": 8180 }, { "epoch": 0.5730646983109616, "grad_norm": 5.106734275817871, "learning_rate": 4.2747327495621716e-05, "loss": 1.2588, "num_input_tokens_seen": 131641240, "step": 8181 }, { "epoch": 0.5731347465566909, "grad_norm": 4.9245381355285645, "learning_rate": 4.2740329246935204e-05, "loss": 0.9496, "num_input_tokens_seen": 131657624, "step": 8182 }, { "epoch": 0.5732047948024201, "grad_norm": 4.108707904815674, "learning_rate": 4.2733330998248686e-05, "loss": 1.0043, "num_input_tokens_seen": 131673400, "step": 8183 }, { "epoch": 0.5732748430481495, "grad_norm": 4.672266006469727, "learning_rate": 4.2726332749562175e-05, "loss": 1.1272, "num_input_tokens_seen": 131689784, "step": 8184 }, { "epoch": 0.5733448912938787, "grad_norm": 5.033571720123291, "learning_rate": 4.271933450087566e-05, "loss": 1.067, "num_input_tokens_seen": 131706168, "step": 8185 }, { "epoch": 0.5734149395396079, "grad_norm": 4.336825847625732, "learning_rate": 4.2712336252189145e-05, "loss": 0.9631, "num_input_tokens_seen": 131722496, "step": 8186 }, { "epoch": 0.5734849877853372, "grad_norm": 3.6326375007629395, "learning_rate": 4.2705338003502634e-05, "loss": 0.8525, "num_input_tokens_seen": 131738880, "step": 8187 }, { "epoch": 0.5735550360310664, "grad_norm": 3.407284736633301, "learning_rate": 4.269833975481611e-05, "loss": 1.099, "num_input_tokens_seen": 131755080, "step": 8188 }, { "epoch": 0.5736250842767956, "grad_norm": 5.604672431945801, "learning_rate": 4.26913415061296e-05, "loss": 1.0606, "num_input_tokens_seen": 131771464, "step": 8189 }, { "epoch": 0.5736951325225249, "grad_norm": 3.4375531673431396, "learning_rate": 4.268434325744309e-05, "loss": 0.9977, "num_input_tokens_seen": 131787848, "step": 8190 }, { "epoch": 0.5737651807682541, "grad_norm": 4.113184928894043, "learning_rate": 4.267734500875657e-05, "loss": 1.0283, "num_input_tokens_seen": 131804232, "step": 8191 }, { "epoch": 0.5738352290139834, "grad_norm": 5.918920516967773, "learning_rate": 4.2670346760070056e-05, "loss": 1.168, "num_input_tokens_seen": 131819608, "step": 8192 }, { "epoch": 0.5739052772597126, "grad_norm": 4.387043476104736, "learning_rate": 4.266334851138354e-05, "loss": 1.2334, "num_input_tokens_seen": 131835992, "step": 8193 }, { "epoch": 0.5739753255054418, "grad_norm": 4.381311416625977, "learning_rate": 4.2656350262697027e-05, "loss": 1.0675, "num_input_tokens_seen": 131851920, "step": 8194 }, { "epoch": 0.5740453737511712, "grad_norm": 6.177340507507324, "learning_rate": 4.2649352014010515e-05, "loss": 1.1517, "num_input_tokens_seen": 131867240, "step": 8195 }, { "epoch": 0.5741154219969004, "grad_norm": 4.505295276641846, "learning_rate": 4.2642353765324e-05, "loss": 1.1147, "num_input_tokens_seen": 131883408, "step": 8196 }, { "epoch": 0.5741854702426296, "grad_norm": 3.9150309562683105, "learning_rate": 4.2635355516637486e-05, "loss": 0.9609, "num_input_tokens_seen": 131899144, "step": 8197 }, { "epoch": 0.5742555184883589, "grad_norm": 4.148092269897461, "learning_rate": 4.262835726795096e-05, "loss": 1.0709, "num_input_tokens_seen": 131915528, "step": 8198 }, { "epoch": 0.5743255667340881, "grad_norm": 3.353017807006836, "learning_rate": 4.262135901926445e-05, "loss": 1.0293, "num_input_tokens_seen": 131931912, "step": 8199 }, { "epoch": 0.5743956149798174, "grad_norm": 3.949207305908203, "learning_rate": 4.2614360770577945e-05, "loss": 0.8957, "num_input_tokens_seen": 131947712, "step": 8200 }, { "epoch": 0.5743956149798174, "eval_loss": 1.1200522184371948, "eval_runtime": 0.2045, "eval_samples_per_second": 4.889, "eval_steps_per_second": 4.889, "num_input_tokens_seen": 131947712, "step": 8200 }, { "epoch": 0.5744656632255466, "grad_norm": 3.978220224380493, "learning_rate": 4.260736252189142e-05, "loss": 0.9035, "num_input_tokens_seen": 131963208, "step": 8201 }, { "epoch": 0.5745357114712758, "grad_norm": 3.7986531257629395, "learning_rate": 4.260036427320491e-05, "loss": 0.8985, "num_input_tokens_seen": 131978048, "step": 8202 }, { "epoch": 0.5746057597170051, "grad_norm": 3.9384713172912598, "learning_rate": 4.259336602451839e-05, "loss": 1.1815, "num_input_tokens_seen": 131994432, "step": 8203 }, { "epoch": 0.5746758079627343, "grad_norm": 4.360903739929199, "learning_rate": 4.258636777583188e-05, "loss": 1.107, "num_input_tokens_seen": 132010504, "step": 8204 }, { "epoch": 0.5747458562084636, "grad_norm": 4.957514762878418, "learning_rate": 4.2579369527145354e-05, "loss": 1.1793, "num_input_tokens_seen": 132026192, "step": 8205 }, { "epoch": 0.5748159044541928, "grad_norm": 4.4821882247924805, "learning_rate": 4.257237127845885e-05, "loss": 1.1506, "num_input_tokens_seen": 132041624, "step": 8206 }, { "epoch": 0.574885952699922, "grad_norm": 3.915302276611328, "learning_rate": 4.256537302977234e-05, "loss": 1.0654, "num_input_tokens_seen": 132056600, "step": 8207 }, { "epoch": 0.5749560009456514, "grad_norm": 3.67526912689209, "learning_rate": 4.255837478108581e-05, "loss": 1.101, "num_input_tokens_seen": 132072984, "step": 8208 }, { "epoch": 0.5750260491913806, "grad_norm": 5.455066204071045, "learning_rate": 4.25513765323993e-05, "loss": 1.1665, "num_input_tokens_seen": 132089368, "step": 8209 }, { "epoch": 0.5750960974371098, "grad_norm": 3.7879862785339355, "learning_rate": 4.254437828371278e-05, "loss": 1.2125, "num_input_tokens_seen": 132105752, "step": 8210 }, { "epoch": 0.5751661456828391, "grad_norm": 4.033114910125732, "learning_rate": 4.253738003502627e-05, "loss": 1.1133, "num_input_tokens_seen": 132122136, "step": 8211 }, { "epoch": 0.5752361939285683, "grad_norm": 3.596369743347168, "learning_rate": 4.253038178633976e-05, "loss": 1.0748, "num_input_tokens_seen": 132138520, "step": 8212 }, { "epoch": 0.5753062421742976, "grad_norm": 4.42296838760376, "learning_rate": 4.252338353765324e-05, "loss": 1.0508, "num_input_tokens_seen": 132154904, "step": 8213 }, { "epoch": 0.5753762904200268, "grad_norm": 3.5020558834075928, "learning_rate": 4.251638528896673e-05, "loss": 0.9663, "num_input_tokens_seen": 132171248, "step": 8214 }, { "epoch": 0.575446338665756, "grad_norm": 4.176201343536377, "learning_rate": 4.2509387040280206e-05, "loss": 1.0606, "num_input_tokens_seen": 132187296, "step": 8215 }, { "epoch": 0.5755163869114853, "grad_norm": 3.459134817123413, "learning_rate": 4.25023887915937e-05, "loss": 0.87, "num_input_tokens_seen": 132203680, "step": 8216 }, { "epoch": 0.5755864351572145, "grad_norm": 3.870122194290161, "learning_rate": 4.249539054290719e-05, "loss": 1.005, "num_input_tokens_seen": 132220064, "step": 8217 }, { "epoch": 0.5756564834029437, "grad_norm": 4.246501922607422, "learning_rate": 4.2488392294220665e-05, "loss": 1.246, "num_input_tokens_seen": 132236248, "step": 8218 }, { "epoch": 0.575726531648673, "grad_norm": 4.028504371643066, "learning_rate": 4.248139404553415e-05, "loss": 1.1537, "num_input_tokens_seen": 132252624, "step": 8219 }, { "epoch": 0.5757965798944022, "grad_norm": 5.083249092102051, "learning_rate": 4.2474395796847635e-05, "loss": 1.1357, "num_input_tokens_seen": 132269008, "step": 8220 }, { "epoch": 0.5758666281401316, "grad_norm": 4.459508419036865, "learning_rate": 4.2467397548161124e-05, "loss": 0.9581, "num_input_tokens_seen": 132285392, "step": 8221 }, { "epoch": 0.5759366763858608, "grad_norm": 3.5667686462402344, "learning_rate": 4.246039929947461e-05, "loss": 0.9399, "num_input_tokens_seen": 132301584, "step": 8222 }, { "epoch": 0.57600672463159, "grad_norm": 3.982933759689331, "learning_rate": 4.2453401050788094e-05, "loss": 1.2309, "num_input_tokens_seen": 132317968, "step": 8223 }, { "epoch": 0.5760767728773193, "grad_norm": 3.59114408493042, "learning_rate": 4.244640280210158e-05, "loss": 0.9939, "num_input_tokens_seen": 132334352, "step": 8224 }, { "epoch": 0.5761468211230485, "grad_norm": 4.596956729888916, "learning_rate": 4.243940455341506e-05, "loss": 1.1058, "num_input_tokens_seen": 132350736, "step": 8225 }, { "epoch": 0.5762168693687777, "grad_norm": 4.798111915588379, "learning_rate": 4.243240630472855e-05, "loss": 1.1164, "num_input_tokens_seen": 132367120, "step": 8226 }, { "epoch": 0.576286917614507, "grad_norm": 3.9915149211883545, "learning_rate": 4.242540805604204e-05, "loss": 1.1464, "num_input_tokens_seen": 132382712, "step": 8227 }, { "epoch": 0.5763569658602362, "grad_norm": 3.8071064949035645, "learning_rate": 4.241840980735552e-05, "loss": 0.8643, "num_input_tokens_seen": 132399096, "step": 8228 }, { "epoch": 0.5764270141059655, "grad_norm": 3.679708242416382, "learning_rate": 4.2411411558669005e-05, "loss": 1.1226, "num_input_tokens_seen": 132415288, "step": 8229 }, { "epoch": 0.5764970623516947, "grad_norm": 3.723848581314087, "learning_rate": 4.240441330998249e-05, "loss": 1.0517, "num_input_tokens_seen": 132431672, "step": 8230 }, { "epoch": 0.5765671105974239, "grad_norm": 4.438333988189697, "learning_rate": 4.2397415061295976e-05, "loss": 1.1601, "num_input_tokens_seen": 132447408, "step": 8231 }, { "epoch": 0.5766371588431533, "grad_norm": 3.4421169757843018, "learning_rate": 4.239041681260946e-05, "loss": 1.017, "num_input_tokens_seen": 132463600, "step": 8232 }, { "epoch": 0.5767072070888825, "grad_norm": 3.851832389831543, "learning_rate": 4.2383418563922946e-05, "loss": 1.0237, "num_input_tokens_seen": 132479984, "step": 8233 }, { "epoch": 0.5767772553346117, "grad_norm": 5.055593967437744, "learning_rate": 4.2376420315236435e-05, "loss": 1.0512, "num_input_tokens_seen": 132495872, "step": 8234 }, { "epoch": 0.576847303580341, "grad_norm": 3.958918809890747, "learning_rate": 4.236942206654991e-05, "loss": 1.0412, "num_input_tokens_seen": 132512256, "step": 8235 }, { "epoch": 0.5769173518260702, "grad_norm": 3.6936240196228027, "learning_rate": 4.2362423817863405e-05, "loss": 1.0251, "num_input_tokens_seen": 132528640, "step": 8236 }, { "epoch": 0.5769874000717995, "grad_norm": 4.020928859710693, "learning_rate": 4.235542556917688e-05, "loss": 0.979, "num_input_tokens_seen": 132543712, "step": 8237 }, { "epoch": 0.5770574483175287, "grad_norm": 3.5804123878479004, "learning_rate": 4.234842732049037e-05, "loss": 1.0581, "num_input_tokens_seen": 132559568, "step": 8238 }, { "epoch": 0.5771274965632579, "grad_norm": 5.450964450836182, "learning_rate": 4.234142907180386e-05, "loss": 1.0109, "num_input_tokens_seen": 132575952, "step": 8239 }, { "epoch": 0.5771975448089872, "grad_norm": 4.571927547454834, "learning_rate": 4.233443082311734e-05, "loss": 1.0427, "num_input_tokens_seen": 132591832, "step": 8240 }, { "epoch": 0.5772675930547164, "grad_norm": 5.09351921081543, "learning_rate": 4.232743257443083e-05, "loss": 0.9558, "num_input_tokens_seen": 132608216, "step": 8241 }, { "epoch": 0.5773376413004457, "grad_norm": 3.211693525314331, "learning_rate": 4.232043432574431e-05, "loss": 0.9804, "num_input_tokens_seen": 132624600, "step": 8242 }, { "epoch": 0.5774076895461749, "grad_norm": 3.822723865509033, "learning_rate": 4.23134360770578e-05, "loss": 1.0267, "num_input_tokens_seen": 132640984, "step": 8243 }, { "epoch": 0.5774777377919041, "grad_norm": 5.541271686553955, "learning_rate": 4.230643782837129e-05, "loss": 0.7341, "num_input_tokens_seen": 132657368, "step": 8244 }, { "epoch": 0.5775477860376335, "grad_norm": 6.682171821594238, "learning_rate": 4.229943957968476e-05, "loss": 0.9682, "num_input_tokens_seen": 132673752, "step": 8245 }, { "epoch": 0.5776178342833627, "grad_norm": 3.4161407947540283, "learning_rate": 4.229244133099826e-05, "loss": 0.9307, "num_input_tokens_seen": 132690136, "step": 8246 }, { "epoch": 0.5776878825290919, "grad_norm": 3.6526873111724854, "learning_rate": 4.228544308231173e-05, "loss": 1.0356, "num_input_tokens_seen": 132706520, "step": 8247 }, { "epoch": 0.5777579307748212, "grad_norm": 4.506532192230225, "learning_rate": 4.227844483362522e-05, "loss": 0.9916, "num_input_tokens_seen": 132721600, "step": 8248 }, { "epoch": 0.5778279790205504, "grad_norm": 3.9028801918029785, "learning_rate": 4.227144658493871e-05, "loss": 1.1071, "num_input_tokens_seen": 132737728, "step": 8249 }, { "epoch": 0.5778980272662797, "grad_norm": 4.508323669433594, "learning_rate": 4.226444833625219e-05, "loss": 1.044, "num_input_tokens_seen": 132753240, "step": 8250 }, { "epoch": 0.5779680755120089, "grad_norm": 4.045495986938477, "learning_rate": 4.225745008756568e-05, "loss": 1.0133, "num_input_tokens_seen": 132769024, "step": 8251 }, { "epoch": 0.5780381237577381, "grad_norm": 3.3747479915618896, "learning_rate": 4.225045183887916e-05, "loss": 0.9641, "num_input_tokens_seen": 132785408, "step": 8252 }, { "epoch": 0.5781081720034674, "grad_norm": 3.527862787246704, "learning_rate": 4.224345359019265e-05, "loss": 0.9351, "num_input_tokens_seen": 132801792, "step": 8253 }, { "epoch": 0.5781782202491966, "grad_norm": 5.42443323135376, "learning_rate": 4.223645534150614e-05, "loss": 0.9197, "num_input_tokens_seen": 132817536, "step": 8254 }, { "epoch": 0.5782482684949258, "grad_norm": 3.450995922088623, "learning_rate": 4.2229457092819614e-05, "loss": 1.1086, "num_input_tokens_seen": 132833920, "step": 8255 }, { "epoch": 0.5783183167406551, "grad_norm": 3.824470043182373, "learning_rate": 4.222245884413311e-05, "loss": 1.0504, "num_input_tokens_seen": 132850304, "step": 8256 }, { "epoch": 0.5783883649863844, "grad_norm": 4.278646945953369, "learning_rate": 4.2215460595446584e-05, "loss": 0.766, "num_input_tokens_seen": 132865216, "step": 8257 }, { "epoch": 0.5784584132321137, "grad_norm": 3.582697629928589, "learning_rate": 4.220846234676007e-05, "loss": 0.9996, "num_input_tokens_seen": 132881600, "step": 8258 }, { "epoch": 0.5785284614778429, "grad_norm": 4.3087849617004395, "learning_rate": 4.2201464098073555e-05, "loss": 0.9265, "num_input_tokens_seen": 132897360, "step": 8259 }, { "epoch": 0.5785985097235721, "grad_norm": 3.9091784954071045, "learning_rate": 4.219446584938704e-05, "loss": 1.0683, "num_input_tokens_seen": 132913504, "step": 8260 }, { "epoch": 0.5786685579693014, "grad_norm": 3.899811267852783, "learning_rate": 4.218746760070053e-05, "loss": 1.0876, "num_input_tokens_seen": 132929184, "step": 8261 }, { "epoch": 0.5787386062150306, "grad_norm": 4.090141773223877, "learning_rate": 4.2180469352014014e-05, "loss": 1.1917, "num_input_tokens_seen": 132945016, "step": 8262 }, { "epoch": 0.5788086544607598, "grad_norm": 4.3014631271362305, "learning_rate": 4.21734711033275e-05, "loss": 0.8849, "num_input_tokens_seen": 132961224, "step": 8263 }, { "epoch": 0.5788787027064891, "grad_norm": 4.155791282653809, "learning_rate": 4.216647285464098e-05, "loss": 1.2749, "num_input_tokens_seen": 132977464, "step": 8264 }, { "epoch": 0.5789487509522183, "grad_norm": 3.6278793811798096, "learning_rate": 4.2159474605954466e-05, "loss": 0.9852, "num_input_tokens_seen": 132993848, "step": 8265 }, { "epoch": 0.5790187991979476, "grad_norm": 3.477964401245117, "learning_rate": 4.215247635726796e-05, "loss": 0.8369, "num_input_tokens_seen": 133010232, "step": 8266 }, { "epoch": 0.5790888474436768, "grad_norm": 5.3280439376831055, "learning_rate": 4.2145478108581436e-05, "loss": 0.9703, "num_input_tokens_seen": 133026616, "step": 8267 }, { "epoch": 0.579158895689406, "grad_norm": 3.5057740211486816, "learning_rate": 4.2138479859894925e-05, "loss": 0.9546, "num_input_tokens_seen": 133043000, "step": 8268 }, { "epoch": 0.5792289439351354, "grad_norm": 6.108116149902344, "learning_rate": 4.2131481611208407e-05, "loss": 0.9677, "num_input_tokens_seen": 133058584, "step": 8269 }, { "epoch": 0.5792989921808646, "grad_norm": 3.764878273010254, "learning_rate": 4.2124483362521895e-05, "loss": 0.9287, "num_input_tokens_seen": 133074176, "step": 8270 }, { "epoch": 0.5793690404265938, "grad_norm": 3.9870357513427734, "learning_rate": 4.2117485113835384e-05, "loss": 1.1149, "num_input_tokens_seen": 133090320, "step": 8271 }, { "epoch": 0.5794390886723231, "grad_norm": 3.3531510829925537, "learning_rate": 4.2110486865148866e-05, "loss": 0.7976, "num_input_tokens_seen": 133106312, "step": 8272 }, { "epoch": 0.5795091369180523, "grad_norm": 3.321676731109619, "learning_rate": 4.2103488616462354e-05, "loss": 0.8953, "num_input_tokens_seen": 133122696, "step": 8273 }, { "epoch": 0.5795791851637816, "grad_norm": 4.315324306488037, "learning_rate": 4.209649036777583e-05, "loss": 1.0502, "num_input_tokens_seen": 133139080, "step": 8274 }, { "epoch": 0.5796492334095108, "grad_norm": 3.9912524223327637, "learning_rate": 4.208949211908932e-05, "loss": 1.0919, "num_input_tokens_seen": 133155192, "step": 8275 }, { "epoch": 0.57971928165524, "grad_norm": 3.8014609813690186, "learning_rate": 4.208249387040281e-05, "loss": 1.0997, "num_input_tokens_seen": 133171576, "step": 8276 }, { "epoch": 0.5797893299009693, "grad_norm": 3.790175437927246, "learning_rate": 4.207549562171629e-05, "loss": 0.9689, "num_input_tokens_seen": 133187448, "step": 8277 }, { "epoch": 0.5798593781466985, "grad_norm": 3.7050399780273438, "learning_rate": 4.206849737302978e-05, "loss": 1.0661, "num_input_tokens_seen": 133203832, "step": 8278 }, { "epoch": 0.5799294263924278, "grad_norm": 3.9394073486328125, "learning_rate": 4.206149912434326e-05, "loss": 1.0586, "num_input_tokens_seen": 133219176, "step": 8279 }, { "epoch": 0.579999474638157, "grad_norm": 3.8162503242492676, "learning_rate": 4.205450087565675e-05, "loss": 1.0055, "num_input_tokens_seen": 133235272, "step": 8280 }, { "epoch": 0.5800695228838862, "grad_norm": 4.20829439163208, "learning_rate": 4.2047502626970236e-05, "loss": 1.0363, "num_input_tokens_seen": 133250872, "step": 8281 }, { "epoch": 0.5801395711296156, "grad_norm": 4.279886722564697, "learning_rate": 4.204050437828372e-05, "loss": 1.1982, "num_input_tokens_seen": 133266680, "step": 8282 }, { "epoch": 0.5802096193753448, "grad_norm": 4.4257354736328125, "learning_rate": 4.2033506129597206e-05, "loss": 1.1011, "num_input_tokens_seen": 133282928, "step": 8283 }, { "epoch": 0.580279667621074, "grad_norm": 4.393752098083496, "learning_rate": 4.202650788091068e-05, "loss": 1.373, "num_input_tokens_seen": 133299312, "step": 8284 }, { "epoch": 0.5803497158668033, "grad_norm": 3.4351353645324707, "learning_rate": 4.201950963222417e-05, "loss": 0.9685, "num_input_tokens_seen": 133315568, "step": 8285 }, { "epoch": 0.5804197641125325, "grad_norm": 5.595382213592529, "learning_rate": 4.201251138353765e-05, "loss": 0.9129, "num_input_tokens_seen": 133331952, "step": 8286 }, { "epoch": 0.5804898123582618, "grad_norm": 3.8465957641601562, "learning_rate": 4.200551313485114e-05, "loss": 1.083, "num_input_tokens_seen": 133347992, "step": 8287 }, { "epoch": 0.580559860603991, "grad_norm": 3.53275465965271, "learning_rate": 4.199851488616463e-05, "loss": 0.8685, "num_input_tokens_seen": 133364376, "step": 8288 }, { "epoch": 0.5806299088497202, "grad_norm": 3.821805000305176, "learning_rate": 4.199151663747811e-05, "loss": 0.9433, "num_input_tokens_seen": 133380760, "step": 8289 }, { "epoch": 0.5806999570954495, "grad_norm": 3.843419313430786, "learning_rate": 4.19845183887916e-05, "loss": 1.0135, "num_input_tokens_seen": 133397144, "step": 8290 }, { "epoch": 0.5807700053411787, "grad_norm": 3.5370657444000244, "learning_rate": 4.1977520140105074e-05, "loss": 1.0259, "num_input_tokens_seen": 133413528, "step": 8291 }, { "epoch": 0.5808400535869079, "grad_norm": 3.9939937591552734, "learning_rate": 4.197052189141857e-05, "loss": 1.1104, "num_input_tokens_seen": 133429912, "step": 8292 }, { "epoch": 0.5809101018326372, "grad_norm": 3.8171210289001465, "learning_rate": 4.196352364273206e-05, "loss": 1.0962, "num_input_tokens_seen": 133446280, "step": 8293 }, { "epoch": 0.5809801500783665, "grad_norm": 5.993227481842041, "learning_rate": 4.195652539404553e-05, "loss": 1.0393, "num_input_tokens_seen": 133462272, "step": 8294 }, { "epoch": 0.5810501983240958, "grad_norm": 3.480929136276245, "learning_rate": 4.194952714535902e-05, "loss": 0.8842, "num_input_tokens_seen": 133478656, "step": 8295 }, { "epoch": 0.581120246569825, "grad_norm": 4.371162414550781, "learning_rate": 4.1942528896672504e-05, "loss": 0.861, "num_input_tokens_seen": 133495040, "step": 8296 }, { "epoch": 0.5811902948155542, "grad_norm": 3.9835524559020996, "learning_rate": 4.193553064798599e-05, "loss": 1.025, "num_input_tokens_seen": 133510576, "step": 8297 }, { "epoch": 0.5812603430612835, "grad_norm": 3.935680866241455, "learning_rate": 4.192853239929948e-05, "loss": 1.2121, "num_input_tokens_seen": 133526960, "step": 8298 }, { "epoch": 0.5813303913070127, "grad_norm": 5.499049186706543, "learning_rate": 4.192153415061296e-05, "loss": 0.9905, "num_input_tokens_seen": 133542336, "step": 8299 }, { "epoch": 0.5814004395527419, "grad_norm": 4.98264217376709, "learning_rate": 4.191453590192645e-05, "loss": 1.0016, "num_input_tokens_seen": 133556464, "step": 8300 }, { "epoch": 0.5814704877984712, "grad_norm": 4.091787338256836, "learning_rate": 4.1907537653239926e-05, "loss": 0.9863, "num_input_tokens_seen": 133572280, "step": 8301 }, { "epoch": 0.5815405360442004, "grad_norm": 3.9688591957092285, "learning_rate": 4.190053940455342e-05, "loss": 1.0631, "num_input_tokens_seen": 133588224, "step": 8302 }, { "epoch": 0.5816105842899297, "grad_norm": 3.7555758953094482, "learning_rate": 4.189354115586691e-05, "loss": 1.0418, "num_input_tokens_seen": 133604016, "step": 8303 }, { "epoch": 0.5816806325356589, "grad_norm": 3.423367977142334, "learning_rate": 4.1886542907180385e-05, "loss": 1.0608, "num_input_tokens_seen": 133620400, "step": 8304 }, { "epoch": 0.5817506807813881, "grad_norm": 4.812343597412109, "learning_rate": 4.1879544658493874e-05, "loss": 1.0295, "num_input_tokens_seen": 133636008, "step": 8305 }, { "epoch": 0.5818207290271175, "grad_norm": 4.64470100402832, "learning_rate": 4.1872546409807356e-05, "loss": 1.1065, "num_input_tokens_seen": 133651944, "step": 8306 }, { "epoch": 0.5818907772728467, "grad_norm": 3.9178478717803955, "learning_rate": 4.1865548161120844e-05, "loss": 1.162, "num_input_tokens_seen": 133667336, "step": 8307 }, { "epoch": 0.5819608255185759, "grad_norm": 3.7328476905822754, "learning_rate": 4.185854991243433e-05, "loss": 0.9681, "num_input_tokens_seen": 133683720, "step": 8308 }, { "epoch": 0.5820308737643052, "grad_norm": 5.883306980133057, "learning_rate": 4.1851551663747815e-05, "loss": 1.0203, "num_input_tokens_seen": 133700104, "step": 8309 }, { "epoch": 0.5821009220100344, "grad_norm": 3.6011807918548584, "learning_rate": 4.18445534150613e-05, "loss": 0.8344, "num_input_tokens_seen": 133716184, "step": 8310 }, { "epoch": 0.5821709702557637, "grad_norm": 5.971445083618164, "learning_rate": 4.183755516637478e-05, "loss": 1.0888, "num_input_tokens_seen": 133732552, "step": 8311 }, { "epoch": 0.5822410185014929, "grad_norm": 3.7484395503997803, "learning_rate": 4.1830556917688274e-05, "loss": 0.9394, "num_input_tokens_seen": 133748400, "step": 8312 }, { "epoch": 0.5823110667472221, "grad_norm": 4.420420169830322, "learning_rate": 4.182355866900175e-05, "loss": 0.8677, "num_input_tokens_seen": 133764784, "step": 8313 }, { "epoch": 0.5823811149929514, "grad_norm": 4.270792484283447, "learning_rate": 4.181656042031524e-05, "loss": 1.0086, "num_input_tokens_seen": 133780472, "step": 8314 }, { "epoch": 0.5824511632386806, "grad_norm": 3.99307918548584, "learning_rate": 4.1809562171628726e-05, "loss": 0.8987, "num_input_tokens_seen": 133796856, "step": 8315 }, { "epoch": 0.5825212114844099, "grad_norm": 3.3790438175201416, "learning_rate": 4.180256392294221e-05, "loss": 0.8995, "num_input_tokens_seen": 133813240, "step": 8316 }, { "epoch": 0.5825912597301391, "grad_norm": 3.9880869388580322, "learning_rate": 4.1795565674255696e-05, "loss": 1.0347, "num_input_tokens_seen": 133829624, "step": 8317 }, { "epoch": 0.5826613079758683, "grad_norm": 4.359625339508057, "learning_rate": 4.178856742556918e-05, "loss": 1.0979, "num_input_tokens_seen": 133845048, "step": 8318 }, { "epoch": 0.5827313562215977, "grad_norm": 3.856341600418091, "learning_rate": 4.178156917688267e-05, "loss": 1.0426, "num_input_tokens_seen": 133861432, "step": 8319 }, { "epoch": 0.5828014044673269, "grad_norm": 4.141717433929443, "learning_rate": 4.1774570928196155e-05, "loss": 1.1243, "num_input_tokens_seen": 133877176, "step": 8320 }, { "epoch": 0.5828714527130561, "grad_norm": 4.0302605628967285, "learning_rate": 4.176757267950963e-05, "loss": 0.9288, "num_input_tokens_seen": 133892840, "step": 8321 }, { "epoch": 0.5829415009587854, "grad_norm": 4.464367389678955, "learning_rate": 4.1760574430823126e-05, "loss": 1.2155, "num_input_tokens_seen": 133909224, "step": 8322 }, { "epoch": 0.5830115492045146, "grad_norm": 5.884219169616699, "learning_rate": 4.17535761821366e-05, "loss": 1.2055, "num_input_tokens_seen": 133925432, "step": 8323 }, { "epoch": 0.5830815974502439, "grad_norm": 4.006690979003906, "learning_rate": 4.174657793345009e-05, "loss": 1.0853, "num_input_tokens_seen": 133941816, "step": 8324 }, { "epoch": 0.5831516456959731, "grad_norm": 4.723453998565674, "learning_rate": 4.173957968476358e-05, "loss": 1.0426, "num_input_tokens_seen": 133958200, "step": 8325 }, { "epoch": 0.5832216939417023, "grad_norm": 4.328615665435791, "learning_rate": 4.173258143607706e-05, "loss": 1.3134, "num_input_tokens_seen": 133974584, "step": 8326 }, { "epoch": 0.5832917421874316, "grad_norm": 3.6810529232025146, "learning_rate": 4.172558318739055e-05, "loss": 1.0627, "num_input_tokens_seen": 133990936, "step": 8327 }, { "epoch": 0.5833617904331608, "grad_norm": 4.4679741859436035, "learning_rate": 4.171858493870403e-05, "loss": 1.0715, "num_input_tokens_seen": 134006488, "step": 8328 }, { "epoch": 0.58343183867889, "grad_norm": 3.5542099475860596, "learning_rate": 4.171158669001752e-05, "loss": 0.9576, "num_input_tokens_seen": 134022128, "step": 8329 }, { "epoch": 0.5835018869246194, "grad_norm": 5.318003177642822, "learning_rate": 4.170458844133101e-05, "loss": 1.0429, "num_input_tokens_seen": 134038512, "step": 8330 }, { "epoch": 0.5835719351703486, "grad_norm": 3.7052905559539795, "learning_rate": 4.169759019264448e-05, "loss": 1.0589, "num_input_tokens_seen": 134054816, "step": 8331 }, { "epoch": 0.5836419834160779, "grad_norm": 3.937094211578369, "learning_rate": 4.169059194395798e-05, "loss": 0.8523, "num_input_tokens_seen": 134071200, "step": 8332 }, { "epoch": 0.5837120316618071, "grad_norm": 4.033092498779297, "learning_rate": 4.168359369527145e-05, "loss": 1.0589, "num_input_tokens_seen": 134087416, "step": 8333 }, { "epoch": 0.5837820799075363, "grad_norm": 3.4819228649139404, "learning_rate": 4.167659544658494e-05, "loss": 0.8606, "num_input_tokens_seen": 134103800, "step": 8334 }, { "epoch": 0.5838521281532656, "grad_norm": 4.7478861808776855, "learning_rate": 4.166959719789843e-05, "loss": 1.0191, "num_input_tokens_seen": 134119856, "step": 8335 }, { "epoch": 0.5839221763989948, "grad_norm": 3.6348135471343994, "learning_rate": 4.166259894921191e-05, "loss": 0.9789, "num_input_tokens_seen": 134135992, "step": 8336 }, { "epoch": 0.583992224644724, "grad_norm": 3.5731043815612793, "learning_rate": 4.16556007005254e-05, "loss": 0.9688, "num_input_tokens_seen": 134152376, "step": 8337 }, { "epoch": 0.5840622728904533, "grad_norm": 4.3569254875183105, "learning_rate": 4.164860245183888e-05, "loss": 1.1394, "num_input_tokens_seen": 134168760, "step": 8338 }, { "epoch": 0.5841323211361825, "grad_norm": 3.468846082687378, "learning_rate": 4.164160420315237e-05, "loss": 0.9848, "num_input_tokens_seen": 134185144, "step": 8339 }, { "epoch": 0.5842023693819118, "grad_norm": 6.284951210021973, "learning_rate": 4.1634605954465846e-05, "loss": 0.9518, "num_input_tokens_seen": 134200680, "step": 8340 }, { "epoch": 0.584272417627641, "grad_norm": 4.579503536224365, "learning_rate": 4.1627607705779334e-05, "loss": 0.9929, "num_input_tokens_seen": 134217064, "step": 8341 }, { "epoch": 0.5843424658733702, "grad_norm": 3.716926336288452, "learning_rate": 4.162060945709283e-05, "loss": 1.0092, "num_input_tokens_seen": 134233448, "step": 8342 }, { "epoch": 0.5844125141190996, "grad_norm": 4.120275974273682, "learning_rate": 4.1613611208406305e-05, "loss": 1.097, "num_input_tokens_seen": 134249832, "step": 8343 }, { "epoch": 0.5844825623648288, "grad_norm": 5.749308109283447, "learning_rate": 4.160661295971979e-05, "loss": 1.0097, "num_input_tokens_seen": 134266216, "step": 8344 }, { "epoch": 0.5845526106105581, "grad_norm": 3.9694550037384033, "learning_rate": 4.1599614711033275e-05, "loss": 0.9815, "num_input_tokens_seen": 134282600, "step": 8345 }, { "epoch": 0.5846226588562873, "grad_norm": 4.208174705505371, "learning_rate": 4.1592616462346764e-05, "loss": 1.0167, "num_input_tokens_seen": 134298088, "step": 8346 }, { "epoch": 0.5846927071020165, "grad_norm": 3.957308530807495, "learning_rate": 4.158561821366025e-05, "loss": 1.0122, "num_input_tokens_seen": 134314144, "step": 8347 }, { "epoch": 0.5847627553477458, "grad_norm": 4.887307643890381, "learning_rate": 4.1578619964973734e-05, "loss": 0.9538, "num_input_tokens_seen": 134330528, "step": 8348 }, { "epoch": 0.584832803593475, "grad_norm": 4.353172302246094, "learning_rate": 4.157162171628722e-05, "loss": 1.1329, "num_input_tokens_seen": 134346912, "step": 8349 }, { "epoch": 0.5849028518392042, "grad_norm": 3.504237174987793, "learning_rate": 4.15646234676007e-05, "loss": 0.9481, "num_input_tokens_seen": 134363296, "step": 8350 }, { "epoch": 0.5849729000849335, "grad_norm": 4.646234512329102, "learning_rate": 4.1557625218914186e-05, "loss": 1.0612, "num_input_tokens_seen": 134379376, "step": 8351 }, { "epoch": 0.5850429483306627, "grad_norm": 4.285154819488525, "learning_rate": 4.155062697022768e-05, "loss": 0.9446, "num_input_tokens_seen": 134395760, "step": 8352 }, { "epoch": 0.585112996576392, "grad_norm": 3.5311827659606934, "learning_rate": 4.154362872154116e-05, "loss": 1.0646, "num_input_tokens_seen": 134412144, "step": 8353 }, { "epoch": 0.5851830448221212, "grad_norm": 4.413166522979736, "learning_rate": 4.1536630472854645e-05, "loss": 0.9335, "num_input_tokens_seen": 134428136, "step": 8354 }, { "epoch": 0.5852530930678504, "grad_norm": 4.07955265045166, "learning_rate": 4.152963222416813e-05, "loss": 0.8923, "num_input_tokens_seen": 134444512, "step": 8355 }, { "epoch": 0.5853231413135798, "grad_norm": 3.809666633605957, "learning_rate": 4.1522633975481616e-05, "loss": 0.8746, "num_input_tokens_seen": 134460896, "step": 8356 }, { "epoch": 0.585393189559309, "grad_norm": 3.916811466217041, "learning_rate": 4.1515635726795104e-05, "loss": 0.9548, "num_input_tokens_seen": 134477128, "step": 8357 }, { "epoch": 0.5854632378050382, "grad_norm": 4.052529811859131, "learning_rate": 4.1508637478108586e-05, "loss": 1.0494, "num_input_tokens_seen": 134493512, "step": 8358 }, { "epoch": 0.5855332860507675, "grad_norm": 4.45082950592041, "learning_rate": 4.1501639229422075e-05, "loss": 1.2332, "num_input_tokens_seen": 134509192, "step": 8359 }, { "epoch": 0.5856033342964967, "grad_norm": 4.140877723693848, "learning_rate": 4.149464098073555e-05, "loss": 1.1588, "num_input_tokens_seen": 134524848, "step": 8360 }, { "epoch": 0.585673382542226, "grad_norm": 3.7743544578552246, "learning_rate": 4.148764273204904e-05, "loss": 0.8837, "num_input_tokens_seen": 134541232, "step": 8361 }, { "epoch": 0.5857434307879552, "grad_norm": 3.654794692993164, "learning_rate": 4.1480644483362534e-05, "loss": 1.2079, "num_input_tokens_seen": 134557616, "step": 8362 }, { "epoch": 0.5858134790336844, "grad_norm": 3.4448959827423096, "learning_rate": 4.147364623467601e-05, "loss": 0.8832, "num_input_tokens_seen": 134574000, "step": 8363 }, { "epoch": 0.5858835272794137, "grad_norm": 4.829925537109375, "learning_rate": 4.14666479859895e-05, "loss": 1.1541, "num_input_tokens_seen": 134590384, "step": 8364 }, { "epoch": 0.5859535755251429, "grad_norm": 3.5955686569213867, "learning_rate": 4.145964973730298e-05, "loss": 1.0381, "num_input_tokens_seen": 134606768, "step": 8365 }, { "epoch": 0.5860236237708721, "grad_norm": 5.0735368728637695, "learning_rate": 4.145265148861647e-05, "loss": 1.238, "num_input_tokens_seen": 134622752, "step": 8366 }, { "epoch": 0.5860936720166015, "grad_norm": 3.8610787391662598, "learning_rate": 4.144565323992994e-05, "loss": 1.1311, "num_input_tokens_seen": 134639112, "step": 8367 }, { "epoch": 0.5861637202623307, "grad_norm": 3.630153179168701, "learning_rate": 4.143865499124344e-05, "loss": 0.9734, "num_input_tokens_seen": 134655496, "step": 8368 }, { "epoch": 0.58623376850806, "grad_norm": 4.367414951324463, "learning_rate": 4.143165674255693e-05, "loss": 1.0743, "num_input_tokens_seen": 134671880, "step": 8369 }, { "epoch": 0.5863038167537892, "grad_norm": 3.709831953048706, "learning_rate": 4.14246584938704e-05, "loss": 1.0332, "num_input_tokens_seen": 134688168, "step": 8370 }, { "epoch": 0.5863738649995184, "grad_norm": 4.649940490722656, "learning_rate": 4.141766024518389e-05, "loss": 1.3275, "num_input_tokens_seen": 134704552, "step": 8371 }, { "epoch": 0.5864439132452477, "grad_norm": 3.7334702014923096, "learning_rate": 4.141066199649737e-05, "loss": 0.7565, "num_input_tokens_seen": 134719896, "step": 8372 }, { "epoch": 0.5865139614909769, "grad_norm": 4.841366291046143, "learning_rate": 4.140366374781086e-05, "loss": 1.337, "num_input_tokens_seen": 134735168, "step": 8373 }, { "epoch": 0.5865840097367061, "grad_norm": 4.331602096557617, "learning_rate": 4.139666549912435e-05, "loss": 1.1276, "num_input_tokens_seen": 134751552, "step": 8374 }, { "epoch": 0.5866540579824354, "grad_norm": 4.3667497634887695, "learning_rate": 4.138966725043783e-05, "loss": 0.9588, "num_input_tokens_seen": 134767272, "step": 8375 }, { "epoch": 0.5867241062281646, "grad_norm": 3.6581032276153564, "learning_rate": 4.138266900175132e-05, "loss": 1.0778, "num_input_tokens_seen": 134783656, "step": 8376 }, { "epoch": 0.5867941544738939, "grad_norm": 3.8137452602386475, "learning_rate": 4.1375670753064795e-05, "loss": 1.1091, "num_input_tokens_seen": 134799760, "step": 8377 }, { "epoch": 0.5868642027196231, "grad_norm": 4.273350238800049, "learning_rate": 4.136867250437829e-05, "loss": 1.0878, "num_input_tokens_seen": 134816144, "step": 8378 }, { "epoch": 0.5869342509653523, "grad_norm": 4.527581214904785, "learning_rate": 4.136167425569178e-05, "loss": 1.0128, "num_input_tokens_seen": 134832528, "step": 8379 }, { "epoch": 0.5870042992110817, "grad_norm": 4.647453784942627, "learning_rate": 4.1354676007005254e-05, "loss": 1.0938, "num_input_tokens_seen": 134847984, "step": 8380 }, { "epoch": 0.5870743474568109, "grad_norm": 5.219020843505859, "learning_rate": 4.134767775831874e-05, "loss": 1.2273, "num_input_tokens_seen": 134864368, "step": 8381 }, { "epoch": 0.5871443957025402, "grad_norm": 4.104679107666016, "learning_rate": 4.1340679509632224e-05, "loss": 1.0368, "num_input_tokens_seen": 134879776, "step": 8382 }, { "epoch": 0.5872144439482694, "grad_norm": 4.659088134765625, "learning_rate": 4.133368126094571e-05, "loss": 1.258, "num_input_tokens_seen": 134894656, "step": 8383 }, { "epoch": 0.5872844921939986, "grad_norm": 5.709257125854492, "learning_rate": 4.13266830122592e-05, "loss": 1.0678, "num_input_tokens_seen": 134910952, "step": 8384 }, { "epoch": 0.5873545404397279, "grad_norm": 3.874393939971924, "learning_rate": 4.131968476357268e-05, "loss": 1.2776, "num_input_tokens_seen": 134927320, "step": 8385 }, { "epoch": 0.5874245886854571, "grad_norm": 3.5335848331451416, "learning_rate": 4.131268651488617e-05, "loss": 0.9319, "num_input_tokens_seen": 134943704, "step": 8386 }, { "epoch": 0.5874946369311863, "grad_norm": 7.110137462615967, "learning_rate": 4.130568826619965e-05, "loss": 1.0478, "num_input_tokens_seen": 134960088, "step": 8387 }, { "epoch": 0.5875646851769156, "grad_norm": 5.622186660766602, "learning_rate": 4.129869001751314e-05, "loss": 1.3012, "num_input_tokens_seen": 134976232, "step": 8388 }, { "epoch": 0.5876347334226448, "grad_norm": 4.596433162689209, "learning_rate": 4.129169176882663e-05, "loss": 1.1115, "num_input_tokens_seen": 134992616, "step": 8389 }, { "epoch": 0.5877047816683741, "grad_norm": 4.493381023406982, "learning_rate": 4.1284693520140106e-05, "loss": 0.9248, "num_input_tokens_seen": 135008704, "step": 8390 }, { "epoch": 0.5877748299141033, "grad_norm": 3.4309275150299072, "learning_rate": 4.1277695271453594e-05, "loss": 0.9195, "num_input_tokens_seen": 135024360, "step": 8391 }, { "epoch": 0.5878448781598326, "grad_norm": 3.7281200885772705, "learning_rate": 4.1270697022767076e-05, "loss": 1.1526, "num_input_tokens_seen": 135040744, "step": 8392 }, { "epoch": 0.5879149264055619, "grad_norm": 4.484415054321289, "learning_rate": 4.1263698774080565e-05, "loss": 0.9914, "num_input_tokens_seen": 135057128, "step": 8393 }, { "epoch": 0.5879849746512911, "grad_norm": 4.102346897125244, "learning_rate": 4.125670052539405e-05, "loss": 1.0439, "num_input_tokens_seen": 135073248, "step": 8394 }, { "epoch": 0.5880550228970203, "grad_norm": 7.703208923339844, "learning_rate": 4.1249702276707535e-05, "loss": 1.3581, "num_input_tokens_seen": 135088344, "step": 8395 }, { "epoch": 0.5881250711427496, "grad_norm": 6.113401889801025, "learning_rate": 4.1242704028021024e-05, "loss": 1.0845, "num_input_tokens_seen": 135104728, "step": 8396 }, { "epoch": 0.5881951193884788, "grad_norm": 3.8614649772644043, "learning_rate": 4.12357057793345e-05, "loss": 0.9821, "num_input_tokens_seen": 135120272, "step": 8397 }, { "epoch": 0.5882651676342081, "grad_norm": 3.9187567234039307, "learning_rate": 4.1228707530647994e-05, "loss": 0.9296, "num_input_tokens_seen": 135136152, "step": 8398 }, { "epoch": 0.5883352158799373, "grad_norm": 3.274703025817871, "learning_rate": 4.122170928196147e-05, "loss": 0.9896, "num_input_tokens_seen": 135152536, "step": 8399 }, { "epoch": 0.5884052641256665, "grad_norm": 3.8558948040008545, "learning_rate": 4.121471103327496e-05, "loss": 1.0359, "num_input_tokens_seen": 135168920, "step": 8400 }, { "epoch": 0.5884052641256665, "eval_loss": 1.1207953691482544, "eval_runtime": 0.5047, "eval_samples_per_second": 1.981, "eval_steps_per_second": 1.981, "num_input_tokens_seen": 135168920, "step": 8400 } ], "logging_steps": 1, "max_steps": 14275, "num_input_tokens_seen": 135168920, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.9026020761447424e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }