diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15560 @@ +{ + "best_global_step": 5162, + "best_metric": 0.24373722, + "best_model_checkpoint": "/mmu_mllm_hdd_2/yifanzhang/models/tool_final/qwen_tool_all_data_180k_alldata_wpgemini/v0-20250616-170052/checkpoint-5162", + "epoch": 2.9990553927386343, + "eval_steps": 500, + "global_step": 7740, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00038753118414997455, + "grad_norm": 6.552547931671143, + "learning_rate": 2.583979328165375e-08, + "loss": 0.9296205043792725, + "memory(GiB)": 53.11, + "step": 1, + "token_acc": 0.7648440120764844, + "train_speed(iter/s)": 0.021396 + }, + { + "epoch": 0.0019376559207498728, + "grad_norm": 6.881824016571045, + "learning_rate": 1.2919896640826874e-07, + "loss": 0.9338862895965576, + "memory(GiB)": 68.26, + "step": 5, + "token_acc": 0.7612466124661247, + "train_speed(iter/s)": 0.049734 + }, + { + "epoch": 0.0038753118414997455, + "grad_norm": 7.085341930389404, + "learning_rate": 2.583979328165375e-07, + "loss": 0.9332740783691407, + "memory(GiB)": 68.26, + "step": 10, + "token_acc": 0.7584953537016083, + "train_speed(iter/s)": 0.062044 + }, + { + "epoch": 0.005812967762249619, + "grad_norm": 6.054533958435059, + "learning_rate": 3.8759689922480623e-07, + "loss": 0.9270101547241211, + "memory(GiB)": 68.26, + "step": 15, + "token_acc": 0.7512948517940717, + "train_speed(iter/s)": 0.066051 + }, + { + "epoch": 0.007750623682999491, + "grad_norm": 4.462226867675781, + "learning_rate": 5.16795865633075e-07, + "loss": 0.8828842163085937, + "memory(GiB)": 68.26, + "step": 20, + "token_acc": 0.758990211231324, + "train_speed(iter/s)": 0.069348 + }, + { + "epoch": 0.009688279603749364, + "grad_norm": 4.585198879241943, + "learning_rate": 6.459948320413437e-07, + "loss": 0.8611625671386719, + "memory(GiB)": 68.26, + "step": 25, + "token_acc": 0.7707904789891272, + "train_speed(iter/s)": 0.071211 + }, + { + "epoch": 0.011625935524499238, + "grad_norm": 2.7715461254119873, + "learning_rate": 7.751937984496125e-07, + "loss": 0.7636467933654785, + "memory(GiB)": 68.26, + "step": 30, + "token_acc": 0.7774806344522317, + "train_speed(iter/s)": 0.071926 + }, + { + "epoch": 0.01356359144524911, + "grad_norm": 2.4396770000457764, + "learning_rate": 9.043927648578812e-07, + "loss": 0.7435248374938965, + "memory(GiB)": 68.26, + "step": 35, + "token_acc": 0.8056009119196524, + "train_speed(iter/s)": 0.072532 + }, + { + "epoch": 0.015501247365998982, + "grad_norm": 2.1752095222473145, + "learning_rate": 1.03359173126615e-06, + "loss": 0.672543716430664, + "memory(GiB)": 68.26, + "step": 40, + "token_acc": 0.7990769230769231, + "train_speed(iter/s)": 0.073019 + }, + { + "epoch": 0.017438903286748856, + "grad_norm": 1.7170302867889404, + "learning_rate": 1.1627906976744188e-06, + "loss": 0.6109170913696289, + "memory(GiB)": 68.26, + "step": 45, + "token_acc": 0.81396280591633, + "train_speed(iter/s)": 0.074189 + }, + { + "epoch": 0.019376559207498728, + "grad_norm": 1.450340747833252, + "learning_rate": 1.2919896640826874e-06, + "loss": 0.590874195098877, + "memory(GiB)": 68.26, + "step": 50, + "token_acc": 0.8275686511598307, + "train_speed(iter/s)": 0.074984 + }, + { + "epoch": 0.0213142151282486, + "grad_norm": 1.4400850534439087, + "learning_rate": 1.421188630490956e-06, + "loss": 0.5400970458984375, + "memory(GiB)": 68.26, + "step": 55, + "token_acc": 0.8339371199378962, + "train_speed(iter/s)": 0.076011 + }, + { + "epoch": 0.023251871048998476, + "grad_norm": 1.2291407585144043, + "learning_rate": 1.550387596899225e-06, + "loss": 0.4900141716003418, + "memory(GiB)": 68.26, + "step": 60, + "token_acc": 0.8436885967884002, + "train_speed(iter/s)": 0.076361 + }, + { + "epoch": 0.025189526969748348, + "grad_norm": 0.973463237285614, + "learning_rate": 1.6795865633074938e-06, + "loss": 0.4778586387634277, + "memory(GiB)": 68.26, + "step": 65, + "token_acc": 0.8613077004330834, + "train_speed(iter/s)": 0.07651 + }, + { + "epoch": 0.02712718289049822, + "grad_norm": 0.999035656452179, + "learning_rate": 1.8087855297157624e-06, + "loss": 0.4614850997924805, + "memory(GiB)": 68.26, + "step": 70, + "token_acc": 0.8476737395366946, + "train_speed(iter/s)": 0.076462 + }, + { + "epoch": 0.029064838811248092, + "grad_norm": 0.9571713805198669, + "learning_rate": 1.9379844961240315e-06, + "loss": 0.44265012741088866, + "memory(GiB)": 68.26, + "step": 75, + "token_acc": 0.8589666423797003, + "train_speed(iter/s)": 0.076552 + }, + { + "epoch": 0.031002494731997964, + "grad_norm": 0.9145216941833496, + "learning_rate": 2.0671834625323e-06, + "loss": 0.42769956588745117, + "memory(GiB)": 68.26, + "step": 80, + "token_acc": 0.8603299712778038, + "train_speed(iter/s)": 0.07668 + }, + { + "epoch": 0.032940150652747836, + "grad_norm": 0.9121745824813843, + "learning_rate": 2.1963824289405687e-06, + "loss": 0.4430729866027832, + "memory(GiB)": 68.26, + "step": 85, + "token_acc": 0.8658448831461377, + "train_speed(iter/s)": 0.076904 + }, + { + "epoch": 0.03487780657349771, + "grad_norm": 0.9223958253860474, + "learning_rate": 2.3255813953488376e-06, + "loss": 0.41683158874511717, + "memory(GiB)": 68.26, + "step": 90, + "token_acc": 0.8670972793110198, + "train_speed(iter/s)": 0.076765 + }, + { + "epoch": 0.03681546249424758, + "grad_norm": 0.905558705329895, + "learning_rate": 2.454780361757106e-06, + "loss": 0.43251714706420896, + "memory(GiB)": 68.26, + "step": 95, + "token_acc": 0.8683047710037619, + "train_speed(iter/s)": 0.076628 + }, + { + "epoch": 0.038753118414997456, + "grad_norm": 0.8794025778770447, + "learning_rate": 2.583979328165375e-06, + "loss": 0.4090336799621582, + "memory(GiB)": 68.26, + "step": 100, + "token_acc": 0.8667196041902085, + "train_speed(iter/s)": 0.07673 + }, + { + "epoch": 0.04069077433574733, + "grad_norm": 0.8912697434425354, + "learning_rate": 2.7131782945736433e-06, + "loss": 0.399850869178772, + "memory(GiB)": 68.26, + "step": 105, + "token_acc": 0.8666218766996193, + "train_speed(iter/s)": 0.077113 + }, + { + "epoch": 0.0426284302564972, + "grad_norm": 0.8915030360221863, + "learning_rate": 2.842377260981912e-06, + "loss": 0.40947537422180175, + "memory(GiB)": 68.26, + "step": 110, + "token_acc": 0.8638117728372308, + "train_speed(iter/s)": 0.077292 + }, + { + "epoch": 0.044566086177247076, + "grad_norm": 0.8565830588340759, + "learning_rate": 2.971576227390181e-06, + "loss": 0.396804141998291, + "memory(GiB)": 68.26, + "step": 115, + "token_acc": 0.8661356714969378, + "train_speed(iter/s)": 0.07761 + }, + { + "epoch": 0.04650374209799695, + "grad_norm": 0.8623788952827454, + "learning_rate": 3.10077519379845e-06, + "loss": 0.3913600206375122, + "memory(GiB)": 68.26, + "step": 120, + "token_acc": 0.8754619622062617, + "train_speed(iter/s)": 0.078035 + }, + { + "epoch": 0.04844139801874682, + "grad_norm": 0.8666126728057861, + "learning_rate": 3.2299741602067187e-06, + "loss": 0.3875392436981201, + "memory(GiB)": 71.85, + "step": 125, + "token_acc": 0.8719657721636882, + "train_speed(iter/s)": 0.078025 + }, + { + "epoch": 0.050379053939496696, + "grad_norm": 0.9226394891738892, + "learning_rate": 3.3591731266149875e-06, + "loss": 0.3926274538040161, + "memory(GiB)": 71.85, + "step": 130, + "token_acc": 0.8687872763419483, + "train_speed(iter/s)": 0.078147 + }, + { + "epoch": 0.052316709860246564, + "grad_norm": 0.8856056928634644, + "learning_rate": 3.4883720930232564e-06, + "loss": 0.3912957668304443, + "memory(GiB)": 71.85, + "step": 135, + "token_acc": 0.8724723353638756, + "train_speed(iter/s)": 0.078214 + }, + { + "epoch": 0.05425436578099644, + "grad_norm": 0.8952415585517883, + "learning_rate": 3.617571059431525e-06, + "loss": 0.3882893085479736, + "memory(GiB)": 71.85, + "step": 140, + "token_acc": 0.863610550227158, + "train_speed(iter/s)": 0.078368 + }, + { + "epoch": 0.056192021701746316, + "grad_norm": 0.9110421538352966, + "learning_rate": 3.7467700258397936e-06, + "loss": 0.3694924831390381, + "memory(GiB)": 71.85, + "step": 145, + "token_acc": 0.883052064631957, + "train_speed(iter/s)": 0.078555 + }, + { + "epoch": 0.058129677622496184, + "grad_norm": 0.9361255168914795, + "learning_rate": 3.875968992248063e-06, + "loss": 0.3734897136688232, + "memory(GiB)": 71.85, + "step": 150, + "token_acc": 0.8814651629237498, + "train_speed(iter/s)": 0.078958 + }, + { + "epoch": 0.06006733354324606, + "grad_norm": 0.8287400007247925, + "learning_rate": 4.005167958656331e-06, + "loss": 0.3683502197265625, + "memory(GiB)": 71.85, + "step": 155, + "token_acc": 0.8783027148088738, + "train_speed(iter/s)": 0.079085 + }, + { + "epoch": 0.06200498946399593, + "grad_norm": 0.8841593265533447, + "learning_rate": 4.1343669250646e-06, + "loss": 0.38687331676483155, + "memory(GiB)": 71.85, + "step": 160, + "token_acc": 0.8696694133482158, + "train_speed(iter/s)": 0.079376 + }, + { + "epoch": 0.0639426453847458, + "grad_norm": 0.9208874702453613, + "learning_rate": 4.263565891472868e-06, + "loss": 0.37154593467712405, + "memory(GiB)": 71.85, + "step": 165, + "token_acc": 0.8725894872621734, + "train_speed(iter/s)": 0.079544 + }, + { + "epoch": 0.06588030130549567, + "grad_norm": 0.880796492099762, + "learning_rate": 4.3927648578811375e-06, + "loss": 0.3870300054550171, + "memory(GiB)": 71.85, + "step": 170, + "token_acc": 0.8695997091197567, + "train_speed(iter/s)": 0.079632 + }, + { + "epoch": 0.06781795722624555, + "grad_norm": 0.8492954969406128, + "learning_rate": 4.521963824289406e-06, + "loss": 0.37391808032989504, + "memory(GiB)": 71.85, + "step": 175, + "token_acc": 0.8902093180283592, + "train_speed(iter/s)": 0.07985 + }, + { + "epoch": 0.06975561314699542, + "grad_norm": 0.8536194562911987, + "learning_rate": 4.651162790697675e-06, + "loss": 0.3642561435699463, + "memory(GiB)": 71.85, + "step": 180, + "token_acc": 0.8766690784713543, + "train_speed(iter/s)": 0.079782 + }, + { + "epoch": 0.0716932690677453, + "grad_norm": 0.8752952218055725, + "learning_rate": 4.780361757105944e-06, + "loss": 0.37262544631958006, + "memory(GiB)": 71.85, + "step": 185, + "token_acc": 0.8835785829108446, + "train_speed(iter/s)": 0.079949 + }, + { + "epoch": 0.07363092498849516, + "grad_norm": 0.8449862003326416, + "learning_rate": 4.909560723514212e-06, + "loss": 0.36355421543121336, + "memory(GiB)": 71.85, + "step": 190, + "token_acc": 0.8750923624638947, + "train_speed(iter/s)": 0.079919 + }, + { + "epoch": 0.07556858090924504, + "grad_norm": 0.9465638399124146, + "learning_rate": 5.038759689922481e-06, + "loss": 0.371561074256897, + "memory(GiB)": 71.85, + "step": 195, + "token_acc": 0.876782571757995, + "train_speed(iter/s)": 0.080132 + }, + { + "epoch": 0.07750623682999491, + "grad_norm": 0.8754744529724121, + "learning_rate": 5.16795865633075e-06, + "loss": 0.37893788814544677, + "memory(GiB)": 71.85, + "step": 200, + "token_acc": 0.8805535324107793, + "train_speed(iter/s)": 0.08012 + }, + { + "epoch": 0.07944389275074479, + "grad_norm": 0.9141603112220764, + "learning_rate": 5.297157622739019e-06, + "loss": 0.35440912246704104, + "memory(GiB)": 71.85, + "step": 205, + "token_acc": 0.890790432880897, + "train_speed(iter/s)": 0.080087 + }, + { + "epoch": 0.08138154867149466, + "grad_norm": 0.8244524598121643, + "learning_rate": 5.4263565891472865e-06, + "loss": 0.3542884349822998, + "memory(GiB)": 71.85, + "step": 210, + "token_acc": 0.8815609234572851, + "train_speed(iter/s)": 0.080069 + }, + { + "epoch": 0.08331920459224454, + "grad_norm": 0.8364593982696533, + "learning_rate": 5.555555555555557e-06, + "loss": 0.355057692527771, + "memory(GiB)": 71.85, + "step": 215, + "token_acc": 0.876421973748177, + "train_speed(iter/s)": 0.079978 + }, + { + "epoch": 0.0852568605129944, + "grad_norm": 0.8796382546424866, + "learning_rate": 5.684754521963824e-06, + "loss": 0.3415433406829834, + "memory(GiB)": 71.85, + "step": 220, + "token_acc": 0.8817479925880173, + "train_speed(iter/s)": 0.080045 + }, + { + "epoch": 0.08719451643374428, + "grad_norm": 0.8751423954963684, + "learning_rate": 5.8139534883720935e-06, + "loss": 0.3774546146392822, + "memory(GiB)": 71.85, + "step": 225, + "token_acc": 0.8779613309653855, + "train_speed(iter/s)": 0.080169 + }, + { + "epoch": 0.08913217235449415, + "grad_norm": 0.7890948057174683, + "learning_rate": 5.943152454780362e-06, + "loss": 0.34930667877197263, + "memory(GiB)": 71.85, + "step": 230, + "token_acc": 0.8998645090369916, + "train_speed(iter/s)": 0.08015 + }, + { + "epoch": 0.09106982827524403, + "grad_norm": 0.8329840302467346, + "learning_rate": 6.072351421188631e-06, + "loss": 0.362669849395752, + "memory(GiB)": 71.85, + "step": 235, + "token_acc": 0.8783116299955096, + "train_speed(iter/s)": 0.08006 + }, + { + "epoch": 0.0930074841959939, + "grad_norm": 0.8648292422294617, + "learning_rate": 6.2015503875969e-06, + "loss": 0.37513184547424316, + "memory(GiB)": 71.85, + "step": 240, + "token_acc": 0.879857933201284, + "train_speed(iter/s)": 0.08001 + }, + { + "epoch": 0.09494514011674376, + "grad_norm": 0.833514392375946, + "learning_rate": 6.330749354005169e-06, + "loss": 0.35806612968444823, + "memory(GiB)": 71.85, + "step": 245, + "token_acc": 0.8780785479580661, + "train_speed(iter/s)": 0.080031 + }, + { + "epoch": 0.09688279603749364, + "grad_norm": 0.8195750713348389, + "learning_rate": 6.459948320413437e-06, + "loss": 0.3542565107345581, + "memory(GiB)": 71.85, + "step": 250, + "token_acc": 0.8780148722922728, + "train_speed(iter/s)": 0.080065 + }, + { + "epoch": 0.09882045195824352, + "grad_norm": 0.8255862593650818, + "learning_rate": 6.589147286821706e-06, + "loss": 0.35635693073272706, + "memory(GiB)": 71.85, + "step": 255, + "token_acc": 0.8740539677470861, + "train_speed(iter/s)": 0.08009 + }, + { + "epoch": 0.10075810787899339, + "grad_norm": 0.8040722012519836, + "learning_rate": 6.718346253229975e-06, + "loss": 0.3676167011260986, + "memory(GiB)": 71.85, + "step": 260, + "token_acc": 0.8800150109170306, + "train_speed(iter/s)": 0.080048 + }, + { + "epoch": 0.10269576379974327, + "grad_norm": 0.8556498289108276, + "learning_rate": 6.8475452196382435e-06, + "loss": 0.3633396148681641, + "memory(GiB)": 71.85, + "step": 265, + "token_acc": 0.8784773530356569, + "train_speed(iter/s)": 0.080154 + }, + { + "epoch": 0.10463341972049313, + "grad_norm": 0.8252087235450745, + "learning_rate": 6.976744186046513e-06, + "loss": 0.33879594802856444, + "memory(GiB)": 71.85, + "step": 270, + "token_acc": 0.8953123958124959, + "train_speed(iter/s)": 0.080137 + }, + { + "epoch": 0.106571075641243, + "grad_norm": 0.8291889429092407, + "learning_rate": 7.10594315245478e-06, + "loss": 0.3601172924041748, + "memory(GiB)": 71.85, + "step": 275, + "token_acc": 0.8832243378978936, + "train_speed(iter/s)": 0.08008 + }, + { + "epoch": 0.10850873156199288, + "grad_norm": 0.8094320893287659, + "learning_rate": 7.23514211886305e-06, + "loss": 0.35142252445220945, + "memory(GiB)": 71.85, + "step": 280, + "token_acc": 0.8835487626496313, + "train_speed(iter/s)": 0.080038 + }, + { + "epoch": 0.11044638748274276, + "grad_norm": 0.7662101984024048, + "learning_rate": 7.364341085271318e-06, + "loss": 0.3705580234527588, + "memory(GiB)": 71.85, + "step": 285, + "token_acc": 0.8818561396849497, + "train_speed(iter/s)": 0.080036 + }, + { + "epoch": 0.11238404340349263, + "grad_norm": 0.8766042590141296, + "learning_rate": 7.493540051679587e-06, + "loss": 0.3576506614685059, + "memory(GiB)": 71.85, + "step": 290, + "token_acc": 0.8764228750596414, + "train_speed(iter/s)": 0.080017 + }, + { + "epoch": 0.11432169932424249, + "grad_norm": 0.7793442010879517, + "learning_rate": 7.622739018087856e-06, + "loss": 0.36000614166259765, + "memory(GiB)": 71.85, + "step": 295, + "token_acc": 0.8948535291124519, + "train_speed(iter/s)": 0.079965 + }, + { + "epoch": 0.11625935524499237, + "grad_norm": 0.8231183886528015, + "learning_rate": 7.751937984496126e-06, + "loss": 0.33621535301208494, + "memory(GiB)": 71.85, + "step": 300, + "token_acc": 0.8816721700381496, + "train_speed(iter/s)": 0.079893 + }, + { + "epoch": 0.11819701116574224, + "grad_norm": 0.850242555141449, + "learning_rate": 7.881136950904393e-06, + "loss": 0.3525859355926514, + "memory(GiB)": 71.85, + "step": 305, + "token_acc": 0.8819446807062906, + "train_speed(iter/s)": 0.079931 + }, + { + "epoch": 0.12013466708649212, + "grad_norm": 0.8171371221542358, + "learning_rate": 8.010335917312663e-06, + "loss": 0.3546357870101929, + "memory(GiB)": 71.85, + "step": 310, + "token_acc": 0.8777248580326067, + "train_speed(iter/s)": 0.079884 + }, + { + "epoch": 0.122072323007242, + "grad_norm": 0.7793200016021729, + "learning_rate": 8.139534883720931e-06, + "loss": 0.3619884252548218, + "memory(GiB)": 71.85, + "step": 315, + "token_acc": 0.885740589198036, + "train_speed(iter/s)": 0.079851 + }, + { + "epoch": 0.12400997892799186, + "grad_norm": 0.8639906048774719, + "learning_rate": 8.2687338501292e-06, + "loss": 0.3500725269317627, + "memory(GiB)": 71.85, + "step": 320, + "token_acc": 0.8943805668016195, + "train_speed(iter/s)": 0.079868 + }, + { + "epoch": 0.12594763484874175, + "grad_norm": 0.7964992523193359, + "learning_rate": 8.397932816537468e-06, + "loss": 0.33359375, + "memory(GiB)": 71.85, + "step": 325, + "token_acc": 0.8848108035833667, + "train_speed(iter/s)": 0.079922 + }, + { + "epoch": 0.1278852907694916, + "grad_norm": 0.8312904834747314, + "learning_rate": 8.527131782945736e-06, + "loss": 0.3640293121337891, + "memory(GiB)": 71.85, + "step": 330, + "token_acc": 0.8779143037177064, + "train_speed(iter/s)": 0.079932 + }, + { + "epoch": 0.12982294669024147, + "grad_norm": 0.9487627744674683, + "learning_rate": 8.656330749354006e-06, + "loss": 0.3424405574798584, + "memory(GiB)": 71.85, + "step": 335, + "token_acc": 0.8947685759886504, + "train_speed(iter/s)": 0.080055 + }, + { + "epoch": 0.13176060261099135, + "grad_norm": 0.8095372319221497, + "learning_rate": 8.785529715762275e-06, + "loss": 0.3609053134918213, + "memory(GiB)": 71.85, + "step": 340, + "token_acc": 0.8851003253796096, + "train_speed(iter/s)": 0.080183 + }, + { + "epoch": 0.13369825853174122, + "grad_norm": 0.7795316576957703, + "learning_rate": 8.914728682170543e-06, + "loss": 0.34638004302978515, + "memory(GiB)": 71.85, + "step": 345, + "token_acc": 0.8741907781741314, + "train_speed(iter/s)": 0.08015 + }, + { + "epoch": 0.1356359144524911, + "grad_norm": 0.8028691411018372, + "learning_rate": 9.043927648578812e-06, + "loss": 0.3564423084259033, + "memory(GiB)": 71.85, + "step": 350, + "token_acc": 0.87438717787555, + "train_speed(iter/s)": 0.080072 + }, + { + "epoch": 0.13757357037324097, + "grad_norm": 0.8242841362953186, + "learning_rate": 9.173126614987082e-06, + "loss": 0.3585948944091797, + "memory(GiB)": 71.85, + "step": 355, + "token_acc": 0.8687102733506591, + "train_speed(iter/s)": 0.080091 + }, + { + "epoch": 0.13951122629399085, + "grad_norm": 0.8537614345550537, + "learning_rate": 9.30232558139535e-06, + "loss": 0.3516932487487793, + "memory(GiB)": 71.85, + "step": 360, + "token_acc": 0.8854277411581261, + "train_speed(iter/s)": 0.080165 + }, + { + "epoch": 0.14144888221474072, + "grad_norm": 0.8241102695465088, + "learning_rate": 9.431524547803619e-06, + "loss": 0.34127624034881593, + "memory(GiB)": 71.85, + "step": 365, + "token_acc": 0.8881654287864411, + "train_speed(iter/s)": 0.080126 + }, + { + "epoch": 0.1433865381354906, + "grad_norm": 0.7585867643356323, + "learning_rate": 9.560723514211887e-06, + "loss": 0.3570504903793335, + "memory(GiB)": 71.85, + "step": 370, + "token_acc": 0.8768577494692145, + "train_speed(iter/s)": 0.080189 + }, + { + "epoch": 0.14532419405624047, + "grad_norm": 0.7735254168510437, + "learning_rate": 9.689922480620156e-06, + "loss": 0.3525998592376709, + "memory(GiB)": 71.85, + "step": 375, + "token_acc": 0.8809278863704244, + "train_speed(iter/s)": 0.080227 + }, + { + "epoch": 0.14726184997699032, + "grad_norm": 0.8313542008399963, + "learning_rate": 9.819121447028424e-06, + "loss": 0.34625673294067383, + "memory(GiB)": 71.85, + "step": 380, + "token_acc": 0.8787977998952331, + "train_speed(iter/s)": 0.080142 + }, + { + "epoch": 0.1491995058977402, + "grad_norm": 0.7735046744346619, + "learning_rate": 9.948320413436692e-06, + "loss": 0.34496536254882815, + "memory(GiB)": 71.85, + "step": 385, + "token_acc": 0.8872030801357348, + "train_speed(iter/s)": 0.080142 + }, + { + "epoch": 0.15113716181849007, + "grad_norm": 0.7527068257331848, + "learning_rate": 9.999995892731712e-06, + "loss": 0.3598473072052002, + "memory(GiB)": 71.85, + "step": 390, + "token_acc": 0.8917708333333333, + "train_speed(iter/s)": 0.080105 + }, + { + "epoch": 0.15307481773923995, + "grad_norm": 0.8179726600646973, + "learning_rate": 9.999970792783274e-06, + "loss": 0.3587926387786865, + "memory(GiB)": 71.85, + "step": 395, + "token_acc": 0.8800157714355039, + "train_speed(iter/s)": 0.080088 + }, + { + "epoch": 0.15501247365998982, + "grad_norm": 0.8060595989227295, + "learning_rate": 9.999922874816521e-06, + "loss": 0.36094279289245607, + "memory(GiB)": 71.85, + "step": 400, + "token_acc": 0.8862200557996344, + "train_speed(iter/s)": 0.080129 + }, + { + "epoch": 0.1569501295807397, + "grad_norm": 0.7952097654342651, + "learning_rate": 9.999852139050132e-06, + "loss": 0.35231719017028806, + "memory(GiB)": 71.85, + "step": 405, + "token_acc": 0.8807519158733977, + "train_speed(iter/s)": 0.08013 + }, + { + "epoch": 0.15888778550148958, + "grad_norm": 0.7951153516769409, + "learning_rate": 9.999758585806923e-06, + "loss": 0.3466503620147705, + "memory(GiB)": 71.85, + "step": 410, + "token_acc": 0.8873098736787832, + "train_speed(iter/s)": 0.080147 + }, + { + "epoch": 0.16082544142223945, + "grad_norm": 0.793311595916748, + "learning_rate": 9.999642215513832e-06, + "loss": 0.3452963352203369, + "memory(GiB)": 71.85, + "step": 415, + "token_acc": 0.8855158793361577, + "train_speed(iter/s)": 0.080133 + }, + { + "epoch": 0.16276309734298933, + "grad_norm": 0.7786062359809875, + "learning_rate": 9.999503028701931e-06, + "loss": 0.3341179370880127, + "memory(GiB)": 71.85, + "step": 420, + "token_acc": 0.8807970661733684, + "train_speed(iter/s)": 0.08015 + }, + { + "epoch": 0.1647007532637392, + "grad_norm": 0.7555674910545349, + "learning_rate": 9.99934102600642e-06, + "loss": 0.343152379989624, + "memory(GiB)": 71.85, + "step": 425, + "token_acc": 0.8769051924567295, + "train_speed(iter/s)": 0.0801 + }, + { + "epoch": 0.16663840918448908, + "grad_norm": 0.7114547491073608, + "learning_rate": 9.999156208166614e-06, + "loss": 0.34557628631591797, + "memory(GiB)": 71.85, + "step": 430, + "token_acc": 0.8801453505688926, + "train_speed(iter/s)": 0.080148 + }, + { + "epoch": 0.16857606510523893, + "grad_norm": 0.7580552101135254, + "learning_rate": 9.99894857602596e-06, + "loss": 0.330863881111145, + "memory(GiB)": 71.85, + "step": 435, + "token_acc": 0.8863163953823031, + "train_speed(iter/s)": 0.080207 + }, + { + "epoch": 0.1705137210259888, + "grad_norm": 0.8019564747810364, + "learning_rate": 9.998718130532008e-06, + "loss": 0.3519331693649292, + "memory(GiB)": 71.85, + "step": 440, + "token_acc": 0.8858001022494888, + "train_speed(iter/s)": 0.080231 + }, + { + "epoch": 0.17245137694673868, + "grad_norm": 0.8301380276679993, + "learning_rate": 9.99846487273643e-06, + "loss": 0.3256737947463989, + "memory(GiB)": 71.85, + "step": 445, + "token_acc": 0.8935722938787145, + "train_speed(iter/s)": 0.080282 + }, + { + "epoch": 0.17438903286748855, + "grad_norm": 0.781501293182373, + "learning_rate": 9.998188803795e-06, + "loss": 0.35074899196624754, + "memory(GiB)": 71.85, + "step": 450, + "token_acc": 0.8913197897428612, + "train_speed(iter/s)": 0.080269 + }, + { + "epoch": 0.17632668878823843, + "grad_norm": 0.7460458278656006, + "learning_rate": 9.997889924967594e-06, + "loss": 0.3532873153686523, + "memory(GiB)": 71.85, + "step": 455, + "token_acc": 0.8787985639184511, + "train_speed(iter/s)": 0.08029 + }, + { + "epoch": 0.1782643447089883, + "grad_norm": 0.8098095655441284, + "learning_rate": 9.997568237618185e-06, + "loss": 0.3252811670303345, + "memory(GiB)": 71.85, + "step": 460, + "token_acc": 0.8971127170599719, + "train_speed(iter/s)": 0.080326 + }, + { + "epoch": 0.18020200062973818, + "grad_norm": 0.7576206922531128, + "learning_rate": 9.997223743214836e-06, + "loss": 0.3403348922729492, + "memory(GiB)": 71.85, + "step": 465, + "token_acc": 0.8788252334432494, + "train_speed(iter/s)": 0.080259 + }, + { + "epoch": 0.18213965655048805, + "grad_norm": 0.7641308903694153, + "learning_rate": 9.99685644332969e-06, + "loss": 0.33709111213684084, + "memory(GiB)": 71.85, + "step": 470, + "token_acc": 0.8855252274607114, + "train_speed(iter/s)": 0.080259 + }, + { + "epoch": 0.18407731247123793, + "grad_norm": 0.7454455494880676, + "learning_rate": 9.99646633963897e-06, + "loss": 0.3458225250244141, + "memory(GiB)": 71.85, + "step": 475, + "token_acc": 0.8953594473049599, + "train_speed(iter/s)": 0.080375 + }, + { + "epoch": 0.1860149683919878, + "grad_norm": 0.7686738967895508, + "learning_rate": 9.996053433922963e-06, + "loss": 0.3410240650177002, + "memory(GiB)": 71.85, + "step": 480, + "token_acc": 0.8842588257327465, + "train_speed(iter/s)": 0.080335 + }, + { + "epoch": 0.18795262431273765, + "grad_norm": 0.7207880020141602, + "learning_rate": 9.99561772806602e-06, + "loss": 0.33681282997131345, + "memory(GiB)": 71.85, + "step": 485, + "token_acc": 0.8913524098288169, + "train_speed(iter/s)": 0.080287 + }, + { + "epoch": 0.18989028023348753, + "grad_norm": 0.698858916759491, + "learning_rate": 9.99515922405654e-06, + "loss": 0.33275785446166994, + "memory(GiB)": 71.85, + "step": 490, + "token_acc": 0.8863521376169997, + "train_speed(iter/s)": 0.080363 + }, + { + "epoch": 0.1918279361542374, + "grad_norm": 0.717072069644928, + "learning_rate": 9.994677923986966e-06, + "loss": 0.3287045478820801, + "memory(GiB)": 71.85, + "step": 495, + "token_acc": 0.889167089249493, + "train_speed(iter/s)": 0.080389 + }, + { + "epoch": 0.19376559207498728, + "grad_norm": 0.8267790079116821, + "learning_rate": 9.994173830053775e-06, + "loss": 0.34030606746673586, + "memory(GiB)": 71.85, + "step": 500, + "token_acc": 0.8931808444019643, + "train_speed(iter/s)": 0.080364 + }, + { + "epoch": 0.19570324799573716, + "grad_norm": 0.8210350871086121, + "learning_rate": 9.993646944557464e-06, + "loss": 0.34555807113647463, + "memory(GiB)": 71.85, + "step": 505, + "token_acc": 0.8850965901724361, + "train_speed(iter/s)": 0.080382 + }, + { + "epoch": 0.19764090391648703, + "grad_norm": 0.7384742498397827, + "learning_rate": 9.993097269902543e-06, + "loss": 0.33759369850158694, + "memory(GiB)": 71.85, + "step": 510, + "token_acc": 0.8914952751528628, + "train_speed(iter/s)": 0.080379 + }, + { + "epoch": 0.1995785598372369, + "grad_norm": 0.7011024355888367, + "learning_rate": 9.992524808597527e-06, + "loss": 0.34852309226989747, + "memory(GiB)": 71.85, + "step": 515, + "token_acc": 0.8825714285714286, + "train_speed(iter/s)": 0.080335 + }, + { + "epoch": 0.20151621575798678, + "grad_norm": 0.7273775935173035, + "learning_rate": 9.991929563254913e-06, + "loss": 0.34394521713256837, + "memory(GiB)": 71.85, + "step": 520, + "token_acc": 0.893121525019857, + "train_speed(iter/s)": 0.080345 + }, + { + "epoch": 0.20345387167873666, + "grad_norm": 0.7170522212982178, + "learning_rate": 9.991311536591187e-06, + "loss": 0.3372217893600464, + "memory(GiB)": 71.85, + "step": 525, + "token_acc": 0.8847938227946663, + "train_speed(iter/s)": 0.080317 + }, + { + "epoch": 0.20539152759948653, + "grad_norm": 0.711494505405426, + "learning_rate": 9.990670731426787e-06, + "loss": 0.3441883087158203, + "memory(GiB)": 71.85, + "step": 530, + "token_acc": 0.879945936813651, + "train_speed(iter/s)": 0.080228 + }, + { + "epoch": 0.20732918352023638, + "grad_norm": 0.7713858485221863, + "learning_rate": 9.990007150686116e-06, + "loss": 0.33831114768981935, + "memory(GiB)": 71.85, + "step": 535, + "token_acc": 0.8745052315056304, + "train_speed(iter/s)": 0.080271 + }, + { + "epoch": 0.20926683944098626, + "grad_norm": 0.6852882504463196, + "learning_rate": 9.98932079739751e-06, + "loss": 0.34247727394104005, + "memory(GiB)": 71.85, + "step": 540, + "token_acc": 0.8979679758742012, + "train_speed(iter/s)": 0.080267 + }, + { + "epoch": 0.21120449536173613, + "grad_norm": 0.7397508025169373, + "learning_rate": 9.98861167469323e-06, + "loss": 0.31511332988739016, + "memory(GiB)": 71.85, + "step": 545, + "token_acc": 0.8939690385473518, + "train_speed(iter/s)": 0.080308 + }, + { + "epoch": 0.213142151282486, + "grad_norm": 0.7629010677337646, + "learning_rate": 9.987879785809452e-06, + "loss": 0.33156523704528806, + "memory(GiB)": 71.85, + "step": 550, + "token_acc": 0.8859730832235891, + "train_speed(iter/s)": 0.080275 + }, + { + "epoch": 0.21507980720323588, + "grad_norm": 0.7358865737915039, + "learning_rate": 9.987125134086247e-06, + "loss": 0.3441560506820679, + "memory(GiB)": 71.85, + "step": 555, + "token_acc": 0.8937451892506945, + "train_speed(iter/s)": 0.080387 + }, + { + "epoch": 0.21701746312398576, + "grad_norm": 0.7392430305480957, + "learning_rate": 9.986347722967562e-06, + "loss": 0.3216050863265991, + "memory(GiB)": 71.85, + "step": 560, + "token_acc": 0.8860679676091863, + "train_speed(iter/s)": 0.080374 + }, + { + "epoch": 0.21895511904473564, + "grad_norm": 0.7147052884101868, + "learning_rate": 9.985547556001219e-06, + "loss": 0.33604471683502196, + "memory(GiB)": 71.85, + "step": 565, + "token_acc": 0.8841051235074863, + "train_speed(iter/s)": 0.080384 + }, + { + "epoch": 0.2208927749654855, + "grad_norm": 0.7495723366737366, + "learning_rate": 9.98472463683888e-06, + "loss": 0.34450702667236327, + "memory(GiB)": 71.85, + "step": 570, + "token_acc": 0.8893646715513687, + "train_speed(iter/s)": 0.080369 + }, + { + "epoch": 0.2228304308862354, + "grad_norm": 0.6873824596405029, + "learning_rate": 9.98387896923605e-06, + "loss": 0.3255999326705933, + "memory(GiB)": 71.85, + "step": 575, + "token_acc": 0.884888862332696, + "train_speed(iter/s)": 0.080403 + }, + { + "epoch": 0.22476808680698526, + "grad_norm": 0.7272993326187134, + "learning_rate": 9.983010557052036e-06, + "loss": 0.34434938430786133, + "memory(GiB)": 71.85, + "step": 580, + "token_acc": 0.8903038878781753, + "train_speed(iter/s)": 0.080382 + }, + { + "epoch": 0.2267057427277351, + "grad_norm": 0.7470014095306396, + "learning_rate": 9.982119404249953e-06, + "loss": 0.33343305587768557, + "memory(GiB)": 71.85, + "step": 585, + "token_acc": 0.8992504684572142, + "train_speed(iter/s)": 0.080362 + }, + { + "epoch": 0.22864339864848499, + "grad_norm": 0.7505276203155518, + "learning_rate": 9.981205514896696e-06, + "loss": 0.3441640377044678, + "memory(GiB)": 71.85, + "step": 590, + "token_acc": 0.8857358193793623, + "train_speed(iter/s)": 0.080377 + }, + { + "epoch": 0.23058105456923486, + "grad_norm": 0.7288657426834106, + "learning_rate": 9.980268893162915e-06, + "loss": 0.3250357866287231, + "memory(GiB)": 71.85, + "step": 595, + "token_acc": 0.8958659141905215, + "train_speed(iter/s)": 0.080393 + }, + { + "epoch": 0.23251871048998474, + "grad_norm": 0.6993427276611328, + "learning_rate": 9.979309543323003e-06, + "loss": 0.3328931570053101, + "memory(GiB)": 71.85, + "step": 600, + "token_acc": 0.8799145041844783, + "train_speed(iter/s)": 0.080448 + }, + { + "epoch": 0.2344563664107346, + "grad_norm": 0.7342857718467712, + "learning_rate": 9.978327469755085e-06, + "loss": 0.3401214599609375, + "memory(GiB)": 71.85, + "step": 605, + "token_acc": 0.8813226985709538, + "train_speed(iter/s)": 0.080407 + }, + { + "epoch": 0.2363940223314845, + "grad_norm": 0.7363426685333252, + "learning_rate": 9.977322676940975e-06, + "loss": 0.3349423885345459, + "memory(GiB)": 71.85, + "step": 610, + "token_acc": 0.87650187118377, + "train_speed(iter/s)": 0.080462 + }, + { + "epoch": 0.23833167825223436, + "grad_norm": 0.7320806980133057, + "learning_rate": 9.97629516946618e-06, + "loss": 0.332952880859375, + "memory(GiB)": 71.85, + "step": 615, + "token_acc": 0.9026784355501446, + "train_speed(iter/s)": 0.080475 + }, + { + "epoch": 0.24026933417298424, + "grad_norm": 0.702238917350769, + "learning_rate": 9.975244952019863e-06, + "loss": 0.3283294677734375, + "memory(GiB)": 71.85, + "step": 620, + "token_acc": 0.8897130860741778, + "train_speed(iter/s)": 0.080534 + }, + { + "epoch": 0.24220699009373411, + "grad_norm": 0.6831479072570801, + "learning_rate": 9.974172029394827e-06, + "loss": 0.3218890428543091, + "memory(GiB)": 71.85, + "step": 625, + "token_acc": 0.902763491736954, + "train_speed(iter/s)": 0.080587 + }, + { + "epoch": 0.244144646014484, + "grad_norm": 0.7455374002456665, + "learning_rate": 9.973076406487497e-06, + "loss": 0.3427925109863281, + "memory(GiB)": 71.85, + "step": 630, + "token_acc": 0.8771017735513803, + "train_speed(iter/s)": 0.080633 + }, + { + "epoch": 0.24608230193523384, + "grad_norm": 0.7146010398864746, + "learning_rate": 9.971958088297886e-06, + "loss": 0.35007200241088865, + "memory(GiB)": 71.85, + "step": 635, + "token_acc": 0.8881121350139463, + "train_speed(iter/s)": 0.080632 + }, + { + "epoch": 0.2480199578559837, + "grad_norm": 0.6996679306030273, + "learning_rate": 9.97081707992959e-06, + "loss": 0.3419856071472168, + "memory(GiB)": 71.85, + "step": 640, + "token_acc": 0.8812891400641515, + "train_speed(iter/s)": 0.08065 + }, + { + "epoch": 0.2499576137767336, + "grad_norm": 0.7165552377700806, + "learning_rate": 9.969653386589749e-06, + "loss": 0.33933205604553224, + "memory(GiB)": 71.85, + "step": 645, + "token_acc": 0.8926531416190664, + "train_speed(iter/s)": 0.080663 + }, + { + "epoch": 0.2518952696974835, + "grad_norm": 0.6688835620880127, + "learning_rate": 9.968467013589025e-06, + "loss": 0.3391030550003052, + "memory(GiB)": 71.85, + "step": 650, + "token_acc": 0.8828354743847702, + "train_speed(iter/s)": 0.080687 + }, + { + "epoch": 0.25383292561823334, + "grad_norm": 0.6886083483695984, + "learning_rate": 9.967257966341591e-06, + "loss": 0.318090558052063, + "memory(GiB)": 71.85, + "step": 655, + "token_acc": 0.9001209101234748, + "train_speed(iter/s)": 0.080727 + }, + { + "epoch": 0.2557705815389832, + "grad_norm": 0.7216477394104004, + "learning_rate": 9.966026250365086e-06, + "loss": 0.3310126781463623, + "memory(GiB)": 71.85, + "step": 660, + "token_acc": 0.8833163784333672, + "train_speed(iter/s)": 0.080719 + }, + { + "epoch": 0.2577082374597331, + "grad_norm": 0.7138867974281311, + "learning_rate": 9.964771871280611e-06, + "loss": 0.30642869472503664, + "memory(GiB)": 71.85, + "step": 665, + "token_acc": 0.8931192070030896, + "train_speed(iter/s)": 0.080693 + }, + { + "epoch": 0.25964589338048294, + "grad_norm": 0.7110100984573364, + "learning_rate": 9.963494834812688e-06, + "loss": 0.33192169666290283, + "memory(GiB)": 71.85, + "step": 670, + "token_acc": 0.889419329696025, + "train_speed(iter/s)": 0.080681 + }, + { + "epoch": 0.26158354930123284, + "grad_norm": 0.6674191951751709, + "learning_rate": 9.962195146789237e-06, + "loss": 0.3481155872344971, + "memory(GiB)": 71.85, + "step": 675, + "token_acc": 0.8790135092679862, + "train_speed(iter/s)": 0.080646 + }, + { + "epoch": 0.2635212052219827, + "grad_norm": 0.7230032086372375, + "learning_rate": 9.960872813141555e-06, + "loss": 0.31385188102722167, + "memory(GiB)": 71.85, + "step": 680, + "token_acc": 0.8956414978514426, + "train_speed(iter/s)": 0.080653 + }, + { + "epoch": 0.2654588611427326, + "grad_norm": 0.754707396030426, + "learning_rate": 9.959527839904283e-06, + "loss": 0.33814034461975095, + "memory(GiB)": 71.85, + "step": 685, + "token_acc": 0.8922617070740618, + "train_speed(iter/s)": 0.080635 + }, + { + "epoch": 0.26739651706348244, + "grad_norm": 0.6921854615211487, + "learning_rate": 9.958160233215383e-06, + "loss": 0.3266612529754639, + "memory(GiB)": 71.85, + "step": 690, + "token_acc": 0.8988545975182947, + "train_speed(iter/s)": 0.08066 + }, + { + "epoch": 0.26933417298423235, + "grad_norm": 0.7002295851707458, + "learning_rate": 9.956769999316108e-06, + "loss": 0.34218902587890626, + "memory(GiB)": 71.85, + "step": 695, + "token_acc": 0.9018419058288576, + "train_speed(iter/s)": 0.080623 + }, + { + "epoch": 0.2712718289049822, + "grad_norm": 0.686951756477356, + "learning_rate": 9.955357144550967e-06, + "loss": 0.32504887580871583, + "memory(GiB)": 71.85, + "step": 700, + "token_acc": 0.8831498186254053, + "train_speed(iter/s)": 0.080585 + }, + { + "epoch": 0.2732094848257321, + "grad_norm": 0.635576069355011, + "learning_rate": 9.953921675367711e-06, + "loss": 0.3315123558044434, + "memory(GiB)": 71.85, + "step": 705, + "token_acc": 0.8815539513677811, + "train_speed(iter/s)": 0.080527 + }, + { + "epoch": 0.27514714074648194, + "grad_norm": 0.6723300218582153, + "learning_rate": 9.952463598317286e-06, + "loss": 0.31166939735412597, + "memory(GiB)": 71.85, + "step": 710, + "token_acc": 0.8939772037499157, + "train_speed(iter/s)": 0.080549 + }, + { + "epoch": 0.2770847966672318, + "grad_norm": 0.7758930921554565, + "learning_rate": 9.950982920053822e-06, + "loss": 0.33371968269348146, + "memory(GiB)": 71.85, + "step": 715, + "token_acc": 0.8760346611484738, + "train_speed(iter/s)": 0.080596 + }, + { + "epoch": 0.2790224525879817, + "grad_norm": 0.7074525952339172, + "learning_rate": 9.949479647334584e-06, + "loss": 0.33358490467071533, + "memory(GiB)": 71.85, + "step": 720, + "token_acc": 0.8858836043154535, + "train_speed(iter/s)": 0.080652 + }, + { + "epoch": 0.28096010850873154, + "grad_norm": 0.7324233055114746, + "learning_rate": 9.947953787019955e-06, + "loss": 0.3313758134841919, + "memory(GiB)": 71.85, + "step": 725, + "token_acc": 0.8816807191136506, + "train_speed(iter/s)": 0.080632 + }, + { + "epoch": 0.28289776442948145, + "grad_norm": 0.6607609391212463, + "learning_rate": 9.946405346073395e-06, + "loss": 0.343143105506897, + "memory(GiB)": 71.85, + "step": 730, + "token_acc": 0.8806101783497746, + "train_speed(iter/s)": 0.080642 + }, + { + "epoch": 0.2848354203502313, + "grad_norm": 0.6818123459815979, + "learning_rate": 9.944834331561418e-06, + "loss": 0.3311434507369995, + "memory(GiB)": 71.85, + "step": 735, + "token_acc": 0.8792830615210722, + "train_speed(iter/s)": 0.080645 + }, + { + "epoch": 0.2867730762709812, + "grad_norm": 0.6960747241973877, + "learning_rate": 9.943240750653552e-06, + "loss": 0.3408792972564697, + "memory(GiB)": 71.85, + "step": 740, + "token_acc": 0.8895531360630552, + "train_speed(iter/s)": 0.080666 + }, + { + "epoch": 0.28871073219173105, + "grad_norm": 0.6795424818992615, + "learning_rate": 9.941624610622312e-06, + "loss": 0.32071099281311033, + "memory(GiB)": 71.85, + "step": 745, + "token_acc": 0.8923172567240364, + "train_speed(iter/s)": 0.080651 + }, + { + "epoch": 0.29064838811248095, + "grad_norm": 0.7239606976509094, + "learning_rate": 9.939985918843163e-06, + "loss": 0.3250409126281738, + "memory(GiB)": 71.85, + "step": 750, + "token_acc": 0.8985261336793425, + "train_speed(iter/s)": 0.080637 + }, + { + "epoch": 0.2925860440332308, + "grad_norm": 0.6698892116546631, + "learning_rate": 9.938324682794489e-06, + "loss": 0.3211568832397461, + "memory(GiB)": 71.85, + "step": 755, + "token_acc": 0.8952513503661729, + "train_speed(iter/s)": 0.08066 + }, + { + "epoch": 0.29452369995398064, + "grad_norm": 0.6671373844146729, + "learning_rate": 9.936640910057557e-06, + "loss": 0.33314924240112304, + "memory(GiB)": 71.85, + "step": 760, + "token_acc": 0.8977999531192445, + "train_speed(iter/s)": 0.080628 + }, + { + "epoch": 0.29646135587473055, + "grad_norm": 0.6509130001068115, + "learning_rate": 9.934934608316484e-06, + "loss": 0.3198971748352051, + "memory(GiB)": 71.85, + "step": 765, + "token_acc": 0.9052997393570807, + "train_speed(iter/s)": 0.080619 + }, + { + "epoch": 0.2983990117954804, + "grad_norm": 0.7284981608390808, + "learning_rate": 9.9332057853582e-06, + "loss": 0.32789950370788573, + "memory(GiB)": 71.85, + "step": 770, + "token_acc": 0.8824503819135556, + "train_speed(iter/s)": 0.080637 + }, + { + "epoch": 0.3003366677162303, + "grad_norm": 0.6976826786994934, + "learning_rate": 9.931454449072414e-06, + "loss": 0.3451281309127808, + "memory(GiB)": 71.85, + "step": 775, + "token_acc": 0.8759064042410578, + "train_speed(iter/s)": 0.080614 + }, + { + "epoch": 0.30227432363698015, + "grad_norm": 0.7098056077957153, + "learning_rate": 9.929680607451577e-06, + "loss": 0.3235619306564331, + "memory(GiB)": 71.85, + "step": 780, + "token_acc": 0.8838213673119334, + "train_speed(iter/s)": 0.080685 + }, + { + "epoch": 0.30421197955773005, + "grad_norm": 0.7323970198631287, + "learning_rate": 9.927884268590846e-06, + "loss": 0.32435629367828367, + "memory(GiB)": 71.85, + "step": 785, + "token_acc": 0.901668191736468, + "train_speed(iter/s)": 0.080703 + }, + { + "epoch": 0.3061496354784799, + "grad_norm": 0.6804087162017822, + "learning_rate": 9.926065440688048e-06, + "loss": 0.33081588745117185, + "memory(GiB)": 71.85, + "step": 790, + "token_acc": 0.8955827220863896, + "train_speed(iter/s)": 0.080696 + }, + { + "epoch": 0.3080872913992298, + "grad_norm": 0.6831809282302856, + "learning_rate": 9.92422413204364e-06, + "loss": 0.3195831775665283, + "memory(GiB)": 71.85, + "step": 795, + "token_acc": 0.8932602703766899, + "train_speed(iter/s)": 0.080702 + }, + { + "epoch": 0.31002494731997965, + "grad_norm": 0.7360027432441711, + "learning_rate": 9.922360351060678e-06, + "loss": 0.3314258098602295, + "memory(GiB)": 71.85, + "step": 800, + "token_acc": 0.880964780938741, + "train_speed(iter/s)": 0.080737 + }, + { + "epoch": 0.31196260324072955, + "grad_norm": 0.6755993366241455, + "learning_rate": 9.920474106244764e-06, + "loss": 0.3221295833587646, + "memory(GiB)": 71.85, + "step": 805, + "token_acc": 0.9010371500919213, + "train_speed(iter/s)": 0.080777 + }, + { + "epoch": 0.3139002591614794, + "grad_norm": 0.6507264971733093, + "learning_rate": 9.918565406204026e-06, + "loss": 0.3143099546432495, + "memory(GiB)": 71.85, + "step": 810, + "token_acc": 0.8967003567181926, + "train_speed(iter/s)": 0.080768 + }, + { + "epoch": 0.31583791508222925, + "grad_norm": 0.6312580108642578, + "learning_rate": 9.916634259649063e-06, + "loss": 0.32530817985534666, + "memory(GiB)": 71.85, + "step": 815, + "token_acc": 0.8985116339529442, + "train_speed(iter/s)": 0.080764 + }, + { + "epoch": 0.31777557100297915, + "grad_norm": 0.6676200032234192, + "learning_rate": 9.914680675392915e-06, + "loss": 0.33031282424926756, + "memory(GiB)": 71.85, + "step": 820, + "token_acc": 0.8891376185907686, + "train_speed(iter/s)": 0.080751 + }, + { + "epoch": 0.319713226923729, + "grad_norm": 0.6487600803375244, + "learning_rate": 9.91270466235102e-06, + "loss": 0.3286482334136963, + "memory(GiB)": 71.85, + "step": 825, + "token_acc": 0.8954001460271103, + "train_speed(iter/s)": 0.080787 + }, + { + "epoch": 0.3216508828444789, + "grad_norm": 0.6774858236312866, + "learning_rate": 9.910706229541168e-06, + "loss": 0.33796694278717043, + "memory(GiB)": 71.85, + "step": 830, + "token_acc": 0.8842560947824105, + "train_speed(iter/s)": 0.080774 + }, + { + "epoch": 0.32358853876522875, + "grad_norm": 0.6695789098739624, + "learning_rate": 9.90868538608347e-06, + "loss": 0.3218432903289795, + "memory(GiB)": 71.85, + "step": 835, + "token_acc": 0.8819998762452819, + "train_speed(iter/s)": 0.080748 + }, + { + "epoch": 0.32552619468597865, + "grad_norm": 0.6637648940086365, + "learning_rate": 9.906642141200305e-06, + "loss": 0.3061497688293457, + "memory(GiB)": 71.85, + "step": 840, + "token_acc": 0.8962320773591197, + "train_speed(iter/s)": 0.080761 + }, + { + "epoch": 0.3274638506067285, + "grad_norm": 0.6814354658126831, + "learning_rate": 9.904576504216292e-06, + "loss": 0.3214125156402588, + "memory(GiB)": 71.85, + "step": 845, + "token_acc": 0.8940032614381096, + "train_speed(iter/s)": 0.080715 + }, + { + "epoch": 0.3294015065274784, + "grad_norm": 0.687163770198822, + "learning_rate": 9.902488484558231e-06, + "loss": 0.3220739603042603, + "memory(GiB)": 71.85, + "step": 850, + "token_acc": 0.8982021789725774, + "train_speed(iter/s)": 0.080746 + }, + { + "epoch": 0.33133916244822825, + "grad_norm": 0.6442270874977112, + "learning_rate": 9.900378091755072e-06, + "loss": 0.3244103670120239, + "memory(GiB)": 71.85, + "step": 855, + "token_acc": 0.8932569296375267, + "train_speed(iter/s)": 0.080751 + }, + { + "epoch": 0.33327681836897816, + "grad_norm": 0.6879537105560303, + "learning_rate": 9.89824533543787e-06, + "loss": 0.33673839569091796, + "memory(GiB)": 71.85, + "step": 860, + "token_acc": 0.8844792633061883, + "train_speed(iter/s)": 0.080723 + }, + { + "epoch": 0.335214474289728, + "grad_norm": 0.6312484741210938, + "learning_rate": 9.896090225339735e-06, + "loss": 0.3281097412109375, + "memory(GiB)": 71.85, + "step": 865, + "token_acc": 0.8877129503995176, + "train_speed(iter/s)": 0.080719 + }, + { + "epoch": 0.33715213021047785, + "grad_norm": 0.6339064836502075, + "learning_rate": 9.893912771295792e-06, + "loss": 0.33004236221313477, + "memory(GiB)": 71.85, + "step": 870, + "token_acc": 0.8873361187148826, + "train_speed(iter/s)": 0.080697 + }, + { + "epoch": 0.33908978613122776, + "grad_norm": 0.6488205790519714, + "learning_rate": 9.891712983243138e-06, + "loss": 0.31695027351379396, + "memory(GiB)": 71.85, + "step": 875, + "token_acc": 0.8903181427343079, + "train_speed(iter/s)": 0.080716 + }, + { + "epoch": 0.3410274420519776, + "grad_norm": 0.6404213905334473, + "learning_rate": 9.889490871220791e-06, + "loss": 0.3214251041412354, + "memory(GiB)": 71.85, + "step": 880, + "token_acc": 0.8919099073814202, + "train_speed(iter/s)": 0.080726 + }, + { + "epoch": 0.3429650979727275, + "grad_norm": 0.6886140704154968, + "learning_rate": 9.887246445369651e-06, + "loss": 0.32711448669433596, + "memory(GiB)": 71.85, + "step": 885, + "token_acc": 0.8934789023310711, + "train_speed(iter/s)": 0.080709 + }, + { + "epoch": 0.34490275389347735, + "grad_norm": 0.6515597701072693, + "learning_rate": 9.884979715932444e-06, + "loss": 0.3249198436737061, + "memory(GiB)": 71.85, + "step": 890, + "token_acc": 0.8946919213472081, + "train_speed(iter/s)": 0.080729 + }, + { + "epoch": 0.34684040981422726, + "grad_norm": 0.6933703422546387, + "learning_rate": 9.88269069325369e-06, + "loss": 0.3156230926513672, + "memory(GiB)": 71.85, + "step": 895, + "token_acc": 0.8905154706400182, + "train_speed(iter/s)": 0.080718 + }, + { + "epoch": 0.3487780657349771, + "grad_norm": 0.6851153373718262, + "learning_rate": 9.880379387779637e-06, + "loss": 0.330825662612915, + "memory(GiB)": 71.85, + "step": 900, + "token_acc": 0.8904936548635227, + "train_speed(iter/s)": 0.080739 + }, + { + "epoch": 0.350715721655727, + "grad_norm": 0.6466790437698364, + "learning_rate": 9.878045810058232e-06, + "loss": 0.3320996999740601, + "memory(GiB)": 71.85, + "step": 905, + "token_acc": 0.8887969258054981, + "train_speed(iter/s)": 0.08071 + }, + { + "epoch": 0.35265337757647686, + "grad_norm": 0.6607591509819031, + "learning_rate": 9.875689970739062e-06, + "loss": 0.3408550500869751, + "memory(GiB)": 71.85, + "step": 910, + "token_acc": 0.8919545885909107, + "train_speed(iter/s)": 0.080729 + }, + { + "epoch": 0.3545910334972267, + "grad_norm": 0.6846933960914612, + "learning_rate": 9.873311880573305e-06, + "loss": 0.33896684646606445, + "memory(GiB)": 71.85, + "step": 915, + "token_acc": 0.8795058992243009, + "train_speed(iter/s)": 0.08072 + }, + { + "epoch": 0.3565286894179766, + "grad_norm": 0.690334677696228, + "learning_rate": 9.870911550413684e-06, + "loss": 0.3155628204345703, + "memory(GiB)": 71.85, + "step": 920, + "token_acc": 0.885660422526782, + "train_speed(iter/s)": 0.080721 + }, + { + "epoch": 0.35846634533872646, + "grad_norm": 0.695077121257782, + "learning_rate": 9.86848899121442e-06, + "loss": 0.32448444366455076, + "memory(GiB)": 71.85, + "step": 925, + "token_acc": 0.8889850612297236, + "train_speed(iter/s)": 0.08077 + }, + { + "epoch": 0.36040400125947636, + "grad_norm": 0.6861225962638855, + "learning_rate": 9.866044214031179e-06, + "loss": 0.31257429122924807, + "memory(GiB)": 71.85, + "step": 930, + "token_acc": 0.8953256377369246, + "train_speed(iter/s)": 0.080788 + }, + { + "epoch": 0.3623416571802262, + "grad_norm": 0.6602544784545898, + "learning_rate": 9.86357723002102e-06, + "loss": 0.3284167766571045, + "memory(GiB)": 71.85, + "step": 935, + "token_acc": 0.8837062915053662, + "train_speed(iter/s)": 0.08078 + }, + { + "epoch": 0.3642793131009761, + "grad_norm": 0.641711950302124, + "learning_rate": 9.861088050442342e-06, + "loss": 0.3246732234954834, + "memory(GiB)": 71.85, + "step": 940, + "token_acc": 0.9021792751226023, + "train_speed(iter/s)": 0.080795 + }, + { + "epoch": 0.36621696902172596, + "grad_norm": 0.666860044002533, + "learning_rate": 9.858576686654847e-06, + "loss": 0.3272420406341553, + "memory(GiB)": 71.86, + "step": 945, + "token_acc": 0.9028790786948176, + "train_speed(iter/s)": 0.080725 + }, + { + "epoch": 0.36815462494247586, + "grad_norm": 0.6561484932899475, + "learning_rate": 9.856043150119466e-06, + "loss": 0.3124321460723877, + "memory(GiB)": 71.86, + "step": 950, + "token_acc": 0.9018067556952082, + "train_speed(iter/s)": 0.080731 + }, + { + "epoch": 0.3700922808632257, + "grad_norm": 0.6260762214660645, + "learning_rate": 9.853487452398324e-06, + "loss": 0.3247486114501953, + "memory(GiB)": 71.86, + "step": 955, + "token_acc": 0.8947277680079997, + "train_speed(iter/s)": 0.080736 + }, + { + "epoch": 0.3720299367839756, + "grad_norm": 0.6457754373550415, + "learning_rate": 9.850909605154682e-06, + "loss": 0.3376758575439453, + "memory(GiB)": 71.86, + "step": 960, + "token_acc": 0.8909614421632449, + "train_speed(iter/s)": 0.080731 + }, + { + "epoch": 0.37396759270472546, + "grad_norm": 0.6789243817329407, + "learning_rate": 9.84830962015288e-06, + "loss": 0.3184741258621216, + "memory(GiB)": 71.86, + "step": 965, + "token_acc": 0.8856595547922584, + "train_speed(iter/s)": 0.080768 + }, + { + "epoch": 0.3759052486254753, + "grad_norm": 0.6199044585227966, + "learning_rate": 9.84568750925829e-06, + "loss": 0.3230047941207886, + "memory(GiB)": 71.86, + "step": 970, + "token_acc": 0.8937249758387409, + "train_speed(iter/s)": 0.080769 + }, + { + "epoch": 0.3778429045462252, + "grad_norm": 0.630508542060852, + "learning_rate": 9.843043284437257e-06, + "loss": 0.33438754081726074, + "memory(GiB)": 71.86, + "step": 975, + "token_acc": 0.8890298436459357, + "train_speed(iter/s)": 0.080752 + }, + { + "epoch": 0.37978056046697506, + "grad_norm": 0.6191177368164062, + "learning_rate": 9.840376957757042e-06, + "loss": 0.32352337837219236, + "memory(GiB)": 71.86, + "step": 980, + "token_acc": 0.8885422380691168, + "train_speed(iter/s)": 0.080724 + }, + { + "epoch": 0.38171821638772496, + "grad_norm": 0.6409081816673279, + "learning_rate": 9.83768854138578e-06, + "loss": 0.33620543479919435, + "memory(GiB)": 71.86, + "step": 985, + "token_acc": 0.8729323535637606, + "train_speed(iter/s)": 0.080742 + }, + { + "epoch": 0.3836558723084748, + "grad_norm": 0.6569280028343201, + "learning_rate": 9.834978047592404e-06, + "loss": 0.3181809425354004, + "memory(GiB)": 71.86, + "step": 990, + "token_acc": 0.8895862964566199, + "train_speed(iter/s)": 0.08074 + }, + { + "epoch": 0.3855935282292247, + "grad_norm": 0.637712836265564, + "learning_rate": 9.832245488746612e-06, + "loss": 0.3220236301422119, + "memory(GiB)": 71.86, + "step": 995, + "token_acc": 0.9014977162482739, + "train_speed(iter/s)": 0.080745 + }, + { + "epoch": 0.38753118414997456, + "grad_norm": 0.6087589263916016, + "learning_rate": 9.829490877318785e-06, + "loss": 0.30138335227966306, + "memory(GiB)": 71.86, + "step": 1000, + "token_acc": 0.9027007152884551, + "train_speed(iter/s)": 0.080744 + }, + { + "epoch": 0.38946884007072446, + "grad_norm": 0.6550173759460449, + "learning_rate": 9.826714225879957e-06, + "loss": 0.3302351236343384, + "memory(GiB)": 71.86, + "step": 1005, + "token_acc": 0.884102234367465, + "train_speed(iter/s)": 0.080755 + }, + { + "epoch": 0.3914064959914743, + "grad_norm": 0.6532196998596191, + "learning_rate": 9.823915547101735e-06, + "loss": 0.33039035797119143, + "memory(GiB)": 71.86, + "step": 1010, + "token_acc": 0.8898355520751762, + "train_speed(iter/s)": 0.080755 + }, + { + "epoch": 0.39334415191222416, + "grad_norm": 0.6979678869247437, + "learning_rate": 9.821094853756256e-06, + "loss": 0.32817845344543456, + "memory(GiB)": 71.86, + "step": 1015, + "token_acc": 0.8947746637922185, + "train_speed(iter/s)": 0.080747 + }, + { + "epoch": 0.39528180783297406, + "grad_norm": 0.6975110173225403, + "learning_rate": 9.818252158716121e-06, + "loss": 0.33469138145446775, + "memory(GiB)": 71.86, + "step": 1020, + "token_acc": 0.8819404784567548, + "train_speed(iter/s)": 0.080768 + }, + { + "epoch": 0.3972194637537239, + "grad_norm": 0.6733399033546448, + "learning_rate": 9.81538747495434e-06, + "loss": 0.3251255989074707, + "memory(GiB)": 71.86, + "step": 1025, + "token_acc": 0.8900912769361061, + "train_speed(iter/s)": 0.080814 + }, + { + "epoch": 0.3991571196744738, + "grad_norm": 0.6425248980522156, + "learning_rate": 9.812500815544272e-06, + "loss": 0.314360523223877, + "memory(GiB)": 71.86, + "step": 1030, + "token_acc": 0.8923047727488377, + "train_speed(iter/s)": 0.080807 + }, + { + "epoch": 0.40109477559522366, + "grad_norm": 0.6609647274017334, + "learning_rate": 9.809592193659562e-06, + "loss": 0.31197681427001955, + "memory(GiB)": 71.86, + "step": 1035, + "token_acc": 0.8963696597418731, + "train_speed(iter/s)": 0.080824 + }, + { + "epoch": 0.40303243151597357, + "grad_norm": 0.7050966024398804, + "learning_rate": 9.806661622574084e-06, + "loss": 0.34067888259887696, + "memory(GiB)": 71.86, + "step": 1040, + "token_acc": 0.889449232838193, + "train_speed(iter/s)": 0.080806 + }, + { + "epoch": 0.4049700874367234, + "grad_norm": 0.6383348703384399, + "learning_rate": 9.803709115661882e-06, + "loss": 0.32077836990356445, + "memory(GiB)": 71.86, + "step": 1045, + "token_acc": 0.8989039372164641, + "train_speed(iter/s)": 0.080784 + }, + { + "epoch": 0.4069077433574733, + "grad_norm": 0.6369540095329285, + "learning_rate": 9.800734686397105e-06, + "loss": 0.3368854999542236, + "memory(GiB)": 71.86, + "step": 1050, + "token_acc": 0.8808575338300136, + "train_speed(iter/s)": 0.080786 + }, + { + "epoch": 0.40884539927822316, + "grad_norm": 0.6770035028457642, + "learning_rate": 9.797738348353951e-06, + "loss": 0.3146085023880005, + "memory(GiB)": 71.86, + "step": 1055, + "token_acc": 0.9046233656980617, + "train_speed(iter/s)": 0.080782 + }, + { + "epoch": 0.41078305519897307, + "grad_norm": 0.6430585980415344, + "learning_rate": 9.794720115206597e-06, + "loss": 0.3340238332748413, + "memory(GiB)": 71.86, + "step": 1060, + "token_acc": 0.8887318968562345, + "train_speed(iter/s)": 0.080765 + }, + { + "epoch": 0.4127207111197229, + "grad_norm": 0.6269405484199524, + "learning_rate": 9.791680000729145e-06, + "loss": 0.3351738452911377, + "memory(GiB)": 71.86, + "step": 1065, + "token_acc": 0.8993369707852752, + "train_speed(iter/s)": 0.080785 + }, + { + "epoch": 0.41465836704047276, + "grad_norm": 0.617246150970459, + "learning_rate": 9.788618018795552e-06, + "loss": 0.3090823650360107, + "memory(GiB)": 71.86, + "step": 1070, + "token_acc": 0.8943518151613985, + "train_speed(iter/s)": 0.080806 + }, + { + "epoch": 0.41659602296122267, + "grad_norm": 0.6660940647125244, + "learning_rate": 9.785534183379571e-06, + "loss": 0.3186957597732544, + "memory(GiB)": 71.86, + "step": 1075, + "token_acc": 0.8957356770833333, + "train_speed(iter/s)": 0.080805 + }, + { + "epoch": 0.4185336788819725, + "grad_norm": 0.6464110016822815, + "learning_rate": 9.78242850855469e-06, + "loss": 0.30962181091308594, + "memory(GiB)": 71.86, + "step": 1080, + "token_acc": 0.8997256004607203, + "train_speed(iter/s)": 0.080829 + }, + { + "epoch": 0.4204713348027224, + "grad_norm": 0.6295762062072754, + "learning_rate": 9.779301008494057e-06, + "loss": 0.32428104877471925, + "memory(GiB)": 71.86, + "step": 1085, + "token_acc": 0.8935828705338512, + "train_speed(iter/s)": 0.080829 + }, + { + "epoch": 0.42240899072347227, + "grad_norm": 0.6192321181297302, + "learning_rate": 9.776151697470431e-06, + "loss": 0.3047311305999756, + "memory(GiB)": 71.86, + "step": 1090, + "token_acc": 0.8998032786885246, + "train_speed(iter/s)": 0.080814 + }, + { + "epoch": 0.42434664664422217, + "grad_norm": 0.6715239882469177, + "learning_rate": 9.772980589856099e-06, + "loss": 0.3298771381378174, + "memory(GiB)": 71.86, + "step": 1095, + "token_acc": 0.8884317290211463, + "train_speed(iter/s)": 0.080817 + }, + { + "epoch": 0.426284302564972, + "grad_norm": 0.6691494584083557, + "learning_rate": 9.769787700122823e-06, + "loss": 0.32638509273529054, + "memory(GiB)": 71.86, + "step": 1100, + "token_acc": 0.9007710475131981, + "train_speed(iter/s)": 0.080813 + }, + { + "epoch": 0.4282219584857219, + "grad_norm": 0.6489860415458679, + "learning_rate": 9.766573042841776e-06, + "loss": 0.3245856761932373, + "memory(GiB)": 71.86, + "step": 1105, + "token_acc": 0.8856663638269621, + "train_speed(iter/s)": 0.080796 + }, + { + "epoch": 0.43015961440647177, + "grad_norm": 0.6144282817840576, + "learning_rate": 9.763336632683463e-06, + "loss": 0.3188477039337158, + "memory(GiB)": 71.86, + "step": 1110, + "token_acc": 0.8965635738831615, + "train_speed(iter/s)": 0.080789 + }, + { + "epoch": 0.4320972703272216, + "grad_norm": 0.6301048398017883, + "learning_rate": 9.760078484417661e-06, + "loss": 0.33411540985107424, + "memory(GiB)": 71.86, + "step": 1115, + "token_acc": 0.8902315885476043, + "train_speed(iter/s)": 0.080787 + }, + { + "epoch": 0.4340349262479715, + "grad_norm": 0.6585410237312317, + "learning_rate": 9.756798612913358e-06, + "loss": 0.3051680326461792, + "memory(GiB)": 71.86, + "step": 1120, + "token_acc": 0.8869892784700087, + "train_speed(iter/s)": 0.080825 + }, + { + "epoch": 0.43597258216872137, + "grad_norm": 0.5961290597915649, + "learning_rate": 9.753497033138674e-06, + "loss": 0.3312673091888428, + "memory(GiB)": 71.86, + "step": 1125, + "token_acc": 0.9026795875435389, + "train_speed(iter/s)": 0.080816 + }, + { + "epoch": 0.43791023808947127, + "grad_norm": 0.6419817209243774, + "learning_rate": 9.750173760160799e-06, + "loss": 0.3214743614196777, + "memory(GiB)": 71.86, + "step": 1130, + "token_acc": 0.8987567162314132, + "train_speed(iter/s)": 0.080822 + }, + { + "epoch": 0.4398478940102211, + "grad_norm": 0.6663675308227539, + "learning_rate": 9.74682880914592e-06, + "loss": 0.31963672637939455, + "memory(GiB)": 71.86, + "step": 1135, + "token_acc": 0.8844882486732373, + "train_speed(iter/s)": 0.080854 + }, + { + "epoch": 0.441785549930971, + "grad_norm": 0.6634994745254517, + "learning_rate": 9.74346219535916e-06, + "loss": 0.3009810447692871, + "memory(GiB)": 71.86, + "step": 1140, + "token_acc": 0.9029835103409726, + "train_speed(iter/s)": 0.080864 + }, + { + "epoch": 0.44372320585172087, + "grad_norm": 0.6361936926841736, + "learning_rate": 9.740073934164499e-06, + "loss": 0.30399558544158933, + "memory(GiB)": 71.86, + "step": 1145, + "token_acc": 0.8956846262545745, + "train_speed(iter/s)": 0.080856 + }, + { + "epoch": 0.4456608617724708, + "grad_norm": 0.635513961315155, + "learning_rate": 9.736664041024705e-06, + "loss": 0.3424172639846802, + "memory(GiB)": 71.86, + "step": 1150, + "token_acc": 0.8794452236247468, + "train_speed(iter/s)": 0.080844 + }, + { + "epoch": 0.4475985176932206, + "grad_norm": 0.6696315407752991, + "learning_rate": 9.733232531501275e-06, + "loss": 0.3195925235748291, + "memory(GiB)": 71.86, + "step": 1155, + "token_acc": 0.8961313012895662, + "train_speed(iter/s)": 0.080828 + }, + { + "epoch": 0.4495361736139705, + "grad_norm": 0.6544925570487976, + "learning_rate": 9.729779421254346e-06, + "loss": 0.30799403190612795, + "memory(GiB)": 71.86, + "step": 1160, + "token_acc": 0.8943392211108746, + "train_speed(iter/s)": 0.080858 + }, + { + "epoch": 0.4514738295347204, + "grad_norm": 0.6342989206314087, + "learning_rate": 9.726304726042639e-06, + "loss": 0.31985857486724856, + "memory(GiB)": 71.86, + "step": 1165, + "token_acc": 0.8814540558734433, + "train_speed(iter/s)": 0.080869 + }, + { + "epoch": 0.4534114854554702, + "grad_norm": 0.6403650641441345, + "learning_rate": 9.722808461723377e-06, + "loss": 0.31233992576599123, + "memory(GiB)": 71.86, + "step": 1170, + "token_acc": 0.8884251530328325, + "train_speed(iter/s)": 0.080865 + }, + { + "epoch": 0.4553491413762201, + "grad_norm": 0.6330046653747559, + "learning_rate": 9.719290644252219e-06, + "loss": 0.315544581413269, + "memory(GiB)": 71.86, + "step": 1175, + "token_acc": 0.9076247317596566, + "train_speed(iter/s)": 0.080863 + }, + { + "epoch": 0.45728679729696997, + "grad_norm": 0.5967057943344116, + "learning_rate": 9.715751289683181e-06, + "loss": 0.3158837080001831, + "memory(GiB)": 71.86, + "step": 1180, + "token_acc": 0.8883155397390273, + "train_speed(iter/s)": 0.08083 + }, + { + "epoch": 0.4592244532177199, + "grad_norm": 0.6456601619720459, + "learning_rate": 9.712190414168573e-06, + "loss": 0.31205410957336427, + "memory(GiB)": 71.86, + "step": 1185, + "token_acc": 0.895832576390655, + "train_speed(iter/s)": 0.080848 + }, + { + "epoch": 0.4611621091384697, + "grad_norm": 0.6398383378982544, + "learning_rate": 9.70860803395891e-06, + "loss": 0.32424077987670896, + "memory(GiB)": 71.86, + "step": 1190, + "token_acc": 0.8800454803865833, + "train_speed(iter/s)": 0.080842 + }, + { + "epoch": 0.4630997650592196, + "grad_norm": 0.620103657245636, + "learning_rate": 9.705004165402855e-06, + "loss": 0.30479159355163576, + "memory(GiB)": 71.86, + "step": 1195, + "token_acc": 0.9015556387650202, + "train_speed(iter/s)": 0.080832 + }, + { + "epoch": 0.4650374209799695, + "grad_norm": 0.6715317964553833, + "learning_rate": 9.70137882494713e-06, + "loss": 0.3113380432128906, + "memory(GiB)": 71.86, + "step": 1200, + "token_acc": 0.9031839539939687, + "train_speed(iter/s)": 0.080836 + }, + { + "epoch": 0.4669750769007194, + "grad_norm": 0.6765401363372803, + "learning_rate": 9.697732029136446e-06, + "loss": 0.3275330066680908, + "memory(GiB)": 71.86, + "step": 1205, + "token_acc": 0.8961176583924729, + "train_speed(iter/s)": 0.08084 + }, + { + "epoch": 0.4689127328214692, + "grad_norm": 0.6207024455070496, + "learning_rate": 9.694063794613435e-06, + "loss": 0.31103494167327883, + "memory(GiB)": 71.86, + "step": 1210, + "token_acc": 0.9014660276289822, + "train_speed(iter/s)": 0.080834 + }, + { + "epoch": 0.47085038874221913, + "grad_norm": 0.6183663606643677, + "learning_rate": 9.690374138118563e-06, + "loss": 0.31775264739990233, + "memory(GiB)": 71.86, + "step": 1215, + "token_acc": 0.8885089322563529, + "train_speed(iter/s)": 0.080828 + }, + { + "epoch": 0.472788044662969, + "grad_norm": 0.6315492391586304, + "learning_rate": 9.686663076490055e-06, + "loss": 0.3269613265991211, + "memory(GiB)": 71.86, + "step": 1220, + "token_acc": 0.8918183467376187, + "train_speed(iter/s)": 0.080821 + }, + { + "epoch": 0.4747257005837188, + "grad_norm": 0.6290526390075684, + "learning_rate": 9.682930626663826e-06, + "loss": 0.32108583450317385, + "memory(GiB)": 71.86, + "step": 1225, + "token_acc": 0.8929259130654377, + "train_speed(iter/s)": 0.080818 + }, + { + "epoch": 0.4766633565044687, + "grad_norm": 0.6333230137825012, + "learning_rate": 9.679176805673399e-06, + "loss": 0.3004627227783203, + "memory(GiB)": 71.86, + "step": 1230, + "token_acc": 0.8975369131960538, + "train_speed(iter/s)": 0.080831 + }, + { + "epoch": 0.4786010124252186, + "grad_norm": 0.6388303637504578, + "learning_rate": 9.675401630649824e-06, + "loss": 0.321624755859375, + "memory(GiB)": 71.86, + "step": 1235, + "token_acc": 0.8988916178600668, + "train_speed(iter/s)": 0.080824 + }, + { + "epoch": 0.4805386683459685, + "grad_norm": 0.6355609893798828, + "learning_rate": 9.671605118821603e-06, + "loss": 0.3184902906417847, + "memory(GiB)": 71.86, + "step": 1240, + "token_acc": 0.8958213888233112, + "train_speed(iter/s)": 0.080815 + }, + { + "epoch": 0.4824763242667183, + "grad_norm": 0.6359343528747559, + "learning_rate": 9.667787287514614e-06, + "loss": 0.3213565587997437, + "memory(GiB)": 71.86, + "step": 1245, + "token_acc": 0.8997242062088961, + "train_speed(iter/s)": 0.080816 + }, + { + "epoch": 0.48441398018746823, + "grad_norm": 0.595271646976471, + "learning_rate": 9.663948154152028e-06, + "loss": 0.3277468204498291, + "memory(GiB)": 71.86, + "step": 1250, + "token_acc": 0.8972503128487841, + "train_speed(iter/s)": 0.080817 + }, + { + "epoch": 0.4863516361082181, + "grad_norm": 0.5990849137306213, + "learning_rate": 9.660087736254228e-06, + "loss": 0.3170775890350342, + "memory(GiB)": 71.86, + "step": 1255, + "token_acc": 0.8847311019180143, + "train_speed(iter/s)": 0.080811 + }, + { + "epoch": 0.488289292028968, + "grad_norm": 0.6386568546295166, + "learning_rate": 9.656206051438736e-06, + "loss": 0.3276305913925171, + "memory(GiB)": 71.86, + "step": 1260, + "token_acc": 0.8964000938086304, + "train_speed(iter/s)": 0.080816 + }, + { + "epoch": 0.49022694794971783, + "grad_norm": 0.6302090287208557, + "learning_rate": 9.65230311742013e-06, + "loss": 0.32054777145385743, + "memory(GiB)": 71.86, + "step": 1265, + "token_acc": 0.8926590538336052, + "train_speed(iter/s)": 0.080838 + }, + { + "epoch": 0.4921646038704677, + "grad_norm": 0.6372174620628357, + "learning_rate": 9.64837895200995e-06, + "loss": 0.3121542692184448, + "memory(GiB)": 71.86, + "step": 1270, + "token_acc": 0.8987489689304372, + "train_speed(iter/s)": 0.080852 + }, + { + "epoch": 0.4941022597912176, + "grad_norm": 0.5766242146492004, + "learning_rate": 9.644433573116643e-06, + "loss": 0.31241488456726074, + "memory(GiB)": 71.86, + "step": 1275, + "token_acc": 0.9012521632902372, + "train_speed(iter/s)": 0.080856 + }, + { + "epoch": 0.4960399157119674, + "grad_norm": 0.6373428106307983, + "learning_rate": 9.640466998745456e-06, + "loss": 0.32321221828460694, + "memory(GiB)": 71.86, + "step": 1280, + "token_acc": 0.8911403689227182, + "train_speed(iter/s)": 0.080859 + }, + { + "epoch": 0.49797757163271733, + "grad_norm": 0.6617798209190369, + "learning_rate": 9.636479246998371e-06, + "loss": 0.3195833444595337, + "memory(GiB)": 71.86, + "step": 1285, + "token_acc": 0.8979424138837759, + "train_speed(iter/s)": 0.080879 + }, + { + "epoch": 0.4999152275534672, + "grad_norm": 0.6529935598373413, + "learning_rate": 9.632470336074009e-06, + "loss": 0.3121352672576904, + "memory(GiB)": 71.86, + "step": 1290, + "token_acc": 0.8983967935871744, + "train_speed(iter/s)": 0.080937 + }, + { + "epoch": 0.5018528834742171, + "grad_norm": 0.687832236289978, + "learning_rate": 9.628440284267562e-06, + "loss": 0.31133959293365476, + "memory(GiB)": 71.86, + "step": 1295, + "token_acc": 0.9002139001120429, + "train_speed(iter/s)": 0.080945 + }, + { + "epoch": 0.503790539394967, + "grad_norm": 0.6272649765014648, + "learning_rate": 9.624389109970693e-06, + "loss": 0.31259541511535643, + "memory(GiB)": 71.86, + "step": 1300, + "token_acc": 0.893979721166033, + "train_speed(iter/s)": 0.080937 + }, + { + "epoch": 0.5057281953157168, + "grad_norm": 0.663655698299408, + "learning_rate": 9.620316831671467e-06, + "loss": 0.3107600688934326, + "memory(GiB)": 71.86, + "step": 1305, + "token_acc": 0.8993554263294332, + "train_speed(iter/s)": 0.080942 + }, + { + "epoch": 0.5076658512364667, + "grad_norm": 0.6684431433677673, + "learning_rate": 9.616223467954254e-06, + "loss": 0.3163787364959717, + "memory(GiB)": 71.86, + "step": 1310, + "token_acc": 0.9010745466756213, + "train_speed(iter/s)": 0.080962 + }, + { + "epoch": 0.5096035071572166, + "grad_norm": 0.6781684160232544, + "learning_rate": 9.612109037499652e-06, + "loss": 0.3251797199249268, + "memory(GiB)": 71.86, + "step": 1315, + "token_acc": 0.8915072012982622, + "train_speed(iter/s)": 0.080986 + }, + { + "epoch": 0.5115411630779664, + "grad_norm": 0.6502076387405396, + "learning_rate": 9.607973559084403e-06, + "loss": 0.32048649787902833, + "memory(GiB)": 71.86, + "step": 1320, + "token_acc": 0.899069038621071, + "train_speed(iter/s)": 0.080974 + }, + { + "epoch": 0.5134788189987163, + "grad_norm": 0.6232612133026123, + "learning_rate": 9.6038170515813e-06, + "loss": 0.3221144437789917, + "memory(GiB)": 71.86, + "step": 1325, + "token_acc": 0.9006666433981362, + "train_speed(iter/s)": 0.080965 + }, + { + "epoch": 0.5154164749194662, + "grad_norm": 0.6397128701210022, + "learning_rate": 9.599639533959111e-06, + "loss": 0.3275588512420654, + "memory(GiB)": 71.86, + "step": 1330, + "token_acc": 0.8982322652586009, + "train_speed(iter/s)": 0.08096 + }, + { + "epoch": 0.5173541308402161, + "grad_norm": 0.6316452622413635, + "learning_rate": 9.595441025282477e-06, + "loss": 0.32077484130859374, + "memory(GiB)": 71.86, + "step": 1335, + "token_acc": 0.9006897522522522, + "train_speed(iter/s)": 0.080953 + }, + { + "epoch": 0.5192917867609659, + "grad_norm": 0.6124240159988403, + "learning_rate": 9.59122154471184e-06, + "loss": 0.304377007484436, + "memory(GiB)": 71.86, + "step": 1340, + "token_acc": 0.9003951367781156, + "train_speed(iter/s)": 0.080934 + }, + { + "epoch": 0.5212294426817158, + "grad_norm": 0.6277111768722534, + "learning_rate": 9.586981111503352e-06, + "loss": 0.31348342895507814, + "memory(GiB)": 71.86, + "step": 1345, + "token_acc": 0.8995058831893641, + "train_speed(iter/s)": 0.080904 + }, + { + "epoch": 0.5231670986024657, + "grad_norm": 0.6599859595298767, + "learning_rate": 9.58271974500878e-06, + "loss": 0.30618677139282224, + "memory(GiB)": 71.86, + "step": 1350, + "token_acc": 0.8968546885717086, + "train_speed(iter/s)": 0.080888 + }, + { + "epoch": 0.5251047545232156, + "grad_norm": 0.6723609566688538, + "learning_rate": 9.578437464675427e-06, + "loss": 0.31934945583343505, + "memory(GiB)": 71.86, + "step": 1355, + "token_acc": 0.9059823179331934, + "train_speed(iter/s)": 0.080911 + }, + { + "epoch": 0.5270424104439654, + "grad_norm": 0.6316993236541748, + "learning_rate": 9.574134290046038e-06, + "loss": 0.3180837631225586, + "memory(GiB)": 71.86, + "step": 1360, + "token_acc": 0.8795793439020562, + "train_speed(iter/s)": 0.080892 + }, + { + "epoch": 0.5289800663647153, + "grad_norm": 0.6185073852539062, + "learning_rate": 9.56981024075871e-06, + "loss": 0.2968299865722656, + "memory(GiB)": 71.86, + "step": 1365, + "token_acc": 0.8910199696663297, + "train_speed(iter/s)": 0.080903 + }, + { + "epoch": 0.5309177222854652, + "grad_norm": 0.6216744184494019, + "learning_rate": 9.565465336546806e-06, + "loss": 0.3100132942199707, + "memory(GiB)": 71.86, + "step": 1370, + "token_acc": 0.8934845760132798, + "train_speed(iter/s)": 0.080915 + }, + { + "epoch": 0.532855378206215, + "grad_norm": 0.6380418539047241, + "learning_rate": 9.561099597238862e-06, + "loss": 0.3299269199371338, + "memory(GiB)": 71.86, + "step": 1375, + "token_acc": 0.9037136496943571, + "train_speed(iter/s)": 0.080876 + }, + { + "epoch": 0.5347930341269649, + "grad_norm": 0.6195020079612732, + "learning_rate": 9.556713042758496e-06, + "loss": 0.32070245742797854, + "memory(GiB)": 71.86, + "step": 1380, + "token_acc": 0.8869779636681441, + "train_speed(iter/s)": 0.080852 + }, + { + "epoch": 0.5367306900477148, + "grad_norm": 0.6413518190383911, + "learning_rate": 9.552305693124327e-06, + "loss": 0.3137282609939575, + "memory(GiB)": 71.86, + "step": 1385, + "token_acc": 0.8823070702402958, + "train_speed(iter/s)": 0.08086 + }, + { + "epoch": 0.5386683459684647, + "grad_norm": 0.6277745366096497, + "learning_rate": 9.547877568449863e-06, + "loss": 0.3198817253112793, + "memory(GiB)": 71.86, + "step": 1390, + "token_acc": 0.8930678944296738, + "train_speed(iter/s)": 0.080875 + }, + { + "epoch": 0.5406060018892145, + "grad_norm": 0.6188220381736755, + "learning_rate": 9.543428688943432e-06, + "loss": 0.3162529468536377, + "memory(GiB)": 71.86, + "step": 1395, + "token_acc": 0.8938566098081023, + "train_speed(iter/s)": 0.080861 + }, + { + "epoch": 0.5425436578099644, + "grad_norm": 0.6183713674545288, + "learning_rate": 9.538959074908076e-06, + "loss": 0.3134912014007568, + "memory(GiB)": 71.86, + "step": 1400, + "token_acc": 0.8904494382022472, + "train_speed(iter/s)": 0.080874 + }, + { + "epoch": 0.5444813137307143, + "grad_norm": 0.6285673379898071, + "learning_rate": 9.534468746741459e-06, + "loss": 0.31511313915252687, + "memory(GiB)": 71.86, + "step": 1405, + "token_acc": 0.8941470378301214, + "train_speed(iter/s)": 0.080887 + }, + { + "epoch": 0.5464189696514642, + "grad_norm": 0.6295690536499023, + "learning_rate": 9.529957724935778e-06, + "loss": 0.30770626068115237, + "memory(GiB)": 71.86, + "step": 1410, + "token_acc": 0.8990203466465713, + "train_speed(iter/s)": 0.080902 + }, + { + "epoch": 0.548356625572214, + "grad_norm": 0.5778564810752869, + "learning_rate": 9.52542603007767e-06, + "loss": 0.31245245933532717, + "memory(GiB)": 71.86, + "step": 1415, + "token_acc": 0.8955087884310152, + "train_speed(iter/s)": 0.080909 + }, + { + "epoch": 0.5502942814929639, + "grad_norm": 0.6101093292236328, + "learning_rate": 9.520873682848116e-06, + "loss": 0.31894540786743164, + "memory(GiB)": 71.86, + "step": 1420, + "token_acc": 0.897682608558941, + "train_speed(iter/s)": 0.080884 + }, + { + "epoch": 0.5522319374137138, + "grad_norm": 0.6187080144882202, + "learning_rate": 9.516300704022345e-06, + "loss": 0.30878329277038574, + "memory(GiB)": 71.86, + "step": 1425, + "token_acc": 0.9063188446340525, + "train_speed(iter/s)": 0.080887 + }, + { + "epoch": 0.5541695933344636, + "grad_norm": 0.6207754611968994, + "learning_rate": 9.511707114469743e-06, + "loss": 0.31332030296325686, + "memory(GiB)": 71.86, + "step": 1430, + "token_acc": 0.8907917355908015, + "train_speed(iter/s)": 0.080902 + }, + { + "epoch": 0.5561072492552135, + "grad_norm": 0.6338498592376709, + "learning_rate": 9.507092935153753e-06, + "loss": 0.29761881828308107, + "memory(GiB)": 71.86, + "step": 1435, + "token_acc": 0.8983898330307449, + "train_speed(iter/s)": 0.080882 + }, + { + "epoch": 0.5580449051759634, + "grad_norm": 0.5981598496437073, + "learning_rate": 9.502458187131782e-06, + "loss": 0.3157939434051514, + "memory(GiB)": 71.86, + "step": 1440, + "token_acc": 0.8913943563284286, + "train_speed(iter/s)": 0.08089 + }, + { + "epoch": 0.5599825610967133, + "grad_norm": 0.5984019637107849, + "learning_rate": 9.497802891555111e-06, + "loss": 0.30947513580322267, + "memory(GiB)": 71.86, + "step": 1445, + "token_acc": 0.8913408571812441, + "train_speed(iter/s)": 0.080891 + }, + { + "epoch": 0.5619202170174631, + "grad_norm": 0.6401349902153015, + "learning_rate": 9.493127069668786e-06, + "loss": 0.32453346252441406, + "memory(GiB)": 71.86, + "step": 1450, + "token_acc": 0.8906738363300286, + "train_speed(iter/s)": 0.080896 + }, + { + "epoch": 0.563857872938213, + "grad_norm": 0.6067079901695251, + "learning_rate": 9.488430742811527e-06, + "loss": 0.31854901313781736, + "memory(GiB)": 71.86, + "step": 1455, + "token_acc": 0.899044467704359, + "train_speed(iter/s)": 0.080887 + }, + { + "epoch": 0.5657955288589629, + "grad_norm": 0.6511966586112976, + "learning_rate": 9.48371393241564e-06, + "loss": 0.31812138557434083, + "memory(GiB)": 71.86, + "step": 1460, + "token_acc": 0.8997221683788847, + "train_speed(iter/s)": 0.080873 + }, + { + "epoch": 0.5677331847797128, + "grad_norm": 0.6218332648277283, + "learning_rate": 9.478976660006896e-06, + "loss": 0.3028250694274902, + "memory(GiB)": 71.86, + "step": 1465, + "token_acc": 0.8977089383672152, + "train_speed(iter/s)": 0.080891 + }, + { + "epoch": 0.5696708407004626, + "grad_norm": 0.6328048706054688, + "learning_rate": 9.47421894720446e-06, + "loss": 0.3045586109161377, + "memory(GiB)": 71.86, + "step": 1470, + "token_acc": 0.9078662334145681, + "train_speed(iter/s)": 0.0809 + }, + { + "epoch": 0.5716084966212125, + "grad_norm": 0.6221967339515686, + "learning_rate": 9.469440815720774e-06, + "loss": 0.29703731536865235, + "memory(GiB)": 71.86, + "step": 1475, + "token_acc": 0.911037834085387, + "train_speed(iter/s)": 0.080886 + }, + { + "epoch": 0.5735461525419624, + "grad_norm": 0.5945653319358826, + "learning_rate": 9.464642287361463e-06, + "loss": 0.30239651203155515, + "memory(GiB)": 71.86, + "step": 1480, + "token_acc": 0.8957724980090257, + "train_speed(iter/s)": 0.080884 + }, + { + "epoch": 0.5754838084627122, + "grad_norm": 0.6818097829818726, + "learning_rate": 9.459823384025235e-06, + "loss": 0.32621264457702637, + "memory(GiB)": 71.86, + "step": 1485, + "token_acc": 0.8870914053658212, + "train_speed(iter/s)": 0.080899 + }, + { + "epoch": 0.5774214643834621, + "grad_norm": 0.6261887550354004, + "learning_rate": 9.454984127703788e-06, + "loss": 0.31356096267700195, + "memory(GiB)": 71.86, + "step": 1490, + "token_acc": 0.8967860682429563, + "train_speed(iter/s)": 0.080917 + }, + { + "epoch": 0.579359120304212, + "grad_norm": 0.6280757784843445, + "learning_rate": 9.450124540481693e-06, + "loss": 0.3232735633850098, + "memory(GiB)": 71.86, + "step": 1495, + "token_acc": 0.8853289947123056, + "train_speed(iter/s)": 0.080916 + }, + { + "epoch": 0.5812967762249619, + "grad_norm": 0.6291863322257996, + "learning_rate": 9.445244644536314e-06, + "loss": 0.31281461715698244, + "memory(GiB)": 71.86, + "step": 1500, + "token_acc": 0.8985899401701692, + "train_speed(iter/s)": 0.08091 + }, + { + "epoch": 0.5832344321457117, + "grad_norm": 0.6008995175361633, + "learning_rate": 9.44034446213769e-06, + "loss": 0.31398735046386717, + "memory(GiB)": 71.86, + "step": 1505, + "token_acc": 0.903951683605546, + "train_speed(iter/s)": 0.080921 + }, + { + "epoch": 0.5851720880664616, + "grad_norm": 0.6257957220077515, + "learning_rate": 9.43542401564844e-06, + "loss": 0.30848581790924073, + "memory(GiB)": 71.86, + "step": 1510, + "token_acc": 0.9153630929670609, + "train_speed(iter/s)": 0.08092 + }, + { + "epoch": 0.5871097439872115, + "grad_norm": 0.6130756139755249, + "learning_rate": 9.430483327523667e-06, + "loss": 0.3132195949554443, + "memory(GiB)": 71.86, + "step": 1515, + "token_acc": 0.8899323208445322, + "train_speed(iter/s)": 0.080934 + }, + { + "epoch": 0.5890473999079613, + "grad_norm": 0.5911505222320557, + "learning_rate": 9.425522420310845e-06, + "loss": 0.31288986206054686, + "memory(GiB)": 71.86, + "step": 1520, + "token_acc": 0.8933943534004105, + "train_speed(iter/s)": 0.080906 + }, + { + "epoch": 0.5909850558287112, + "grad_norm": 0.586607813835144, + "learning_rate": 9.420541316649718e-06, + "loss": 0.3002540111541748, + "memory(GiB)": 71.86, + "step": 1525, + "token_acc": 0.9010152284263959, + "train_speed(iter/s)": 0.080906 + }, + { + "epoch": 0.5929227117494611, + "grad_norm": 0.6183869242668152, + "learning_rate": 9.415540039272202e-06, + "loss": 0.3158272266387939, + "memory(GiB)": 71.86, + "step": 1530, + "token_acc": 0.89158118146445, + "train_speed(iter/s)": 0.08092 + }, + { + "epoch": 0.594860367670211, + "grad_norm": 0.6333942413330078, + "learning_rate": 9.41051861100228e-06, + "loss": 0.32763597965240476, + "memory(GiB)": 71.86, + "step": 1535, + "token_acc": 0.9017759605480821, + "train_speed(iter/s)": 0.080921 + }, + { + "epoch": 0.5967980235909608, + "grad_norm": 0.6147089004516602, + "learning_rate": 9.405477054755891e-06, + "loss": 0.2997840404510498, + "memory(GiB)": 71.86, + "step": 1540, + "token_acc": 0.8861569340776516, + "train_speed(iter/s)": 0.080923 + }, + { + "epoch": 0.5987356795117107, + "grad_norm": 0.6072155833244324, + "learning_rate": 9.400415393540838e-06, + "loss": 0.3148937225341797, + "memory(GiB)": 71.86, + "step": 1545, + "token_acc": 0.8906319892644897, + "train_speed(iter/s)": 0.080897 + }, + { + "epoch": 0.6006733354324606, + "grad_norm": 0.6179304122924805, + "learning_rate": 9.39533365045667e-06, + "loss": 0.3015735149383545, + "memory(GiB)": 71.86, + "step": 1550, + "token_acc": 0.8949518913931725, + "train_speed(iter/s)": 0.080902 + }, + { + "epoch": 0.6026109913532105, + "grad_norm": 0.6203258037567139, + "learning_rate": 9.390231848694582e-06, + "loss": 0.29992084503173827, + "memory(GiB)": 71.86, + "step": 1555, + "token_acc": 0.8858452722063037, + "train_speed(iter/s)": 0.080887 + }, + { + "epoch": 0.6045486472739603, + "grad_norm": 0.6550023555755615, + "learning_rate": 9.385110011537312e-06, + "loss": 0.3103346347808838, + "memory(GiB)": 71.86, + "step": 1560, + "token_acc": 0.898361172392058, + "train_speed(iter/s)": 0.080915 + }, + { + "epoch": 0.6064863031947102, + "grad_norm": 0.6364780068397522, + "learning_rate": 9.379968162359034e-06, + "loss": 0.3126693248748779, + "memory(GiB)": 71.86, + "step": 1565, + "token_acc": 0.894732145512954, + "train_speed(iter/s)": 0.080924 + }, + { + "epoch": 0.6084239591154601, + "grad_norm": 0.66599041223526, + "learning_rate": 9.374806324625243e-06, + "loss": 0.3166584253311157, + "memory(GiB)": 71.86, + "step": 1570, + "token_acc": 0.8910642910371926, + "train_speed(iter/s)": 0.080926 + }, + { + "epoch": 0.6103616150362099, + "grad_norm": 0.6209014654159546, + "learning_rate": 9.369624521892662e-06, + "loss": 0.31363322734832766, + "memory(GiB)": 71.86, + "step": 1575, + "token_acc": 0.8906361686919227, + "train_speed(iter/s)": 0.080946 + }, + { + "epoch": 0.6122992709569598, + "grad_norm": 0.6511316299438477, + "learning_rate": 9.36442277780912e-06, + "loss": 0.31300342082977295, + "memory(GiB)": 71.86, + "step": 1580, + "token_acc": 0.8923055565196265, + "train_speed(iter/s)": 0.080963 + }, + { + "epoch": 0.6142369268777097, + "grad_norm": 0.5959946513175964, + "learning_rate": 9.359201116113454e-06, + "loss": 0.3122777700424194, + "memory(GiB)": 71.86, + "step": 1585, + "token_acc": 0.8964503956990684, + "train_speed(iter/s)": 0.080963 + }, + { + "epoch": 0.6161745827984596, + "grad_norm": 0.6623050570487976, + "learning_rate": 9.353959560635402e-06, + "loss": 0.3189687252044678, + "memory(GiB)": 71.86, + "step": 1590, + "token_acc": 0.9036677273488528, + "train_speed(iter/s)": 0.080949 + }, + { + "epoch": 0.6181122387192094, + "grad_norm": 0.620650589466095, + "learning_rate": 9.34869813529548e-06, + "loss": 0.3007480621337891, + "memory(GiB)": 71.86, + "step": 1595, + "token_acc": 0.9134024910661624, + "train_speed(iter/s)": 0.08096 + }, + { + "epoch": 0.6200498946399593, + "grad_norm": 0.5833150744438171, + "learning_rate": 9.34341686410489e-06, + "loss": 0.32265076637268064, + "memory(GiB)": 71.86, + "step": 1600, + "token_acc": 0.8827635466380962, + "train_speed(iter/s)": 0.080949 + }, + { + "epoch": 0.6219875505607092, + "grad_norm": 0.6048651337623596, + "learning_rate": 9.338115771165401e-06, + "loss": 0.29888324737548827, + "memory(GiB)": 71.86, + "step": 1605, + "token_acc": 0.9089085583290881, + "train_speed(iter/s)": 0.080933 + }, + { + "epoch": 0.6239252064814591, + "grad_norm": 0.5740781426429749, + "learning_rate": 9.332794880669244e-06, + "loss": 0.2918308019638062, + "memory(GiB)": 71.86, + "step": 1610, + "token_acc": 0.89769777194534, + "train_speed(iter/s)": 0.080956 + }, + { + "epoch": 0.6258628624022089, + "grad_norm": 0.6081269979476929, + "learning_rate": 9.327454216898994e-06, + "loss": 0.3145033597946167, + "memory(GiB)": 71.86, + "step": 1615, + "token_acc": 0.8979991021612261, + "train_speed(iter/s)": 0.08096 + }, + { + "epoch": 0.6278005183229588, + "grad_norm": 0.6427978277206421, + "learning_rate": 9.322093804227467e-06, + "loss": 0.30745735168457033, + "memory(GiB)": 71.86, + "step": 1620, + "token_acc": 0.8982848422519325, + "train_speed(iter/s)": 0.080968 + }, + { + "epoch": 0.6297381742437087, + "grad_norm": 0.6230420470237732, + "learning_rate": 9.316713667117605e-06, + "loss": 0.3176234245300293, + "memory(GiB)": 71.86, + "step": 1625, + "token_acc": 0.8878736748272898, + "train_speed(iter/s)": 0.080967 + }, + { + "epoch": 0.6316758301644585, + "grad_norm": 0.6435239911079407, + "learning_rate": 9.311313830122364e-06, + "loss": 0.3190913200378418, + "memory(GiB)": 71.86, + "step": 1630, + "token_acc": 0.8949252474205096, + "train_speed(iter/s)": 0.080954 + }, + { + "epoch": 0.6336134860852084, + "grad_norm": 0.5868645310401917, + "learning_rate": 9.305894317884603e-06, + "loss": 0.2969323396682739, + "memory(GiB)": 71.86, + "step": 1635, + "token_acc": 0.8913538481556177, + "train_speed(iter/s)": 0.080956 + }, + { + "epoch": 0.6355511420059583, + "grad_norm": 0.5792374610900879, + "learning_rate": 9.300455155136973e-06, + "loss": 0.3106818199157715, + "memory(GiB)": 71.86, + "step": 1640, + "token_acc": 0.907856750068738, + "train_speed(iter/s)": 0.080939 + }, + { + "epoch": 0.6374887979267082, + "grad_norm": 0.6041505932807922, + "learning_rate": 9.294996366701804e-06, + "loss": 0.3139790534973145, + "memory(GiB)": 71.86, + "step": 1645, + "token_acc": 0.8883584511217198, + "train_speed(iter/s)": 0.080926 + }, + { + "epoch": 0.639426453847458, + "grad_norm": 0.6277373433113098, + "learning_rate": 9.289517977490986e-06, + "loss": 0.32369270324707033, + "memory(GiB)": 71.86, + "step": 1650, + "token_acc": 0.8990907470137279, + "train_speed(iter/s)": 0.080935 + }, + { + "epoch": 0.6413641097682079, + "grad_norm": 0.6074517369270325, + "learning_rate": 9.284020012505863e-06, + "loss": 0.317038369178772, + "memory(GiB)": 71.86, + "step": 1655, + "token_acc": 0.895113307539747, + "train_speed(iter/s)": 0.08093 + }, + { + "epoch": 0.6433017656889578, + "grad_norm": 0.6136306524276733, + "learning_rate": 9.278502496837116e-06, + "loss": 0.30502321720123293, + "memory(GiB)": 71.86, + "step": 1660, + "token_acc": 0.8915775068424981, + "train_speed(iter/s)": 0.080933 + }, + { + "epoch": 0.6452394216097077, + "grad_norm": 0.5837361812591553, + "learning_rate": 9.272965455664644e-06, + "loss": 0.31087141036987304, + "memory(GiB)": 71.86, + "step": 1665, + "token_acc": 0.8925676335579646, + "train_speed(iter/s)": 0.080923 + }, + { + "epoch": 0.6471770775304575, + "grad_norm": 0.6269435882568359, + "learning_rate": 9.267408914257459e-06, + "loss": 0.31297247409820556, + "memory(GiB)": 71.86, + "step": 1670, + "token_acc": 0.8990828714700448, + "train_speed(iter/s)": 0.080911 + }, + { + "epoch": 0.6491147334512074, + "grad_norm": 0.6197941899299622, + "learning_rate": 9.261832897973559e-06, + "loss": 0.3180734157562256, + "memory(GiB)": 71.86, + "step": 1675, + "token_acc": 0.892425031396453, + "train_speed(iter/s)": 0.080927 + }, + { + "epoch": 0.6510523893719573, + "grad_norm": 0.6286627650260925, + "learning_rate": 9.256237432259823e-06, + "loss": 0.3299598217010498, + "memory(GiB)": 71.86, + "step": 1680, + "token_acc": 0.8858767912542473, + "train_speed(iter/s)": 0.080926 + }, + { + "epoch": 0.6529900452927071, + "grad_norm": 0.5860146284103394, + "learning_rate": 9.250622542651887e-06, + "loss": 0.31193127632141116, + "memory(GiB)": 71.86, + "step": 1685, + "token_acc": 0.886069178652086, + "train_speed(iter/s)": 0.080942 + }, + { + "epoch": 0.654927701213457, + "grad_norm": 0.6271302103996277, + "learning_rate": 9.244988254774032e-06, + "loss": 0.3232892513275146, + "memory(GiB)": 71.86, + "step": 1690, + "token_acc": 0.8902681549910187, + "train_speed(iter/s)": 0.080933 + }, + { + "epoch": 0.6568653571342069, + "grad_norm": 0.6138412952423096, + "learning_rate": 9.239334594339064e-06, + "loss": 0.3187739849090576, + "memory(GiB)": 71.86, + "step": 1695, + "token_acc": 0.8890871342373321, + "train_speed(iter/s)": 0.080926 + }, + { + "epoch": 0.6588030130549568, + "grad_norm": 0.5884256362915039, + "learning_rate": 9.233661587148202e-06, + "loss": 0.3060622215270996, + "memory(GiB)": 71.86, + "step": 1700, + "token_acc": 0.8999258710155671, + "train_speed(iter/s)": 0.080951 + }, + { + "epoch": 0.6607406689757066, + "grad_norm": 0.5839608311653137, + "learning_rate": 9.22796925909095e-06, + "loss": 0.312208890914917, + "memory(GiB)": 71.86, + "step": 1705, + "token_acc": 0.8906457570553737, + "train_speed(iter/s)": 0.080954 + }, + { + "epoch": 0.6626783248964565, + "grad_norm": 0.657823383808136, + "learning_rate": 9.222257636144992e-06, + "loss": 0.2995912075042725, + "memory(GiB)": 71.86, + "step": 1710, + "token_acc": 0.913647851727043, + "train_speed(iter/s)": 0.080952 + }, + { + "epoch": 0.6646159808172064, + "grad_norm": 0.6004208922386169, + "learning_rate": 9.216526744376059e-06, + "loss": 0.30365538597106934, + "memory(GiB)": 71.86, + "step": 1715, + "token_acc": 0.9049971046087816, + "train_speed(iter/s)": 0.080937 + }, + { + "epoch": 0.6665536367379563, + "grad_norm": 0.6169917583465576, + "learning_rate": 9.21077660993783e-06, + "loss": 0.3203572273254395, + "memory(GiB)": 71.86, + "step": 1720, + "token_acc": 0.896225152265578, + "train_speed(iter/s)": 0.080917 + }, + { + "epoch": 0.6684912926587061, + "grad_norm": 0.5970929265022278, + "learning_rate": 9.205007259071786e-06, + "loss": 0.29948410987854, + "memory(GiB)": 71.86, + "step": 1725, + "token_acc": 0.9039809714571858, + "train_speed(iter/s)": 0.080912 + }, + { + "epoch": 0.670428948579456, + "grad_norm": 0.6267915368080139, + "learning_rate": 9.199218718107115e-06, + "loss": 0.311102294921875, + "memory(GiB)": 71.86, + "step": 1730, + "token_acc": 0.8852009147337472, + "train_speed(iter/s)": 0.080902 + }, + { + "epoch": 0.6723666045002059, + "grad_norm": 0.5979821085929871, + "learning_rate": 9.193411013460576e-06, + "loss": 0.30515599250793457, + "memory(GiB)": 71.86, + "step": 1735, + "token_acc": 0.8829092393522928, + "train_speed(iter/s)": 0.080896 + }, + { + "epoch": 0.6743042604209557, + "grad_norm": 0.6205952763557434, + "learning_rate": 9.187584171636388e-06, + "loss": 0.3166038990020752, + "memory(GiB)": 71.86, + "step": 1740, + "token_acc": 0.889463922460213, + "train_speed(iter/s)": 0.080916 + }, + { + "epoch": 0.6762419163417056, + "grad_norm": 0.598731279373169, + "learning_rate": 9.181738219226102e-06, + "loss": 0.29600186347961427, + "memory(GiB)": 71.86, + "step": 1745, + "token_acc": 0.8940675369246798, + "train_speed(iter/s)": 0.080913 + }, + { + "epoch": 0.6781795722624555, + "grad_norm": 0.6215488314628601, + "learning_rate": 9.175873182908484e-06, + "loss": 0.3002432107925415, + "memory(GiB)": 71.86, + "step": 1750, + "token_acc": 0.9014647976059222, + "train_speed(iter/s)": 0.080921 + }, + { + "epoch": 0.6801172281832054, + "grad_norm": 0.5754505395889282, + "learning_rate": 9.16998908944939e-06, + "loss": 0.3116154670715332, + "memory(GiB)": 71.86, + "step": 1755, + "token_acc": 0.8995418630901003, + "train_speed(iter/s)": 0.08093 + }, + { + "epoch": 0.6820548841039552, + "grad_norm": 0.640917956829071, + "learning_rate": 9.16408596570165e-06, + "loss": 0.31115570068359377, + "memory(GiB)": 71.86, + "step": 1760, + "token_acc": 0.8795623533403071, + "train_speed(iter/s)": 0.08093 + }, + { + "epoch": 0.6839925400247051, + "grad_norm": 0.6518266201019287, + "learning_rate": 9.158163838604938e-06, + "loss": 0.31464262008666993, + "memory(GiB)": 71.86, + "step": 1765, + "token_acc": 0.9010270455323519, + "train_speed(iter/s)": 0.080917 + }, + { + "epoch": 0.685930195945455, + "grad_norm": 0.6728587746620178, + "learning_rate": 9.15222273518565e-06, + "loss": 0.2999051570892334, + "memory(GiB)": 71.86, + "step": 1770, + "token_acc": 0.889860038329752, + "train_speed(iter/s)": 0.080912 + }, + { + "epoch": 0.6878678518662048, + "grad_norm": 0.6566827297210693, + "learning_rate": 9.14626268255679e-06, + "loss": 0.30396265983581544, + "memory(GiB)": 71.86, + "step": 1775, + "token_acc": 0.893242859172943, + "train_speed(iter/s)": 0.080938 + }, + { + "epoch": 0.6898055077869547, + "grad_norm": 0.6036619544029236, + "learning_rate": 9.140283707917831e-06, + "loss": 0.3114708423614502, + "memory(GiB)": 71.86, + "step": 1780, + "token_acc": 0.8952726572034171, + "train_speed(iter/s)": 0.080945 + }, + { + "epoch": 0.6917431637077046, + "grad_norm": 0.6068353652954102, + "learning_rate": 9.134285838554605e-06, + "loss": 0.30329604148864747, + "memory(GiB)": 71.86, + "step": 1785, + "token_acc": 0.8964941042940917, + "train_speed(iter/s)": 0.080946 + }, + { + "epoch": 0.6936808196284545, + "grad_norm": 0.5659881234169006, + "learning_rate": 9.128269101839172e-06, + "loss": 0.31119503974914553, + "memory(GiB)": 71.86, + "step": 1790, + "token_acc": 0.8999454743729552, + "train_speed(iter/s)": 0.080951 + }, + { + "epoch": 0.6956184755492043, + "grad_norm": 0.644329309463501, + "learning_rate": 9.122233525229688e-06, + "loss": 0.32088627815246584, + "memory(GiB)": 71.86, + "step": 1795, + "token_acc": 0.8879273955255382, + "train_speed(iter/s)": 0.080955 + }, + { + "epoch": 0.6975561314699542, + "grad_norm": 0.592437744140625, + "learning_rate": 9.116179136270302e-06, + "loss": 0.2998102903366089, + "memory(GiB)": 71.86, + "step": 1800, + "token_acc": 0.9004172371080253, + "train_speed(iter/s)": 0.080935 + }, + { + "epoch": 0.6994937873907041, + "grad_norm": 0.6425099968910217, + "learning_rate": 9.110105962591e-06, + "loss": 0.30757966041564944, + "memory(GiB)": 71.86, + "step": 1805, + "token_acc": 0.9009111237192537, + "train_speed(iter/s)": 0.080943 + }, + { + "epoch": 0.701431443311454, + "grad_norm": 0.5910810232162476, + "learning_rate": 9.104014031907505e-06, + "loss": 0.31116063594818116, + "memory(GiB)": 71.86, + "step": 1810, + "token_acc": 0.8962950765684176, + "train_speed(iter/s)": 0.080951 + }, + { + "epoch": 0.7033690992322038, + "grad_norm": 0.563534140586853, + "learning_rate": 9.097903372021136e-06, + "loss": 0.30581443309783934, + "memory(GiB)": 71.86, + "step": 1815, + "token_acc": 0.8948527387833746, + "train_speed(iter/s)": 0.080954 + }, + { + "epoch": 0.7053067551529537, + "grad_norm": 0.5983726382255554, + "learning_rate": 9.091774010818686e-06, + "loss": 0.3007395029067993, + "memory(GiB)": 71.86, + "step": 1820, + "token_acc": 0.8795069692431747, + "train_speed(iter/s)": 0.080949 + }, + { + "epoch": 0.7072444110737036, + "grad_norm": 0.5900224447250366, + "learning_rate": 9.085625976272292e-06, + "loss": 0.31903905868530275, + "memory(GiB)": 71.86, + "step": 1825, + "token_acc": 0.8930551027062276, + "train_speed(iter/s)": 0.080938 + }, + { + "epoch": 0.7091820669944534, + "grad_norm": 0.6485769152641296, + "learning_rate": 9.079459296439312e-06, + "loss": 0.3097678184509277, + "memory(GiB)": 71.86, + "step": 1830, + "token_acc": 0.8928302360622803, + "train_speed(iter/s)": 0.080952 + }, + { + "epoch": 0.7111197229152033, + "grad_norm": 0.6100226044654846, + "learning_rate": 9.073273999462194e-06, + "loss": 0.3076323986053467, + "memory(GiB)": 71.86, + "step": 1835, + "token_acc": 0.8870309813572465, + "train_speed(iter/s)": 0.080963 + }, + { + "epoch": 0.7130573788359532, + "grad_norm": 0.6233454346656799, + "learning_rate": 9.067070113568346e-06, + "loss": 0.3112640857696533, + "memory(GiB)": 71.86, + "step": 1840, + "token_acc": 0.9014221634966445, + "train_speed(iter/s)": 0.080965 + }, + { + "epoch": 0.7149950347567031, + "grad_norm": 0.5628553032875061, + "learning_rate": 9.060847667070008e-06, + "loss": 0.3148493766784668, + "memory(GiB)": 71.86, + "step": 1845, + "token_acc": 0.902200413541956, + "train_speed(iter/s)": 0.080974 + }, + { + "epoch": 0.7169326906774529, + "grad_norm": 0.5869155526161194, + "learning_rate": 9.05460668836413e-06, + "loss": 0.31223297119140625, + "memory(GiB)": 71.86, + "step": 1850, + "token_acc": 0.9005300474977628, + "train_speed(iter/s)": 0.08096 + }, + { + "epoch": 0.7188703465982028, + "grad_norm": 0.5835767388343811, + "learning_rate": 9.048347205932227e-06, + "loss": 0.3029788494110107, + "memory(GiB)": 71.86, + "step": 1855, + "token_acc": 0.9045944838885854, + "train_speed(iter/s)": 0.080963 + }, + { + "epoch": 0.7208080025189527, + "grad_norm": 0.5862783193588257, + "learning_rate": 9.042069248340265e-06, + "loss": 0.30313446521759035, + "memory(GiB)": 71.86, + "step": 1860, + "token_acc": 0.9043334435990737, + "train_speed(iter/s)": 0.080964 + }, + { + "epoch": 0.7227456584397026, + "grad_norm": 0.650233268737793, + "learning_rate": 9.03577284423852e-06, + "loss": 0.32649707794189453, + "memory(GiB)": 71.86, + "step": 1865, + "token_acc": 0.8913325107958051, + "train_speed(iter/s)": 0.080966 + }, + { + "epoch": 0.7246833143604524, + "grad_norm": 0.6166769862174988, + "learning_rate": 9.029458022361455e-06, + "loss": 0.3023579835891724, + "memory(GiB)": 71.86, + "step": 1870, + "token_acc": 0.9074721468945013, + "train_speed(iter/s)": 0.080987 + }, + { + "epoch": 0.7266209702812023, + "grad_norm": 0.5706512331962585, + "learning_rate": 9.023124811527582e-06, + "loss": 0.3054157257080078, + "memory(GiB)": 71.86, + "step": 1875, + "token_acc": 0.8847754152606231, + "train_speed(iter/s)": 0.080984 + }, + { + "epoch": 0.7285586262019522, + "grad_norm": 0.6064714789390564, + "learning_rate": 9.016773240639334e-06, + "loss": 0.3063014507293701, + "memory(GiB)": 71.86, + "step": 1880, + "token_acc": 0.908094012236301, + "train_speed(iter/s)": 0.080989 + }, + { + "epoch": 0.730496282122702, + "grad_norm": 0.6083559393882751, + "learning_rate": 9.01040333868293e-06, + "loss": 0.30855863094329833, + "memory(GiB)": 71.86, + "step": 1885, + "token_acc": 0.9015966845503903, + "train_speed(iter/s)": 0.081004 + }, + { + "epoch": 0.7324339380434519, + "grad_norm": 0.595201849937439, + "learning_rate": 9.004015134728252e-06, + "loss": 0.30145509243011476, + "memory(GiB)": 71.86, + "step": 1890, + "token_acc": 0.8991807826626196, + "train_speed(iter/s)": 0.081022 + }, + { + "epoch": 0.7343715939642018, + "grad_norm": 0.6078715920448303, + "learning_rate": 8.997608657928698e-06, + "loss": 0.30652527809143065, + "memory(GiB)": 71.86, + "step": 1895, + "token_acc": 0.8993735064264031, + "train_speed(iter/s)": 0.081038 + }, + { + "epoch": 0.7363092498849517, + "grad_norm": 0.6498925685882568, + "learning_rate": 8.99118393752106e-06, + "loss": 0.3331868648529053, + "memory(GiB)": 71.86, + "step": 1900, + "token_acc": 0.8941127377681956, + "train_speed(iter/s)": 0.081032 + }, + { + "epoch": 0.7382469058057015, + "grad_norm": 0.6449311971664429, + "learning_rate": 8.98474100282539e-06, + "loss": 0.3127185344696045, + "memory(GiB)": 71.86, + "step": 1905, + "token_acc": 0.8932032412766661, + "train_speed(iter/s)": 0.08104 + }, + { + "epoch": 0.7401845617264514, + "grad_norm": 0.5712651014328003, + "learning_rate": 8.978279883244855e-06, + "loss": 0.3024605274200439, + "memory(GiB)": 71.86, + "step": 1910, + "token_acc": 0.8927019331634947, + "train_speed(iter/s)": 0.081039 + }, + { + "epoch": 0.7421222176472013, + "grad_norm": 0.636428713798523, + "learning_rate": 8.971800608265621e-06, + "loss": 0.3047468662261963, + "memory(GiB)": 71.86, + "step": 1915, + "token_acc": 0.895410208444682, + "train_speed(iter/s)": 0.081041 + }, + { + "epoch": 0.7440598735679512, + "grad_norm": 0.6133522987365723, + "learning_rate": 8.965303207456702e-06, + "loss": 0.28756425380706785, + "memory(GiB)": 71.86, + "step": 1920, + "token_acc": 0.9078662897056378, + "train_speed(iter/s)": 0.081048 + }, + { + "epoch": 0.745997529488701, + "grad_norm": 0.581291913986206, + "learning_rate": 8.958787710469832e-06, + "loss": 0.3084988832473755, + "memory(GiB)": 71.86, + "step": 1925, + "token_acc": 0.9017774896796424, + "train_speed(iter/s)": 0.081057 + }, + { + "epoch": 0.7479351854094509, + "grad_norm": 0.6467372179031372, + "learning_rate": 8.95225414703933e-06, + "loss": 0.3201260805130005, + "memory(GiB)": 71.86, + "step": 1930, + "token_acc": 0.8919000632163373, + "train_speed(iter/s)": 0.081043 + }, + { + "epoch": 0.7498728413302008, + "grad_norm": 0.6468227505683899, + "learning_rate": 8.94570254698197e-06, + "loss": 0.31164610385894775, + "memory(GiB)": 71.86, + "step": 1935, + "token_acc": 0.900514465156678, + "train_speed(iter/s)": 0.081051 + }, + { + "epoch": 0.7518104972509506, + "grad_norm": 0.6398622989654541, + "learning_rate": 8.939132940196825e-06, + "loss": 0.32195866107940674, + "memory(GiB)": 71.86, + "step": 1940, + "token_acc": 0.8985073645449764, + "train_speed(iter/s)": 0.081062 + }, + { + "epoch": 0.7537481531717005, + "grad_norm": 0.5821054577827454, + "learning_rate": 8.932545356665157e-06, + "loss": 0.2912397861480713, + "memory(GiB)": 71.86, + "step": 1945, + "token_acc": 0.8943398638499153, + "train_speed(iter/s)": 0.081049 + }, + { + "epoch": 0.7556858090924504, + "grad_norm": 0.6212338209152222, + "learning_rate": 8.925939826450259e-06, + "loss": 0.2999946117401123, + "memory(GiB)": 71.86, + "step": 1950, + "token_acc": 0.9014343865090134, + "train_speed(iter/s)": 0.081047 + }, + { + "epoch": 0.7576234650132003, + "grad_norm": 0.5945454239845276, + "learning_rate": 8.919316379697331e-06, + "loss": 0.3051981687545776, + "memory(GiB)": 71.86, + "step": 1955, + "token_acc": 0.9032103497843795, + "train_speed(iter/s)": 0.08103 + }, + { + "epoch": 0.7595611209339501, + "grad_norm": 0.6175397038459778, + "learning_rate": 8.912675046633336e-06, + "loss": 0.3095961093902588, + "memory(GiB)": 71.86, + "step": 1960, + "token_acc": 0.9021332892343311, + "train_speed(iter/s)": 0.08103 + }, + { + "epoch": 0.7614987768547, + "grad_norm": 0.5822819471359253, + "learning_rate": 8.90601585756686e-06, + "loss": 0.302655553817749, + "memory(GiB)": 71.86, + "step": 1965, + "token_acc": 0.8917866492146597, + "train_speed(iter/s)": 0.081012 + }, + { + "epoch": 0.7634364327754499, + "grad_norm": 0.5745783448219299, + "learning_rate": 8.899338842887982e-06, + "loss": 0.31560654640197755, + "memory(GiB)": 71.86, + "step": 1970, + "token_acc": 0.8880744017825427, + "train_speed(iter/s)": 0.081022 + }, + { + "epoch": 0.7653740886961997, + "grad_norm": 0.5756574273109436, + "learning_rate": 8.892644033068128e-06, + "loss": 0.29446749687194823, + "memory(GiB)": 71.86, + "step": 1975, + "token_acc": 0.9025089605734767, + "train_speed(iter/s)": 0.081021 + }, + { + "epoch": 0.7673117446169496, + "grad_norm": 0.602070689201355, + "learning_rate": 8.885931458659936e-06, + "loss": 0.28977217674255373, + "memory(GiB)": 71.86, + "step": 1980, + "token_acc": 0.9017447199265382, + "train_speed(iter/s)": 0.081044 + }, + { + "epoch": 0.7692494005376995, + "grad_norm": 0.6217750310897827, + "learning_rate": 8.879201150297114e-06, + "loss": 0.3090545654296875, + "memory(GiB)": 71.86, + "step": 1985, + "token_acc": 0.8897201527577787, + "train_speed(iter/s)": 0.081059 + }, + { + "epoch": 0.7711870564584494, + "grad_norm": 0.6141068935394287, + "learning_rate": 8.8724531386943e-06, + "loss": 0.316222620010376, + "memory(GiB)": 71.86, + "step": 1990, + "token_acc": 0.9057566110064323, + "train_speed(iter/s)": 0.081058 + }, + { + "epoch": 0.7731247123791992, + "grad_norm": 0.6178670525550842, + "learning_rate": 8.865687454646925e-06, + "loss": 0.30695629119873047, + "memory(GiB)": 71.86, + "step": 1995, + "token_acc": 0.902152258401646, + "train_speed(iter/s)": 0.081049 + }, + { + "epoch": 0.7750623682999491, + "grad_norm": 0.5628589987754822, + "learning_rate": 8.858904129031072e-06, + "loss": 0.30267930030822754, + "memory(GiB)": 71.86, + "step": 2000, + "token_acc": 0.889784859529723, + "train_speed(iter/s)": 0.081069 + }, + { + "epoch": 0.777000024220699, + "grad_norm": 0.610127329826355, + "learning_rate": 8.852103192803328e-06, + "loss": 0.3077036142349243, + "memory(GiB)": 71.86, + "step": 2005, + "token_acc": 0.9019208964183285, + "train_speed(iter/s)": 0.081069 + }, + { + "epoch": 0.7789376801414489, + "grad_norm": 0.6629402041435242, + "learning_rate": 8.845284677000652e-06, + "loss": 0.3216689586639404, + "memory(GiB)": 71.86, + "step": 2010, + "token_acc": 0.902394461005866, + "train_speed(iter/s)": 0.081073 + }, + { + "epoch": 0.7808753360621987, + "grad_norm": 0.6259828209877014, + "learning_rate": 8.838448612740232e-06, + "loss": 0.30659077167510984, + "memory(GiB)": 71.86, + "step": 2015, + "token_acc": 0.8962287480680062, + "train_speed(iter/s)": 0.081043 + }, + { + "epoch": 0.7828129919829486, + "grad_norm": 0.6070216298103333, + "learning_rate": 8.831595031219337e-06, + "loss": 0.31620666980743406, + "memory(GiB)": 71.86, + "step": 2020, + "token_acc": 0.8867298426980632, + "train_speed(iter/s)": 0.081035 + }, + { + "epoch": 0.7847506479036985, + "grad_norm": 0.5795909762382507, + "learning_rate": 8.82472396371518e-06, + "loss": 0.30554752349853515, + "memory(GiB)": 71.86, + "step": 2025, + "token_acc": 0.8927852041750595, + "train_speed(iter/s)": 0.08103 + }, + { + "epoch": 0.7866883038244483, + "grad_norm": 0.5747736096382141, + "learning_rate": 8.817835441584772e-06, + "loss": 0.30044281482696533, + "memory(GiB)": 71.86, + "step": 2030, + "token_acc": 0.9013870751420605, + "train_speed(iter/s)": 0.081052 + }, + { + "epoch": 0.7886259597451982, + "grad_norm": 0.621019721031189, + "learning_rate": 8.810929496264783e-06, + "loss": 0.2906287670135498, + "memory(GiB)": 71.86, + "step": 2035, + "token_acc": 0.8963844797178131, + "train_speed(iter/s)": 0.081046 + }, + { + "epoch": 0.7905636156659481, + "grad_norm": 0.6499260663986206, + "learning_rate": 8.80400615927139e-06, + "loss": 0.3061370372772217, + "memory(GiB)": 71.86, + "step": 2040, + "token_acc": 0.9048953721693156, + "train_speed(iter/s)": 0.081038 + }, + { + "epoch": 0.792501271586698, + "grad_norm": 0.6083590388298035, + "learning_rate": 8.79706546220015e-06, + "loss": 0.2965734004974365, + "memory(GiB)": 71.86, + "step": 2045, + "token_acc": 0.9033915001407261, + "train_speed(iter/s)": 0.081037 + }, + { + "epoch": 0.7944389275074478, + "grad_norm": 0.6360229849815369, + "learning_rate": 8.790107436725834e-06, + "loss": 0.3123593807220459, + "memory(GiB)": 71.86, + "step": 2050, + "token_acc": 0.9031104403176061, + "train_speed(iter/s)": 0.081028 + }, + { + "epoch": 0.7963765834281977, + "grad_norm": 0.5798035860061646, + "learning_rate": 8.783132114602299e-06, + "loss": 0.3108199596405029, + "memory(GiB)": 71.86, + "step": 2055, + "token_acc": 0.8884368108830015, + "train_speed(iter/s)": 0.081 + }, + { + "epoch": 0.7983142393489476, + "grad_norm": 0.6013064980506897, + "learning_rate": 8.77613952766234e-06, + "loss": 0.30339345932006834, + "memory(GiB)": 71.86, + "step": 2060, + "token_acc": 0.9030202185106115, + "train_speed(iter/s)": 0.08099 + }, + { + "epoch": 0.8002518952696975, + "grad_norm": 0.6271098256111145, + "learning_rate": 8.769129707817532e-06, + "loss": 0.3082975149154663, + "memory(GiB)": 71.86, + "step": 2065, + "token_acc": 0.8864762668652783, + "train_speed(iter/s)": 0.080983 + }, + { + "epoch": 0.8021895511904473, + "grad_norm": 0.5560224652290344, + "learning_rate": 8.762102687058108e-06, + "loss": 0.29483623504638673, + "memory(GiB)": 71.86, + "step": 2070, + "token_acc": 0.9096249912052347, + "train_speed(iter/s)": 0.081 + }, + { + "epoch": 0.8041272071111972, + "grad_norm": 0.5846743583679199, + "learning_rate": 8.755058497452789e-06, + "loss": 0.30487353801727296, + "memory(GiB)": 71.86, + "step": 2075, + "token_acc": 0.9014453781512605, + "train_speed(iter/s)": 0.08099 + }, + { + "epoch": 0.8060648630319471, + "grad_norm": 0.6149039268493652, + "learning_rate": 8.747997171148655e-06, + "loss": 0.3211073398590088, + "memory(GiB)": 71.86, + "step": 2080, + "token_acc": 0.8910502310689582, + "train_speed(iter/s)": 0.080987 + }, + { + "epoch": 0.8080025189526969, + "grad_norm": 0.5836396217346191, + "learning_rate": 8.740918740370985e-06, + "loss": 0.2896585464477539, + "memory(GiB)": 71.86, + "step": 2085, + "token_acc": 0.8984591194968553, + "train_speed(iter/s)": 0.081002 + }, + { + "epoch": 0.8099401748734468, + "grad_norm": 0.5869709849357605, + "learning_rate": 8.733823237423124e-06, + "loss": 0.31051561832427976, + "memory(GiB)": 71.86, + "step": 2090, + "token_acc": 0.9037916430453197, + "train_speed(iter/s)": 0.08102 + }, + { + "epoch": 0.8118778307941967, + "grad_norm": 0.5999536514282227, + "learning_rate": 8.726710694686324e-06, + "loss": 0.3082785129547119, + "memory(GiB)": 71.86, + "step": 2095, + "token_acc": 0.8980618292487587, + "train_speed(iter/s)": 0.081019 + }, + { + "epoch": 0.8138154867149466, + "grad_norm": 0.6493249535560608, + "learning_rate": 8.719581144619598e-06, + "loss": 0.31282968521118165, + "memory(GiB)": 71.86, + "step": 2100, + "token_acc": 0.898483664217602, + "train_speed(iter/s)": 0.081029 + }, + { + "epoch": 0.8157531426356964, + "grad_norm": 0.5943895578384399, + "learning_rate": 8.71243461975958e-06, + "loss": 0.32207574844360354, + "memory(GiB)": 71.86, + "step": 2105, + "token_acc": 0.9011841841172141, + "train_speed(iter/s)": 0.08103 + }, + { + "epoch": 0.8176907985564463, + "grad_norm": 0.60904461145401, + "learning_rate": 8.705271152720364e-06, + "loss": 0.31167449951171877, + "memory(GiB)": 71.86, + "step": 2110, + "token_acc": 0.890208345987973, + "train_speed(iter/s)": 0.081006 + }, + { + "epoch": 0.8196284544771962, + "grad_norm": 0.6434164643287659, + "learning_rate": 8.698090776193371e-06, + "loss": 0.3102255821228027, + "memory(GiB)": 71.86, + "step": 2115, + "token_acc": 0.903289090518959, + "train_speed(iter/s)": 0.081001 + }, + { + "epoch": 0.8215661103979461, + "grad_norm": 0.6066402792930603, + "learning_rate": 8.690893522947179e-06, + "loss": 0.3043588876724243, + "memory(GiB)": 71.86, + "step": 2120, + "token_acc": 0.8837515642494895, + "train_speed(iter/s)": 0.081012 + }, + { + "epoch": 0.8235037663186959, + "grad_norm": 0.6076641082763672, + "learning_rate": 8.683679425827395e-06, + "loss": 0.30590732097625734, + "memory(GiB)": 71.86, + "step": 2125, + "token_acc": 0.9053521668108004, + "train_speed(iter/s)": 0.081016 + }, + { + "epoch": 0.8254414222394458, + "grad_norm": 0.6392712593078613, + "learning_rate": 8.676448517756489e-06, + "loss": 0.3209038019180298, + "memory(GiB)": 71.86, + "step": 2130, + "token_acc": 0.8917396542180835, + "train_speed(iter/s)": 0.081003 + }, + { + "epoch": 0.8273790781601957, + "grad_norm": 0.6290078163146973, + "learning_rate": 8.669200831733655e-06, + "loss": 0.3171893835067749, + "memory(GiB)": 71.86, + "step": 2135, + "token_acc": 0.8987797088782062, + "train_speed(iter/s)": 0.081001 + }, + { + "epoch": 0.8293167340809455, + "grad_norm": 0.6287849545478821, + "learning_rate": 8.66193640083465e-06, + "loss": 0.30498471260070803, + "memory(GiB)": 71.86, + "step": 2140, + "token_acc": 0.893487338362069, + "train_speed(iter/s)": 0.08099 + }, + { + "epoch": 0.8312543900016954, + "grad_norm": 0.6079146265983582, + "learning_rate": 8.654655258211652e-06, + "loss": 0.31498687267303466, + "memory(GiB)": 71.86, + "step": 2145, + "token_acc": 0.8881385789782736, + "train_speed(iter/s)": 0.080992 + }, + { + "epoch": 0.8331920459224453, + "grad_norm": 0.5923950672149658, + "learning_rate": 8.647357437093104e-06, + "loss": 0.29389705657958987, + "memory(GiB)": 71.86, + "step": 2150, + "token_acc": 0.9051536737901742, + "train_speed(iter/s)": 0.081008 + }, + { + "epoch": 0.8351297018431952, + "grad_norm": 0.5749879479408264, + "learning_rate": 8.640042970783567e-06, + "loss": 0.31409273147583006, + "memory(GiB)": 71.86, + "step": 2155, + "token_acc": 0.902754901010884, + "train_speed(iter/s)": 0.081029 + }, + { + "epoch": 0.837067357763945, + "grad_norm": 0.6237610578536987, + "learning_rate": 8.632711892663554e-06, + "loss": 0.3192549705505371, + "memory(GiB)": 71.86, + "step": 2160, + "token_acc": 0.8993284397938466, + "train_speed(iter/s)": 0.081045 + }, + { + "epoch": 0.8390050136846949, + "grad_norm": 0.6099810600280762, + "learning_rate": 8.625364236189405e-06, + "loss": 0.32326383590698243, + "memory(GiB)": 71.86, + "step": 2165, + "token_acc": 0.8934105339342698, + "train_speed(iter/s)": 0.081057 + }, + { + "epoch": 0.8409426696054448, + "grad_norm": 0.6087759733200073, + "learning_rate": 8.6180000348931e-06, + "loss": 0.3070503234863281, + "memory(GiB)": 71.86, + "step": 2170, + "token_acc": 0.8968854051054383, + "train_speed(iter/s)": 0.08106 + }, + { + "epoch": 0.8428803255261947, + "grad_norm": 0.588354766368866, + "learning_rate": 8.610619322382133e-06, + "loss": 0.2942917823791504, + "memory(GiB)": 71.86, + "step": 2175, + "token_acc": 0.9020420697067404, + "train_speed(iter/s)": 0.081055 + }, + { + "epoch": 0.8448179814469445, + "grad_norm": 0.5909959673881531, + "learning_rate": 8.603222132339348e-06, + "loss": 0.3080431461334229, + "memory(GiB)": 71.86, + "step": 2180, + "token_acc": 0.9125050477857046, + "train_speed(iter/s)": 0.081058 + }, + { + "epoch": 0.8467556373676944, + "grad_norm": 0.5789511203765869, + "learning_rate": 8.595808498522788e-06, + "loss": 0.3041903257369995, + "memory(GiB)": 71.86, + "step": 2185, + "token_acc": 0.9008222686526925, + "train_speed(iter/s)": 0.081068 + }, + { + "epoch": 0.8486932932884443, + "grad_norm": 0.6096296906471252, + "learning_rate": 8.588378454765535e-06, + "loss": 0.30947365760803225, + "memory(GiB)": 71.86, + "step": 2190, + "token_acc": 0.8935091277890467, + "train_speed(iter/s)": 0.08106 + }, + { + "epoch": 0.8506309492091941, + "grad_norm": 0.6188230514526367, + "learning_rate": 8.580932034975563e-06, + "loss": 0.31347320079803465, + "memory(GiB)": 71.86, + "step": 2195, + "token_acc": 0.8942014242115972, + "train_speed(iter/s)": 0.081054 + }, + { + "epoch": 0.852568605129944, + "grad_norm": 0.590366005897522, + "learning_rate": 8.573469273135578e-06, + "loss": 0.3038686752319336, + "memory(GiB)": 71.86, + "step": 2200, + "token_acc": 0.8942797934278478, + "train_speed(iter/s)": 0.081049 + }, + { + "epoch": 0.8545062610506939, + "grad_norm": 0.5962342023849487, + "learning_rate": 8.565990203302866e-06, + "loss": 0.3185643196105957, + "memory(GiB)": 71.86, + "step": 2205, + "token_acc": 0.8901260899567154, + "train_speed(iter/s)": 0.081039 + }, + { + "epoch": 0.8564439169714438, + "grad_norm": 0.6078639030456543, + "learning_rate": 8.558494859609137e-06, + "loss": 0.3044931650161743, + "memory(GiB)": 71.86, + "step": 2210, + "token_acc": 0.8968926976897492, + "train_speed(iter/s)": 0.081023 + }, + { + "epoch": 0.8583815728921936, + "grad_norm": 0.5709052681922913, + "learning_rate": 8.55098327626037e-06, + "loss": 0.29850192070007325, + "memory(GiB)": 71.86, + "step": 2215, + "token_acc": 0.8929022936608272, + "train_speed(iter/s)": 0.081006 + }, + { + "epoch": 0.8603192288129435, + "grad_norm": 0.5804421305656433, + "learning_rate": 8.543455487536654e-06, + "loss": 0.3016526222229004, + "memory(GiB)": 71.86, + "step": 2220, + "token_acc": 0.8973657201505303, + "train_speed(iter/s)": 0.081011 + }, + { + "epoch": 0.8622568847336934, + "grad_norm": 0.5794805884361267, + "learning_rate": 8.535911527792032e-06, + "loss": 0.29973387718200684, + "memory(GiB)": 71.86, + "step": 2225, + "token_acc": 0.8946569324202094, + "train_speed(iter/s)": 0.081003 + }, + { + "epoch": 0.8641945406544432, + "grad_norm": 0.5984667539596558, + "learning_rate": 8.528351431454352e-06, + "loss": 0.31471891403198243, + "memory(GiB)": 71.86, + "step": 2230, + "token_acc": 0.9035908316456103, + "train_speed(iter/s)": 0.080987 + }, + { + "epoch": 0.8661321965751931, + "grad_norm": 0.599163830280304, + "learning_rate": 8.520775233025094e-06, + "loss": 0.28800148963928224, + "memory(GiB)": 71.86, + "step": 2235, + "token_acc": 0.9002882990356894, + "train_speed(iter/s)": 0.080985 + }, + { + "epoch": 0.868069852495943, + "grad_norm": 0.5712947249412537, + "learning_rate": 8.513182967079228e-06, + "loss": 0.31661105155944824, + "memory(GiB)": 71.86, + "step": 2240, + "token_acc": 0.890927432066074, + "train_speed(iter/s)": 0.081 + }, + { + "epoch": 0.8700075084166929, + "grad_norm": 0.6933742165565491, + "learning_rate": 8.50557466826505e-06, + "loss": 0.2999523878097534, + "memory(GiB)": 71.86, + "step": 2245, + "token_acc": 0.8946842250413679, + "train_speed(iter/s)": 0.081019 + }, + { + "epoch": 0.8719451643374427, + "grad_norm": 0.5995941162109375, + "learning_rate": 8.497950371304025e-06, + "loss": 0.3046144962310791, + "memory(GiB)": 71.86, + "step": 2250, + "token_acc": 0.8875266849649284, + "train_speed(iter/s)": 0.081015 + }, + { + "epoch": 0.8738828202581926, + "grad_norm": 0.6179806590080261, + "learning_rate": 8.490310110990623e-06, + "loss": 0.30069704055786134, + "memory(GiB)": 71.86, + "step": 2255, + "token_acc": 0.8802156301394896, + "train_speed(iter/s)": 0.081008 + }, + { + "epoch": 0.8758204761789425, + "grad_norm": 0.6140835881233215, + "learning_rate": 8.482653922192169e-06, + "loss": 0.3018641948699951, + "memory(GiB)": 71.86, + "step": 2260, + "token_acc": 0.908906160867085, + "train_speed(iter/s)": 0.080993 + }, + { + "epoch": 0.8777581320996924, + "grad_norm": 0.5812036991119385, + "learning_rate": 8.474981839848675e-06, + "loss": 0.29200072288513185, + "memory(GiB)": 71.86, + "step": 2265, + "token_acc": 0.9004617925641285, + "train_speed(iter/s)": 0.080992 + }, + { + "epoch": 0.8796957880204422, + "grad_norm": 0.6047909259796143, + "learning_rate": 8.467293898972694e-06, + "loss": 0.29525227546691896, + "memory(GiB)": 71.86, + "step": 2270, + "token_acc": 0.903614068010483, + "train_speed(iter/s)": 0.080993 + }, + { + "epoch": 0.8816334439411921, + "grad_norm": 0.6456109881401062, + "learning_rate": 8.459590134649143e-06, + "loss": 0.3068693161010742, + "memory(GiB)": 71.86, + "step": 2275, + "token_acc": 0.89543441552235, + "train_speed(iter/s)": 0.080997 + }, + { + "epoch": 0.883571099861942, + "grad_norm": 0.6033593416213989, + "learning_rate": 8.451870582035155e-06, + "loss": 0.3062614917755127, + "memory(GiB)": 71.86, + "step": 2280, + "token_acc": 0.8919612576002933, + "train_speed(iter/s)": 0.08101 + }, + { + "epoch": 0.8855087557826918, + "grad_norm": 0.6340951323509216, + "learning_rate": 8.444135276359913e-06, + "loss": 0.2958319902420044, + "memory(GiB)": 71.86, + "step": 2285, + "token_acc": 0.9055605173497403, + "train_speed(iter/s)": 0.080999 + }, + { + "epoch": 0.8874464117034417, + "grad_norm": 0.5683568120002747, + "learning_rate": 8.436384252924496e-06, + "loss": 0.2960463047027588, + "memory(GiB)": 71.86, + "step": 2290, + "token_acc": 0.8960641209435684, + "train_speed(iter/s)": 0.081004 + }, + { + "epoch": 0.8893840676241916, + "grad_norm": 0.6018267273902893, + "learning_rate": 8.428617547101707e-06, + "loss": 0.3079716682434082, + "memory(GiB)": 71.86, + "step": 2295, + "token_acc": 0.8969420906320126, + "train_speed(iter/s)": 0.081009 + }, + { + "epoch": 0.8913217235449415, + "grad_norm": 0.5824874639511108, + "learning_rate": 8.42083519433592e-06, + "loss": 0.308961820602417, + "memory(GiB)": 71.86, + "step": 2300, + "token_acc": 0.9047845105286358, + "train_speed(iter/s)": 0.081008 + }, + { + "epoch": 0.8932593794656913, + "grad_norm": 0.6830450892448425, + "learning_rate": 8.413037230142916e-06, + "loss": 0.3022101402282715, + "memory(GiB)": 71.86, + "step": 2305, + "token_acc": 0.8928808351663698, + "train_speed(iter/s)": 0.08102 + }, + { + "epoch": 0.8951970353864412, + "grad_norm": 0.607520580291748, + "learning_rate": 8.405223690109723e-06, + "loss": 0.3095353126525879, + "memory(GiB)": 71.86, + "step": 2310, + "token_acc": 0.8990780317213607, + "train_speed(iter/s)": 0.081012 + }, + { + "epoch": 0.8971346913071911, + "grad_norm": 0.5952607989311218, + "learning_rate": 8.397394609894446e-06, + "loss": 0.2872850656509399, + "memory(GiB)": 71.86, + "step": 2315, + "token_acc": 0.9125979306469207, + "train_speed(iter/s)": 0.08101 + }, + { + "epoch": 0.899072347227941, + "grad_norm": 0.586456298828125, + "learning_rate": 8.389550025226117e-06, + "loss": 0.30147950649261473, + "memory(GiB)": 71.86, + "step": 2320, + "token_acc": 0.8989215036124394, + "train_speed(iter/s)": 0.081001 + }, + { + "epoch": 0.9010100031486908, + "grad_norm": 0.5853320956230164, + "learning_rate": 8.381689971904514e-06, + "loss": 0.30711946487426756, + "memory(GiB)": 71.86, + "step": 2325, + "token_acc": 0.8965238741358145, + "train_speed(iter/s)": 0.081013 + }, + { + "epoch": 0.9029476590694407, + "grad_norm": 0.5991949439048767, + "learning_rate": 8.373814485800022e-06, + "loss": 0.301871132850647, + "memory(GiB)": 71.86, + "step": 2330, + "token_acc": 0.9006086506086506, + "train_speed(iter/s)": 0.081022 + }, + { + "epoch": 0.9048853149901906, + "grad_norm": 0.5821011066436768, + "learning_rate": 8.365923602853444e-06, + "loss": 0.3028329133987427, + "memory(GiB)": 71.86, + "step": 2335, + "token_acc": 0.8980891719745223, + "train_speed(iter/s)": 0.081012 + }, + { + "epoch": 0.9068229709109404, + "grad_norm": 0.5990322232246399, + "learning_rate": 8.358017359075854e-06, + "loss": 0.2973571062088013, + "memory(GiB)": 71.86, + "step": 2340, + "token_acc": 0.8983305073692448, + "train_speed(iter/s)": 0.081021 + }, + { + "epoch": 0.9087606268316903, + "grad_norm": 0.6502439379692078, + "learning_rate": 8.350095790548424e-06, + "loss": 0.30336899757385255, + "memory(GiB)": 71.86, + "step": 2345, + "token_acc": 0.8919290834613415, + "train_speed(iter/s)": 0.081011 + }, + { + "epoch": 0.9106982827524402, + "grad_norm": 0.587325930595398, + "learning_rate": 8.342158933422266e-06, + "loss": 0.29255731105804444, + "memory(GiB)": 71.86, + "step": 2350, + "token_acc": 0.9055942855180467, + "train_speed(iter/s)": 0.081013 + }, + { + "epoch": 0.9126359386731902, + "grad_norm": 0.5987265706062317, + "learning_rate": 8.334206823918262e-06, + "loss": 0.31260898113250735, + "memory(GiB)": 71.86, + "step": 2355, + "token_acc": 0.8882655446470313, + "train_speed(iter/s)": 0.081013 + }, + { + "epoch": 0.9145735945939399, + "grad_norm": 0.5997234582901001, + "learning_rate": 8.3262394983269e-06, + "loss": 0.3038615703582764, + "memory(GiB)": 71.86, + "step": 2360, + "token_acc": 0.8856326635424022, + "train_speed(iter/s)": 0.081018 + }, + { + "epoch": 0.9165112505146898, + "grad_norm": 0.6047623157501221, + "learning_rate": 8.318256993008108e-06, + "loss": 0.2977303981781006, + "memory(GiB)": 71.86, + "step": 2365, + "token_acc": 0.8961500928636635, + "train_speed(iter/s)": 0.081026 + }, + { + "epoch": 0.9184489064354397, + "grad_norm": 0.605626106262207, + "learning_rate": 8.31025934439109e-06, + "loss": 0.312529468536377, + "memory(GiB)": 71.86, + "step": 2370, + "token_acc": 0.9005831363278172, + "train_speed(iter/s)": 0.081024 + }, + { + "epoch": 0.9203865623561897, + "grad_norm": 0.597262978553772, + "learning_rate": 8.302246588974156e-06, + "loss": 0.28765239715576174, + "memory(GiB)": 71.86, + "step": 2375, + "token_acc": 0.9029015993856222, + "train_speed(iter/s)": 0.081019 + }, + { + "epoch": 0.9223242182769394, + "grad_norm": 0.563490629196167, + "learning_rate": 8.29421876332456e-06, + "loss": 0.2911080837249756, + "memory(GiB)": 71.86, + "step": 2380, + "token_acc": 0.9119191065074164, + "train_speed(iter/s)": 0.081032 + }, + { + "epoch": 0.9242618741976893, + "grad_norm": 0.6514439582824707, + "learning_rate": 8.286175904078333e-06, + "loss": 0.31125853061676023, + "memory(GiB)": 71.86, + "step": 2385, + "token_acc": 0.9083231470163351, + "train_speed(iter/s)": 0.081045 + }, + { + "epoch": 0.9261995301184393, + "grad_norm": 0.58599454164505, + "learning_rate": 8.27811804794011e-06, + "loss": 0.30330696105957033, + "memory(GiB)": 71.86, + "step": 2390, + "token_acc": 0.894368, + "train_speed(iter/s)": 0.08105 + }, + { + "epoch": 0.928137186039189, + "grad_norm": 0.5928026437759399, + "learning_rate": 8.270045231682966e-06, + "loss": 0.3130389928817749, + "memory(GiB)": 71.86, + "step": 2395, + "token_acc": 0.902063969382176, + "train_speed(iter/s)": 0.081042 + }, + { + "epoch": 0.930074841959939, + "grad_norm": 0.5734291076660156, + "learning_rate": 8.261957492148252e-06, + "loss": 0.2945571422576904, + "memory(GiB)": 71.86, + "step": 2400, + "token_acc": 0.9001185351550315, + "train_speed(iter/s)": 0.081039 + }, + { + "epoch": 0.9320124978806889, + "grad_norm": 0.5834051966667175, + "learning_rate": 8.253854866245421e-06, + "loss": 0.30569703578948976, + "memory(GiB)": 71.86, + "step": 2405, + "token_acc": 0.9009355543240865, + "train_speed(iter/s)": 0.081039 + }, + { + "epoch": 0.9339501538014388, + "grad_norm": 0.5792734026908875, + "learning_rate": 8.245737390951861e-06, + "loss": 0.29303114414215087, + "memory(GiB)": 71.86, + "step": 2410, + "token_acc": 0.9045537095580817, + "train_speed(iter/s)": 0.081046 + }, + { + "epoch": 0.9358878097221885, + "grad_norm": 0.6011708974838257, + "learning_rate": 8.23760510331273e-06, + "loss": 0.2884019136428833, + "memory(GiB)": 71.86, + "step": 2415, + "token_acc": 0.8996518944972554, + "train_speed(iter/s)": 0.08106 + }, + { + "epoch": 0.9378254656429384, + "grad_norm": 0.6015334725379944, + "learning_rate": 8.229458040440783e-06, + "loss": 0.2984572172164917, + "memory(GiB)": 71.86, + "step": 2420, + "token_acc": 0.9039551835853131, + "train_speed(iter/s)": 0.08106 + }, + { + "epoch": 0.9397631215636884, + "grad_norm": 0.5682195425033569, + "learning_rate": 8.2212962395162e-06, + "loss": 0.30713858604431155, + "memory(GiB)": 71.86, + "step": 2425, + "token_acc": 0.8901319652625398, + "train_speed(iter/s)": 0.081067 + }, + { + "epoch": 0.9417007774844383, + "grad_norm": 0.5934369564056396, + "learning_rate": 8.213119737786425e-06, + "loss": 0.29988932609558105, + "memory(GiB)": 71.86, + "step": 2430, + "token_acc": 0.9072377226397611, + "train_speed(iter/s)": 0.081072 + }, + { + "epoch": 0.943638433405188, + "grad_norm": 0.5825701355934143, + "learning_rate": 8.204928572565992e-06, + "loss": 0.2891493320465088, + "memory(GiB)": 71.86, + "step": 2435, + "token_acc": 0.9002493765586035, + "train_speed(iter/s)": 0.081076 + }, + { + "epoch": 0.945576089325938, + "grad_norm": 0.6257404088973999, + "learning_rate": 8.196722781236345e-06, + "loss": 0.2987071514129639, + "memory(GiB)": 71.86, + "step": 2440, + "token_acc": 0.8886912499144831, + "train_speed(iter/s)": 0.081082 + }, + { + "epoch": 0.9475137452466879, + "grad_norm": 0.5801171660423279, + "learning_rate": 8.188502401245685e-06, + "loss": 0.294779896736145, + "memory(GiB)": 71.86, + "step": 2445, + "token_acc": 0.914494488962574, + "train_speed(iter/s)": 0.081085 + }, + { + "epoch": 0.9494514011674376, + "grad_norm": 0.6023902893066406, + "learning_rate": 8.180267470108791e-06, + "loss": 0.3050975799560547, + "memory(GiB)": 71.86, + "step": 2450, + "token_acc": 0.8998041334594084, + "train_speed(iter/s)": 0.081082 + }, + { + "epoch": 0.9513890570881876, + "grad_norm": 0.5659891366958618, + "learning_rate": 8.17201802540684e-06, + "loss": 0.291573166847229, + "memory(GiB)": 71.86, + "step": 2455, + "token_acc": 0.898186114392939, + "train_speed(iter/s)": 0.081089 + }, + { + "epoch": 0.9533267130089375, + "grad_norm": 0.5715075731277466, + "learning_rate": 8.16375410478725e-06, + "loss": 0.31129865646362304, + "memory(GiB)": 71.86, + "step": 2460, + "token_acc": 0.8948479093183805, + "train_speed(iter/s)": 0.081082 + }, + { + "epoch": 0.9552643689296874, + "grad_norm": 0.5733673572540283, + "learning_rate": 8.155475745963497e-06, + "loss": 0.30196385383605956, + "memory(GiB)": 71.86, + "step": 2465, + "token_acc": 0.8970524127292847, + "train_speed(iter/s)": 0.081079 + }, + { + "epoch": 0.9572020248504371, + "grad_norm": 0.6193950176239014, + "learning_rate": 8.147182986714951e-06, + "loss": 0.3040598392486572, + "memory(GiB)": 71.86, + "step": 2470, + "token_acc": 0.899796048012304, + "train_speed(iter/s)": 0.081069 + }, + { + "epoch": 0.959139680771187, + "grad_norm": 0.5847228765487671, + "learning_rate": 8.138875864886704e-06, + "loss": 0.2934266567230225, + "memory(GiB)": 71.86, + "step": 2475, + "token_acc": 0.8982365532941456, + "train_speed(iter/s)": 0.081082 + }, + { + "epoch": 0.961077336691937, + "grad_norm": 0.646843671798706, + "learning_rate": 8.130554418389385e-06, + "loss": 0.3016360759735107, + "memory(GiB)": 71.86, + "step": 2480, + "token_acc": 0.896435309303942, + "train_speed(iter/s)": 0.081069 + }, + { + "epoch": 0.9630149926126867, + "grad_norm": 0.6069235801696777, + "learning_rate": 8.122218685199001e-06, + "loss": 0.29787559509277345, + "memory(GiB)": 71.86, + "step": 2485, + "token_acc": 0.8999183673469388, + "train_speed(iter/s)": 0.081064 + }, + { + "epoch": 0.9649526485334367, + "grad_norm": 0.5728825926780701, + "learning_rate": 8.113868703356755e-06, + "loss": 0.3042722702026367, + "memory(GiB)": 71.86, + "step": 2490, + "token_acc": 0.8947862704111, + "train_speed(iter/s)": 0.08107 + }, + { + "epoch": 0.9668903044541866, + "grad_norm": 0.5724794864654541, + "learning_rate": 8.105504510968878e-06, + "loss": 0.29997859001159666, + "memory(GiB)": 71.86, + "step": 2495, + "token_acc": 0.9035289881166727, + "train_speed(iter/s)": 0.081054 + }, + { + "epoch": 0.9688279603749365, + "grad_norm": 0.6254956722259521, + "learning_rate": 8.097126146206454e-06, + "loss": 0.3147443771362305, + "memory(GiB)": 71.86, + "step": 2500, + "token_acc": 0.8975554174435467, + "train_speed(iter/s)": 0.081062 + }, + { + "epoch": 0.9707656162956863, + "grad_norm": 0.6081948280334473, + "learning_rate": 8.08873364730524e-06, + "loss": 0.29856858253479, + "memory(GiB)": 71.86, + "step": 2505, + "token_acc": 0.8946579947383806, + "train_speed(iter/s)": 0.081059 + }, + { + "epoch": 0.9727032722164362, + "grad_norm": 0.592427670955658, + "learning_rate": 8.080327052565498e-06, + "loss": 0.28695039749145507, + "memory(GiB)": 71.86, + "step": 2510, + "token_acc": 0.8975024846910968, + "train_speed(iter/s)": 0.081067 + }, + { + "epoch": 0.9746409281371861, + "grad_norm": 0.585587203502655, + "learning_rate": 8.071906400351823e-06, + "loss": 0.3133570194244385, + "memory(GiB)": 71.86, + "step": 2515, + "token_acc": 0.891758151728658, + "train_speed(iter/s)": 0.081079 + }, + { + "epoch": 0.976578584057936, + "grad_norm": 0.5797784328460693, + "learning_rate": 8.063471729092953e-06, + "loss": 0.29287357330322267, + "memory(GiB)": 71.86, + "step": 2520, + "token_acc": 0.9092919601238474, + "train_speed(iter/s)": 0.081073 + }, + { + "epoch": 0.9785162399786858, + "grad_norm": 0.6151092052459717, + "learning_rate": 8.055023077281614e-06, + "loss": 0.29386229515075685, + "memory(GiB)": 71.86, + "step": 2525, + "token_acc": 0.9003792401151767, + "train_speed(iter/s)": 0.081085 + }, + { + "epoch": 0.9804538958994357, + "grad_norm": 0.5305030345916748, + "learning_rate": 8.046560483474327e-06, + "loss": 0.27774505615234374, + "memory(GiB)": 71.86, + "step": 2530, + "token_acc": 0.9066745422327229, + "train_speed(iter/s)": 0.081101 + }, + { + "epoch": 0.9823915518201856, + "grad_norm": 0.569251298904419, + "learning_rate": 8.038083986291242e-06, + "loss": 0.29792633056640627, + "memory(GiB)": 71.86, + "step": 2535, + "token_acc": 0.894457876139111, + "train_speed(iter/s)": 0.081103 + }, + { + "epoch": 0.9843292077409354, + "grad_norm": 0.5375701785087585, + "learning_rate": 8.029593624415961e-06, + "loss": 0.30445160865783694, + "memory(GiB)": 71.86, + "step": 2540, + "token_acc": 0.9060257773053415, + "train_speed(iter/s)": 0.081113 + }, + { + "epoch": 0.9862668636616853, + "grad_norm": 0.5565457344055176, + "learning_rate": 8.021089436595354e-06, + "loss": 0.288785719871521, + "memory(GiB)": 71.86, + "step": 2545, + "token_acc": 0.9060629628377236, + "train_speed(iter/s)": 0.081105 + }, + { + "epoch": 0.9882045195824352, + "grad_norm": 0.5765477418899536, + "learning_rate": 8.012571461639391e-06, + "loss": 0.29372076988220214, + "memory(GiB)": 71.86, + "step": 2550, + "token_acc": 0.8993327432423232, + "train_speed(iter/s)": 0.081094 + }, + { + "epoch": 0.9901421755031851, + "grad_norm": 0.5807486772537231, + "learning_rate": 8.004039738420962e-06, + "loss": 0.3115732192993164, + "memory(GiB)": 71.86, + "step": 2555, + "token_acc": 0.898284978308026, + "train_speed(iter/s)": 0.081087 + }, + { + "epoch": 0.9920798314239349, + "grad_norm": 0.5933189988136292, + "learning_rate": 7.995494305875696e-06, + "loss": 0.2891175031661987, + "memory(GiB)": 71.86, + "step": 2560, + "token_acc": 0.9098856523411547, + "train_speed(iter/s)": 0.081099 + }, + { + "epoch": 0.9940174873446848, + "grad_norm": 0.5942724943161011, + "learning_rate": 7.98693520300179e-06, + "loss": 0.3004627704620361, + "memory(GiB)": 71.86, + "step": 2565, + "token_acc": 0.8957946815089672, + "train_speed(iter/s)": 0.081106 + }, + { + "epoch": 0.9959551432654347, + "grad_norm": 0.609277606010437, + "learning_rate": 7.978362468859824e-06, + "loss": 0.30392889976501464, + "memory(GiB)": 71.86, + "step": 2570, + "token_acc": 0.904025201260063, + "train_speed(iter/s)": 0.081112 + }, + { + "epoch": 0.9978927991861846, + "grad_norm": 0.5895731449127197, + "learning_rate": 7.969776142572588e-06, + "loss": 0.31368122100830076, + "memory(GiB)": 71.86, + "step": 2575, + "token_acc": 0.9018180772107474, + "train_speed(iter/s)": 0.081108 + }, + { + "epoch": 0.9998304551069344, + "grad_norm": 0.601768970489502, + "learning_rate": 7.961176263324902e-06, + "loss": 0.3015678882598877, + "memory(GiB)": 71.86, + "step": 2580, + "token_acc": 0.9027051901521875, + "train_speed(iter/s)": 0.081115 + }, + { + "epoch": 1.0, + "eval_loss": 0.2590896189212799, + "eval_runtime": 104.8434, + "eval_samples_per_second": 31.8, + "eval_steps_per_second": 3.977, + "eval_token_acc": 0.9006845098082552, + "step": 2581 + }, + { + "epoch": 1.0015501247365999, + "grad_norm": 0.5477020144462585, + "learning_rate": 7.952562870363431e-06, + "loss": 0.25441799163818357, + "memory(GiB)": 71.86, + "step": 2585, + "token_acc": 0.9042220212839953, + "train_speed(iter/s)": 0.080632 + }, + { + "epoch": 1.0034877806573497, + "grad_norm": 0.5873720049858093, + "learning_rate": 7.943936002996523e-06, + "loss": 0.24828603267669677, + "memory(GiB)": 72.48, + "step": 2590, + "token_acc": 0.9172822882928782, + "train_speed(iter/s)": 0.080629 + }, + { + "epoch": 1.0054254365780997, + "grad_norm": 0.6077190637588501, + "learning_rate": 7.935295700594008e-06, + "loss": 0.25363209247589114, + "memory(GiB)": 72.48, + "step": 2595, + "token_acc": 0.9170952689124289, + "train_speed(iter/s)": 0.08063 + }, + { + "epoch": 1.0073630924988495, + "grad_norm": 0.5879862904548645, + "learning_rate": 7.926642002587031e-06, + "loss": 0.24791350364685058, + "memory(GiB)": 72.48, + "step": 2600, + "token_acc": 0.912384696809315, + "train_speed(iter/s)": 0.080644 + }, + { + "epoch": 1.0093007484195995, + "grad_norm": 0.5582185387611389, + "learning_rate": 7.917974948467875e-06, + "loss": 0.23642451763153077, + "memory(GiB)": 72.48, + "step": 2605, + "token_acc": 0.921409292741678, + "train_speed(iter/s)": 0.080649 + }, + { + "epoch": 1.0112384043403493, + "grad_norm": 0.6070327758789062, + "learning_rate": 7.909294577789765e-06, + "loss": 0.23681924343109131, + "memory(GiB)": 72.48, + "step": 2610, + "token_acc": 0.9224056603773585, + "train_speed(iter/s)": 0.080642 + }, + { + "epoch": 1.013176060261099, + "grad_norm": 0.5828495025634766, + "learning_rate": 7.900600930166709e-06, + "loss": 0.2503397464752197, + "memory(GiB)": 72.48, + "step": 2615, + "token_acc": 0.9147136677642198, + "train_speed(iter/s)": 0.080619 + }, + { + "epoch": 1.015113716181849, + "grad_norm": 0.5445525646209717, + "learning_rate": 7.891894045273296e-06, + "loss": 0.24323725700378418, + "memory(GiB)": 72.48, + "step": 2620, + "token_acc": 0.9177933840903124, + "train_speed(iter/s)": 0.080613 + }, + { + "epoch": 1.0170513721025989, + "grad_norm": 0.5520332455635071, + "learning_rate": 7.883173962844535e-06, + "loss": 0.23910813331604003, + "memory(GiB)": 72.48, + "step": 2625, + "token_acc": 0.9152115156852623, + "train_speed(iter/s)": 0.080612 + }, + { + "epoch": 1.0189890280233487, + "grad_norm": 0.5695937871932983, + "learning_rate": 7.874440722675654e-06, + "loss": 0.2562225580215454, + "memory(GiB)": 72.48, + "step": 2630, + "token_acc": 0.911888513298042, + "train_speed(iter/s)": 0.080611 + }, + { + "epoch": 1.0209266839440987, + "grad_norm": 0.5355059504508972, + "learning_rate": 7.865694364621936e-06, + "loss": 0.25381102561950686, + "memory(GiB)": 72.48, + "step": 2635, + "token_acc": 0.9101550981895382, + "train_speed(iter/s)": 0.080613 + }, + { + "epoch": 1.0228643398648485, + "grad_norm": 0.603877067565918, + "learning_rate": 7.856934928598526e-06, + "loss": 0.24031267166137696, + "memory(GiB)": 72.48, + "step": 2640, + "token_acc": 0.9104291735853877, + "train_speed(iter/s)": 0.080612 + }, + { + "epoch": 1.0248019957855983, + "grad_norm": 0.5709960460662842, + "learning_rate": 7.848162454580248e-06, + "loss": 0.23710267543792723, + "memory(GiB)": 72.48, + "step": 2645, + "token_acc": 0.9217326459812599, + "train_speed(iter/s)": 0.080619 + }, + { + "epoch": 1.0267396517063483, + "grad_norm": 0.6112598776817322, + "learning_rate": 7.839376982601434e-06, + "loss": 0.2632643938064575, + "memory(GiB)": 72.48, + "step": 2650, + "token_acc": 0.9160356212515902, + "train_speed(iter/s)": 0.08062 + }, + { + "epoch": 1.028677307627098, + "grad_norm": 0.5586497187614441, + "learning_rate": 7.830578552755728e-06, + "loss": 0.2568197250366211, + "memory(GiB)": 72.48, + "step": 2655, + "token_acc": 0.9143515554986558, + "train_speed(iter/s)": 0.08061 + }, + { + "epoch": 1.030614963547848, + "grad_norm": 0.5832489728927612, + "learning_rate": 7.821767205195913e-06, + "loss": 0.24051623344421386, + "memory(GiB)": 72.48, + "step": 2660, + "token_acc": 0.9099613992040456, + "train_speed(iter/s)": 0.080604 + }, + { + "epoch": 1.0325526194685979, + "grad_norm": 0.564221203327179, + "learning_rate": 7.812942980133723e-06, + "loss": 0.2458951473236084, + "memory(GiB)": 72.48, + "step": 2665, + "token_acc": 0.918996300466463, + "train_speed(iter/s)": 0.080602 + }, + { + "epoch": 1.0344902753893477, + "grad_norm": 0.5216150283813477, + "learning_rate": 7.804105917839658e-06, + "loss": 0.24463191032409667, + "memory(GiB)": 72.48, + "step": 2670, + "token_acc": 0.9253029928271086, + "train_speed(iter/s)": 0.080602 + }, + { + "epoch": 1.0364279313100977, + "grad_norm": 0.5396441221237183, + "learning_rate": 7.795256058642799e-06, + "loss": 0.24592304229736328, + "memory(GiB)": 72.48, + "step": 2675, + "token_acc": 0.9139424329252409, + "train_speed(iter/s)": 0.080603 + }, + { + "epoch": 1.0383655872308475, + "grad_norm": 0.5646589398384094, + "learning_rate": 7.786393442930638e-06, + "loss": 0.2515740394592285, + "memory(GiB)": 72.48, + "step": 2680, + "token_acc": 0.9193938660698082, + "train_speed(iter/s)": 0.08061 + }, + { + "epoch": 1.0403032431515973, + "grad_norm": 0.5464441180229187, + "learning_rate": 7.777518111148873e-06, + "loss": 0.25590317249298095, + "memory(GiB)": 72.48, + "step": 2685, + "token_acc": 0.9124738451633672, + "train_speed(iter/s)": 0.080607 + }, + { + "epoch": 1.0422408990723473, + "grad_norm": 0.6054975986480713, + "learning_rate": 7.768630103801239e-06, + "loss": 0.24789299964904785, + "memory(GiB)": 72.48, + "step": 2690, + "token_acc": 0.9170572651314449, + "train_speed(iter/s)": 0.080614 + }, + { + "epoch": 1.044178554993097, + "grad_norm": 0.6122474670410156, + "learning_rate": 7.759729461449317e-06, + "loss": 0.2429750919342041, + "memory(GiB)": 72.48, + "step": 2695, + "token_acc": 0.9201933542480313, + "train_speed(iter/s)": 0.080616 + }, + { + "epoch": 1.0461162109138469, + "grad_norm": 0.5993043780326843, + "learning_rate": 7.750816224712345e-06, + "loss": 0.2456050395965576, + "memory(GiB)": 72.48, + "step": 2700, + "token_acc": 0.9081399895724713, + "train_speed(iter/s)": 0.080608 + }, + { + "epoch": 1.048053866834597, + "grad_norm": 0.5747437477111816, + "learning_rate": 7.741890434267043e-06, + "loss": 0.2360602378845215, + "memory(GiB)": 72.48, + "step": 2705, + "token_acc": 0.9099276579253326, + "train_speed(iter/s)": 0.080614 + }, + { + "epoch": 1.0499915227553467, + "grad_norm": 0.6145790815353394, + "learning_rate": 7.732952130847418e-06, + "loss": 0.23937726020812988, + "memory(GiB)": 72.48, + "step": 2710, + "token_acc": 0.9228708069056745, + "train_speed(iter/s)": 0.080619 + }, + { + "epoch": 1.0519291786760967, + "grad_norm": 0.5605219006538391, + "learning_rate": 7.724001355244582e-06, + "loss": 0.24741392135620116, + "memory(GiB)": 72.48, + "step": 2715, + "token_acc": 0.9123340546230774, + "train_speed(iter/s)": 0.080622 + }, + { + "epoch": 1.0538668345968465, + "grad_norm": 0.573218822479248, + "learning_rate": 7.715038148306566e-06, + "loss": 0.2538093090057373, + "memory(GiB)": 72.48, + "step": 2720, + "token_acc": 0.9092292928290149, + "train_speed(iter/s)": 0.080625 + }, + { + "epoch": 1.0558044905175963, + "grad_norm": 0.5715059041976929, + "learning_rate": 7.706062550938134e-06, + "loss": 0.25236949920654295, + "memory(GiB)": 72.48, + "step": 2725, + "token_acc": 0.9107142857142857, + "train_speed(iter/s)": 0.080622 + }, + { + "epoch": 1.0577421464383463, + "grad_norm": 0.5391227602958679, + "learning_rate": 7.697074604100595e-06, + "loss": 0.23396830558776854, + "memory(GiB)": 72.48, + "step": 2730, + "token_acc": 0.9122044684709344, + "train_speed(iter/s)": 0.08064 + }, + { + "epoch": 1.059679802359096, + "grad_norm": 0.5576350688934326, + "learning_rate": 7.688074348811612e-06, + "loss": 0.24273238182067872, + "memory(GiB)": 72.48, + "step": 2735, + "token_acc": 0.9194692680264683, + "train_speed(iter/s)": 0.08064 + }, + { + "epoch": 1.0616174582798459, + "grad_norm": 0.5859219431877136, + "learning_rate": 7.679061826145027e-06, + "loss": 0.2543762683868408, + "memory(GiB)": 72.48, + "step": 2740, + "token_acc": 0.9142654460696551, + "train_speed(iter/s)": 0.080643 + }, + { + "epoch": 1.063555114200596, + "grad_norm": 0.5633178949356079, + "learning_rate": 7.670037077230659e-06, + "loss": 0.23209047317504883, + "memory(GiB)": 72.48, + "step": 2745, + "token_acc": 0.931188868990219, + "train_speed(iter/s)": 0.080652 + }, + { + "epoch": 1.0654927701213457, + "grad_norm": 0.5863512754440308, + "learning_rate": 7.661000143254129e-06, + "loss": 0.2488046646118164, + "memory(GiB)": 72.48, + "step": 2750, + "token_acc": 0.9081606630538731, + "train_speed(iter/s)": 0.080661 + }, + { + "epoch": 1.0674304260420955, + "grad_norm": 0.5774025917053223, + "learning_rate": 7.651951065456658e-06, + "loss": 0.23735444545745848, + "memory(GiB)": 72.48, + "step": 2755, + "token_acc": 0.9132469507736406, + "train_speed(iter/s)": 0.080665 + }, + { + "epoch": 1.0693680819628455, + "grad_norm": 0.6099271178245544, + "learning_rate": 7.642889885134897e-06, + "loss": 0.2543477058410645, + "memory(GiB)": 72.48, + "step": 2760, + "token_acc": 0.9162998624484182, + "train_speed(iter/s)": 0.080679 + }, + { + "epoch": 1.0713057378835953, + "grad_norm": 0.5847581624984741, + "learning_rate": 7.63381664364072e-06, + "loss": 0.24816784858703614, + "memory(GiB)": 72.48, + "step": 2765, + "token_acc": 0.9119142811450504, + "train_speed(iter/s)": 0.080672 + }, + { + "epoch": 1.0732433938043453, + "grad_norm": 0.5548186898231506, + "learning_rate": 7.624731382381048e-06, + "loss": 0.25109171867370605, + "memory(GiB)": 72.48, + "step": 2770, + "token_acc": 0.9072886841777668, + "train_speed(iter/s)": 0.080683 + }, + { + "epoch": 1.075181049725095, + "grad_norm": 0.5436238050460815, + "learning_rate": 7.6156341428176536e-06, + "loss": 0.24203226566314698, + "memory(GiB)": 72.48, + "step": 2775, + "token_acc": 0.914740183562211, + "train_speed(iter/s)": 0.080688 + }, + { + "epoch": 1.0771187056458449, + "grad_norm": 0.5817808508872986, + "learning_rate": 7.606524966466979e-06, + "loss": 0.23057384490966798, + "memory(GiB)": 72.48, + "step": 2780, + "token_acc": 0.9219449364031557, + "train_speed(iter/s)": 0.080686 + }, + { + "epoch": 1.079056361566595, + "grad_norm": 0.5919598937034607, + "learning_rate": 7.597403894899932e-06, + "loss": 0.259158730506897, + "memory(GiB)": 72.48, + "step": 2785, + "token_acc": 0.9131787972118658, + "train_speed(iter/s)": 0.080686 + }, + { + "epoch": 1.0809940174873447, + "grad_norm": 0.5995221138000488, + "learning_rate": 7.588270969741715e-06, + "loss": 0.24664411544799805, + "memory(GiB)": 72.48, + "step": 2790, + "token_acc": 0.9199007507316452, + "train_speed(iter/s)": 0.080685 + }, + { + "epoch": 1.0829316734080945, + "grad_norm": 0.5648677349090576, + "learning_rate": 7.579126232671621e-06, + "loss": 0.24066920280456544, + "memory(GiB)": 72.48, + "step": 2795, + "token_acc": 0.9095726198749131, + "train_speed(iter/s)": 0.080696 + }, + { + "epoch": 1.0848693293288445, + "grad_norm": 0.5983613729476929, + "learning_rate": 7.5699697254228496e-06, + "loss": 0.2533870697021484, + "memory(GiB)": 72.48, + "step": 2800, + "token_acc": 0.9211015879343565, + "train_speed(iter/s)": 0.080698 + }, + { + "epoch": 1.0868069852495943, + "grad_norm": 0.5517467260360718, + "learning_rate": 7.560801489782315e-06, + "loss": 0.24492838382720947, + "memory(GiB)": 72.48, + "step": 2805, + "token_acc": 0.9226202126965486, + "train_speed(iter/s)": 0.080705 + }, + { + "epoch": 1.088744641170344, + "grad_norm": 0.5659517049789429, + "learning_rate": 7.5516215675904555e-06, + "loss": 0.2431710481643677, + "memory(GiB)": 72.48, + "step": 2810, + "token_acc": 0.9124055690828494, + "train_speed(iter/s)": 0.080734 + }, + { + "epoch": 1.090682297091094, + "grad_norm": 0.5607476234436035, + "learning_rate": 7.542430000741042e-06, + "loss": 0.25229265689849856, + "memory(GiB)": 72.48, + "step": 2815, + "token_acc": 0.9133296939725764, + "train_speed(iter/s)": 0.080736 + }, + { + "epoch": 1.092619953011844, + "grad_norm": 0.5616917014122009, + "learning_rate": 7.533226831180988e-06, + "loss": 0.24518215656280518, + "memory(GiB)": 72.48, + "step": 2820, + "token_acc": 0.9195824853935535, + "train_speed(iter/s)": 0.080733 + }, + { + "epoch": 1.0945576089325937, + "grad_norm": 0.5850186347961426, + "learning_rate": 7.524012100910158e-06, + "loss": 0.2529883861541748, + "memory(GiB)": 72.48, + "step": 2825, + "token_acc": 0.9079726351795816, + "train_speed(iter/s)": 0.080733 + }, + { + "epoch": 1.0964952648533437, + "grad_norm": 0.5485085844993591, + "learning_rate": 7.5147858519811725e-06, + "loss": 0.2463089942932129, + "memory(GiB)": 72.48, + "step": 2830, + "token_acc": 0.9261502906365991, + "train_speed(iter/s)": 0.080741 + }, + { + "epoch": 1.0984329207740935, + "grad_norm": 0.5665585994720459, + "learning_rate": 7.50554812649922e-06, + "loss": 0.24828519821166992, + "memory(GiB)": 72.48, + "step": 2835, + "token_acc": 0.912733688460625, + "train_speed(iter/s)": 0.080748 + }, + { + "epoch": 1.1003705766948435, + "grad_norm": 0.5671249032020569, + "learning_rate": 7.496298966621869e-06, + "loss": 0.2461564540863037, + "memory(GiB)": 72.48, + "step": 2840, + "token_acc": 0.9144147028688525, + "train_speed(iter/s)": 0.080745 + }, + { + "epoch": 1.1023082326155933, + "grad_norm": 0.5896663069725037, + "learning_rate": 7.4870384145588625e-06, + "loss": 0.24836764335632325, + "memory(GiB)": 72.48, + "step": 2845, + "token_acc": 0.924081992805267, + "train_speed(iter/s)": 0.080735 + }, + { + "epoch": 1.104245888536343, + "grad_norm": 0.5926647782325745, + "learning_rate": 7.477766512571938e-06, + "loss": 0.2438123941421509, + "memory(GiB)": 72.48, + "step": 2850, + "token_acc": 0.9217040298905791, + "train_speed(iter/s)": 0.080743 + }, + { + "epoch": 1.106183544457093, + "grad_norm": 0.5987430810928345, + "learning_rate": 7.468483302974629e-06, + "loss": 0.24998788833618163, + "memory(GiB)": 72.48, + "step": 2855, + "token_acc": 0.9153590774069544, + "train_speed(iter/s)": 0.080732 + }, + { + "epoch": 1.108121200377843, + "grad_norm": 0.606680691242218, + "learning_rate": 7.459188828132069e-06, + "loss": 0.2444852828979492, + "memory(GiB)": 72.48, + "step": 2860, + "token_acc": 0.9168447750135362, + "train_speed(iter/s)": 0.080731 + }, + { + "epoch": 1.1100588562985927, + "grad_norm": 0.6073605418205261, + "learning_rate": 7.449883130460809e-06, + "loss": 0.2551856517791748, + "memory(GiB)": 72.48, + "step": 2865, + "token_acc": 0.9164820180671552, + "train_speed(iter/s)": 0.080735 + }, + { + "epoch": 1.1119965122193427, + "grad_norm": 0.5303031206130981, + "learning_rate": 7.440566252428612e-06, + "loss": 0.2426982879638672, + "memory(GiB)": 72.48, + "step": 2870, + "token_acc": 0.921721788266578, + "train_speed(iter/s)": 0.080731 + }, + { + "epoch": 1.1139341681400925, + "grad_norm": 0.6147097945213318, + "learning_rate": 7.431238236554263e-06, + "loss": 0.24436643123626708, + "memory(GiB)": 72.48, + "step": 2875, + "token_acc": 0.9231390279347651, + "train_speed(iter/s)": 0.080731 + }, + { + "epoch": 1.1158718240608425, + "grad_norm": 0.5928937196731567, + "learning_rate": 7.4218991254073815e-06, + "loss": 0.24488534927368164, + "memory(GiB)": 72.48, + "step": 2880, + "token_acc": 0.922065383847127, + "train_speed(iter/s)": 0.08074 + }, + { + "epoch": 1.1178094799815923, + "grad_norm": 0.5589123964309692, + "learning_rate": 7.412548961608217e-06, + "loss": 0.23948745727539061, + "memory(GiB)": 72.48, + "step": 2885, + "token_acc": 0.917822325964575, + "train_speed(iter/s)": 0.080734 + }, + { + "epoch": 1.119747135902342, + "grad_norm": 0.5714585781097412, + "learning_rate": 7.403187787827459e-06, + "loss": 0.24300243854522705, + "memory(GiB)": 72.48, + "step": 2890, + "token_acc": 0.9179357311522535, + "train_speed(iter/s)": 0.080747 + }, + { + "epoch": 1.121684791823092, + "grad_norm": 0.5701161026954651, + "learning_rate": 7.393815646786047e-06, + "loss": 0.24715614318847656, + "memory(GiB)": 72.48, + "step": 2895, + "token_acc": 0.9169183809735779, + "train_speed(iter/s)": 0.080739 + }, + { + "epoch": 1.123622447743842, + "grad_norm": 0.541515588760376, + "learning_rate": 7.384432581254963e-06, + "loss": 0.23558435440063477, + "memory(GiB)": 72.48, + "step": 2900, + "token_acc": 0.9188448238262874, + "train_speed(iter/s)": 0.080735 + }, + { + "epoch": 1.1255601036645917, + "grad_norm": 0.5822292566299438, + "learning_rate": 7.375038634055056e-06, + "loss": 0.2428572654724121, + "memory(GiB)": 72.48, + "step": 2905, + "token_acc": 0.9137654495707143, + "train_speed(iter/s)": 0.080733 + }, + { + "epoch": 1.1274977595853417, + "grad_norm": 0.6057553291320801, + "learning_rate": 7.3656338480568234e-06, + "loss": 0.23881745338439941, + "memory(GiB)": 72.48, + "step": 2910, + "token_acc": 0.9118832437713728, + "train_speed(iter/s)": 0.080724 + }, + { + "epoch": 1.1294354155060915, + "grad_norm": 0.6620405912399292, + "learning_rate": 7.3562182661802325e-06, + "loss": 0.23701815605163573, + "memory(GiB)": 72.48, + "step": 2915, + "token_acc": 0.926943768632423, + "train_speed(iter/s)": 0.080722 + }, + { + "epoch": 1.1313730714268413, + "grad_norm": 0.5819861888885498, + "learning_rate": 7.34679193139452e-06, + "loss": 0.2385103225708008, + "memory(GiB)": 72.48, + "step": 2920, + "token_acc": 0.9193125252253465, + "train_speed(iter/s)": 0.08072 + }, + { + "epoch": 1.1333107273475913, + "grad_norm": 0.548102080821991, + "learning_rate": 7.337354886717991e-06, + "loss": 0.2568079471588135, + "memory(GiB)": 72.48, + "step": 2925, + "token_acc": 0.9147654320987655, + "train_speed(iter/s)": 0.080731 + }, + { + "epoch": 1.135248383268341, + "grad_norm": 0.572535514831543, + "learning_rate": 7.32790717521783e-06, + "loss": 0.24390482902526855, + "memory(GiB)": 72.48, + "step": 2930, + "token_acc": 0.9171663495027108, + "train_speed(iter/s)": 0.080734 + }, + { + "epoch": 1.137186039189091, + "grad_norm": 0.5705650448799133, + "learning_rate": 7.3184488400099e-06, + "loss": 0.2597285270690918, + "memory(GiB)": 72.48, + "step": 2935, + "token_acc": 0.912927241962775, + "train_speed(iter/s)": 0.08073 + }, + { + "epoch": 1.139123695109841, + "grad_norm": 0.5626857280731201, + "learning_rate": 7.308979924258547e-06, + "loss": 0.2346130132675171, + "memory(GiB)": 72.48, + "step": 2940, + "token_acc": 0.9203948469131671, + "train_speed(iter/s)": 0.080736 + }, + { + "epoch": 1.1410613510305907, + "grad_norm": 0.5514906048774719, + "learning_rate": 7.2995004711763996e-06, + "loss": 0.23178393840789796, + "memory(GiB)": 72.48, + "step": 2945, + "token_acc": 0.926688815060908, + "train_speed(iter/s)": 0.080731 + }, + { + "epoch": 1.1429990069513407, + "grad_norm": 0.5872433185577393, + "learning_rate": 7.290010524024178e-06, + "loss": 0.2397766590118408, + "memory(GiB)": 72.48, + "step": 2950, + "token_acc": 0.9042819620059862, + "train_speed(iter/s)": 0.080738 + }, + { + "epoch": 1.1449366628720905, + "grad_norm": 0.5944450497627258, + "learning_rate": 7.2805101261104934e-06, + "loss": 0.24837064743041992, + "memory(GiB)": 72.48, + "step": 2955, + "token_acc": 0.9123956365409227, + "train_speed(iter/s)": 0.080736 + }, + { + "epoch": 1.1468743187928403, + "grad_norm": 0.5810188055038452, + "learning_rate": 7.270999320791651e-06, + "loss": 0.2451401948928833, + "memory(GiB)": 72.48, + "step": 2960, + "token_acc": 0.9096873667289965, + "train_speed(iter/s)": 0.080733 + }, + { + "epoch": 1.1488119747135903, + "grad_norm": 0.5396209955215454, + "learning_rate": 7.261478151471448e-06, + "loss": 0.24097609519958496, + "memory(GiB)": 72.48, + "step": 2965, + "token_acc": 0.9194308013187109, + "train_speed(iter/s)": 0.080733 + }, + { + "epoch": 1.15074963063434, + "grad_norm": 0.5635169148445129, + "learning_rate": 7.251946661600982e-06, + "loss": 0.22938740253448486, + "memory(GiB)": 72.48, + "step": 2970, + "token_acc": 0.9203730023919299, + "train_speed(iter/s)": 0.080728 + }, + { + "epoch": 1.15268728655509, + "grad_norm": 0.5849376320838928, + "learning_rate": 7.242404894678452e-06, + "loss": 0.2458806037902832, + "memory(GiB)": 72.48, + "step": 2975, + "token_acc": 0.9125376851487744, + "train_speed(iter/s)": 0.080737 + }, + { + "epoch": 1.15462494247584, + "grad_norm": 0.5923815965652466, + "learning_rate": 7.232852894248951e-06, + "loss": 0.2423017978668213, + "memory(GiB)": 72.48, + "step": 2980, + "token_acc": 0.9189056677779956, + "train_speed(iter/s)": 0.080741 + }, + { + "epoch": 1.1565625983965897, + "grad_norm": 0.5999000072479248, + "learning_rate": 7.223290703904278e-06, + "loss": 0.24765682220458984, + "memory(GiB)": 72.48, + "step": 2985, + "token_acc": 0.9232919480142308, + "train_speed(iter/s)": 0.080744 + }, + { + "epoch": 1.1585002543173397, + "grad_norm": 0.5837448239326477, + "learning_rate": 7.213718367282737e-06, + "loss": 0.24890732765197754, + "memory(GiB)": 72.48, + "step": 2990, + "token_acc": 0.9119633463431189, + "train_speed(iter/s)": 0.080737 + }, + { + "epoch": 1.1604379102380895, + "grad_norm": 0.5687659382820129, + "learning_rate": 7.204135928068934e-06, + "loss": 0.24406375885009765, + "memory(GiB)": 72.48, + "step": 2995, + "token_acc": 0.9261226980651183, + "train_speed(iter/s)": 0.080741 + }, + { + "epoch": 1.1623755661588393, + "grad_norm": 0.5431941151618958, + "learning_rate": 7.194543429993576e-06, + "loss": 0.23955135345458983, + "memory(GiB)": 72.48, + "step": 3000, + "token_acc": 0.920778318276581, + "train_speed(iter/s)": 0.080744 + }, + { + "epoch": 1.1643132220795893, + "grad_norm": 0.6085385084152222, + "learning_rate": 7.18494091683328e-06, + "loss": 0.24273269176483153, + "memory(GiB)": 72.48, + "step": 3005, + "token_acc": 0.9147768308536761, + "train_speed(iter/s)": 0.080754 + }, + { + "epoch": 1.166250878000339, + "grad_norm": 0.5439560413360596, + "learning_rate": 7.175328432410367e-06, + "loss": 0.2393632411956787, + "memory(GiB)": 72.48, + "step": 3010, + "token_acc": 0.9225050983487928, + "train_speed(iter/s)": 0.080748 + }, + { + "epoch": 1.168188533921089, + "grad_norm": 0.5491480231285095, + "learning_rate": 7.1657060205926606e-06, + "loss": 0.24930839538574218, + "memory(GiB)": 72.48, + "step": 3015, + "token_acc": 0.9135148013580321, + "train_speed(iter/s)": 0.080759 + }, + { + "epoch": 1.170126189841839, + "grad_norm": 0.5586943626403809, + "learning_rate": 7.156073725293293e-06, + "loss": 0.2412470817565918, + "memory(GiB)": 72.48, + "step": 3020, + "token_acc": 0.9073272300624649, + "train_speed(iter/s)": 0.080762 + }, + { + "epoch": 1.1720638457625887, + "grad_norm": 0.5404574871063232, + "learning_rate": 7.146431590470498e-06, + "loss": 0.24754083156585693, + "memory(GiB)": 72.48, + "step": 3025, + "token_acc": 0.9088854223042883, + "train_speed(iter/s)": 0.080763 + }, + { + "epoch": 1.1740015016833385, + "grad_norm": 0.5775381922721863, + "learning_rate": 7.1367796601274144e-06, + "loss": 0.23798027038574218, + "memory(GiB)": 72.48, + "step": 3030, + "token_acc": 0.9125572996249479, + "train_speed(iter/s)": 0.080769 + }, + { + "epoch": 1.1759391576040885, + "grad_norm": 0.5799135565757751, + "learning_rate": 7.127117978311884e-06, + "loss": 0.24995725154876708, + "memory(GiB)": 72.48, + "step": 3035, + "token_acc": 0.9050905683947533, + "train_speed(iter/s)": 0.080775 + }, + { + "epoch": 1.1778768135248383, + "grad_norm": 0.5825245380401611, + "learning_rate": 7.117446589116253e-06, + "loss": 0.25867457389831544, + "memory(GiB)": 72.48, + "step": 3040, + "token_acc": 0.9115309110803591, + "train_speed(iter/s)": 0.080772 + }, + { + "epoch": 1.179814469445588, + "grad_norm": 0.5762625932693481, + "learning_rate": 7.107765536677162e-06, + "loss": 0.23916106224060057, + "memory(GiB)": 72.48, + "step": 3045, + "token_acc": 0.9156965587821415, + "train_speed(iter/s)": 0.080771 + }, + { + "epoch": 1.1817521253663381, + "grad_norm": 0.5372615456581116, + "learning_rate": 7.098074865175358e-06, + "loss": 0.2506131649017334, + "memory(GiB)": 72.48, + "step": 3050, + "token_acc": 0.9142211471131199, + "train_speed(iter/s)": 0.080783 + }, + { + "epoch": 1.183689781287088, + "grad_norm": 0.6118589043617249, + "learning_rate": 7.088374618835485e-06, + "loss": 0.24305667877197265, + "memory(GiB)": 72.48, + "step": 3055, + "token_acc": 0.9190882087046365, + "train_speed(iter/s)": 0.080785 + }, + { + "epoch": 1.1856274372078377, + "grad_norm": 0.6260302066802979, + "learning_rate": 7.078664841925879e-06, + "loss": 0.2404834270477295, + "memory(GiB)": 72.48, + "step": 3060, + "token_acc": 0.9191426510493594, + "train_speed(iter/s)": 0.08079 + }, + { + "epoch": 1.1875650931285877, + "grad_norm": 0.561583936214447, + "learning_rate": 7.0689455787583725e-06, + "loss": 0.24459214210510255, + "memory(GiB)": 72.48, + "step": 3065, + "token_acc": 0.920421984891899, + "train_speed(iter/s)": 0.080784 + }, + { + "epoch": 1.1895027490493375, + "grad_norm": 0.57597416639328, + "learning_rate": 7.059216873688093e-06, + "loss": 0.23862845897674562, + "memory(GiB)": 72.48, + "step": 3070, + "token_acc": 0.916607294317218, + "train_speed(iter/s)": 0.08078 + }, + { + "epoch": 1.1914404049700875, + "grad_norm": 0.5645927786827087, + "learning_rate": 7.049478771113248e-06, + "loss": 0.24027485847473146, + "memory(GiB)": 72.48, + "step": 3075, + "token_acc": 0.9250272257010618, + "train_speed(iter/s)": 0.080775 + }, + { + "epoch": 1.1933780608908373, + "grad_norm": 0.6004230380058289, + "learning_rate": 7.039731315474941e-06, + "loss": 0.252230167388916, + "memory(GiB)": 72.48, + "step": 3080, + "token_acc": 0.9052574971529799, + "train_speed(iter/s)": 0.080784 + }, + { + "epoch": 1.195315716811587, + "grad_norm": 0.5419390201568604, + "learning_rate": 7.029974551256957e-06, + "loss": 0.2478321075439453, + "memory(GiB)": 72.48, + "step": 3085, + "token_acc": 0.9099167547568711, + "train_speed(iter/s)": 0.080779 + }, + { + "epoch": 1.1972533727323371, + "grad_norm": 0.5636507272720337, + "learning_rate": 7.020208522985559e-06, + "loss": 0.24982898235321044, + "memory(GiB)": 72.48, + "step": 3090, + "token_acc": 0.9075738529226901, + "train_speed(iter/s)": 0.080783 + }, + { + "epoch": 1.199191028653087, + "grad_norm": 0.5782200694084167, + "learning_rate": 7.010433275229289e-06, + "loss": 0.22980303764343263, + "memory(GiB)": 72.48, + "step": 3095, + "token_acc": 0.9177668261448253, + "train_speed(iter/s)": 0.080792 + }, + { + "epoch": 1.201128684573837, + "grad_norm": 0.5414060354232788, + "learning_rate": 7.0006488525987686e-06, + "loss": 0.2309312105178833, + "memory(GiB)": 72.48, + "step": 3100, + "token_acc": 0.9217795870892701, + "train_speed(iter/s)": 0.080785 + }, + { + "epoch": 1.2030663404945867, + "grad_norm": 0.5498025417327881, + "learning_rate": 6.990855299746482e-06, + "loss": 0.2509820222854614, + "memory(GiB)": 72.48, + "step": 3105, + "token_acc": 0.9060274263846072, + "train_speed(iter/s)": 0.080787 + }, + { + "epoch": 1.2050039964153365, + "grad_norm": 0.5826764106750488, + "learning_rate": 6.981052661366583e-06, + "loss": 0.2404792547225952, + "memory(GiB)": 72.48, + "step": 3110, + "token_acc": 0.9211193348879652, + "train_speed(iter/s)": 0.080782 + }, + { + "epoch": 1.2069416523360865, + "grad_norm": 0.6133027076721191, + "learning_rate": 6.971240982194692e-06, + "loss": 0.24429504871368407, + "memory(GiB)": 72.48, + "step": 3115, + "token_acc": 0.9174181613979454, + "train_speed(iter/s)": 0.080798 + }, + { + "epoch": 1.2088793082568363, + "grad_norm": 0.571624755859375, + "learning_rate": 6.961420307007684e-06, + "loss": 0.24460911750793457, + "memory(GiB)": 72.48, + "step": 3120, + "token_acc": 0.9124199343852523, + "train_speed(iter/s)": 0.080817 + }, + { + "epoch": 1.210816964177586, + "grad_norm": 0.5924415588378906, + "learning_rate": 6.95159068062349e-06, + "loss": 0.23861315250396728, + "memory(GiB)": 72.48, + "step": 3125, + "token_acc": 0.9094428978272695, + "train_speed(iter/s)": 0.080814 + }, + { + "epoch": 1.2127546200983361, + "grad_norm": 0.5533959269523621, + "learning_rate": 6.941752147900893e-06, + "loss": 0.23748021125793456, + "memory(GiB)": 72.48, + "step": 3130, + "token_acc": 0.9162806088682991, + "train_speed(iter/s)": 0.080819 + }, + { + "epoch": 1.214692276019086, + "grad_norm": 0.597855269908905, + "learning_rate": 6.931904753739317e-06, + "loss": 0.249676513671875, + "memory(GiB)": 72.48, + "step": 3135, + "token_acc": 0.9153109572247441, + "train_speed(iter/s)": 0.080817 + }, + { + "epoch": 1.2166299319398357, + "grad_norm": 0.5988847613334656, + "learning_rate": 6.922048543078629e-06, + "loss": 0.23966631889343262, + "memory(GiB)": 72.48, + "step": 3140, + "token_acc": 0.9228861802814934, + "train_speed(iter/s)": 0.080823 + }, + { + "epoch": 1.2185675878605857, + "grad_norm": 0.57000732421875, + "learning_rate": 6.912183560898933e-06, + "loss": 0.24102869033813476, + "memory(GiB)": 72.48, + "step": 3145, + "token_acc": 0.9046189600077467, + "train_speed(iter/s)": 0.08083 + }, + { + "epoch": 1.2205052437813355, + "grad_norm": 0.5944874286651611, + "learning_rate": 6.902309852220357e-06, + "loss": 0.24906721115112304, + "memory(GiB)": 72.48, + "step": 3150, + "token_acc": 0.9149213846249457, + "train_speed(iter/s)": 0.080838 + }, + { + "epoch": 1.2224428997020853, + "grad_norm": 0.5449170470237732, + "learning_rate": 6.89242746210286e-06, + "loss": 0.2430576801300049, + "memory(GiB)": 72.48, + "step": 3155, + "token_acc": 0.9243398027362393, + "train_speed(iter/s)": 0.080847 + }, + { + "epoch": 1.2243805556228353, + "grad_norm": 0.5581479072570801, + "learning_rate": 6.882536435646017e-06, + "loss": 0.23644862174987794, + "memory(GiB)": 72.48, + "step": 3160, + "token_acc": 0.9143110192634143, + "train_speed(iter/s)": 0.080855 + }, + { + "epoch": 1.2263182115435851, + "grad_norm": 0.5619523525238037, + "learning_rate": 6.872636817988814e-06, + "loss": 0.24310529232025146, + "memory(GiB)": 72.48, + "step": 3165, + "token_acc": 0.9242279311287237, + "train_speed(iter/s)": 0.08086 + }, + { + "epoch": 1.228255867464335, + "grad_norm": 0.5570391416549683, + "learning_rate": 6.862728654309449e-06, + "loss": 0.23729300498962402, + "memory(GiB)": 72.48, + "step": 3170, + "token_acc": 0.9165948350170672, + "train_speed(iter/s)": 0.080872 + }, + { + "epoch": 1.230193523385085, + "grad_norm": 0.608040988445282, + "learning_rate": 6.852811989825118e-06, + "loss": 0.24692845344543457, + "memory(GiB)": 72.48, + "step": 3175, + "token_acc": 0.9088531576908175, + "train_speed(iter/s)": 0.080868 + }, + { + "epoch": 1.2321311793058347, + "grad_norm": 0.6009603142738342, + "learning_rate": 6.84288686979181e-06, + "loss": 0.243471360206604, + "memory(GiB)": 72.48, + "step": 3180, + "token_acc": 0.9175115352019837, + "train_speed(iter/s)": 0.080869 + }, + { + "epoch": 1.2340688352265847, + "grad_norm": 0.564767599105835, + "learning_rate": 6.832953339504105e-06, + "loss": 0.25086545944213867, + "memory(GiB)": 72.48, + "step": 3185, + "token_acc": 0.9183437357284213, + "train_speed(iter/s)": 0.080862 + }, + { + "epoch": 1.2360064911473345, + "grad_norm": 0.5559407472610474, + "learning_rate": 6.823011444294962e-06, + "loss": 0.25046041011810305, + "memory(GiB)": 72.48, + "step": 3190, + "token_acc": 0.9121144863683043, + "train_speed(iter/s)": 0.080867 + }, + { + "epoch": 1.2379441470680843, + "grad_norm": 0.5758140087127686, + "learning_rate": 6.813061229535517e-06, + "loss": 0.24177942276000977, + "memory(GiB)": 72.48, + "step": 3195, + "token_acc": 0.9083065626434144, + "train_speed(iter/s)": 0.080867 + }, + { + "epoch": 1.2398818029888343, + "grad_norm": 0.6040492653846741, + "learning_rate": 6.80310274063487e-06, + "loss": 0.2439272880554199, + "memory(GiB)": 72.48, + "step": 3200, + "token_acc": 0.9128642730639188, + "train_speed(iter/s)": 0.080873 + }, + { + "epoch": 1.2418194589095841, + "grad_norm": 0.5259760022163391, + "learning_rate": 6.7931360230398835e-06, + "loss": 0.2501484155654907, + "memory(GiB)": 72.48, + "step": 3205, + "token_acc": 0.9080067044868488, + "train_speed(iter/s)": 0.08088 + }, + { + "epoch": 1.243757114830334, + "grad_norm": 0.5799282789230347, + "learning_rate": 6.7831611222349745e-06, + "loss": 0.23147008419036866, + "memory(GiB)": 72.48, + "step": 3210, + "token_acc": 0.9207616921842848, + "train_speed(iter/s)": 0.080882 + }, + { + "epoch": 1.245694770751084, + "grad_norm": 0.5783043503761292, + "learning_rate": 6.773178083741899e-06, + "loss": 0.2400331974029541, + "memory(GiB)": 72.48, + "step": 3215, + "token_acc": 0.9239953407105417, + "train_speed(iter/s)": 0.080869 + }, + { + "epoch": 1.2476324266718337, + "grad_norm": 0.6135613322257996, + "learning_rate": 6.763186953119556e-06, + "loss": 0.25634021759033204, + "memory(GiB)": 72.48, + "step": 3220, + "token_acc": 0.9119443126087644, + "train_speed(iter/s)": 0.08087 + }, + { + "epoch": 1.2495700825925837, + "grad_norm": 0.5815515518188477, + "learning_rate": 6.753187775963773e-06, + "loss": 0.2590181350708008, + "memory(GiB)": 72.48, + "step": 3225, + "token_acc": 0.906501256281407, + "train_speed(iter/s)": 0.080852 + }, + { + "epoch": 1.2515077385133335, + "grad_norm": 0.5874833464622498, + "learning_rate": 6.743180597907095e-06, + "loss": 0.24145932197570802, + "memory(GiB)": 72.48, + "step": 3230, + "token_acc": 0.9158553754905191, + "train_speed(iter/s)": 0.080847 + }, + { + "epoch": 1.2534453944340833, + "grad_norm": 0.5646886229515076, + "learning_rate": 6.7331654646185876e-06, + "loss": 0.25124948024749755, + "memory(GiB)": 72.48, + "step": 3235, + "token_acc": 0.913716884521197, + "train_speed(iter/s)": 0.080845 + }, + { + "epoch": 1.2553830503548333, + "grad_norm": 0.5993587970733643, + "learning_rate": 6.723142421803614e-06, + "loss": 0.24982619285583496, + "memory(GiB)": 72.48, + "step": 3240, + "token_acc": 0.9041410309541267, + "train_speed(iter/s)": 0.080833 + }, + { + "epoch": 1.2573207062755831, + "grad_norm": 0.5263670086860657, + "learning_rate": 6.713111515203635e-06, + "loss": 0.2320237398147583, + "memory(GiB)": 72.48, + "step": 3245, + "token_acc": 0.9194774030129316, + "train_speed(iter/s)": 0.080831 + }, + { + "epoch": 1.259258362196333, + "grad_norm": 0.5713490843772888, + "learning_rate": 6.703072790596003e-06, + "loss": 0.245804500579834, + "memory(GiB)": 72.48, + "step": 3250, + "token_acc": 0.9159877625382421, + "train_speed(iter/s)": 0.080838 + }, + { + "epoch": 1.261196018117083, + "grad_norm": 0.5493902564048767, + "learning_rate": 6.693026293793745e-06, + "loss": 0.2505074977874756, + "memory(GiB)": 72.48, + "step": 3255, + "token_acc": 0.9134997827467496, + "train_speed(iter/s)": 0.080834 + }, + { + "epoch": 1.2631336740378327, + "grad_norm": 0.5481184124946594, + "learning_rate": 6.682972070645357e-06, + "loss": 0.2504453182220459, + "memory(GiB)": 72.48, + "step": 3260, + "token_acc": 0.9140858147091983, + "train_speed(iter/s)": 0.080833 + }, + { + "epoch": 1.2650713299585825, + "grad_norm": 0.5771926045417786, + "learning_rate": 6.672910167034599e-06, + "loss": 0.24117374420166016, + "memory(GiB)": 72.48, + "step": 3265, + "token_acc": 0.9149578195976639, + "train_speed(iter/s)": 0.080835 + }, + { + "epoch": 1.2670089858793325, + "grad_norm": 0.5835690498352051, + "learning_rate": 6.6628406288802785e-06, + "loss": 0.2438589096069336, + "memory(GiB)": 72.48, + "step": 3270, + "token_acc": 0.9113916349809886, + "train_speed(iter/s)": 0.080838 + }, + { + "epoch": 1.2689466418000823, + "grad_norm": 0.5542348027229309, + "learning_rate": 6.652763502136044e-06, + "loss": 0.24294943809509278, + "memory(GiB)": 72.48, + "step": 3275, + "token_acc": 0.9156563907170073, + "train_speed(iter/s)": 0.080839 + }, + { + "epoch": 1.2708842977208321, + "grad_norm": 0.6041167378425598, + "learning_rate": 6.642678832790177e-06, + "loss": 0.2392800807952881, + "memory(GiB)": 72.48, + "step": 3280, + "token_acc": 0.9145334292861793, + "train_speed(iter/s)": 0.080836 + }, + { + "epoch": 1.2728219536415821, + "grad_norm": 0.5393861532211304, + "learning_rate": 6.632586666865383e-06, + "loss": 0.2392728567123413, + "memory(GiB)": 72.48, + "step": 3285, + "token_acc": 0.9183891314895681, + "train_speed(iter/s)": 0.080822 + }, + { + "epoch": 1.274759609562332, + "grad_norm": 0.584801971912384, + "learning_rate": 6.622487050418572e-06, + "loss": 0.24840068817138672, + "memory(GiB)": 72.48, + "step": 3290, + "token_acc": 0.9125019857029388, + "train_speed(iter/s)": 0.080825 + }, + { + "epoch": 1.2766972654830817, + "grad_norm": 0.549976110458374, + "learning_rate": 6.612380029540663e-06, + "loss": 0.24426255226135254, + "memory(GiB)": 72.48, + "step": 3295, + "token_acc": 0.911495008293431, + "train_speed(iter/s)": 0.080836 + }, + { + "epoch": 1.2786349214038317, + "grad_norm": 0.6103242039680481, + "learning_rate": 6.602265650356363e-06, + "loss": 0.2518136501312256, + "memory(GiB)": 72.48, + "step": 3300, + "token_acc": 0.9153556827473426, + "train_speed(iter/s)": 0.080828 + }, + { + "epoch": 1.2805725773245815, + "grad_norm": 0.5820600390434265, + "learning_rate": 6.5921439590239565e-06, + "loss": 0.23869671821594238, + "memory(GiB)": 72.48, + "step": 3305, + "token_acc": 0.9189243027888446, + "train_speed(iter/s)": 0.080835 + }, + { + "epoch": 1.2825102332453315, + "grad_norm": 0.6232005953788757, + "learning_rate": 6.582015001735105e-06, + "loss": 0.2620884656906128, + "memory(GiB)": 72.48, + "step": 3310, + "token_acc": 0.9090964460821634, + "train_speed(iter/s)": 0.080831 + }, + { + "epoch": 1.2844478891660813, + "grad_norm": 0.5614112019538879, + "learning_rate": 6.571878824714622e-06, + "loss": 0.24253828525543214, + "memory(GiB)": 72.48, + "step": 3315, + "token_acc": 0.926297150111924, + "train_speed(iter/s)": 0.080825 + }, + { + "epoch": 1.2863855450868313, + "grad_norm": 0.5985612869262695, + "learning_rate": 6.561735474220274e-06, + "loss": 0.24328384399414063, + "memory(GiB)": 72.48, + "step": 3320, + "token_acc": 0.9195212546430045, + "train_speed(iter/s)": 0.080829 + }, + { + "epoch": 1.2883232010075811, + "grad_norm": 0.5797421336174011, + "learning_rate": 6.551584996542561e-06, + "loss": 0.2494278907775879, + "memory(GiB)": 72.48, + "step": 3325, + "token_acc": 0.9162843641130006, + "train_speed(iter/s)": 0.080833 + }, + { + "epoch": 1.290260856928331, + "grad_norm": 0.5532296895980835, + "learning_rate": 6.541427438004515e-06, + "loss": 0.24654242992401124, + "memory(GiB)": 72.48, + "step": 3330, + "token_acc": 0.9212665664708294, + "train_speed(iter/s)": 0.080832 + }, + { + "epoch": 1.292198512849081, + "grad_norm": 0.5895270705223083, + "learning_rate": 6.531262844961472e-06, + "loss": 0.23927061557769774, + "memory(GiB)": 72.48, + "step": 3335, + "token_acc": 0.9222903885480572, + "train_speed(iter/s)": 0.080828 + }, + { + "epoch": 1.2941361687698307, + "grad_norm": 0.6126574873924255, + "learning_rate": 6.521091263800882e-06, + "loss": 0.24304308891296386, + "memory(GiB)": 72.48, + "step": 3340, + "token_acc": 0.9151754441826403, + "train_speed(iter/s)": 0.080838 + }, + { + "epoch": 1.2960738246905805, + "grad_norm": 0.5514196753501892, + "learning_rate": 6.510912740942079e-06, + "loss": 0.23981564044952391, + "memory(GiB)": 72.48, + "step": 3345, + "token_acc": 0.9065907584794265, + "train_speed(iter/s)": 0.080841 + }, + { + "epoch": 1.2980114806113305, + "grad_norm": 0.563449501991272, + "learning_rate": 6.500727322836079e-06, + "loss": 0.2316061496734619, + "memory(GiB)": 72.48, + "step": 3350, + "token_acc": 0.9239852975678423, + "train_speed(iter/s)": 0.08084 + }, + { + "epoch": 1.2999491365320803, + "grad_norm": 0.5519877076148987, + "learning_rate": 6.490535055965365e-06, + "loss": 0.2478248119354248, + "memory(GiB)": 72.48, + "step": 3355, + "token_acc": 0.9153843855508083, + "train_speed(iter/s)": 0.080847 + }, + { + "epoch": 1.3018867924528301, + "grad_norm": 0.5955557227134705, + "learning_rate": 6.480335986843675e-06, + "loss": 0.22984936237335205, + "memory(GiB)": 72.48, + "step": 3360, + "token_acc": 0.9195110142822561, + "train_speed(iter/s)": 0.080861 + }, + { + "epoch": 1.3038244483735801, + "grad_norm": 0.5684180855751038, + "learning_rate": 6.470130162015789e-06, + "loss": 0.24722616672515868, + "memory(GiB)": 72.48, + "step": 3365, + "token_acc": 0.9208458149779736, + "train_speed(iter/s)": 0.080861 + }, + { + "epoch": 1.30576210429433, + "grad_norm": 0.5446616411209106, + "learning_rate": 6.459917628057319e-06, + "loss": 0.23405933380126953, + "memory(GiB)": 72.48, + "step": 3370, + "token_acc": 0.919267457705986, + "train_speed(iter/s)": 0.080861 + }, + { + "epoch": 1.3076997602150797, + "grad_norm": 0.5617885589599609, + "learning_rate": 6.449698431574497e-06, + "loss": 0.24888741970062256, + "memory(GiB)": 72.48, + "step": 3375, + "token_acc": 0.9263560451898113, + "train_speed(iter/s)": 0.080862 + }, + { + "epoch": 1.3096374161358297, + "grad_norm": 0.6414940357208252, + "learning_rate": 6.439472619203956e-06, + "loss": 0.24251883029937743, + "memory(GiB)": 72.48, + "step": 3380, + "token_acc": 0.9109069886947585, + "train_speed(iter/s)": 0.08087 + }, + { + "epoch": 1.3115750720565795, + "grad_norm": 0.5910158157348633, + "learning_rate": 6.429240237612523e-06, + "loss": 0.24664535522460937, + "memory(GiB)": 72.48, + "step": 3385, + "token_acc": 0.9196039465885716, + "train_speed(iter/s)": 0.080864 + }, + { + "epoch": 1.3135127279773293, + "grad_norm": 0.5916693806648254, + "learning_rate": 6.419001333497007e-06, + "loss": 0.24849185943603516, + "memory(GiB)": 72.48, + "step": 3390, + "token_acc": 0.9203673689897353, + "train_speed(iter/s)": 0.080874 + }, + { + "epoch": 1.3154503838980793, + "grad_norm": 0.5980777740478516, + "learning_rate": 6.4087559535839785e-06, + "loss": 0.24993062019348145, + "memory(GiB)": 72.48, + "step": 3395, + "token_acc": 0.904221611997948, + "train_speed(iter/s)": 0.080874 + }, + { + "epoch": 1.3173880398188291, + "grad_norm": 0.5520764589309692, + "learning_rate": 6.3985041446295645e-06, + "loss": 0.25055632591247556, + "memory(GiB)": 72.48, + "step": 3400, + "token_acc": 0.913022677255765, + "train_speed(iter/s)": 0.080871 + }, + { + "epoch": 1.319325695739579, + "grad_norm": 0.5889905095100403, + "learning_rate": 6.388245953419232e-06, + "loss": 0.24010930061340333, + "memory(GiB)": 72.48, + "step": 3405, + "token_acc": 0.9154612325344033, + "train_speed(iter/s)": 0.080865 + }, + { + "epoch": 1.321263351660329, + "grad_norm": 0.5444064736366272, + "learning_rate": 6.377981426767574e-06, + "loss": 0.24458551406860352, + "memory(GiB)": 72.48, + "step": 3410, + "token_acc": 0.914785553047404, + "train_speed(iter/s)": 0.080871 + }, + { + "epoch": 1.3232010075810787, + "grad_norm": 0.6147655844688416, + "learning_rate": 6.367710611518095e-06, + "loss": 0.24504415988922118, + "memory(GiB)": 72.48, + "step": 3415, + "token_acc": 0.9134879163945134, + "train_speed(iter/s)": 0.080868 + }, + { + "epoch": 1.3251386635018287, + "grad_norm": 0.5855667591094971, + "learning_rate": 6.357433554543e-06, + "loss": 0.24513816833496094, + "memory(GiB)": 72.48, + "step": 3420, + "token_acc": 0.9188609715242881, + "train_speed(iter/s)": 0.080875 + }, + { + "epoch": 1.3270763194225785, + "grad_norm": 0.5783650875091553, + "learning_rate": 6.3471503027429744e-06, + "loss": 0.24665217399597167, + "memory(GiB)": 72.48, + "step": 3425, + "token_acc": 0.920804794520548, + "train_speed(iter/s)": 0.080874 + }, + { + "epoch": 1.3290139753433285, + "grad_norm": 0.5873458981513977, + "learning_rate": 6.336860903046982e-06, + "loss": 0.25883505344390867, + "memory(GiB)": 72.48, + "step": 3430, + "token_acc": 0.9149731218041169, + "train_speed(iter/s)": 0.080883 + }, + { + "epoch": 1.3309516312640783, + "grad_norm": 0.5453392267227173, + "learning_rate": 6.326565402412035e-06, + "loss": 0.24571163654327394, + "memory(GiB)": 72.48, + "step": 3435, + "token_acc": 0.9219789446489698, + "train_speed(iter/s)": 0.080871 + }, + { + "epoch": 1.3328892871848281, + "grad_norm": 0.6027646064758301, + "learning_rate": 6.3162638478229965e-06, + "loss": 0.24323391914367676, + "memory(GiB)": 72.48, + "step": 3440, + "token_acc": 0.9171201888400761, + "train_speed(iter/s)": 0.080879 + }, + { + "epoch": 1.3348269431055781, + "grad_norm": 0.6076679825782776, + "learning_rate": 6.305956286292352e-06, + "loss": 0.2376950979232788, + "memory(GiB)": 72.48, + "step": 3445, + "token_acc": 0.912549107285445, + "train_speed(iter/s)": 0.080885 + }, + { + "epoch": 1.336764599026328, + "grad_norm": 0.6235714554786682, + "learning_rate": 6.29564276486e-06, + "loss": 0.25636212825775145, + "memory(GiB)": 72.48, + "step": 3450, + "token_acc": 0.9180037284924055, + "train_speed(iter/s)": 0.080873 + }, + { + "epoch": 1.3387022549470777, + "grad_norm": 0.6193718910217285, + "learning_rate": 6.285323330593042e-06, + "loss": 0.247752046585083, + "memory(GiB)": 72.48, + "step": 3455, + "token_acc": 0.9203055292686224, + "train_speed(iter/s)": 0.080874 + }, + { + "epoch": 1.3406399108678277, + "grad_norm": 0.5753944516181946, + "learning_rate": 6.274998030585559e-06, + "loss": 0.24330606460571289, + "memory(GiB)": 72.48, + "step": 3460, + "token_acc": 0.9098618296116823, + "train_speed(iter/s)": 0.080872 + }, + { + "epoch": 1.3425775667885775, + "grad_norm": 0.5862641334533691, + "learning_rate": 6.264666911958404e-06, + "loss": 0.23821985721588135, + "memory(GiB)": 72.48, + "step": 3465, + "token_acc": 0.9153970303421562, + "train_speed(iter/s)": 0.080872 + }, + { + "epoch": 1.3445152227093273, + "grad_norm": 0.602203369140625, + "learning_rate": 6.254330021858985e-06, + "loss": 0.24624221324920653, + "memory(GiB)": 72.48, + "step": 3470, + "token_acc": 0.9169319741799807, + "train_speed(iter/s)": 0.080874 + }, + { + "epoch": 1.3464528786300773, + "grad_norm": 0.5835914611816406, + "learning_rate": 6.243987407461044e-06, + "loss": 0.24328317642211914, + "memory(GiB)": 72.48, + "step": 3475, + "token_acc": 0.9154203718674212, + "train_speed(iter/s)": 0.080867 + }, + { + "epoch": 1.3483905345508271, + "grad_norm": 0.5612725019454956, + "learning_rate": 6.233639115964454e-06, + "loss": 0.23664026260375975, + "memory(GiB)": 72.48, + "step": 3480, + "token_acc": 0.9206032300616778, + "train_speed(iter/s)": 0.080873 + }, + { + "epoch": 1.350328190471577, + "grad_norm": 0.6184651851654053, + "learning_rate": 6.223285194594986e-06, + "loss": 0.2517274856567383, + "memory(GiB)": 72.48, + "step": 3485, + "token_acc": 0.9179711323439723, + "train_speed(iter/s)": 0.080875 + }, + { + "epoch": 1.352265846392327, + "grad_norm": 0.5319633483886719, + "learning_rate": 6.212925690604113e-06, + "loss": 0.25687355995178224, + "memory(GiB)": 72.48, + "step": 3490, + "token_acc": 0.9110232762406676, + "train_speed(iter/s)": 0.080874 + }, + { + "epoch": 1.3542035023130767, + "grad_norm": 0.6207578182220459, + "learning_rate": 6.2025606512687816e-06, + "loss": 0.23406295776367186, + "memory(GiB)": 72.48, + "step": 3495, + "token_acc": 0.9191729323308271, + "train_speed(iter/s)": 0.080872 + }, + { + "epoch": 1.3561411582338265, + "grad_norm": 0.5518024563789368, + "learning_rate": 6.192190123891201e-06, + "loss": 0.2398101806640625, + "memory(GiB)": 72.48, + "step": 3500, + "token_acc": 0.9174518777762821, + "train_speed(iter/s)": 0.08087 + }, + { + "epoch": 1.3580788141545765, + "grad_norm": 0.5900284051895142, + "learning_rate": 6.18181415579862e-06, + "loss": 0.22111029624938966, + "memory(GiB)": 72.48, + "step": 3505, + "token_acc": 0.9129341398762999, + "train_speed(iter/s)": 0.080889 + }, + { + "epoch": 1.3600164700753263, + "grad_norm": 0.5826399922370911, + "learning_rate": 6.1714327943431255e-06, + "loss": 0.24394874572753905, + "memory(GiB)": 72.48, + "step": 3510, + "token_acc": 0.9186270406027627, + "train_speed(iter/s)": 0.080885 + }, + { + "epoch": 1.3619541259960761, + "grad_norm": 0.5645210146903992, + "learning_rate": 6.1610460869014096e-06, + "loss": 0.23350658416748046, + "memory(GiB)": 72.48, + "step": 3515, + "token_acc": 0.9214305633017289, + "train_speed(iter/s)": 0.080888 + }, + { + "epoch": 1.3638917819168261, + "grad_norm": 0.5921911597251892, + "learning_rate": 6.150654080874569e-06, + "loss": 0.23872621059417726, + "memory(GiB)": 72.48, + "step": 3520, + "token_acc": 0.9154886606325638, + "train_speed(iter/s)": 0.080896 + }, + { + "epoch": 1.365829437837576, + "grad_norm": 0.6218807697296143, + "learning_rate": 6.140256823687875e-06, + "loss": 0.24797072410583496, + "memory(GiB)": 72.48, + "step": 3525, + "token_acc": 0.9094979818365287, + "train_speed(iter/s)": 0.080896 + }, + { + "epoch": 1.367767093758326, + "grad_norm": 0.5517158508300781, + "learning_rate": 6.129854362790567e-06, + "loss": 0.23862147331237793, + "memory(GiB)": 72.48, + "step": 3530, + "token_acc": 0.9226441179307462, + "train_speed(iter/s)": 0.080898 + }, + { + "epoch": 1.3697047496790757, + "grad_norm": 0.5724954009056091, + "learning_rate": 6.1194467456556305e-06, + "loss": 0.22112350463867186, + "memory(GiB)": 72.48, + "step": 3535, + "token_acc": 0.917589736399327, + "train_speed(iter/s)": 0.080902 + }, + { + "epoch": 1.3716424055998255, + "grad_norm": 0.5898615121841431, + "learning_rate": 6.109034019779583e-06, + "loss": 0.2419571876525879, + "memory(GiB)": 72.48, + "step": 3540, + "token_acc": 0.9103599797194524, + "train_speed(iter/s)": 0.080904 + }, + { + "epoch": 1.3735800615205755, + "grad_norm": 0.5712639689445496, + "learning_rate": 6.098616232682255e-06, + "loss": 0.24022350311279297, + "memory(GiB)": 72.48, + "step": 3545, + "token_acc": 0.9193883397794013, + "train_speed(iter/s)": 0.080902 + }, + { + "epoch": 1.3755177174413253, + "grad_norm": 0.6036831736564636, + "learning_rate": 6.088193431906576e-06, + "loss": 0.2510042428970337, + "memory(GiB)": 72.48, + "step": 3550, + "token_acc": 0.9242049814658256, + "train_speed(iter/s)": 0.080909 + }, + { + "epoch": 1.3774553733620754, + "grad_norm": 0.5672377347946167, + "learning_rate": 6.077765665018356e-06, + "loss": 0.24461116790771484, + "memory(GiB)": 72.48, + "step": 3555, + "token_acc": 0.9190039318479686, + "train_speed(iter/s)": 0.080909 + }, + { + "epoch": 1.3793930292828251, + "grad_norm": 0.6056391000747681, + "learning_rate": 6.067332979606069e-06, + "loss": 0.24074273109436034, + "memory(GiB)": 72.48, + "step": 3560, + "token_acc": 0.9218976658314652, + "train_speed(iter/s)": 0.080911 + }, + { + "epoch": 1.381330685203575, + "grad_norm": 0.5587301850318909, + "learning_rate": 6.0568954232806335e-06, + "loss": 0.24055736064910888, + "memory(GiB)": 72.48, + "step": 3565, + "token_acc": 0.9175392670157068, + "train_speed(iter/s)": 0.080912 + }, + { + "epoch": 1.383268341124325, + "grad_norm": 0.5654682517051697, + "learning_rate": 6.046453043675197e-06, + "loss": 0.2507538557052612, + "memory(GiB)": 72.48, + "step": 3570, + "token_acc": 0.9204025918494886, + "train_speed(iter/s)": 0.080917 + }, + { + "epoch": 1.3852059970450747, + "grad_norm": 0.5490247011184692, + "learning_rate": 6.036005888444922e-06, + "loss": 0.2445591926574707, + "memory(GiB)": 72.48, + "step": 3575, + "token_acc": 0.9217004355235751, + "train_speed(iter/s)": 0.080909 + }, + { + "epoch": 1.3871436529658245, + "grad_norm": 0.6079689264297485, + "learning_rate": 6.025554005266761e-06, + "loss": 0.24872751235961915, + "memory(GiB)": 72.48, + "step": 3580, + "token_acc": 0.9157847533632287, + "train_speed(iter/s)": 0.080913 + }, + { + "epoch": 1.3890813088865746, + "grad_norm": 0.5383176207542419, + "learning_rate": 6.015097441839246e-06, + "loss": 0.24193205833435058, + "memory(GiB)": 72.48, + "step": 3585, + "token_acc": 0.9197718753003749, + "train_speed(iter/s)": 0.080914 + }, + { + "epoch": 1.3910189648073243, + "grad_norm": 0.5508478283882141, + "learning_rate": 6.004636245882265e-06, + "loss": 0.23985228538513184, + "memory(GiB)": 72.48, + "step": 3590, + "token_acc": 0.915475677288855, + "train_speed(iter/s)": 0.080909 + }, + { + "epoch": 1.3929566207280741, + "grad_norm": 0.5500128865242004, + "learning_rate": 5.994170465136853e-06, + "loss": 0.23378024101257325, + "memory(GiB)": 72.48, + "step": 3595, + "token_acc": 0.9231037610773847, + "train_speed(iter/s)": 0.080899 + }, + { + "epoch": 1.3948942766488241, + "grad_norm": 0.564172089099884, + "learning_rate": 5.98370014736496e-06, + "loss": 0.24442434310913086, + "memory(GiB)": 72.48, + "step": 3600, + "token_acc": 0.9116323797094481, + "train_speed(iter/s)": 0.080899 + }, + { + "epoch": 1.396831932569574, + "grad_norm": 0.6100159287452698, + "learning_rate": 5.97322534034925e-06, + "loss": 0.2467043399810791, + "memory(GiB)": 72.48, + "step": 3605, + "token_acc": 0.913891419509397, + "train_speed(iter/s)": 0.08089 + }, + { + "epoch": 1.3987695884903237, + "grad_norm": 0.5676229596138, + "learning_rate": 5.962746091892866e-06, + "loss": 0.23828344345092772, + "memory(GiB)": 72.48, + "step": 3610, + "token_acc": 0.9075207695671185, + "train_speed(iter/s)": 0.080885 + }, + { + "epoch": 1.4007072444110737, + "grad_norm": 0.5872523784637451, + "learning_rate": 5.952262449819225e-06, + "loss": 0.25522215366363527, + "memory(GiB)": 72.48, + "step": 3615, + "token_acc": 0.9063737962575285, + "train_speed(iter/s)": 0.080877 + }, + { + "epoch": 1.4026449003318235, + "grad_norm": 0.593393862247467, + "learning_rate": 5.941774461971794e-06, + "loss": 0.2511239290237427, + "memory(GiB)": 72.48, + "step": 3620, + "token_acc": 0.9169553533849192, + "train_speed(iter/s)": 0.080877 + }, + { + "epoch": 1.4045825562525733, + "grad_norm": 0.543071448802948, + "learning_rate": 5.931282176213875e-06, + "loss": 0.24414536952972413, + "memory(GiB)": 72.48, + "step": 3625, + "token_acc": 0.9186960812402508, + "train_speed(iter/s)": 0.080872 + }, + { + "epoch": 1.4065202121733233, + "grad_norm": 0.5903857946395874, + "learning_rate": 5.920785640428377e-06, + "loss": 0.23714954853057862, + "memory(GiB)": 72.48, + "step": 3630, + "token_acc": 0.9076739427012278, + "train_speed(iter/s)": 0.080883 + }, + { + "epoch": 1.4084578680940731, + "grad_norm": 0.5597670078277588, + "learning_rate": 5.910284902517614e-06, + "loss": 0.24124536514282227, + "memory(GiB)": 72.48, + "step": 3635, + "token_acc": 0.9245324857713061, + "train_speed(iter/s)": 0.080888 + }, + { + "epoch": 1.410395524014823, + "grad_norm": 0.56638503074646, + "learning_rate": 5.899780010403066e-06, + "loss": 0.23916149139404297, + "memory(GiB)": 72.48, + "step": 3640, + "token_acc": 0.9108338966386038, + "train_speed(iter/s)": 0.080886 + }, + { + "epoch": 1.412333179935573, + "grad_norm": 0.5485346913337708, + "learning_rate": 5.8892710120251806e-06, + "loss": 0.23300790786743164, + "memory(GiB)": 72.48, + "step": 3645, + "token_acc": 0.9207074721780604, + "train_speed(iter/s)": 0.080893 + }, + { + "epoch": 1.4142708358563227, + "grad_norm": 0.5811129808425903, + "learning_rate": 5.87875795534314e-06, + "loss": 0.23677351474761962, + "memory(GiB)": 72.48, + "step": 3650, + "token_acc": 0.9236622904740684, + "train_speed(iter/s)": 0.080892 + }, + { + "epoch": 1.4162084917770728, + "grad_norm": 0.5233686566352844, + "learning_rate": 5.8682408883346535e-06, + "loss": 0.24698786735534667, + "memory(GiB)": 72.48, + "step": 3655, + "token_acc": 0.909466098919096, + "train_speed(iter/s)": 0.08089 + }, + { + "epoch": 1.4181461476978225, + "grad_norm": 0.5673002004623413, + "learning_rate": 5.85771985899572e-06, + "loss": 0.24160895347595215, + "memory(GiB)": 72.48, + "step": 3660, + "token_acc": 0.9140604973083825, + "train_speed(iter/s)": 0.08089 + }, + { + "epoch": 1.4200838036185726, + "grad_norm": 0.520982563495636, + "learning_rate": 5.847194915340432e-06, + "loss": 0.2456125497817993, + "memory(GiB)": 72.48, + "step": 3665, + "token_acc": 0.9184846596024655, + "train_speed(iter/s)": 0.08089 + }, + { + "epoch": 1.4220214595393224, + "grad_norm": 0.5889917612075806, + "learning_rate": 5.836666105400739e-06, + "loss": 0.2436441659927368, + "memory(GiB)": 72.48, + "step": 3670, + "token_acc": 0.9159888877732034, + "train_speed(iter/s)": 0.080892 + }, + { + "epoch": 1.4239591154600721, + "grad_norm": 0.5952669382095337, + "learning_rate": 5.826133477226239e-06, + "loss": 0.24678261280059816, + "memory(GiB)": 72.48, + "step": 3675, + "token_acc": 0.9162605668821482, + "train_speed(iter/s)": 0.08089 + }, + { + "epoch": 1.4258967713808222, + "grad_norm": 0.5867316126823425, + "learning_rate": 5.815597078883955e-06, + "loss": 0.24308767318725585, + "memory(GiB)": 72.48, + "step": 3680, + "token_acc": 0.9156236666192575, + "train_speed(iter/s)": 0.080892 + }, + { + "epoch": 1.427834427301572, + "grad_norm": 0.5791524648666382, + "learning_rate": 5.805056958458111e-06, + "loss": 0.23985023498535157, + "memory(GiB)": 72.48, + "step": 3685, + "token_acc": 0.9216729778596527, + "train_speed(iter/s)": 0.080893 + }, + { + "epoch": 1.4297720832223217, + "grad_norm": 0.6174024343490601, + "learning_rate": 5.79451316404992e-06, + "loss": 0.24676356315612794, + "memory(GiB)": 72.48, + "step": 3690, + "token_acc": 0.915408156476615, + "train_speed(iter/s)": 0.080903 + }, + { + "epoch": 1.4317097391430718, + "grad_norm": 0.563883364200592, + "learning_rate": 5.7839657437773644e-06, + "loss": 0.23523716926574706, + "memory(GiB)": 72.48, + "step": 3695, + "token_acc": 0.917732751608371, + "train_speed(iter/s)": 0.080904 + }, + { + "epoch": 1.4336473950638216, + "grad_norm": 0.5705844759941101, + "learning_rate": 5.77341474577497e-06, + "loss": 0.23880462646484374, + "memory(GiB)": 72.48, + "step": 3700, + "token_acc": 0.9202369333288459, + "train_speed(iter/s)": 0.080898 + }, + { + "epoch": 1.4355850509845713, + "grad_norm": 0.5820704698562622, + "learning_rate": 5.76286021819359e-06, + "loss": 0.23488302230834962, + "memory(GiB)": 72.48, + "step": 3705, + "token_acc": 0.9148230088495575, + "train_speed(iter/s)": 0.080902 + }, + { + "epoch": 1.4375227069053214, + "grad_norm": 0.5883727073669434, + "learning_rate": 5.752302209200187e-06, + "loss": 0.2560927391052246, + "memory(GiB)": 72.48, + "step": 3710, + "token_acc": 0.9100109639875179, + "train_speed(iter/s)": 0.080901 + }, + { + "epoch": 1.4394603628260711, + "grad_norm": 0.5630972385406494, + "learning_rate": 5.7417407669776135e-06, + "loss": 0.23473753929138183, + "memory(GiB)": 72.48, + "step": 3715, + "token_acc": 0.9217704758933832, + "train_speed(iter/s)": 0.080905 + }, + { + "epoch": 1.441398018746821, + "grad_norm": 0.5839196443557739, + "learning_rate": 5.731175939724384e-06, + "loss": 0.24403223991394044, + "memory(GiB)": 72.48, + "step": 3720, + "token_acc": 0.922809604043808, + "train_speed(iter/s)": 0.080904 + }, + { + "epoch": 1.443335674667571, + "grad_norm": 0.5882570147514343, + "learning_rate": 5.720607775654467e-06, + "loss": 0.23943960666656494, + "memory(GiB)": 72.48, + "step": 3725, + "token_acc": 0.9180094480036239, + "train_speed(iter/s)": 0.080906 + }, + { + "epoch": 1.4452733305883207, + "grad_norm": 0.6032676100730896, + "learning_rate": 5.710036322997055e-06, + "loss": 0.24937214851379394, + "memory(GiB)": 72.48, + "step": 3730, + "token_acc": 0.9147170043348603, + "train_speed(iter/s)": 0.080893 + }, + { + "epoch": 1.4472109865090705, + "grad_norm": 0.5497545599937439, + "learning_rate": 5.699461629996349e-06, + "loss": 0.2398749828338623, + "memory(GiB)": 72.48, + "step": 3735, + "token_acc": 0.9171464422793032, + "train_speed(iter/s)": 0.080894 + }, + { + "epoch": 1.4491486424298206, + "grad_norm": 0.5942593216896057, + "learning_rate": 5.68888374491134e-06, + "loss": 0.2407346248626709, + "memory(GiB)": 72.48, + "step": 3740, + "token_acc": 0.9223353884933747, + "train_speed(iter/s)": 0.08089 + }, + { + "epoch": 1.4510862983505703, + "grad_norm": 0.5797078013420105, + "learning_rate": 5.678302716015586e-06, + "loss": 0.25144505500793457, + "memory(GiB)": 72.48, + "step": 3745, + "token_acc": 0.9161485407535759, + "train_speed(iter/s)": 0.080909 + }, + { + "epoch": 1.4530239542713201, + "grad_norm": 0.6159892678260803, + "learning_rate": 5.66771859159699e-06, + "loss": 0.24453871250152587, + "memory(GiB)": 72.48, + "step": 3750, + "token_acc": 0.9172922514524446, + "train_speed(iter/s)": 0.080912 + }, + { + "epoch": 1.4549616101920702, + "grad_norm": 0.6003066897392273, + "learning_rate": 5.6571314199575845e-06, + "loss": 0.2406759262084961, + "memory(GiB)": 72.48, + "step": 3755, + "token_acc": 0.9244022744399534, + "train_speed(iter/s)": 0.080914 + }, + { + "epoch": 1.45689926611282, + "grad_norm": 0.5446083545684814, + "learning_rate": 5.646541249413304e-06, + "loss": 0.24054651260375975, + "memory(GiB)": 72.48, + "step": 3760, + "token_acc": 0.9230030296312717, + "train_speed(iter/s)": 0.080918 + }, + { + "epoch": 1.45883692203357, + "grad_norm": 0.5630480647087097, + "learning_rate": 5.635948128293775e-06, + "loss": 0.25225830078125, + "memory(GiB)": 72.48, + "step": 3765, + "token_acc": 0.9266996208017335, + "train_speed(iter/s)": 0.080912 + }, + { + "epoch": 1.4607745779543198, + "grad_norm": 0.6038119792938232, + "learning_rate": 5.625352104942085e-06, + "loss": 0.24250342845916747, + "memory(GiB)": 72.48, + "step": 3770, + "token_acc": 0.9214726151614073, + "train_speed(iter/s)": 0.080905 + }, + { + "epoch": 1.4627122338750698, + "grad_norm": 0.5660669803619385, + "learning_rate": 5.614753227714567e-06, + "loss": 0.23111968040466307, + "memory(GiB)": 72.48, + "step": 3775, + "token_acc": 0.9160903040584627, + "train_speed(iter/s)": 0.080912 + }, + { + "epoch": 1.4646498897958196, + "grad_norm": 0.6035475730895996, + "learning_rate": 5.6041515449805804e-06, + "loss": 0.24287838935852052, + "memory(GiB)": 72.48, + "step": 3780, + "token_acc": 0.9190956330224715, + "train_speed(iter/s)": 0.080923 + }, + { + "epoch": 1.4665875457165694, + "grad_norm": 0.5913589596748352, + "learning_rate": 5.5935471051222844e-06, + "loss": 0.24291999340057374, + "memory(GiB)": 72.48, + "step": 3785, + "token_acc": 0.9209064512056558, + "train_speed(iter/s)": 0.08093 + }, + { + "epoch": 1.4685252016373194, + "grad_norm": 0.6146913766860962, + "learning_rate": 5.582939956534421e-06, + "loss": 0.2419736385345459, + "memory(GiB)": 72.48, + "step": 3790, + "token_acc": 0.9155126180068325, + "train_speed(iter/s)": 0.080943 + }, + { + "epoch": 1.4704628575580692, + "grad_norm": 0.5897228121757507, + "learning_rate": 5.572330147624097e-06, + "loss": 0.23940186500549315, + "memory(GiB)": 72.48, + "step": 3795, + "token_acc": 0.9194137976870929, + "train_speed(iter/s)": 0.080937 + }, + { + "epoch": 1.472400513478819, + "grad_norm": 0.5833806991577148, + "learning_rate": 5.561717726810557e-06, + "loss": 0.2410250186920166, + "memory(GiB)": 72.48, + "step": 3800, + "token_acc": 0.9106603227673118, + "train_speed(iter/s)": 0.080941 + }, + { + "epoch": 1.474338169399569, + "grad_norm": 0.5665867924690247, + "learning_rate": 5.551102742524967e-06, + "loss": 0.23539307117462158, + "memory(GiB)": 72.48, + "step": 3805, + "token_acc": 0.9179119150609516, + "train_speed(iter/s)": 0.080945 + }, + { + "epoch": 1.4762758253203188, + "grad_norm": 0.5832652449607849, + "learning_rate": 5.540485243210194e-06, + "loss": 0.23438220024108886, + "memory(GiB)": 72.48, + "step": 3810, + "token_acc": 0.9260935143288085, + "train_speed(iter/s)": 0.080948 + }, + { + "epoch": 1.4782134812410685, + "grad_norm": 0.5857250690460205, + "learning_rate": 5.529865277320575e-06, + "loss": 0.2527660846710205, + "memory(GiB)": 72.48, + "step": 3815, + "token_acc": 0.9062848751835536, + "train_speed(iter/s)": 0.080941 + }, + { + "epoch": 1.4801511371618186, + "grad_norm": 0.5673555135726929, + "learning_rate": 5.51924289332171e-06, + "loss": 0.24681317806243896, + "memory(GiB)": 72.48, + "step": 3820, + "token_acc": 0.9058411144350769, + "train_speed(iter/s)": 0.080938 + }, + { + "epoch": 1.4820887930825684, + "grad_norm": 0.5933298468589783, + "learning_rate": 5.5086181396902335e-06, + "loss": 0.2507610321044922, + "memory(GiB)": 72.48, + "step": 3825, + "token_acc": 0.9154622988259644, + "train_speed(iter/s)": 0.080928 + }, + { + "epoch": 1.4840264490033181, + "grad_norm": 0.5970496535301208, + "learning_rate": 5.497991064913592e-06, + "loss": 0.23358287811279296, + "memory(GiB)": 72.48, + "step": 3830, + "token_acc": 0.9168089297439265, + "train_speed(iter/s)": 0.080929 + }, + { + "epoch": 1.4859641049240682, + "grad_norm": 0.5848801732063293, + "learning_rate": 5.487361717489828e-06, + "loss": 0.24907338619232178, + "memory(GiB)": 72.48, + "step": 3835, + "token_acc": 0.9162323031913104, + "train_speed(iter/s)": 0.080934 + }, + { + "epoch": 1.487901760844818, + "grad_norm": 0.5591345429420471, + "learning_rate": 5.476730145927354e-06, + "loss": 0.24457969665527343, + "memory(GiB)": 72.48, + "step": 3840, + "token_acc": 0.9261170157878444, + "train_speed(iter/s)": 0.080933 + }, + { + "epoch": 1.4898394167655677, + "grad_norm": 0.5818787217140198, + "learning_rate": 5.46609639874473e-06, + "loss": 0.2569440841674805, + "memory(GiB)": 72.48, + "step": 3845, + "token_acc": 0.9123161416577237, + "train_speed(iter/s)": 0.080928 + }, + { + "epoch": 1.4917770726863178, + "grad_norm": 0.5587847828865051, + "learning_rate": 5.455460524470447e-06, + "loss": 0.23373932838439943, + "memory(GiB)": 72.48, + "step": 3850, + "token_acc": 0.9148546222664016, + "train_speed(iter/s)": 0.080921 + }, + { + "epoch": 1.4937147286070676, + "grad_norm": 0.6207402944564819, + "learning_rate": 5.444822571642705e-06, + "loss": 0.24684290885925292, + "memory(GiB)": 72.48, + "step": 3855, + "token_acc": 0.9198185556244493, + "train_speed(iter/s)": 0.080921 + }, + { + "epoch": 1.4956523845278173, + "grad_norm": 0.5635160207748413, + "learning_rate": 5.434182588809187e-06, + "loss": 0.23250012397766112, + "memory(GiB)": 72.48, + "step": 3860, + "token_acc": 0.9091337892928136, + "train_speed(iter/s)": 0.080933 + }, + { + "epoch": 1.4975900404485674, + "grad_norm": 0.5980255603790283, + "learning_rate": 5.423540624526843e-06, + "loss": 0.23713982105255127, + "memory(GiB)": 72.48, + "step": 3865, + "token_acc": 0.9194705073806251, + "train_speed(iter/s)": 0.080937 + }, + { + "epoch": 1.4995276963693172, + "grad_norm": 0.6113118529319763, + "learning_rate": 5.412896727361663e-06, + "loss": 0.24681100845336915, + "memory(GiB)": 72.48, + "step": 3870, + "token_acc": 0.9220164887146912, + "train_speed(iter/s)": 0.08094 + }, + { + "epoch": 1.501465352290067, + "grad_norm": 0.5499523878097534, + "learning_rate": 5.402250945888457e-06, + "loss": 0.2583901882171631, + "memory(GiB)": 72.48, + "step": 3875, + "token_acc": 0.9158980665436907, + "train_speed(iter/s)": 0.080942 + }, + { + "epoch": 1.503403008210817, + "grad_norm": 0.5971123576164246, + "learning_rate": 5.391603328690639e-06, + "loss": 0.23753485679626465, + "memory(GiB)": 72.48, + "step": 3880, + "token_acc": 0.9205142428296408, + "train_speed(iter/s)": 0.080938 + }, + { + "epoch": 1.505340664131567, + "grad_norm": 0.5578671097755432, + "learning_rate": 5.380953924359995e-06, + "loss": 0.24026894569396973, + "memory(GiB)": 72.48, + "step": 3885, + "token_acc": 0.9254251914710007, + "train_speed(iter/s)": 0.080932 + }, + { + "epoch": 1.5072783200523165, + "grad_norm": 0.6061117053031921, + "learning_rate": 5.370302781496471e-06, + "loss": 0.24199223518371582, + "memory(GiB)": 72.48, + "step": 3890, + "token_acc": 0.9121084317954992, + "train_speed(iter/s)": 0.080928 + }, + { + "epoch": 1.5092159759730666, + "grad_norm": 0.5695658922195435, + "learning_rate": 5.3596499487079466e-06, + "loss": 0.24618167877197267, + "memory(GiB)": 72.48, + "step": 3895, + "token_acc": 0.9203536977491962, + "train_speed(iter/s)": 0.08093 + }, + { + "epoch": 1.5111536318938166, + "grad_norm": 0.6070546507835388, + "learning_rate": 5.348995474610011e-06, + "loss": 0.23130958080291747, + "memory(GiB)": 72.48, + "step": 3900, + "token_acc": 0.9138054830287207, + "train_speed(iter/s)": 0.080925 + }, + { + "epoch": 1.5130912878145664, + "grad_norm": 0.5819863677024841, + "learning_rate": 5.338339407825746e-06, + "loss": 0.2405244827270508, + "memory(GiB)": 72.48, + "step": 3905, + "token_acc": 0.9126821826208329, + "train_speed(iter/s)": 0.080922 + }, + { + "epoch": 1.5150289437353162, + "grad_norm": 0.5835677981376648, + "learning_rate": 5.3276817969855e-06, + "loss": 0.24713582992553712, + "memory(GiB)": 72.48, + "step": 3910, + "token_acc": 0.9214152938914172, + "train_speed(iter/s)": 0.080929 + }, + { + "epoch": 1.5169665996560662, + "grad_norm": 0.5336800217628479, + "learning_rate": 5.317022690726669e-06, + "loss": 0.2448024034500122, + "memory(GiB)": 72.48, + "step": 3915, + "token_acc": 0.9104163794292017, + "train_speed(iter/s)": 0.080936 + }, + { + "epoch": 1.518904255576816, + "grad_norm": 0.5947490930557251, + "learning_rate": 5.306362137693473e-06, + "loss": 0.23219313621520996, + "memory(GiB)": 72.48, + "step": 3920, + "token_acc": 0.9211144568346514, + "train_speed(iter/s)": 0.08093 + }, + { + "epoch": 1.5208419114975658, + "grad_norm": 0.6150482296943665, + "learning_rate": 5.295700186536739e-06, + "loss": 0.24512362480163574, + "memory(GiB)": 72.48, + "step": 3925, + "token_acc": 0.9168956760705465, + "train_speed(iter/s)": 0.080932 + }, + { + "epoch": 1.5227795674183158, + "grad_norm": 0.5676717758178711, + "learning_rate": 5.2850368859136666e-06, + "loss": 0.23571786880493165, + "memory(GiB)": 72.48, + "step": 3930, + "token_acc": 0.9132034377207441, + "train_speed(iter/s)": 0.080932 + }, + { + "epoch": 1.5247172233390656, + "grad_norm": 0.6027458906173706, + "learning_rate": 5.274372284487619e-06, + "loss": 0.23536810874938965, + "memory(GiB)": 72.48, + "step": 3935, + "token_acc": 0.9179603899853925, + "train_speed(iter/s)": 0.080937 + }, + { + "epoch": 1.5266548792598154, + "grad_norm": 0.6239861249923706, + "learning_rate": 5.263706430927895e-06, + "loss": 0.22824177742004395, + "memory(GiB)": 72.48, + "step": 3940, + "token_acc": 0.9203759531831885, + "train_speed(iter/s)": 0.080937 + }, + { + "epoch": 1.5285925351805654, + "grad_norm": 0.6263760924339294, + "learning_rate": 5.253039373909507e-06, + "loss": 0.2348174571990967, + "memory(GiB)": 72.48, + "step": 3945, + "token_acc": 0.9155720509924157, + "train_speed(iter/s)": 0.080941 + }, + { + "epoch": 1.5305301911013152, + "grad_norm": 0.546149492263794, + "learning_rate": 5.242371162112958e-06, + "loss": 0.22561445236206054, + "memory(GiB)": 72.48, + "step": 3950, + "token_acc": 0.9153545911616829, + "train_speed(iter/s)": 0.080926 + }, + { + "epoch": 1.532467847022065, + "grad_norm": 0.5849190950393677, + "learning_rate": 5.2317018442240255e-06, + "loss": 0.23886852264404296, + "memory(GiB)": 72.48, + "step": 3955, + "token_acc": 0.9273893393592981, + "train_speed(iter/s)": 0.080925 + }, + { + "epoch": 1.534405502942815, + "grad_norm": 0.5801311731338501, + "learning_rate": 5.221031468933532e-06, + "loss": 0.24041290283203126, + "memory(GiB)": 72.48, + "step": 3960, + "token_acc": 0.9221912783589187, + "train_speed(iter/s)": 0.08093 + }, + { + "epoch": 1.5363431588635648, + "grad_norm": 0.5716031193733215, + "learning_rate": 5.210360084937125e-06, + "loss": 0.21993811130523683, + "memory(GiB)": 72.48, + "step": 3965, + "token_acc": 0.9108157099697886, + "train_speed(iter/s)": 0.08094 + }, + { + "epoch": 1.5382808147843146, + "grad_norm": 0.5501062870025635, + "learning_rate": 5.199687740935057e-06, + "loss": 0.23140015602111816, + "memory(GiB)": 72.48, + "step": 3970, + "token_acc": 0.9091976870256595, + "train_speed(iter/s)": 0.080951 + }, + { + "epoch": 1.5402184707050646, + "grad_norm": 0.5458846092224121, + "learning_rate": 5.189014485631957e-06, + "loss": 0.2395151138305664, + "memory(GiB)": 72.48, + "step": 3975, + "token_acc": 0.9074286778629745, + "train_speed(iter/s)": 0.080949 + }, + { + "epoch": 1.5421561266258146, + "grad_norm": 0.5497922301292419, + "learning_rate": 5.178340367736621e-06, + "loss": 0.24570670127868652, + "memory(GiB)": 72.48, + "step": 3980, + "token_acc": 0.9208731241473397, + "train_speed(iter/s)": 0.080946 + }, + { + "epoch": 1.5440937825465642, + "grad_norm": 0.5686230659484863, + "learning_rate": 5.167665435961774e-06, + "loss": 0.23753948211669923, + "memory(GiB)": 72.48, + "step": 3985, + "token_acc": 0.9165367101241345, + "train_speed(iter/s)": 0.080943 + }, + { + "epoch": 1.5460314384673142, + "grad_norm": 0.5919835567474365, + "learning_rate": 5.156989739023861e-06, + "loss": 0.2217550754547119, + "memory(GiB)": 72.48, + "step": 3990, + "token_acc": 0.9181872307638427, + "train_speed(iter/s)": 0.080948 + }, + { + "epoch": 1.5479690943880642, + "grad_norm": 0.5866130590438843, + "learning_rate": 5.146313325642814e-06, + "loss": 0.24096102714538575, + "memory(GiB)": 72.48, + "step": 3995, + "token_acc": 0.9132077771682818, + "train_speed(iter/s)": 0.080949 + }, + { + "epoch": 1.5499067503088138, + "grad_norm": 0.5747529864311218, + "learning_rate": 5.1356362445418395e-06, + "loss": 0.23785798549652098, + "memory(GiB)": 72.48, + "step": 4000, + "token_acc": 0.9139216157730223, + "train_speed(iter/s)": 0.080951 + }, + { + "epoch": 1.5518444062295638, + "grad_norm": 0.5678396821022034, + "learning_rate": 5.124958544447185e-06, + "loss": 0.2483994483947754, + "memory(GiB)": 72.48, + "step": 4005, + "token_acc": 0.9185373042256417, + "train_speed(iter/s)": 0.080956 + }, + { + "epoch": 1.5537820621503138, + "grad_norm": 0.5727476477622986, + "learning_rate": 5.1142802740879285e-06, + "loss": 0.2249526262283325, + "memory(GiB)": 72.48, + "step": 4010, + "token_acc": 0.9195504694031384, + "train_speed(iter/s)": 0.080967 + }, + { + "epoch": 1.5557197180710636, + "grad_norm": 0.5712579488754272, + "learning_rate": 5.103601482195748e-06, + "loss": 0.2431882858276367, + "memory(GiB)": 72.48, + "step": 4015, + "token_acc": 0.912086790912801, + "train_speed(iter/s)": 0.080964 + }, + { + "epoch": 1.5576573739918134, + "grad_norm": 0.5597922205924988, + "learning_rate": 5.0929222175047025e-06, + "loss": 0.23807837963104247, + "memory(GiB)": 72.48, + "step": 4020, + "token_acc": 0.9209720305610717, + "train_speed(iter/s)": 0.080966 + }, + { + "epoch": 1.5595950299125634, + "grad_norm": 0.6336641311645508, + "learning_rate": 5.082242528751008e-06, + "loss": 0.23821985721588135, + "memory(GiB)": 72.48, + "step": 4025, + "token_acc": 0.9157320502829261, + "train_speed(iter/s)": 0.080972 + }, + { + "epoch": 1.5615326858333132, + "grad_norm": 0.6377075910568237, + "learning_rate": 5.071562464672815e-06, + "loss": 0.24726173877716065, + "memory(GiB)": 72.48, + "step": 4030, + "token_acc": 0.9214040816326531, + "train_speed(iter/s)": 0.080961 + }, + { + "epoch": 1.563470341754063, + "grad_norm": 0.554710865020752, + "learning_rate": 5.060882074009988e-06, + "loss": 0.24027609825134277, + "memory(GiB)": 72.48, + "step": 4035, + "token_acc": 0.9306184489692517, + "train_speed(iter/s)": 0.080962 + }, + { + "epoch": 1.565407997674813, + "grad_norm": 0.5918537378311157, + "learning_rate": 5.050201405503883e-06, + "loss": 0.2343815803527832, + "memory(GiB)": 72.48, + "step": 4040, + "token_acc": 0.9282309322033898, + "train_speed(iter/s)": 0.080959 + }, + { + "epoch": 1.5673456535955628, + "grad_norm": 0.56691575050354, + "learning_rate": 5.039520507897121e-06, + "loss": 0.23046693801879883, + "memory(GiB)": 72.48, + "step": 4045, + "token_acc": 0.9246655031995347, + "train_speed(iter/s)": 0.08096 + }, + { + "epoch": 1.5692833095163126, + "grad_norm": 0.5462111234664917, + "learning_rate": 5.02883942993337e-06, + "loss": 0.23171257972717285, + "memory(GiB)": 72.48, + "step": 4050, + "token_acc": 0.9142313359528488, + "train_speed(iter/s)": 0.080955 + }, + { + "epoch": 1.5712209654370626, + "grad_norm": 0.6200662851333618, + "learning_rate": 5.0181582203571245e-06, + "loss": 0.24694461822509767, + "memory(GiB)": 72.48, + "step": 4055, + "token_acc": 0.9075337364830647, + "train_speed(iter/s)": 0.080954 + }, + { + "epoch": 1.5731586213578124, + "grad_norm": 0.5647085309028625, + "learning_rate": 5.007476927913473e-06, + "loss": 0.23658227920532227, + "memory(GiB)": 72.48, + "step": 4060, + "token_acc": 0.9197628187551412, + "train_speed(iter/s)": 0.080952 + }, + { + "epoch": 1.5750962772785622, + "grad_norm": 0.5532594919204712, + "learning_rate": 4.996795601347885e-06, + "loss": 0.2439354658126831, + "memory(GiB)": 72.48, + "step": 4065, + "token_acc": 0.9192622409924772, + "train_speed(iter/s)": 0.080954 + }, + { + "epoch": 1.5770339331993122, + "grad_norm": 0.5520694255828857, + "learning_rate": 4.9861142894059906e-06, + "loss": 0.23830606937408447, + "memory(GiB)": 72.48, + "step": 4070, + "token_acc": 0.923468251021166, + "train_speed(iter/s)": 0.080955 + }, + { + "epoch": 1.578971589120062, + "grad_norm": 0.5831983089447021, + "learning_rate": 4.975433040833344e-06, + "loss": 0.24065241813659669, + "memory(GiB)": 72.48, + "step": 4075, + "token_acc": 0.9181812322763598, + "train_speed(iter/s)": 0.080952 + }, + { + "epoch": 1.5809092450408118, + "grad_norm": 0.6240581274032593, + "learning_rate": 4.964751904375217e-06, + "loss": 0.24529509544372557, + "memory(GiB)": 72.48, + "step": 4080, + "token_acc": 0.9190409564996184, + "train_speed(iter/s)": 0.080944 + }, + { + "epoch": 1.5828469009615618, + "grad_norm": 0.5469648241996765, + "learning_rate": 4.9540709287763685e-06, + "loss": 0.23772037029266357, + "memory(GiB)": 72.48, + "step": 4085, + "token_acc": 0.9185966835685898, + "train_speed(iter/s)": 0.080942 + }, + { + "epoch": 1.5847845568823116, + "grad_norm": 0.5892153978347778, + "learning_rate": 4.943390162780823e-06, + "loss": 0.25028512477874754, + "memory(GiB)": 72.48, + "step": 4090, + "token_acc": 0.9149353318786959, + "train_speed(iter/s)": 0.080945 + }, + { + "epoch": 1.5867222128030614, + "grad_norm": 0.6053619384765625, + "learning_rate": 4.932709655131646e-06, + "loss": 0.2440941572189331, + "memory(GiB)": 72.48, + "step": 4095, + "token_acc": 0.9105009493208704, + "train_speed(iter/s)": 0.080939 + }, + { + "epoch": 1.5886598687238114, + "grad_norm": 0.566652238368988, + "learning_rate": 4.922029454570727e-06, + "loss": 0.2509720802307129, + "memory(GiB)": 72.48, + "step": 4100, + "token_acc": 0.9231726836208429, + "train_speed(iter/s)": 0.080927 + }, + { + "epoch": 1.5905975246445614, + "grad_norm": 0.5329635739326477, + "learning_rate": 4.911349609838554e-06, + "loss": 0.23644027709960938, + "memory(GiB)": 72.48, + "step": 4105, + "token_acc": 0.9231957270182438, + "train_speed(iter/s)": 0.080922 + }, + { + "epoch": 1.592535180565311, + "grad_norm": 0.6006894111633301, + "learning_rate": 4.900670169673989e-06, + "loss": 0.24374380111694335, + "memory(GiB)": 72.48, + "step": 4110, + "token_acc": 0.919171473872415, + "train_speed(iter/s)": 0.080918 + }, + { + "epoch": 1.594472836486061, + "grad_norm": 0.6029812097549438, + "learning_rate": 4.88999118281405e-06, + "loss": 0.23951337337493897, + "memory(GiB)": 72.48, + "step": 4115, + "token_acc": 0.9179036655558922, + "train_speed(iter/s)": 0.080914 + }, + { + "epoch": 1.596410492406811, + "grad_norm": 0.5867647528648376, + "learning_rate": 4.879312697993685e-06, + "loss": 0.24152259826660155, + "memory(GiB)": 72.48, + "step": 4120, + "token_acc": 0.915404528818401, + "train_speed(iter/s)": 0.080921 + }, + { + "epoch": 1.5983481483275608, + "grad_norm": 0.5779484510421753, + "learning_rate": 4.868634763945549e-06, + "loss": 0.2460160255432129, + "memory(GiB)": 72.48, + "step": 4125, + "token_acc": 0.9114298024250631, + "train_speed(iter/s)": 0.08092 + }, + { + "epoch": 1.6002858042483106, + "grad_norm": 0.5900539755821228, + "learning_rate": 4.857957429399788e-06, + "loss": 0.2371826171875, + "memory(GiB)": 72.48, + "step": 4130, + "token_acc": 0.917403008709422, + "train_speed(iter/s)": 0.080927 + }, + { + "epoch": 1.6022234601690606, + "grad_norm": 0.6000462770462036, + "learning_rate": 4.847280743083812e-06, + "loss": 0.2404171943664551, + "memory(GiB)": 72.48, + "step": 4135, + "token_acc": 0.9210667922243896, + "train_speed(iter/s)": 0.080932 + }, + { + "epoch": 1.6041611160898104, + "grad_norm": 0.5760965347290039, + "learning_rate": 4.836604753722065e-06, + "loss": 0.23797254562377929, + "memory(GiB)": 72.48, + "step": 4140, + "token_acc": 0.9124895437874011, + "train_speed(iter/s)": 0.080926 + }, + { + "epoch": 1.6060987720105602, + "grad_norm": 0.5743480920791626, + "learning_rate": 4.825929510035818e-06, + "loss": 0.23742265701293946, + "memory(GiB)": 72.48, + "step": 4145, + "token_acc": 0.9258983303417211, + "train_speed(iter/s)": 0.080923 + }, + { + "epoch": 1.6080364279313102, + "grad_norm": 0.5980106592178345, + "learning_rate": 4.815255060742938e-06, + "loss": 0.23633360862731934, + "memory(GiB)": 72.48, + "step": 4150, + "token_acc": 0.9170678477437576, + "train_speed(iter/s)": 0.080921 + }, + { + "epoch": 1.60997408385206, + "grad_norm": 0.5759811997413635, + "learning_rate": 4.804581454557663e-06, + "loss": 0.24039463996887206, + "memory(GiB)": 72.48, + "step": 4155, + "token_acc": 0.9208314753198642, + "train_speed(iter/s)": 0.080917 + }, + { + "epoch": 1.6119117397728098, + "grad_norm": 0.5623453259468079, + "learning_rate": 4.793908740190388e-06, + "loss": 0.2390963077545166, + "memory(GiB)": 72.48, + "step": 4160, + "token_acc": 0.9164110429447853, + "train_speed(iter/s)": 0.080919 + }, + { + "epoch": 1.6138493956935598, + "grad_norm": 0.5763586759567261, + "learning_rate": 4.783236966347436e-06, + "loss": 0.2290616512298584, + "memory(GiB)": 72.48, + "step": 4165, + "token_acc": 0.9196206818694965, + "train_speed(iter/s)": 0.080925 + }, + { + "epoch": 1.6157870516143096, + "grad_norm": 0.5638933777809143, + "learning_rate": 4.772566181730835e-06, + "loss": 0.23402900695800782, + "memory(GiB)": 72.48, + "step": 4170, + "token_acc": 0.9164476459327039, + "train_speed(iter/s)": 0.080925 + }, + { + "epoch": 1.6177247075350594, + "grad_norm": 0.5336311459541321, + "learning_rate": 4.7618964350381054e-06, + "loss": 0.2433910369873047, + "memory(GiB)": 72.48, + "step": 4175, + "token_acc": 0.9133335336719055, + "train_speed(iter/s)": 0.080928 + }, + { + "epoch": 1.6196623634558094, + "grad_norm": 0.5347714424133301, + "learning_rate": 4.751227774962022e-06, + "loss": 0.2435020923614502, + "memory(GiB)": 72.48, + "step": 4180, + "token_acc": 0.9224319358610322, + "train_speed(iter/s)": 0.080923 + }, + { + "epoch": 1.6216000193765592, + "grad_norm": 0.5854753255844116, + "learning_rate": 4.74056025019041e-06, + "loss": 0.23801109790802003, + "memory(GiB)": 72.48, + "step": 4185, + "token_acc": 0.9016558105557922, + "train_speed(iter/s)": 0.080927 + }, + { + "epoch": 1.623537675297309, + "grad_norm": 0.5887041687965393, + "learning_rate": 4.729893909405905e-06, + "loss": 0.23980984687805176, + "memory(GiB)": 72.48, + "step": 4190, + "token_acc": 0.9203709837694601, + "train_speed(iter/s)": 0.080931 + }, + { + "epoch": 1.625475331218059, + "grad_norm": 0.5586225986480713, + "learning_rate": 4.719228801285748e-06, + "loss": 0.23947548866271973, + "memory(GiB)": 72.48, + "step": 4195, + "token_acc": 0.9210292268531297, + "train_speed(iter/s)": 0.080926 + }, + { + "epoch": 1.6274129871388088, + "grad_norm": 0.5300182700157166, + "learning_rate": 4.708564974501545e-06, + "loss": 0.23387060165405274, + "memory(GiB)": 72.48, + "step": 4200, + "token_acc": 0.9218061674008811, + "train_speed(iter/s)": 0.080915 + }, + { + "epoch": 1.6293506430595586, + "grad_norm": 0.5876345634460449, + "learning_rate": 4.69790247771906e-06, + "loss": 0.22723779678344727, + "memory(GiB)": 72.48, + "step": 4205, + "token_acc": 0.9254710351059201, + "train_speed(iter/s)": 0.080918 + }, + { + "epoch": 1.6312882989803086, + "grad_norm": 0.5677121877670288, + "learning_rate": 4.687241359597988e-06, + "loss": 0.23792381286621095, + "memory(GiB)": 72.48, + "step": 4210, + "token_acc": 0.9219298563103985, + "train_speed(iter/s)": 0.080922 + }, + { + "epoch": 1.6332259549010586, + "grad_norm": 0.6215115189552307, + "learning_rate": 4.676581668791731e-06, + "loss": 0.23932018280029296, + "memory(GiB)": 72.48, + "step": 4215, + "token_acc": 0.9241675418545746, + "train_speed(iter/s)": 0.08092 + }, + { + "epoch": 1.6351636108218082, + "grad_norm": 0.590103268623352, + "learning_rate": 4.665923453947176e-06, + "loss": 0.22915961742401122, + "memory(GiB)": 72.48, + "step": 4220, + "token_acc": 0.9112952111537684, + "train_speed(iter/s)": 0.080928 + }, + { + "epoch": 1.6371012667425582, + "grad_norm": 0.585893452167511, + "learning_rate": 4.655266763704476e-06, + "loss": 0.2472785472869873, + "memory(GiB)": 72.48, + "step": 4225, + "token_acc": 0.9150163544454356, + "train_speed(iter/s)": 0.08092 + }, + { + "epoch": 1.6390389226633082, + "grad_norm": 0.5503528714179993, + "learning_rate": 4.644611646696826e-06, + "loss": 0.25024340152740476, + "memory(GiB)": 72.48, + "step": 4230, + "token_acc": 0.9093188977582041, + "train_speed(iter/s)": 0.080923 + }, + { + "epoch": 1.640976578584058, + "grad_norm": 0.5937880873680115, + "learning_rate": 4.633958151550242e-06, + "loss": 0.24387547969818116, + "memory(GiB)": 72.48, + "step": 4235, + "token_acc": 0.9265440293318161, + "train_speed(iter/s)": 0.080922 + }, + { + "epoch": 1.6429142345048078, + "grad_norm": 0.5671422481536865, + "learning_rate": 4.623306326883336e-06, + "loss": 0.23435800075531005, + "memory(GiB)": 72.48, + "step": 4240, + "token_acc": 0.9159615384615385, + "train_speed(iter/s)": 0.080924 + }, + { + "epoch": 1.6448518904255578, + "grad_norm": 0.6274139285087585, + "learning_rate": 4.612656221307097e-06, + "loss": 0.2379068374633789, + "memory(GiB)": 72.48, + "step": 4245, + "token_acc": 0.9165053473390244, + "train_speed(iter/s)": 0.080925 + }, + { + "epoch": 1.6467895463463076, + "grad_norm": 0.5559883713722229, + "learning_rate": 4.602007883424673e-06, + "loss": 0.24304821491241455, + "memory(GiB)": 72.48, + "step": 4250, + "token_acc": 0.9144308877309946, + "train_speed(iter/s)": 0.080918 + }, + { + "epoch": 1.6487272022670574, + "grad_norm": 0.6241846084594727, + "learning_rate": 4.59136136183114e-06, + "loss": 0.2329557418823242, + "memory(GiB)": 72.48, + "step": 4255, + "token_acc": 0.920522930068479, + "train_speed(iter/s)": 0.08092 + }, + { + "epoch": 1.6506648581878074, + "grad_norm": 0.5585425496101379, + "learning_rate": 4.580716705113285e-06, + "loss": 0.2362123489379883, + "memory(GiB)": 72.48, + "step": 4260, + "token_acc": 0.920124514617631, + "train_speed(iter/s)": 0.080922 + }, + { + "epoch": 1.6526025141085572, + "grad_norm": 0.5571277737617493, + "learning_rate": 4.570073961849388e-06, + "loss": 0.24444966316223143, + "memory(GiB)": 72.48, + "step": 4265, + "token_acc": 0.9185006185092489, + "train_speed(iter/s)": 0.080927 + }, + { + "epoch": 1.654540170029307, + "grad_norm": 0.5879797339439392, + "learning_rate": 4.559433180608994e-06, + "loss": 0.23404364585876464, + "memory(GiB)": 72.48, + "step": 4270, + "token_acc": 0.9163342705813117, + "train_speed(iter/s)": 0.080932 + }, + { + "epoch": 1.656477825950057, + "grad_norm": 0.5947754979133606, + "learning_rate": 4.548794409952697e-06, + "loss": 0.24070556163787843, + "memory(GiB)": 72.48, + "step": 4275, + "token_acc": 0.9180384736676127, + "train_speed(iter/s)": 0.080938 + }, + { + "epoch": 1.6584154818708068, + "grad_norm": 0.553914487361908, + "learning_rate": 4.538157698431911e-06, + "loss": 0.23187220096588135, + "memory(GiB)": 72.48, + "step": 4280, + "token_acc": 0.9252291530776889, + "train_speed(iter/s)": 0.080932 + }, + { + "epoch": 1.6603531377915566, + "grad_norm": 0.5857269763946533, + "learning_rate": 4.527523094588655e-06, + "loss": 0.24459095001220704, + "memory(GiB)": 72.48, + "step": 4285, + "token_acc": 0.911944202266783, + "train_speed(iter/s)": 0.080933 + }, + { + "epoch": 1.6622907937123066, + "grad_norm": 0.6173809766769409, + "learning_rate": 4.516890646955331e-06, + "loss": 0.23617539405822754, + "memory(GiB)": 72.48, + "step": 4290, + "token_acc": 0.9265784625690839, + "train_speed(iter/s)": 0.080943 + }, + { + "epoch": 1.6642284496330564, + "grad_norm": 0.5930407047271729, + "learning_rate": 4.506260404054499e-06, + "loss": 0.23567602634429932, + "memory(GiB)": 72.48, + "step": 4295, + "token_acc": 0.9236125473974491, + "train_speed(iter/s)": 0.080938 + }, + { + "epoch": 1.6661661055538062, + "grad_norm": 0.536730945110321, + "learning_rate": 4.495632414398659e-06, + "loss": 0.23399744033813477, + "memory(GiB)": 72.48, + "step": 4300, + "token_acc": 0.917054956277526, + "train_speed(iter/s)": 0.080931 + }, + { + "epoch": 1.6681037614745562, + "grad_norm": 0.554707944393158, + "learning_rate": 4.485006726490025e-06, + "loss": 0.2406824827194214, + "memory(GiB)": 72.48, + "step": 4305, + "token_acc": 0.915921518173046, + "train_speed(iter/s)": 0.080929 + }, + { + "epoch": 1.670041417395306, + "grad_norm": 0.5852320194244385, + "learning_rate": 4.474383388820308e-06, + "loss": 0.23379006385803222, + "memory(GiB)": 72.48, + "step": 4310, + "token_acc": 0.9103054104281356, + "train_speed(iter/s)": 0.080927 + }, + { + "epoch": 1.6719790733160558, + "grad_norm": 0.6317037343978882, + "learning_rate": 4.463762449870497e-06, + "loss": 0.23294405937194823, + "memory(GiB)": 72.48, + "step": 4315, + "token_acc": 0.9220221695393093, + "train_speed(iter/s)": 0.080941 + }, + { + "epoch": 1.6739167292368058, + "grad_norm": 0.5779034495353699, + "learning_rate": 4.4531439581106295e-06, + "loss": 0.24557096958160402, + "memory(GiB)": 72.48, + "step": 4320, + "token_acc": 0.9234873129472999, + "train_speed(iter/s)": 0.080942 + }, + { + "epoch": 1.6758543851575558, + "grad_norm": 0.593512237071991, + "learning_rate": 4.442527961999575e-06, + "loss": 0.23549408912658693, + "memory(GiB)": 72.48, + "step": 4325, + "token_acc": 0.9139355455414447, + "train_speed(iter/s)": 0.080943 + }, + { + "epoch": 1.6777920410783054, + "grad_norm": 0.6004024147987366, + "learning_rate": 4.431914509984815e-06, + "loss": 0.24675235748291016, + "memory(GiB)": 72.48, + "step": 4330, + "token_acc": 0.9207930790722725, + "train_speed(iter/s)": 0.080946 + }, + { + "epoch": 1.6797296969990554, + "grad_norm": 0.6113540530204773, + "learning_rate": 4.421303650502224e-06, + "loss": 0.24028847217559815, + "memory(GiB)": 72.48, + "step": 4335, + "token_acc": 0.9078976291411525, + "train_speed(iter/s)": 0.080948 + }, + { + "epoch": 1.6816673529198054, + "grad_norm": 0.5542721152305603, + "learning_rate": 4.410695431975839e-06, + "loss": 0.23191099166870116, + "memory(GiB)": 72.48, + "step": 4340, + "token_acc": 0.9157742044998611, + "train_speed(iter/s)": 0.080951 + }, + { + "epoch": 1.683605008840555, + "grad_norm": 0.5804362297058105, + "learning_rate": 4.400089902817649e-06, + "loss": 0.23258790969848633, + "memory(GiB)": 72.48, + "step": 4345, + "token_acc": 0.9184183581280492, + "train_speed(iter/s)": 0.080953 + }, + { + "epoch": 1.685542664761305, + "grad_norm": 0.6177292466163635, + "learning_rate": 4.389487111427368e-06, + "loss": 0.2363499164581299, + "memory(GiB)": 72.48, + "step": 4350, + "token_acc": 0.9231600270087779, + "train_speed(iter/s)": 0.080956 + }, + { + "epoch": 1.687480320682055, + "grad_norm": 0.5826561450958252, + "learning_rate": 4.378887106192218e-06, + "loss": 0.2349027156829834, + "memory(GiB)": 72.48, + "step": 4355, + "token_acc": 0.9184171214292247, + "train_speed(iter/s)": 0.080969 + }, + { + "epoch": 1.6894179766028048, + "grad_norm": 0.5665357112884521, + "learning_rate": 4.368289935486703e-06, + "loss": 0.23099970817565918, + "memory(GiB)": 72.48, + "step": 4360, + "token_acc": 0.9163845394272817, + "train_speed(iter/s)": 0.080962 + }, + { + "epoch": 1.6913556325235546, + "grad_norm": 0.6047654747962952, + "learning_rate": 4.357695647672392e-06, + "loss": 0.23423008918762206, + "memory(GiB)": 72.48, + "step": 4365, + "token_acc": 0.9242815165197449, + "train_speed(iter/s)": 0.080962 + }, + { + "epoch": 1.6932932884443046, + "grad_norm": 0.5313150882720947, + "learning_rate": 4.347104291097698e-06, + "loss": 0.241463041305542, + "memory(GiB)": 72.48, + "step": 4370, + "token_acc": 0.9196163157733444, + "train_speed(iter/s)": 0.080961 + }, + { + "epoch": 1.6952309443650544, + "grad_norm": 0.5334797501564026, + "learning_rate": 4.3365159140976585e-06, + "loss": 0.2367786645889282, + "memory(GiB)": 72.48, + "step": 4375, + "token_acc": 0.9231794649223425, + "train_speed(iter/s)": 0.080964 + }, + { + "epoch": 1.6971686002858042, + "grad_norm": 0.5730844736099243, + "learning_rate": 4.325930564993713e-06, + "loss": 0.24104855060577393, + "memory(GiB)": 72.48, + "step": 4380, + "token_acc": 0.9082375734901122, + "train_speed(iter/s)": 0.080961 + }, + { + "epoch": 1.6991062562065542, + "grad_norm": 0.550591766834259, + "learning_rate": 4.315348292093477e-06, + "loss": 0.23299641609191896, + "memory(GiB)": 72.48, + "step": 4385, + "token_acc": 0.9203085632523562, + "train_speed(iter/s)": 0.080961 + }, + { + "epoch": 1.701043912127304, + "grad_norm": 0.5867196321487427, + "learning_rate": 4.3047691436905345e-06, + "loss": 0.2483926773071289, + "memory(GiB)": 72.48, + "step": 4390, + "token_acc": 0.9120561220799884, + "train_speed(iter/s)": 0.080965 + }, + { + "epoch": 1.7029815680480538, + "grad_norm": 0.5573843121528625, + "learning_rate": 4.29419316806421e-06, + "loss": 0.23110456466674806, + "memory(GiB)": 72.48, + "step": 4395, + "token_acc": 0.9250759034964579, + "train_speed(iter/s)": 0.080964 + }, + { + "epoch": 1.7049192239688038, + "grad_norm": 0.6066656708717346, + "learning_rate": 4.283620413479343e-06, + "loss": 0.23136162757873535, + "memory(GiB)": 72.48, + "step": 4400, + "token_acc": 0.9227839111627607, + "train_speed(iter/s)": 0.080971 + }, + { + "epoch": 1.7068568798895536, + "grad_norm": 0.6089814901351929, + "learning_rate": 4.273050928186078e-06, + "loss": 0.24389944076538086, + "memory(GiB)": 72.48, + "step": 4405, + "token_acc": 0.9158227439850352, + "train_speed(iter/s)": 0.080971 + }, + { + "epoch": 1.7087945358103034, + "grad_norm": 0.5605995655059814, + "learning_rate": 4.26248476041964e-06, + "loss": 0.2332322120666504, + "memory(GiB)": 72.48, + "step": 4410, + "token_acc": 0.9247135842880524, + "train_speed(iter/s)": 0.080961 + }, + { + "epoch": 1.7107321917310534, + "grad_norm": 0.5949825644493103, + "learning_rate": 4.2519219584001106e-06, + "loss": 0.24070992469787597, + "memory(GiB)": 72.48, + "step": 4415, + "token_acc": 0.9084620570390128, + "train_speed(iter/s)": 0.080961 + }, + { + "epoch": 1.7126698476518032, + "grad_norm": 0.609623908996582, + "learning_rate": 4.241362570332216e-06, + "loss": 0.23741843700408935, + "memory(GiB)": 72.48, + "step": 4420, + "token_acc": 0.9277577279146398, + "train_speed(iter/s)": 0.080957 + }, + { + "epoch": 1.714607503572553, + "grad_norm": 0.5393386483192444, + "learning_rate": 4.230806644405096e-06, + "loss": 0.22986218929290772, + "memory(GiB)": 72.48, + "step": 4425, + "token_acc": 0.916211918051701, + "train_speed(iter/s)": 0.080956 + }, + { + "epoch": 1.716545159493303, + "grad_norm": 0.5796214938163757, + "learning_rate": 4.220254228792098e-06, + "loss": 0.23478846549987792, + "memory(GiB)": 72.48, + "step": 4430, + "token_acc": 0.9145027540705768, + "train_speed(iter/s)": 0.080961 + }, + { + "epoch": 1.718482815414053, + "grad_norm": 0.5430638790130615, + "learning_rate": 4.209705371650544e-06, + "loss": 0.23895716667175293, + "memory(GiB)": 72.48, + "step": 4435, + "token_acc": 0.9104987196683332, + "train_speed(iter/s)": 0.080964 + }, + { + "epoch": 1.7204204713348026, + "grad_norm": 0.5694401264190674, + "learning_rate": 4.19916012112152e-06, + "loss": 0.21948375701904296, + "memory(GiB)": 72.48, + "step": 4440, + "token_acc": 0.9201905284079227, + "train_speed(iter/s)": 0.080967 + }, + { + "epoch": 1.7223581272555526, + "grad_norm": 0.5556661486625671, + "learning_rate": 4.188618525329648e-06, + "loss": 0.2423006057739258, + "memory(GiB)": 72.48, + "step": 4445, + "token_acc": 0.9202016516865112, + "train_speed(iter/s)": 0.080973 + }, + { + "epoch": 1.7242957831763026, + "grad_norm": 0.5384371876716614, + "learning_rate": 4.178080632382875e-06, + "loss": 0.2399946928024292, + "memory(GiB)": 72.48, + "step": 4450, + "token_acc": 0.9170499120789937, + "train_speed(iter/s)": 0.080967 + }, + { + "epoch": 1.7262334390970522, + "grad_norm": 0.5373743772506714, + "learning_rate": 4.167546490372251e-06, + "loss": 0.2229823350906372, + "memory(GiB)": 72.48, + "step": 4455, + "token_acc": 0.9276191068154271, + "train_speed(iter/s)": 0.080973 + }, + { + "epoch": 1.7281710950178022, + "grad_norm": 0.599141538143158, + "learning_rate": 4.157016147371704e-06, + "loss": 0.23781347274780273, + "memory(GiB)": 72.48, + "step": 4460, + "token_acc": 0.9221821756225426, + "train_speed(iter/s)": 0.080972 + }, + { + "epoch": 1.7301087509385522, + "grad_norm": 0.6435913443565369, + "learning_rate": 4.146489651437826e-06, + "loss": 0.23757214546203614, + "memory(GiB)": 72.48, + "step": 4465, + "token_acc": 0.9272176631496678, + "train_speed(iter/s)": 0.080965 + }, + { + "epoch": 1.732046406859302, + "grad_norm": 0.586665689945221, + "learning_rate": 4.135967050609655e-06, + "loss": 0.25291759967803956, + "memory(GiB)": 72.48, + "step": 4470, + "token_acc": 0.9171209800918836, + "train_speed(iter/s)": 0.080956 + }, + { + "epoch": 1.7339840627800518, + "grad_norm": 0.556600034236908, + "learning_rate": 4.12544839290845e-06, + "loss": 0.24632768630981444, + "memory(GiB)": 72.48, + "step": 4475, + "token_acc": 0.9216685429874978, + "train_speed(iter/s)": 0.080958 + }, + { + "epoch": 1.7359217187008018, + "grad_norm": 0.5605632066726685, + "learning_rate": 4.114933726337477e-06, + "loss": 0.23179025650024415, + "memory(GiB)": 72.48, + "step": 4480, + "token_acc": 0.9092191080557025, + "train_speed(iter/s)": 0.080954 + }, + { + "epoch": 1.7378593746215516, + "grad_norm": 0.6133179068565369, + "learning_rate": 4.1044230988817865e-06, + "loss": 0.23435473442077637, + "memory(GiB)": 72.48, + "step": 4485, + "token_acc": 0.9098795180722892, + "train_speed(iter/s)": 0.080956 + }, + { + "epoch": 1.7397970305423014, + "grad_norm": 0.5663028359413147, + "learning_rate": 4.093916558507996e-06, + "loss": 0.23801469802856445, + "memory(GiB)": 72.48, + "step": 4490, + "token_acc": 0.9059203301274585, + "train_speed(iter/s)": 0.080963 + }, + { + "epoch": 1.7417346864630514, + "grad_norm": 0.5228857398033142, + "learning_rate": 4.083414153164073e-06, + "loss": 0.23023405075073242, + "memory(GiB)": 72.48, + "step": 4495, + "token_acc": 0.9179573892458572, + "train_speed(iter/s)": 0.080961 + }, + { + "epoch": 1.7436723423838012, + "grad_norm": 0.5712965726852417, + "learning_rate": 4.072915930779112e-06, + "loss": 0.24636218547821045, + "memory(GiB)": 72.48, + "step": 4500, + "token_acc": 0.9129208140493248, + "train_speed(iter/s)": 0.08096 + }, + { + "epoch": 1.745609998304551, + "grad_norm": 0.6002662181854248, + "learning_rate": 4.062421939263123e-06, + "loss": 0.23319830894470214, + "memory(GiB)": 72.48, + "step": 4505, + "token_acc": 0.9240683564299562, + "train_speed(iter/s)": 0.080957 + }, + { + "epoch": 1.747547654225301, + "grad_norm": 0.5801786780357361, + "learning_rate": 4.051932226506797e-06, + "loss": 0.23109970092773438, + "memory(GiB)": 72.48, + "step": 4510, + "token_acc": 0.9139643057434104, + "train_speed(iter/s)": 0.080955 + }, + { + "epoch": 1.7494853101460508, + "grad_norm": 0.5877240300178528, + "learning_rate": 4.041446840381309e-06, + "loss": 0.23986215591430665, + "memory(GiB)": 72.48, + "step": 4515, + "token_acc": 0.9096103267934339, + "train_speed(iter/s)": 0.080949 + }, + { + "epoch": 1.7514229660668006, + "grad_norm": 0.6062881946563721, + "learning_rate": 4.030965828738085e-06, + "loss": 0.23394384384155273, + "memory(GiB)": 72.48, + "step": 4520, + "token_acc": 0.9261261870163225, + "train_speed(iter/s)": 0.080948 + }, + { + "epoch": 1.7533606219875506, + "grad_norm": 0.5838972330093384, + "learning_rate": 4.020489239408586e-06, + "loss": 0.23096508979797364, + "memory(GiB)": 72.48, + "step": 4525, + "token_acc": 0.9149048625792812, + "train_speed(iter/s)": 0.08095 + }, + { + "epoch": 1.7552982779083004, + "grad_norm": 0.5489374995231628, + "learning_rate": 4.010017120204095e-06, + "loss": 0.23239753246307374, + "memory(GiB)": 72.48, + "step": 4530, + "token_acc": 0.91888136800855, + "train_speed(iter/s)": 0.080952 + }, + { + "epoch": 1.7572359338290502, + "grad_norm": 0.5938902497291565, + "learning_rate": 3.999549518915491e-06, + "loss": 0.23465304374694823, + "memory(GiB)": 72.48, + "step": 4535, + "token_acc": 0.9183352306971138, + "train_speed(iter/s)": 0.08096 + }, + { + "epoch": 1.7591735897498002, + "grad_norm": 0.6340152025222778, + "learning_rate": 3.989086483313039e-06, + "loss": 0.24967737197875978, + "memory(GiB)": 72.48, + "step": 4540, + "token_acc": 0.9101321585903084, + "train_speed(iter/s)": 0.080967 + }, + { + "epoch": 1.7611112456705502, + "grad_norm": 0.5557016134262085, + "learning_rate": 3.978628061146161e-06, + "loss": 0.2475881338119507, + "memory(GiB)": 72.48, + "step": 4545, + "token_acc": 0.9139274296179036, + "train_speed(iter/s)": 0.080972 + }, + { + "epoch": 1.7630489015912998, + "grad_norm": 0.5651369690895081, + "learning_rate": 3.968174300143234e-06, + "loss": 0.23720135688781738, + "memory(GiB)": 72.48, + "step": 4550, + "token_acc": 0.9207509948641203, + "train_speed(iter/s)": 0.080966 + }, + { + "epoch": 1.7649865575120498, + "grad_norm": 0.5669804811477661, + "learning_rate": 3.957725248011356e-06, + "loss": 0.24074983596801758, + "memory(GiB)": 72.48, + "step": 4555, + "token_acc": 0.925565985643291, + "train_speed(iter/s)": 0.080965 + }, + { + "epoch": 1.7669242134327998, + "grad_norm": 0.538011372089386, + "learning_rate": 3.94728095243614e-06, + "loss": 0.22811007499694824, + "memory(GiB)": 72.48, + "step": 4560, + "token_acc": 0.9120666521047212, + "train_speed(iter/s)": 0.080967 + }, + { + "epoch": 1.7688618693535494, + "grad_norm": 0.6247624754905701, + "learning_rate": 3.93684146108149e-06, + "loss": 0.24950284957885743, + "memory(GiB)": 72.48, + "step": 4565, + "token_acc": 0.9151660309293592, + "train_speed(iter/s)": 0.080968 + }, + { + "epoch": 1.7707995252742994, + "grad_norm": 0.5420736074447632, + "learning_rate": 3.926406821589383e-06, + "loss": 0.2388762950897217, + "memory(GiB)": 72.48, + "step": 4570, + "token_acc": 0.9174054909198774, + "train_speed(iter/s)": 0.08097 + }, + { + "epoch": 1.7727371811950494, + "grad_norm": 0.5523030757904053, + "learning_rate": 3.915977081579658e-06, + "loss": 0.2391650438308716, + "memory(GiB)": 72.48, + "step": 4575, + "token_acc": 0.9292837420821829, + "train_speed(iter/s)": 0.080971 + }, + { + "epoch": 1.7746748371157992, + "grad_norm": 0.6097285151481628, + "learning_rate": 3.905552288649792e-06, + "loss": 0.23768877983093262, + "memory(GiB)": 72.48, + "step": 4580, + "token_acc": 0.9200933400605449, + "train_speed(iter/s)": 0.080976 + }, + { + "epoch": 1.776612493036549, + "grad_norm": 0.6126767992973328, + "learning_rate": 3.895132490374686e-06, + "loss": 0.2507157802581787, + "memory(GiB)": 72.48, + "step": 4585, + "token_acc": 0.921820542609171, + "train_speed(iter/s)": 0.08097 + }, + { + "epoch": 1.778550148957299, + "grad_norm": 0.5675840973854065, + "learning_rate": 3.884717734306448e-06, + "loss": 0.22835259437561034, + "memory(GiB)": 72.48, + "step": 4590, + "token_acc": 0.9248906174039864, + "train_speed(iter/s)": 0.080974 + }, + { + "epoch": 1.7804878048780488, + "grad_norm": 0.5561676621437073, + "learning_rate": 3.8743080679741735e-06, + "loss": 0.23788776397705078, + "memory(GiB)": 72.48, + "step": 4595, + "token_acc": 0.9150849505935879, + "train_speed(iter/s)": 0.080976 + }, + { + "epoch": 1.7824254607987986, + "grad_norm": 0.5253614187240601, + "learning_rate": 3.8639035388837335e-06, + "loss": 0.22025654315948487, + "memory(GiB)": 72.48, + "step": 4600, + "token_acc": 0.917173455160343, + "train_speed(iter/s)": 0.080977 + }, + { + "epoch": 1.7843631167195486, + "grad_norm": 0.5718342661857605, + "learning_rate": 3.853504194517551e-06, + "loss": 0.24571945667266845, + "memory(GiB)": 72.48, + "step": 4605, + "token_acc": 0.9146592199343296, + "train_speed(iter/s)": 0.08098 + }, + { + "epoch": 1.7863007726402984, + "grad_norm": 0.5782023072242737, + "learning_rate": 3.843110082334388e-06, + "loss": 0.2297668933868408, + "memory(GiB)": 72.48, + "step": 4610, + "token_acc": 0.9213476319227147, + "train_speed(iter/s)": 0.080991 + }, + { + "epoch": 1.7882384285610482, + "grad_norm": 0.6311032772064209, + "learning_rate": 3.832721249769132e-06, + "loss": 0.237768816947937, + "memory(GiB)": 72.48, + "step": 4615, + "token_acc": 0.9166639987193853, + "train_speed(iter/s)": 0.080997 + }, + { + "epoch": 1.7901760844817982, + "grad_norm": 0.5533443093299866, + "learning_rate": 3.8223377442325744e-06, + "loss": 0.2349649429321289, + "memory(GiB)": 72.48, + "step": 4620, + "token_acc": 0.9282687009959737, + "train_speed(iter/s)": 0.080996 + }, + { + "epoch": 1.792113740402548, + "grad_norm": 0.5883183479309082, + "learning_rate": 3.811959613111197e-06, + "loss": 0.23518748283386232, + "memory(GiB)": 72.48, + "step": 4625, + "token_acc": 0.9147151559677963, + "train_speed(iter/s)": 0.080996 + }, + { + "epoch": 1.7940513963232978, + "grad_norm": 0.5875689387321472, + "learning_rate": 3.8015869037669496e-06, + "loss": 0.23918418884277343, + "memory(GiB)": 72.48, + "step": 4630, + "token_acc": 0.9236037186856859, + "train_speed(iter/s)": 0.080993 + }, + { + "epoch": 1.7959890522440478, + "grad_norm": 0.5876352190971375, + "learning_rate": 3.7912196635370452e-06, + "loss": 0.2464871883392334, + "memory(GiB)": 72.48, + "step": 4635, + "token_acc": 0.9047433144641811, + "train_speed(iter/s)": 0.080991 + }, + { + "epoch": 1.7979267081647976, + "grad_norm": 0.5628306865692139, + "learning_rate": 3.7808579397337373e-06, + "loss": 0.23177266120910645, + "memory(GiB)": 72.48, + "step": 4640, + "token_acc": 0.9217531942422772, + "train_speed(iter/s)": 0.080996 + }, + { + "epoch": 1.7998643640855474, + "grad_norm": 0.5328852534294128, + "learning_rate": 3.7705017796441003e-06, + "loss": 0.2317207098007202, + "memory(GiB)": 72.48, + "step": 4645, + "token_acc": 0.9266968638705438, + "train_speed(iter/s)": 0.080997 + }, + { + "epoch": 1.8018020200062974, + "grad_norm": 0.5785929560661316, + "learning_rate": 3.760151230529821e-06, + "loss": 0.23354239463806153, + "memory(GiB)": 72.48, + "step": 4650, + "token_acc": 0.9157065070244578, + "train_speed(iter/s)": 0.081 + }, + { + "epoch": 1.8037396759270472, + "grad_norm": 0.5678642392158508, + "learning_rate": 3.74980633962698e-06, + "loss": 0.23528060913085938, + "memory(GiB)": 72.48, + "step": 4655, + "token_acc": 0.9224140938426653, + "train_speed(iter/s)": 0.080992 + }, + { + "epoch": 1.805677331847797, + "grad_norm": 0.5892844796180725, + "learning_rate": 3.7394671541458345e-06, + "loss": 0.24472272396087646, + "memory(GiB)": 72.48, + "step": 4660, + "token_acc": 0.9063246013304899, + "train_speed(iter/s)": 0.080987 + }, + { + "epoch": 1.807614987768547, + "grad_norm": 0.5627365112304688, + "learning_rate": 3.7291337212706057e-06, + "loss": 0.2330082893371582, + "memory(GiB)": 72.48, + "step": 4665, + "token_acc": 0.9164701813952026, + "train_speed(iter/s)": 0.080993 + }, + { + "epoch": 1.809552643689297, + "grad_norm": 0.5916730761528015, + "learning_rate": 3.7188060881592603e-06, + "loss": 0.22397446632385254, + "memory(GiB)": 72.48, + "step": 4670, + "token_acc": 0.9247956042501075, + "train_speed(iter/s)": 0.080995 + }, + { + "epoch": 1.8114902996100466, + "grad_norm": 0.5689864754676819, + "learning_rate": 3.708484301943298e-06, + "loss": 0.2298985242843628, + "memory(GiB)": 72.48, + "step": 4675, + "token_acc": 0.9312669969824535, + "train_speed(iter/s)": 0.081 + }, + { + "epoch": 1.8134279555307966, + "grad_norm": 0.5318856835365295, + "learning_rate": 3.6981684097275357e-06, + "loss": 0.23902740478515624, + "memory(GiB)": 72.48, + "step": 4680, + "token_acc": 0.9195728129738282, + "train_speed(iter/s)": 0.081 + }, + { + "epoch": 1.8153656114515466, + "grad_norm": 0.5639890432357788, + "learning_rate": 3.6878584585898913e-06, + "loss": 0.22230730056762696, + "memory(GiB)": 72.48, + "step": 4685, + "token_acc": 0.9186222446376123, + "train_speed(iter/s)": 0.080999 + }, + { + "epoch": 1.8173032673722964, + "grad_norm": 0.5812826752662659, + "learning_rate": 3.677554495581173e-06, + "loss": 0.233660888671875, + "memory(GiB)": 72.48, + "step": 4690, + "token_acc": 0.9195180279617365, + "train_speed(iter/s)": 0.080994 + }, + { + "epoch": 1.8192409232930462, + "grad_norm": 0.5809809565544128, + "learning_rate": 3.667256567724855e-06, + "loss": 0.23797879219055176, + "memory(GiB)": 72.48, + "step": 4695, + "token_acc": 0.9145623572554444, + "train_speed(iter/s)": 0.080996 + }, + { + "epoch": 1.8211785792137962, + "grad_norm": 0.5800526142120361, + "learning_rate": 3.656964722016875e-06, + "loss": 0.24539968967437745, + "memory(GiB)": 72.48, + "step": 4700, + "token_acc": 0.9213923231497565, + "train_speed(iter/s)": 0.080991 + }, + { + "epoch": 1.823116235134546, + "grad_norm": 0.5599920749664307, + "learning_rate": 3.646679005425412e-06, + "loss": 0.2380000591278076, + "memory(GiB)": 72.48, + "step": 4705, + "token_acc": 0.9146419233467253, + "train_speed(iter/s)": 0.080988 + }, + { + "epoch": 1.8250538910552958, + "grad_norm": 0.5796039700508118, + "learning_rate": 3.636399464890673e-06, + "loss": 0.22853050231933594, + "memory(GiB)": 72.48, + "step": 4710, + "token_acc": 0.9201017307438235, + "train_speed(iter/s)": 0.080978 + }, + { + "epoch": 1.8269915469760458, + "grad_norm": 0.5604573488235474, + "learning_rate": 3.626126147324682e-06, + "loss": 0.2464301347732544, + "memory(GiB)": 72.48, + "step": 4715, + "token_acc": 0.912274280136652, + "train_speed(iter/s)": 0.08097 + }, + { + "epoch": 1.8289292028967956, + "grad_norm": 0.6337606310844421, + "learning_rate": 3.615859099611063e-06, + "loss": 0.24546365737915038, + "memory(GiB)": 72.48, + "step": 4720, + "token_acc": 0.9116720267680213, + "train_speed(iter/s)": 0.080972 + }, + { + "epoch": 1.8308668588175454, + "grad_norm": 0.5712890625, + "learning_rate": 3.6055983686048267e-06, + "loss": 0.2261251449584961, + "memory(GiB)": 72.48, + "step": 4725, + "token_acc": 0.9267950729378777, + "train_speed(iter/s)": 0.080969 + }, + { + "epoch": 1.8328045147382954, + "grad_norm": 0.6142176389694214, + "learning_rate": 3.595344001132154e-06, + "loss": 0.22348663806915284, + "memory(GiB)": 72.48, + "step": 4730, + "token_acc": 0.9256462478458405, + "train_speed(iter/s)": 0.080966 + }, + { + "epoch": 1.8347421706590452, + "grad_norm": 0.5970928072929382, + "learning_rate": 3.5850960439901882e-06, + "loss": 0.23217971324920655, + "memory(GiB)": 72.48, + "step": 4735, + "token_acc": 0.9128682072117826, + "train_speed(iter/s)": 0.080963 + }, + { + "epoch": 1.836679826579795, + "grad_norm": 0.5526993870735168, + "learning_rate": 3.5748545439468175e-06, + "loss": 0.2441422939300537, + "memory(GiB)": 72.48, + "step": 4740, + "token_acc": 0.9143733759118374, + "train_speed(iter/s)": 0.080963 + }, + { + "epoch": 1.838617482500545, + "grad_norm": 0.5681501626968384, + "learning_rate": 3.5646195477404622e-06, + "loss": 0.24079837799072265, + "memory(GiB)": 72.48, + "step": 4745, + "token_acc": 0.9231079654808468, + "train_speed(iter/s)": 0.080967 + }, + { + "epoch": 1.8405551384212948, + "grad_norm": 0.6051904559135437, + "learning_rate": 3.5543911020798633e-06, + "loss": 0.22717626094818116, + "memory(GiB)": 72.48, + "step": 4750, + "token_acc": 0.9185553304634084, + "train_speed(iter/s)": 0.080961 + }, + { + "epoch": 1.8424927943420446, + "grad_norm": 0.5435159802436829, + "learning_rate": 3.544169253643861e-06, + "loss": 0.23873801231384278, + "memory(GiB)": 72.48, + "step": 4755, + "token_acc": 0.9283750440606274, + "train_speed(iter/s)": 0.080956 + }, + { + "epoch": 1.8444304502627946, + "grad_norm": 0.5636653900146484, + "learning_rate": 3.533954049081196e-06, + "loss": 0.23597636222839355, + "memory(GiB)": 72.48, + "step": 4760, + "token_acc": 0.916999383257052, + "train_speed(iter/s)": 0.080954 + }, + { + "epoch": 1.8463681061835444, + "grad_norm": 0.5626488327980042, + "learning_rate": 3.5237455350102846e-06, + "loss": 0.22850396633148193, + "memory(GiB)": 72.48, + "step": 4765, + "token_acc": 0.9220848468690639, + "train_speed(iter/s)": 0.080959 + }, + { + "epoch": 1.8483057621042942, + "grad_norm": 0.5658772587776184, + "learning_rate": 3.513543758019011e-06, + "loss": 0.23132677078247071, + "memory(GiB)": 72.48, + "step": 4770, + "token_acc": 0.915556797367359, + "train_speed(iter/s)": 0.08095 + }, + { + "epoch": 1.8502434180250442, + "grad_norm": 0.5570623278617859, + "learning_rate": 3.5033487646645137e-06, + "loss": 0.232391881942749, + "memory(GiB)": 72.48, + "step": 4775, + "token_acc": 0.916139132802396, + "train_speed(iter/s)": 0.080954 + }, + { + "epoch": 1.8521810739457942, + "grad_norm": 0.5710665583610535, + "learning_rate": 3.4931606014729747e-06, + "loss": 0.2462557315826416, + "memory(GiB)": 72.48, + "step": 4780, + "token_acc": 0.9168437752566827, + "train_speed(iter/s)": 0.080954 + }, + { + "epoch": 1.8541187298665438, + "grad_norm": 0.54609614610672, + "learning_rate": 3.482979314939404e-06, + "loss": 0.23276431560516359, + "memory(GiB)": 72.48, + "step": 4785, + "token_acc": 0.9173787210584344, + "train_speed(iter/s)": 0.080957 + }, + { + "epoch": 1.8560563857872938, + "grad_norm": 0.5884754657745361, + "learning_rate": 3.47280495152743e-06, + "loss": 0.23690245151519776, + "memory(GiB)": 72.48, + "step": 4790, + "token_acc": 0.9226331313000357, + "train_speed(iter/s)": 0.080962 + }, + { + "epoch": 1.8579940417080438, + "grad_norm": 0.5663356781005859, + "learning_rate": 3.462637557669084e-06, + "loss": 0.22185473442077636, + "memory(GiB)": 72.48, + "step": 4795, + "token_acc": 0.9220468365894683, + "train_speed(iter/s)": 0.080968 + }, + { + "epoch": 1.8599316976287934, + "grad_norm": 0.6133237481117249, + "learning_rate": 3.452477179764595e-06, + "loss": 0.23785548210144042, + "memory(GiB)": 72.48, + "step": 4800, + "token_acc": 0.920966865833786, + "train_speed(iter/s)": 0.08097 + }, + { + "epoch": 1.8618693535495434, + "grad_norm": 0.6006550192832947, + "learning_rate": 3.4423238641821703e-06, + "loss": 0.2466524600982666, + "memory(GiB)": 72.48, + "step": 4805, + "token_acc": 0.9154687183640413, + "train_speed(iter/s)": 0.080969 + }, + { + "epoch": 1.8638070094702934, + "grad_norm": 0.5626029372215271, + "learning_rate": 3.4321776572577925e-06, + "loss": 0.23381493091583253, + "memory(GiB)": 72.48, + "step": 4810, + "token_acc": 0.916163872129071, + "train_speed(iter/s)": 0.080962 + }, + { + "epoch": 1.8657446653910432, + "grad_norm": 0.5765612721443176, + "learning_rate": 3.4220386052949934e-06, + "loss": 0.22932858467102052, + "memory(GiB)": 72.48, + "step": 4815, + "token_acc": 0.9275064822817631, + "train_speed(iter/s)": 0.080966 + }, + { + "epoch": 1.867682321311793, + "grad_norm": 0.6077262759208679, + "learning_rate": 3.411906754564662e-06, + "loss": 0.2330641746520996, + "memory(GiB)": 72.48, + "step": 4820, + "token_acc": 0.9138856860484269, + "train_speed(iter/s)": 0.080965 + }, + { + "epoch": 1.869619977232543, + "grad_norm": 0.5446533560752869, + "learning_rate": 3.4017821513048166e-06, + "loss": 0.2277933120727539, + "memory(GiB)": 72.48, + "step": 4825, + "token_acc": 0.9137432806139888, + "train_speed(iter/s)": 0.080964 + }, + { + "epoch": 1.8715576331532928, + "grad_norm": 0.5742576122283936, + "learning_rate": 3.3916648417204057e-06, + "loss": 0.22510766983032227, + "memory(GiB)": 72.48, + "step": 4830, + "token_acc": 0.9291481701562445, + "train_speed(iter/s)": 0.080958 + }, + { + "epoch": 1.8734952890740426, + "grad_norm": 0.5845666527748108, + "learning_rate": 3.38155487198309e-06, + "loss": 0.23452870845794677, + "memory(GiB)": 72.48, + "step": 4835, + "token_acc": 0.9201796933493929, + "train_speed(iter/s)": 0.080953 + }, + { + "epoch": 1.8754329449947926, + "grad_norm": 0.5497782826423645, + "learning_rate": 3.371452288231033e-06, + "loss": 0.23567650318145753, + "memory(GiB)": 72.48, + "step": 4840, + "token_acc": 0.9179969055436055, + "train_speed(iter/s)": 0.080955 + }, + { + "epoch": 1.8773706009155424, + "grad_norm": 0.6008865237236023, + "learning_rate": 3.3613571365686937e-06, + "loss": 0.24633493423461914, + "memory(GiB)": 72.48, + "step": 4845, + "token_acc": 0.9117782152230971, + "train_speed(iter/s)": 0.080952 + }, + { + "epoch": 1.8793082568362922, + "grad_norm": 0.5858570337295532, + "learning_rate": 3.351269463066611e-06, + "loss": 0.22995858192443847, + "memory(GiB)": 72.48, + "step": 4850, + "token_acc": 0.9209995343783952, + "train_speed(iter/s)": 0.080954 + }, + { + "epoch": 1.8812459127570422, + "grad_norm": 0.5978894233703613, + "learning_rate": 3.341189313761197e-06, + "loss": 0.23932342529296874, + "memory(GiB)": 72.48, + "step": 4855, + "token_acc": 0.9203655352480418, + "train_speed(iter/s)": 0.080953 + }, + { + "epoch": 1.883183568677792, + "grad_norm": 0.563109278678894, + "learning_rate": 3.331116734654529e-06, + "loss": 0.22672557830810547, + "memory(GiB)": 72.48, + "step": 4860, + "token_acc": 0.9299508482638338, + "train_speed(iter/s)": 0.080951 + }, + { + "epoch": 1.8851212245985418, + "grad_norm": 0.6089335083961487, + "learning_rate": 3.321051771714132e-06, + "loss": 0.24452657699584962, + "memory(GiB)": 72.48, + "step": 4865, + "token_acc": 0.9190753378830456, + "train_speed(iter/s)": 0.080955 + }, + { + "epoch": 1.8870588805192918, + "grad_norm": 0.5918197631835938, + "learning_rate": 3.310994470872779e-06, + "loss": 0.22724080085754395, + "memory(GiB)": 72.48, + "step": 4870, + "token_acc": 0.9237190302815025, + "train_speed(iter/s)": 0.080954 + }, + { + "epoch": 1.8889965364400416, + "grad_norm": 0.5685690641403198, + "learning_rate": 3.3009448780282705e-06, + "loss": 0.22789173126220702, + "memory(GiB)": 72.48, + "step": 4875, + "token_acc": 0.9148283738540004, + "train_speed(iter/s)": 0.080949 + }, + { + "epoch": 1.8909341923607914, + "grad_norm": 0.5765368342399597, + "learning_rate": 3.290903039043234e-06, + "loss": 0.23500416278839112, + "memory(GiB)": 72.48, + "step": 4880, + "token_acc": 0.9254320820955981, + "train_speed(iter/s)": 0.080953 + }, + { + "epoch": 1.8928718482815414, + "grad_norm": 0.5873888731002808, + "learning_rate": 3.2808689997449097e-06, + "loss": 0.22656865119934083, + "memory(GiB)": 72.48, + "step": 4885, + "token_acc": 0.9189216209010931, + "train_speed(iter/s)": 0.080956 + }, + { + "epoch": 1.8948095042022914, + "grad_norm": 0.6020394563674927, + "learning_rate": 3.2708428059249437e-06, + "loss": 0.2382340431213379, + "memory(GiB)": 72.48, + "step": 4890, + "token_acc": 0.9176993854615828, + "train_speed(iter/s)": 0.080958 + }, + { + "epoch": 1.896747160123041, + "grad_norm": 0.5413195490837097, + "learning_rate": 3.2608245033391785e-06, + "loss": 0.2260499954223633, + "memory(GiB)": 72.48, + "step": 4895, + "token_acc": 0.9201593520561024, + "train_speed(iter/s)": 0.080955 + }, + { + "epoch": 1.898684816043791, + "grad_norm": 0.5700716972351074, + "learning_rate": 3.250814137707444e-06, + "loss": 0.23337714672088622, + "memory(GiB)": 72.48, + "step": 4900, + "token_acc": 0.9231728610626091, + "train_speed(iter/s)": 0.080956 + }, + { + "epoch": 1.900622471964541, + "grad_norm": 0.5829481482505798, + "learning_rate": 3.2408117547133483e-06, + "loss": 0.23229174613952636, + "memory(GiB)": 72.48, + "step": 4905, + "token_acc": 0.9205786313682806, + "train_speed(iter/s)": 0.080962 + }, + { + "epoch": 1.9025601278852906, + "grad_norm": 0.5741070508956909, + "learning_rate": 3.2308174000040684e-06, + "loss": 0.22675461769104005, + "memory(GiB)": 72.48, + "step": 4910, + "token_acc": 0.9256168249825498, + "train_speed(iter/s)": 0.080961 + }, + { + "epoch": 1.9044977838060406, + "grad_norm": 0.5426390767097473, + "learning_rate": 3.2208311191901454e-06, + "loss": 0.24353694915771484, + "memory(GiB)": 72.48, + "step": 4915, + "token_acc": 0.932967065778866, + "train_speed(iter/s)": 0.080959 + }, + { + "epoch": 1.9064354397267906, + "grad_norm": 0.5824358463287354, + "learning_rate": 3.210852957845274e-06, + "loss": 0.2382187843322754, + "memory(GiB)": 72.48, + "step": 4920, + "token_acc": 0.9137491742946117, + "train_speed(iter/s)": 0.080953 + }, + { + "epoch": 1.9083730956475404, + "grad_norm": 0.5535913109779358, + "learning_rate": 3.200882961506092e-06, + "loss": 0.22503528594970704, + "memory(GiB)": 72.48, + "step": 4925, + "token_acc": 0.9320696191625022, + "train_speed(iter/s)": 0.080951 + }, + { + "epoch": 1.9103107515682902, + "grad_norm": 0.5807521939277649, + "learning_rate": 3.1909211756719793e-06, + "loss": 0.23938980102539062, + "memory(GiB)": 72.48, + "step": 4930, + "token_acc": 0.9238847673186241, + "train_speed(iter/s)": 0.080945 + }, + { + "epoch": 1.9122484074890402, + "grad_norm": 0.5811308026313782, + "learning_rate": 3.1809676458048435e-06, + "loss": 0.24059734344482422, + "memory(GiB)": 72.48, + "step": 4935, + "token_acc": 0.9185429646536303, + "train_speed(iter/s)": 0.080934 + }, + { + "epoch": 1.91418606340979, + "grad_norm": 0.5475001931190491, + "learning_rate": 3.171022417328913e-06, + "loss": 0.22782738208770753, + "memory(GiB)": 72.48, + "step": 4940, + "token_acc": 0.9167722277380259, + "train_speed(iter/s)": 0.080941 + }, + { + "epoch": 1.9161237193305398, + "grad_norm": 0.5681920051574707, + "learning_rate": 3.1610855356305354e-06, + "loss": 0.23460836410522462, + "memory(GiB)": 72.48, + "step": 4945, + "token_acc": 0.926126769710352, + "train_speed(iter/s)": 0.080935 + }, + { + "epoch": 1.9180613752512898, + "grad_norm": 0.65660160779953, + "learning_rate": 3.151157046057965e-06, + "loss": 0.23111975193023682, + "memory(GiB)": 72.48, + "step": 4950, + "token_acc": 0.934885964307862, + "train_speed(iter/s)": 0.080932 + }, + { + "epoch": 1.9199990311720396, + "grad_norm": 0.5645193457603455, + "learning_rate": 3.141236993921158e-06, + "loss": 0.23473634719848632, + "memory(GiB)": 72.48, + "step": 4955, + "token_acc": 0.9231619679380874, + "train_speed(iter/s)": 0.080929 + }, + { + "epoch": 1.9219366870927894, + "grad_norm": 0.6071648001670837, + "learning_rate": 3.1313254244915653e-06, + "loss": 0.22611503601074218, + "memory(GiB)": 72.48, + "step": 4960, + "token_acc": 0.9172273912623548, + "train_speed(iter/s)": 0.080931 + }, + { + "epoch": 1.9238743430135394, + "grad_norm": 0.59253990650177, + "learning_rate": 3.121422383001927e-06, + "loss": 0.22836060523986818, + "memory(GiB)": 72.48, + "step": 4965, + "token_acc": 0.924332395735667, + "train_speed(iter/s)": 0.08092 + }, + { + "epoch": 1.9258119989342892, + "grad_norm": 0.5572670102119446, + "learning_rate": 3.111527914646061e-06, + "loss": 0.2278214931488037, + "memory(GiB)": 72.48, + "step": 4970, + "token_acc": 0.9200324466826647, + "train_speed(iter/s)": 0.080918 + }, + { + "epoch": 1.927749654855039, + "grad_norm": 0.5666624903678894, + "learning_rate": 3.101642064578664e-06, + "loss": 0.21897401809692382, + "memory(GiB)": 72.48, + "step": 4975, + "token_acc": 0.9254069046055032, + "train_speed(iter/s)": 0.080921 + }, + { + "epoch": 1.929687310775789, + "grad_norm": 0.5414124131202698, + "learning_rate": 3.091764877915101e-06, + "loss": 0.21500952243804933, + "memory(GiB)": 72.48, + "step": 4980, + "token_acc": 0.9259556000667668, + "train_speed(iter/s)": 0.080926 + }, + { + "epoch": 1.9316249666965388, + "grad_norm": 0.6081916689872742, + "learning_rate": 3.081896399731202e-06, + "loss": 0.23827474117279052, + "memory(GiB)": 72.48, + "step": 4985, + "token_acc": 0.9237981946679729, + "train_speed(iter/s)": 0.080928 + }, + { + "epoch": 1.9335626226172886, + "grad_norm": 0.5481286644935608, + "learning_rate": 3.0720366750630524e-06, + "loss": 0.2238980531692505, + "memory(GiB)": 72.48, + "step": 4990, + "token_acc": 0.9258295380611581, + "train_speed(iter/s)": 0.080932 + }, + { + "epoch": 1.9355002785380386, + "grad_norm": 0.5673938393592834, + "learning_rate": 3.0621857489067908e-06, + "loss": 0.23099775314331056, + "memory(GiB)": 72.48, + "step": 4995, + "token_acc": 0.9236520674207999, + "train_speed(iter/s)": 0.080928 + }, + { + "epoch": 1.9374379344587886, + "grad_norm": 0.5904103517532349, + "learning_rate": 3.0523436662184013e-06, + "loss": 0.2294713020324707, + "memory(GiB)": 72.48, + "step": 5000, + "token_acc": 0.9182310696736802, + "train_speed(iter/s)": 0.080935 + }, + { + "epoch": 1.9393755903795382, + "grad_norm": 0.5377696752548218, + "learning_rate": 3.0425104719135124e-06, + "loss": 0.23087844848632813, + "memory(GiB)": 72.48, + "step": 5005, + "token_acc": 0.9300810424280948, + "train_speed(iter/s)": 0.080933 + }, + { + "epoch": 1.9413132463002882, + "grad_norm": 0.6162227392196655, + "learning_rate": 3.0326862108671863e-06, + "loss": 0.23395605087280275, + "memory(GiB)": 72.48, + "step": 5010, + "token_acc": 0.9195617918511293, + "train_speed(iter/s)": 0.080937 + }, + { + "epoch": 1.9432509022210382, + "grad_norm": 0.5727110505104065, + "learning_rate": 3.022870927913719e-06, + "loss": 0.22884960174560548, + "memory(GiB)": 72.48, + "step": 5015, + "token_acc": 0.9202504168370718, + "train_speed(iter/s)": 0.080936 + }, + { + "epoch": 1.9451885581417878, + "grad_norm": 0.5721259117126465, + "learning_rate": 3.0130646678464344e-06, + "loss": 0.23694920539855957, + "memory(GiB)": 72.48, + "step": 5020, + "token_acc": 0.9184976587314582, + "train_speed(iter/s)": 0.080935 + }, + { + "epoch": 1.9471262140625378, + "grad_norm": 0.5581057071685791, + "learning_rate": 3.0032674754174775e-06, + "loss": 0.23090701103210448, + "memory(GiB)": 72.48, + "step": 5025, + "token_acc": 0.9220899962811454, + "train_speed(iter/s)": 0.080932 + }, + { + "epoch": 1.9490638699832878, + "grad_norm": 0.5250717401504517, + "learning_rate": 2.9934793953376135e-06, + "loss": 0.22847251892089843, + "memory(GiB)": 72.48, + "step": 5030, + "token_acc": 0.9218730397691632, + "train_speed(iter/s)": 0.080939 + }, + { + "epoch": 1.9510015259040376, + "grad_norm": 0.6175058484077454, + "learning_rate": 2.98370047227602e-06, + "loss": 0.23140432834625244, + "memory(GiB)": 72.48, + "step": 5035, + "token_acc": 0.9123302326963654, + "train_speed(iter/s)": 0.080938 + }, + { + "epoch": 1.9529391818247874, + "grad_norm": 0.6077125072479248, + "learning_rate": 2.973930750860088e-06, + "loss": 0.23387997150421141, + "memory(GiB)": 72.48, + "step": 5040, + "token_acc": 0.9297987071708477, + "train_speed(iter/s)": 0.080942 + }, + { + "epoch": 1.9548768377455374, + "grad_norm": 0.5392444133758545, + "learning_rate": 2.9641702756752134e-06, + "loss": 0.22093567848205567, + "memory(GiB)": 72.48, + "step": 5045, + "token_acc": 0.9346273904775573, + "train_speed(iter/s)": 0.080939 + }, + { + "epoch": 1.9568144936662872, + "grad_norm": 0.6002140045166016, + "learning_rate": 2.9544190912645978e-06, + "loss": 0.23936209678649903, + "memory(GiB)": 72.48, + "step": 5050, + "token_acc": 0.9177467430255535, + "train_speed(iter/s)": 0.080937 + }, + { + "epoch": 1.958752149587037, + "grad_norm": 0.5801167488098145, + "learning_rate": 2.9446772421290427e-06, + "loss": 0.2447594165802002, + "memory(GiB)": 72.48, + "step": 5055, + "token_acc": 0.9178470254957507, + "train_speed(iter/s)": 0.080939 + }, + { + "epoch": 1.960689805507787, + "grad_norm": 0.5460479259490967, + "learning_rate": 2.9349447727267444e-06, + "loss": 0.22198958396911622, + "memory(GiB)": 72.48, + "step": 5060, + "token_acc": 0.9201584184777183, + "train_speed(iter/s)": 0.080946 + }, + { + "epoch": 1.9626274614285368, + "grad_norm": 0.5960065126419067, + "learning_rate": 2.9252217274730964e-06, + "loss": 0.23214805126190186, + "memory(GiB)": 72.48, + "step": 5065, + "token_acc": 0.918548799182422, + "train_speed(iter/s)": 0.080943 + }, + { + "epoch": 1.9645651173492866, + "grad_norm": 0.6021069884300232, + "learning_rate": 2.9155081507404813e-06, + "loss": 0.22706222534179688, + "memory(GiB)": 72.48, + "step": 5070, + "token_acc": 0.9270862900936702, + "train_speed(iter/s)": 0.080948 + }, + { + "epoch": 1.9665027732700366, + "grad_norm": 0.6196221113204956, + "learning_rate": 2.905804086858074e-06, + "loss": 0.23130836486816406, + "memory(GiB)": 72.48, + "step": 5075, + "token_acc": 0.9091763126992115, + "train_speed(iter/s)": 0.080945 + }, + { + "epoch": 1.9684404291907864, + "grad_norm": 0.5536569356918335, + "learning_rate": 2.896109580111634e-06, + "loss": 0.2225879430770874, + "memory(GiB)": 72.48, + "step": 5080, + "token_acc": 0.9220095988186069, + "train_speed(iter/s)": 0.080946 + }, + { + "epoch": 1.9703780851115362, + "grad_norm": 0.5561604499816895, + "learning_rate": 2.8864246747433065e-06, + "loss": 0.2250436782836914, + "memory(GiB)": 72.48, + "step": 5085, + "token_acc": 0.9154984677577102, + "train_speed(iter/s)": 0.080947 + }, + { + "epoch": 1.9723157410322862, + "grad_norm": 0.5968231558799744, + "learning_rate": 2.87674941495142e-06, + "loss": 0.23562178611755372, + "memory(GiB)": 72.48, + "step": 5090, + "token_acc": 0.9204119545024932, + "train_speed(iter/s)": 0.080949 + }, + { + "epoch": 1.974253396953036, + "grad_norm": 0.6080553531646729, + "learning_rate": 2.8670838448902815e-06, + "loss": 0.23601851463317872, + "memory(GiB)": 72.48, + "step": 5095, + "token_acc": 0.9168413922131544, + "train_speed(iter/s)": 0.08095 + }, + { + "epoch": 1.9761910528737858, + "grad_norm": 0.5545769333839417, + "learning_rate": 2.857428008669983e-06, + "loss": 0.23675503730773925, + "memory(GiB)": 72.48, + "step": 5100, + "token_acc": 0.9197643979057591, + "train_speed(iter/s)": 0.08095 + }, + { + "epoch": 1.9781287087945358, + "grad_norm": 0.5824362635612488, + "learning_rate": 2.8477819503561876e-06, + "loss": 0.2293771266937256, + "memory(GiB)": 72.48, + "step": 5105, + "token_acc": 0.9200192184497117, + "train_speed(iter/s)": 0.080951 + }, + { + "epoch": 1.9800663647152856, + "grad_norm": 0.5837356448173523, + "learning_rate": 2.838145713969941e-06, + "loss": 0.22520647048950196, + "memory(GiB)": 72.48, + "step": 5110, + "token_acc": 0.91966621411305, + "train_speed(iter/s)": 0.080958 + }, + { + "epoch": 1.9820040206360354, + "grad_norm": 0.578384280204773, + "learning_rate": 2.8285193434874637e-06, + "loss": 0.23027491569519043, + "memory(GiB)": 72.48, + "step": 5115, + "token_acc": 0.9200089985859365, + "train_speed(iter/s)": 0.08095 + }, + { + "epoch": 1.9839416765567854, + "grad_norm": 0.5708916187286377, + "learning_rate": 2.8189028828399546e-06, + "loss": 0.23889336585998536, + "memory(GiB)": 72.48, + "step": 5120, + "token_acc": 0.9176338228327807, + "train_speed(iter/s)": 0.080953 + }, + { + "epoch": 1.9858793324775355, + "grad_norm": 0.5448703765869141, + "learning_rate": 2.8092963759133806e-06, + "loss": 0.2348088264465332, + "memory(GiB)": 72.48, + "step": 5125, + "token_acc": 0.9210328730454086, + "train_speed(iter/s)": 0.080956 + }, + { + "epoch": 1.987816988398285, + "grad_norm": 0.5802229046821594, + "learning_rate": 2.79969986654829e-06, + "loss": 0.2369297981262207, + "memory(GiB)": 72.48, + "step": 5130, + "token_acc": 0.9280055353745027, + "train_speed(iter/s)": 0.080951 + }, + { + "epoch": 1.989754644319035, + "grad_norm": 0.5599392652511597, + "learning_rate": 2.7901133985396035e-06, + "loss": 0.2221320629119873, + "memory(GiB)": 72.48, + "step": 5135, + "token_acc": 0.9295444199320657, + "train_speed(iter/s)": 0.080959 + }, + { + "epoch": 1.991692300239785, + "grad_norm": 0.5506635308265686, + "learning_rate": 2.7805370156364182e-06, + "loss": 0.2278140068054199, + "memory(GiB)": 72.48, + "step": 5140, + "token_acc": 0.9242746872504658, + "train_speed(iter/s)": 0.080964 + }, + { + "epoch": 1.9936299561605348, + "grad_norm": 0.5609742403030396, + "learning_rate": 2.7709707615418046e-06, + "loss": 0.21695501804351808, + "memory(GiB)": 72.48, + "step": 5145, + "token_acc": 0.9342338352524358, + "train_speed(iter/s)": 0.080959 + }, + { + "epoch": 1.9955676120812846, + "grad_norm": 0.6262750625610352, + "learning_rate": 2.7614146799126106e-06, + "loss": 0.23981974124908448, + "memory(GiB)": 72.48, + "step": 5150, + "token_acc": 0.9218865924748277, + "train_speed(iter/s)": 0.080963 + }, + { + "epoch": 1.9975052680020347, + "grad_norm": 0.5987786054611206, + "learning_rate": 2.7518688143592593e-06, + "loss": 0.23971712589263916, + "memory(GiB)": 72.48, + "step": 5155, + "token_acc": 0.9039727507942227, + "train_speed(iter/s)": 0.080966 + }, + { + "epoch": 1.9994429239227844, + "grad_norm": 0.6018982529640198, + "learning_rate": 2.7423332084455543e-06, + "loss": 0.23073389530181884, + "memory(GiB)": 72.48, + "step": 5160, + "token_acc": 0.9174361075221379, + "train_speed(iter/s)": 0.080969 + }, + { + "epoch": 2.0, + "eval_loss": 0.24373722076416016, + "eval_runtime": 104.8784, + "eval_samples_per_second": 31.789, + "eval_steps_per_second": 3.976, + "eval_token_acc": 0.9073567439055579, + "step": 5162 + }, + { + "epoch": 2.0011625935524497, + "grad_norm": 0.5473401546478271, + "learning_rate": 2.7328079056884727e-06, + "loss": 0.20911026000976562, + "memory(GiB)": 72.48, + "step": 5165, + "token_acc": 0.9121585000026832, + "train_speed(iter/s)": 0.080753 + }, + { + "epoch": 2.0031002494731998, + "grad_norm": 0.5424655675888062, + "learning_rate": 2.723292949557976e-06, + "loss": 0.17972030639648437, + "memory(GiB)": 72.48, + "step": 5170, + "token_acc": 0.9384443200895355, + "train_speed(iter/s)": 0.080747 + }, + { + "epoch": 2.0050379053939498, + "grad_norm": 0.5732024908065796, + "learning_rate": 2.7137883834768076e-06, + "loss": 0.19022332429885863, + "memory(GiB)": 72.48, + "step": 5175, + "token_acc": 0.9398598319451069, + "train_speed(iter/s)": 0.080743 + }, + { + "epoch": 2.0069755613146993, + "grad_norm": 0.5672215223312378, + "learning_rate": 2.704294250820293e-06, + "loss": 0.1836836814880371, + "memory(GiB)": 72.48, + "step": 5180, + "token_acc": 0.945720689172153, + "train_speed(iter/s)": 0.080735 + }, + { + "epoch": 2.0089132172354494, + "grad_norm": 0.5584644675254822, + "learning_rate": 2.6948105949161446e-06, + "loss": 0.1804587125778198, + "memory(GiB)": 72.48, + "step": 5185, + "token_acc": 0.9337713186011016, + "train_speed(iter/s)": 0.080734 + }, + { + "epoch": 2.0108508731561994, + "grad_norm": 0.5705211162567139, + "learning_rate": 2.68533745904426e-06, + "loss": 0.1826365113258362, + "memory(GiB)": 72.48, + "step": 5190, + "token_acc": 0.937906090964376, + "train_speed(iter/s)": 0.080734 + }, + { + "epoch": 2.012788529076949, + "grad_norm": 0.543353796005249, + "learning_rate": 2.67587488643653e-06, + "loss": 0.18609116077423096, + "memory(GiB)": 72.48, + "step": 5195, + "token_acc": 0.9352631412985211, + "train_speed(iter/s)": 0.080738 + }, + { + "epoch": 2.014726184997699, + "grad_norm": 0.5423488616943359, + "learning_rate": 2.6664229202766377e-06, + "loss": 0.18657855987548827, + "memory(GiB)": 72.48, + "step": 5200, + "token_acc": 0.9393792727932724, + "train_speed(iter/s)": 0.080735 + }, + { + "epoch": 2.016663840918449, + "grad_norm": 0.5571469068527222, + "learning_rate": 2.656981603699864e-06, + "loss": 0.18082787990570068, + "memory(GiB)": 72.48, + "step": 5205, + "token_acc": 0.9425005176136532, + "train_speed(iter/s)": 0.080729 + }, + { + "epoch": 2.018601496839199, + "grad_norm": 0.5621320009231567, + "learning_rate": 2.647550979792887e-06, + "loss": 0.18856477737426758, + "memory(GiB)": 72.48, + "step": 5210, + "token_acc": 0.9337371210375588, + "train_speed(iter/s)": 0.080733 + }, + { + "epoch": 2.0205391527599486, + "grad_norm": 0.5593150854110718, + "learning_rate": 2.6381310915935863e-06, + "loss": 0.17925558090209961, + "memory(GiB)": 72.48, + "step": 5215, + "token_acc": 0.9430511481898523, + "train_speed(iter/s)": 0.080731 + }, + { + "epoch": 2.0224768086806986, + "grad_norm": 0.6031186580657959, + "learning_rate": 2.6287219820908505e-06, + "loss": 0.18746013641357423, + "memory(GiB)": 72.48, + "step": 5220, + "token_acc": 0.9389572271872836, + "train_speed(iter/s)": 0.080731 + }, + { + "epoch": 2.0244144646014486, + "grad_norm": 0.5469415783882141, + "learning_rate": 2.6193236942243793e-06, + "loss": 0.18313326835632324, + "memory(GiB)": 72.48, + "step": 5225, + "token_acc": 0.9326484751203852, + "train_speed(iter/s)": 0.080736 + }, + { + "epoch": 2.026352120522198, + "grad_norm": 0.5326108336448669, + "learning_rate": 2.60993627088448e-06, + "loss": 0.1851900815963745, + "memory(GiB)": 72.48, + "step": 5230, + "token_acc": 0.9346198750085846, + "train_speed(iter/s)": 0.080736 + }, + { + "epoch": 2.028289776442948, + "grad_norm": 0.5510690808296204, + "learning_rate": 2.6005597549118844e-06, + "loss": 0.18629932403564453, + "memory(GiB)": 72.48, + "step": 5235, + "token_acc": 0.9409745831309697, + "train_speed(iter/s)": 0.080734 + }, + { + "epoch": 2.030227432363698, + "grad_norm": 0.5134409666061401, + "learning_rate": 2.5911941890975446e-06, + "loss": 0.1788806438446045, + "memory(GiB)": 72.48, + "step": 5240, + "token_acc": 0.9349969751966122, + "train_speed(iter/s)": 0.080737 + }, + { + "epoch": 2.0321650882844478, + "grad_norm": 0.5783552527427673, + "learning_rate": 2.5818396161824434e-06, + "loss": 0.18259315490722655, + "memory(GiB)": 72.48, + "step": 5245, + "token_acc": 0.9349454931343859, + "train_speed(iter/s)": 0.080742 + }, + { + "epoch": 2.0341027442051978, + "grad_norm": 0.5441116690635681, + "learning_rate": 2.5724960788573887e-06, + "loss": 0.1771859884262085, + "memory(GiB)": 72.48, + "step": 5250, + "token_acc": 0.9366488850601372, + "train_speed(iter/s)": 0.080746 + }, + { + "epoch": 2.036040400125948, + "grad_norm": 0.548576295375824, + "learning_rate": 2.563163619762833e-06, + "loss": 0.17985576391220093, + "memory(GiB)": 72.48, + "step": 5255, + "token_acc": 0.9376425855513308, + "train_speed(iter/s)": 0.080746 + }, + { + "epoch": 2.0379780560466974, + "grad_norm": 0.528372585773468, + "learning_rate": 2.55384228148867e-06, + "loss": 0.1838611841201782, + "memory(GiB)": 72.48, + "step": 5260, + "token_acc": 0.9355038872869292, + "train_speed(iter/s)": 0.080748 + }, + { + "epoch": 2.0399157119674474, + "grad_norm": 0.5744831562042236, + "learning_rate": 2.544532106574041e-06, + "loss": 0.18163129091262817, + "memory(GiB)": 72.48, + "step": 5265, + "token_acc": 0.9287501840129545, + "train_speed(iter/s)": 0.08075 + }, + { + "epoch": 2.0418533678881974, + "grad_norm": 0.5563311576843262, + "learning_rate": 2.5352331375071437e-06, + "loss": 0.1764971375465393, + "memory(GiB)": 72.48, + "step": 5270, + "token_acc": 0.9388310203072358, + "train_speed(iter/s)": 0.080749 + }, + { + "epoch": 2.043791023808947, + "grad_norm": 0.5354945659637451, + "learning_rate": 2.525945416725034e-06, + "loss": 0.17477788925170898, + "memory(GiB)": 72.48, + "step": 5275, + "token_acc": 0.9430880951598594, + "train_speed(iter/s)": 0.080752 + }, + { + "epoch": 2.045728679729697, + "grad_norm": 0.5532642602920532, + "learning_rate": 2.516668986613437e-06, + "loss": 0.1873680591583252, + "memory(GiB)": 72.48, + "step": 5280, + "token_acc": 0.9383436207013498, + "train_speed(iter/s)": 0.080751 + }, + { + "epoch": 2.047666335650447, + "grad_norm": 0.5849664807319641, + "learning_rate": 2.507403889506551e-06, + "loss": 0.1897585391998291, + "memory(GiB)": 72.48, + "step": 5285, + "token_acc": 0.9374327148375657, + "train_speed(iter/s)": 0.080757 + }, + { + "epoch": 2.0496039915711965, + "grad_norm": 0.5856564044952393, + "learning_rate": 2.4981501676868525e-06, + "loss": 0.18512728214263915, + "memory(GiB)": 72.48, + "step": 5290, + "token_acc": 0.9345540861308919, + "train_speed(iter/s)": 0.080749 + }, + { + "epoch": 2.0515416474919466, + "grad_norm": 0.5460135340690613, + "learning_rate": 2.488907863384907e-06, + "loss": 0.1801106095314026, + "memory(GiB)": 72.48, + "step": 5295, + "token_acc": 0.9387762374826272, + "train_speed(iter/s)": 0.08075 + }, + { + "epoch": 2.0534793034126966, + "grad_norm": 0.5635910034179688, + "learning_rate": 2.4796770187791746e-06, + "loss": 0.1819519281387329, + "memory(GiB)": 72.48, + "step": 5300, + "token_acc": 0.9433834024332223, + "train_speed(iter/s)": 0.080754 + }, + { + "epoch": 2.055416959333446, + "grad_norm": 0.5485233664512634, + "learning_rate": 2.470457675995821e-06, + "loss": 0.19221407175064087, + "memory(GiB)": 72.48, + "step": 5305, + "token_acc": 0.9369553014860114, + "train_speed(iter/s)": 0.080752 + }, + { + "epoch": 2.057354615254196, + "grad_norm": 0.5501575469970703, + "learning_rate": 2.461249877108513e-06, + "loss": 0.1826251745223999, + "memory(GiB)": 72.48, + "step": 5310, + "token_acc": 0.9387762324379799, + "train_speed(iter/s)": 0.08075 + }, + { + "epoch": 2.059292271174946, + "grad_norm": 0.5151733756065369, + "learning_rate": 2.452053664138244e-06, + "loss": 0.17927682399749756, + "memory(GiB)": 72.48, + "step": 5315, + "token_acc": 0.9408301346691822, + "train_speed(iter/s)": 0.080751 + }, + { + "epoch": 2.061229927095696, + "grad_norm": 0.5707798600196838, + "learning_rate": 2.4428690790531303e-06, + "loss": 0.17755191326141356, + "memory(GiB)": 72.48, + "step": 5320, + "token_acc": 0.9346226935572856, + "train_speed(iter/s)": 0.080755 + }, + { + "epoch": 2.0631675830164458, + "grad_norm": 0.5683720111846924, + "learning_rate": 2.4336961637682214e-06, + "loss": 0.18373007774353028, + "memory(GiB)": 72.48, + "step": 5325, + "token_acc": 0.9251355606087108, + "train_speed(iter/s)": 0.080753 + }, + { + "epoch": 2.0651052389371958, + "grad_norm": 0.564379096031189, + "learning_rate": 2.424534960145314e-06, + "loss": 0.18030145168304443, + "memory(GiB)": 72.48, + "step": 5330, + "token_acc": 0.932678254920862, + "train_speed(iter/s)": 0.080759 + }, + { + "epoch": 2.067042894857946, + "grad_norm": 0.5815133452415466, + "learning_rate": 2.415385509992752e-06, + "loss": 0.17914021015167236, + "memory(GiB)": 72.48, + "step": 5335, + "token_acc": 0.9349110494532398, + "train_speed(iter/s)": 0.080763 + }, + { + "epoch": 2.0689805507786954, + "grad_norm": 0.5522861480712891, + "learning_rate": 2.4062478550652457e-06, + "loss": 0.16629674434661865, + "memory(GiB)": 72.48, + "step": 5340, + "token_acc": 0.9421650160461024, + "train_speed(iter/s)": 0.080765 + }, + { + "epoch": 2.0709182066994454, + "grad_norm": 0.6132717728614807, + "learning_rate": 2.3971220370636754e-06, + "loss": 0.1797514796257019, + "memory(GiB)": 72.48, + "step": 5345, + "token_acc": 0.9354761661036612, + "train_speed(iter/s)": 0.080771 + }, + { + "epoch": 2.0728558626201954, + "grad_norm": 0.5450883507728577, + "learning_rate": 2.388008097634897e-06, + "loss": 0.17508809566497802, + "memory(GiB)": 72.48, + "step": 5350, + "token_acc": 0.9436914154696878, + "train_speed(iter/s)": 0.080771 + }, + { + "epoch": 2.074793518540945, + "grad_norm": 0.5589373707771301, + "learning_rate": 2.3789060783715625e-06, + "loss": 0.185202956199646, + "memory(GiB)": 72.48, + "step": 5355, + "token_acc": 0.9355623297444545, + "train_speed(iter/s)": 0.080772 + }, + { + "epoch": 2.076731174461695, + "grad_norm": 0.5466156005859375, + "learning_rate": 2.3698160208119233e-06, + "loss": 0.17637484073638915, + "memory(GiB)": 72.48, + "step": 5360, + "token_acc": 0.9371462897973379, + "train_speed(iter/s)": 0.080774 + }, + { + "epoch": 2.078668830382445, + "grad_norm": 0.572566568851471, + "learning_rate": 2.3607379664396414e-06, + "loss": 0.1739407777786255, + "memory(GiB)": 72.48, + "step": 5365, + "token_acc": 0.9440337552742616, + "train_speed(iter/s)": 0.080779 + }, + { + "epoch": 2.0806064863031946, + "grad_norm": 0.5492961406707764, + "learning_rate": 2.3516719566836032e-06, + "loss": 0.1829778552055359, + "memory(GiB)": 72.48, + "step": 5370, + "token_acc": 0.9381134293270643, + "train_speed(iter/s)": 0.080778 + }, + { + "epoch": 2.0825441422239446, + "grad_norm": 0.5619326233863831, + "learning_rate": 2.3426180329177217e-06, + "loss": 0.18022637367248534, + "memory(GiB)": 72.48, + "step": 5375, + "token_acc": 0.9355506145389414, + "train_speed(iter/s)": 0.08078 + }, + { + "epoch": 2.0844817981446946, + "grad_norm": 0.5690088868141174, + "learning_rate": 2.33357623646076e-06, + "loss": 0.17759082317352295, + "memory(GiB)": 72.48, + "step": 5380, + "token_acc": 0.9401349614395886, + "train_speed(iter/s)": 0.080782 + }, + { + "epoch": 2.086419454065444, + "grad_norm": 0.5527958273887634, + "learning_rate": 2.324546608576134e-06, + "loss": 0.18613047599792482, + "memory(GiB)": 72.48, + "step": 5385, + "token_acc": 0.9303143995579211, + "train_speed(iter/s)": 0.080788 + }, + { + "epoch": 2.088357109986194, + "grad_norm": 0.546678900718689, + "learning_rate": 2.3155291904717286e-06, + "loss": 0.17882769107818602, + "memory(GiB)": 72.48, + "step": 5390, + "token_acc": 0.9307346020053202, + "train_speed(iter/s)": 0.080798 + }, + { + "epoch": 2.090294765906944, + "grad_norm": 0.5436745882034302, + "learning_rate": 2.3065240232997055e-06, + "loss": 0.18277444839477539, + "memory(GiB)": 72.48, + "step": 5395, + "token_acc": 0.9328300921187308, + "train_speed(iter/s)": 0.080803 + }, + { + "epoch": 2.0922324218276938, + "grad_norm": 0.5892859697341919, + "learning_rate": 2.2975311481563186e-06, + "loss": 0.17380056381225586, + "memory(GiB)": 72.48, + "step": 5400, + "token_acc": 0.9400539580180298, + "train_speed(iter/s)": 0.08081 + }, + { + "epoch": 2.0941700777484438, + "grad_norm": 0.5891724228858948, + "learning_rate": 2.2885506060817274e-06, + "loss": 0.18320174217224122, + "memory(GiB)": 72.48, + "step": 5405, + "token_acc": 0.9279184917158636, + "train_speed(iter/s)": 0.080811 + }, + { + "epoch": 2.096107733669194, + "grad_norm": 0.5693268179893494, + "learning_rate": 2.2795824380598033e-06, + "loss": 0.18516907691955567, + "memory(GiB)": 72.48, + "step": 5410, + "token_acc": 0.9385831960461285, + "train_speed(iter/s)": 0.080815 + }, + { + "epoch": 2.0980453895899434, + "grad_norm": 0.5973000526428223, + "learning_rate": 2.2706266850179504e-06, + "loss": 0.18582874536514282, + "memory(GiB)": 72.48, + "step": 5415, + "token_acc": 0.9332009691748553, + "train_speed(iter/s)": 0.080808 + }, + { + "epoch": 2.0999830455106934, + "grad_norm": 0.552210807800293, + "learning_rate": 2.261683387826915e-06, + "loss": 0.17878878116607666, + "memory(GiB)": 72.48, + "step": 5420, + "token_acc": 0.9350939878825539, + "train_speed(iter/s)": 0.080809 + }, + { + "epoch": 2.1019207014314434, + "grad_norm": 0.5775700807571411, + "learning_rate": 2.2527525873006005e-06, + "loss": 0.18521888256073, + "memory(GiB)": 72.48, + "step": 5425, + "token_acc": 0.9341401817048771, + "train_speed(iter/s)": 0.08081 + }, + { + "epoch": 2.1038583573521934, + "grad_norm": 0.5570356845855713, + "learning_rate": 2.2438343241958793e-06, + "loss": 0.18898725509643555, + "memory(GiB)": 72.48, + "step": 5430, + "token_acc": 0.9432755241652183, + "train_speed(iter/s)": 0.08081 + }, + { + "epoch": 2.105796013272943, + "grad_norm": 0.5660508871078491, + "learning_rate": 2.2349286392124047e-06, + "loss": 0.18533942699432374, + "memory(GiB)": 72.48, + "step": 5435, + "token_acc": 0.9363217294547767, + "train_speed(iter/s)": 0.080809 + }, + { + "epoch": 2.107733669193693, + "grad_norm": 0.5466207265853882, + "learning_rate": 2.2260355729924323e-06, + "loss": 0.18058595657348633, + "memory(GiB)": 72.48, + "step": 5440, + "token_acc": 0.9440250535684852, + "train_speed(iter/s)": 0.080815 + }, + { + "epoch": 2.109671325114443, + "grad_norm": 0.555391788482666, + "learning_rate": 2.217155166120629e-06, + "loss": 0.17833871841430665, + "memory(GiB)": 72.48, + "step": 5445, + "token_acc": 0.9375606285972967, + "train_speed(iter/s)": 0.080813 + }, + { + "epoch": 2.1116089810351926, + "grad_norm": 0.571415901184082, + "learning_rate": 2.2082874591238875e-06, + "loss": 0.19260311126708984, + "memory(GiB)": 72.48, + "step": 5450, + "token_acc": 0.9347647782529732, + "train_speed(iter/s)": 0.08081 + }, + { + "epoch": 2.1135466369559426, + "grad_norm": 0.5604360103607178, + "learning_rate": 2.1994324924711458e-06, + "loss": 0.1859738349914551, + "memory(GiB)": 72.48, + "step": 5455, + "token_acc": 0.9434542469025228, + "train_speed(iter/s)": 0.080806 + }, + { + "epoch": 2.1154842928766926, + "grad_norm": 0.5471526384353638, + "learning_rate": 2.190590306573198e-06, + "loss": 0.1887765884399414, + "memory(GiB)": 72.48, + "step": 5460, + "token_acc": 0.9441244438312696, + "train_speed(iter/s)": 0.080807 + }, + { + "epoch": 2.117421948797442, + "grad_norm": 0.5706793069839478, + "learning_rate": 2.1817609417825124e-06, + "loss": 0.18401453495025635, + "memory(GiB)": 72.48, + "step": 5465, + "token_acc": 0.9318755256518082, + "train_speed(iter/s)": 0.080808 + }, + { + "epoch": 2.119359604718192, + "grad_norm": 0.5502917766571045, + "learning_rate": 2.172944438393044e-06, + "loss": 0.18263095617294312, + "memory(GiB)": 72.48, + "step": 5470, + "token_acc": 0.9409442551972239, + "train_speed(iter/s)": 0.080813 + }, + { + "epoch": 2.121297260638942, + "grad_norm": 0.5597527027130127, + "learning_rate": 2.164140836640056e-06, + "loss": 0.18310918807983398, + "memory(GiB)": 72.48, + "step": 5475, + "token_acc": 0.9404852285855752, + "train_speed(iter/s)": 0.080816 + }, + { + "epoch": 2.1232349165596918, + "grad_norm": 0.566792368888855, + "learning_rate": 2.155350176699932e-06, + "loss": 0.1773963212966919, + "memory(GiB)": 72.48, + "step": 5480, + "token_acc": 0.9416049569291219, + "train_speed(iter/s)": 0.080815 + }, + { + "epoch": 2.125172572480442, + "grad_norm": 0.5454405546188354, + "learning_rate": 2.146572498689994e-06, + "loss": 0.1833777904510498, + "memory(GiB)": 72.48, + "step": 5485, + "token_acc": 0.9389079172704519, + "train_speed(iter/s)": 0.080814 + }, + { + "epoch": 2.127110228401192, + "grad_norm": 0.5806962847709656, + "learning_rate": 2.137807842668323e-06, + "loss": 0.1891713857650757, + "memory(GiB)": 72.48, + "step": 5490, + "token_acc": 0.933165319281249, + "train_speed(iter/s)": 0.080814 + }, + { + "epoch": 2.1290478843219414, + "grad_norm": 0.5590419769287109, + "learning_rate": 2.1290562486335635e-06, + "loss": 0.1789409875869751, + "memory(GiB)": 72.48, + "step": 5495, + "token_acc": 0.9396403965720047, + "train_speed(iter/s)": 0.080817 + }, + { + "epoch": 2.1309855402426914, + "grad_norm": 0.5818547606468201, + "learning_rate": 2.120317756524758e-06, + "loss": 0.18473920822143555, + "memory(GiB)": 72.48, + "step": 5500, + "token_acc": 0.9352483427802863, + "train_speed(iter/s)": 0.080813 + }, + { + "epoch": 2.1329231961634414, + "grad_norm": 0.5636138319969177, + "learning_rate": 2.111592406221154e-06, + "loss": 0.18380820751190186, + "memory(GiB)": 72.48, + "step": 5505, + "token_acc": 0.9377715487035739, + "train_speed(iter/s)": 0.080818 + }, + { + "epoch": 2.134860852084191, + "grad_norm": 0.5725641846656799, + "learning_rate": 2.1028802375420244e-06, + "loss": 0.17626445293426513, + "memory(GiB)": 72.48, + "step": 5510, + "token_acc": 0.9363154366506186, + "train_speed(iter/s)": 0.080821 + }, + { + "epoch": 2.136798508004941, + "grad_norm": 0.5447889566421509, + "learning_rate": 2.094181290246487e-06, + "loss": 0.17057907581329346, + "memory(GiB)": 72.48, + "step": 5515, + "token_acc": 0.9342668222481525, + "train_speed(iter/s)": 0.080821 + }, + { + "epoch": 2.138736163925691, + "grad_norm": 0.5622581243515015, + "learning_rate": 2.085495604033321e-06, + "loss": 0.17494313716888427, + "memory(GiB)": 72.48, + "step": 5520, + "token_acc": 0.9453718261800421, + "train_speed(iter/s)": 0.080822 + }, + { + "epoch": 2.1406738198464406, + "grad_norm": 0.583169162273407, + "learning_rate": 2.076823218540789e-06, + "loss": 0.1729996919631958, + "memory(GiB)": 72.48, + "step": 5525, + "token_acc": 0.9369956443934226, + "train_speed(iter/s)": 0.080828 + }, + { + "epoch": 2.1426114757671906, + "grad_norm": 0.5594836473464966, + "learning_rate": 2.068164173346449e-06, + "loss": 0.18059495687484742, + "memory(GiB)": 72.48, + "step": 5530, + "token_acc": 0.927891023765402, + "train_speed(iter/s)": 0.080827 + }, + { + "epoch": 2.1445491316879406, + "grad_norm": 0.5576629638671875, + "learning_rate": 2.0595185079669837e-06, + "loss": 0.18530012369155885, + "memory(GiB)": 72.48, + "step": 5535, + "token_acc": 0.9356248957116636, + "train_speed(iter/s)": 0.080828 + }, + { + "epoch": 2.1464867876086906, + "grad_norm": 0.5433487892150879, + "learning_rate": 2.050886261858015e-06, + "loss": 0.18785598278045654, + "memory(GiB)": 72.48, + "step": 5540, + "token_acc": 0.9393884512058921, + "train_speed(iter/s)": 0.080827 + }, + { + "epoch": 2.14842444352944, + "grad_norm": 0.6088826060295105, + "learning_rate": 2.0422674744139216e-06, + "loss": 0.18208863735198974, + "memory(GiB)": 72.48, + "step": 5545, + "token_acc": 0.940564921147756, + "train_speed(iter/s)": 0.080827 + }, + { + "epoch": 2.15036209945019, + "grad_norm": 0.6054071187973022, + "learning_rate": 2.033662184967663e-06, + "loss": 0.18218772411346434, + "memory(GiB)": 72.48, + "step": 5550, + "token_acc": 0.9345688912649618, + "train_speed(iter/s)": 0.080828 + }, + { + "epoch": 2.15229975537094, + "grad_norm": 0.5496972799301147, + "learning_rate": 2.0250704327906025e-06, + "loss": 0.17930705547332765, + "memory(GiB)": 72.48, + "step": 5555, + "token_acc": 0.9460105112279026, + "train_speed(iter/s)": 0.080825 + }, + { + "epoch": 2.1542374112916898, + "grad_norm": 0.576370120048523, + "learning_rate": 2.016492257092316e-06, + "loss": 0.1762238025665283, + "memory(GiB)": 72.48, + "step": 5560, + "token_acc": 0.9480714235893144, + "train_speed(iter/s)": 0.080827 + }, + { + "epoch": 2.15617506721244, + "grad_norm": 0.5276342034339905, + "learning_rate": 2.0079276970204278e-06, + "loss": 0.17920913696289062, + "memory(GiB)": 72.48, + "step": 5565, + "token_acc": 0.9295376792944352, + "train_speed(iter/s)": 0.080828 + }, + { + "epoch": 2.15811272313319, + "grad_norm": 0.5416987538337708, + "learning_rate": 1.9993767916604244e-06, + "loss": 0.17366034984588624, + "memory(GiB)": 72.48, + "step": 5570, + "token_acc": 0.9397382920110193, + "train_speed(iter/s)": 0.080829 + }, + { + "epoch": 2.1600503790539394, + "grad_norm": 0.5710574984550476, + "learning_rate": 1.9908395800354768e-06, + "loss": 0.17503013610839843, + "memory(GiB)": 72.48, + "step": 5575, + "token_acc": 0.947594501718213, + "train_speed(iter/s)": 0.080832 + }, + { + "epoch": 2.1619880349746894, + "grad_norm": 0.5497194528579712, + "learning_rate": 1.982316101106263e-06, + "loss": 0.17725342512130737, + "memory(GiB)": 72.48, + "step": 5580, + "token_acc": 0.9366713820442807, + "train_speed(iter/s)": 0.080836 + }, + { + "epoch": 2.1639256908954394, + "grad_norm": 0.5640018582344055, + "learning_rate": 1.97380639377079e-06, + "loss": 0.1781161069869995, + "memory(GiB)": 72.48, + "step": 5585, + "token_acc": 0.9356641580134006, + "train_speed(iter/s)": 0.080838 + }, + { + "epoch": 2.165863346816189, + "grad_norm": 0.5462841391563416, + "learning_rate": 1.965310496864217e-06, + "loss": 0.17803661823272704, + "memory(GiB)": 72.48, + "step": 5590, + "token_acc": 0.9411239374193762, + "train_speed(iter/s)": 0.080837 + }, + { + "epoch": 2.167801002736939, + "grad_norm": 0.5980976819992065, + "learning_rate": 1.956828449158675e-06, + "loss": 0.18186450004577637, + "memory(GiB)": 72.48, + "step": 5595, + "token_acc": 0.9349667665841261, + "train_speed(iter/s)": 0.08084 + }, + { + "epoch": 2.169738658657689, + "grad_norm": 0.5738852620124817, + "learning_rate": 1.948360289363094e-06, + "loss": 0.17838164567947387, + "memory(GiB)": 72.48, + "step": 5600, + "token_acc": 0.9373933206660011, + "train_speed(iter/s)": 0.080837 + }, + { + "epoch": 2.1716763145784386, + "grad_norm": 0.5519325137138367, + "learning_rate": 1.939906056123025e-06, + "loss": 0.18007891178131102, + "memory(GiB)": 72.48, + "step": 5605, + "token_acc": 0.9323807416305757, + "train_speed(iter/s)": 0.080831 + }, + { + "epoch": 2.1736139704991886, + "grad_norm": 0.5649546384811401, + "learning_rate": 1.931465788020464e-06, + "loss": 0.17921049594879152, + "memory(GiB)": 72.48, + "step": 5610, + "token_acc": 0.9420893516963386, + "train_speed(iter/s)": 0.080822 + }, + { + "epoch": 2.1755516264199386, + "grad_norm": 0.5733870267868042, + "learning_rate": 1.923039523573676e-06, + "loss": 0.17436559200286866, + "memory(GiB)": 72.48, + "step": 5615, + "token_acc": 0.9356518043701526, + "train_speed(iter/s)": 0.080822 + }, + { + "epoch": 2.177489282340688, + "grad_norm": 0.5561407208442688, + "learning_rate": 1.914627301237014e-06, + "loss": 0.1824689269065857, + "memory(GiB)": 72.48, + "step": 5620, + "token_acc": 0.9439788895624577, + "train_speed(iter/s)": 0.080824 + }, + { + "epoch": 2.179426938261438, + "grad_norm": 0.552563488483429, + "learning_rate": 1.9062291594007508e-06, + "loss": 0.17798593044281005, + "memory(GiB)": 72.48, + "step": 5625, + "token_acc": 0.9391477839532564, + "train_speed(iter/s)": 0.080828 + }, + { + "epoch": 2.181364594182188, + "grad_norm": 0.5480149388313293, + "learning_rate": 1.8978451363909028e-06, + "loss": 0.17788171768188477, + "memory(GiB)": 72.48, + "step": 5630, + "token_acc": 0.9355552744750822, + "train_speed(iter/s)": 0.08083 + }, + { + "epoch": 2.1833022501029378, + "grad_norm": 0.5488108992576599, + "learning_rate": 1.889475270469051e-06, + "loss": 0.18669912815093995, + "memory(GiB)": 72.48, + "step": 5635, + "token_acc": 0.9317667314989, + "train_speed(iter/s)": 0.080832 + }, + { + "epoch": 2.185239906023688, + "grad_norm": 0.5238755941390991, + "learning_rate": 1.8811195998321696e-06, + "loss": 0.17910051345825195, + "memory(GiB)": 72.48, + "step": 5640, + "token_acc": 0.9435528162567889, + "train_speed(iter/s)": 0.080835 + }, + { + "epoch": 2.187177561944438, + "grad_norm": 0.5725862383842468, + "learning_rate": 1.8727781626124502e-06, + "loss": 0.1775214672088623, + "memory(GiB)": 72.48, + "step": 5645, + "token_acc": 0.9405904786954234, + "train_speed(iter/s)": 0.080837 + }, + { + "epoch": 2.1891152178651874, + "grad_norm": 0.5793642401695251, + "learning_rate": 1.8644509968771302e-06, + "loss": 0.1762074589729309, + "memory(GiB)": 72.48, + "step": 5650, + "token_acc": 0.9406073468368653, + "train_speed(iter/s)": 0.080838 + }, + { + "epoch": 2.1910528737859374, + "grad_norm": 0.5786473155021667, + "learning_rate": 1.8561381406283125e-06, + "loss": 0.18379080295562744, + "memory(GiB)": 72.48, + "step": 5655, + "token_acc": 0.939537311971808, + "train_speed(iter/s)": 0.080836 + }, + { + "epoch": 2.1929905297066874, + "grad_norm": 0.5768445134162903, + "learning_rate": 1.847839631802802e-06, + "loss": 0.1796635150909424, + "memory(GiB)": 72.48, + "step": 5660, + "token_acc": 0.9358396971477058, + "train_speed(iter/s)": 0.080837 + }, + { + "epoch": 2.1949281856274374, + "grad_norm": 0.5747734308242798, + "learning_rate": 1.8395555082719242e-06, + "loss": 0.1772952437400818, + "memory(GiB)": 72.48, + "step": 5665, + "token_acc": 0.9418783422459893, + "train_speed(iter/s)": 0.080839 + }, + { + "epoch": 2.196865841548187, + "grad_norm": 0.5827616453170776, + "learning_rate": 1.831285807841357e-06, + "loss": 0.1837351679801941, + "memory(GiB)": 72.48, + "step": 5670, + "token_acc": 0.9460246982100735, + "train_speed(iter/s)": 0.080843 + }, + { + "epoch": 2.198803497468937, + "grad_norm": 0.5412510633468628, + "learning_rate": 1.823030568250958e-06, + "loss": 0.1785785436630249, + "memory(GiB)": 72.48, + "step": 5675, + "token_acc": 0.9367165906023012, + "train_speed(iter/s)": 0.080838 + }, + { + "epoch": 2.200741153389687, + "grad_norm": 0.577472448348999, + "learning_rate": 1.8147898271745856e-06, + "loss": 0.17977702617645264, + "memory(GiB)": 72.48, + "step": 5680, + "token_acc": 0.9380172565592534, + "train_speed(iter/s)": 0.080839 + }, + { + "epoch": 2.2026788093104366, + "grad_norm": 0.599717378616333, + "learning_rate": 1.8065636222199363e-06, + "loss": 0.18185811042785643, + "memory(GiB)": 72.48, + "step": 5685, + "token_acc": 0.9329965688143347, + "train_speed(iter/s)": 0.080843 + }, + { + "epoch": 2.2046164652311866, + "grad_norm": 0.5418193936347961, + "learning_rate": 1.7983519909283698e-06, + "loss": 0.1911879301071167, + "memory(GiB)": 72.48, + "step": 5690, + "token_acc": 0.9374051225735602, + "train_speed(iter/s)": 0.080841 + }, + { + "epoch": 2.2065541211519366, + "grad_norm": 0.5883736610412598, + "learning_rate": 1.7901549707747346e-06, + "loss": 0.17937839031219482, + "memory(GiB)": 72.48, + "step": 5695, + "token_acc": 0.9443733016834807, + "train_speed(iter/s)": 0.080839 + }, + { + "epoch": 2.208491777072686, + "grad_norm": 0.5832462310791016, + "learning_rate": 1.7819725991672004e-06, + "loss": 0.1827946901321411, + "memory(GiB)": 72.48, + "step": 5700, + "token_acc": 0.9389562126353207, + "train_speed(iter/s)": 0.080839 + }, + { + "epoch": 2.210429432993436, + "grad_norm": 0.5429852604866028, + "learning_rate": 1.7738049134470875e-06, + "loss": 0.1846563220024109, + "memory(GiB)": 72.48, + "step": 5705, + "token_acc": 0.9425343639606869, + "train_speed(iter/s)": 0.080838 + }, + { + "epoch": 2.212367088914186, + "grad_norm": 0.5365347266197205, + "learning_rate": 1.7656519508886943e-06, + "loss": 0.17892229557037354, + "memory(GiB)": 72.48, + "step": 5710, + "token_acc": 0.9343960349619905, + "train_speed(iter/s)": 0.080843 + }, + { + "epoch": 2.214304744834936, + "grad_norm": 0.5458106398582458, + "learning_rate": 1.7575137486991255e-06, + "loss": 0.18492650985717773, + "memory(GiB)": 72.48, + "step": 5715, + "token_acc": 0.9375423428073685, + "train_speed(iter/s)": 0.080843 + }, + { + "epoch": 2.216242400755686, + "grad_norm": 0.5663112998008728, + "learning_rate": 1.7493903440181293e-06, + "loss": 0.1814013957977295, + "memory(GiB)": 72.48, + "step": 5720, + "token_acc": 0.9399794251912814, + "train_speed(iter/s)": 0.080841 + }, + { + "epoch": 2.218180056676436, + "grad_norm": 0.5446045994758606, + "learning_rate": 1.7412817739179211e-06, + "loss": 0.17452262639999389, + "memory(GiB)": 72.48, + "step": 5725, + "token_acc": 0.936608194789894, + "train_speed(iter/s)": 0.080845 + }, + { + "epoch": 2.2201177125971854, + "grad_norm": 0.5571759343147278, + "learning_rate": 1.7331880754030172e-06, + "loss": 0.1752955675125122, + "memory(GiB)": 72.48, + "step": 5730, + "token_acc": 0.9398055752269362, + "train_speed(iter/s)": 0.080844 + }, + { + "epoch": 2.2220553685179354, + "grad_norm": 0.605722188949585, + "learning_rate": 1.725109285410066e-06, + "loss": 0.18015103340148925, + "memory(GiB)": 72.48, + "step": 5735, + "token_acc": 0.9428771929824561, + "train_speed(iter/s)": 0.08084 + }, + { + "epoch": 2.2239930244386854, + "grad_norm": 0.5884843468666077, + "learning_rate": 1.7170454408076797e-06, + "loss": 0.18069713115692138, + "memory(GiB)": 72.48, + "step": 5740, + "token_acc": 0.9442363301787592, + "train_speed(iter/s)": 0.080836 + }, + { + "epoch": 2.225930680359435, + "grad_norm": 0.5465793609619141, + "learning_rate": 1.7089965783962608e-06, + "loss": 0.17759050130844117, + "memory(GiB)": 72.48, + "step": 5745, + "token_acc": 0.942534179533327, + "train_speed(iter/s)": 0.080837 + }, + { + "epoch": 2.227868336280185, + "grad_norm": 0.5535234212875366, + "learning_rate": 1.7009627349078434e-06, + "loss": 0.18377819061279296, + "memory(GiB)": 72.48, + "step": 5750, + "token_acc": 0.9349665677107899, + "train_speed(iter/s)": 0.080841 + }, + { + "epoch": 2.229805992200935, + "grad_norm": 0.5973391532897949, + "learning_rate": 1.6929439470059195e-06, + "loss": 0.18841125965118408, + "memory(GiB)": 72.48, + "step": 5755, + "token_acc": 0.9280665540751861, + "train_speed(iter/s)": 0.080836 + }, + { + "epoch": 2.231743648121685, + "grad_norm": 0.5607689619064331, + "learning_rate": 1.6849402512852724e-06, + "loss": 0.1824529767036438, + "memory(GiB)": 72.48, + "step": 5760, + "token_acc": 0.9380976346250629, + "train_speed(iter/s)": 0.080836 + }, + { + "epoch": 2.2336813040424346, + "grad_norm": 0.5884976983070374, + "learning_rate": 1.6769516842718115e-06, + "loss": 0.1874626874923706, + "memory(GiB)": 72.48, + "step": 5765, + "token_acc": 0.9303225806451613, + "train_speed(iter/s)": 0.080841 + }, + { + "epoch": 2.2356189599631846, + "grad_norm": 0.5460970401763916, + "learning_rate": 1.6689782824224037e-06, + "loss": 0.18645325899124146, + "memory(GiB)": 72.48, + "step": 5770, + "token_acc": 0.9363728328740364, + "train_speed(iter/s)": 0.080838 + }, + { + "epoch": 2.2375566158839346, + "grad_norm": 0.577038586139679, + "learning_rate": 1.6610200821247051e-06, + "loss": 0.19113829135894775, + "memory(GiB)": 72.48, + "step": 5775, + "token_acc": 0.9348496530454896, + "train_speed(iter/s)": 0.080836 + }, + { + "epoch": 2.239494271804684, + "grad_norm": 0.5635644197463989, + "learning_rate": 1.6530771196970014e-06, + "loss": 0.17582175731658936, + "memory(GiB)": 72.48, + "step": 5780, + "token_acc": 0.9394200897480152, + "train_speed(iter/s)": 0.080841 + }, + { + "epoch": 2.241431927725434, + "grad_norm": 0.5600139498710632, + "learning_rate": 1.6451494313880362e-06, + "loss": 0.18222497701644896, + "memory(GiB)": 72.48, + "step": 5785, + "token_acc": 0.9339602988006309, + "train_speed(iter/s)": 0.080841 + }, + { + "epoch": 2.243369583646184, + "grad_norm": 0.5830628275871277, + "learning_rate": 1.637237053376849e-06, + "loss": 0.18120429515838624, + "memory(GiB)": 72.48, + "step": 5790, + "token_acc": 0.9346733668341709, + "train_speed(iter/s)": 0.08084 + }, + { + "epoch": 2.245307239566934, + "grad_norm": 0.5865716934204102, + "learning_rate": 1.6293400217726074e-06, + "loss": 0.17290226221084595, + "memory(GiB)": 72.48, + "step": 5795, + "token_acc": 0.9306092690039942, + "train_speed(iter/s)": 0.080847 + }, + { + "epoch": 2.247244895487684, + "grad_norm": 0.558928370475769, + "learning_rate": 1.6214583726144462e-06, + "loss": 0.18007926940917968, + "memory(GiB)": 72.48, + "step": 5800, + "token_acc": 0.938637446403049, + "train_speed(iter/s)": 0.080846 + }, + { + "epoch": 2.249182551408434, + "grad_norm": 0.5890582799911499, + "learning_rate": 1.6135921418712959e-06, + "loss": 0.18051011562347413, + "memory(GiB)": 72.48, + "step": 5805, + "token_acc": 0.9370391190767501, + "train_speed(iter/s)": 0.080848 + }, + { + "epoch": 2.2511202073291834, + "grad_norm": 0.5818348526954651, + "learning_rate": 1.605741365441726e-06, + "loss": 0.17387816905975342, + "memory(GiB)": 72.48, + "step": 5810, + "token_acc": 0.9401293419561304, + "train_speed(iter/s)": 0.080842 + }, + { + "epoch": 2.2530578632499334, + "grad_norm": 0.5488393902778625, + "learning_rate": 1.597906079153778e-06, + "loss": 0.1793942928314209, + "memory(GiB)": 72.48, + "step": 5815, + "token_acc": 0.9430531840890407, + "train_speed(iter/s)": 0.080839 + }, + { + "epoch": 2.2549955191706834, + "grad_norm": 0.5455806851387024, + "learning_rate": 1.590086318764803e-06, + "loss": 0.1723968505859375, + "memory(GiB)": 72.48, + "step": 5820, + "token_acc": 0.9418294573643411, + "train_speed(iter/s)": 0.080836 + }, + { + "epoch": 2.256933175091433, + "grad_norm": 0.5931349396705627, + "learning_rate": 1.582282119961296e-06, + "loss": 0.1813875675201416, + "memory(GiB)": 72.48, + "step": 5825, + "token_acc": 0.9381025025154988, + "train_speed(iter/s)": 0.080835 + }, + { + "epoch": 2.258870831012183, + "grad_norm": 0.6027455925941467, + "learning_rate": 1.5744935183587362e-06, + "loss": 0.17939608097076415, + "memory(GiB)": 72.48, + "step": 5830, + "token_acc": 0.9448940650085538, + "train_speed(iter/s)": 0.080835 + }, + { + "epoch": 2.260808486932933, + "grad_norm": 0.5705295205116272, + "learning_rate": 1.5667205495014203e-06, + "loss": 0.1888546347618103, + "memory(GiB)": 72.48, + "step": 5835, + "token_acc": 0.927797833935018, + "train_speed(iter/s)": 0.080836 + }, + { + "epoch": 2.2627461428536826, + "grad_norm": 0.545686662197113, + "learning_rate": 1.5589632488623053e-06, + "loss": 0.1704582691192627, + "memory(GiB)": 72.48, + "step": 5840, + "token_acc": 0.9393606027269393, + "train_speed(iter/s)": 0.080833 + }, + { + "epoch": 2.2646837987744326, + "grad_norm": 0.5886187553405762, + "learning_rate": 1.5512216518428435e-06, + "loss": 0.1811639666557312, + "memory(GiB)": 72.48, + "step": 5845, + "token_acc": 0.9381139489194499, + "train_speed(iter/s)": 0.080833 + }, + { + "epoch": 2.2666214546951826, + "grad_norm": 0.5708662271499634, + "learning_rate": 1.5434957937728223e-06, + "loss": 0.18458144664764403, + "memory(GiB)": 72.48, + "step": 5850, + "token_acc": 0.9392434210526316, + "train_speed(iter/s)": 0.080831 + }, + { + "epoch": 2.268559110615932, + "grad_norm": 0.5548220872879028, + "learning_rate": 1.535785709910202e-06, + "loss": 0.1823875665664673, + "memory(GiB)": 72.48, + "step": 5855, + "token_acc": 0.9409633096783935, + "train_speed(iter/s)": 0.080825 + }, + { + "epoch": 2.270496766536682, + "grad_norm": 0.5868509411811829, + "learning_rate": 1.528091435440956e-06, + "loss": 0.1741629958152771, + "memory(GiB)": 72.48, + "step": 5860, + "token_acc": 0.937952826274297, + "train_speed(iter/s)": 0.080828 + }, + { + "epoch": 2.272434422457432, + "grad_norm": 0.570006251335144, + "learning_rate": 1.5204130054789056e-06, + "loss": 0.17339808940887452, + "memory(GiB)": 72.48, + "step": 5865, + "token_acc": 0.9392643559899685, + "train_speed(iter/s)": 0.080823 + }, + { + "epoch": 2.274372078378182, + "grad_norm": 0.5686207413673401, + "learning_rate": 1.5127504550655687e-06, + "loss": 0.17772071361541747, + "memory(GiB)": 72.48, + "step": 5870, + "token_acc": 0.9382895076050528, + "train_speed(iter/s)": 0.080827 + }, + { + "epoch": 2.276309734298932, + "grad_norm": 0.5607608556747437, + "learning_rate": 1.5051038191699919e-06, + "loss": 0.1857010006904602, + "memory(GiB)": 72.48, + "step": 5875, + "token_acc": 0.9359289971495206, + "train_speed(iter/s)": 0.080832 + }, + { + "epoch": 2.278247390219682, + "grad_norm": 0.5881747603416443, + "learning_rate": 1.497473132688595e-06, + "loss": 0.18277556896209718, + "memory(GiB)": 72.48, + "step": 5880, + "token_acc": 0.937333792197228, + "train_speed(iter/s)": 0.080831 + }, + { + "epoch": 2.2801850461404314, + "grad_norm": 0.5419440865516663, + "learning_rate": 1.4898584304450102e-06, + "loss": 0.16934518814086913, + "memory(GiB)": 72.48, + "step": 5885, + "token_acc": 0.9370211175481219, + "train_speed(iter/s)": 0.080831 + }, + { + "epoch": 2.2821227020611814, + "grad_norm": 0.5387718081474304, + "learning_rate": 1.4822597471899257e-06, + "loss": 0.1737461805343628, + "memory(GiB)": 72.48, + "step": 5890, + "token_acc": 0.9424770290964778, + "train_speed(iter/s)": 0.080829 + }, + { + "epoch": 2.2840603579819314, + "grad_norm": 0.5882220268249512, + "learning_rate": 1.4746771176009184e-06, + "loss": 0.18355174064636232, + "memory(GiB)": 72.48, + "step": 5895, + "token_acc": 0.9305537914390957, + "train_speed(iter/s)": 0.080822 + }, + { + "epoch": 2.2859980139026814, + "grad_norm": 0.5630345344543457, + "learning_rate": 1.4671105762823097e-06, + "loss": 0.17446522712707518, + "memory(GiB)": 72.48, + "step": 5900, + "token_acc": 0.9483853713980076, + "train_speed(iter/s)": 0.080821 + }, + { + "epoch": 2.287935669823431, + "grad_norm": 0.5718346238136292, + "learning_rate": 1.4595601577649954e-06, + "loss": 0.1837272524833679, + "memory(GiB)": 72.48, + "step": 5905, + "token_acc": 0.9470140331004971, + "train_speed(iter/s)": 0.080825 + }, + { + "epoch": 2.289873325744181, + "grad_norm": 0.5395617485046387, + "learning_rate": 1.4520258965062955e-06, + "loss": 0.19387768507003783, + "memory(GiB)": 72.48, + "step": 5910, + "token_acc": 0.9369055466616443, + "train_speed(iter/s)": 0.080821 + }, + { + "epoch": 2.291810981664931, + "grad_norm": 0.5457859039306641, + "learning_rate": 1.4445078268897928e-06, + "loss": 0.1853898286819458, + "memory(GiB)": 72.48, + "step": 5915, + "token_acc": 0.9348705882352941, + "train_speed(iter/s)": 0.080823 + }, + { + "epoch": 2.2937486375856806, + "grad_norm": 0.5322543382644653, + "learning_rate": 1.4370059832251771e-06, + "loss": 0.17845585346221923, + "memory(GiB)": 72.48, + "step": 5920, + "token_acc": 0.9355810524966348, + "train_speed(iter/s)": 0.08082 + }, + { + "epoch": 2.2956862935064306, + "grad_norm": 0.5761004090309143, + "learning_rate": 1.4295203997480927e-06, + "loss": 0.17757489681243896, + "memory(GiB)": 72.48, + "step": 5925, + "token_acc": 0.9361775170089258, + "train_speed(iter/s)": 0.080827 + }, + { + "epoch": 2.2976239494271806, + "grad_norm": 0.6075831651687622, + "learning_rate": 1.4220511106199707e-06, + "loss": 0.18235840797424316, + "memory(GiB)": 72.48, + "step": 5930, + "token_acc": 0.940095087163233, + "train_speed(iter/s)": 0.080825 + }, + { + "epoch": 2.29956160534793, + "grad_norm": 0.5502698421478271, + "learning_rate": 1.4145981499278877e-06, + "loss": 0.17435197830200194, + "memory(GiB)": 72.48, + "step": 5935, + "token_acc": 0.9347957784650703, + "train_speed(iter/s)": 0.080821 + }, + { + "epoch": 2.30149926126868, + "grad_norm": 0.5805485844612122, + "learning_rate": 1.4071615516844012e-06, + "loss": 0.1907791018486023, + "memory(GiB)": 72.48, + "step": 5940, + "token_acc": 0.9363292452254561, + "train_speed(iter/s)": 0.080821 + }, + { + "epoch": 2.30343691718943, + "grad_norm": 0.5583797097206116, + "learning_rate": 1.399741349827396e-06, + "loss": 0.1786908984184265, + "memory(GiB)": 72.48, + "step": 5945, + "token_acc": 0.9454643241305127, + "train_speed(iter/s)": 0.08082 + }, + { + "epoch": 2.30537457311018, + "grad_norm": 0.5587339997291565, + "learning_rate": 1.3923375782199312e-06, + "loss": 0.18267627954483032, + "memory(GiB)": 72.48, + "step": 5950, + "token_acc": 0.9357506361323156, + "train_speed(iter/s)": 0.080821 + }, + { + "epoch": 2.30731222903093, + "grad_norm": 0.6145024299621582, + "learning_rate": 1.3849502706500833e-06, + "loss": 0.18850283622741698, + "memory(GiB)": 72.48, + "step": 5955, + "token_acc": 0.9302015806766717, + "train_speed(iter/s)": 0.080825 + }, + { + "epoch": 2.30924988495168, + "grad_norm": 0.5499513149261475, + "learning_rate": 1.377579460830792e-06, + "loss": 0.1795741558074951, + "memory(GiB)": 72.48, + "step": 5960, + "token_acc": 0.9391224050070112, + "train_speed(iter/s)": 0.080822 + }, + { + "epoch": 2.3111875408724294, + "grad_norm": 0.5719257593154907, + "learning_rate": 1.3702251823997082e-06, + "loss": 0.1977135419845581, + "memory(GiB)": 72.48, + "step": 5965, + "token_acc": 0.929769291964996, + "train_speed(iter/s)": 0.080826 + }, + { + "epoch": 2.3131251967931794, + "grad_norm": 0.553665816783905, + "learning_rate": 1.3628874689190409e-06, + "loss": 0.1760319709777832, + "memory(GiB)": 72.48, + "step": 5970, + "token_acc": 0.9368856121537087, + "train_speed(iter/s)": 0.080828 + }, + { + "epoch": 2.3150628527139294, + "grad_norm": 0.5423727631568909, + "learning_rate": 1.3555663538754016e-06, + "loss": 0.1752350687980652, + "memory(GiB)": 72.48, + "step": 5975, + "token_acc": 0.9409250955490686, + "train_speed(iter/s)": 0.08083 + }, + { + "epoch": 2.3170005086346794, + "grad_norm": 0.5965731143951416, + "learning_rate": 1.3482618706796536e-06, + "loss": 0.18348853588104247, + "memory(GiB)": 72.48, + "step": 5980, + "token_acc": 0.9422314911366007, + "train_speed(iter/s)": 0.08083 + }, + { + "epoch": 2.318938164555429, + "grad_norm": 0.5590642690658569, + "learning_rate": 1.3409740526667581e-06, + "loss": 0.17766213417053223, + "memory(GiB)": 72.48, + "step": 5985, + "token_acc": 0.9440682217205919, + "train_speed(iter/s)": 0.080834 + }, + { + "epoch": 2.320875820476179, + "grad_norm": 0.5313416719436646, + "learning_rate": 1.3337029330956203e-06, + "loss": 0.17021008729934692, + "memory(GiB)": 72.48, + "step": 5990, + "token_acc": 0.9451896876964997, + "train_speed(iter/s)": 0.080835 + }, + { + "epoch": 2.322813476396929, + "grad_norm": 0.5804678797721863, + "learning_rate": 1.3264485451489435e-06, + "loss": 0.17493901252746583, + "memory(GiB)": 72.48, + "step": 5995, + "token_acc": 0.9351591363590144, + "train_speed(iter/s)": 0.080841 + }, + { + "epoch": 2.3247511323176786, + "grad_norm": 0.5881349444389343, + "learning_rate": 1.3192109219330717e-06, + "loss": 0.1815449118614197, + "memory(GiB)": 72.48, + "step": 6000, + "token_acc": 0.931446340860101, + "train_speed(iter/s)": 0.080844 + }, + { + "epoch": 2.3266887882384286, + "grad_norm": 0.6008525490760803, + "learning_rate": 1.311990096477842e-06, + "loss": 0.1786556601524353, + "memory(GiB)": 72.48, + "step": 6005, + "token_acc": 0.9421632364334557, + "train_speed(iter/s)": 0.080843 + }, + { + "epoch": 2.3286264441591786, + "grad_norm": 0.5839970111846924, + "learning_rate": 1.3047861017364332e-06, + "loss": 0.18735158443450928, + "memory(GiB)": 72.48, + "step": 6010, + "token_acc": 0.9399279134961954, + "train_speed(iter/s)": 0.080844 + }, + { + "epoch": 2.330564100079928, + "grad_norm": 0.5671572685241699, + "learning_rate": 1.2975989705852144e-06, + "loss": 0.17721595764160156, + "memory(GiB)": 72.48, + "step": 6015, + "token_acc": 0.9393546294794876, + "train_speed(iter/s)": 0.080846 + }, + { + "epoch": 2.332501756000678, + "grad_norm": 0.5906935334205627, + "learning_rate": 1.290428735823593e-06, + "loss": 0.1887272357940674, + "memory(GiB)": 72.48, + "step": 6020, + "token_acc": 0.9377751858871556, + "train_speed(iter/s)": 0.080846 + }, + { + "epoch": 2.3344394119214282, + "grad_norm": 0.5749350190162659, + "learning_rate": 1.283275430173871e-06, + "loss": 0.18697314262390136, + "memory(GiB)": 72.48, + "step": 6025, + "token_acc": 0.944213984667972, + "train_speed(iter/s)": 0.080846 + }, + { + "epoch": 2.336377067842178, + "grad_norm": 0.5701440572738647, + "learning_rate": 1.2761390862810907e-06, + "loss": 0.1756414532661438, + "memory(GiB)": 72.48, + "step": 6030, + "token_acc": 0.9355313030976343, + "train_speed(iter/s)": 0.080841 + }, + { + "epoch": 2.338314723762928, + "grad_norm": 0.5958893299102783, + "learning_rate": 1.2690197367128886e-06, + "loss": 0.18113667964935304, + "memory(GiB)": 72.48, + "step": 6035, + "token_acc": 0.9414217825623121, + "train_speed(iter/s)": 0.080838 + }, + { + "epoch": 2.340252379683678, + "grad_norm": 0.5517078042030334, + "learning_rate": 1.2619174139593426e-06, + "loss": 0.17177793979644776, + "memory(GiB)": 72.48, + "step": 6040, + "token_acc": 0.9472622376384808, + "train_speed(iter/s)": 0.080835 + }, + { + "epoch": 2.3421900356044274, + "grad_norm": 0.539323627948761, + "learning_rate": 1.2548321504328309e-06, + "loss": 0.16735206842422484, + "memory(GiB)": 72.48, + "step": 6045, + "token_acc": 0.937607171866069, + "train_speed(iter/s)": 0.080835 + }, + { + "epoch": 2.3441276915251774, + "grad_norm": 0.5506569743156433, + "learning_rate": 1.247763978467874e-06, + "loss": 0.1754169464111328, + "memory(GiB)": 72.48, + "step": 6050, + "token_acc": 0.9432072879762852, + "train_speed(iter/s)": 0.080839 + }, + { + "epoch": 2.3460653474459274, + "grad_norm": 0.5305989384651184, + "learning_rate": 1.2407129303209964e-06, + "loss": 0.1766362190246582, + "memory(GiB)": 72.48, + "step": 6055, + "token_acc": 0.9392191659272404, + "train_speed(iter/s)": 0.080841 + }, + { + "epoch": 2.348003003366677, + "grad_norm": 0.5523601770401001, + "learning_rate": 1.233679038170576e-06, + "loss": 0.18580877780914307, + "memory(GiB)": 72.48, + "step": 6060, + "token_acc": 0.9315313647443612, + "train_speed(iter/s)": 0.080841 + }, + { + "epoch": 2.349940659287427, + "grad_norm": 0.5426312685012817, + "learning_rate": 1.2266623341166955e-06, + "loss": 0.1706899881362915, + "memory(GiB)": 72.48, + "step": 6065, + "token_acc": 0.9475939237279133, + "train_speed(iter/s)": 0.080844 + }, + { + "epoch": 2.351878315208177, + "grad_norm": 0.5808708667755127, + "learning_rate": 1.2196628501809994e-06, + "loss": 0.18040728569030762, + "memory(GiB)": 72.48, + "step": 6070, + "token_acc": 0.941207314675231, + "train_speed(iter/s)": 0.080846 + }, + { + "epoch": 2.3538159711289266, + "grad_norm": 0.5775749087333679, + "learning_rate": 1.2126806183065449e-06, + "loss": 0.18611918687820433, + "memory(GiB)": 72.48, + "step": 6075, + "token_acc": 0.9329947627360736, + "train_speed(iter/s)": 0.080846 + }, + { + "epoch": 2.3557536270496766, + "grad_norm": 0.5580968856811523, + "learning_rate": 1.2057156703576557e-06, + "loss": 0.17953190803527833, + "memory(GiB)": 72.48, + "step": 6080, + "token_acc": 0.934079196294605, + "train_speed(iter/s)": 0.080836 + }, + { + "epoch": 2.3576912829704266, + "grad_norm": 0.5725741982460022, + "learning_rate": 1.1987680381197797e-06, + "loss": 0.1781769275665283, + "memory(GiB)": 72.48, + "step": 6085, + "token_acc": 0.9443928376949934, + "train_speed(iter/s)": 0.080836 + }, + { + "epoch": 2.359628938891176, + "grad_norm": 0.5721781849861145, + "learning_rate": 1.1918377532993425e-06, + "loss": 0.17660874128341675, + "memory(GiB)": 72.48, + "step": 6090, + "token_acc": 0.9424791673421744, + "train_speed(iter/s)": 0.080836 + }, + { + "epoch": 2.361566594811926, + "grad_norm": 0.5958847403526306, + "learning_rate": 1.184924847523602e-06, + "loss": 0.17371095418930055, + "memory(GiB)": 72.48, + "step": 6095, + "token_acc": 0.9402736922880164, + "train_speed(iter/s)": 0.080838 + }, + { + "epoch": 2.3635042507326762, + "grad_norm": 0.5600293278694153, + "learning_rate": 1.1780293523405044e-06, + "loss": 0.17254823446273804, + "memory(GiB)": 72.48, + "step": 6100, + "token_acc": 0.9396461656334889, + "train_speed(iter/s)": 0.080841 + }, + { + "epoch": 2.365441906653426, + "grad_norm": 0.5692081451416016, + "learning_rate": 1.1711512992185408e-06, + "loss": 0.18158187866210937, + "memory(GiB)": 72.48, + "step": 6105, + "token_acc": 0.9332443257676902, + "train_speed(iter/s)": 0.080838 + }, + { + "epoch": 2.367379562574176, + "grad_norm": 0.5824898481369019, + "learning_rate": 1.1642907195466047e-06, + "loss": 0.1743251323699951, + "memory(GiB)": 72.48, + "step": 6110, + "token_acc": 0.9418575383840926, + "train_speed(iter/s)": 0.080842 + }, + { + "epoch": 2.369317218494926, + "grad_norm": 0.5314090847969055, + "learning_rate": 1.1574476446338423e-06, + "loss": 0.17454535961151124, + "memory(GiB)": 72.48, + "step": 6115, + "token_acc": 0.9452273082330885, + "train_speed(iter/s)": 0.080841 + }, + { + "epoch": 2.3712548744156754, + "grad_norm": 0.5183026790618896, + "learning_rate": 1.1506221057095191e-06, + "loss": 0.1725538730621338, + "memory(GiB)": 72.48, + "step": 6120, + "token_acc": 0.9378591705324378, + "train_speed(iter/s)": 0.080834 + }, + { + "epoch": 2.3731925303364254, + "grad_norm": 0.5685054063796997, + "learning_rate": 1.143814133922872e-06, + "loss": 0.182060706615448, + "memory(GiB)": 72.48, + "step": 6125, + "token_acc": 0.9310846806665428, + "train_speed(iter/s)": 0.080835 + }, + { + "epoch": 2.3751301862571754, + "grad_norm": 0.5826152563095093, + "learning_rate": 1.137023760342967e-06, + "loss": 0.18058472871780396, + "memory(GiB)": 72.48, + "step": 6130, + "token_acc": 0.9364295232214241, + "train_speed(iter/s)": 0.080842 + }, + { + "epoch": 2.3770678421779254, + "grad_norm": 0.5600957870483398, + "learning_rate": 1.13025101595856e-06, + "loss": 0.18131771087646484, + "memory(GiB)": 72.48, + "step": 6135, + "token_acc": 0.9257859586251832, + "train_speed(iter/s)": 0.080838 + }, + { + "epoch": 2.379005498098675, + "grad_norm": 0.5904654860496521, + "learning_rate": 1.1234959316779509e-06, + "loss": 0.17485294342041016, + "memory(GiB)": 72.48, + "step": 6140, + "token_acc": 0.9428003462759306, + "train_speed(iter/s)": 0.080842 + }, + { + "epoch": 2.380943154019425, + "grad_norm": 0.6105183362960815, + "learning_rate": 1.116758538328847e-06, + "loss": 0.1759654998779297, + "memory(GiB)": 72.48, + "step": 6145, + "token_acc": 0.9417246357789736, + "train_speed(iter/s)": 0.080843 + }, + { + "epoch": 2.382880809940175, + "grad_norm": 0.5727225542068481, + "learning_rate": 1.1100388666582224e-06, + "loss": 0.17617188692092894, + "memory(GiB)": 72.48, + "step": 6150, + "token_acc": 0.9286671773266948, + "train_speed(iter/s)": 0.080848 + }, + { + "epoch": 2.3848184658609246, + "grad_norm": 0.5402427315711975, + "learning_rate": 1.1033369473321737e-06, + "loss": 0.1745760917663574, + "memory(GiB)": 72.48, + "step": 6155, + "token_acc": 0.9433132808884775, + "train_speed(iter/s)": 0.080844 + }, + { + "epoch": 2.3867561217816746, + "grad_norm": 0.5623111724853516, + "learning_rate": 1.0966528109357833e-06, + "loss": 0.17613692283630372, + "memory(GiB)": 72.48, + "step": 6160, + "token_acc": 0.939906103286385, + "train_speed(iter/s)": 0.080849 + }, + { + "epoch": 2.3886937777024246, + "grad_norm": 0.5594596266746521, + "learning_rate": 1.0899864879729782e-06, + "loss": 0.17606816291809083, + "memory(GiB)": 72.48, + "step": 6165, + "token_acc": 0.9393115942028986, + "train_speed(iter/s)": 0.080852 + }, + { + "epoch": 2.390631433623174, + "grad_norm": 0.5597702264785767, + "learning_rate": 1.083338008866394e-06, + "loss": 0.1854230523109436, + "memory(GiB)": 72.48, + "step": 6170, + "token_acc": 0.9366223908918406, + "train_speed(iter/s)": 0.080855 + }, + { + "epoch": 2.392569089543924, + "grad_norm": 0.5472501516342163, + "learning_rate": 1.076707403957229e-06, + "loss": 0.1821220636367798, + "memory(GiB)": 72.48, + "step": 6175, + "token_acc": 0.9418488444722048, + "train_speed(iter/s)": 0.080853 + }, + { + "epoch": 2.3945067454646742, + "grad_norm": 0.5826203227043152, + "learning_rate": 1.070094703505114e-06, + "loss": 0.18362678289413453, + "memory(GiB)": 72.48, + "step": 6180, + "token_acc": 0.9364526939932972, + "train_speed(iter/s)": 0.080857 + }, + { + "epoch": 2.396444401385424, + "grad_norm": 0.5729585289955139, + "learning_rate": 1.0634999376879684e-06, + "loss": 0.1807836651802063, + "memory(GiB)": 72.48, + "step": 6185, + "token_acc": 0.9344845892015115, + "train_speed(iter/s)": 0.080859 + }, + { + "epoch": 2.398382057306174, + "grad_norm": 0.5837453603744507, + "learning_rate": 1.0569231366018667e-06, + "loss": 0.18412914276123046, + "memory(GiB)": 72.48, + "step": 6190, + "token_acc": 0.9381114618822868, + "train_speed(iter/s)": 0.080861 + }, + { + "epoch": 2.400319713226924, + "grad_norm": 0.5625714063644409, + "learning_rate": 1.0503643302608986e-06, + "loss": 0.18067336082458496, + "memory(GiB)": 72.48, + "step": 6195, + "token_acc": 0.9334223206146633, + "train_speed(iter/s)": 0.080861 + }, + { + "epoch": 2.402257369147674, + "grad_norm": 0.5841348767280579, + "learning_rate": 1.0438235485970288e-06, + "loss": 0.18118083477020264, + "memory(GiB)": 72.48, + "step": 6200, + "token_acc": 0.933785735882017, + "train_speed(iter/s)": 0.080866 + }, + { + "epoch": 2.4041950250684234, + "grad_norm": 0.5591301918029785, + "learning_rate": 1.0373008214599678e-06, + "loss": 0.17193083763122557, + "memory(GiB)": 72.48, + "step": 6205, + "token_acc": 0.9449988472812304, + "train_speed(iter/s)": 0.080869 + }, + { + "epoch": 2.4061326809891734, + "grad_norm": 0.5522191524505615, + "learning_rate": 1.0307961786170318e-06, + "loss": 0.18121827840805055, + "memory(GiB)": 72.48, + "step": 6210, + "token_acc": 0.934540164861807, + "train_speed(iter/s)": 0.080868 + }, + { + "epoch": 2.4080703369099234, + "grad_norm": 0.5809552073478699, + "learning_rate": 1.0243096497530058e-06, + "loss": 0.17905534505844117, + "memory(GiB)": 72.48, + "step": 6215, + "token_acc": 0.9423424701251729, + "train_speed(iter/s)": 0.080865 + }, + { + "epoch": 2.410007992830673, + "grad_norm": 0.5794199705123901, + "learning_rate": 1.0178412644700093e-06, + "loss": 0.1741298794746399, + "memory(GiB)": 72.48, + "step": 6220, + "token_acc": 0.9405881573718373, + "train_speed(iter/s)": 0.080867 + }, + { + "epoch": 2.411945648751423, + "grad_norm": 0.5728258490562439, + "learning_rate": 1.0113910522873615e-06, + "loss": 0.1761408805847168, + "memory(GiB)": 72.48, + "step": 6225, + "token_acc": 0.9384154460719041, + "train_speed(iter/s)": 0.080867 + }, + { + "epoch": 2.413883304672173, + "grad_norm": 0.5635868906974792, + "learning_rate": 1.0049590426414479e-06, + "loss": 0.1836428999900818, + "memory(GiB)": 72.48, + "step": 6230, + "token_acc": 0.9472552491778397, + "train_speed(iter/s)": 0.080868 + }, + { + "epoch": 2.4158209605929226, + "grad_norm": 0.5687157511711121, + "learning_rate": 9.985452648855803e-07, + "loss": 0.17928725481033325, + "memory(GiB)": 72.48, + "step": 6235, + "token_acc": 0.9374187884108868, + "train_speed(iter/s)": 0.080868 + }, + { + "epoch": 2.4177586165136726, + "grad_norm": 0.5609813332557678, + "learning_rate": 9.921497482898702e-07, + "loss": 0.16830335855484008, + "memory(GiB)": 72.48, + "step": 6240, + "token_acc": 0.9397059267534192, + "train_speed(iter/s)": 0.080867 + }, + { + "epoch": 2.4196962724344226, + "grad_norm": 0.5949937701225281, + "learning_rate": 9.857725220410908e-07, + "loss": 0.17889876365661622, + "memory(GiB)": 72.48, + "step": 6245, + "token_acc": 0.9311128386055134, + "train_speed(iter/s)": 0.080867 + }, + { + "epoch": 2.421633928355172, + "grad_norm": 0.5891610980033875, + "learning_rate": 9.79413615242546e-07, + "loss": 0.1865074396133423, + "memory(GiB)": 72.48, + "step": 6250, + "token_acc": 0.9318126741790864, + "train_speed(iter/s)": 0.08086 + }, + { + "epoch": 2.4235715842759222, + "grad_norm": 0.5652107000350952, + "learning_rate": 9.730730569139368e-07, + "loss": 0.18406026363372802, + "memory(GiB)": 72.48, + "step": 6255, + "token_acc": 0.9368455497382199, + "train_speed(iter/s)": 0.080858 + }, + { + "epoch": 2.4255092401966722, + "grad_norm": 0.5708385705947876, + "learning_rate": 9.667508759912242e-07, + "loss": 0.1803309679031372, + "memory(GiB)": 72.48, + "step": 6260, + "token_acc": 0.9434427336574824, + "train_speed(iter/s)": 0.080858 + }, + { + "epoch": 2.427446896117422, + "grad_norm": 0.5924727916717529, + "learning_rate": 9.604471013265064e-07, + "loss": 0.1720449686050415, + "memory(GiB)": 72.48, + "step": 6265, + "token_acc": 0.9419114576082417, + "train_speed(iter/s)": 0.080862 + }, + { + "epoch": 2.429384552038172, + "grad_norm": 0.5573844313621521, + "learning_rate": 9.541617616878812e-07, + "loss": 0.18263744115829467, + "memory(GiB)": 72.48, + "step": 6270, + "token_acc": 0.9388622714083271, + "train_speed(iter/s)": 0.080859 + }, + { + "epoch": 2.431322207958922, + "grad_norm": 0.583974301815033, + "learning_rate": 9.478948857593146e-07, + "loss": 0.18000080585479736, + "memory(GiB)": 72.48, + "step": 6275, + "token_acc": 0.9437852006659346, + "train_speed(iter/s)": 0.080857 + }, + { + "epoch": 2.4332598638796714, + "grad_norm": 0.5686377286911011, + "learning_rate": 9.416465021405108e-07, + "loss": 0.18695411682128907, + "memory(GiB)": 72.48, + "step": 6280, + "token_acc": 0.9373716297721123, + "train_speed(iter/s)": 0.08086 + }, + { + "epoch": 2.4351975198004214, + "grad_norm": 0.5770736336708069, + "learning_rate": 9.354166393467845e-07, + "loss": 0.18186668157577515, + "memory(GiB)": 72.48, + "step": 6285, + "token_acc": 0.940144604777449, + "train_speed(iter/s)": 0.080863 + }, + { + "epoch": 2.4371351757211714, + "grad_norm": 0.5655450224876404, + "learning_rate": 9.292053258089251e-07, + "loss": 0.18221802711486818, + "memory(GiB)": 72.48, + "step": 6290, + "token_acc": 0.9383114885237831, + "train_speed(iter/s)": 0.080865 + }, + { + "epoch": 2.439072831641921, + "grad_norm": 0.5566943883895874, + "learning_rate": 9.23012589873073e-07, + "loss": 0.18132885694503784, + "memory(GiB)": 72.48, + "step": 6295, + "token_acc": 0.9439205058717254, + "train_speed(iter/s)": 0.080867 + }, + { + "epoch": 2.441010487562671, + "grad_norm": 0.548239529132843, + "learning_rate": 9.168384598005831e-07, + "loss": 0.17681779861450195, + "memory(GiB)": 72.48, + "step": 6300, + "token_acc": 0.9441413081497027, + "train_speed(iter/s)": 0.080865 + }, + { + "epoch": 2.442948143483421, + "grad_norm": 0.5674452185630798, + "learning_rate": 9.106829637679043e-07, + "loss": 0.18225634098052979, + "memory(GiB)": 72.48, + "step": 6305, + "token_acc": 0.9364469845805798, + "train_speed(iter/s)": 0.080861 + }, + { + "epoch": 2.4448857994041706, + "grad_norm": 0.5836702585220337, + "learning_rate": 9.045461298664443e-07, + "loss": 0.1857957124710083, + "memory(GiB)": 72.48, + "step": 6310, + "token_acc": 0.9381540015876453, + "train_speed(iter/s)": 0.08086 + }, + { + "epoch": 2.4468234553249206, + "grad_norm": 0.5767743587493896, + "learning_rate": 8.984279861024453e-07, + "loss": 0.18774926662445068, + "memory(GiB)": 72.48, + "step": 6315, + "token_acc": 0.9405742821473159, + "train_speed(iter/s)": 0.080862 + }, + { + "epoch": 2.4487611112456706, + "grad_norm": 0.5304026007652283, + "learning_rate": 8.92328560396854e-07, + "loss": 0.17166202068328856, + "memory(GiB)": 72.48, + "step": 6320, + "token_acc": 0.938264787094899, + "train_speed(iter/s)": 0.080859 + }, + { + "epoch": 2.45069876716642, + "grad_norm": 0.5317398309707642, + "learning_rate": 8.862478805851921e-07, + "loss": 0.17200467586517335, + "memory(GiB)": 72.48, + "step": 6325, + "token_acc": 0.94351630867144, + "train_speed(iter/s)": 0.080859 + }, + { + "epoch": 2.4526364230871702, + "grad_norm": 0.525132954120636, + "learning_rate": 8.801859744174357e-07, + "loss": 0.17509570121765136, + "memory(GiB)": 72.48, + "step": 6330, + "token_acc": 0.9369052979138526, + "train_speed(iter/s)": 0.080858 + }, + { + "epoch": 2.4545740790079202, + "grad_norm": 0.5590113401412964, + "learning_rate": 8.741428695578841e-07, + "loss": 0.18100138902664184, + "memory(GiB)": 72.48, + "step": 6335, + "token_acc": 0.9329828274896727, + "train_speed(iter/s)": 0.080856 + }, + { + "epoch": 2.45651173492867, + "grad_norm": 0.5894602537155151, + "learning_rate": 8.681185935850334e-07, + "loss": 0.18703469038009643, + "memory(GiB)": 72.48, + "step": 6340, + "token_acc": 0.9299990313519099, + "train_speed(iter/s)": 0.080856 + }, + { + "epoch": 2.45844939084942, + "grad_norm": 0.5846702456474304, + "learning_rate": 8.621131739914524e-07, + "loss": 0.1720949411392212, + "memory(GiB)": 72.48, + "step": 6345, + "token_acc": 0.9361929221840499, + "train_speed(iter/s)": 0.08086 + }, + { + "epoch": 2.46038704677017, + "grad_norm": 0.5550934076309204, + "learning_rate": 8.561266381836558e-07, + "loss": 0.18395935297012328, + "memory(GiB)": 72.48, + "step": 6350, + "token_acc": 0.9337567171470444, + "train_speed(iter/s)": 0.080865 + }, + { + "epoch": 2.46232470269092, + "grad_norm": 0.548347532749176, + "learning_rate": 8.501590134819809e-07, + "loss": 0.17986433506011962, + "memory(GiB)": 72.48, + "step": 6355, + "token_acc": 0.9395786970703899, + "train_speed(iter/s)": 0.080865 + }, + { + "epoch": 2.4642623586116694, + "grad_norm": 0.5795870423316956, + "learning_rate": 8.442103271204588e-07, + "loss": 0.18020589351654054, + "memory(GiB)": 72.48, + "step": 6360, + "token_acc": 0.9434226980205569, + "train_speed(iter/s)": 0.080866 + }, + { + "epoch": 2.4662000145324194, + "grad_norm": 0.6125761270523071, + "learning_rate": 8.382806062466958e-07, + "loss": 0.1750793695449829, + "memory(GiB)": 72.48, + "step": 6365, + "token_acc": 0.9413035279596804, + "train_speed(iter/s)": 0.080863 + }, + { + "epoch": 2.4681376704531695, + "grad_norm": 0.5819019079208374, + "learning_rate": 8.323698779217465e-07, + "loss": 0.17794256210327147, + "memory(GiB)": 72.48, + "step": 6370, + "token_acc": 0.9363123419158181, + "train_speed(iter/s)": 0.080863 + }, + { + "epoch": 2.470075326373919, + "grad_norm": 0.5621497631072998, + "learning_rate": 8.264781691199892e-07, + "loss": 0.17670561075210572, + "memory(GiB)": 72.48, + "step": 6375, + "token_acc": 0.937870866343779, + "train_speed(iter/s)": 0.080862 + }, + { + "epoch": 2.472012982294669, + "grad_norm": 0.5306990742683411, + "learning_rate": 8.206055067290059e-07, + "loss": 0.17103338241577148, + "memory(GiB)": 72.48, + "step": 6380, + "token_acc": 0.9375408569560825, + "train_speed(iter/s)": 0.08087 + }, + { + "epoch": 2.473950638215419, + "grad_norm": 0.5630704760551453, + "learning_rate": 8.14751917549455e-07, + "loss": 0.18032557964324952, + "memory(GiB)": 72.48, + "step": 6385, + "token_acc": 0.9378391711889492, + "train_speed(iter/s)": 0.080867 + }, + { + "epoch": 2.4758882941361686, + "grad_norm": 0.5820339322090149, + "learning_rate": 8.089174282949547e-07, + "loss": 0.1891242265701294, + "memory(GiB)": 72.48, + "step": 6390, + "token_acc": 0.9376766041210274, + "train_speed(iter/s)": 0.080869 + }, + { + "epoch": 2.4778259500569186, + "grad_norm": 0.5377040505409241, + "learning_rate": 8.031020655919563e-07, + "loss": 0.18622909784317015, + "memory(GiB)": 72.48, + "step": 6395, + "token_acc": 0.9344472204871955, + "train_speed(iter/s)": 0.080866 + }, + { + "epoch": 2.4797636059776687, + "grad_norm": 0.5398117303848267, + "learning_rate": 7.973058559796265e-07, + "loss": 0.1743963360786438, + "memory(GiB)": 72.48, + "step": 6400, + "token_acc": 0.9385559164124266, + "train_speed(iter/s)": 0.080866 + }, + { + "epoch": 2.481701261898418, + "grad_norm": 0.5849392414093018, + "learning_rate": 7.915288259097226e-07, + "loss": 0.17278592586517333, + "memory(GiB)": 72.48, + "step": 6405, + "token_acc": 0.9324333727645954, + "train_speed(iter/s)": 0.080869 + }, + { + "epoch": 2.4836389178191682, + "grad_norm": 0.5697488188743591, + "learning_rate": 7.857710017464737e-07, + "loss": 0.17992591857910156, + "memory(GiB)": 72.48, + "step": 6410, + "token_acc": 0.9392217270361619, + "train_speed(iter/s)": 0.080868 + }, + { + "epoch": 2.4855765737399182, + "grad_norm": 0.5471551418304443, + "learning_rate": 7.800324097664629e-07, + "loss": 0.17690662145614625, + "memory(GiB)": 72.48, + "step": 6415, + "token_acc": 0.9364684527210999, + "train_speed(iter/s)": 0.080862 + }, + { + "epoch": 2.487514229660668, + "grad_norm": 0.5691121816635132, + "learning_rate": 7.743130761584999e-07, + "loss": 0.17598928213119508, + "memory(GiB)": 72.48, + "step": 6420, + "token_acc": 0.9407388727777077, + "train_speed(iter/s)": 0.080858 + }, + { + "epoch": 2.489451885581418, + "grad_norm": 0.5582486391067505, + "learning_rate": 7.686130270235098e-07, + "loss": 0.17861016988754272, + "memory(GiB)": 72.48, + "step": 6425, + "token_acc": 0.9379861845790031, + "train_speed(iter/s)": 0.080861 + }, + { + "epoch": 2.491389541502168, + "grad_norm": 0.5467748641967773, + "learning_rate": 7.629322883744095e-07, + "loss": 0.17899925708770753, + "memory(GiB)": 72.48, + "step": 6430, + "token_acc": 0.9343893494406608, + "train_speed(iter/s)": 0.080865 + }, + { + "epoch": 2.493327197422918, + "grad_norm": 0.5240247845649719, + "learning_rate": 7.572708861359912e-07, + "loss": 0.1765903949737549, + "memory(GiB)": 72.48, + "step": 6435, + "token_acc": 0.9447293978675122, + "train_speed(iter/s)": 0.080866 + }, + { + "epoch": 2.4952648533436674, + "grad_norm": 0.5655544996261597, + "learning_rate": 7.516288461448018e-07, + "loss": 0.17434144020080566, + "memory(GiB)": 72.48, + "step": 6440, + "token_acc": 0.941343003467252, + "train_speed(iter/s)": 0.080869 + }, + { + "epoch": 2.4972025092644174, + "grad_norm": 0.5669127106666565, + "learning_rate": 7.460061941490243e-07, + "loss": 0.17970057725906372, + "memory(GiB)": 72.48, + "step": 6445, + "token_acc": 0.931559674649211, + "train_speed(iter/s)": 0.080868 + }, + { + "epoch": 2.4991401651851675, + "grad_norm": 0.61557537317276, + "learning_rate": 7.404029558083653e-07, + "loss": 0.17963045835494995, + "memory(GiB)": 72.48, + "step": 6450, + "token_acc": 0.9467308217308217, + "train_speed(iter/s)": 0.080867 + }, + { + "epoch": 2.501077821105917, + "grad_norm": 0.5478566288948059, + "learning_rate": 7.348191566939322e-07, + "loss": 0.17327165603637695, + "memory(GiB)": 72.48, + "step": 6455, + "token_acc": 0.9342934293429342, + "train_speed(iter/s)": 0.080869 + }, + { + "epoch": 2.503015477026667, + "grad_norm": 0.5557734966278076, + "learning_rate": 7.292548222881213e-07, + "loss": 0.17639074325561524, + "memory(GiB)": 72.48, + "step": 6460, + "token_acc": 0.9434013791220045, + "train_speed(iter/s)": 0.080865 + }, + { + "epoch": 2.504953132947417, + "grad_norm": 0.5693845748901367, + "learning_rate": 7.237099779844964e-07, + "loss": 0.1701158881187439, + "memory(GiB)": 72.48, + "step": 6465, + "token_acc": 0.9314411035534931, + "train_speed(iter/s)": 0.080866 + }, + { + "epoch": 2.5068907888681666, + "grad_norm": 0.6034876704216003, + "learning_rate": 7.181846490876781e-07, + "loss": 0.1744617462158203, + "memory(GiB)": 72.48, + "step": 6470, + "token_acc": 0.9363396040821346, + "train_speed(iter/s)": 0.080862 + }, + { + "epoch": 2.5088284447889166, + "grad_norm": 0.5935420393943787, + "learning_rate": 7.126788608132252e-07, + "loss": 0.17597038745880128, + "memory(GiB)": 72.48, + "step": 6475, + "token_acc": 0.9392177771662481, + "train_speed(iter/s)": 0.080866 + }, + { + "epoch": 2.5107661007096667, + "grad_norm": 0.6055991053581238, + "learning_rate": 7.071926382875194e-07, + "loss": 0.17976925373077393, + "memory(GiB)": 72.48, + "step": 6480, + "token_acc": 0.9430797089070347, + "train_speed(iter/s)": 0.080866 + }, + { + "epoch": 2.5127037566304162, + "grad_norm": 0.5665844082832336, + "learning_rate": 7.017260065476517e-07, + "loss": 0.17805242538452148, + "memory(GiB)": 72.48, + "step": 6485, + "token_acc": 0.9451027397260274, + "train_speed(iter/s)": 0.08087 + }, + { + "epoch": 2.5146414125511662, + "grad_norm": 0.5817480087280273, + "learning_rate": 6.962789905413086e-07, + "loss": 0.1649151086807251, + "memory(GiB)": 72.48, + "step": 6490, + "token_acc": 0.941207838954806, + "train_speed(iter/s)": 0.080868 + }, + { + "epoch": 2.5165790684719163, + "grad_norm": 0.5650894045829773, + "learning_rate": 6.908516151266581e-07, + "loss": 0.18006772994995118, + "memory(GiB)": 72.48, + "step": 6495, + "token_acc": 0.936968601459463, + "train_speed(iter/s)": 0.080867 + }, + { + "epoch": 2.518516724392666, + "grad_norm": 0.5909514427185059, + "learning_rate": 6.854439050722356e-07, + "loss": 0.1848963975906372, + "memory(GiB)": 72.48, + "step": 6500, + "token_acc": 0.9373421717171717, + "train_speed(iter/s)": 0.080864 + }, + { + "epoch": 2.520454380313416, + "grad_norm": 0.5733851790428162, + "learning_rate": 6.800558850568295e-07, + "loss": 0.17741067409515382, + "memory(GiB)": 72.48, + "step": 6505, + "token_acc": 0.9419108778563745, + "train_speed(iter/s)": 0.080864 + }, + { + "epoch": 2.522392036234166, + "grad_norm": 0.5257155895233154, + "learning_rate": 6.746875796693714e-07, + "loss": 0.17399654388427735, + "memory(GiB)": 72.48, + "step": 6510, + "token_acc": 0.9378819399297881, + "train_speed(iter/s)": 0.080865 + }, + { + "epoch": 2.5243296921549154, + "grad_norm": 0.5499019622802734, + "learning_rate": 6.693390134088229e-07, + "loss": 0.17450027465820311, + "memory(GiB)": 72.48, + "step": 6515, + "token_acc": 0.9319403386186509, + "train_speed(iter/s)": 0.080864 + }, + { + "epoch": 2.5262673480756654, + "grad_norm": 0.5747637748718262, + "learning_rate": 6.640102106840635e-07, + "loss": 0.18348932266235352, + "memory(GiB)": 72.48, + "step": 6520, + "token_acc": 0.9365109034267913, + "train_speed(iter/s)": 0.080865 + }, + { + "epoch": 2.5282050039964155, + "grad_norm": 0.5292996168136597, + "learning_rate": 6.587011958137779e-07, + "loss": 0.1740797519683838, + "memory(GiB)": 72.48, + "step": 6525, + "token_acc": 0.9383220106422141, + "train_speed(iter/s)": 0.08086 + }, + { + "epoch": 2.530142659917165, + "grad_norm": 0.550775945186615, + "learning_rate": 6.534119930263488e-07, + "loss": 0.18158359527587892, + "memory(GiB)": 72.48, + "step": 6530, + "token_acc": 0.9338427133805441, + "train_speed(iter/s)": 0.080859 + }, + { + "epoch": 2.532080315837915, + "grad_norm": 0.5693374872207642, + "learning_rate": 6.481426264597412e-07, + "loss": 0.17761898040771484, + "memory(GiB)": 72.48, + "step": 6535, + "token_acc": 0.9389302357932809, + "train_speed(iter/s)": 0.080866 + }, + { + "epoch": 2.534017971758665, + "grad_norm": 0.5608578324317932, + "learning_rate": 6.42893120161398e-07, + "loss": 0.17133185863494874, + "memory(GiB)": 72.48, + "step": 6540, + "token_acc": 0.9415438337306518, + "train_speed(iter/s)": 0.080866 + }, + { + "epoch": 2.5359556276794146, + "grad_norm": 0.547332763671875, + "learning_rate": 6.376634980881224e-07, + "loss": 0.18398122787475585, + "memory(GiB)": 72.48, + "step": 6545, + "token_acc": 0.9410398930556533, + "train_speed(iter/s)": 0.080868 + }, + { + "epoch": 2.5378932836001646, + "grad_norm": 0.5538179278373718, + "learning_rate": 6.324537841059781e-07, + "loss": 0.18358099460601807, + "memory(GiB)": 72.48, + "step": 6550, + "token_acc": 0.940280399029388, + "train_speed(iter/s)": 0.080866 + }, + { + "epoch": 2.5398309395209147, + "grad_norm": 0.6050009727478027, + "learning_rate": 6.272640019901732e-07, + "loss": 0.17833110094070434, + "memory(GiB)": 72.48, + "step": 6555, + "token_acc": 0.9338536585365854, + "train_speed(iter/s)": 0.080866 + }, + { + "epoch": 2.5417685954416642, + "grad_norm": 0.5568081736564636, + "learning_rate": 6.22094175424956e-07, + "loss": 0.17815144062042237, + "memory(GiB)": 72.48, + "step": 6560, + "token_acc": 0.9374870053364751, + "train_speed(iter/s)": 0.080866 + }, + { + "epoch": 2.5437062513624142, + "grad_norm": 0.5535020232200623, + "learning_rate": 6.16944328003502e-07, + "loss": 0.18283143043518066, + "memory(GiB)": 72.48, + "step": 6565, + "token_acc": 0.9375201599896781, + "train_speed(iter/s)": 0.080867 + }, + { + "epoch": 2.5456439072831643, + "grad_norm": 0.5756219625473022, + "learning_rate": 6.118144832278117e-07, + "loss": 0.17421250343322753, + "memory(GiB)": 72.48, + "step": 6570, + "token_acc": 0.944570943075616, + "train_speed(iter/s)": 0.080863 + }, + { + "epoch": 2.547581563203914, + "grad_norm": 0.5519009828567505, + "learning_rate": 6.067046645086e-07, + "loss": 0.17970068454742433, + "memory(GiB)": 72.48, + "step": 6575, + "token_acc": 0.9271296054350666, + "train_speed(iter/s)": 0.080864 + }, + { + "epoch": 2.549519219124664, + "grad_norm": 0.5751408338546753, + "learning_rate": 6.016148951651912e-07, + "loss": 0.17249932289123535, + "memory(GiB)": 72.48, + "step": 6580, + "token_acc": 0.9375794932002739, + "train_speed(iter/s)": 0.080866 + }, + { + "epoch": 2.551456875045414, + "grad_norm": 0.5762319564819336, + "learning_rate": 5.965451984254106e-07, + "loss": 0.17329201698303223, + "memory(GiB)": 72.48, + "step": 6585, + "token_acc": 0.9398695054945055, + "train_speed(iter/s)": 0.080862 + }, + { + "epoch": 2.5533945309661634, + "grad_norm": 0.5738294720649719, + "learning_rate": 5.914955974254804e-07, + "loss": 0.17728983163833617, + "memory(GiB)": 72.48, + "step": 6590, + "token_acc": 0.9353730148138263, + "train_speed(iter/s)": 0.080865 + }, + { + "epoch": 2.5553321868869134, + "grad_norm": 0.5713756084442139, + "learning_rate": 5.864661152099122e-07, + "loss": 0.18523712158203126, + "memory(GiB)": 72.48, + "step": 6595, + "token_acc": 0.9360657869076253, + "train_speed(iter/s)": 0.080866 + }, + { + "epoch": 2.5572698428076635, + "grad_norm": 0.5726301074028015, + "learning_rate": 5.814567747314049e-07, + "loss": 0.18754193782806397, + "memory(GiB)": 72.48, + "step": 6600, + "token_acc": 0.9317076065029073, + "train_speed(iter/s)": 0.080869 + }, + { + "epoch": 2.5592074987284135, + "grad_norm": 0.5507714748382568, + "learning_rate": 5.76467598850734e-07, + "loss": 0.173997163772583, + "memory(GiB)": 72.48, + "step": 6605, + "token_acc": 0.941759388038943, + "train_speed(iter/s)": 0.08087 + }, + { + "epoch": 2.561145154649163, + "grad_norm": 0.5710182189941406, + "learning_rate": 5.71498610336656e-07, + "loss": 0.17184269428253174, + "memory(GiB)": 72.48, + "step": 6610, + "token_acc": 0.9332494515421345, + "train_speed(iter/s)": 0.080873 + }, + { + "epoch": 2.563082810569913, + "grad_norm": 0.5885529518127441, + "learning_rate": 5.665498318657963e-07, + "loss": 0.18314269781112671, + "memory(GiB)": 72.48, + "step": 6615, + "token_acc": 0.9375413683133763, + "train_speed(iter/s)": 0.080869 + }, + { + "epoch": 2.565020466490663, + "grad_norm": 0.5782006978988647, + "learning_rate": 5.616212860225529e-07, + "loss": 0.1815364956855774, + "memory(GiB)": 72.48, + "step": 6620, + "token_acc": 0.936763468013468, + "train_speed(iter/s)": 0.080868 + }, + { + "epoch": 2.5669581224114126, + "grad_norm": 0.5746456384658813, + "learning_rate": 5.567129952989831e-07, + "loss": 0.18321220874786376, + "memory(GiB)": 72.48, + "step": 6625, + "token_acc": 0.9419481429572529, + "train_speed(iter/s)": 0.080867 + }, + { + "epoch": 2.5688957783321626, + "grad_norm": 0.5613101720809937, + "learning_rate": 5.518249820947141e-07, + "loss": 0.18082406520843505, + "memory(GiB)": 72.48, + "step": 6630, + "token_acc": 0.9322963191128908, + "train_speed(iter/s)": 0.080864 + }, + { + "epoch": 2.5708334342529127, + "grad_norm": 0.5367494225502014, + "learning_rate": 5.469572687168295e-07, + "loss": 0.1775603175163269, + "memory(GiB)": 72.48, + "step": 6635, + "token_acc": 0.9408753045677452, + "train_speed(iter/s)": 0.080869 + }, + { + "epoch": 2.5727710901736627, + "grad_norm": 0.5576823353767395, + "learning_rate": 5.421098773797751e-07, + "loss": 0.17280523777008056, + "memory(GiB)": 72.48, + "step": 6640, + "token_acc": 0.941501756338469, + "train_speed(iter/s)": 0.080871 + }, + { + "epoch": 2.5747087460944122, + "grad_norm": 0.5695961713790894, + "learning_rate": 5.372828302052524e-07, + "loss": 0.17974162101745605, + "memory(GiB)": 72.48, + "step": 6645, + "token_acc": 0.9428788338900699, + "train_speed(iter/s)": 0.080873 + }, + { + "epoch": 2.5766464020151623, + "grad_norm": 0.5692358613014221, + "learning_rate": 5.324761492221203e-07, + "loss": 0.17853889465332032, + "memory(GiB)": 72.48, + "step": 6650, + "token_acc": 0.9366494749269869, + "train_speed(iter/s)": 0.080877 + }, + { + "epoch": 2.5785840579359123, + "grad_norm": 0.5954664349555969, + "learning_rate": 5.276898563662936e-07, + "loss": 0.18170109987258912, + "memory(GiB)": 72.48, + "step": 6655, + "token_acc": 0.94287518277283, + "train_speed(iter/s)": 0.080882 + }, + { + "epoch": 2.580521713856662, + "grad_norm": 0.5591299533843994, + "learning_rate": 5.22923973480644e-07, + "loss": 0.1883463978767395, + "memory(GiB)": 72.48, + "step": 6660, + "token_acc": 0.9334826559783089, + "train_speed(iter/s)": 0.080882 + }, + { + "epoch": 2.582459369777412, + "grad_norm": 0.55705726146698, + "learning_rate": 5.181785223148999e-07, + "loss": 0.1746220827102661, + "memory(GiB)": 72.48, + "step": 6665, + "token_acc": 0.9413660832285724, + "train_speed(iter/s)": 0.080887 + }, + { + "epoch": 2.584397025698162, + "grad_norm": 0.5565838813781738, + "learning_rate": 5.134535245255439e-07, + "loss": 0.17614197731018066, + "memory(GiB)": 72.48, + "step": 6670, + "token_acc": 0.9389331402365436, + "train_speed(iter/s)": 0.080882 + }, + { + "epoch": 2.5863346816189114, + "grad_norm": 0.5887038707733154, + "learning_rate": 5.087490016757202e-07, + "loss": 0.18511323928833007, + "memory(GiB)": 72.48, + "step": 6675, + "token_acc": 0.9347578538415166, + "train_speed(iter/s)": 0.080886 + }, + { + "epoch": 2.5882723375396615, + "grad_norm": 0.559231698513031, + "learning_rate": 5.040649752351323e-07, + "loss": 0.1750473976135254, + "memory(GiB)": 72.48, + "step": 6680, + "token_acc": 0.944081865416297, + "train_speed(iter/s)": 0.080884 + }, + { + "epoch": 2.5902099934604115, + "grad_norm": 0.5665563344955444, + "learning_rate": 4.994014665799463e-07, + "loss": 0.17605587244033813, + "memory(GiB)": 72.48, + "step": 6685, + "token_acc": 0.9398994906157054, + "train_speed(iter/s)": 0.080886 + }, + { + "epoch": 2.592147649381161, + "grad_norm": 0.529827892780304, + "learning_rate": 4.947584969926894e-07, + "loss": 0.17327487468719482, + "memory(GiB)": 72.48, + "step": 6690, + "token_acc": 0.9405542557748706, + "train_speed(iter/s)": 0.080885 + }, + { + "epoch": 2.594085305301911, + "grad_norm": 0.5781688094139099, + "learning_rate": 4.901360876621597e-07, + "loss": 0.1757380723953247, + "memory(GiB)": 72.48, + "step": 6695, + "token_acc": 0.9384161130443952, + "train_speed(iter/s)": 0.080885 + }, + { + "epoch": 2.596022961222661, + "grad_norm": 0.5731996297836304, + "learning_rate": 4.855342596833241e-07, + "loss": 0.18313926458358765, + "memory(GiB)": 72.48, + "step": 6700, + "token_acc": 0.9378850621149379, + "train_speed(iter/s)": 0.080884 + }, + { + "epoch": 2.5979606171434106, + "grad_norm": 0.5984396934509277, + "learning_rate": 4.809530340572244e-07, + "loss": 0.17600476741790771, + "memory(GiB)": 72.48, + "step": 6705, + "token_acc": 0.9409620058846104, + "train_speed(iter/s)": 0.080885 + }, + { + "epoch": 2.5998982730641607, + "grad_norm": 0.5436455607414246, + "learning_rate": 4.7639243169088134e-07, + "loss": 0.18034532070159912, + "memory(GiB)": 72.48, + "step": 6710, + "token_acc": 0.9404441788666468, + "train_speed(iter/s)": 0.080884 + }, + { + "epoch": 2.6018359289849107, + "grad_norm": 0.584514856338501, + "learning_rate": 4.718524733971974e-07, + "loss": 0.17961220741271972, + "memory(GiB)": 72.48, + "step": 6715, + "token_acc": 0.9375684556407448, + "train_speed(iter/s)": 0.080884 + }, + { + "epoch": 2.6037735849056602, + "grad_norm": 0.5552295446395874, + "learning_rate": 4.6733317989486435e-07, + "loss": 0.181429922580719, + "memory(GiB)": 72.48, + "step": 6720, + "token_acc": 0.9331636472525827, + "train_speed(iter/s)": 0.080886 + }, + { + "epoch": 2.6057112408264103, + "grad_norm": 0.5627444386482239, + "learning_rate": 4.628345718082677e-07, + "loss": 0.1798389196395874, + "memory(GiB)": 72.48, + "step": 6725, + "token_acc": 0.9303203159280387, + "train_speed(iter/s)": 0.080889 + }, + { + "epoch": 2.6076488967471603, + "grad_norm": 0.5578245520591736, + "learning_rate": 4.583566696673908e-07, + "loss": 0.17884159088134766, + "memory(GiB)": 72.48, + "step": 6730, + "token_acc": 0.939159530833993, + "train_speed(iter/s)": 0.080891 + }, + { + "epoch": 2.60958655266791, + "grad_norm": 0.5763718485832214, + "learning_rate": 4.5389949390772293e-07, + "loss": 0.18293684720993042, + "memory(GiB)": 72.48, + "step": 6735, + "token_acc": 0.9430318635339556, + "train_speed(iter/s)": 0.080889 + }, + { + "epoch": 2.61152420858866, + "grad_norm": 0.5593513250350952, + "learning_rate": 4.494630648701681e-07, + "loss": 0.18049242496490478, + "memory(GiB)": 72.48, + "step": 6740, + "token_acc": 0.9235769828926905, + "train_speed(iter/s)": 0.080883 + }, + { + "epoch": 2.61346186450941, + "grad_norm": 0.5691155791282654, + "learning_rate": 4.4504740280094824e-07, + "loss": 0.17617813348770142, + "memory(GiB)": 72.48, + "step": 6745, + "token_acc": 0.937477328936521, + "train_speed(iter/s)": 0.080882 + }, + { + "epoch": 2.6153995204301594, + "grad_norm": 0.5468465089797974, + "learning_rate": 4.4065252785151113e-07, + "loss": 0.18133144378662108, + "memory(GiB)": 72.48, + "step": 6750, + "token_acc": 0.9441213328620546, + "train_speed(iter/s)": 0.080883 + }, + { + "epoch": 2.6173371763509095, + "grad_norm": 0.5726532936096191, + "learning_rate": 4.3627846007844257e-07, + "loss": 0.16658930778503417, + "memory(GiB)": 72.48, + "step": 6755, + "token_acc": 0.938378446669404, + "train_speed(iter/s)": 0.080886 + }, + { + "epoch": 2.6192748322716595, + "grad_norm": 0.5524346232414246, + "learning_rate": 4.31925219443371e-07, + "loss": 0.16769044399261473, + "memory(GiB)": 72.48, + "step": 6760, + "token_acc": 0.9425934919864012, + "train_speed(iter/s)": 0.080889 + }, + { + "epoch": 2.621212488192409, + "grad_norm": 0.573850691318512, + "learning_rate": 4.275928258128764e-07, + "loss": 0.18391640186309816, + "memory(GiB)": 72.48, + "step": 6765, + "token_acc": 0.9425073364677669, + "train_speed(iter/s)": 0.080892 + }, + { + "epoch": 2.623150144113159, + "grad_norm": 0.5574968457221985, + "learning_rate": 4.2328129895840233e-07, + "loss": 0.17317439317703248, + "memory(GiB)": 72.48, + "step": 6770, + "token_acc": 0.938301515004684, + "train_speed(iter/s)": 0.080889 + }, + { + "epoch": 2.625087800033909, + "grad_norm": 0.5664544701576233, + "learning_rate": 4.189906585561637e-07, + "loss": 0.1772672176361084, + "memory(GiB)": 72.48, + "step": 6775, + "token_acc": 0.934027002010788, + "train_speed(iter/s)": 0.08089 + }, + { + "epoch": 2.6270254559546586, + "grad_norm": 0.5962421894073486, + "learning_rate": 4.14720924187057e-07, + "loss": 0.17836084365844726, + "memory(GiB)": 72.48, + "step": 6780, + "token_acc": 0.9277760689018764, + "train_speed(iter/s)": 0.080886 + }, + { + "epoch": 2.6289631118754087, + "grad_norm": 0.6081134080886841, + "learning_rate": 4.1047211533657203e-07, + "loss": 0.18367011547088624, + "memory(GiB)": 72.48, + "step": 6785, + "token_acc": 0.9310902801026671, + "train_speed(iter/s)": 0.080892 + }, + { + "epoch": 2.6309007677961587, + "grad_norm": 0.5423617959022522, + "learning_rate": 4.062442513947007e-07, + "loss": 0.1798401355743408, + "memory(GiB)": 72.48, + "step": 6790, + "token_acc": 0.9351920360530033, + "train_speed(iter/s)": 0.080893 + }, + { + "epoch": 2.6328384237169082, + "grad_norm": 0.5457631945610046, + "learning_rate": 4.0203735165585067e-07, + "loss": 0.1814468502998352, + "memory(GiB)": 72.48, + "step": 6795, + "token_acc": 0.9389636855644272, + "train_speed(iter/s)": 0.080888 + }, + { + "epoch": 2.6347760796376583, + "grad_norm": 0.5330853462219238, + "learning_rate": 3.9785143531875845e-07, + "loss": 0.17347393035888672, + "memory(GiB)": 72.48, + "step": 6800, + "token_acc": 0.936892710837255, + "train_speed(iter/s)": 0.080891 + }, + { + "epoch": 2.6367137355584083, + "grad_norm": 0.5479428768157959, + "learning_rate": 3.9368652148639883e-07, + "loss": 0.17252148389816285, + "memory(GiB)": 72.48, + "step": 6805, + "token_acc": 0.9414675828675187, + "train_speed(iter/s)": 0.080897 + }, + { + "epoch": 2.638651391479158, + "grad_norm": 0.549821138381958, + "learning_rate": 3.8954262916589716e-07, + "loss": 0.17428038120269776, + "memory(GiB)": 72.48, + "step": 6810, + "token_acc": 0.9399053094352384, + "train_speed(iter/s)": 0.080896 + }, + { + "epoch": 2.640589047399908, + "grad_norm": 0.5740098357200623, + "learning_rate": 3.854197772684476e-07, + "loss": 0.17955343723297118, + "memory(GiB)": 72.48, + "step": 6815, + "token_acc": 0.9381714466803993, + "train_speed(iter/s)": 0.080897 + }, + { + "epoch": 2.642526703320658, + "grad_norm": 0.5651427507400513, + "learning_rate": 3.813179846092213e-07, + "loss": 0.18090170621871948, + "memory(GiB)": 72.48, + "step": 6820, + "token_acc": 0.9276593234834409, + "train_speed(iter/s)": 0.080898 + }, + { + "epoch": 2.6444643592414074, + "grad_norm": 0.5585702061653137, + "learning_rate": 3.7723726990728404e-07, + "loss": 0.1783726453781128, + "memory(GiB)": 72.48, + "step": 6825, + "token_acc": 0.9355342256468165, + "train_speed(iter/s)": 0.0809 + }, + { + "epoch": 2.6464020151621575, + "grad_norm": 0.5618979334831238, + "learning_rate": 3.7317765178550904e-07, + "loss": 0.17897303104400636, + "memory(GiB)": 72.48, + "step": 6830, + "token_acc": 0.9334919962181724, + "train_speed(iter/s)": 0.080902 + }, + { + "epoch": 2.6483396710829075, + "grad_norm": 0.5554330348968506, + "learning_rate": 3.6913914877049263e-07, + "loss": 0.17849816083908082, + "memory(GiB)": 72.48, + "step": 6835, + "token_acc": 0.9372031160934828, + "train_speed(iter/s)": 0.080903 + }, + { + "epoch": 2.6502773270036575, + "grad_norm": 0.5622228384017944, + "learning_rate": 3.6512177929246997e-07, + "loss": 0.1788189649581909, + "memory(GiB)": 72.48, + "step": 6840, + "token_acc": 0.9408510241506853, + "train_speed(iter/s)": 0.0809 + }, + { + "epoch": 2.652214982924407, + "grad_norm": 0.575397253036499, + "learning_rate": 3.6112556168522996e-07, + "loss": 0.17994626760482788, + "memory(GiB)": 72.48, + "step": 6845, + "token_acc": 0.9398202072693606, + "train_speed(iter/s)": 0.080898 + }, + { + "epoch": 2.654152638845157, + "grad_norm": 0.579742968082428, + "learning_rate": 3.5715051418603263e-07, + "loss": 0.17810392379760742, + "memory(GiB)": 72.48, + "step": 6850, + "token_acc": 0.938123884445032, + "train_speed(iter/s)": 0.080896 + }, + { + "epoch": 2.656090294765907, + "grad_norm": 0.623775064945221, + "learning_rate": 3.531966549355248e-07, + "loss": 0.17560627460479736, + "memory(GiB)": 72.48, + "step": 6855, + "token_acc": 0.9456114270941055, + "train_speed(iter/s)": 0.0809 + }, + { + "epoch": 2.658027950686657, + "grad_norm": 0.5556665062904358, + "learning_rate": 3.492640019776583e-07, + "loss": 0.18241939544677735, + "memory(GiB)": 72.48, + "step": 6860, + "token_acc": 0.9434278944250641, + "train_speed(iter/s)": 0.0809 + }, + { + "epoch": 2.6599656066074067, + "grad_norm": 0.6107257008552551, + "learning_rate": 3.4535257325960916e-07, + "loss": 0.18052737712860106, + "memory(GiB)": 72.48, + "step": 6865, + "token_acc": 0.9402803168799513, + "train_speed(iter/s)": 0.080902 + }, + { + "epoch": 2.6619032625281567, + "grad_norm": 0.5620692372322083, + "learning_rate": 3.414623866316891e-07, + "loss": 0.1779445767402649, + "memory(GiB)": 72.48, + "step": 6870, + "token_acc": 0.9431987134751343, + "train_speed(iter/s)": 0.080901 + }, + { + "epoch": 2.6638409184489067, + "grad_norm": 0.5509908199310303, + "learning_rate": 3.375934598472741e-07, + "loss": 0.1766197443008423, + "memory(GiB)": 72.48, + "step": 6875, + "token_acc": 0.9389340890392175, + "train_speed(iter/s)": 0.080901 + }, + { + "epoch": 2.6657785743696563, + "grad_norm": 0.5386033058166504, + "learning_rate": 3.337458105627145e-07, + "loss": 0.186127233505249, + "memory(GiB)": 72.48, + "step": 6880, + "token_acc": 0.9373184977322413, + "train_speed(iter/s)": 0.080904 + }, + { + "epoch": 2.6677162302904063, + "grad_norm": 0.5640909075737, + "learning_rate": 3.299194563372604e-07, + "loss": 0.17956843376159667, + "memory(GiB)": 72.48, + "step": 6885, + "token_acc": 0.9340066880502286, + "train_speed(iter/s)": 0.080901 + }, + { + "epoch": 2.6696538862111563, + "grad_norm": 0.591550350189209, + "learning_rate": 3.26114414632977e-07, + "loss": 0.18439195156097413, + "memory(GiB)": 72.48, + "step": 6890, + "token_acc": 0.9411013858497447, + "train_speed(iter/s)": 0.080902 + }, + { + "epoch": 2.671591542131906, + "grad_norm": 0.5314768552780151, + "learning_rate": 3.223307028146677e-07, + "loss": 0.1863186001777649, + "memory(GiB)": 72.48, + "step": 6895, + "token_acc": 0.9312440529175235, + "train_speed(iter/s)": 0.080907 + }, + { + "epoch": 2.673529198052656, + "grad_norm": 0.5642319917678833, + "learning_rate": 3.1856833814979617e-07, + "loss": 0.1659186601638794, + "memory(GiB)": 72.48, + "step": 6900, + "token_acc": 0.9432498411530615, + "train_speed(iter/s)": 0.080908 + }, + { + "epoch": 2.675466853973406, + "grad_norm": 0.5679530501365662, + "learning_rate": 3.1482733780840215e-07, + "loss": 0.18756985664367676, + "memory(GiB)": 72.48, + "step": 6905, + "token_acc": 0.9293795510621653, + "train_speed(iter/s)": 0.080909 + }, + { + "epoch": 2.6774045098941555, + "grad_norm": 0.5828523635864258, + "learning_rate": 3.111077188630296e-07, + "loss": 0.173581326007843, + "memory(GiB)": 72.48, + "step": 6910, + "token_acc": 0.9433222128533196, + "train_speed(iter/s)": 0.080909 + }, + { + "epoch": 2.6793421658149055, + "grad_norm": 0.5647770166397095, + "learning_rate": 3.074094982886433e-07, + "loss": 0.17866160869598388, + "memory(GiB)": 72.48, + "step": 6915, + "token_acc": 0.9379952000526022, + "train_speed(iter/s)": 0.080913 + }, + { + "epoch": 2.6812798217356555, + "grad_norm": 0.5703522562980652, + "learning_rate": 3.037326929625545e-07, + "loss": 0.185385799407959, + "memory(GiB)": 72.48, + "step": 6920, + "token_acc": 0.93567492240451, + "train_speed(iter/s)": 0.080915 + }, + { + "epoch": 2.683217477656405, + "grad_norm": 0.5823149085044861, + "learning_rate": 3.000773196643453e-07, + "loss": 0.17492568492889404, + "memory(GiB)": 72.48, + "step": 6925, + "token_acc": 0.9457313494666344, + "train_speed(iter/s)": 0.080915 + }, + { + "epoch": 2.685155133577155, + "grad_norm": 0.5752143263816833, + "learning_rate": 2.964433950757861e-07, + "loss": 0.17690329551696776, + "memory(GiB)": 72.48, + "step": 6930, + "token_acc": 0.9423112086304626, + "train_speed(iter/s)": 0.080919 + }, + { + "epoch": 2.687092789497905, + "grad_norm": 0.5538632869720459, + "learning_rate": 2.928309357807663e-07, + "loss": 0.18144471645355226, + "memory(GiB)": 72.48, + "step": 6935, + "token_acc": 0.9387541327048349, + "train_speed(iter/s)": 0.080911 + }, + { + "epoch": 2.6890304454186547, + "grad_norm": 0.6096916198730469, + "learning_rate": 2.8923995826521387e-07, + "loss": 0.17612223625183104, + "memory(GiB)": 72.48, + "step": 6940, + "token_acc": 0.9423413840570642, + "train_speed(iter/s)": 0.080909 + }, + { + "epoch": 2.6909681013394047, + "grad_norm": 0.5589750409126282, + "learning_rate": 2.8567047891702394e-07, + "loss": 0.18112629652023315, + "memory(GiB)": 72.48, + "step": 6945, + "token_acc": 0.936176827183138, + "train_speed(iter/s)": 0.08091 + }, + { + "epoch": 2.6929057572601547, + "grad_norm": 0.5414318442344666, + "learning_rate": 2.8212251402597977e-07, + "loss": 0.1684859037399292, + "memory(GiB)": 72.48, + "step": 6950, + "token_acc": 0.9418085690744301, + "train_speed(iter/s)": 0.08091 + }, + { + "epoch": 2.6948434131809043, + "grad_norm": 0.5703111886978149, + "learning_rate": 2.7859607978368175e-07, + "loss": 0.18538738489151002, + "memory(GiB)": 72.48, + "step": 6955, + "token_acc": 0.9352054537494527, + "train_speed(iter/s)": 0.080907 + }, + { + "epoch": 2.6967810691016543, + "grad_norm": 0.5782099962234497, + "learning_rate": 2.750911922834726e-07, + "loss": 0.1821625828742981, + "memory(GiB)": 72.48, + "step": 6960, + "token_acc": 0.9416863672182821, + "train_speed(iter/s)": 0.080905 + }, + { + "epoch": 2.6987187250224043, + "grad_norm": 0.5848382711410522, + "learning_rate": 2.7160786752036206e-07, + "loss": 0.1709179162979126, + "memory(GiB)": 72.48, + "step": 6965, + "token_acc": 0.939601422943044, + "train_speed(iter/s)": 0.080904 + }, + { + "epoch": 2.700656380943154, + "grad_norm": 0.5618194937705994, + "learning_rate": 2.6814612139095863e-07, + "loss": 0.17481892108917235, + "memory(GiB)": 72.48, + "step": 6970, + "token_acc": 0.9394167579408543, + "train_speed(iter/s)": 0.080908 + }, + { + "epoch": 2.702594036863904, + "grad_norm": 0.5825572609901428, + "learning_rate": 2.6470596969338957e-07, + "loss": 0.17760159969329833, + "memory(GiB)": 72.48, + "step": 6975, + "token_acc": 0.938193398164635, + "train_speed(iter/s)": 0.080905 + }, + { + "epoch": 2.704531692784654, + "grad_norm": 0.5450993180274963, + "learning_rate": 2.612874281272371e-07, + "loss": 0.1793541431427002, + "memory(GiB)": 72.48, + "step": 6980, + "token_acc": 0.9428062216461439, + "train_speed(iter/s)": 0.080904 + }, + { + "epoch": 2.7064693487054035, + "grad_norm": 0.5646445155143738, + "learning_rate": 2.5789051229346054e-07, + "loss": 0.1738759994506836, + "memory(GiB)": 72.48, + "step": 6985, + "token_acc": 0.9388832085490946, + "train_speed(iter/s)": 0.080909 + }, + { + "epoch": 2.7084070046261535, + "grad_norm": 0.5360503792762756, + "learning_rate": 2.5451523769432774e-07, + "loss": 0.17443559169769288, + "memory(GiB)": 72.48, + "step": 6990, + "token_acc": 0.9433324034035431, + "train_speed(iter/s)": 0.080908 + }, + { + "epoch": 2.7103446605469035, + "grad_norm": 0.5633649826049805, + "learning_rate": 2.5116161973334443e-07, + "loss": 0.17268643379211426, + "memory(GiB)": 72.48, + "step": 6995, + "token_acc": 0.9407816564436879, + "train_speed(iter/s)": 0.080908 + }, + { + "epoch": 2.712282316467653, + "grad_norm": 0.5596593022346497, + "learning_rate": 2.4782967371518363e-07, + "loss": 0.17048782110214233, + "memory(GiB)": 72.48, + "step": 7000, + "token_acc": 0.9404626696156375, + "train_speed(iter/s)": 0.080911 + }, + { + "epoch": 2.714219972388403, + "grad_norm": 0.5548799633979797, + "learning_rate": 2.445194148456148e-07, + "loss": 0.17602903842926027, + "memory(GiB)": 72.48, + "step": 7005, + "token_acc": 0.9416142492370928, + "train_speed(iter/s)": 0.080912 + }, + { + "epoch": 2.716157628309153, + "grad_norm": 0.5687563419342041, + "learning_rate": 2.4123085823143543e-07, + "loss": 0.18110830783843995, + "memory(GiB)": 72.48, + "step": 7010, + "token_acc": 0.9394876468864946, + "train_speed(iter/s)": 0.08091 + }, + { + "epoch": 2.7180952842299027, + "grad_norm": 0.5731750130653381, + "learning_rate": 2.3796401888040277e-07, + "loss": 0.1813871145248413, + "memory(GiB)": 72.48, + "step": 7015, + "token_acc": 0.9472184931731116, + "train_speed(iter/s)": 0.08091 + }, + { + "epoch": 2.7200329401506527, + "grad_norm": 0.5686662197113037, + "learning_rate": 2.3471891170116333e-07, + "loss": 0.18984251022338866, + "memory(GiB)": 72.48, + "step": 7020, + "token_acc": 0.935462094106528, + "train_speed(iter/s)": 0.080914 + }, + { + "epoch": 2.7219705960714027, + "grad_norm": 1.3723950386047363, + "learning_rate": 2.3149555150318748e-07, + "loss": 0.18166351318359375, + "memory(GiB)": 72.48, + "step": 7025, + "token_acc": 0.9368168127517006, + "train_speed(iter/s)": 0.080913 + }, + { + "epoch": 2.7239082519921523, + "grad_norm": 0.5821495652198792, + "learning_rate": 2.2829395299669878e-07, + "loss": 0.1856691598892212, + "memory(GiB)": 72.48, + "step": 7030, + "token_acc": 0.9391278994427886, + "train_speed(iter/s)": 0.080914 + }, + { + "epoch": 2.7258459079129023, + "grad_norm": 0.5817452073097229, + "learning_rate": 2.2511413079261024e-07, + "loss": 0.1843825101852417, + "memory(GiB)": 72.48, + "step": 7035, + "token_acc": 0.94321184930244, + "train_speed(iter/s)": 0.080913 + }, + { + "epoch": 2.7277835638336523, + "grad_norm": 0.5707274079322815, + "learning_rate": 2.2195609940245388e-07, + "loss": 0.18705679178237916, + "memory(GiB)": 72.48, + "step": 7040, + "token_acc": 0.9346490475373262, + "train_speed(iter/s)": 0.080914 + }, + { + "epoch": 2.729721219754402, + "grad_norm": 0.5833871960639954, + "learning_rate": 2.1881987323831734e-07, + "loss": 0.1628001570701599, + "memory(GiB)": 72.48, + "step": 7045, + "token_acc": 0.9400068138879425, + "train_speed(iter/s)": 0.080921 + }, + { + "epoch": 2.731658875675152, + "grad_norm": 0.5860168933868408, + "learning_rate": 2.1570546661277893e-07, + "loss": 0.18646140098571778, + "memory(GiB)": 72.48, + "step": 7050, + "token_acc": 0.9412682085144507, + "train_speed(iter/s)": 0.080915 + }, + { + "epoch": 2.733596531595902, + "grad_norm": 0.5721165537834167, + "learning_rate": 2.126128937388372e-07, + "loss": 0.18164710998535155, + "memory(GiB)": 72.48, + "step": 7055, + "token_acc": 0.9363445717051138, + "train_speed(iter/s)": 0.080912 + }, + { + "epoch": 2.735534187516652, + "grad_norm": 0.5800652503967285, + "learning_rate": 2.0954216872985267e-07, + "loss": 0.1815112590789795, + "memory(GiB)": 72.48, + "step": 7060, + "token_acc": 0.9349871443505778, + "train_speed(iter/s)": 0.080912 + }, + { + "epoch": 2.7374718434374015, + "grad_norm": 0.5445681214332581, + "learning_rate": 2.0649330559947888e-07, + "loss": 0.18451664447784424, + "memory(GiB)": 72.48, + "step": 7065, + "token_acc": 0.9358861279235986, + "train_speed(iter/s)": 0.080908 + }, + { + "epoch": 2.7394094993581515, + "grad_norm": 0.5537438988685608, + "learning_rate": 2.034663182615998e-07, + "loss": 0.17921509742736816, + "memory(GiB)": 72.48, + "step": 7070, + "token_acc": 0.9403338276591046, + "train_speed(iter/s)": 0.080905 + }, + { + "epoch": 2.7413471552789015, + "grad_norm": 0.5611521601676941, + "learning_rate": 2.0046122053026697e-07, + "loss": 0.18472495079040527, + "memory(GiB)": 72.48, + "step": 7075, + "token_acc": 0.9353880089458043, + "train_speed(iter/s)": 0.080908 + }, + { + "epoch": 2.743284811199651, + "grad_norm": 0.6002687215805054, + "learning_rate": 1.9747802611963573e-07, + "loss": 0.17965627908706666, + "memory(GiB)": 72.48, + "step": 7080, + "token_acc": 0.9414823121622999, + "train_speed(iter/s)": 0.08091 + }, + { + "epoch": 2.745222467120401, + "grad_norm": 0.7260130047798157, + "learning_rate": 1.9451674864390146e-07, + "loss": 0.18268961906433107, + "memory(GiB)": 72.48, + "step": 7085, + "token_acc": 0.9378769786595634, + "train_speed(iter/s)": 0.080907 + }, + { + "epoch": 2.747160123041151, + "grad_norm": 0.5571488738059998, + "learning_rate": 1.9157740161724114e-07, + "loss": 0.1752035140991211, + "memory(GiB)": 72.48, + "step": 7090, + "token_acc": 0.9346371590324241, + "train_speed(iter/s)": 0.080905 + }, + { + "epoch": 2.749097778961901, + "grad_norm": 0.54665207862854, + "learning_rate": 1.8865999845374794e-07, + "loss": 0.1749187707901001, + "memory(GiB)": 72.48, + "step": 7095, + "token_acc": 0.9437026673560631, + "train_speed(iter/s)": 0.080905 + }, + { + "epoch": 2.7510354348826507, + "grad_norm": 0.5501448512077332, + "learning_rate": 1.857645524673707e-07, + "loss": 0.16233822107315063, + "memory(GiB)": 72.48, + "step": 7100, + "token_acc": 0.9529371052859243, + "train_speed(iter/s)": 0.080907 + }, + { + "epoch": 2.7529730908034007, + "grad_norm": 0.5393111109733582, + "learning_rate": 1.8289107687185448e-07, + "loss": 0.18919681310653685, + "memory(GiB)": 72.48, + "step": 7105, + "token_acc": 0.9318746623446785, + "train_speed(iter/s)": 0.080907 + }, + { + "epoch": 2.7549107467241507, + "grad_norm": 0.5541280508041382, + "learning_rate": 1.800395847806802e-07, + "loss": 0.1748568296432495, + "memory(GiB)": 72.48, + "step": 7110, + "token_acc": 0.9377408026185686, + "train_speed(iter/s)": 0.080909 + }, + { + "epoch": 2.7568484026449003, + "grad_norm": 0.5741930603981018, + "learning_rate": 1.7721008920700277e-07, + "loss": 0.17860870361328124, + "memory(GiB)": 72.48, + "step": 7115, + "token_acc": 0.9323363982351555, + "train_speed(iter/s)": 0.08091 + }, + { + "epoch": 2.7587860585656503, + "grad_norm": 0.5515788197517395, + "learning_rate": 1.744026030635948e-07, + "loss": 0.1757514238357544, + "memory(GiB)": 72.48, + "step": 7120, + "token_acc": 0.9459343180840384, + "train_speed(iter/s)": 0.08091 + }, + { + "epoch": 2.7607237144864003, + "grad_norm": 0.5688087940216064, + "learning_rate": 1.7161713916278467e-07, + "loss": 0.17468688488006592, + "memory(GiB)": 72.48, + "step": 7125, + "token_acc": 0.9401110690395053, + "train_speed(iter/s)": 0.080908 + }, + { + "epoch": 2.76266137040715, + "grad_norm": 0.532461404800415, + "learning_rate": 1.6885371021640007e-07, + "loss": 0.17514437437057495, + "memory(GiB)": 72.48, + "step": 7130, + "token_acc": 0.9459091512817107, + "train_speed(iter/s)": 0.080906 + }, + { + "epoch": 2.7645990263279, + "grad_norm": 0.5280036330223083, + "learning_rate": 1.661123288357097e-07, + "loss": 0.17472249269485474, + "memory(GiB)": 72.48, + "step": 7135, + "token_acc": 0.9471332836932241, + "train_speed(iter/s)": 0.080908 + }, + { + "epoch": 2.76653668224865, + "grad_norm": 0.535383939743042, + "learning_rate": 1.633930075313639e-07, + "loss": 0.16930484771728516, + "memory(GiB)": 72.48, + "step": 7140, + "token_acc": 0.9404903371067609, + "train_speed(iter/s)": 0.08091 + }, + { + "epoch": 2.7684743381693995, + "grad_norm": 0.5797159671783447, + "learning_rate": 1.606957587133401e-07, + "loss": 0.18371963500976562, + "memory(GiB)": 72.48, + "step": 7145, + "token_acc": 0.9424690928086366, + "train_speed(iter/s)": 0.080907 + }, + { + "epoch": 2.7704119940901495, + "grad_norm": 0.5747814178466797, + "learning_rate": 1.5802059469088472e-07, + "loss": 0.1853799343109131, + "memory(GiB)": 72.48, + "step": 7150, + "token_acc": 0.9366994386378439, + "train_speed(iter/s)": 0.080908 + }, + { + "epoch": 2.7723496500108995, + "grad_norm": 0.5973690152168274, + "learning_rate": 1.553675276724581e-07, + "loss": 0.18086956739425658, + "memory(GiB)": 72.48, + "step": 7155, + "token_acc": 0.9434806576088719, + "train_speed(iter/s)": 0.080907 + }, + { + "epoch": 2.774287305931649, + "grad_norm": 0.6191303730010986, + "learning_rate": 1.527365697656763e-07, + "loss": 0.17998770475387574, + "memory(GiB)": 72.48, + "step": 7160, + "token_acc": 0.9484810014466653, + "train_speed(iter/s)": 0.080905 + }, + { + "epoch": 2.776224961852399, + "grad_norm": 0.5710081458091736, + "learning_rate": 1.5012773297725935e-07, + "loss": 0.1800989866256714, + "memory(GiB)": 72.48, + "step": 7165, + "token_acc": 0.9347461042450296, + "train_speed(iter/s)": 0.080905 + }, + { + "epoch": 2.778162617773149, + "grad_norm": 0.5566663146018982, + "learning_rate": 1.4754102921297363e-07, + "loss": 0.18220709562301635, + "memory(GiB)": 72.48, + "step": 7170, + "token_acc": 0.9431165210295468, + "train_speed(iter/s)": 0.080907 + }, + { + "epoch": 2.7801002736938987, + "grad_norm": 0.6151481866836548, + "learning_rate": 1.449764702775791e-07, + "loss": 0.17154158353805543, + "memory(GiB)": 72.48, + "step": 7175, + "token_acc": 0.9446883230904302, + "train_speed(iter/s)": 0.080903 + }, + { + "epoch": 2.7820379296146487, + "grad_norm": 0.5727818012237549, + "learning_rate": 1.4243406787477377e-07, + "loss": 0.17165830135345458, + "memory(GiB)": 72.48, + "step": 7180, + "token_acc": 0.9406975253237595, + "train_speed(iter/s)": 0.080903 + }, + { + "epoch": 2.7839755855353987, + "grad_norm": 0.5685797929763794, + "learning_rate": 1.3991383360714318e-07, + "loss": 0.17021799087524414, + "memory(GiB)": 72.48, + "step": 7185, + "token_acc": 0.9400856973995272, + "train_speed(iter/s)": 0.080904 + }, + { + "epoch": 2.7859132414561483, + "grad_norm": 0.5543227195739746, + "learning_rate": 1.3741577897610492e-07, + "loss": 0.17734270095825194, + "memory(GiB)": 72.48, + "step": 7190, + "token_acc": 0.9402452126651077, + "train_speed(iter/s)": 0.080905 + }, + { + "epoch": 2.7878508973768983, + "grad_norm": 0.5909552574157715, + "learning_rate": 1.3493991538185648e-07, + "loss": 0.180253267288208, + "memory(GiB)": 72.48, + "step": 7195, + "token_acc": 0.9438191560191497, + "train_speed(iter/s)": 0.080905 + }, + { + "epoch": 2.7897885532976483, + "grad_norm": 0.587429940700531, + "learning_rate": 1.3248625412332406e-07, + "loss": 0.1743743896484375, + "memory(GiB)": 72.48, + "step": 7200, + "token_acc": 0.9424674429571589, + "train_speed(iter/s)": 0.080905 + }, + { + "epoch": 2.791726209218398, + "grad_norm": 0.5480507612228394, + "learning_rate": 1.3005480639811053e-07, + "loss": 0.1738879919052124, + "memory(GiB)": 72.48, + "step": 7205, + "token_acc": 0.9413131514306579, + "train_speed(iter/s)": 0.080907 + }, + { + "epoch": 2.793663865139148, + "grad_norm": 0.5716990232467651, + "learning_rate": 1.2764558330244537e-07, + "loss": 0.1896296739578247, + "memory(GiB)": 72.48, + "step": 7210, + "token_acc": 0.9348259519529335, + "train_speed(iter/s)": 0.080907 + }, + { + "epoch": 2.795601521059898, + "grad_norm": 0.5980350971221924, + "learning_rate": 1.2525859583113087e-07, + "loss": 0.18182060718536378, + "memory(GiB)": 72.48, + "step": 7215, + "token_acc": 0.944327968261136, + "train_speed(iter/s)": 0.080909 + }, + { + "epoch": 2.7975391769806475, + "grad_norm": 0.538253128528595, + "learning_rate": 1.2289385487749605e-07, + "loss": 0.17818151712417601, + "memory(GiB)": 72.48, + "step": 7220, + "token_acc": 0.9330373658897522, + "train_speed(iter/s)": 0.080906 + }, + { + "epoch": 2.7994768329013975, + "grad_norm": 0.5648424029350281, + "learning_rate": 1.2055137123334448e-07, + "loss": 0.17778961658477782, + "memory(GiB)": 72.48, + "step": 7225, + "token_acc": 0.9421487603305785, + "train_speed(iter/s)": 0.080908 + }, + { + "epoch": 2.8014144888221475, + "grad_norm": 0.5715829730033875, + "learning_rate": 1.1823115558890542e-07, + "loss": 0.17621153593063354, + "memory(GiB)": 72.48, + "step": 7230, + "token_acc": 0.9278312608637518, + "train_speed(iter/s)": 0.080909 + }, + { + "epoch": 2.803352144742897, + "grad_norm": 0.5605965852737427, + "learning_rate": 1.1593321853278493e-07, + "loss": 0.16748225688934326, + "memory(GiB)": 72.48, + "step": 7235, + "token_acc": 0.9397157162539064, + "train_speed(iter/s)": 0.08091 + }, + { + "epoch": 2.805289800663647, + "grad_norm": 0.5902793407440186, + "learning_rate": 1.1365757055191883e-07, + "loss": 0.17629606723785402, + "memory(GiB)": 72.48, + "step": 7240, + "token_acc": 0.9356316976875353, + "train_speed(iter/s)": 0.080909 + }, + { + "epoch": 2.807227456584397, + "grad_norm": 0.5694670677185059, + "learning_rate": 1.1140422203152256e-07, + "loss": 0.1794296383857727, + "memory(GiB)": 72.48, + "step": 7245, + "token_acc": 0.9338077379235182, + "train_speed(iter/s)": 0.080906 + }, + { + "epoch": 2.8091651125051467, + "grad_norm": 0.5887570977210999, + "learning_rate": 1.0917318325504688e-07, + "loss": 0.17631058692932128, + "memory(GiB)": 72.48, + "step": 7250, + "token_acc": 0.9417443297591609, + "train_speed(iter/s)": 0.080906 + }, + { + "epoch": 2.8111027684258967, + "grad_norm": 0.5737782716751099, + "learning_rate": 1.0696446440412678e-07, + "loss": 0.17871012687683105, + "memory(GiB)": 72.48, + "step": 7255, + "token_acc": 0.9362381117886821, + "train_speed(iter/s)": 0.080912 + }, + { + "epoch": 2.8130404243466467, + "grad_norm": 0.5355263948440552, + "learning_rate": 1.0477807555853925e-07, + "loss": 0.17086371183395385, + "memory(GiB)": 72.48, + "step": 7260, + "token_acc": 0.9380856031128405, + "train_speed(iter/s)": 0.080909 + }, + { + "epoch": 2.8149780802673963, + "grad_norm": 0.6204429268836975, + "learning_rate": 1.0261402669615505e-07, + "loss": 0.17964229583740235, + "memory(GiB)": 72.48, + "step": 7265, + "token_acc": 0.939118504544675, + "train_speed(iter/s)": 0.080909 + }, + { + "epoch": 2.8169157361881463, + "grad_norm": 0.5910000801086426, + "learning_rate": 1.0047232769289206e-07, + "loss": 0.1701977014541626, + "memory(GiB)": 72.48, + "step": 7270, + "token_acc": 0.9451066297274661, + "train_speed(iter/s)": 0.080914 + }, + { + "epoch": 2.8188533921088963, + "grad_norm": 0.5505579710006714, + "learning_rate": 9.835298832267415e-08, + "loss": 0.17648329734802246, + "memory(GiB)": 72.48, + "step": 7275, + "token_acc": 0.9310598783221261, + "train_speed(iter/s)": 0.080913 + }, + { + "epoch": 2.820791048029646, + "grad_norm": 0.5610338449478149, + "learning_rate": 9.625601825738185e-08, + "loss": 0.1816624879837036, + "memory(GiB)": 72.48, + "step": 7280, + "token_acc": 0.9337944986458707, + "train_speed(iter/s)": 0.08091 + }, + { + "epoch": 2.822728703950396, + "grad_norm": 0.5854775309562683, + "learning_rate": 9.418142706681122e-08, + "loss": 0.1665945529937744, + "memory(GiB)": 72.48, + "step": 7285, + "token_acc": 0.9456264775413712, + "train_speed(iter/s)": 0.080911 + }, + { + "epoch": 2.824666359871146, + "grad_norm": 0.5666484832763672, + "learning_rate": 9.212922421863058e-08, + "loss": 0.17162128686904907, + "memory(GiB)": 72.48, + "step": 7290, + "token_acc": 0.9401981166687049, + "train_speed(iter/s)": 0.08091 + }, + { + "epoch": 2.826604015791896, + "grad_norm": 0.5913862586021423, + "learning_rate": 9.009941907833386e-08, + "loss": 0.17618238925933838, + "memory(GiB)": 72.48, + "step": 7295, + "token_acc": 0.9368721151235406, + "train_speed(iter/s)": 0.08091 + }, + { + "epoch": 2.8285416717126455, + "grad_norm": 0.5444965362548828, + "learning_rate": 8.809202090920178e-08, + "loss": 0.17910236120224, + "memory(GiB)": 72.48, + "step": 7300, + "token_acc": 0.944204860762879, + "train_speed(iter/s)": 0.080913 + }, + { + "epoch": 2.8304793276333955, + "grad_norm": 0.523826003074646, + "learning_rate": 8.610703887225735e-08, + "loss": 0.17348699569702147, + "memory(GiB)": 72.48, + "step": 7305, + "token_acc": 0.9402267485300329, + "train_speed(iter/s)": 0.08091 + }, + { + "epoch": 2.8324169835541455, + "grad_norm": 0.5677260160446167, + "learning_rate": 8.414448202622494e-08, + "loss": 0.1860441207885742, + "memory(GiB)": 72.48, + "step": 7310, + "token_acc": 0.9366769209498462, + "train_speed(iter/s)": 0.080913 + }, + { + "epoch": 2.8343546394748955, + "grad_norm": 0.556863009929657, + "learning_rate": 8.22043593274885e-08, + "loss": 0.17637474536895753, + "memory(GiB)": 72.48, + "step": 7315, + "token_acc": 0.940773748044973, + "train_speed(iter/s)": 0.080913 + }, + { + "epoch": 2.836292295395645, + "grad_norm": 0.6017458438873291, + "learning_rate": 8.02866796300511e-08, + "loss": 0.17668163776397705, + "memory(GiB)": 72.48, + "step": 7320, + "token_acc": 0.9348431345918783, + "train_speed(iter/s)": 0.080911 + }, + { + "epoch": 2.838229951316395, + "grad_norm": 0.5390684008598328, + "learning_rate": 7.839145168549333e-08, + "loss": 0.18287774324417114, + "memory(GiB)": 72.48, + "step": 7325, + "token_acc": 0.9365806018433742, + "train_speed(iter/s)": 0.08091 + }, + { + "epoch": 2.840167607237145, + "grad_norm": 0.5609269142150879, + "learning_rate": 7.651868414293495e-08, + "loss": 0.17237355709075927, + "memory(GiB)": 72.48, + "step": 7330, + "token_acc": 0.9359338981528605, + "train_speed(iter/s)": 0.080909 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.5909106731414795, + "learning_rate": 7.466838554899547e-08, + "loss": 0.17883057594299318, + "memory(GiB)": 72.48, + "step": 7335, + "token_acc": 0.9382247853068388, + "train_speed(iter/s)": 0.080912 + }, + { + "epoch": 2.8440429190786447, + "grad_norm": 0.5627955198287964, + "learning_rate": 7.284056434775367e-08, + "loss": 0.17918133735656738, + "memory(GiB)": 72.48, + "step": 7340, + "token_acc": 0.9392019891072697, + "train_speed(iter/s)": 0.080915 + }, + { + "epoch": 2.8459805749993947, + "grad_norm": 0.5700935125350952, + "learning_rate": 7.103522888070868e-08, + "loss": 0.18129817247390748, + "memory(GiB)": 72.48, + "step": 7345, + "token_acc": 0.9376185879298292, + "train_speed(iter/s)": 0.080912 + }, + { + "epoch": 2.8479182309201443, + "grad_norm": 0.5400469303131104, + "learning_rate": 6.925238738674511e-08, + "loss": 0.17303977012634278, + "memory(GiB)": 72.48, + "step": 7350, + "token_acc": 0.9393620750087627, + "train_speed(iter/s)": 0.080916 + }, + { + "epoch": 2.8498558868408943, + "grad_norm": 0.5996050238609314, + "learning_rate": 6.74920480020913e-08, + "loss": 0.1782202124595642, + "memory(GiB)": 72.48, + "step": 7355, + "token_acc": 0.9333392377409736, + "train_speed(iter/s)": 0.080915 + }, + { + "epoch": 2.8517935427616443, + "grad_norm": 0.5571048855781555, + "learning_rate": 6.575421876028721e-08, + "loss": 0.18081109523773192, + "memory(GiB)": 72.48, + "step": 7360, + "token_acc": 0.944696919067032, + "train_speed(iter/s)": 0.080916 + }, + { + "epoch": 2.853731198682394, + "grad_norm": 0.581684410572052, + "learning_rate": 6.40389075921416e-08, + "loss": 0.17170381546020508, + "memory(GiB)": 72.48, + "step": 7365, + "token_acc": 0.9400785854616895, + "train_speed(iter/s)": 0.080919 + }, + { + "epoch": 2.855668854603144, + "grad_norm": 0.555248498916626, + "learning_rate": 6.234612232570103e-08, + "loss": 0.17882840633392333, + "memory(GiB)": 72.48, + "step": 7370, + "token_acc": 0.93646096555393, + "train_speed(iter/s)": 0.080914 + }, + { + "epoch": 2.857606510523894, + "grad_norm": 0.5772862434387207, + "learning_rate": 6.067587068621205e-08, + "loss": 0.17019734382629395, + "memory(GiB)": 72.48, + "step": 7375, + "token_acc": 0.9507737160798385, + "train_speed(iter/s)": 0.080916 + }, + { + "epoch": 2.8595441664446435, + "grad_norm": 0.5468375086784363, + "learning_rate": 5.902816029608516e-08, + "loss": 0.17486650943756105, + "memory(GiB)": 72.48, + "step": 7380, + "token_acc": 0.9327562219817591, + "train_speed(iter/s)": 0.080917 + }, + { + "epoch": 2.8614818223653935, + "grad_norm": 0.5444268584251404, + "learning_rate": 5.740299867486143e-08, + "loss": 0.1715664267539978, + "memory(GiB)": 72.48, + "step": 7385, + "token_acc": 0.9419283134203945, + "train_speed(iter/s)": 0.080921 + }, + { + "epoch": 2.8634194782861435, + "grad_norm": 0.5390260815620422, + "learning_rate": 5.580039323917819e-08, + "loss": 0.16665502786636352, + "memory(GiB)": 72.48, + "step": 7390, + "token_acc": 0.9419317681336179, + "train_speed(iter/s)": 0.080918 + }, + { + "epoch": 2.865357134206893, + "grad_norm": 0.5891969799995422, + "learning_rate": 5.42203513027334e-08, + "loss": 0.17798895835876466, + "memory(GiB)": 72.48, + "step": 7395, + "token_acc": 0.9460712379935966, + "train_speed(iter/s)": 0.080917 + }, + { + "epoch": 2.867294790127643, + "grad_norm": 0.5550128221511841, + "learning_rate": 5.266288007625575e-08, + "loss": 0.1693821668624878, + "memory(GiB)": 72.48, + "step": 7400, + "token_acc": 0.9406276505513147, + "train_speed(iter/s)": 0.080917 + }, + { + "epoch": 2.869232446048393, + "grad_norm": 0.5216034054756165, + "learning_rate": 5.112798666746688e-08, + "loss": 0.176193904876709, + "memory(GiB)": 72.48, + "step": 7405, + "token_acc": 0.9479760597797217, + "train_speed(iter/s)": 0.080914 + }, + { + "epoch": 2.8711701019691427, + "grad_norm": 0.552652895450592, + "learning_rate": 4.9615678081053055e-08, + "loss": 0.17783741950988768, + "memory(GiB)": 72.48, + "step": 7410, + "token_acc": 0.9402217679489562, + "train_speed(iter/s)": 0.080915 + }, + { + "epoch": 2.8731077578898927, + "grad_norm": 0.5211761593818665, + "learning_rate": 4.8125961218632446e-08, + "loss": 0.17638933658599854, + "memory(GiB)": 72.48, + "step": 7415, + "token_acc": 0.9420168067226891, + "train_speed(iter/s)": 0.080914 + }, + { + "epoch": 2.8750454138106427, + "grad_norm": 0.5861157178878784, + "learning_rate": 4.665884287872069e-08, + "loss": 0.17872381210327148, + "memory(GiB)": 72.48, + "step": 7420, + "token_acc": 0.9381103104586167, + "train_speed(iter/s)": 0.080911 + }, + { + "epoch": 2.8769830697313923, + "grad_norm": 0.5576848983764648, + "learning_rate": 4.521432975670481e-08, + "loss": 0.17486473321914672, + "memory(GiB)": 72.48, + "step": 7425, + "token_acc": 0.9473684210526315, + "train_speed(iter/s)": 0.080913 + }, + { + "epoch": 2.8789207256521423, + "grad_norm": 0.5525920391082764, + "learning_rate": 4.3792428444808245e-08, + "loss": 0.17528605461120605, + "memory(GiB)": 72.48, + "step": 7430, + "token_acc": 0.9383393355996096, + "train_speed(iter/s)": 0.080914 + }, + { + "epoch": 2.8808583815728923, + "grad_norm": 0.558647632598877, + "learning_rate": 4.2393145432062524e-08, + "loss": 0.17458267211914064, + "memory(GiB)": 72.48, + "step": 7435, + "token_acc": 0.9422948409516172, + "train_speed(iter/s)": 0.080919 + }, + { + "epoch": 2.882796037493642, + "grad_norm": 0.5761637091636658, + "learning_rate": 4.101648710427841e-08, + "loss": 0.17908756732940673, + "memory(GiB)": 72.48, + "step": 7440, + "token_acc": 0.9369502550077611, + "train_speed(iter/s)": 0.080923 + }, + { + "epoch": 2.884733693414392, + "grad_norm": 0.5622467994689941, + "learning_rate": 3.9662459744015945e-08, + "loss": 0.1826641082763672, + "memory(GiB)": 72.48, + "step": 7445, + "token_acc": 0.939357744940791, + "train_speed(iter/s)": 0.080922 + }, + { + "epoch": 2.886671349335142, + "grad_norm": 0.56577068567276, + "learning_rate": 3.833106953055443e-08, + "loss": 0.17491524219512938, + "memory(GiB)": 72.48, + "step": 7450, + "token_acc": 0.936007640878701, + "train_speed(iter/s)": 0.080923 + }, + { + "epoch": 2.8886090052558915, + "grad_norm": 0.5660979747772217, + "learning_rate": 3.702232253986804e-08, + "loss": 0.1770297646522522, + "memory(GiB)": 72.48, + "step": 7455, + "token_acc": 0.9419789820207648, + "train_speed(iter/s)": 0.080918 + }, + { + "epoch": 2.8905466611766415, + "grad_norm": 0.6129657626152039, + "learning_rate": 3.573622474459304e-08, + "loss": 0.17652267217636108, + "memory(GiB)": 72.48, + "step": 7460, + "token_acc": 0.9396475302294245, + "train_speed(iter/s)": 0.080917 + }, + { + "epoch": 2.8924843170973915, + "grad_norm": 0.5563213229179382, + "learning_rate": 3.44727820140045e-08, + "loss": 0.1800151586532593, + "memory(GiB)": 72.48, + "step": 7465, + "token_acc": 0.940325372101073, + "train_speed(iter/s)": 0.08092 + }, + { + "epoch": 2.894421973018141, + "grad_norm": 0.5570213198661804, + "learning_rate": 3.323200011398853e-08, + "loss": 0.1769587755203247, + "memory(GiB)": 72.48, + "step": 7470, + "token_acc": 0.9385570317676186, + "train_speed(iter/s)": 0.080922 + }, + { + "epoch": 2.896359628938891, + "grad_norm": 0.5758700370788574, + "learning_rate": 3.2013884707015053e-08, + "loss": 0.17842193841934204, + "memory(GiB)": 72.48, + "step": 7475, + "token_acc": 0.9400053407708513, + "train_speed(iter/s)": 0.080924 + }, + { + "epoch": 2.898297284859641, + "grad_norm": 0.5742579102516174, + "learning_rate": 3.081844135211176e-08, + "loss": 0.17959511280059814, + "memory(GiB)": 72.48, + "step": 7480, + "token_acc": 0.9387418855241045, + "train_speed(iter/s)": 0.080933 + }, + { + "epoch": 2.9002349407803907, + "grad_norm": 0.5965311527252197, + "learning_rate": 2.964567550484021e-08, + "loss": 0.17918524742126465, + "memory(GiB)": 72.48, + "step": 7485, + "token_acc": 0.938234600747651, + "train_speed(iter/s)": 0.080932 + }, + { + "epoch": 2.9021725967011407, + "grad_norm": 0.5796821713447571, + "learning_rate": 2.8495592517270853e-08, + "loss": 0.1829594373703003, + "memory(GiB)": 72.48, + "step": 7490, + "token_acc": 0.9416605091690737, + "train_speed(iter/s)": 0.08093 + }, + { + "epoch": 2.9041102526218907, + "grad_norm": 0.5433580279350281, + "learning_rate": 2.736819763795695e-08, + "loss": 0.17868415117263795, + "memory(GiB)": 72.48, + "step": 7495, + "token_acc": 0.9428167520763385, + "train_speed(iter/s)": 0.080928 + }, + { + "epoch": 2.9060479085426403, + "grad_norm": 0.5626754760742188, + "learning_rate": 2.6263496011911805e-08, + "loss": 0.17260587215423584, + "memory(GiB)": 72.48, + "step": 7500, + "token_acc": 0.9441806517090144, + "train_speed(iter/s)": 0.080926 + }, + { + "epoch": 2.9079855644633903, + "grad_norm": 0.6491982936859131, + "learning_rate": 2.518149268058545e-08, + "loss": 0.17835769653320313, + "memory(GiB)": 72.48, + "step": 7505, + "token_acc": 0.9360804983229516, + "train_speed(iter/s)": 0.080924 + }, + { + "epoch": 2.9099232203841403, + "grad_norm": 0.5484529137611389, + "learning_rate": 2.4122192581840786e-08, + "loss": 0.17280452251434325, + "memory(GiB)": 72.48, + "step": 7510, + "token_acc": 0.9375745131805537, + "train_speed(iter/s)": 0.08092 + }, + { + "epoch": 2.9118608763048903, + "grad_norm": 0.5870897173881531, + "learning_rate": 2.3085600549932476e-08, + "loss": 0.18518280982971191, + "memory(GiB)": 72.48, + "step": 7515, + "token_acc": 0.9352595330085671, + "train_speed(iter/s)": 0.080924 + }, + { + "epoch": 2.91379853222564, + "grad_norm": 0.569111704826355, + "learning_rate": 2.2071721315483074e-08, + "loss": 0.18087258338928222, + "memory(GiB)": 72.48, + "step": 7520, + "token_acc": 0.9298930464311784, + "train_speed(iter/s)": 0.080926 + }, + { + "epoch": 2.91573618814639, + "grad_norm": 0.5556108951568604, + "learning_rate": 2.1080559505462504e-08, + "loss": 0.17546112537384034, + "memory(GiB)": 72.48, + "step": 7525, + "token_acc": 0.9410911406736484, + "train_speed(iter/s)": 0.080931 + }, + { + "epoch": 2.91767384406714, + "grad_norm": 0.5702312588691711, + "learning_rate": 2.011211964316695e-08, + "loss": 0.17208282947540282, + "memory(GiB)": 72.48, + "step": 7530, + "token_acc": 0.9407693723541322, + "train_speed(iter/s)": 0.080931 + }, + { + "epoch": 2.9196114999878895, + "grad_norm": 0.5893260836601257, + "learning_rate": 1.916640614819776e-08, + "loss": 0.17540626525878905, + "memory(GiB)": 72.48, + "step": 7535, + "token_acc": 0.945633251124329, + "train_speed(iter/s)": 0.08093 + }, + { + "epoch": 2.9215491559086395, + "grad_norm": 0.5966506600379944, + "learning_rate": 1.8243423336442022e-08, + "loss": 0.18844203948974608, + "memory(GiB)": 72.48, + "step": 7540, + "token_acc": 0.9377620422994503, + "train_speed(iter/s)": 0.080932 + }, + { + "epoch": 2.9234868118293895, + "grad_norm": 0.5934481620788574, + "learning_rate": 1.7343175420051485e-08, + "loss": 0.16864123344421386, + "memory(GiB)": 72.48, + "step": 7545, + "token_acc": 0.948881469115192, + "train_speed(iter/s)": 0.080938 + }, + { + "epoch": 2.9254244677501395, + "grad_norm": 0.5466935038566589, + "learning_rate": 1.6465666507425314e-08, + "loss": 0.17849595546722413, + "memory(GiB)": 72.48, + "step": 7550, + "token_acc": 0.9363413561239239, + "train_speed(iter/s)": 0.080939 + }, + { + "epoch": 2.927362123670889, + "grad_norm": 0.5641075968742371, + "learning_rate": 1.5610900603189593e-08, + "loss": 0.17939648628234864, + "memory(GiB)": 72.48, + "step": 7555, + "token_acc": 0.9402934467665373, + "train_speed(iter/s)": 0.080942 + }, + { + "epoch": 2.929299779591639, + "grad_norm": 0.5786232948303223, + "learning_rate": 1.4778881608180085e-08, + "loss": 0.1795699715614319, + "memory(GiB)": 72.48, + "step": 7560, + "token_acc": 0.933832010905659, + "train_speed(iter/s)": 0.08094 + }, + { + "epoch": 2.931237435512389, + "grad_norm": 0.5693191289901733, + "learning_rate": 1.3969613319423924e-08, + "loss": 0.18208067417144774, + "memory(GiB)": 72.48, + "step": 7565, + "token_acc": 0.9298536732538845, + "train_speed(iter/s)": 0.080937 + }, + { + "epoch": 2.9331750914331387, + "grad_norm": 0.5985451340675354, + "learning_rate": 1.3183099430122414e-08, + "loss": 0.18023189306259155, + "memory(GiB)": 72.48, + "step": 7570, + "token_acc": 0.9351883803369201, + "train_speed(iter/s)": 0.080937 + }, + { + "epoch": 2.9351127473538887, + "grad_norm": 0.5915820598602295, + "learning_rate": 1.2419343529633809e-08, + "loss": 0.17908819913864135, + "memory(GiB)": 72.48, + "step": 7575, + "token_acc": 0.9367020741882518, + "train_speed(iter/s)": 0.08094 + }, + { + "epoch": 2.9370504032746387, + "grad_norm": 0.5795514583587646, + "learning_rate": 1.167834910345833e-08, + "loss": 0.17875158786773682, + "memory(GiB)": 72.48, + "step": 7580, + "token_acc": 0.9413019342020686, + "train_speed(iter/s)": 0.080939 + }, + { + "epoch": 2.9389880591953883, + "grad_norm": 0.5513561367988586, + "learning_rate": 1.0960119533220404e-08, + "loss": 0.17047605514526368, + "memory(GiB)": 72.48, + "step": 7585, + "token_acc": 0.9344127422911114, + "train_speed(iter/s)": 0.080935 + }, + { + "epoch": 2.9409257151161383, + "grad_norm": 0.5630007982254028, + "learning_rate": 1.0264658096653669e-08, + "loss": 0.1801111578941345, + "memory(GiB)": 72.48, + "step": 7590, + "token_acc": 0.9363969165748981, + "train_speed(iter/s)": 0.080935 + }, + { + "epoch": 2.9428633710368883, + "grad_norm": 0.5613774657249451, + "learning_rate": 9.591967967588212e-09, + "loss": 0.17825416326522828, + "memory(GiB)": 72.48, + "step": 7595, + "token_acc": 0.9386145606497558, + "train_speed(iter/s)": 0.080936 + }, + { + "epoch": 2.944801026957638, + "grad_norm": 0.5781858563423157, + "learning_rate": 8.942052215931695e-09, + "loss": 0.18551340103149414, + "memory(GiB)": 72.48, + "step": 7600, + "token_acc": 0.9380983393031586, + "train_speed(iter/s)": 0.080936 + }, + { + "epoch": 2.946738682878388, + "grad_norm": 0.5731537938117981, + "learning_rate": 8.314913807659918e-09, + "loss": 0.16966052055358888, + "memory(GiB)": 72.48, + "step": 7605, + "token_acc": 0.9390907609581228, + "train_speed(iter/s)": 0.08094 + }, + { + "epoch": 2.948676338799138, + "grad_norm": 0.5632595419883728, + "learning_rate": 7.710555604801273e-09, + "loss": 0.18501553535461426, + "memory(GiB)": 72.48, + "step": 7610, + "token_acc": 0.9322401892478693, + "train_speed(iter/s)": 0.080935 + }, + { + "epoch": 2.9506139947198875, + "grad_norm": 0.5846831798553467, + "learning_rate": 7.128980365422866e-09, + "loss": 0.1800399661064148, + "memory(GiB)": 72.48, + "step": 7615, + "token_acc": 0.9379122666903305, + "train_speed(iter/s)": 0.080938 + }, + { + "epoch": 2.9525516506406375, + "grad_norm": 0.569674551486969, + "learning_rate": 6.570190743618865e-09, + "loss": 0.17378931045532225, + "memory(GiB)": 72.48, + "step": 7620, + "token_acc": 0.9357696937274308, + "train_speed(iter/s)": 0.080939 + }, + { + "epoch": 2.9544893065613875, + "grad_norm": 0.5462942719459534, + "learning_rate": 6.0341892894982825e-09, + "loss": 0.180399489402771, + "memory(GiB)": 72.48, + "step": 7625, + "token_acc": 0.9430610889774237, + "train_speed(iter/s)": 0.08094 + }, + { + "epoch": 2.956426962482137, + "grad_norm": 0.5479962229728699, + "learning_rate": 5.52097844917443e-09, + "loss": 0.1843660831451416, + "memory(GiB)": 72.48, + "step": 7630, + "token_acc": 0.9352336893666235, + "train_speed(iter/s)": 0.080936 + }, + { + "epoch": 2.958364618402887, + "grad_norm": 0.6159018278121948, + "learning_rate": 5.030560564751042e-09, + "loss": 0.17185438871383668, + "memory(GiB)": 72.48, + "step": 7635, + "token_acc": 0.9426850041006877, + "train_speed(iter/s)": 0.080942 + }, + { + "epoch": 2.960302274323637, + "grad_norm": 0.599929928779602, + "learning_rate": 4.5629378743150544e-09, + "loss": 0.1817415952682495, + "memory(GiB)": 72.48, + "step": 7640, + "token_acc": 0.9399086010504059, + "train_speed(iter/s)": 0.080942 + }, + { + "epoch": 2.9622399302443867, + "grad_norm": 0.5604813694953918, + "learning_rate": 4.1181125119221785e-09, + "loss": 0.17877774238586425, + "memory(GiB)": 72.48, + "step": 7645, + "token_acc": 0.9378071957533259, + "train_speed(iter/s)": 0.080938 + }, + { + "epoch": 2.9641775861651367, + "grad_norm": 0.5399024486541748, + "learning_rate": 3.696086507592456e-09, + "loss": 0.1777539372444153, + "memory(GiB)": 72.48, + "step": 7650, + "token_acc": 0.942624539639465, + "train_speed(iter/s)": 0.080939 + }, + { + "epoch": 2.9661152420858867, + "grad_norm": 0.528237521648407, + "learning_rate": 3.296861787295269e-09, + "loss": 0.1777519941329956, + "memory(GiB)": 72.48, + "step": 7655, + "token_acc": 0.934471078686816, + "train_speed(iter/s)": 0.080938 + }, + { + "epoch": 2.9680528980066363, + "grad_norm": 0.5477710366249084, + "learning_rate": 2.920440172944905e-09, + "loss": 0.169677734375, + "memory(GiB)": 72.48, + "step": 7660, + "token_acc": 0.9360831656606304, + "train_speed(iter/s)": 0.080937 + }, + { + "epoch": 2.9699905539273863, + "grad_norm": 0.5542213320732117, + "learning_rate": 2.5668233823911147e-09, + "loss": 0.17264974117279053, + "memory(GiB)": 72.48, + "step": 7665, + "token_acc": 0.9443621747408504, + "train_speed(iter/s)": 0.080943 + }, + { + "epoch": 2.9719282098481363, + "grad_norm": 0.5807104706764221, + "learning_rate": 2.236013029409678e-09, + "loss": 0.18212735652923584, + "memory(GiB)": 72.48, + "step": 7670, + "token_acc": 0.929225645295587, + "train_speed(iter/s)": 0.08094 + }, + { + "epoch": 2.973865865768886, + "grad_norm": 0.5461880564689636, + "learning_rate": 1.9280106236968523e-09, + "loss": 0.17995672225952147, + "memory(GiB)": 72.48, + "step": 7675, + "token_acc": 0.9400897334389021, + "train_speed(iter/s)": 0.080943 + }, + { + "epoch": 2.975803521689636, + "grad_norm": 0.5512206554412842, + "learning_rate": 1.642817570861599e-09, + "loss": 0.17052674293518066, + "memory(GiB)": 72.48, + "step": 7680, + "token_acc": 0.9422564949967869, + "train_speed(iter/s)": 0.08094 + }, + { + "epoch": 2.977741177610386, + "grad_norm": 0.5648573637008667, + "learning_rate": 1.380435172420036e-09, + "loss": 0.18253281116485595, + "memory(GiB)": 72.48, + "step": 7685, + "token_acc": 0.9338983584864048, + "train_speed(iter/s)": 0.080941 + }, + { + "epoch": 2.9796788335311355, + "grad_norm": 0.5563445091247559, + "learning_rate": 1.1408646257882183e-09, + "loss": 0.1693689703941345, + "memory(GiB)": 72.48, + "step": 7690, + "token_acc": 0.943661542215715, + "train_speed(iter/s)": 0.080939 + }, + { + "epoch": 2.9816164894518855, + "grad_norm": 0.5370368957519531, + "learning_rate": 9.241070242771433e-10, + "loss": 0.16747430562973023, + "memory(GiB)": 72.48, + "step": 7695, + "token_acc": 0.9449776269094275, + "train_speed(iter/s)": 0.080939 + }, + { + "epoch": 2.9835541453726355, + "grad_norm": 0.5579628348350525, + "learning_rate": 7.301633570888645e-10, + "loss": 0.17679812908172607, + "memory(GiB)": 72.48, + "step": 7700, + "token_acc": 0.9357514362443485, + "train_speed(iter/s)": 0.080939 + }, + { + "epoch": 2.985491801293385, + "grad_norm": 0.5460850596427917, + "learning_rate": 5.590345093109406e-10, + "loss": 0.16887528896331788, + "memory(GiB)": 72.48, + "step": 7705, + "token_acc": 0.9432771576566717, + "train_speed(iter/s)": 0.080941 + }, + { + "epoch": 2.987429457214135, + "grad_norm": 0.5764380097389221, + "learning_rate": 4.107212619108847e-10, + "loss": 0.18432481288909913, + "memory(GiB)": 72.48, + "step": 7710, + "token_acc": 0.9365053901437371, + "train_speed(iter/s)": 0.08094 + }, + { + "epoch": 2.989367113134885, + "grad_norm": 0.5813378691673279, + "learning_rate": 2.8522429173671875e-10, + "loss": 0.18421580791473388, + "memory(GiB)": 72.48, + "step": 7715, + "token_acc": 0.9358071332522251, + "train_speed(iter/s)": 0.080944 + }, + { + "epoch": 2.9913047690556347, + "grad_norm": 0.6146759390830994, + "learning_rate": 1.8254417150975756e-10, + "loss": 0.17415223121643067, + "memory(GiB)": 72.48, + "step": 7720, + "token_acc": 0.9437247617509288, + "train_speed(iter/s)": 0.080942 + }, + { + "epoch": 2.9932424249763847, + "grad_norm": 0.5900871753692627, + "learning_rate": 1.0268136982405363e-10, + "loss": 0.18007500171661378, + "memory(GiB)": 72.48, + "step": 7725, + "token_acc": 0.9351921381697085, + "train_speed(iter/s)": 0.080945 + }, + { + "epoch": 2.9951800808971347, + "grad_norm": 0.5420626401901245, + "learning_rate": 4.563625114417658e-11, + "loss": 0.1813086152076721, + "memory(GiB)": 72.48, + "step": 7730, + "token_acc": 0.9420055078567957, + "train_speed(iter/s)": 0.080946 + }, + { + "epoch": 2.9971177368178843, + "grad_norm": 0.5917452573776245, + "learning_rate": 1.1409075802992775e-11, + "loss": 0.17294812202453613, + "memory(GiB)": 72.48, + "step": 7735, + "token_acc": 0.9346650914688719, + "train_speed(iter/s)": 0.080948 + }, + { + "epoch": 2.9990553927386343, + "grad_norm": 0.5623822212219238, + "learning_rate": 0.0, + "loss": 0.17589629888534547, + "memory(GiB)": 72.48, + "step": 7740, + "token_acc": 0.9381395076887096, + "train_speed(iter/s)": 0.080953 + }, + { + "epoch": 2.9990553927386343, + "eval_loss": 0.2572384476661682, + "eval_runtime": 105.2178, + "eval_samples_per_second": 31.687, + "eval_steps_per_second": 3.963, + "eval_token_acc": 0.908205277166483, + "step": 7740 + }, + { + "epoch": 2.9990553927386343, + "eval_loss": 0.2572384476661682, + "eval_runtime": 105.452, + "eval_samples_per_second": 31.616, + "eval_steps_per_second": 3.954, + "eval_token_acc": 0.908205277166483, + "step": 7740 + } + ], + "logging_steps": 5, + "max_steps": 7740, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.690206829043148e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}