diff --git "a/full/checkpoint-13464/trainer_state.json" "b/full/checkpoint-13464/trainer_state.json" new file mode 100644--- /dev/null +++ "b/full/checkpoint-13464/trainer_state.json" @@ -0,0 +1,18877 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999628653124883, + "eval_steps": 500, + "global_step": 13464, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003713468751160459, + "grad_norm": 24.08954620361328, + "learning_rate": 3.094059405940594e-07, + "loss": 1.1527, + "step": 5 + }, + { + "epoch": 0.0007426937502320918, + "grad_norm": 18.06231689453125, + "learning_rate": 6.188118811881188e-07, + "loss": 1.1166, + "step": 10 + }, + { + "epoch": 0.0011140406253481376, + "grad_norm": 20.671451568603516, + "learning_rate": 9.282178217821782e-07, + "loss": 1.0603, + "step": 15 + }, + { + "epoch": 0.0014853875004641837, + "grad_norm": 6.015936374664307, + "learning_rate": 1.2376237623762377e-06, + "loss": 0.9175, + "step": 20 + }, + { + "epoch": 0.0018567343755802295, + "grad_norm": 3.1561789512634277, + "learning_rate": 1.5470297029702971e-06, + "loss": 0.8301, + "step": 25 + }, + { + "epoch": 0.002228081250696275, + "grad_norm": 2.571364641189575, + "learning_rate": 1.8564356435643564e-06, + "loss": 0.747, + "step": 30 + }, + { + "epoch": 0.0025994281258123215, + "grad_norm": 2.3243155479431152, + "learning_rate": 2.1658415841584156e-06, + "loss": 0.6899, + "step": 35 + }, + { + "epoch": 0.0029707750009283673, + "grad_norm": 2.2084057331085205, + "learning_rate": 2.4752475247524753e-06, + "loss": 0.692, + "step": 40 + }, + { + "epoch": 0.003342121876044413, + "grad_norm": 2.460026979446411, + "learning_rate": 2.784653465346535e-06, + "loss": 0.6368, + "step": 45 + }, + { + "epoch": 0.003713468751160459, + "grad_norm": 2.309441566467285, + "learning_rate": 3.0940594059405943e-06, + "loss": 0.5975, + "step": 50 + }, + { + "epoch": 0.004084815626276505, + "grad_norm": 2.027996301651001, + "learning_rate": 3.403465346534654e-06, + "loss": 0.6295, + "step": 55 + }, + { + "epoch": 0.00445616250139255, + "grad_norm": 1.7956230640411377, + "learning_rate": 3.7128712871287128e-06, + "loss": 0.5895, + "step": 60 + }, + { + "epoch": 0.004827509376508597, + "grad_norm": 1.8671722412109375, + "learning_rate": 4.0222772277227725e-06, + "loss": 0.5851, + "step": 65 + }, + { + "epoch": 0.005198856251624643, + "grad_norm": 1.6895778179168701, + "learning_rate": 4.331683168316831e-06, + "loss": 0.5727, + "step": 70 + }, + { + "epoch": 0.005570203126740688, + "grad_norm": 1.6208025217056274, + "learning_rate": 4.641089108910891e-06, + "loss": 0.562, + "step": 75 + }, + { + "epoch": 0.005941550001856735, + "grad_norm": 1.7942975759506226, + "learning_rate": 4.950495049504951e-06, + "loss": 0.579, + "step": 80 + }, + { + "epoch": 0.00631289687697278, + "grad_norm": 1.7987550497055054, + "learning_rate": 5.25990099009901e-06, + "loss": 0.5846, + "step": 85 + }, + { + "epoch": 0.006684243752088826, + "grad_norm": 1.887704849243164, + "learning_rate": 5.56930693069307e-06, + "loss": 0.5673, + "step": 90 + }, + { + "epoch": 0.007055590627204872, + "grad_norm": 2.0477488040924072, + "learning_rate": 5.878712871287129e-06, + "loss": 0.5562, + "step": 95 + }, + { + "epoch": 0.007426937502320918, + "grad_norm": 2.3760573863983154, + "learning_rate": 6.1881188118811885e-06, + "loss": 0.5614, + "step": 100 + }, + { + "epoch": 0.007798284377436964, + "grad_norm": 2.2995216846466064, + "learning_rate": 
6.497524752475248e-06, + "loss": 0.5513, + "step": 105 + }, + { + "epoch": 0.00816963125255301, + "grad_norm": 2.010840654373169, + "learning_rate": 6.806930693069308e-06, + "loss": 0.5673, + "step": 110 + }, + { + "epoch": 0.008540978127669055, + "grad_norm": 1.9840739965438843, + "learning_rate": 7.116336633663366e-06, + "loss": 0.5649, + "step": 115 + }, + { + "epoch": 0.0089123250027851, + "grad_norm": 2.1075551509857178, + "learning_rate": 7.4257425742574256e-06, + "loss": 0.545, + "step": 120 + }, + { + "epoch": 0.009283671877901148, + "grad_norm": 1.8998537063598633, + "learning_rate": 7.735148514851485e-06, + "loss": 0.5478, + "step": 125 + }, + { + "epoch": 0.009655018753017193, + "grad_norm": 1.6922427415847778, + "learning_rate": 8.044554455445545e-06, + "loss": 0.5526, + "step": 130 + }, + { + "epoch": 0.010026365628133239, + "grad_norm": 1.8979038000106812, + "learning_rate": 8.353960396039605e-06, + "loss": 0.5557, + "step": 135 + }, + { + "epoch": 0.010397712503249286, + "grad_norm": 1.866027593612671, + "learning_rate": 8.663366336633663e-06, + "loss": 0.535, + "step": 140 + }, + { + "epoch": 0.010769059378365331, + "grad_norm": 2.00050950050354, + "learning_rate": 8.972772277227722e-06, + "loss": 0.5548, + "step": 145 + }, + { + "epoch": 0.011140406253481377, + "grad_norm": 1.6254732608795166, + "learning_rate": 9.282178217821782e-06, + "loss": 0.5264, + "step": 150 + }, + { + "epoch": 0.011511753128597422, + "grad_norm": 1.7823671102523804, + "learning_rate": 9.591584158415842e-06, + "loss": 0.554, + "step": 155 + }, + { + "epoch": 0.01188310000371347, + "grad_norm": 1.4978529214859009, + "learning_rate": 9.900990099009901e-06, + "loss": 0.5328, + "step": 160 + }, + { + "epoch": 0.012254446878829515, + "grad_norm": 1.9153281450271606, + "learning_rate": 1.0210396039603961e-05, + "loss": 0.5349, + "step": 165 + }, + { + "epoch": 0.01262579375394556, + "grad_norm": 1.415780782699585, + "learning_rate": 1.051980198019802e-05, + "loss": 0.5395, + "step": 170 + }, + { + "epoch": 0.012997140629061606, + "grad_norm": 1.6551861763000488, + "learning_rate": 1.082920792079208e-05, + "loss": 0.5429, + "step": 175 + }, + { + "epoch": 0.013368487504177653, + "grad_norm": 1.8297767639160156, + "learning_rate": 1.113861386138614e-05, + "loss": 0.536, + "step": 180 + }, + { + "epoch": 0.013739834379293698, + "grad_norm": 1.7368695735931396, + "learning_rate": 1.14480198019802e-05, + "loss": 0.5343, + "step": 185 + }, + { + "epoch": 0.014111181254409744, + "grad_norm": 1.7614681720733643, + "learning_rate": 1.1757425742574258e-05, + "loss": 0.532, + "step": 190 + }, + { + "epoch": 0.01448252812952579, + "grad_norm": 1.63772714138031, + "learning_rate": 1.2066831683168317e-05, + "loss": 0.5602, + "step": 195 + }, + { + "epoch": 0.014853875004641836, + "grad_norm": 1.5544772148132324, + "learning_rate": 1.2376237623762377e-05, + "loss": 0.5387, + "step": 200 + }, + { + "epoch": 0.015225221879757882, + "grad_norm": 1.7135066986083984, + "learning_rate": 1.2685643564356437e-05, + "loss": 0.532, + "step": 205 + }, + { + "epoch": 0.015596568754873927, + "grad_norm": 1.9367787837982178, + "learning_rate": 1.2995049504950496e-05, + "loss": 0.5353, + "step": 210 + }, + { + "epoch": 0.015967915629989973, + "grad_norm": 1.8191226720809937, + "learning_rate": 1.3304455445544556e-05, + "loss": 0.53, + "step": 215 + }, + { + "epoch": 0.01633926250510602, + "grad_norm": 1.752224087715149, + "learning_rate": 1.3613861386138616e-05, + "loss": 0.5535, + "step": 220 + }, + { + "epoch": 0.016710609380222067, 
+ "grad_norm": 1.6079806089401245, + "learning_rate": 1.3923267326732675e-05, + "loss": 0.5679, + "step": 225 + }, + { + "epoch": 0.01708195625533811, + "grad_norm": 1.429037094116211, + "learning_rate": 1.4232673267326732e-05, + "loss": 0.5179, + "step": 230 + }, + { + "epoch": 0.017453303130454158, + "grad_norm": 1.5963634252548218, + "learning_rate": 1.4542079207920791e-05, + "loss": 0.5224, + "step": 235 + }, + { + "epoch": 0.0178246500055702, + "grad_norm": 2.0120062828063965, + "learning_rate": 1.4851485148514851e-05, + "loss": 0.5274, + "step": 240 + }, + { + "epoch": 0.01819599688068625, + "grad_norm": 1.9131743907928467, + "learning_rate": 1.516089108910891e-05, + "loss": 0.5365, + "step": 245 + }, + { + "epoch": 0.018567343755802296, + "grad_norm": 1.6677278280258179, + "learning_rate": 1.547029702970297e-05, + "loss": 0.5378, + "step": 250 + }, + { + "epoch": 0.01893869063091834, + "grad_norm": 1.643923282623291, + "learning_rate": 1.577970297029703e-05, + "loss": 0.5427, + "step": 255 + }, + { + "epoch": 0.019310037506034387, + "grad_norm": 1.5207113027572632, + "learning_rate": 1.608910891089109e-05, + "loss": 0.543, + "step": 260 + }, + { + "epoch": 0.019681384381150434, + "grad_norm": 1.5055848360061646, + "learning_rate": 1.639851485148515e-05, + "loss": 0.5276, + "step": 265 + }, + { + "epoch": 0.020052731256266478, + "grad_norm": 1.609848141670227, + "learning_rate": 1.670792079207921e-05, + "loss": 0.5295, + "step": 270 + }, + { + "epoch": 0.020424078131382525, + "grad_norm": 1.8507963418960571, + "learning_rate": 1.701732673267327e-05, + "loss": 0.5399, + "step": 275 + }, + { + "epoch": 0.020795425006498572, + "grad_norm": 1.620498538017273, + "learning_rate": 1.7326732673267325e-05, + "loss": 0.5126, + "step": 280 + }, + { + "epoch": 0.021166771881614616, + "grad_norm": 1.9648469686508179, + "learning_rate": 1.7636138613861385e-05, + "loss": 0.5409, + "step": 285 + }, + { + "epoch": 0.021538118756730663, + "grad_norm": 1.8357899188995361, + "learning_rate": 1.7945544554455445e-05, + "loss": 0.5291, + "step": 290 + }, + { + "epoch": 0.021909465631846706, + "grad_norm": 1.7807576656341553, + "learning_rate": 1.8254950495049504e-05, + "loss": 0.5415, + "step": 295 + }, + { + "epoch": 0.022280812506962754, + "grad_norm": 1.6266732215881348, + "learning_rate": 1.8564356435643564e-05, + "loss": 0.5351, + "step": 300 + }, + { + "epoch": 0.0226521593820788, + "grad_norm": 1.704314947128296, + "learning_rate": 1.8873762376237624e-05, + "loss": 0.5197, + "step": 305 + }, + { + "epoch": 0.023023506257194844, + "grad_norm": 1.6191048622131348, + "learning_rate": 1.9183168316831683e-05, + "loss": 0.5324, + "step": 310 + }, + { + "epoch": 0.02339485313231089, + "grad_norm": 1.5764744281768799, + "learning_rate": 1.9492574257425743e-05, + "loss": 0.5155, + "step": 315 + }, + { + "epoch": 0.02376620000742694, + "grad_norm": 1.7684473991394043, + "learning_rate": 1.9801980198019803e-05, + "loss": 0.5385, + "step": 320 + }, + { + "epoch": 0.024137546882542982, + "grad_norm": 1.5683865547180176, + "learning_rate": 2.0111386138613862e-05, + "loss": 0.5161, + "step": 325 + }, + { + "epoch": 0.02450889375765903, + "grad_norm": 1.4367133378982544, + "learning_rate": 2.0420792079207922e-05, + "loss": 0.5179, + "step": 330 + }, + { + "epoch": 0.024880240632775077, + "grad_norm": 1.755611538887024, + "learning_rate": 2.073019801980198e-05, + "loss": 0.5299, + "step": 335 + }, + { + "epoch": 0.02525158750789112, + "grad_norm": 1.6052712202072144, + "learning_rate": 2.103960396039604e-05, + 
"loss": 0.5318, + "step": 340 + }, + { + "epoch": 0.025622934383007168, + "grad_norm": 1.6675814390182495, + "learning_rate": 2.13490099009901e-05, + "loss": 0.5473, + "step": 345 + }, + { + "epoch": 0.02599428125812321, + "grad_norm": 1.628082036972046, + "learning_rate": 2.165841584158416e-05, + "loss": 0.5245, + "step": 350 + }, + { + "epoch": 0.02636562813323926, + "grad_norm": 1.5258337259292603, + "learning_rate": 2.196782178217822e-05, + "loss": 0.5231, + "step": 355 + }, + { + "epoch": 0.026736975008355306, + "grad_norm": 1.8839592933654785, + "learning_rate": 2.227722772277228e-05, + "loss": 0.5127, + "step": 360 + }, + { + "epoch": 0.02710832188347135, + "grad_norm": 1.5799691677093506, + "learning_rate": 2.258663366336634e-05, + "loss": 0.5396, + "step": 365 + }, + { + "epoch": 0.027479668758587397, + "grad_norm": 1.3579293489456177, + "learning_rate": 2.28960396039604e-05, + "loss": 0.5211, + "step": 370 + }, + { + "epoch": 0.027851015633703444, + "grad_norm": 1.8711540699005127, + "learning_rate": 2.320544554455446e-05, + "loss": 0.5274, + "step": 375 + }, + { + "epoch": 0.028222362508819487, + "grad_norm": 1.3717608451843262, + "learning_rate": 2.3514851485148515e-05, + "loss": 0.5333, + "step": 380 + }, + { + "epoch": 0.028593709383935535, + "grad_norm": 1.8865386247634888, + "learning_rate": 2.3824257425742575e-05, + "loss": 0.5255, + "step": 385 + }, + { + "epoch": 0.02896505625905158, + "grad_norm": 1.7165244817733765, + "learning_rate": 2.4133663366336635e-05, + "loss": 0.5313, + "step": 390 + }, + { + "epoch": 0.029336403134167625, + "grad_norm": 1.7152857780456543, + "learning_rate": 2.4443069306930694e-05, + "loss": 0.5368, + "step": 395 + }, + { + "epoch": 0.029707750009283673, + "grad_norm": 1.3566906452178955, + "learning_rate": 2.4752475247524754e-05, + "loss": 0.5333, + "step": 400 + }, + { + "epoch": 0.030079096884399716, + "grad_norm": 1.4016120433807373, + "learning_rate": 2.5061881188118814e-05, + "loss": 0.4903, + "step": 405 + }, + { + "epoch": 0.030450443759515763, + "grad_norm": 1.3507463932037354, + "learning_rate": 2.5371287128712873e-05, + "loss": 0.5223, + "step": 410 + }, + { + "epoch": 0.03082179063463181, + "grad_norm": 1.451769232749939, + "learning_rate": 2.5680693069306933e-05, + "loss": 0.5069, + "step": 415 + }, + { + "epoch": 0.031193137509747854, + "grad_norm": 1.7525213956832886, + "learning_rate": 2.5990099009900993e-05, + "loss": 0.5088, + "step": 420 + }, + { + "epoch": 0.0315644843848639, + "grad_norm": 1.8567416667938232, + "learning_rate": 2.6299504950495053e-05, + "loss": 0.5191, + "step": 425 + }, + { + "epoch": 0.031935831259979945, + "grad_norm": 1.7284778356552124, + "learning_rate": 2.6608910891089112e-05, + "loss": 0.5325, + "step": 430 + }, + { + "epoch": 0.032307178135095996, + "grad_norm": 1.5363997220993042, + "learning_rate": 2.6918316831683172e-05, + "loss": 0.5364, + "step": 435 + }, + { + "epoch": 0.03267852501021204, + "grad_norm": 1.3674659729003906, + "learning_rate": 2.722772277227723e-05, + "loss": 0.519, + "step": 440 + }, + { + "epoch": 0.03304987188532808, + "grad_norm": 1.5923279523849487, + "learning_rate": 2.753712871287129e-05, + "loss": 0.5007, + "step": 445 + }, + { + "epoch": 0.033421218760444134, + "grad_norm": 1.5102838277816772, + "learning_rate": 2.784653465346535e-05, + "loss": 0.5195, + "step": 450 + }, + { + "epoch": 0.03379256563556018, + "grad_norm": 1.2837610244750977, + "learning_rate": 2.8155940594059404e-05, + "loss": 0.5125, + "step": 455 + }, + { + "epoch": 0.03416391251067622, + 
"grad_norm": 1.592606782913208, + "learning_rate": 2.8465346534653464e-05, + "loss": 0.516, + "step": 460 + }, + { + "epoch": 0.03453525938579227, + "grad_norm": 1.7333269119262695, + "learning_rate": 2.8774752475247523e-05, + "loss": 0.5314, + "step": 465 + }, + { + "epoch": 0.034906606260908316, + "grad_norm": 1.5925847291946411, + "learning_rate": 2.9084158415841583e-05, + "loss": 0.5196, + "step": 470 + }, + { + "epoch": 0.03527795313602436, + "grad_norm": 1.5458823442459106, + "learning_rate": 2.9393564356435643e-05, + "loss": 0.5271, + "step": 475 + }, + { + "epoch": 0.0356493000111404, + "grad_norm": 1.2263989448547363, + "learning_rate": 2.9702970297029702e-05, + "loss": 0.5217, + "step": 480 + }, + { + "epoch": 0.036020646886256454, + "grad_norm": 1.3630470037460327, + "learning_rate": 3.0012376237623762e-05, + "loss": 0.5026, + "step": 485 + }, + { + "epoch": 0.0363919937613725, + "grad_norm": 1.3696426153182983, + "learning_rate": 3.032178217821782e-05, + "loss": 0.5317, + "step": 490 + }, + { + "epoch": 0.03676334063648854, + "grad_norm": 1.7865773439407349, + "learning_rate": 3.063118811881188e-05, + "loss": 0.5163, + "step": 495 + }, + { + "epoch": 0.03713468751160459, + "grad_norm": 1.4488468170166016, + "learning_rate": 3.094059405940594e-05, + "loss": 0.4999, + "step": 500 + }, + { + "epoch": 0.037506034386720635, + "grad_norm": 1.3591450452804565, + "learning_rate": 3.125e-05, + "loss": 0.5287, + "step": 505 + }, + { + "epoch": 0.03787738126183668, + "grad_norm": 1.3478190898895264, + "learning_rate": 3.155940594059406e-05, + "loss": 0.5325, + "step": 510 + }, + { + "epoch": 0.03824872813695273, + "grad_norm": 1.257960319519043, + "learning_rate": 3.186881188118812e-05, + "loss": 0.5381, + "step": 515 + }, + { + "epoch": 0.03862007501206877, + "grad_norm": 1.758743166923523, + "learning_rate": 3.217821782178218e-05, + "loss": 0.5584, + "step": 520 + }, + { + "epoch": 0.03899142188718482, + "grad_norm": 1.5418094396591187, + "learning_rate": 3.248762376237624e-05, + "loss": 0.5378, + "step": 525 + }, + { + "epoch": 0.03936276876230087, + "grad_norm": 3.621204376220703, + "learning_rate": 3.27970297029703e-05, + "loss": 0.5402, + "step": 530 + }, + { + "epoch": 0.03973411563741691, + "grad_norm": 1.2378004789352417, + "learning_rate": 3.310643564356436e-05, + "loss": 0.5234, + "step": 535 + }, + { + "epoch": 0.040105462512532955, + "grad_norm": 1.4689031839370728, + "learning_rate": 3.341584158415842e-05, + "loss": 0.5305, + "step": 540 + }, + { + "epoch": 0.040476809387649006, + "grad_norm": 1.4394994974136353, + "learning_rate": 3.372524752475248e-05, + "loss": 0.5413, + "step": 545 + }, + { + "epoch": 0.04084815626276505, + "grad_norm": 1.2873084545135498, + "learning_rate": 3.403465346534654e-05, + "loss": 0.563, + "step": 550 + }, + { + "epoch": 0.04121950313788109, + "grad_norm": 1.5888473987579346, + "learning_rate": 3.43440594059406e-05, + "loss": 0.5219, + "step": 555 + }, + { + "epoch": 0.041590850012997144, + "grad_norm": 1.2506392002105713, + "learning_rate": 3.465346534653465e-05, + "loss": 0.5213, + "step": 560 + }, + { + "epoch": 0.04196219688811319, + "grad_norm": 1.3455543518066406, + "learning_rate": 3.496287128712871e-05, + "loss": 0.5112, + "step": 565 + }, + { + "epoch": 0.04233354376322923, + "grad_norm": 1.3935933113098145, + "learning_rate": 3.527227722772277e-05, + "loss": 0.5205, + "step": 570 + }, + { + "epoch": 0.04270489063834528, + "grad_norm": 1.4280612468719482, + "learning_rate": 3.558168316831683e-05, + "loss": 0.5188, + "step": 575 + }, + 
{ + "epoch": 0.043076237513461325, + "grad_norm": 1.0775330066680908, + "learning_rate": 3.589108910891089e-05, + "loss": 0.5062, + "step": 580 + }, + { + "epoch": 0.04344758438857737, + "grad_norm": 1.1990082263946533, + "learning_rate": 3.620049504950495e-05, + "loss": 0.5318, + "step": 585 + }, + { + "epoch": 0.04381893126369341, + "grad_norm": 1.9189587831497192, + "learning_rate": 3.650990099009901e-05, + "loss": 0.5046, + "step": 590 + }, + { + "epoch": 0.04419027813880946, + "grad_norm": 1.3169859647750854, + "learning_rate": 3.681930693069307e-05, + "loss": 0.5214, + "step": 595 + }, + { + "epoch": 0.04456162501392551, + "grad_norm": 1.3218770027160645, + "learning_rate": 3.712871287128713e-05, + "loss": 0.5586, + "step": 600 + }, + { + "epoch": 0.04493297188904155, + "grad_norm": 1.255031704902649, + "learning_rate": 3.743811881188119e-05, + "loss": 0.5327, + "step": 605 + }, + { + "epoch": 0.0453043187641576, + "grad_norm": 1.5894232988357544, + "learning_rate": 3.774752475247525e-05, + "loss": 0.5205, + "step": 610 + }, + { + "epoch": 0.045675665639273645, + "grad_norm": 1.4050710201263428, + "learning_rate": 3.805693069306931e-05, + "loss": 0.5393, + "step": 615 + }, + { + "epoch": 0.04604701251438969, + "grad_norm": 1.4929163455963135, + "learning_rate": 3.8366336633663367e-05, + "loss": 0.5024, + "step": 620 + }, + { + "epoch": 0.04641835938950574, + "grad_norm": 1.336452841758728, + "learning_rate": 3.8675742574257426e-05, + "loss": 0.5212, + "step": 625 + }, + { + "epoch": 0.04678970626462178, + "grad_norm": 1.2623363733291626, + "learning_rate": 3.8985148514851486e-05, + "loss": 0.5135, + "step": 630 + }, + { + "epoch": 0.04716105313973783, + "grad_norm": 1.336139440536499, + "learning_rate": 3.9294554455445546e-05, + "loss": 0.5288, + "step": 635 + }, + { + "epoch": 0.04753240001485388, + "grad_norm": 1.4025272130966187, + "learning_rate": 3.9603960396039605e-05, + "loss": 0.5192, + "step": 640 + }, + { + "epoch": 0.04790374688996992, + "grad_norm": 1.3830080032348633, + "learning_rate": 3.9913366336633665e-05, + "loss": 0.5229, + "step": 645 + }, + { + "epoch": 0.048275093765085965, + "grad_norm": 1.624871850013733, + "learning_rate": 4.0222772277227725e-05, + "loss": 0.5078, + "step": 650 + }, + { + "epoch": 0.048646440640202016, + "grad_norm": 1.2770441770553589, + "learning_rate": 4.0532178217821784e-05, + "loss": 0.5115, + "step": 655 + }, + { + "epoch": 0.04901778751531806, + "grad_norm": 1.5466053485870361, + "learning_rate": 4.0841584158415844e-05, + "loss": 0.5047, + "step": 660 + }, + { + "epoch": 0.0493891343904341, + "grad_norm": 1.1815299987792969, + "learning_rate": 4.1150990099009904e-05, + "loss": 0.5196, + "step": 665 + }, + { + "epoch": 0.049760481265550154, + "grad_norm": 1.5421866178512573, + "learning_rate": 4.146039603960396e-05, + "loss": 0.4923, + "step": 670 + }, + { + "epoch": 0.0501318281406662, + "grad_norm": 1.328102707862854, + "learning_rate": 4.176980198019802e-05, + "loss": 0.5092, + "step": 675 + }, + { + "epoch": 0.05050317501578224, + "grad_norm": 1.354792594909668, + "learning_rate": 4.207920792079208e-05, + "loss": 0.5084, + "step": 680 + }, + { + "epoch": 0.050874521890898285, + "grad_norm": 1.2935703992843628, + "learning_rate": 4.238861386138614e-05, + "loss": 0.5262, + "step": 685 + }, + { + "epoch": 0.051245868766014335, + "grad_norm": 1.5065386295318604, + "learning_rate": 4.26980198019802e-05, + "loss": 0.5239, + "step": 690 + }, + { + "epoch": 0.05161721564113038, + "grad_norm": 1.410735845565796, + "learning_rate": 
4.300742574257426e-05, + "loss": 0.5291, + "step": 695 + }, + { + "epoch": 0.05198856251624642, + "grad_norm": 1.1437729597091675, + "learning_rate": 4.331683168316832e-05, + "loss": 0.5144, + "step": 700 + }, + { + "epoch": 0.05235990939136247, + "grad_norm": 1.7676151990890503, + "learning_rate": 4.362623762376238e-05, + "loss": 0.5213, + "step": 705 + }, + { + "epoch": 0.05273125626647852, + "grad_norm": 1.1369547843933105, + "learning_rate": 4.393564356435644e-05, + "loss": 0.5246, + "step": 710 + }, + { + "epoch": 0.05310260314159456, + "grad_norm": 1.37111496925354, + "learning_rate": 4.42450495049505e-05, + "loss": 0.5051, + "step": 715 + }, + { + "epoch": 0.05347395001671061, + "grad_norm": 1.3732452392578125, + "learning_rate": 4.455445544554456e-05, + "loss": 0.5288, + "step": 720 + }, + { + "epoch": 0.053845296891826655, + "grad_norm": 1.937808632850647, + "learning_rate": 4.486386138613862e-05, + "loss": 0.5245, + "step": 725 + }, + { + "epoch": 0.0542166437669427, + "grad_norm": 1.255505084991455, + "learning_rate": 4.517326732673268e-05, + "loss": 0.5184, + "step": 730 + }, + { + "epoch": 0.05458799064205875, + "grad_norm": 1.2280532121658325, + "learning_rate": 4.548267326732674e-05, + "loss": 0.5227, + "step": 735 + }, + { + "epoch": 0.05495933751717479, + "grad_norm": 1.4528734683990479, + "learning_rate": 4.57920792079208e-05, + "loss": 0.5056, + "step": 740 + }, + { + "epoch": 0.05533068439229084, + "grad_norm": 1.3214168548583984, + "learning_rate": 4.610148514851486e-05, + "loss": 0.5231, + "step": 745 + }, + { + "epoch": 0.05570203126740689, + "grad_norm": 1.2361901998519897, + "learning_rate": 4.641089108910892e-05, + "loss": 0.5375, + "step": 750 + }, + { + "epoch": 0.05607337814252293, + "grad_norm": 1.4549564123153687, + "learning_rate": 4.672029702970298e-05, + "loss": 0.5249, + "step": 755 + }, + { + "epoch": 0.056444725017638975, + "grad_norm": 1.2583074569702148, + "learning_rate": 4.702970297029703e-05, + "loss": 0.522, + "step": 760 + }, + { + "epoch": 0.056816071892755025, + "grad_norm": 1.1710424423217773, + "learning_rate": 4.733910891089109e-05, + "loss": 0.4936, + "step": 765 + }, + { + "epoch": 0.05718741876787107, + "grad_norm": 1.3571052551269531, + "learning_rate": 4.764851485148515e-05, + "loss": 0.5418, + "step": 770 + }, + { + "epoch": 0.05755876564298711, + "grad_norm": 1.2731226682662964, + "learning_rate": 4.795792079207921e-05, + "loss": 0.5191, + "step": 775 + }, + { + "epoch": 0.05793011251810316, + "grad_norm": 1.5696614980697632, + "learning_rate": 4.826732673267327e-05, + "loss": 0.5168, + "step": 780 + }, + { + "epoch": 0.05830145939321921, + "grad_norm": 1.2827826738357544, + "learning_rate": 4.857673267326733e-05, + "loss": 0.512, + "step": 785 + }, + { + "epoch": 0.05867280626833525, + "grad_norm": 2.312535524368286, + "learning_rate": 4.888613861386139e-05, + "loss": 0.5248, + "step": 790 + }, + { + "epoch": 0.059044153143451295, + "grad_norm": 1.1396629810333252, + "learning_rate": 4.919554455445545e-05, + "loss": 0.5347, + "step": 795 + }, + { + "epoch": 0.059415500018567345, + "grad_norm": 1.181564211845398, + "learning_rate": 4.950495049504951e-05, + "loss": 0.5062, + "step": 800 + }, + { + "epoch": 0.05978684689368339, + "grad_norm": 1.0706778764724731, + "learning_rate": 4.981435643564357e-05, + "loss": 0.5154, + "step": 805 + }, + { + "epoch": 0.06015819376879943, + "grad_norm": 1.4956251382827759, + "learning_rate": 4.9999999276691775e-05, + "loss": 0.5154, + "step": 810 + }, + { + "epoch": 0.06052954064391548, + 
"grad_norm": 1.0542585849761963, + "learning_rate": 4.9999991139474754e-05, + "loss": 0.5341, + "step": 815 + }, + { + "epoch": 0.06090088751903153, + "grad_norm": 1.3105566501617432, + "learning_rate": 4.999997396090837e-05, + "loss": 0.5366, + "step": 820 + }, + { + "epoch": 0.06127223439414757, + "grad_norm": 1.2557010650634766, + "learning_rate": 4.9999947740998846e-05, + "loss": 0.5292, + "step": 825 + }, + { + "epoch": 0.06164358126926362, + "grad_norm": 1.1770594120025635, + "learning_rate": 4.999991247975567e-05, + "loss": 0.5266, + "step": 830 + }, + { + "epoch": 0.062014928144379665, + "grad_norm": 1.2295761108398438, + "learning_rate": 4.9999868177191576e-05, + "loss": 0.5223, + "step": 835 + }, + { + "epoch": 0.06238627501949571, + "grad_norm": 1.2842494249343872, + "learning_rate": 4.999981483332261e-05, + "loss": 0.5138, + "step": 840 + }, + { + "epoch": 0.06275762189461176, + "grad_norm": 1.5931482315063477, + "learning_rate": 4.999975244816805e-05, + "loss": 0.5125, + "step": 845 + }, + { + "epoch": 0.0631289687697278, + "grad_norm": 1.422558069229126, + "learning_rate": 4.9999681021750454e-05, + "loss": 0.5139, + "step": 850 + }, + { + "epoch": 0.06350031564484385, + "grad_norm": 1.165947675704956, + "learning_rate": 4.999960055409566e-05, + "loss": 0.5069, + "step": 855 + }, + { + "epoch": 0.06387166251995989, + "grad_norm": 1.1719167232513428, + "learning_rate": 4.999951104523277e-05, + "loss": 0.5198, + "step": 860 + }, + { + "epoch": 0.06424300939507593, + "grad_norm": 1.5366706848144531, + "learning_rate": 4.999941249519416e-05, + "loss": 0.5169, + "step": 865 + }, + { + "epoch": 0.06461435627019199, + "grad_norm": 1.3433315753936768, + "learning_rate": 4.999930490401546e-05, + "loss": 0.5239, + "step": 870 + }, + { + "epoch": 0.06498570314530804, + "grad_norm": 1.2835320234298706, + "learning_rate": 4.9999188271735584e-05, + "loss": 0.5341, + "step": 875 + }, + { + "epoch": 0.06535705002042408, + "grad_norm": 1.7535203695297241, + "learning_rate": 4.9999062598396715e-05, + "loss": 0.4885, + "step": 880 + }, + { + "epoch": 0.06572839689554012, + "grad_norm": 1.2752184867858887, + "learning_rate": 4.999892788404431e-05, + "loss": 0.5412, + "step": 885 + }, + { + "epoch": 0.06609974377065617, + "grad_norm": 1.509055733680725, + "learning_rate": 4.999878412872708e-05, + "loss": 0.526, + "step": 890 + }, + { + "epoch": 0.06647109064577221, + "grad_norm": 1.215212106704712, + "learning_rate": 4.9998631332497014e-05, + "loss": 0.5281, + "step": 895 + }, + { + "epoch": 0.06684243752088827, + "grad_norm": 1.4890586137771606, + "learning_rate": 4.999846949540937e-05, + "loss": 0.5112, + "step": 900 + }, + { + "epoch": 0.06721378439600431, + "grad_norm": 1.1772491931915283, + "learning_rate": 4.999829861752269e-05, + "loss": 0.5101, + "step": 905 + }, + { + "epoch": 0.06758513127112036, + "grad_norm": 1.0531741380691528, + "learning_rate": 4.999811869889876e-05, + "loss": 0.4913, + "step": 910 + }, + { + "epoch": 0.0679564781462364, + "grad_norm": 1.1728847026824951, + "learning_rate": 4.999792973960266e-05, + "loss": 0.5091, + "step": 915 + }, + { + "epoch": 0.06832782502135244, + "grad_norm": 1.3163584470748901, + "learning_rate": 4.99977317397027e-05, + "loss": 0.534, + "step": 920 + }, + { + "epoch": 0.06869917189646849, + "grad_norm": 1.0669679641723633, + "learning_rate": 4.9997524699270526e-05, + "loss": 0.5161, + "step": 925 + }, + { + "epoch": 0.06907051877158454, + "grad_norm": 1.5822666883468628, + "learning_rate": 4.999730861838099e-05, + "loss": 0.5384, + "step": 
930 + }, + { + "epoch": 0.06944186564670059, + "grad_norm": 1.1044566631317139, + "learning_rate": 4.999708349711224e-05, + "loss": 0.5078, + "step": 935 + }, + { + "epoch": 0.06981321252181663, + "grad_norm": 1.54874849319458, + "learning_rate": 4.99968493355457e-05, + "loss": 0.5163, + "step": 940 + }, + { + "epoch": 0.07018455939693267, + "grad_norm": 1.185843586921692, + "learning_rate": 4.999660613376606e-05, + "loss": 0.5176, + "step": 945 + }, + { + "epoch": 0.07055590627204872, + "grad_norm": 1.1341667175292969, + "learning_rate": 4.999635389186126e-05, + "loss": 0.5192, + "step": 950 + }, + { + "epoch": 0.07092725314716476, + "grad_norm": 1.001858115196228, + "learning_rate": 4.999609260992253e-05, + "loss": 0.5224, + "step": 955 + }, + { + "epoch": 0.0712986000222808, + "grad_norm": 1.3810193538665771, + "learning_rate": 4.999582228804437e-05, + "loss": 0.4965, + "step": 960 + }, + { + "epoch": 0.07166994689739686, + "grad_norm": 1.026790738105774, + "learning_rate": 4.9995542926324536e-05, + "loss": 0.5086, + "step": 965 + }, + { + "epoch": 0.07204129377251291, + "grad_norm": 1.3299849033355713, + "learning_rate": 4.999525452486407e-05, + "loss": 0.5177, + "step": 970 + }, + { + "epoch": 0.07241264064762895, + "grad_norm": 1.235654354095459, + "learning_rate": 4.9994957083767256e-05, + "loss": 0.5109, + "step": 975 + }, + { + "epoch": 0.072783987522745, + "grad_norm": 1.2094359397888184, + "learning_rate": 4.9994650603141676e-05, + "loss": 0.5019, + "step": 980 + }, + { + "epoch": 0.07315533439786104, + "grad_norm": 1.2392818927764893, + "learning_rate": 4.999433508309817e-05, + "loss": 0.52, + "step": 985 + }, + { + "epoch": 0.07352668127297708, + "grad_norm": 1.0825366973876953, + "learning_rate": 4.999401052375085e-05, + "loss": 0.491, + "step": 990 + }, + { + "epoch": 0.07389802814809314, + "grad_norm": 1.3078283071517944, + "learning_rate": 4.999367692521709e-05, + "loss": 0.4992, + "step": 995 + }, + { + "epoch": 0.07426937502320918, + "grad_norm": 1.2533962726593018, + "learning_rate": 4.9993334287617546e-05, + "loss": 0.5161, + "step": 1000 + }, + { + "epoch": 0.07464072189832523, + "grad_norm": 1.3753759860992432, + "learning_rate": 4.999298261107611e-05, + "loss": 0.526, + "step": 1005 + }, + { + "epoch": 0.07501206877344127, + "grad_norm": 1.099392056465149, + "learning_rate": 4.999262189571999e-05, + "loss": 0.5163, + "step": 1010 + }, + { + "epoch": 0.07538341564855731, + "grad_norm": 1.0913268327713013, + "learning_rate": 4.999225214167964e-05, + "loss": 0.5052, + "step": 1015 + }, + { + "epoch": 0.07575476252367336, + "grad_norm": 1.2516167163848877, + "learning_rate": 4.999187334908877e-05, + "loss": 0.5093, + "step": 1020 + }, + { + "epoch": 0.07612610939878942, + "grad_norm": 1.0062530040740967, + "learning_rate": 4.999148551808438e-05, + "loss": 0.4984, + "step": 1025 + }, + { + "epoch": 0.07649745627390546, + "grad_norm": 0.9695545434951782, + "learning_rate": 4.999108864880673e-05, + "loss": 0.5041, + "step": 1030 + }, + { + "epoch": 0.0768688031490215, + "grad_norm": 1.2652006149291992, + "learning_rate": 4.999068274139934e-05, + "loss": 0.525, + "step": 1035 + }, + { + "epoch": 0.07724015002413755, + "grad_norm": 1.2803031206130981, + "learning_rate": 4.999026779600903e-05, + "loss": 0.5169, + "step": 1040 + }, + { + "epoch": 0.07761149689925359, + "grad_norm": 1.0694555044174194, + "learning_rate": 4.9989843812785845e-05, + "loss": 0.532, + "step": 1045 + }, + { + "epoch": 0.07798284377436963, + "grad_norm": 1.0950549840927124, + "learning_rate": 
4.998941079188313e-05, + "loss": 0.5061, + "step": 1050 + }, + { + "epoch": 0.07835419064948568, + "grad_norm": 1.2538092136383057, + "learning_rate": 4.998896873345749e-05, + "loss": 0.521, + "step": 1055 + }, + { + "epoch": 0.07872553752460174, + "grad_norm": 1.1405869722366333, + "learning_rate": 4.998851763766879e-05, + "loss": 0.5175, + "step": 1060 + }, + { + "epoch": 0.07909688439971778, + "grad_norm": 1.5427117347717285, + "learning_rate": 4.998805750468017e-05, + "loss": 0.5416, + "step": 1065 + }, + { + "epoch": 0.07946823127483382, + "grad_norm": 1.0638082027435303, + "learning_rate": 4.998758833465805e-05, + "loss": 0.4665, + "step": 1070 + }, + { + "epoch": 0.07983957814994987, + "grad_norm": 1.1549831628799438, + "learning_rate": 4.998711012777211e-05, + "loss": 0.5117, + "step": 1075 + }, + { + "epoch": 0.08021092502506591, + "grad_norm": 1.106373906135559, + "learning_rate": 4.998662288419528e-05, + "loss": 0.5101, + "step": 1080 + }, + { + "epoch": 0.08058227190018195, + "grad_norm": 1.6457034349441528, + "learning_rate": 4.998612660410378e-05, + "loss": 0.4911, + "step": 1085 + }, + { + "epoch": 0.08095361877529801, + "grad_norm": 0.9553409814834595, + "learning_rate": 4.998562128767709e-05, + "loss": 0.5055, + "step": 1090 + }, + { + "epoch": 0.08132496565041406, + "grad_norm": 0.9238666892051697, + "learning_rate": 4.998510693509797e-05, + "loss": 0.5128, + "step": 1095 + }, + { + "epoch": 0.0816963125255301, + "grad_norm": 0.9115836024284363, + "learning_rate": 4.998458354655242e-05, + "loss": 0.5021, + "step": 1100 + }, + { + "epoch": 0.08206765940064614, + "grad_norm": 0.9943495392799377, + "learning_rate": 4.998405112222974e-05, + "loss": 0.4871, + "step": 1105 + }, + { + "epoch": 0.08243900627576219, + "grad_norm": 1.342914342880249, + "learning_rate": 4.998350966232248e-05, + "loss": 0.5015, + "step": 1110 + }, + { + "epoch": 0.08281035315087823, + "grad_norm": 1.1267045736312866, + "learning_rate": 4.998295916702646e-05, + "loss": 0.4942, + "step": 1115 + }, + { + "epoch": 0.08318170002599429, + "grad_norm": 1.411318302154541, + "learning_rate": 4.998239963654077e-05, + "loss": 0.5256, + "step": 1120 + }, + { + "epoch": 0.08355304690111033, + "grad_norm": 1.043161153793335, + "learning_rate": 4.9981831071067766e-05, + "loss": 0.4977, + "step": 1125 + }, + { + "epoch": 0.08392439377622637, + "grad_norm": 1.0562570095062256, + "learning_rate": 4.9981253470813066e-05, + "loss": 0.4864, + "step": 1130 + }, + { + "epoch": 0.08429574065134242, + "grad_norm": 1.0544936656951904, + "learning_rate": 4.9980666835985565e-05, + "loss": 0.4805, + "step": 1135 + }, + { + "epoch": 0.08466708752645846, + "grad_norm": 1.0702053308486938, + "learning_rate": 4.998007116679744e-05, + "loss": 0.4813, + "step": 1140 + }, + { + "epoch": 0.0850384344015745, + "grad_norm": 1.0946789979934692, + "learning_rate": 4.9979466463464084e-05, + "loss": 0.5028, + "step": 1145 + }, + { + "epoch": 0.08540978127669056, + "grad_norm": 1.2464505434036255, + "learning_rate": 4.997885272620422e-05, + "loss": 0.5108, + "step": 1150 + }, + { + "epoch": 0.08578112815180661, + "grad_norm": 1.092294692993164, + "learning_rate": 4.997822995523979e-05, + "loss": 0.4844, + "step": 1155 + }, + { + "epoch": 0.08615247502692265, + "grad_norm": 1.2743663787841797, + "learning_rate": 4.9977598150796024e-05, + "loss": 0.4984, + "step": 1160 + }, + { + "epoch": 0.0865238219020387, + "grad_norm": 1.2585391998291016, + "learning_rate": 4.997695731310143e-05, + "loss": 0.4927, + "step": 1165 + }, + { + "epoch": 
0.08689516877715474, + "grad_norm": 0.9504044055938721, + "learning_rate": 4.997630744238775e-05, + "loss": 0.5196, + "step": 1170 + }, + { + "epoch": 0.08726651565227078, + "grad_norm": 1.1162606477737427, + "learning_rate": 4.9975648538890026e-05, + "loss": 0.4981, + "step": 1175 + }, + { + "epoch": 0.08763786252738683, + "grad_norm": 0.9510879516601562, + "learning_rate": 4.997498060284655e-05, + "loss": 0.5058, + "step": 1180 + }, + { + "epoch": 0.08800920940250288, + "grad_norm": 1.524391770362854, + "learning_rate": 4.9974303634498884e-05, + "loss": 0.4991, + "step": 1185 + }, + { + "epoch": 0.08838055627761893, + "grad_norm": 0.9613252878189087, + "learning_rate": 4.997361763409185e-05, + "loss": 0.4902, + "step": 1190 + }, + { + "epoch": 0.08875190315273497, + "grad_norm": 0.8819934725761414, + "learning_rate": 4.997292260187356e-05, + "loss": 0.4917, + "step": 1195 + }, + { + "epoch": 0.08912325002785101, + "grad_norm": 1.2517043352127075, + "learning_rate": 4.997221853809536e-05, + "loss": 0.4978, + "step": 1200 + }, + { + "epoch": 0.08949459690296706, + "grad_norm": 1.4532949924468994, + "learning_rate": 4.997150544301188e-05, + "loss": 0.5176, + "step": 1205 + }, + { + "epoch": 0.0898659437780831, + "grad_norm": 0.9775285720825195, + "learning_rate": 4.997078331688101e-05, + "loss": 0.4947, + "step": 1210 + }, + { + "epoch": 0.09023729065319916, + "grad_norm": 1.055336833000183, + "learning_rate": 4.997005215996392e-05, + "loss": 0.4804, + "step": 1215 + }, + { + "epoch": 0.0906086375283152, + "grad_norm": 1.0310511589050293, + "learning_rate": 4.996931197252503e-05, + "loss": 0.4905, + "step": 1220 + }, + { + "epoch": 0.09097998440343125, + "grad_norm": 1.1748933792114258, + "learning_rate": 4.996856275483204e-05, + "loss": 0.5215, + "step": 1225 + }, + { + "epoch": 0.09135133127854729, + "grad_norm": 1.008360505104065, + "learning_rate": 4.996780450715589e-05, + "loss": 0.5094, + "step": 1230 + }, + { + "epoch": 0.09172267815366333, + "grad_norm": 1.1358569860458374, + "learning_rate": 4.996703722977082e-05, + "loss": 0.4978, + "step": 1235 + }, + { + "epoch": 0.09209402502877938, + "grad_norm": 1.0410033464431763, + "learning_rate": 4.996626092295431e-05, + "loss": 0.4902, + "step": 1240 + }, + { + "epoch": 0.09246537190389544, + "grad_norm": 1.212699294090271, + "learning_rate": 4.9965475586987123e-05, + "loss": 0.5013, + "step": 1245 + }, + { + "epoch": 0.09283671877901148, + "grad_norm": 1.0074434280395508, + "learning_rate": 4.996468122215326e-05, + "loss": 0.4745, + "step": 1250 + }, + { + "epoch": 0.09320806565412752, + "grad_norm": 1.273102045059204, + "learning_rate": 4.996387782874003e-05, + "loss": 0.5078, + "step": 1255 + }, + { + "epoch": 0.09357941252924357, + "grad_norm": 1.1507134437561035, + "learning_rate": 4.9963065407037977e-05, + "loss": 0.5149, + "step": 1260 + }, + { + "epoch": 0.09395075940435961, + "grad_norm": 1.203317642211914, + "learning_rate": 4.99622439573409e-05, + "loss": 0.4843, + "step": 1265 + }, + { + "epoch": 0.09432210627947565, + "grad_norm": 1.1259766817092896, + "learning_rate": 4.9961413479945905e-05, + "loss": 0.5034, + "step": 1270 + }, + { + "epoch": 0.0946934531545917, + "grad_norm": 1.205767273902893, + "learning_rate": 4.9960573975153325e-05, + "loss": 0.5017, + "step": 1275 + }, + { + "epoch": 0.09506480002970776, + "grad_norm": 1.0495589971542358, + "learning_rate": 4.9959725443266765e-05, + "loss": 0.5004, + "step": 1280 + }, + { + "epoch": 0.0954361469048238, + "grad_norm": 1.2640390396118164, + "learning_rate": 
4.995886788459311e-05, + "loss": 0.4858, + "step": 1285 + }, + { + "epoch": 0.09580749377993984, + "grad_norm": 1.2448623180389404, + "learning_rate": 4.9958001299442486e-05, + "loss": 0.5001, + "step": 1290 + }, + { + "epoch": 0.09617884065505589, + "grad_norm": 0.8900221586227417, + "learning_rate": 4.995712568812832e-05, + "loss": 0.472, + "step": 1295 + }, + { + "epoch": 0.09655018753017193, + "grad_norm": 0.8876100182533264, + "learning_rate": 4.995624105096725e-05, + "loss": 0.498, + "step": 1300 + }, + { + "epoch": 0.09692153440528797, + "grad_norm": 0.8986858129501343, + "learning_rate": 4.995534738827923e-05, + "loss": 0.4935, + "step": 1305 + }, + { + "epoch": 0.09729288128040403, + "grad_norm": 1.0120513439178467, + "learning_rate": 4.995444470038746e-05, + "loss": 0.491, + "step": 1310 + }, + { + "epoch": 0.09766422815552007, + "grad_norm": 1.1924999952316284, + "learning_rate": 4.995353298761839e-05, + "loss": 0.4768, + "step": 1315 + }, + { + "epoch": 0.09803557503063612, + "grad_norm": 1.1135913133621216, + "learning_rate": 4.9952612250301746e-05, + "loss": 0.4949, + "step": 1320 + }, + { + "epoch": 0.09840692190575216, + "grad_norm": 1.3137718439102173, + "learning_rate": 4.995168248877051e-05, + "loss": 0.4907, + "step": 1325 + }, + { + "epoch": 0.0987782687808682, + "grad_norm": 1.22038733959198, + "learning_rate": 4.9950743703360954e-05, + "loss": 0.4961, + "step": 1330 + }, + { + "epoch": 0.09914961565598425, + "grad_norm": 1.185892939567566, + "learning_rate": 4.9949795894412576e-05, + "loss": 0.4842, + "step": 1335 + }, + { + "epoch": 0.09952096253110031, + "grad_norm": 0.8760488629341125, + "learning_rate": 4.994883906226816e-05, + "loss": 0.4797, + "step": 1340 + }, + { + "epoch": 0.09989230940621635, + "grad_norm": 1.1870434284210205, + "learning_rate": 4.9947873207273756e-05, + "loss": 0.4775, + "step": 1345 + }, + { + "epoch": 0.1002636562813324, + "grad_norm": 1.0038819313049316, + "learning_rate": 4.994689832977866e-05, + "loss": 0.4744, + "step": 1350 + }, + { + "epoch": 0.10063500315644844, + "grad_norm": 1.1821181774139404, + "learning_rate": 4.994591443013543e-05, + "loss": 0.4957, + "step": 1355 + }, + { + "epoch": 0.10100635003156448, + "grad_norm": 0.9416809678077698, + "learning_rate": 4.994492150869992e-05, + "loss": 0.4998, + "step": 1360 + }, + { + "epoch": 0.10137769690668053, + "grad_norm": 0.8790668845176697, + "learning_rate": 4.9943919565831216e-05, + "loss": 0.5038, + "step": 1365 + }, + { + "epoch": 0.10174904378179657, + "grad_norm": 0.9455973505973816, + "learning_rate": 4.9942908601891666e-05, + "loss": 0.4791, + "step": 1370 + }, + { + "epoch": 0.10212039065691263, + "grad_norm": 1.012926697731018, + "learning_rate": 4.99418886172469e-05, + "loss": 0.4505, + "step": 1375 + }, + { + "epoch": 0.10249173753202867, + "grad_norm": 1.208351492881775, + "learning_rate": 4.99408596122658e-05, + "loss": 0.4861, + "step": 1380 + }, + { + "epoch": 0.10286308440714471, + "grad_norm": 1.1314637660980225, + "learning_rate": 4.99398215873205e-05, + "loss": 0.4873, + "step": 1385 + }, + { + "epoch": 0.10323443128226076, + "grad_norm": 1.5732420682907104, + "learning_rate": 4.9938774542786416e-05, + "loss": 0.4915, + "step": 1390 + }, + { + "epoch": 0.1036057781573768, + "grad_norm": 1.0597292184829712, + "learning_rate": 4.993771847904221e-05, + "loss": 0.4978, + "step": 1395 + }, + { + "epoch": 0.10397712503249285, + "grad_norm": 1.0806019306182861, + "learning_rate": 4.9936653396469814e-05, + "loss": 0.4827, + "step": 1400 + }, + { + "epoch": 
0.1043484719076089, + "grad_norm": 0.9781217575073242, + "learning_rate": 4.9935579295454415e-05, + "loss": 0.4542, + "step": 1405 + }, + { + "epoch": 0.10471981878272495, + "grad_norm": 1.0065959692001343, + "learning_rate": 4.993449617638447e-05, + "loss": 0.5061, + "step": 1410 + }, + { + "epoch": 0.10509116565784099, + "grad_norm": 0.883406400680542, + "learning_rate": 4.9933404039651696e-05, + "loss": 0.4943, + "step": 1415 + }, + { + "epoch": 0.10546251253295703, + "grad_norm": 1.0783164501190186, + "learning_rate": 4.993230288565107e-05, + "loss": 0.4971, + "step": 1420 + }, + { + "epoch": 0.10583385940807308, + "grad_norm": 1.1411856412887573, + "learning_rate": 4.993119271478082e-05, + "loss": 0.4713, + "step": 1425 + }, + { + "epoch": 0.10620520628318912, + "grad_norm": 1.077597737312317, + "learning_rate": 4.993007352744245e-05, + "loss": 0.4882, + "step": 1430 + }, + { + "epoch": 0.10657655315830518, + "grad_norm": 1.2164273262023926, + "learning_rate": 4.992894532404071e-05, + "loss": 0.478, + "step": 1435 + }, + { + "epoch": 0.10694790003342122, + "grad_norm": 1.0068302154541016, + "learning_rate": 4.9927808104983644e-05, + "loss": 0.4889, + "step": 1440 + }, + { + "epoch": 0.10731924690853727, + "grad_norm": 1.2516838312149048, + "learning_rate": 4.99266618706825e-05, + "loss": 0.4866, + "step": 1445 + }, + { + "epoch": 0.10769059378365331, + "grad_norm": 0.9896201491355896, + "learning_rate": 4.992550662155184e-05, + "loss": 0.4707, + "step": 1450 + }, + { + "epoch": 0.10806194065876935, + "grad_norm": 0.8557565808296204, + "learning_rate": 4.992434235800946e-05, + "loss": 0.4912, + "step": 1455 + }, + { + "epoch": 0.1084332875338854, + "grad_norm": 0.9181938171386719, + "learning_rate": 4.992316908047641e-05, + "loss": 0.4949, + "step": 1460 + }, + { + "epoch": 0.10880463440900145, + "grad_norm": 0.9550036787986755, + "learning_rate": 4.9921986789377016e-05, + "loss": 0.4766, + "step": 1465 + }, + { + "epoch": 0.1091759812841175, + "grad_norm": 1.0532774925231934, + "learning_rate": 4.992079548513887e-05, + "loss": 0.4732, + "step": 1470 + }, + { + "epoch": 0.10954732815923354, + "grad_norm": 1.107776403427124, + "learning_rate": 4.99195951681928e-05, + "loss": 0.4779, + "step": 1475 + }, + { + "epoch": 0.10991867503434959, + "grad_norm": 1.1875584125518799, + "learning_rate": 4.991838583897291e-05, + "loss": 0.5138, + "step": 1480 + }, + { + "epoch": 0.11029002190946563, + "grad_norm": 0.963405966758728, + "learning_rate": 4.991716749791656e-05, + "loss": 0.4846, + "step": 1485 + }, + { + "epoch": 0.11066136878458167, + "grad_norm": 1.337599277496338, + "learning_rate": 4.991594014546437e-05, + "loss": 0.482, + "step": 1490 + }, + { + "epoch": 0.11103271565969772, + "grad_norm": 1.094760775566101, + "learning_rate": 4.991470378206021e-05, + "loss": 0.4878, + "step": 1495 + }, + { + "epoch": 0.11140406253481377, + "grad_norm": 1.0118831396102905, + "learning_rate": 4.9913458408151216e-05, + "loss": 0.4795, + "step": 1500 + }, + { + "epoch": 0.11177540940992982, + "grad_norm": 1.1335307359695435, + "learning_rate": 4.991220402418778e-05, + "loss": 0.4705, + "step": 1505 + }, + { + "epoch": 0.11214675628504586, + "grad_norm": 0.9686943292617798, + "learning_rate": 4.991094063062357e-05, + "loss": 0.4788, + "step": 1510 + }, + { + "epoch": 0.1125181031601619, + "grad_norm": 1.356569766998291, + "learning_rate": 4.9909668227915485e-05, + "loss": 0.4903, + "step": 1515 + }, + { + "epoch": 0.11288945003527795, + "grad_norm": 1.1215263605117798, + "learning_rate": 
4.99083868165237e-05, + "loss": 0.4612, + "step": 1520 + }, + { + "epoch": 0.113260796910394, + "grad_norm": 0.9811176061630249, + "learning_rate": 4.9907096396911634e-05, + "loss": 0.4978, + "step": 1525 + }, + { + "epoch": 0.11363214378551005, + "grad_norm": 1.0463788509368896, + "learning_rate": 4.9905796969545985e-05, + "loss": 0.4743, + "step": 1530 + }, + { + "epoch": 0.1140034906606261, + "grad_norm": 0.8583905696868896, + "learning_rate": 4.9904488534896695e-05, + "loss": 0.5038, + "step": 1535 + }, + { + "epoch": 0.11437483753574214, + "grad_norm": 1.0710617303848267, + "learning_rate": 4.990317109343695e-05, + "loss": 0.4723, + "step": 1540 + }, + { + "epoch": 0.11474618441085818, + "grad_norm": 1.47568678855896, + "learning_rate": 4.9901844645643225e-05, + "loss": 0.4758, + "step": 1545 + }, + { + "epoch": 0.11511753128597423, + "grad_norm": 1.0400784015655518, + "learning_rate": 4.990050919199522e-05, + "loss": 0.4836, + "step": 1550 + }, + { + "epoch": 0.11548887816109027, + "grad_norm": 0.974274754524231, + "learning_rate": 4.989916473297592e-05, + "loss": 0.4874, + "step": 1555 + }, + { + "epoch": 0.11586022503620633, + "grad_norm": 2.123159646987915, + "learning_rate": 4.9897811269071554e-05, + "loss": 0.471, + "step": 1560 + }, + { + "epoch": 0.11623157191132237, + "grad_norm": 0.9718955159187317, + "learning_rate": 4.98964488007716e-05, + "loss": 0.4726, + "step": 1565 + }, + { + "epoch": 0.11660291878643841, + "grad_norm": 1.31617271900177, + "learning_rate": 4.989507732856881e-05, + "loss": 0.4713, + "step": 1570 + }, + { + "epoch": 0.11697426566155446, + "grad_norm": 1.1256753206253052, + "learning_rate": 4.989369685295916e-05, + "loss": 0.4697, + "step": 1575 + }, + { + "epoch": 0.1173456125366705, + "grad_norm": 1.2796781063079834, + "learning_rate": 4.9892307374441935e-05, + "loss": 0.5017, + "step": 1580 + }, + { + "epoch": 0.11771695941178655, + "grad_norm": 1.045142650604248, + "learning_rate": 4.989090889351963e-05, + "loss": 0.4939, + "step": 1585 + }, + { + "epoch": 0.11808830628690259, + "grad_norm": 1.194403886795044, + "learning_rate": 4.9889501410698006e-05, + "loss": 0.483, + "step": 1590 + }, + { + "epoch": 0.11845965316201865, + "grad_norm": 1.3747239112854004, + "learning_rate": 4.98880849264861e-05, + "loss": 0.5081, + "step": 1595 + }, + { + "epoch": 0.11883100003713469, + "grad_norm": 1.1211419105529785, + "learning_rate": 4.988665944139618e-05, + "loss": 0.48, + "step": 1600 + }, + { + "epoch": 0.11920234691225073, + "grad_norm": 1.2679734230041504, + "learning_rate": 4.988522495594378e-05, + "loss": 0.4886, + "step": 1605 + }, + { + "epoch": 0.11957369378736678, + "grad_norm": 0.9416651725769043, + "learning_rate": 4.9883781470647684e-05, + "loss": 0.4769, + "step": 1610 + }, + { + "epoch": 0.11994504066248282, + "grad_norm": 1.0857641696929932, + "learning_rate": 4.9882328986029934e-05, + "loss": 0.4767, + "step": 1615 + }, + { + "epoch": 0.12031638753759887, + "grad_norm": 1.0913755893707275, + "learning_rate": 4.9880867502615834e-05, + "loss": 0.4845, + "step": 1620 + }, + { + "epoch": 0.12068773441271492, + "grad_norm": 1.094753623008728, + "learning_rate": 4.987939702093394e-05, + "loss": 0.4514, + "step": 1625 + }, + { + "epoch": 0.12105908128783097, + "grad_norm": 1.1173486709594727, + "learning_rate": 4.9877917541516036e-05, + "loss": 0.4695, + "step": 1630 + }, + { + "epoch": 0.12143042816294701, + "grad_norm": 1.1658018827438354, + "learning_rate": 4.987642906489721e-05, + "loss": 0.5047, + "step": 1635 + }, + { + "epoch": 
0.12180177503806305, + "grad_norm": 1.165580153465271, + "learning_rate": 4.9874931591615756e-05, + "loss": 0.493, + "step": 1640 + }, + { + "epoch": 0.1221731219131791, + "grad_norm": 1.222952127456665, + "learning_rate": 4.9873425122213245e-05, + "loss": 0.5104, + "step": 1645 + }, + { + "epoch": 0.12254446878829514, + "grad_norm": 0.9523534178733826, + "learning_rate": 4.98719096572345e-05, + "loss": 0.4683, + "step": 1650 + }, + { + "epoch": 0.1229158156634112, + "grad_norm": 0.9762274622917175, + "learning_rate": 4.987038519722759e-05, + "loss": 0.4804, + "step": 1655 + }, + { + "epoch": 0.12328716253852724, + "grad_norm": 1.119864821434021, + "learning_rate": 4.986885174274386e-05, + "loss": 0.4881, + "step": 1660 + }, + { + "epoch": 0.12365850941364329, + "grad_norm": 0.6928736567497253, + "learning_rate": 4.9867309294337865e-05, + "loss": 0.4629, + "step": 1665 + }, + { + "epoch": 0.12402985628875933, + "grad_norm": 0.9195057153701782, + "learning_rate": 4.986575785256745e-05, + "loss": 0.4444, + "step": 1670 + }, + { + "epoch": 0.12440120316387537, + "grad_norm": 0.9007054567337036, + "learning_rate": 4.9864197417993705e-05, + "loss": 0.4845, + "step": 1675 + }, + { + "epoch": 0.12477255003899142, + "grad_norm": 1.1238789558410645, + "learning_rate": 4.9862627991180967e-05, + "loss": 0.4726, + "step": 1680 + }, + { + "epoch": 0.12514389691410746, + "grad_norm": 0.964441180229187, + "learning_rate": 4.9861049572696814e-05, + "loss": 0.4785, + "step": 1685 + }, + { + "epoch": 0.12551524378922352, + "grad_norm": 0.9361527562141418, + "learning_rate": 4.98594621631121e-05, + "loss": 0.4457, + "step": 1690 + }, + { + "epoch": 0.12588659066433955, + "grad_norm": 0.9168547987937927, + "learning_rate": 4.985786576300091e-05, + "loss": 0.4703, + "step": 1695 + }, + { + "epoch": 0.1262579375394556, + "grad_norm": 1.0740379095077515, + "learning_rate": 4.9856260372940587e-05, + "loss": 0.4661, + "step": 1700 + }, + { + "epoch": 0.12662928441457166, + "grad_norm": 0.9852397441864014, + "learning_rate": 4.985464599351174e-05, + "loss": 0.458, + "step": 1705 + }, + { + "epoch": 0.1270006312896877, + "grad_norm": 0.8920067548751831, + "learning_rate": 4.98530226252982e-05, + "loss": 0.4639, + "step": 1710 + }, + { + "epoch": 0.12737197816480375, + "grad_norm": 0.983961820602417, + "learning_rate": 4.985139026888708e-05, + "loss": 0.4877, + "step": 1715 + }, + { + "epoch": 0.12774332503991978, + "grad_norm": 1.1460301876068115, + "learning_rate": 4.984974892486872e-05, + "loss": 0.4696, + "step": 1720 + }, + { + "epoch": 0.12811467191503584, + "grad_norm": 1.1181328296661377, + "learning_rate": 4.984809859383671e-05, + "loss": 0.4756, + "step": 1725 + }, + { + "epoch": 0.12848601879015187, + "grad_norm": 0.9542262554168701, + "learning_rate": 4.984643927638792e-05, + "loss": 0.4715, + "step": 1730 + }, + { + "epoch": 0.12885736566526793, + "grad_norm": 1.1598364114761353, + "learning_rate": 4.9844770973122426e-05, + "loss": 0.4744, + "step": 1735 + }, + { + "epoch": 0.12922871254038398, + "grad_norm": 1.019036889076233, + "learning_rate": 4.98430936846436e-05, + "loss": 0.4886, + "step": 1740 + }, + { + "epoch": 0.1296000594155, + "grad_norm": 1.1101309061050415, + "learning_rate": 4.984140741155802e-05, + "loss": 0.4813, + "step": 1745 + }, + { + "epoch": 0.12997140629061607, + "grad_norm": 1.0255146026611328, + "learning_rate": 4.983971215447555e-05, + "loss": 0.4749, + "step": 1750 + }, + { + "epoch": 0.1303427531657321, + "grad_norm": 0.8294594287872314, + "learning_rate": 
4.983800791400927e-05, + "loss": 0.4626, + "step": 1755 + }, + { + "epoch": 0.13071410004084816, + "grad_norm": 1.257464051246643, + "learning_rate": 4.983629469077554e-05, + "loss": 0.4851, + "step": 1760 + }, + { + "epoch": 0.13108544691596422, + "grad_norm": 0.9600897431373596, + "learning_rate": 4.983457248539395e-05, + "loss": 0.474, + "step": 1765 + }, + { + "epoch": 0.13145679379108025, + "grad_norm": 1.1232261657714844, + "learning_rate": 4.983284129848733e-05, + "loss": 0.4704, + "step": 1770 + }, + { + "epoch": 0.1318281406661963, + "grad_norm": 0.901701807975769, + "learning_rate": 4.983110113068179e-05, + "loss": 0.4659, + "step": 1775 + }, + { + "epoch": 0.13219948754131233, + "grad_norm": 0.829624354839325, + "learning_rate": 4.982935198260666e-05, + "loss": 0.463, + "step": 1780 + }, + { + "epoch": 0.1325708344164284, + "grad_norm": 1.1589818000793457, + "learning_rate": 4.982759385489453e-05, + "loss": 0.4614, + "step": 1785 + }, + { + "epoch": 0.13294218129154442, + "grad_norm": 0.9396231770515442, + "learning_rate": 4.9825826748181225e-05, + "loss": 0.4658, + "step": 1790 + }, + { + "epoch": 0.13331352816666048, + "grad_norm": 1.2056121826171875, + "learning_rate": 4.982405066310584e-05, + "loss": 0.4618, + "step": 1795 + }, + { + "epoch": 0.13368487504177654, + "grad_norm": 1.2107425928115845, + "learning_rate": 4.982226560031069e-05, + "loss": 0.4813, + "step": 1800 + }, + { + "epoch": 0.13405622191689257, + "grad_norm": 1.4222759008407593, + "learning_rate": 4.9820471560441365e-05, + "loss": 0.4693, + "step": 1805 + }, + { + "epoch": 0.13442756879200862, + "grad_norm": 1.177914023399353, + "learning_rate": 4.9818668544146675e-05, + "loss": 0.4696, + "step": 1810 + }, + { + "epoch": 0.13479891566712465, + "grad_norm": 1.003184199333191, + "learning_rate": 4.981685655207869e-05, + "loss": 0.4646, + "step": 1815 + }, + { + "epoch": 0.1351702625422407, + "grad_norm": 1.2326256036758423, + "learning_rate": 4.981503558489272e-05, + "loss": 0.4656, + "step": 1820 + }, + { + "epoch": 0.13554160941735674, + "grad_norm": 0.9597446322441101, + "learning_rate": 4.9813205643247343e-05, + "loss": 0.4609, + "step": 1825 + }, + { + "epoch": 0.1359129562924728, + "grad_norm": 1.1886649131774902, + "learning_rate": 4.981136672780434e-05, + "loss": 0.472, + "step": 1830 + }, + { + "epoch": 0.13628430316758885, + "grad_norm": 0.9577616453170776, + "learning_rate": 4.9809518839228786e-05, + "loss": 0.4688, + "step": 1835 + }, + { + "epoch": 0.13665565004270488, + "grad_norm": 1.0367263555526733, + "learning_rate": 4.9807661978188966e-05, + "loss": 0.4562, + "step": 1840 + }, + { + "epoch": 0.13702699691782094, + "grad_norm": 0.9544695019721985, + "learning_rate": 4.980579614535642e-05, + "loss": 0.4729, + "step": 1845 + }, + { + "epoch": 0.13739834379293697, + "grad_norm": 1.0527535676956177, + "learning_rate": 4.9803921341405943e-05, + "loss": 0.4644, + "step": 1850 + }, + { + "epoch": 0.13776969066805303, + "grad_norm": 1.109802007675171, + "learning_rate": 4.980203756701555e-05, + "loss": 0.4876, + "step": 1855 + }, + { + "epoch": 0.1381410375431691, + "grad_norm": 1.127073049545288, + "learning_rate": 4.9800144822866526e-05, + "loss": 0.4829, + "step": 1860 + }, + { + "epoch": 0.13851238441828512, + "grad_norm": 1.0131291151046753, + "learning_rate": 4.979824310964339e-05, + "loss": 0.4933, + "step": 1865 + }, + { + "epoch": 0.13888373129340117, + "grad_norm": 0.9607428908348083, + "learning_rate": 4.979633242803391e-05, + "loss": 0.4698, + "step": 1870 + }, + { + "epoch": 
0.1392550781685172, + "grad_norm": 0.9015031456947327, + "learning_rate": 4.979441277872907e-05, + "loss": 0.4914, + "step": 1875 + }, + { + "epoch": 0.13962642504363326, + "grad_norm": 1.4103434085845947, + "learning_rate": 4.9792484162423146e-05, + "loss": 0.4834, + "step": 1880 + }, + { + "epoch": 0.1399977719187493, + "grad_norm": 0.8119305372238159, + "learning_rate": 4.979054657981361e-05, + "loss": 0.4593, + "step": 1885 + }, + { + "epoch": 0.14036911879386535, + "grad_norm": 0.9538850784301758, + "learning_rate": 4.978860003160121e-05, + "loss": 0.4475, + "step": 1890 + }, + { + "epoch": 0.1407404656689814, + "grad_norm": 0.964397668838501, + "learning_rate": 4.978664451848992e-05, + "loss": 0.4568, + "step": 1895 + }, + { + "epoch": 0.14111181254409744, + "grad_norm": 0.9824573397636414, + "learning_rate": 4.9784680041186946e-05, + "loss": 0.4499, + "step": 1900 + }, + { + "epoch": 0.1414831594192135, + "grad_norm": 2.0519871711730957, + "learning_rate": 4.9782706600402774e-05, + "loss": 0.4711, + "step": 1905 + }, + { + "epoch": 0.14185450629432952, + "grad_norm": 1.1351428031921387, + "learning_rate": 4.978072419685108e-05, + "loss": 0.4851, + "step": 1910 + }, + { + "epoch": 0.14222585316944558, + "grad_norm": 1.1103254556655884, + "learning_rate": 4.977873283124883e-05, + "loss": 0.4622, + "step": 1915 + }, + { + "epoch": 0.1425972000445616, + "grad_norm": 1.0085701942443848, + "learning_rate": 4.97767325043162e-05, + "loss": 0.4777, + "step": 1920 + }, + { + "epoch": 0.14296854691967767, + "grad_norm": 0.9454324245452881, + "learning_rate": 4.977472321677662e-05, + "loss": 0.4542, + "step": 1925 + }, + { + "epoch": 0.14333989379479373, + "grad_norm": 1.1343188285827637, + "learning_rate": 4.9772704969356746e-05, + "loss": 0.4716, + "step": 1930 + }, + { + "epoch": 0.14371124066990976, + "grad_norm": 1.3752365112304688, + "learning_rate": 4.97706777627865e-05, + "loss": 0.4778, + "step": 1935 + }, + { + "epoch": 0.14408258754502581, + "grad_norm": 1.1367682218551636, + "learning_rate": 4.976864159779903e-05, + "loss": 0.4674, + "step": 1940 + }, + { + "epoch": 0.14445393442014184, + "grad_norm": 1.1333668231964111, + "learning_rate": 4.97665964751307e-05, + "loss": 0.4848, + "step": 1945 + }, + { + "epoch": 0.1448252812952579, + "grad_norm": 0.9810808300971985, + "learning_rate": 4.976454239552117e-05, + "loss": 0.4754, + "step": 1950 + }, + { + "epoch": 0.14519662817037396, + "grad_norm": 1.1838676929473877, + "learning_rate": 4.976247935971328e-05, + "loss": 0.4632, + "step": 1955 + }, + { + "epoch": 0.14556797504549, + "grad_norm": 0.9803593754768372, + "learning_rate": 4.976040736845316e-05, + "loss": 0.4817, + "step": 1960 + }, + { + "epoch": 0.14593932192060605, + "grad_norm": 1.4662213325500488, + "learning_rate": 4.975832642249012e-05, + "loss": 0.4718, + "step": 1965 + }, + { + "epoch": 0.14631066879572208, + "grad_norm": 1.4118518829345703, + "learning_rate": 4.975623652257677e-05, + "loss": 0.4812, + "step": 1970 + }, + { + "epoch": 0.14668201567083813, + "grad_norm": 0.9826330542564392, + "learning_rate": 4.975413766946893e-05, + "loss": 0.4664, + "step": 1975 + }, + { + "epoch": 0.14705336254595416, + "grad_norm": 1.0626474618911743, + "learning_rate": 4.975202986392565e-05, + "loss": 0.4695, + "step": 1980 + }, + { + "epoch": 0.14742470942107022, + "grad_norm": 0.8978970646858215, + "learning_rate": 4.974991310670922e-05, + "loss": 0.4836, + "step": 1985 + }, + { + "epoch": 0.14779605629618628, + "grad_norm": 0.993985116481781, + "learning_rate": 
4.97477873985852e-05, + "loss": 0.4761, + "step": 1990 + }, + { + "epoch": 0.1481674031713023, + "grad_norm": 0.9552356004714966, + "learning_rate": 4.974565274032233e-05, + "loss": 0.4765, + "step": 1995 + }, + { + "epoch": 0.14853875004641837, + "grad_norm": 0.9018028974533081, + "learning_rate": 4.974350913269263e-05, + "loss": 0.4664, + "step": 2000 + }, + { + "epoch": 0.1489100969215344, + "grad_norm": 0.978186845779419, + "learning_rate": 4.9741356576471356e-05, + "loss": 0.4541, + "step": 2005 + }, + { + "epoch": 0.14928144379665045, + "grad_norm": 1.0590962171554565, + "learning_rate": 4.973919507243697e-05, + "loss": 0.4698, + "step": 2010 + }, + { + "epoch": 0.14965279067176648, + "grad_norm": 0.9982547163963318, + "learning_rate": 4.973702462137121e-05, + "loss": 0.4669, + "step": 2015 + }, + { + "epoch": 0.15002413754688254, + "grad_norm": 1.0058673620224, + "learning_rate": 4.9734845224059e-05, + "loss": 0.4572, + "step": 2020 + }, + { + "epoch": 0.1503954844219986, + "grad_norm": 0.9145234823226929, + "learning_rate": 4.973265688128855e-05, + "loss": 0.4683, + "step": 2025 + }, + { + "epoch": 0.15076683129711463, + "grad_norm": 0.8498237133026123, + "learning_rate": 4.9730459593851285e-05, + "loss": 0.4686, + "step": 2030 + }, + { + "epoch": 0.15113817817223069, + "grad_norm": 1.5811172723770142, + "learning_rate": 4.972825336254185e-05, + "loss": 0.4608, + "step": 2035 + }, + { + "epoch": 0.15150952504734672, + "grad_norm": 1.1382615566253662, + "learning_rate": 4.972603818815814e-05, + "loss": 0.4698, + "step": 2040 + }, + { + "epoch": 0.15188087192246277, + "grad_norm": 1.0171360969543457, + "learning_rate": 4.9723814071501295e-05, + "loss": 0.4559, + "step": 2045 + }, + { + "epoch": 0.15225221879757883, + "grad_norm": 1.2327966690063477, + "learning_rate": 4.972158101337566e-05, + "loss": 0.4462, + "step": 2050 + }, + { + "epoch": 0.15262356567269486, + "grad_norm": 1.284108281135559, + "learning_rate": 4.971933901458883e-05, + "loss": 0.4561, + "step": 2055 + }, + { + "epoch": 0.15299491254781092, + "grad_norm": 1.175051212310791, + "learning_rate": 4.971708807595165e-05, + "loss": 0.4632, + "step": 2060 + }, + { + "epoch": 0.15336625942292695, + "grad_norm": 1.0316301584243774, + "learning_rate": 4.9714828198278165e-05, + "loss": 0.4601, + "step": 2065 + }, + { + "epoch": 0.153737606298043, + "grad_norm": 0.9318085312843323, + "learning_rate": 4.971255938238567e-05, + "loss": 0.4735, + "step": 2070 + }, + { + "epoch": 0.15410895317315904, + "grad_norm": 0.8723039031028748, + "learning_rate": 4.971028162909469e-05, + "loss": 0.4442, + "step": 2075 + }, + { + "epoch": 0.1544803000482751, + "grad_norm": 1.4766545295715332, + "learning_rate": 4.9707994939228997e-05, + "loss": 0.4601, + "step": 2080 + }, + { + "epoch": 0.15485164692339115, + "grad_norm": 1.0068985223770142, + "learning_rate": 4.970569931361557e-05, + "loss": 0.4633, + "step": 2085 + }, + { + "epoch": 0.15522299379850718, + "grad_norm": 0.9444260001182556, + "learning_rate": 4.970339475308464e-05, + "loss": 0.458, + "step": 2090 + }, + { + "epoch": 0.15559434067362324, + "grad_norm": 0.7326513528823853, + "learning_rate": 4.970108125846966e-05, + "loss": 0.4568, + "step": 2095 + }, + { + "epoch": 0.15596568754873927, + "grad_norm": 1.3498059511184692, + "learning_rate": 4.969875883060731e-05, + "loss": 0.4804, + "step": 2100 + }, + { + "epoch": 0.15633703442385533, + "grad_norm": 1.2519272565841675, + "learning_rate": 4.9696427470337495e-05, + "loss": 0.454, + "step": 2105 + }, + { + "epoch": 
0.15670838129897136, + "grad_norm": 0.9399036169052124, + "learning_rate": 4.969408717850338e-05, + "loss": 0.4411, + "step": 2110 + }, + { + "epoch": 0.1570797281740874, + "grad_norm": 1.2885676622390747, + "learning_rate": 4.9691737955951334e-05, + "loss": 0.4688, + "step": 2115 + }, + { + "epoch": 0.15745107504920347, + "grad_norm": 0.9798744320869446, + "learning_rate": 4.968937980353097e-05, + "loss": 0.469, + "step": 2120 + }, + { + "epoch": 0.1578224219243195, + "grad_norm": 0.9649677872657776, + "learning_rate": 4.9687012722095106e-05, + "loss": 0.468, + "step": 2125 + }, + { + "epoch": 0.15819376879943556, + "grad_norm": 1.1075035333633423, + "learning_rate": 4.9684636712499816e-05, + "loss": 0.437, + "step": 2130 + }, + { + "epoch": 0.1585651156745516, + "grad_norm": 1.0285245180130005, + "learning_rate": 4.9682251775604397e-05, + "loss": 0.4469, + "step": 2135 + }, + { + "epoch": 0.15893646254966765, + "grad_norm": 1.5093187093734741, + "learning_rate": 4.9679857912271365e-05, + "loss": 0.4684, + "step": 2140 + }, + { + "epoch": 0.1593078094247837, + "grad_norm": 1.210630178451538, + "learning_rate": 4.967745512336648e-05, + "loss": 0.466, + "step": 2145 + }, + { + "epoch": 0.15967915629989973, + "grad_norm": 1.0899977684020996, + "learning_rate": 4.967504340975871e-05, + "loss": 0.4438, + "step": 2150 + }, + { + "epoch": 0.1600505031750158, + "grad_norm": 0.985546350479126, + "learning_rate": 4.9672622772320274e-05, + "loss": 0.463, + "step": 2155 + }, + { + "epoch": 0.16042185005013182, + "grad_norm": 1.167546033859253, + "learning_rate": 4.967019321192659e-05, + "loss": 0.4543, + "step": 2160 + }, + { + "epoch": 0.16079319692524788, + "grad_norm": 1.2297065258026123, + "learning_rate": 4.966775472945633e-05, + "loss": 0.4369, + "step": 2165 + }, + { + "epoch": 0.1611645438003639, + "grad_norm": 1.4039182662963867, + "learning_rate": 4.9665307325791375e-05, + "loss": 0.4446, + "step": 2170 + }, + { + "epoch": 0.16153589067547997, + "grad_norm": 0.9450597167015076, + "learning_rate": 4.966285100181684e-05, + "loss": 0.4493, + "step": 2175 + }, + { + "epoch": 0.16190723755059602, + "grad_norm": 0.9438601136207581, + "learning_rate": 4.966038575842107e-05, + "loss": 0.4618, + "step": 2180 + }, + { + "epoch": 0.16227858442571205, + "grad_norm": 0.9899676442146301, + "learning_rate": 4.965791159649562e-05, + "loss": 0.4851, + "step": 2185 + }, + { + "epoch": 0.1626499313008281, + "grad_norm": 1.3799790143966675, + "learning_rate": 4.965542851693528e-05, + "loss": 0.4661, + "step": 2190 + }, + { + "epoch": 0.16302127817594414, + "grad_norm": 0.9580912590026855, + "learning_rate": 4.965293652063808e-05, + "loss": 0.4495, + "step": 2195 + }, + { + "epoch": 0.1633926250510602, + "grad_norm": 1.1697057485580444, + "learning_rate": 4.9650435608505255e-05, + "loss": 0.4805, + "step": 2200 + }, + { + "epoch": 0.16376397192617625, + "grad_norm": 0.8966139554977417, + "learning_rate": 4.964792578144126e-05, + "loss": 0.4365, + "step": 2205 + }, + { + "epoch": 0.16413531880129228, + "grad_norm": 0.918121874332428, + "learning_rate": 4.96454070403538e-05, + "loss": 0.4669, + "step": 2210 + }, + { + "epoch": 0.16450666567640834, + "grad_norm": 0.9863184690475464, + "learning_rate": 4.964287938615378e-05, + "loss": 0.4437, + "step": 2215 + }, + { + "epoch": 0.16487801255152437, + "grad_norm": 1.2543003559112549, + "learning_rate": 4.964034281975534e-05, + "loss": 0.471, + "step": 2220 + }, + { + "epoch": 0.16524935942664043, + "grad_norm": 0.7972759008407593, + "learning_rate": 
4.963779734207582e-05, + "loss": 0.448, + "step": 2225 + }, + { + "epoch": 0.16562070630175646, + "grad_norm": 1.0038282871246338, + "learning_rate": 4.963524295403583e-05, + "loss": 0.4569, + "step": 2230 + }, + { + "epoch": 0.16599205317687252, + "grad_norm": 1.3752259016036987, + "learning_rate": 4.9632679656559164e-05, + "loss": 0.4662, + "step": 2235 + }, + { + "epoch": 0.16636340005198857, + "grad_norm": 1.423388123512268, + "learning_rate": 4.963010745057285e-05, + "loss": 0.4842, + "step": 2240 + }, + { + "epoch": 0.1667347469271046, + "grad_norm": 1.0444741249084473, + "learning_rate": 4.962752633700714e-05, + "loss": 0.4554, + "step": 2245 + }, + { + "epoch": 0.16710609380222066, + "grad_norm": 0.9717627167701721, + "learning_rate": 4.962493631679549e-05, + "loss": 0.4636, + "step": 2250 + }, + { + "epoch": 0.1674774406773367, + "grad_norm": 1.248321771621704, + "learning_rate": 4.9622337390874604e-05, + "loss": 0.4451, + "step": 2255 + }, + { + "epoch": 0.16784878755245275, + "grad_norm": 0.9144636988639832, + "learning_rate": 4.9619729560184394e-05, + "loss": 0.4491, + "step": 2260 + }, + { + "epoch": 0.16822013442756878, + "grad_norm": 0.9530754685401917, + "learning_rate": 4.961711282566799e-05, + "loss": 0.4665, + "step": 2265 + }, + { + "epoch": 0.16859148130268484, + "grad_norm": 1.0826373100280762, + "learning_rate": 4.961448718827175e-05, + "loss": 0.4386, + "step": 2270 + }, + { + "epoch": 0.1689628281778009, + "grad_norm": 0.8597514033317566, + "learning_rate": 4.961185264894523e-05, + "loss": 0.4703, + "step": 2275 + }, + { + "epoch": 0.16933417505291692, + "grad_norm": 0.9985958933830261, + "learning_rate": 4.960920920864124e-05, + "loss": 0.4422, + "step": 2280 + }, + { + "epoch": 0.16970552192803298, + "grad_norm": 1.0870766639709473, + "learning_rate": 4.960655686831579e-05, + "loss": 0.4793, + "step": 2285 + }, + { + "epoch": 0.170076868803149, + "grad_norm": 1.083158016204834, + "learning_rate": 4.96038956289281e-05, + "loss": 0.4528, + "step": 2290 + }, + { + "epoch": 0.17044821567826507, + "grad_norm": 1.491381049156189, + "learning_rate": 4.960122549144062e-05, + "loss": 0.4299, + "step": 2295 + }, + { + "epoch": 0.17081956255338113, + "grad_norm": 0.8504036068916321, + "learning_rate": 4.959854645681902e-05, + "loss": 0.4569, + "step": 2300 + }, + { + "epoch": 0.17119090942849716, + "grad_norm": 1.2399927377700806, + "learning_rate": 4.959585852603218e-05, + "loss": 0.4547, + "step": 2305 + }, + { + "epoch": 0.17156225630361321, + "grad_norm": 0.8506380915641785, + "learning_rate": 4.959316170005221e-05, + "loss": 0.4448, + "step": 2310 + }, + { + "epoch": 0.17193360317872924, + "grad_norm": 1.13820219039917, + "learning_rate": 4.9590455979854424e-05, + "loss": 0.462, + "step": 2315 + }, + { + "epoch": 0.1723049500538453, + "grad_norm": 1.0776944160461426, + "learning_rate": 4.9587741366417344e-05, + "loss": 0.4556, + "step": 2320 + }, + { + "epoch": 0.17267629692896133, + "grad_norm": 1.0773634910583496, + "learning_rate": 4.9585017860722735e-05, + "loss": 0.4379, + "step": 2325 + }, + { + "epoch": 0.1730476438040774, + "grad_norm": 1.044816255569458, + "learning_rate": 4.958228546375557e-05, + "loss": 0.4758, + "step": 2330 + }, + { + "epoch": 0.17341899067919345, + "grad_norm": 0.9068227410316467, + "learning_rate": 4.957954417650401e-05, + "loss": 0.4528, + "step": 2335 + }, + { + "epoch": 0.17379033755430948, + "grad_norm": 0.9294278621673584, + "learning_rate": 4.957679399995948e-05, + "loss": 0.4642, + "step": 2340 + }, + { + "epoch": 
0.17416168442942553, + "grad_norm": 0.9256554245948792, + "learning_rate": 4.9574034935116564e-05, + "loss": 0.4621, + "step": 2345 + }, + { + "epoch": 0.17453303130454156, + "grad_norm": 1.2020440101623535, + "learning_rate": 4.957126698297311e-05, + "loss": 0.4491, + "step": 2350 + }, + { + "epoch": 0.17490437817965762, + "grad_norm": 0.8281359672546387, + "learning_rate": 4.9568490144530144e-05, + "loss": 0.4485, + "step": 2355 + }, + { + "epoch": 0.17527572505477365, + "grad_norm": 0.8755785822868347, + "learning_rate": 4.956570442079194e-05, + "loss": 0.457, + "step": 2360 + }, + { + "epoch": 0.1756470719298897, + "grad_norm": 1.081264853477478, + "learning_rate": 4.956290981276595e-05, + "loss": 0.4571, + "step": 2365 + }, + { + "epoch": 0.17601841880500577, + "grad_norm": 0.8732466697692871, + "learning_rate": 4.9560106321462864e-05, + "loss": 0.4475, + "step": 2370 + }, + { + "epoch": 0.1763897656801218, + "grad_norm": 0.7618255019187927, + "learning_rate": 4.955729394789657e-05, + "loss": 0.4696, + "step": 2375 + }, + { + "epoch": 0.17676111255523785, + "grad_norm": 0.9234897494316101, + "learning_rate": 4.955447269308418e-05, + "loss": 0.4112, + "step": 2380 + }, + { + "epoch": 0.17713245943035388, + "grad_norm": 1.0670307874679565, + "learning_rate": 4.955164255804601e-05, + "loss": 0.4223, + "step": 2385 + }, + { + "epoch": 0.17750380630546994, + "grad_norm": 1.4194025993347168, + "learning_rate": 4.954880354380559e-05, + "loss": 0.4294, + "step": 2390 + }, + { + "epoch": 0.177875153180586, + "grad_norm": 0.8653207421302795, + "learning_rate": 4.954595565138966e-05, + "loss": 0.4607, + "step": 2395 + }, + { + "epoch": 0.17824650005570203, + "grad_norm": 1.3651710748672485, + "learning_rate": 4.954309888182818e-05, + "loss": 0.4508, + "step": 2400 + }, + { + "epoch": 0.17861784693081809, + "grad_norm": 1.020920991897583, + "learning_rate": 4.954023323615429e-05, + "loss": 0.4659, + "step": 2405 + }, + { + "epoch": 0.17898919380593412, + "grad_norm": 1.022684097290039, + "learning_rate": 4.953735871540439e-05, + "loss": 0.4464, + "step": 2410 + }, + { + "epoch": 0.17936054068105017, + "grad_norm": 1.0919467210769653, + "learning_rate": 4.953447532061805e-05, + "loss": 0.4756, + "step": 2415 + }, + { + "epoch": 0.1797318875561662, + "grad_norm": 0.9148445129394531, + "learning_rate": 4.9531583052838056e-05, + "loss": 0.4427, + "step": 2420 + }, + { + "epoch": 0.18010323443128226, + "grad_norm": 0.9530015587806702, + "learning_rate": 4.952868191311042e-05, + "loss": 0.445, + "step": 2425 + }, + { + "epoch": 0.18047458130639832, + "grad_norm": 1.0264039039611816, + "learning_rate": 4.952577190248434e-05, + "loss": 0.4512, + "step": 2430 + }, + { + "epoch": 0.18084592818151435, + "grad_norm": 1.0021376609802246, + "learning_rate": 4.9522853022012236e-05, + "loss": 0.4604, + "step": 2435 + }, + { + "epoch": 0.1812172750566304, + "grad_norm": 1.5464372634887695, + "learning_rate": 4.951992527274975e-05, + "loss": 0.4836, + "step": 2440 + }, + { + "epoch": 0.18158862193174644, + "grad_norm": 1.0679705142974854, + "learning_rate": 4.951698865575569e-05, + "loss": 0.4829, + "step": 2445 + }, + { + "epoch": 0.1819599688068625, + "grad_norm": 1.0598608255386353, + "learning_rate": 4.951404317209211e-05, + "loss": 0.4587, + "step": 2450 + }, + { + "epoch": 0.18233131568197852, + "grad_norm": 1.095999002456665, + "learning_rate": 4.9511088822824246e-05, + "loss": 0.4487, + "step": 2455 + }, + { + "epoch": 0.18270266255709458, + "grad_norm": 0.9862808585166931, + "learning_rate": 
4.950812560902056e-05, + "loss": 0.4516, + "step": 2460 + }, + { + "epoch": 0.18307400943221064, + "grad_norm": 1.0196229219436646, + "learning_rate": 4.9505153531752715e-05, + "loss": 0.4521, + "step": 2465 + }, + { + "epoch": 0.18344535630732667, + "grad_norm": 1.2567567825317383, + "learning_rate": 4.950217259209555e-05, + "loss": 0.4607, + "step": 2470 + }, + { + "epoch": 0.18381670318244273, + "grad_norm": 1.211969256401062, + "learning_rate": 4.9499182791127166e-05, + "loss": 0.4295, + "step": 2475 + }, + { + "epoch": 0.18418805005755876, + "grad_norm": 0.8847858905792236, + "learning_rate": 4.9496184129928815e-05, + "loss": 0.4231, + "step": 2480 + }, + { + "epoch": 0.1845593969326748, + "grad_norm": 1.00296151638031, + "learning_rate": 4.949317660958499e-05, + "loss": 0.4302, + "step": 2485 + }, + { + "epoch": 0.18493074380779087, + "grad_norm": 1.3998608589172363, + "learning_rate": 4.949016023118336e-05, + "loss": 0.4551, + "step": 2490 + }, + { + "epoch": 0.1853020906829069, + "grad_norm": 1.3031240701675415, + "learning_rate": 4.948713499581481e-05, + "loss": 0.475, + "step": 2495 + }, + { + "epoch": 0.18567343755802296, + "grad_norm": 1.0290464162826538, + "learning_rate": 4.9484100904573436e-05, + "loss": 0.4522, + "step": 2500 + }, + { + "epoch": 0.186044784433139, + "grad_norm": 1.1162320375442505, + "learning_rate": 4.948105795855653e-05, + "loss": 0.4417, + "step": 2505 + }, + { + "epoch": 0.18641613130825505, + "grad_norm": 1.0466334819793701, + "learning_rate": 4.9478006158864586e-05, + "loss": 0.4514, + "step": 2510 + }, + { + "epoch": 0.18678747818337108, + "grad_norm": 0.9525560140609741, + "learning_rate": 4.94749455066013e-05, + "loss": 0.4335, + "step": 2515 + }, + { + "epoch": 0.18715882505848713, + "grad_norm": 0.918469250202179, + "learning_rate": 4.947187600287356e-05, + "loss": 0.4488, + "step": 2520 + }, + { + "epoch": 0.1875301719336032, + "grad_norm": 0.9201804995536804, + "learning_rate": 4.946879764879149e-05, + "loss": 0.4598, + "step": 2525 + }, + { + "epoch": 0.18790151880871922, + "grad_norm": 0.810072660446167, + "learning_rate": 4.946571044546835e-05, + "loss": 0.4531, + "step": 2530 + }, + { + "epoch": 0.18827286568383528, + "grad_norm": 1.2535103559494019, + "learning_rate": 4.946261439402067e-05, + "loss": 0.4832, + "step": 2535 + }, + { + "epoch": 0.1886442125589513, + "grad_norm": 0.7918863892555237, + "learning_rate": 4.945950949556814e-05, + "loss": 0.4433, + "step": 2540 + }, + { + "epoch": 0.18901555943406737, + "grad_norm": 0.9122186899185181, + "learning_rate": 4.945639575123366e-05, + "loss": 0.451, + "step": 2545 + }, + { + "epoch": 0.1893869063091834, + "grad_norm": 0.9057549834251404, + "learning_rate": 4.9453273162143325e-05, + "loss": 0.4345, + "step": 2550 + }, + { + "epoch": 0.18975825318429945, + "grad_norm": 0.932822585105896, + "learning_rate": 4.945014172942644e-05, + "loss": 0.4801, + "step": 2555 + }, + { + "epoch": 0.1901296000594155, + "grad_norm": 1.1034599542617798, + "learning_rate": 4.9447001454215494e-05, + "loss": 0.4444, + "step": 2560 + }, + { + "epoch": 0.19050094693453154, + "grad_norm": 1.0966523885726929, + "learning_rate": 4.944385233764618e-05, + "loss": 0.4456, + "step": 2565 + }, + { + "epoch": 0.1908722938096476, + "grad_norm": 1.0594526529312134, + "learning_rate": 4.94406943808574e-05, + "loss": 0.4359, + "step": 2570 + }, + { + "epoch": 0.19124364068476363, + "grad_norm": 0.8708404898643494, + "learning_rate": 4.9437527584991226e-05, + "loss": 0.4377, + "step": 2575 + }, + { + "epoch": 
0.19161498755987968, + "grad_norm": 0.972554087638855, + "learning_rate": 4.9434351951192956e-05, + "loss": 0.444, + "step": 2580 + }, + { + "epoch": 0.19198633443499574, + "grad_norm": 1.0811034440994263, + "learning_rate": 4.943116748061106e-05, + "loss": 0.4336, + "step": 2585 + }, + { + "epoch": 0.19235768131011177, + "grad_norm": 0.8871290683746338, + "learning_rate": 4.942797417439722e-05, + "loss": 0.4537, + "step": 2590 + }, + { + "epoch": 0.19272902818522783, + "grad_norm": 1.0862644910812378, + "learning_rate": 4.942477203370631e-05, + "loss": 0.4406, + "step": 2595 + }, + { + "epoch": 0.19310037506034386, + "grad_norm": 0.9374284744262695, + "learning_rate": 4.9421561059696397e-05, + "loss": 0.4473, + "step": 2600 + }, + { + "epoch": 0.19347172193545992, + "grad_norm": 1.007503867149353, + "learning_rate": 4.941834125352874e-05, + "loss": 0.447, + "step": 2605 + }, + { + "epoch": 0.19384306881057595, + "grad_norm": 1.0377479791641235, + "learning_rate": 4.941511261636779e-05, + "loss": 0.4608, + "step": 2610 + }, + { + "epoch": 0.194214415685692, + "grad_norm": 1.0334117412567139, + "learning_rate": 4.941187514938122e-05, + "loss": 0.4527, + "step": 2615 + }, + { + "epoch": 0.19458576256080806, + "grad_norm": 1.2403839826583862, + "learning_rate": 4.940862885373984e-05, + "loss": 0.4355, + "step": 2620 + }, + { + "epoch": 0.1949571094359241, + "grad_norm": 0.9511928558349609, + "learning_rate": 4.940537373061771e-05, + "loss": 0.4372, + "step": 2625 + }, + { + "epoch": 0.19532845631104015, + "grad_norm": 0.9966602325439453, + "learning_rate": 4.940210978119205e-05, + "loss": 0.4451, + "step": 2630 + }, + { + "epoch": 0.19569980318615618, + "grad_norm": 0.8612943291664124, + "learning_rate": 4.9398837006643286e-05, + "loss": 0.4587, + "step": 2635 + }, + { + "epoch": 0.19607115006127224, + "grad_norm": 1.1878662109375, + "learning_rate": 4.939555540815502e-05, + "loss": 0.468, + "step": 2640 + }, + { + "epoch": 0.19644249693638827, + "grad_norm": 0.9826651215553284, + "learning_rate": 4.939226498691407e-05, + "loss": 0.4819, + "step": 2645 + }, + { + "epoch": 0.19681384381150432, + "grad_norm": 1.0273140668869019, + "learning_rate": 4.9388965744110416e-05, + "loss": 0.4426, + "step": 2650 + }, + { + "epoch": 0.19718519068662038, + "grad_norm": 1.0519393682479858, + "learning_rate": 4.9385657680937246e-05, + "loss": 0.456, + "step": 2655 + }, + { + "epoch": 0.1975565375617364, + "grad_norm": 0.8215546607971191, + "learning_rate": 4.938234079859094e-05, + "loss": 0.4444, + "step": 2660 + }, + { + "epoch": 0.19792788443685247, + "grad_norm": 0.9723137021064758, + "learning_rate": 4.937901509827107e-05, + "loss": 0.4561, + "step": 2665 + }, + { + "epoch": 0.1982992313119685, + "grad_norm": 1.0705112218856812, + "learning_rate": 4.9375680581180375e-05, + "loss": 0.4512, + "step": 2670 + }, + { + "epoch": 0.19867057818708456, + "grad_norm": 0.9131028652191162, + "learning_rate": 4.937233724852479e-05, + "loss": 0.4496, + "step": 2675 + }, + { + "epoch": 0.19904192506220061, + "grad_norm": 0.9114557504653931, + "learning_rate": 4.936898510151346e-05, + "loss": 0.4638, + "step": 2680 + }, + { + "epoch": 0.19941327193731664, + "grad_norm": 0.7485722899436951, + "learning_rate": 4.9365624141358706e-05, + "loss": 0.4399, + "step": 2685 + }, + { + "epoch": 0.1997846188124327, + "grad_norm": 1.0070992708206177, + "learning_rate": 4.936225436927602e-05, + "loss": 0.4228, + "step": 2690 + }, + { + "epoch": 0.20015596568754873, + "grad_norm": 1.330584168434143, + "learning_rate": 
4.93588757864841e-05, + "loss": 0.438, + "step": 2695 + }, + { + "epoch": 0.2005273125626648, + "grad_norm": 1.0673860311508179, + "learning_rate": 4.935548839420483e-05, + "loss": 0.4643, + "step": 2700 + }, + { + "epoch": 0.20089865943778082, + "grad_norm": 1.4085111618041992, + "learning_rate": 4.935209219366326e-05, + "loss": 0.4395, + "step": 2705 + }, + { + "epoch": 0.20127000631289688, + "grad_norm": 0.8586167693138123, + "learning_rate": 4.934868718608765e-05, + "loss": 0.4578, + "step": 2710 + }, + { + "epoch": 0.20164135318801293, + "grad_norm": 1.0641870498657227, + "learning_rate": 4.9345273372709435e-05, + "loss": 0.446, + "step": 2715 + }, + { + "epoch": 0.20201270006312896, + "grad_norm": 0.8288577198982239, + "learning_rate": 4.934185075476323e-05, + "loss": 0.4267, + "step": 2720 + }, + { + "epoch": 0.20238404693824502, + "grad_norm": 0.9544481039047241, + "learning_rate": 4.933841933348684e-05, + "loss": 0.4445, + "step": 2725 + }, + { + "epoch": 0.20275539381336105, + "grad_norm": 0.9621331095695496, + "learning_rate": 4.933497911012126e-05, + "loss": 0.4276, + "step": 2730 + }, + { + "epoch": 0.2031267406884771, + "grad_norm": 0.9601415395736694, + "learning_rate": 4.933153008591065e-05, + "loss": 0.4706, + "step": 2735 + }, + { + "epoch": 0.20349808756359314, + "grad_norm": 1.2368193864822388, + "learning_rate": 4.932807226210237e-05, + "loss": 0.4305, + "step": 2740 + }, + { + "epoch": 0.2038694344387092, + "grad_norm": 1.1494923830032349, + "learning_rate": 4.932460563994695e-05, + "loss": 0.4529, + "step": 2745 + }, + { + "epoch": 0.20424078131382525, + "grad_norm": 0.9575107097625732, + "learning_rate": 4.932113022069812e-05, + "loss": 0.4683, + "step": 2750 + }, + { + "epoch": 0.20461212818894128, + "grad_norm": 1.0877399444580078, + "learning_rate": 4.931764600561277e-05, + "loss": 0.4704, + "step": 2755 + }, + { + "epoch": 0.20498347506405734, + "grad_norm": 0.8758722543716431, + "learning_rate": 4.931415299595098e-05, + "loss": 0.4592, + "step": 2760 + }, + { + "epoch": 0.20535482193917337, + "grad_norm": 1.0770483016967773, + "learning_rate": 4.9310651192976014e-05, + "loss": 0.4485, + "step": 2765 + }, + { + "epoch": 0.20572616881428943, + "grad_norm": 0.9201017022132874, + "learning_rate": 4.9307140597954314e-05, + "loss": 0.4404, + "step": 2770 + }, + { + "epoch": 0.20609751568940549, + "grad_norm": 0.8507365584373474, + "learning_rate": 4.9303621212155506e-05, + "loss": 0.4565, + "step": 2775 + }, + { + "epoch": 0.20646886256452152, + "grad_norm": 0.9411526322364807, + "learning_rate": 4.9300093036852384e-05, + "loss": 0.4393, + "step": 2780 + }, + { + "epoch": 0.20684020943963757, + "grad_norm": 1.0703768730163574, + "learning_rate": 4.9296556073320925e-05, + "loss": 0.4297, + "step": 2785 + }, + { + "epoch": 0.2072115563147536, + "grad_norm": 1.1285152435302734, + "learning_rate": 4.92930103228403e-05, + "loss": 0.4269, + "step": 2790 + }, + { + "epoch": 0.20758290318986966, + "grad_norm": 1.2548874616622925, + "learning_rate": 4.928945578669282e-05, + "loss": 0.4486, + "step": 2795 + }, + { + "epoch": 0.2079542500649857, + "grad_norm": 0.824675440788269, + "learning_rate": 4.928589246616402e-05, + "loss": 0.4543, + "step": 2800 + }, + { + "epoch": 0.20832559694010175, + "grad_norm": 1.1596276760101318, + "learning_rate": 4.928232036254257e-05, + "loss": 0.4291, + "step": 2805 + }, + { + "epoch": 0.2086969438152178, + "grad_norm": 0.7505876421928406, + "learning_rate": 4.9278739477120364e-05, + "loss": 0.4275, + "step": 2810 + }, + { + "epoch": 
0.20906829069033384, + "grad_norm": 1.3590447902679443, + "learning_rate": 4.9275149811192414e-05, + "loss": 0.4727, + "step": 2815 + }, + { + "epoch": 0.2094396375654499, + "grad_norm": 0.9690061807632446, + "learning_rate": 4.9271551366056954e-05, + "loss": 0.4522, + "step": 2820 + }, + { + "epoch": 0.20981098444056592, + "grad_norm": 1.082969307899475, + "learning_rate": 4.9267944143015364e-05, + "loss": 0.4563, + "step": 2825 + }, + { + "epoch": 0.21018233131568198, + "grad_norm": 0.8886339664459229, + "learning_rate": 4.926432814337223e-05, + "loss": 0.4441, + "step": 2830 + }, + { + "epoch": 0.21055367819079804, + "grad_norm": 0.8670248985290527, + "learning_rate": 4.926070336843528e-05, + "loss": 0.4481, + "step": 2835 + }, + { + "epoch": 0.21092502506591407, + "grad_norm": 0.8443616032600403, + "learning_rate": 4.925706981951543e-05, + "loss": 0.4475, + "step": 2840 + }, + { + "epoch": 0.21129637194103013, + "grad_norm": 0.9135586023330688, + "learning_rate": 4.925342749792677e-05, + "loss": 0.4618, + "step": 2845 + }, + { + "epoch": 0.21166771881614616, + "grad_norm": 1.1337090730667114, + "learning_rate": 4.924977640498657e-05, + "loss": 0.4454, + "step": 2850 + }, + { + "epoch": 0.2120390656912622, + "grad_norm": 0.8965758085250854, + "learning_rate": 4.9246116542015244e-05, + "loss": 0.4554, + "step": 2855 + }, + { + "epoch": 0.21241041256637824, + "grad_norm": 0.9783238768577576, + "learning_rate": 4.92424479103364e-05, + "loss": 0.441, + "step": 2860 + }, + { + "epoch": 0.2127817594414943, + "grad_norm": 0.8907028436660767, + "learning_rate": 4.9238770511276824e-05, + "loss": 0.436, + "step": 2865 + }, + { + "epoch": 0.21315310631661036, + "grad_norm": 1.0488508939743042, + "learning_rate": 4.9235084346166464e-05, + "loss": 0.4247, + "step": 2870 + }, + { + "epoch": 0.2135244531917264, + "grad_norm": 0.9586307406425476, + "learning_rate": 4.923138941633841e-05, + "loss": 0.4512, + "step": 2875 + }, + { + "epoch": 0.21389580006684245, + "grad_norm": 1.152602195739746, + "learning_rate": 4.922768572312899e-05, + "loss": 0.4569, + "step": 2880 + }, + { + "epoch": 0.21426714694195848, + "grad_norm": 1.1480371952056885, + "learning_rate": 4.922397326787763e-05, + "loss": 0.4287, + "step": 2885 + }, + { + "epoch": 0.21463849381707453, + "grad_norm": 1.0389635562896729, + "learning_rate": 4.9220252051926964e-05, + "loss": 0.4553, + "step": 2890 + }, + { + "epoch": 0.21500984069219056, + "grad_norm": 0.8871331214904785, + "learning_rate": 4.9216522076622776e-05, + "loss": 0.4579, + "step": 2895 + }, + { + "epoch": 0.21538118756730662, + "grad_norm": 0.7509046196937561, + "learning_rate": 4.921278334331404e-05, + "loss": 0.4398, + "step": 2900 + }, + { + "epoch": 0.21575253444242268, + "grad_norm": 1.3356949090957642, + "learning_rate": 4.920903585335288e-05, + "loss": 0.4378, + "step": 2905 + }, + { + "epoch": 0.2161238813175387, + "grad_norm": 0.9830528497695923, + "learning_rate": 4.920527960809459e-05, + "loss": 0.4623, + "step": 2910 + }, + { + "epoch": 0.21649522819265477, + "grad_norm": 1.6082857847213745, + "learning_rate": 4.9201514608897635e-05, + "loss": 0.4316, + "step": 2915 + }, + { + "epoch": 0.2168665750677708, + "grad_norm": 0.8056818842887878, + "learning_rate": 4.9197740857123645e-05, + "loss": 0.4257, + "step": 2920 + }, + { + "epoch": 0.21723792194288685, + "grad_norm": 0.8548802733421326, + "learning_rate": 4.919395835413739e-05, + "loss": 0.4382, + "step": 2925 + }, + { + "epoch": 0.2176092688180029, + "grad_norm": 1.2386173009872437, + "learning_rate": 
4.919016710130685e-05, + "loss": 0.4366, + "step": 2930 + }, + { + "epoch": 0.21798061569311894, + "grad_norm": 1.0536203384399414, + "learning_rate": 4.918636710000315e-05, + "loss": 0.4362, + "step": 2935 + }, + { + "epoch": 0.218351962568235, + "grad_norm": 1.0775947570800781, + "learning_rate": 4.918255835160056e-05, + "loss": 0.4301, + "step": 2940 + }, + { + "epoch": 0.21872330944335103, + "grad_norm": 1.2517681121826172, + "learning_rate": 4.9178740857476545e-05, + "loss": 0.4007, + "step": 2945 + }, + { + "epoch": 0.21909465631846708, + "grad_norm": 0.9359631538391113, + "learning_rate": 4.9174914619011706e-05, + "loss": 0.4347, + "step": 2950 + }, + { + "epoch": 0.21946600319358311, + "grad_norm": 1.2085964679718018, + "learning_rate": 4.9171079637589824e-05, + "loss": 0.447, + "step": 2955 + }, + { + "epoch": 0.21983735006869917, + "grad_norm": 1.3199816942214966, + "learning_rate": 4.9167235914597826e-05, + "loss": 0.4585, + "step": 2960 + }, + { + "epoch": 0.22020869694381523, + "grad_norm": 1.0172200202941895, + "learning_rate": 4.916338345142582e-05, + "loss": 0.427, + "step": 2965 + }, + { + "epoch": 0.22058004381893126, + "grad_norm": 0.9850211143493652, + "learning_rate": 4.915952224946707e-05, + "loss": 0.4373, + "step": 2970 + }, + { + "epoch": 0.22095139069404732, + "grad_norm": 0.7755311727523804, + "learning_rate": 4.9155652310117976e-05, + "loss": 0.4038, + "step": 2975 + }, + { + "epoch": 0.22132273756916335, + "grad_norm": 1.2828562259674072, + "learning_rate": 4.9151773634778145e-05, + "loss": 0.4351, + "step": 2980 + }, + { + "epoch": 0.2216940844442794, + "grad_norm": 0.7583413124084473, + "learning_rate": 4.914788622485028e-05, + "loss": 0.425, + "step": 2985 + }, + { + "epoch": 0.22206543131939543, + "grad_norm": 0.9321625828742981, + "learning_rate": 4.9143990081740314e-05, + "loss": 0.4344, + "step": 2990 + }, + { + "epoch": 0.2224367781945115, + "grad_norm": 1.1756302118301392, + "learning_rate": 4.914008520685729e-05, + "loss": 0.4594, + "step": 2995 + }, + { + "epoch": 0.22280812506962755, + "grad_norm": 1.289210557937622, + "learning_rate": 4.913617160161341e-05, + "loss": 0.4596, + "step": 3000 + }, + { + "epoch": 0.22317947194474358, + "grad_norm": 0.8851030468940735, + "learning_rate": 4.913224926742407e-05, + "loss": 0.4429, + "step": 3005 + }, + { + "epoch": 0.22355081881985964, + "grad_norm": 0.9165136218070984, + "learning_rate": 4.912831820570777e-05, + "loss": 0.444, + "step": 3010 + }, + { + "epoch": 0.22392216569497567, + "grad_norm": 1.077898621559143, + "learning_rate": 4.912437841788622e-05, + "loss": 0.4279, + "step": 3015 + }, + { + "epoch": 0.22429351257009172, + "grad_norm": 0.9308784008026123, + "learning_rate": 4.912042990538424e-05, + "loss": 0.4445, + "step": 3020 + }, + { + "epoch": 0.22466485944520778, + "grad_norm": 0.9209590554237366, + "learning_rate": 4.911647266962984e-05, + "loss": 0.447, + "step": 3025 + }, + { + "epoch": 0.2250362063203238, + "grad_norm": 0.7767472267150879, + "learning_rate": 4.9112506712054165e-05, + "loss": 0.4474, + "step": 3030 + }, + { + "epoch": 0.22540755319543987, + "grad_norm": 1.1049778461456299, + "learning_rate": 4.910853203409152e-05, + "loss": 0.4665, + "step": 3035 + }, + { + "epoch": 0.2257789000705559, + "grad_norm": 0.803619921207428, + "learning_rate": 4.910454863717936e-05, + "loss": 0.4522, + "step": 3040 + }, + { + "epoch": 0.22615024694567196, + "grad_norm": 1.1416994333267212, + "learning_rate": 4.91005565227583e-05, + "loss": 0.4509, + "step": 3045 + }, + { + "epoch": 
0.226521593820788, + "grad_norm": 0.9570928812026978, + "learning_rate": 4.90965556922721e-05, + "loss": 0.4321, + "step": 3050 + }, + { + "epoch": 0.22689294069590404, + "grad_norm": 0.979243814945221, + "learning_rate": 4.9092546147167696e-05, + "loss": 0.4453, + "step": 3055 + }, + { + "epoch": 0.2272642875710201, + "grad_norm": 1.013358473777771, + "learning_rate": 4.908852788889514e-05, + "loss": 0.45, + "step": 3060 + }, + { + "epoch": 0.22763563444613613, + "grad_norm": 1.1085175275802612, + "learning_rate": 4.908450091890765e-05, + "loss": 0.442, + "step": 3065 + }, + { + "epoch": 0.2280069813212522, + "grad_norm": 0.9845399260520935, + "learning_rate": 4.90804652386616e-05, + "loss": 0.4279, + "step": 3070 + }, + { + "epoch": 0.22837832819636822, + "grad_norm": 0.9485452771186829, + "learning_rate": 4.907642084961651e-05, + "loss": 0.4463, + "step": 3075 + }, + { + "epoch": 0.22874967507148428, + "grad_norm": 1.0089235305786133, + "learning_rate": 4.907236775323505e-05, + "loss": 0.4399, + "step": 3080 + }, + { + "epoch": 0.2291210219466003, + "grad_norm": 1.4188969135284424, + "learning_rate": 4.906830595098304e-05, + "loss": 0.4251, + "step": 3085 + }, + { + "epoch": 0.22949236882171636, + "grad_norm": 1.038788080215454, + "learning_rate": 4.9064235444329455e-05, + "loss": 0.4493, + "step": 3090 + }, + { + "epoch": 0.22986371569683242, + "grad_norm": 1.14411461353302, + "learning_rate": 4.9060156234746394e-05, + "loss": 0.4607, + "step": 3095 + }, + { + "epoch": 0.23023506257194845, + "grad_norm": 1.0400830507278442, + "learning_rate": 4.905606832370912e-05, + "loss": 0.427, + "step": 3100 + }, + { + "epoch": 0.2306064094470645, + "grad_norm": 0.7747494578361511, + "learning_rate": 4.905197171269607e-05, + "loss": 0.4404, + "step": 3105 + }, + { + "epoch": 0.23097775632218054, + "grad_norm": 0.9267304539680481, + "learning_rate": 4.904786640318876e-05, + "loss": 0.4644, + "step": 3110 + }, + { + "epoch": 0.2313491031972966, + "grad_norm": 1.1972533464431763, + "learning_rate": 4.904375239667193e-05, + "loss": 0.4521, + "step": 3115 + }, + { + "epoch": 0.23172045007241265, + "grad_norm": 0.8945311307907104, + "learning_rate": 4.903962969463341e-05, + "loss": 0.4202, + "step": 3120 + }, + { + "epoch": 0.23209179694752868, + "grad_norm": 0.8536193370819092, + "learning_rate": 4.9035498298564175e-05, + "loss": 0.4504, + "step": 3125 + }, + { + "epoch": 0.23246314382264474, + "grad_norm": 0.886146605014801, + "learning_rate": 4.903135820995839e-05, + "loss": 0.4499, + "step": 3130 + }, + { + "epoch": 0.23283449069776077, + "grad_norm": 0.8584529161453247, + "learning_rate": 4.9027209430313325e-05, + "loss": 0.4066, + "step": 3135 + }, + { + "epoch": 0.23320583757287683, + "grad_norm": 2.4391167163848877, + "learning_rate": 4.9023051961129404e-05, + "loss": 0.4334, + "step": 3140 + }, + { + "epoch": 0.23357718444799286, + "grad_norm": 1.1037994623184204, + "learning_rate": 4.9018885803910186e-05, + "loss": 0.4382, + "step": 3145 + }, + { + "epoch": 0.23394853132310892, + "grad_norm": 0.8490164279937744, + "learning_rate": 4.9014710960162394e-05, + "loss": 0.4394, + "step": 3150 + }, + { + "epoch": 0.23431987819822497, + "grad_norm": 1.031388282775879, + "learning_rate": 4.901052743139586e-05, + "loss": 0.4654, + "step": 3155 + }, + { + "epoch": 0.234691225073341, + "grad_norm": 0.8660055994987488, + "learning_rate": 4.9006335219123576e-05, + "loss": 0.4347, + "step": 3160 + }, + { + "epoch": 0.23506257194845706, + "grad_norm": 0.9747069478034973, + "learning_rate": 
4.900213432486168e-05, + "loss": 0.4359, + "step": 3165 + }, + { + "epoch": 0.2354339188235731, + "grad_norm": 0.9307319521903992, + "learning_rate": 4.8997924750129444e-05, + "loss": 0.4363, + "step": 3170 + }, + { + "epoch": 0.23580526569868915, + "grad_norm": 1.0029345750808716, + "learning_rate": 4.899370649644928e-05, + "loss": 0.4549, + "step": 3175 + }, + { + "epoch": 0.23617661257380518, + "grad_norm": 1.0878911018371582, + "learning_rate": 4.898947956534672e-05, + "loss": 0.4557, + "step": 3180 + }, + { + "epoch": 0.23654795944892124, + "grad_norm": 1.0974375009536743, + "learning_rate": 4.898524395835047e-05, + "loss": 0.4555, + "step": 3185 + }, + { + "epoch": 0.2369193063240373, + "grad_norm": 1.0777721405029297, + "learning_rate": 4.8980999676992345e-05, + "loss": 0.4376, + "step": 3190 + }, + { + "epoch": 0.23729065319915332, + "grad_norm": 0.9894423484802246, + "learning_rate": 4.897674672280731e-05, + "loss": 0.4296, + "step": 3195 + }, + { + "epoch": 0.23766200007426938, + "grad_norm": 0.9134079813957214, + "learning_rate": 4.897248509733346e-05, + "loss": 0.4359, + "step": 3200 + }, + { + "epoch": 0.2380333469493854, + "grad_norm": 1.1131818294525146, + "learning_rate": 4.8968214802112034e-05, + "loss": 0.4543, + "step": 3205 + }, + { + "epoch": 0.23840469382450147, + "grad_norm": 1.0237282514572144, + "learning_rate": 4.89639358386874e-05, + "loss": 0.43, + "step": 3210 + }, + { + "epoch": 0.23877604069961753, + "grad_norm": 0.9851337671279907, + "learning_rate": 4.895964820860705e-05, + "loss": 0.4319, + "step": 3215 + }, + { + "epoch": 0.23914738757473356, + "grad_norm": 0.8571179509162903, + "learning_rate": 4.8955351913421656e-05, + "loss": 0.4303, + "step": 3220 + }, + { + "epoch": 0.2395187344498496, + "grad_norm": 0.9178614616394043, + "learning_rate": 4.895104695468495e-05, + "loss": 0.4199, + "step": 3225 + }, + { + "epoch": 0.23989008132496564, + "grad_norm": 0.7542501091957092, + "learning_rate": 4.894673333395387e-05, + "loss": 0.4335, + "step": 3230 + }, + { + "epoch": 0.2402614282000817, + "grad_norm": 0.9010379910469055, + "learning_rate": 4.894241105278844e-05, + "loss": 0.4334, + "step": 3235 + }, + { + "epoch": 0.24063277507519773, + "grad_norm": 1.1573511362075806, + "learning_rate": 4.893808011275183e-05, + "loss": 0.4202, + "step": 3240 + }, + { + "epoch": 0.2410041219503138, + "grad_norm": 0.9232560992240906, + "learning_rate": 4.893374051541035e-05, + "loss": 0.4202, + "step": 3245 + }, + { + "epoch": 0.24137546882542985, + "grad_norm": 0.9102317094802856, + "learning_rate": 4.8929392262333426e-05, + "loss": 0.4414, + "step": 3250 + }, + { + "epoch": 0.24174681570054588, + "grad_norm": 0.8203946948051453, + "learning_rate": 4.892503535509363e-05, + "loss": 0.4186, + "step": 3255 + }, + { + "epoch": 0.24211816257566193, + "grad_norm": 0.802352786064148, + "learning_rate": 4.892066979526664e-05, + "loss": 0.4426, + "step": 3260 + }, + { + "epoch": 0.24248950945077796, + "grad_norm": 0.8278765678405762, + "learning_rate": 4.891629558443129e-05, + "loss": 0.4114, + "step": 3265 + }, + { + "epoch": 0.24286085632589402, + "grad_norm": 1.7212625741958618, + "learning_rate": 4.891191272416953e-05, + "loss": 0.438, + "step": 3270 + }, + { + "epoch": 0.24323220320101005, + "grad_norm": 5.098775386810303, + "learning_rate": 4.8907521216066446e-05, + "loss": 0.458, + "step": 3275 + }, + { + "epoch": 0.2436035500761261, + "grad_norm": 0.9874866604804993, + "learning_rate": 4.890312106171023e-05, + "loss": 0.4227, + "step": 3280 + }, + { + "epoch": 
0.24397489695124217, + "grad_norm": 1.151456594467163, + "learning_rate": 4.8898712262692225e-05, + "loss": 0.4593, + "step": 3285 + }, + { + "epoch": 0.2443462438263582, + "grad_norm": 0.8587061762809753, + "learning_rate": 4.889429482060689e-05, + "loss": 0.4161, + "step": 3290 + }, + { + "epoch": 0.24471759070147425, + "grad_norm": 1.2171800136566162, + "learning_rate": 4.888986873705181e-05, + "loss": 0.4358, + "step": 3295 + }, + { + "epoch": 0.24508893757659028, + "grad_norm": 0.8855735063552856, + "learning_rate": 4.888543401362771e-05, + "loss": 0.4328, + "step": 3300 + }, + { + "epoch": 0.24546028445170634, + "grad_norm": 1.1113088130950928, + "learning_rate": 4.88809906519384e-05, + "loss": 0.4259, + "step": 3305 + }, + { + "epoch": 0.2458316313268224, + "grad_norm": 6.572192668914795, + "learning_rate": 4.887653865359085e-05, + "loss": 0.4463, + "step": 3310 + }, + { + "epoch": 0.24620297820193843, + "grad_norm": 1.3304423093795776, + "learning_rate": 4.887207802019515e-05, + "loss": 0.4284, + "step": 3315 + }, + { + "epoch": 0.24657432507705448, + "grad_norm": 0.8548965454101562, + "learning_rate": 4.88676087533645e-05, + "loss": 0.4335, + "step": 3320 + }, + { + "epoch": 0.24694567195217051, + "grad_norm": 1.166911244392395, + "learning_rate": 4.886313085471524e-05, + "loss": 0.4364, + "step": 3325 + }, + { + "epoch": 0.24731701882728657, + "grad_norm": 1.1613044738769531, + "learning_rate": 4.885864432586681e-05, + "loss": 0.4326, + "step": 3330 + }, + { + "epoch": 0.2476883657024026, + "grad_norm": 0.9552707076072693, + "learning_rate": 4.8854149168441776e-05, + "loss": 0.4353, + "step": 3335 + }, + { + "epoch": 0.24805971257751866, + "grad_norm": 0.9253180027008057, + "learning_rate": 4.8849645384065843e-05, + "loss": 0.4177, + "step": 3340 + }, + { + "epoch": 0.24843105945263472, + "grad_norm": 0.9358575344085693, + "learning_rate": 4.884513297436782e-05, + "loss": 0.4393, + "step": 3345 + }, + { + "epoch": 0.24880240632775075, + "grad_norm": 0.9707365036010742, + "learning_rate": 4.884061194097963e-05, + "loss": 0.4308, + "step": 3350 + }, + { + "epoch": 0.2491737532028668, + "grad_norm": 0.8588376641273499, + "learning_rate": 4.883608228553635e-05, + "loss": 0.4028, + "step": 3355 + }, + { + "epoch": 0.24954510007798283, + "grad_norm": 0.9038694500923157, + "learning_rate": 4.88315440096761e-05, + "loss": 0.4524, + "step": 3360 + }, + { + "epoch": 0.2499164469530989, + "grad_norm": 1.0825928449630737, + "learning_rate": 4.882699711504021e-05, + "loss": 0.4372, + "step": 3365 + }, + { + "epoch": 0.2502877938282149, + "grad_norm": 1.158320665359497, + "learning_rate": 4.882244160327306e-05, + "loss": 0.418, + "step": 3370 + }, + { + "epoch": 0.250659140703331, + "grad_norm": 1.276110291481018, + "learning_rate": 4.881787747602218e-05, + "loss": 0.4545, + "step": 3375 + }, + { + "epoch": 0.25103048757844704, + "grad_norm": 1.0024749040603638, + "learning_rate": 4.8813304734938206e-05, + "loss": 0.4674, + "step": 3380 + }, + { + "epoch": 0.25140183445356307, + "grad_norm": 1.1805554628372192, + "learning_rate": 4.8808723381674884e-05, + "loss": 0.4638, + "step": 3385 + }, + { + "epoch": 0.2517731813286791, + "grad_norm": 1.3252657651901245, + "learning_rate": 4.880413341788907e-05, + "loss": 0.4241, + "step": 3390 + }, + { + "epoch": 0.2521445282037952, + "grad_norm": 1.0752530097961426, + "learning_rate": 4.879953484524076e-05, + "loss": 0.4416, + "step": 3395 + }, + { + "epoch": 0.2525158750789112, + "grad_norm": 1.014775276184082, + "learning_rate": 
4.8794927665393044e-05, + "loss": 0.4295, + "step": 3400 + }, + { + "epoch": 0.25288722195402724, + "grad_norm": 0.9509404301643372, + "learning_rate": 4.879031188001211e-05, + "loss": 0.427, + "step": 3405 + }, + { + "epoch": 0.2532585688291433, + "grad_norm": 1.2720471620559692, + "learning_rate": 4.87856874907673e-05, + "loss": 0.4238, + "step": 3410 + }, + { + "epoch": 0.25362991570425936, + "grad_norm": 1.0840013027191162, + "learning_rate": 4.878105449933103e-05, + "loss": 0.4166, + "step": 3415 + }, + { + "epoch": 0.2540012625793754, + "grad_norm": 0.956425666809082, + "learning_rate": 4.877641290737884e-05, + "loss": 0.4203, + "step": 3420 + }, + { + "epoch": 0.2543726094544914, + "grad_norm": 1.113987684249878, + "learning_rate": 4.877176271658939e-05, + "loss": 0.4318, + "step": 3425 + }, + { + "epoch": 0.2547439563296075, + "grad_norm": 1.0058467388153076, + "learning_rate": 4.8767103928644434e-05, + "loss": 0.4475, + "step": 3430 + }, + { + "epoch": 0.25511530320472353, + "grad_norm": 1.0395872592926025, + "learning_rate": 4.876243654522883e-05, + "loss": 0.4322, + "step": 3435 + }, + { + "epoch": 0.25548665007983956, + "grad_norm": 0.9146986603736877, + "learning_rate": 4.8757760568030574e-05, + "loss": 0.4309, + "step": 3440 + }, + { + "epoch": 0.25585799695495565, + "grad_norm": 0.9177193641662598, + "learning_rate": 4.875307599874075e-05, + "loss": 0.4369, + "step": 3445 + }, + { + "epoch": 0.2562293438300717, + "grad_norm": 0.9752681255340576, + "learning_rate": 4.8748382839053544e-05, + "loss": 0.4264, + "step": 3450 + }, + { + "epoch": 0.2566006907051877, + "grad_norm": 0.9808909296989441, + "learning_rate": 4.8743681090666265e-05, + "loss": 0.4313, + "step": 3455 + }, + { + "epoch": 0.25697203758030374, + "grad_norm": 0.8328979015350342, + "learning_rate": 4.873897075527931e-05, + "loss": 0.4325, + "step": 3460 + }, + { + "epoch": 0.2573433844554198, + "grad_norm": 0.925728976726532, + "learning_rate": 4.87342518345962e-05, + "loss": 0.4305, + "step": 3465 + }, + { + "epoch": 0.25771473133053585, + "grad_norm": 0.9330899119377136, + "learning_rate": 4.872952433032355e-05, + "loss": 0.4218, + "step": 3470 + }, + { + "epoch": 0.2580860782056519, + "grad_norm": 0.9173476696014404, + "learning_rate": 4.8724788244171076e-05, + "loss": 0.4265, + "step": 3475 + }, + { + "epoch": 0.25845742508076797, + "grad_norm": 1.3077492713928223, + "learning_rate": 4.872004357785161e-05, + "loss": 0.4421, + "step": 3480 + }, + { + "epoch": 0.258828771955884, + "grad_norm": 1.0578168630599976, + "learning_rate": 4.8715290333081073e-05, + "loss": 0.4426, + "step": 3485 + }, + { + "epoch": 0.259200118831, + "grad_norm": 0.7287719249725342, + "learning_rate": 4.87105285115785e-05, + "loss": 0.4407, + "step": 3490 + }, + { + "epoch": 0.25957146570611606, + "grad_norm": 0.7943789958953857, + "learning_rate": 4.8705758115066026e-05, + "loss": 0.4297, + "step": 3495 + }, + { + "epoch": 0.25994281258123214, + "grad_norm": 0.9521101713180542, + "learning_rate": 4.870097914526888e-05, + "loss": 0.4287, + "step": 3500 + }, + { + "epoch": 0.26031415945634817, + "grad_norm": 1.0764399766921997, + "learning_rate": 4.8696191603915394e-05, + "loss": 0.4339, + "step": 3505 + }, + { + "epoch": 0.2606855063314642, + "grad_norm": 1.1681718826293945, + "learning_rate": 4.8691395492737e-05, + "loss": 0.4451, + "step": 3510 + }, + { + "epoch": 0.2610568532065803, + "grad_norm": 0.9852285981178284, + "learning_rate": 4.8686590813468245e-05, + "loss": 0.4569, + "step": 3515 + }, + { + "epoch": 0.2614282000816963, + 
"grad_norm": 0.9874268770217896, + "learning_rate": 4.868177756784676e-05, + "loss": 0.4192, + "step": 3520 + }, + { + "epoch": 0.26179954695681235, + "grad_norm": 1.0080324411392212, + "learning_rate": 4.867695575761324e-05, + "loss": 0.4347, + "step": 3525 + }, + { + "epoch": 0.26217089383192843, + "grad_norm": 0.9174315929412842, + "learning_rate": 4.867212538451156e-05, + "loss": 0.4194, + "step": 3530 + }, + { + "epoch": 0.26254224070704446, + "grad_norm": 1.1453148126602173, + "learning_rate": 4.866728645028862e-05, + "loss": 0.4419, + "step": 3535 + }, + { + "epoch": 0.2629135875821605, + "grad_norm": 0.9571831226348877, + "learning_rate": 4.866243895669445e-05, + "loss": 0.4443, + "step": 3540 + }, + { + "epoch": 0.2632849344572765, + "grad_norm": 0.7948081493377686, + "learning_rate": 4.8657582905482155e-05, + "loss": 0.4199, + "step": 3545 + }, + { + "epoch": 0.2636562813323926, + "grad_norm": 0.9427078366279602, + "learning_rate": 4.8652718298407944e-05, + "loss": 0.4481, + "step": 3550 + }, + { + "epoch": 0.26402762820750864, + "grad_norm": 0.9686035513877869, + "learning_rate": 4.8647845137231143e-05, + "loss": 0.4431, + "step": 3555 + }, + { + "epoch": 0.26439897508262467, + "grad_norm": 1.433406949043274, + "learning_rate": 4.8642963423714124e-05, + "loss": 0.4271, + "step": 3560 + }, + { + "epoch": 0.26477032195774075, + "grad_norm": 1.2971220016479492, + "learning_rate": 4.86380731596224e-05, + "loss": 0.4361, + "step": 3565 + }, + { + "epoch": 0.2651416688328568, + "grad_norm": 1.1293293237686157, + "learning_rate": 4.8633174346724544e-05, + "loss": 0.4512, + "step": 3570 + }, + { + "epoch": 0.2655130157079728, + "grad_norm": 0.8800034523010254, + "learning_rate": 4.8628266986792235e-05, + "loss": 0.4351, + "step": 3575 + }, + { + "epoch": 0.26588436258308884, + "grad_norm": 0.7938306331634521, + "learning_rate": 4.862335108160024e-05, + "loss": 0.4311, + "step": 3580 + }, + { + "epoch": 0.2662557094582049, + "grad_norm": 0.768816351890564, + "learning_rate": 4.861842663292641e-05, + "loss": 0.4229, + "step": 3585 + }, + { + "epoch": 0.26662705633332096, + "grad_norm": 0.7999665141105652, + "learning_rate": 4.86134936425517e-05, + "loss": 0.4431, + "step": 3590 + }, + { + "epoch": 0.266998403208437, + "grad_norm": 0.9142205119132996, + "learning_rate": 4.860855211226015e-05, + "loss": 0.4048, + "step": 3595 + }, + { + "epoch": 0.26736975008355307, + "grad_norm": 0.9010916352272034, + "learning_rate": 4.860360204383888e-05, + "loss": 0.4178, + "step": 3600 + }, + { + "epoch": 0.2677410969586691, + "grad_norm": 0.7956218719482422, + "learning_rate": 4.859864343907809e-05, + "loss": 0.4451, + "step": 3605 + }, + { + "epoch": 0.26811244383378513, + "grad_norm": 0.9147886633872986, + "learning_rate": 4.8593676299771096e-05, + "loss": 0.4474, + "step": 3610 + }, + { + "epoch": 0.26848379070890116, + "grad_norm": 1.0486589670181274, + "learning_rate": 4.858870062771428e-05, + "loss": 0.4374, + "step": 3615 + }, + { + "epoch": 0.26885513758401725, + "grad_norm": 0.9408405423164368, + "learning_rate": 4.8583716424707115e-05, + "loss": 0.4216, + "step": 3620 + }, + { + "epoch": 0.2692264844591333, + "grad_norm": 1.010460615158081, + "learning_rate": 4.8578723692552155e-05, + "loss": 0.4178, + "step": 3625 + }, + { + "epoch": 0.2695978313342493, + "grad_norm": 0.8639962673187256, + "learning_rate": 4.857372243305504e-05, + "loss": 0.4383, + "step": 3630 + }, + { + "epoch": 0.2699691782093654, + "grad_norm": 0.9397608041763306, + "learning_rate": 4.856871264802451e-05, + "loss": 
0.4627, + "step": 3635 + }, + { + "epoch": 0.2703405250844814, + "grad_norm": 0.8951156735420227, + "learning_rate": 4.856369433927235e-05, + "loss": 0.4427, + "step": 3640 + }, + { + "epoch": 0.27071187195959745, + "grad_norm": 2.018953561782837, + "learning_rate": 4.855866750861348e-05, + "loss": 0.4281, + "step": 3645 + }, + { + "epoch": 0.2710832188347135, + "grad_norm": 1.3016127347946167, + "learning_rate": 4.855363215786585e-05, + "loss": 0.4172, + "step": 3650 + }, + { + "epoch": 0.27145456570982957, + "grad_norm": 0.979545533657074, + "learning_rate": 4.854858828885053e-05, + "loss": 0.4509, + "step": 3655 + }, + { + "epoch": 0.2718259125849456, + "grad_norm": 0.9157105088233948, + "learning_rate": 4.8543535903391656e-05, + "loss": 0.4373, + "step": 3660 + }, + { + "epoch": 0.2721972594600616, + "grad_norm": 0.8544624447822571, + "learning_rate": 4.853847500331643e-05, + "loss": 0.4296, + "step": 3665 + }, + { + "epoch": 0.2725686063351777, + "grad_norm": 1.124678134918213, + "learning_rate": 4.853340559045516e-05, + "loss": 0.4154, + "step": 3670 + }, + { + "epoch": 0.27293995321029374, + "grad_norm": 0.9993969202041626, + "learning_rate": 4.852832766664122e-05, + "loss": 0.4447, + "step": 3675 + }, + { + "epoch": 0.27331130008540977, + "grad_norm": 0.8751777410507202, + "learning_rate": 4.8523241233711056e-05, + "loss": 0.4452, + "step": 3680 + }, + { + "epoch": 0.2736826469605258, + "grad_norm": 1.4159234762191772, + "learning_rate": 4.85181462935042e-05, + "loss": 0.4125, + "step": 3685 + }, + { + "epoch": 0.2740539938356419, + "grad_norm": 0.9526045918464661, + "learning_rate": 4.851304284786326e-05, + "loss": 0.4295, + "step": 3690 + }, + { + "epoch": 0.2744253407107579, + "grad_norm": 0.8319068551063538, + "learning_rate": 4.8507930898633915e-05, + "loss": 0.4304, + "step": 3695 + }, + { + "epoch": 0.27479668758587394, + "grad_norm": 0.9949585795402527, + "learning_rate": 4.850281044766492e-05, + "loss": 0.4138, + "step": 3700 + }, + { + "epoch": 0.27516803446099003, + "grad_norm": 0.8212401866912842, + "learning_rate": 4.849768149680811e-05, + "loss": 0.4204, + "step": 3705 + }, + { + "epoch": 0.27553938133610606, + "grad_norm": 1.044382095336914, + "learning_rate": 4.84925440479184e-05, + "loss": 0.4224, + "step": 3710 + }, + { + "epoch": 0.2759107282112221, + "grad_norm": 0.9368321299552917, + "learning_rate": 4.8487398102853754e-05, + "loss": 0.4434, + "step": 3715 + }, + { + "epoch": 0.2762820750863382, + "grad_norm": 1.0776258707046509, + "learning_rate": 4.848224366347522e-05, + "loss": 0.4255, + "step": 3720 + }, + { + "epoch": 0.2766534219614542, + "grad_norm": 0.8507828712463379, + "learning_rate": 4.847708073164695e-05, + "loss": 0.446, + "step": 3725 + }, + { + "epoch": 0.27702476883657023, + "grad_norm": 0.9348880648612976, + "learning_rate": 4.847190930923612e-05, + "loss": 0.3984, + "step": 3730 + }, + { + "epoch": 0.27739611571168626, + "grad_norm": 2.778374195098877, + "learning_rate": 4.8466729398113e-05, + "loss": 0.4018, + "step": 3735 + }, + { + "epoch": 0.27776746258680235, + "grad_norm": 0.9289271235466003, + "learning_rate": 4.8461541000150916e-05, + "loss": 0.4328, + "step": 3740 + }, + { + "epoch": 0.2781388094619184, + "grad_norm": 0.7580967545509338, + "learning_rate": 4.8456344117226296e-05, + "loss": 0.404, + "step": 3745 + }, + { + "epoch": 0.2785101563370344, + "grad_norm": 0.7272197008132935, + "learning_rate": 4.8451138751218584e-05, + "loss": 0.442, + "step": 3750 + }, + { + "epoch": 0.2788815032121505, + "grad_norm": 0.8026915192604065, + 
"learning_rate": 4.844592490401035e-05, + "loss": 0.4272, + "step": 3755 + }, + { + "epoch": 0.2792528500872665, + "grad_norm": 0.8243690133094788, + "learning_rate": 4.8440702577487186e-05, + "loss": 0.4389, + "step": 3760 + }, + { + "epoch": 0.27962419696238255, + "grad_norm": 0.8651421070098877, + "learning_rate": 4.843547177353778e-05, + "loss": 0.4177, + "step": 3765 + }, + { + "epoch": 0.2799955438374986, + "grad_norm": 2.0537188053131104, + "learning_rate": 4.8430232494053864e-05, + "loss": 0.4236, + "step": 3770 + }, + { + "epoch": 0.28036689071261467, + "grad_norm": 1.3934059143066406, + "learning_rate": 4.842498474093025e-05, + "loss": 0.4236, + "step": 3775 + }, + { + "epoch": 0.2807382375877307, + "grad_norm": 1.1123522520065308, + "learning_rate": 4.84197285160648e-05, + "loss": 0.4236, + "step": 3780 + }, + { + "epoch": 0.28110958446284673, + "grad_norm": 1.1888607740402222, + "learning_rate": 4.841446382135846e-05, + "loss": 0.4224, + "step": 3785 + }, + { + "epoch": 0.2814809313379628, + "grad_norm": 0.9268679618835449, + "learning_rate": 4.840919065871522e-05, + "loss": 0.4528, + "step": 3790 + }, + { + "epoch": 0.28185227821307884, + "grad_norm": 1.1020976305007935, + "learning_rate": 4.840390903004216e-05, + "loss": 0.427, + "step": 3795 + }, + { + "epoch": 0.2822236250881949, + "grad_norm": 1.011352300643921, + "learning_rate": 4.839861893724937e-05, + "loss": 0.4511, + "step": 3800 + }, + { + "epoch": 0.2825949719633109, + "grad_norm": 0.9751265645027161, + "learning_rate": 4.8393320382250076e-05, + "loss": 0.4277, + "step": 3805 + }, + { + "epoch": 0.282966318838427, + "grad_norm": 1.1336296796798706, + "learning_rate": 4.8388013366960484e-05, + "loss": 0.4254, + "step": 3810 + }, + { + "epoch": 0.283337665713543, + "grad_norm": 1.0664730072021484, + "learning_rate": 4.838269789329991e-05, + "loss": 0.3992, + "step": 3815 + }, + { + "epoch": 0.28370901258865905, + "grad_norm": 0.9408633708953857, + "learning_rate": 4.837737396319073e-05, + "loss": 0.4228, + "step": 3820 + }, + { + "epoch": 0.28408035946377513, + "grad_norm": 0.9652064442634583, + "learning_rate": 4.837204157855835e-05, + "loss": 0.4141, + "step": 3825 + }, + { + "epoch": 0.28445170633889116, + "grad_norm": 1.0405441522598267, + "learning_rate": 4.836670074133125e-05, + "loss": 0.4437, + "step": 3830 + }, + { + "epoch": 0.2848230532140072, + "grad_norm": 1.0543575286865234, + "learning_rate": 4.8361351453440975e-05, + "loss": 0.4349, + "step": 3835 + }, + { + "epoch": 0.2851944000891232, + "grad_norm": 1.3610877990722656, + "learning_rate": 4.83559937168221e-05, + "loss": 0.4469, + "step": 3840 + }, + { + "epoch": 0.2855657469642393, + "grad_norm": 1.1551353931427002, + "learning_rate": 4.8350627533412295e-05, + "loss": 0.4183, + "step": 3845 + }, + { + "epoch": 0.28593709383935534, + "grad_norm": 0.8394594788551331, + "learning_rate": 4.834525290515224e-05, + "loss": 0.4219, + "step": 3850 + }, + { + "epoch": 0.28630844071447137, + "grad_norm": 0.9839069843292236, + "learning_rate": 4.833986983398571e-05, + "loss": 0.4195, + "step": 3855 + }, + { + "epoch": 0.28667978758958745, + "grad_norm": 0.9199720621109009, + "learning_rate": 4.8334478321859505e-05, + "loss": 0.4211, + "step": 3860 + }, + { + "epoch": 0.2870511344647035, + "grad_norm": 0.8119776844978333, + "learning_rate": 4.832907837072349e-05, + "loss": 0.4187, + "step": 3865 + }, + { + "epoch": 0.2874224813398195, + "grad_norm": 1.8330432176589966, + "learning_rate": 4.832366998253058e-05, + "loss": 0.4458, + "step": 3870 + }, + { + "epoch": 
0.2877938282149356, + "grad_norm": 1.2934834957122803, + "learning_rate": 4.831825315923674e-05, + "loss": 0.4275, + "step": 3875 + }, + { + "epoch": 0.28816517509005163, + "grad_norm": 0.8350692987442017, + "learning_rate": 4.8312827902800976e-05, + "loss": 0.4392, + "step": 3880 + }, + { + "epoch": 0.28853652196516766, + "grad_norm": 0.9420866966247559, + "learning_rate": 4.8307394215185374e-05, + "loss": 0.4119, + "step": 3885 + }, + { + "epoch": 0.2889078688402837, + "grad_norm": 1.0160822868347168, + "learning_rate": 4.830195209835504e-05, + "loss": 0.4138, + "step": 3890 + }, + { + "epoch": 0.2892792157153998, + "grad_norm": 1.0691471099853516, + "learning_rate": 4.829650155427813e-05, + "loss": 0.4323, + "step": 3895 + }, + { + "epoch": 0.2896505625905158, + "grad_norm": 1.0200157165527344, + "learning_rate": 4.829104258492587e-05, + "loss": 0.4123, + "step": 3900 + }, + { + "epoch": 0.29002190946563183, + "grad_norm": 0.7049094438552856, + "learning_rate": 4.828557519227251e-05, + "loss": 0.42, + "step": 3905 + }, + { + "epoch": 0.2903932563407479, + "grad_norm": 1.1828627586364746, + "learning_rate": 4.828009937829535e-05, + "loss": 0.4265, + "step": 3910 + }, + { + "epoch": 0.29076460321586395, + "grad_norm": 1.1924457550048828, + "learning_rate": 4.827461514497475e-05, + "loss": 0.434, + "step": 3915 + }, + { + "epoch": 0.29113595009098, + "grad_norm": 1.2100766897201538, + "learning_rate": 4.826912249429411e-05, + "loss": 0.4307, + "step": 3920 + }, + { + "epoch": 0.291507296966096, + "grad_norm": 0.9846333861351013, + "learning_rate": 4.8263621428239855e-05, + "loss": 0.4366, + "step": 3925 + }, + { + "epoch": 0.2918786438412121, + "grad_norm": 1.0882083177566528, + "learning_rate": 4.8258111948801484e-05, + "loss": 0.4128, + "step": 3930 + }, + { + "epoch": 0.2922499907163281, + "grad_norm": 0.8791163563728333, + "learning_rate": 4.825259405797151e-05, + "loss": 0.4417, + "step": 3935 + }, + { + "epoch": 0.29262133759144415, + "grad_norm": 1.0271704196929932, + "learning_rate": 4.82470677577455e-05, + "loss": 0.4214, + "step": 3940 + }, + { + "epoch": 0.29299268446656024, + "grad_norm": 1.3489196300506592, + "learning_rate": 4.8241533050122076e-05, + "loss": 0.4208, + "step": 3945 + }, + { + "epoch": 0.29336403134167627, + "grad_norm": 0.7574217915534973, + "learning_rate": 4.8235989937102874e-05, + "loss": 0.4211, + "step": 3950 + }, + { + "epoch": 0.2937353782167923, + "grad_norm": 0.928737461566925, + "learning_rate": 4.823043842069259e-05, + "loss": 0.4397, + "step": 3955 + }, + { + "epoch": 0.2941067250919083, + "grad_norm": 0.9212475419044495, + "learning_rate": 4.822487850289896e-05, + "loss": 0.4156, + "step": 3960 + }, + { + "epoch": 0.2944780719670244, + "grad_norm": 0.77180016040802, + "learning_rate": 4.821931018573274e-05, + "loss": 0.4262, + "step": 3965 + }, + { + "epoch": 0.29484941884214044, + "grad_norm": 1.0247732400894165, + "learning_rate": 4.821373347120773e-05, + "loss": 0.4208, + "step": 3970 + }, + { + "epoch": 0.2952207657172565, + "grad_norm": 0.9615873694419861, + "learning_rate": 4.820814836134079e-05, + "loss": 0.4056, + "step": 3975 + }, + { + "epoch": 0.29559211259237256, + "grad_norm": 0.9379513263702393, + "learning_rate": 4.820255485815178e-05, + "loss": 0.4463, + "step": 3980 + }, + { + "epoch": 0.2959634594674886, + "grad_norm": 1.0274722576141357, + "learning_rate": 4.819695296366362e-05, + "loss": 0.4299, + "step": 3985 + }, + { + "epoch": 0.2963348063426046, + "grad_norm": 1.0676618814468384, + "learning_rate": 4.819134267990226e-05, + 
"loss": 0.4361, + "step": 3990 + }, + { + "epoch": 0.29670615321772065, + "grad_norm": 0.9245099425315857, + "learning_rate": 4.818572400889669e-05, + "loss": 0.4248, + "step": 3995 + }, + { + "epoch": 0.29707750009283673, + "grad_norm": 1.0777188539505005, + "learning_rate": 4.8180096952678914e-05, + "loss": 0.4382, + "step": 4000 + }, + { + "epoch": 0.29744884696795276, + "grad_norm": 0.8690295815467834, + "learning_rate": 4.817446151328397e-05, + "loss": 0.4226, + "step": 4005 + }, + { + "epoch": 0.2978201938430688, + "grad_norm": 0.9178885221481323, + "learning_rate": 4.816881769274997e-05, + "loss": 0.4394, + "step": 4010 + }, + { + "epoch": 0.2981915407181849, + "grad_norm": 0.9835901856422424, + "learning_rate": 4.816316549311799e-05, + "loss": 0.4227, + "step": 4015 + }, + { + "epoch": 0.2985628875933009, + "grad_norm": 0.893937349319458, + "learning_rate": 4.815750491643219e-05, + "loss": 0.4159, + "step": 4020 + }, + { + "epoch": 0.29893423446841694, + "grad_norm": 1.1184569597244263, + "learning_rate": 4.815183596473975e-05, + "loss": 0.4322, + "step": 4025 + }, + { + "epoch": 0.29930558134353297, + "grad_norm": 0.9903736710548401, + "learning_rate": 4.814615864009085e-05, + "loss": 0.4254, + "step": 4030 + }, + { + "epoch": 0.29967692821864905, + "grad_norm": 0.8701817393302917, + "learning_rate": 4.814047294453872e-05, + "loss": 0.4144, + "step": 4035 + }, + { + "epoch": 0.3000482750937651, + "grad_norm": 0.9406540393829346, + "learning_rate": 4.813477888013962e-05, + "loss": 0.4346, + "step": 4040 + }, + { + "epoch": 0.3004196219688811, + "grad_norm": 1.1688896417617798, + "learning_rate": 4.8129076448952826e-05, + "loss": 0.4492, + "step": 4045 + }, + { + "epoch": 0.3007909688439972, + "grad_norm": 1.02422034740448, + "learning_rate": 4.812336565304066e-05, + "loss": 0.418, + "step": 4050 + }, + { + "epoch": 0.3011623157191132, + "grad_norm": 0.9559808969497681, + "learning_rate": 4.811764649446845e-05, + "loss": 0.4208, + "step": 4055 + }, + { + "epoch": 0.30153366259422926, + "grad_norm": 0.6793580651283264, + "learning_rate": 4.811191897530454e-05, + "loss": 0.4155, + "step": 4060 + }, + { + "epoch": 0.30190500946934534, + "grad_norm": 1.0068315267562866, + "learning_rate": 4.8106183097620314e-05, + "loss": 0.4447, + "step": 4065 + }, + { + "epoch": 0.30227635634446137, + "grad_norm": 0.9851054549217224, + "learning_rate": 4.8100438863490196e-05, + "loss": 0.4442, + "step": 4070 + }, + { + "epoch": 0.3026477032195774, + "grad_norm": 1.5529272556304932, + "learning_rate": 4.80946862749916e-05, + "loss": 0.4025, + "step": 4075 + }, + { + "epoch": 0.30301905009469343, + "grad_norm": 0.726374626159668, + "learning_rate": 4.808892533420496e-05, + "loss": 0.4047, + "step": 4080 + }, + { + "epoch": 0.3033903969698095, + "grad_norm": 0.9681001901626587, + "learning_rate": 4.808315604321375e-05, + "loss": 0.4129, + "step": 4085 + }, + { + "epoch": 0.30376174384492555, + "grad_norm": 0.8776106238365173, + "learning_rate": 4.807737840410447e-05, + "loss": 0.4335, + "step": 4090 + }, + { + "epoch": 0.3041330907200416, + "grad_norm": 1.3474504947662354, + "learning_rate": 4.807159241896662e-05, + "loss": 0.432, + "step": 4095 + }, + { + "epoch": 0.30450443759515766, + "grad_norm": 0.9012182950973511, + "learning_rate": 4.8065798089892724e-05, + "loss": 0.4367, + "step": 4100 + }, + { + "epoch": 0.3048757844702737, + "grad_norm": 1.0132664442062378, + "learning_rate": 4.8059995418978325e-05, + "loss": 0.4111, + "step": 4105 + }, + { + "epoch": 0.3052471313453897, + "grad_norm": 
0.7376078963279724, + "learning_rate": 4.8054184408321984e-05, + "loss": 0.4415, + "step": 4110 + }, + { + "epoch": 0.30561847822050575, + "grad_norm": 0.8478325009346008, + "learning_rate": 4.804836506002527e-05, + "loss": 0.4358, + "step": 4115 + }, + { + "epoch": 0.30598982509562184, + "grad_norm": 0.7889040112495422, + "learning_rate": 4.8042537376192784e-05, + "loss": 0.4049, + "step": 4120 + }, + { + "epoch": 0.30636117197073787, + "grad_norm": 2.196556806564331, + "learning_rate": 4.803670135893213e-05, + "loss": 0.3927, + "step": 4125 + }, + { + "epoch": 0.3067325188458539, + "grad_norm": 0.6706188321113586, + "learning_rate": 4.8030857010353924e-05, + "loss": 0.4148, + "step": 4130 + }, + { + "epoch": 0.30710386572097, + "grad_norm": 0.9582722187042236, + "learning_rate": 4.80250043325718e-05, + "loss": 0.4262, + "step": 4135 + }, + { + "epoch": 0.307475212596086, + "grad_norm": 0.8332164287567139, + "learning_rate": 4.801914332770241e-05, + "loss": 0.4455, + "step": 4140 + }, + { + "epoch": 0.30784655947120204, + "grad_norm": 1.1163699626922607, + "learning_rate": 4.8013273997865394e-05, + "loss": 0.4344, + "step": 4145 + }, + { + "epoch": 0.30821790634631807, + "grad_norm": 0.8440271019935608, + "learning_rate": 4.800739634518343e-05, + "loss": 0.3971, + "step": 4150 + }, + { + "epoch": 0.30858925322143416, + "grad_norm": 0.8289729356765747, + "learning_rate": 4.800151037178219e-05, + "loss": 0.4024, + "step": 4155 + }, + { + "epoch": 0.3089606000965502, + "grad_norm": 0.844292163848877, + "learning_rate": 4.799561607979037e-05, + "loss": 0.4333, + "step": 4160 + }, + { + "epoch": 0.3093319469716662, + "grad_norm": 0.743328332901001, + "learning_rate": 4.798971347133966e-05, + "loss": 0.4371, + "step": 4165 + }, + { + "epoch": 0.3097032938467823, + "grad_norm": 1.2272253036499023, + "learning_rate": 4.798380254856476e-05, + "loss": 0.4197, + "step": 4170 + }, + { + "epoch": 0.31007464072189833, + "grad_norm": 0.8877312541007996, + "learning_rate": 4.797788331360339e-05, + "loss": 0.4121, + "step": 4175 + }, + { + "epoch": 0.31044598759701436, + "grad_norm": 1.0048223733901978, + "learning_rate": 4.7971955768596244e-05, + "loss": 0.4314, + "step": 4180 + }, + { + "epoch": 0.3108173344721304, + "grad_norm": 1.184412956237793, + "learning_rate": 4.796601991568706e-05, + "loss": 0.4307, + "step": 4185 + }, + { + "epoch": 0.3111886813472465, + "grad_norm": 0.7950173020362854, + "learning_rate": 4.796007575702256e-05, + "loss": 0.4165, + "step": 4190 + }, + { + "epoch": 0.3115600282223625, + "grad_norm": 0.9427579641342163, + "learning_rate": 4.7954123294752475e-05, + "loss": 0.4202, + "step": 4195 + }, + { + "epoch": 0.31193137509747854, + "grad_norm": 0.8798120021820068, + "learning_rate": 4.794816253102953e-05, + "loss": 0.4276, + "step": 4200 + }, + { + "epoch": 0.3123027219725946, + "grad_norm": 0.9066138863563538, + "learning_rate": 4.7942193468009465e-05, + "loss": 0.4223, + "step": 4205 + }, + { + "epoch": 0.31267406884771065, + "grad_norm": 1.109160304069519, + "learning_rate": 4.793621610785102e-05, + "loss": 0.3984, + "step": 4210 + }, + { + "epoch": 0.3130454157228267, + "grad_norm": 0.9013233184814453, + "learning_rate": 4.793023045271592e-05, + "loss": 0.4128, + "step": 4215 + }, + { + "epoch": 0.3134167625979427, + "grad_norm": 0.8852935433387756, + "learning_rate": 4.792423650476892e-05, + "loss": 0.4275, + "step": 4220 + }, + { + "epoch": 0.3137881094730588, + "grad_norm": 1.0032018423080444, + "learning_rate": 4.791823426617774e-05, + "loss": 0.4299, + "step": 4225 + 
}, + { + "epoch": 0.3141594563481748, + "grad_norm": 1.0188161134719849, + "learning_rate": 4.7912223739113125e-05, + "loss": 0.4151, + "step": 4230 + }, + { + "epoch": 0.31453080322329086, + "grad_norm": 0.8745348453521729, + "learning_rate": 4.7906204925748786e-05, + "loss": 0.425, + "step": 4235 + }, + { + "epoch": 0.31490215009840694, + "grad_norm": 0.8320831060409546, + "learning_rate": 4.790017782826148e-05, + "loss": 0.4199, + "step": 4240 + }, + { + "epoch": 0.31527349697352297, + "grad_norm": 0.8643554449081421, + "learning_rate": 4.789414244883092e-05, + "loss": 0.4582, + "step": 4245 + }, + { + "epoch": 0.315644843848639, + "grad_norm": 0.9545555114746094, + "learning_rate": 4.788809878963982e-05, + "loss": 0.4064, + "step": 4250 + }, + { + "epoch": 0.3160161907237551, + "grad_norm": 1.0125293731689453, + "learning_rate": 4.7882046852873896e-05, + "loss": 0.4186, + "step": 4255 + }, + { + "epoch": 0.3163875375988711, + "grad_norm": 1.3028231859207153, + "learning_rate": 4.787598664072186e-05, + "loss": 0.4336, + "step": 4260 + }, + { + "epoch": 0.31675888447398715, + "grad_norm": 0.8226160407066345, + "learning_rate": 4.786991815537542e-05, + "loss": 0.4105, + "step": 4265 + }, + { + "epoch": 0.3171302313491032, + "grad_norm": 1.0225698947906494, + "learning_rate": 4.786384139902924e-05, + "loss": 0.4341, + "step": 4270 + }, + { + "epoch": 0.31750157822421926, + "grad_norm": 0.7511369585990906, + "learning_rate": 4.7857756373881035e-05, + "loss": 0.429, + "step": 4275 + }, + { + "epoch": 0.3178729250993353, + "grad_norm": 0.9852497577667236, + "learning_rate": 4.7851663082131466e-05, + "loss": 0.4497, + "step": 4280 + }, + { + "epoch": 0.3182442719744513, + "grad_norm": 0.9103526473045349, + "learning_rate": 4.784556152598419e-05, + "loss": 0.4116, + "step": 4285 + }, + { + "epoch": 0.3186156188495674, + "grad_norm": 0.908010721206665, + "learning_rate": 4.7839451707645884e-05, + "loss": 0.4548, + "step": 4290 + }, + { + "epoch": 0.31898696572468344, + "grad_norm": 0.8548882603645325, + "learning_rate": 4.783333362932616e-05, + "loss": 0.4009, + "step": 4295 + }, + { + "epoch": 0.31935831259979947, + "grad_norm": 1.011780858039856, + "learning_rate": 4.7827207293237664e-05, + "loss": 0.4284, + "step": 4300 + }, + { + "epoch": 0.3197296594749155, + "grad_norm": 0.7785174250602722, + "learning_rate": 4.7821072701596007e-05, + "loss": 0.4125, + "step": 4305 + }, + { + "epoch": 0.3201010063500316, + "grad_norm": 0.8328904509544373, + "learning_rate": 4.781492985661979e-05, + "loss": 0.4225, + "step": 4310 + }, + { + "epoch": 0.3204723532251476, + "grad_norm": 1.5389479398727417, + "learning_rate": 4.780877876053059e-05, + "loss": 0.4086, + "step": 4315 + }, + { + "epoch": 0.32084370010026364, + "grad_norm": 1.0274771451950073, + "learning_rate": 4.780261941555298e-05, + "loss": 0.4263, + "step": 4320 + }, + { + "epoch": 0.3212150469753797, + "grad_norm": 0.9804433584213257, + "learning_rate": 4.7796451823914526e-05, + "loss": 0.4599, + "step": 4325 + }, + { + "epoch": 0.32158639385049576, + "grad_norm": 0.793917179107666, + "learning_rate": 4.7790275987845745e-05, + "loss": 0.4156, + "step": 4330 + }, + { + "epoch": 0.3219577407256118, + "grad_norm": 0.8779480457305908, + "learning_rate": 4.778409190958016e-05, + "loss": 0.4314, + "step": 4335 + }, + { + "epoch": 0.3223290876007278, + "grad_norm": 0.8802932500839233, + "learning_rate": 4.777789959135427e-05, + "loss": 0.4211, + "step": 4340 + }, + { + "epoch": 0.3227004344758439, + "grad_norm": 0.8226904273033142, + "learning_rate": 
4.777169903540755e-05, + "loss": 0.4152, + "step": 4345 + }, + { + "epoch": 0.32307178135095993, + "grad_norm": 0.8297529816627502, + "learning_rate": 4.776549024398245e-05, + "loss": 0.4169, + "step": 4350 + }, + { + "epoch": 0.32344312822607596, + "grad_norm": 1.039888858795166, + "learning_rate": 4.775927321932442e-05, + "loss": 0.4451, + "step": 4355 + }, + { + "epoch": 0.32381447510119205, + "grad_norm": 1.020605206489563, + "learning_rate": 4.7753047963681856e-05, + "loss": 0.4209, + "step": 4360 + }, + { + "epoch": 0.3241858219763081, + "grad_norm": 1.1356257200241089, + "learning_rate": 4.774681447930616e-05, + "loss": 0.4108, + "step": 4365 + }, + { + "epoch": 0.3245571688514241, + "grad_norm": 1.8272466659545898, + "learning_rate": 4.77405727684517e-05, + "loss": 0.4172, + "step": 4370 + }, + { + "epoch": 0.32492851572654013, + "grad_norm": 0.9962214231491089, + "learning_rate": 4.77343228333758e-05, + "loss": 0.4147, + "step": 4375 + }, + { + "epoch": 0.3252998626016562, + "grad_norm": 0.873292863368988, + "learning_rate": 4.772806467633878e-05, + "loss": 0.4377, + "step": 4380 + }, + { + "epoch": 0.32567120947677225, + "grad_norm": 0.8075531721115112, + "learning_rate": 4.772179829960394e-05, + "loss": 0.4211, + "step": 4385 + }, + { + "epoch": 0.3260425563518883, + "grad_norm": 1.0832219123840332, + "learning_rate": 4.771552370543753e-05, + "loss": 0.4312, + "step": 4390 + }, + { + "epoch": 0.32641390322700436, + "grad_norm": 0.9082911610603333, + "learning_rate": 4.7709240896108786e-05, + "loss": 0.438, + "step": 4395 + }, + { + "epoch": 0.3267852501021204, + "grad_norm": 0.7049622535705566, + "learning_rate": 4.770294987388991e-05, + "loss": 0.4371, + "step": 4400 + }, + { + "epoch": 0.3271565969772364, + "grad_norm": 1.050377607345581, + "learning_rate": 4.769665064105608e-05, + "loss": 0.4126, + "step": 4405 + }, + { + "epoch": 0.3275279438523525, + "grad_norm": 0.9951369762420654, + "learning_rate": 4.7690343199885434e-05, + "loss": 0.431, + "step": 4410 + }, + { + "epoch": 0.32789929072746854, + "grad_norm": 0.8130521774291992, + "learning_rate": 4.768402755265908e-05, + "loss": 0.4099, + "step": 4415 + }, + { + "epoch": 0.32827063760258457, + "grad_norm": 0.8695419430732727, + "learning_rate": 4.7677703701661115e-05, + "loss": 0.4242, + "step": 4420 + }, + { + "epoch": 0.3286419844777006, + "grad_norm": 0.8330603241920471, + "learning_rate": 4.767137164917857e-05, + "loss": 0.4007, + "step": 4425 + }, + { + "epoch": 0.3290133313528167, + "grad_norm": 0.8372355699539185, + "learning_rate": 4.766503139750147e-05, + "loss": 0.4218, + "step": 4430 + }, + { + "epoch": 0.3293846782279327, + "grad_norm": 0.835241436958313, + "learning_rate": 4.765868294892278e-05, + "loss": 0.4127, + "step": 4435 + }, + { + "epoch": 0.32975602510304874, + "grad_norm": 1.2005548477172852, + "learning_rate": 4.765232630573845e-05, + "loss": 0.4205, + "step": 4440 + }, + { + "epoch": 0.33012737197816483, + "grad_norm": 0.9409222602844238, + "learning_rate": 4.764596147024738e-05, + "loss": 0.4084, + "step": 4445 + }, + { + "epoch": 0.33049871885328086, + "grad_norm": 0.9494869709014893, + "learning_rate": 4.7639588444751445e-05, + "loss": 0.4241, + "step": 4450 + }, + { + "epoch": 0.3308700657283969, + "grad_norm": 0.992162823677063, + "learning_rate": 4.763320723155548e-05, + "loss": 0.4082, + "step": 4455 + }, + { + "epoch": 0.3312414126035129, + "grad_norm": 1.012285590171814, + "learning_rate": 4.7626817832967256e-05, + "loss": 0.4194, + "step": 4460 + }, + { + "epoch": 0.331612759478629, + 
"grad_norm": 0.860953688621521, + "learning_rate": 4.762042025129755e-05, + "loss": 0.3837, + "step": 4465 + }, + { + "epoch": 0.33198410635374503, + "grad_norm": 0.8259625434875488, + "learning_rate": 4.7614014488860065e-05, + "loss": 0.4119, + "step": 4470 + }, + { + "epoch": 0.33235545322886106, + "grad_norm": 0.7920925617218018, + "learning_rate": 4.7607600547971466e-05, + "loss": 0.4372, + "step": 4475 + }, + { + "epoch": 0.33272680010397715, + "grad_norm": 0.9090449213981628, + "learning_rate": 4.760117843095139e-05, + "loss": 0.4074, + "step": 4480 + }, + { + "epoch": 0.3330981469790932, + "grad_norm": 0.8305264711380005, + "learning_rate": 4.7594748140122416e-05, + "loss": 0.406, + "step": 4485 + }, + { + "epoch": 0.3334694938542092, + "grad_norm": 0.8922953009605408, + "learning_rate": 4.758830967781008e-05, + "loss": 0.424, + "step": 4490 + }, + { + "epoch": 0.33384084072932524, + "grad_norm": 0.7428418397903442, + "learning_rate": 4.758186304634289e-05, + "loss": 0.4096, + "step": 4495 + }, + { + "epoch": 0.3342121876044413, + "grad_norm": 0.6855220794677734, + "learning_rate": 4.757540824805229e-05, + "loss": 0.4262, + "step": 4500 + }, + { + "epoch": 0.33458353447955735, + "grad_norm": 0.8831934332847595, + "learning_rate": 4.756894528527268e-05, + "loss": 0.429, + "step": 4505 + }, + { + "epoch": 0.3349548813546734, + "grad_norm": 0.7570902705192566, + "learning_rate": 4.756247416034143e-05, + "loss": 0.4068, + "step": 4510 + }, + { + "epoch": 0.33532622822978947, + "grad_norm": 0.8471874594688416, + "learning_rate": 4.755599487559884e-05, + "loss": 0.4319, + "step": 4515 + }, + { + "epoch": 0.3356975751049055, + "grad_norm": 1.5127962827682495, + "learning_rate": 4.754950743338817e-05, + "loss": 0.3788, + "step": 4520 + }, + { + "epoch": 0.33606892198002153, + "grad_norm": 0.8634641170501709, + "learning_rate": 4.754301183605563e-05, + "loss": 0.4188, + "step": 4525 + }, + { + "epoch": 0.33644026885513756, + "grad_norm": 0.9390023350715637, + "learning_rate": 4.753650808595038e-05, + "loss": 0.3925, + "step": 4530 + }, + { + "epoch": 0.33681161573025364, + "grad_norm": 1.002371072769165, + "learning_rate": 4.752999618542453e-05, + "loss": 0.4376, + "step": 4535 + }, + { + "epoch": 0.3371829626053697, + "grad_norm": 1.0945231914520264, + "learning_rate": 4.752347613683313e-05, + "loss": 0.4435, + "step": 4540 + }, + { + "epoch": 0.3375543094804857, + "grad_norm": 1.3020802736282349, + "learning_rate": 4.75169479425342e-05, + "loss": 0.4297, + "step": 4545 + }, + { + "epoch": 0.3379256563556018, + "grad_norm": 0.9557328820228577, + "learning_rate": 4.751041160488866e-05, + "loss": 0.4348, + "step": 4550 + }, + { + "epoch": 0.3382970032307178, + "grad_norm": 0.7302908301353455, + "learning_rate": 4.7503867126260426e-05, + "loss": 0.4073, + "step": 4555 + }, + { + "epoch": 0.33866835010583385, + "grad_norm": 0.9016184210777283, + "learning_rate": 4.749731450901632e-05, + "loss": 0.4192, + "step": 4560 + }, + { + "epoch": 0.3390396969809499, + "grad_norm": 1.1325968503952026, + "learning_rate": 4.749075375552614e-05, + "loss": 0.4138, + "step": 4565 + }, + { + "epoch": 0.33941104385606596, + "grad_norm": 0.7631894946098328, + "learning_rate": 4.7484184868162595e-05, + "loss": 0.3958, + "step": 4570 + }, + { + "epoch": 0.339782390731182, + "grad_norm": 0.9254847764968872, + "learning_rate": 4.7477607849301356e-05, + "loss": 0.4121, + "step": 4575 + }, + { + "epoch": 0.340153737606298, + "grad_norm": 0.8892576694488525, + "learning_rate": 4.747102270132103e-05, + "loss": 0.4099, + 
"step": 4580 + }, + { + "epoch": 0.3405250844814141, + "grad_norm": 1.2878713607788086, + "learning_rate": 4.7464429426603155e-05, + "loss": 0.4052, + "step": 4585 + }, + { + "epoch": 0.34089643135653014, + "grad_norm": 1.045745611190796, + "learning_rate": 4.7457828027532225e-05, + "loss": 0.4015, + "step": 4590 + }, + { + "epoch": 0.34126777823164617, + "grad_norm": 0.7936729192733765, + "learning_rate": 4.7451218506495656e-05, + "loss": 0.4264, + "step": 4595 + }, + { + "epoch": 0.34163912510676225, + "grad_norm": 1.1288927793502808, + "learning_rate": 4.744460086588382e-05, + "loss": 0.457, + "step": 4600 + }, + { + "epoch": 0.3420104719818783, + "grad_norm": 0.7576532363891602, + "learning_rate": 4.743797510809e-05, + "loss": 0.415, + "step": 4605 + }, + { + "epoch": 0.3423818188569943, + "grad_norm": 0.8422873616218567, + "learning_rate": 4.743134123551043e-05, + "loss": 0.4191, + "step": 4610 + }, + { + "epoch": 0.34275316573211034, + "grad_norm": 1.4032751321792603, + "learning_rate": 4.742469925054429e-05, + "loss": 0.4125, + "step": 4615 + }, + { + "epoch": 0.34312451260722643, + "grad_norm": 0.7778531312942505, + "learning_rate": 4.741804915559367e-05, + "loss": 0.4079, + "step": 4620 + }, + { + "epoch": 0.34349585948234246, + "grad_norm": 1.4162076711654663, + "learning_rate": 4.74113909530636e-05, + "loss": 0.4409, + "step": 4625 + }, + { + "epoch": 0.3438672063574585, + "grad_norm": 1.6434004306793213, + "learning_rate": 4.7404724645362056e-05, + "loss": 0.4113, + "step": 4630 + }, + { + "epoch": 0.3442385532325746, + "grad_norm": 0.9791908264160156, + "learning_rate": 4.7398050234899935e-05, + "loss": 0.4103, + "step": 4635 + }, + { + "epoch": 0.3446099001076906, + "grad_norm": 0.8746637105941772, + "learning_rate": 4.739136772409106e-05, + "loss": 0.4281, + "step": 4640 + }, + { + "epoch": 0.34498124698280663, + "grad_norm": 1.060185194015503, + "learning_rate": 4.73846771153522e-05, + "loss": 0.4098, + "step": 4645 + }, + { + "epoch": 0.34535259385792266, + "grad_norm": 0.9599495530128479, + "learning_rate": 4.737797841110302e-05, + "loss": 0.4181, + "step": 4650 + }, + { + "epoch": 0.34572394073303875, + "grad_norm": 1.1338365077972412, + "learning_rate": 4.737127161376616e-05, + "loss": 0.4168, + "step": 4655 + }, + { + "epoch": 0.3460952876081548, + "grad_norm": 1.0875575542449951, + "learning_rate": 4.7364556725767127e-05, + "loss": 0.4383, + "step": 4660 + }, + { + "epoch": 0.3464666344832708, + "grad_norm": 0.8746302127838135, + "learning_rate": 4.735783374953442e-05, + "loss": 0.4245, + "step": 4665 + }, + { + "epoch": 0.3468379813583869, + "grad_norm": 1.142159104347229, + "learning_rate": 4.7351102687499415e-05, + "loss": 0.4159, + "step": 4670 + }, + { + "epoch": 0.3472093282335029, + "grad_norm": 1.0561567544937134, + "learning_rate": 4.734436354209644e-05, + "loss": 0.4076, + "step": 4675 + }, + { + "epoch": 0.34758067510861895, + "grad_norm": 0.7917932868003845, + "learning_rate": 4.733761631576271e-05, + "loss": 0.4199, + "step": 4680 + }, + { + "epoch": 0.347952021983735, + "grad_norm": 0.8924756646156311, + "learning_rate": 4.733086101093842e-05, + "loss": 0.3952, + "step": 4685 + }, + { + "epoch": 0.34832336885885107, + "grad_norm": 0.9237497448921204, + "learning_rate": 4.7324097630066625e-05, + "loss": 0.4241, + "step": 4690 + }, + { + "epoch": 0.3486947157339671, + "grad_norm": 0.8856948614120483, + "learning_rate": 4.7317326175593345e-05, + "loss": 0.4269, + "step": 4695 + }, + { + "epoch": 0.3490660626090831, + "grad_norm": 0.9089455604553223, + 
"learning_rate": 4.73105466499675e-05, + "loss": 0.3952, + "step": 4700 + }, + { + "epoch": 0.3494374094841992, + "grad_norm": 0.948407769203186, + "learning_rate": 4.730375905564093e-05, + "loss": 0.421, + "step": 4705 + }, + { + "epoch": 0.34980875635931524, + "grad_norm": 0.658133864402771, + "learning_rate": 4.729696339506841e-05, + "loss": 0.4207, + "step": 4710 + }, + { + "epoch": 0.3501801032344313, + "grad_norm": 1.0339378118515015, + "learning_rate": 4.72901596707076e-05, + "loss": 0.4139, + "step": 4715 + }, + { + "epoch": 0.3505514501095473, + "grad_norm": 0.9219155311584473, + "learning_rate": 4.7283347885019106e-05, + "loss": 0.4305, + "step": 4720 + }, + { + "epoch": 0.3509227969846634, + "grad_norm": 1.0298707485198975, + "learning_rate": 4.727652804046644e-05, + "loss": 0.4122, + "step": 4725 + }, + { + "epoch": 0.3512941438597794, + "grad_norm": 0.7566863894462585, + "learning_rate": 4.7269700139516016e-05, + "loss": 0.433, + "step": 4730 + }, + { + "epoch": 0.35166549073489545, + "grad_norm": 0.851755678653717, + "learning_rate": 4.726286418463718e-05, + "loss": 0.4055, + "step": 4735 + }, + { + "epoch": 0.35203683761001153, + "grad_norm": 1.0216217041015625, + "learning_rate": 4.725602017830219e-05, + "loss": 0.431, + "step": 4740 + }, + { + "epoch": 0.35240818448512756, + "grad_norm": 0.917681097984314, + "learning_rate": 4.7249168122986196e-05, + "loss": 0.421, + "step": 4745 + }, + { + "epoch": 0.3527795313602436, + "grad_norm": 1.4520493745803833, + "learning_rate": 4.7242308021167274e-05, + "loss": 0.4081, + "step": 4750 + }, + { + "epoch": 0.3531508782353596, + "grad_norm": 0.8711174130439758, + "learning_rate": 4.7235439875326414e-05, + "loss": 0.4266, + "step": 4755 + }, + { + "epoch": 0.3535222251104757, + "grad_norm": 0.8265815377235413, + "learning_rate": 4.7228563687947504e-05, + "loss": 0.4126, + "step": 4760 + }, + { + "epoch": 0.35389357198559174, + "grad_norm": 0.9316338896751404, + "learning_rate": 4.722167946151735e-05, + "loss": 0.406, + "step": 4765 + }, + { + "epoch": 0.35426491886070777, + "grad_norm": 0.9665477871894836, + "learning_rate": 4.7214787198525656e-05, + "loss": 0.4188, + "step": 4770 + }, + { + "epoch": 0.35463626573582385, + "grad_norm": 0.9457529187202454, + "learning_rate": 4.720788690146504e-05, + "loss": 0.4481, + "step": 4775 + }, + { + "epoch": 0.3550076126109399, + "grad_norm": 0.9023556113243103, + "learning_rate": 4.7200978572831026e-05, + "loss": 0.4059, + "step": 4780 + }, + { + "epoch": 0.3553789594860559, + "grad_norm": 0.9159427881240845, + "learning_rate": 4.719406221512203e-05, + "loss": 0.4029, + "step": 4785 + }, + { + "epoch": 0.355750306361172, + "grad_norm": 0.7920024394989014, + "learning_rate": 4.718713783083939e-05, + "loss": 0.4157, + "step": 4790 + }, + { + "epoch": 0.356121653236288, + "grad_norm": 0.8898520469665527, + "learning_rate": 4.7180205422487335e-05, + "loss": 0.4111, + "step": 4795 + }, + { + "epoch": 0.35649300011140406, + "grad_norm": 1.610779881477356, + "learning_rate": 4.7173264992572994e-05, + "loss": 0.4243, + "step": 4800 + }, + { + "epoch": 0.3568643469865201, + "grad_norm": 1.1176823377609253, + "learning_rate": 4.7166316543606406e-05, + "loss": 0.4032, + "step": 4805 + }, + { + "epoch": 0.35723569386163617, + "grad_norm": 0.8275334239006042, + "learning_rate": 4.715936007810051e-05, + "loss": 0.3972, + "step": 4810 + }, + { + "epoch": 0.3576070407367522, + "grad_norm": 0.8289036750793457, + "learning_rate": 4.715239559857114e-05, + "loss": 0.4108, + "step": 4815 + }, + { + "epoch": 
0.35797838761186823, + "grad_norm": 0.825310230255127, + "learning_rate": 4.714542310753702e-05, + "loss": 0.4153, + "step": 4820 + }, + { + "epoch": 0.3583497344869843, + "grad_norm": 0.7950535416603088, + "learning_rate": 4.713844260751978e-05, + "loss": 0.4071, + "step": 4825 + }, + { + "epoch": 0.35872108136210035, + "grad_norm": 1.4191958904266357, + "learning_rate": 4.713145410104396e-05, + "loss": 0.4023, + "step": 4830 + }, + { + "epoch": 0.3590924282372164, + "grad_norm": 0.9486410021781921, + "learning_rate": 4.712445759063697e-05, + "loss": 0.4136, + "step": 4835 + }, + { + "epoch": 0.3594637751123324, + "grad_norm": 1.385170817375183, + "learning_rate": 4.711745307882914e-05, + "loss": 0.4231, + "step": 4840 + }, + { + "epoch": 0.3598351219874485, + "grad_norm": 0.7949105501174927, + "learning_rate": 4.711044056815366e-05, + "loss": 0.4139, + "step": 4845 + }, + { + "epoch": 0.3602064688625645, + "grad_norm": 0.8897971510887146, + "learning_rate": 4.710342006114665e-05, + "loss": 0.4398, + "step": 4850 + }, + { + "epoch": 0.36057781573768055, + "grad_norm": 1.0138311386108398, + "learning_rate": 4.70963915603471e-05, + "loss": 0.413, + "step": 4855 + }, + { + "epoch": 0.36094916261279664, + "grad_norm": 1.127977728843689, + "learning_rate": 4.708935506829689e-05, + "loss": 0.42, + "step": 4860 + }, + { + "epoch": 0.36132050948791267, + "grad_norm": 0.8652618527412415, + "learning_rate": 4.708231058754081e-05, + "loss": 0.4238, + "step": 4865 + }, + { + "epoch": 0.3616918563630287, + "grad_norm": 0.8667855858802795, + "learning_rate": 4.707525812062651e-05, + "loss": 0.3933, + "step": 4870 + }, + { + "epoch": 0.3620632032381447, + "grad_norm": 0.8137786388397217, + "learning_rate": 4.706819767010455e-05, + "loss": 0.4253, + "step": 4875 + }, + { + "epoch": 0.3624345501132608, + "grad_norm": 0.837216317653656, + "learning_rate": 4.7061129238528376e-05, + "loss": 0.4101, + "step": 4880 + }, + { + "epoch": 0.36280589698837684, + "grad_norm": 0.8621715903282166, + "learning_rate": 4.705405282845431e-05, + "loss": 0.3916, + "step": 4885 + }, + { + "epoch": 0.36317724386349287, + "grad_norm": 0.9200319647789001, + "learning_rate": 4.7046968442441566e-05, + "loss": 0.4334, + "step": 4890 + }, + { + "epoch": 0.36354859073860896, + "grad_norm": 0.9625046849250793, + "learning_rate": 4.7039876083052235e-05, + "loss": 0.4364, + "step": 4895 + }, + { + "epoch": 0.363919937613725, + "grad_norm": 0.9593774676322937, + "learning_rate": 4.703277575285131e-05, + "loss": 0.405, + "step": 4900 + }, + { + "epoch": 0.364291284488841, + "grad_norm": 0.9800950884819031, + "learning_rate": 4.702566745440665e-05, + "loss": 0.4131, + "step": 4905 + }, + { + "epoch": 0.36466263136395705, + "grad_norm": 0.808862566947937, + "learning_rate": 4.7018551190289e-05, + "loss": 0.402, + "step": 4910 + }, + { + "epoch": 0.36503397823907313, + "grad_norm": 0.8395322561264038, + "learning_rate": 4.701142696307198e-05, + "loss": 0.3979, + "step": 4915 + }, + { + "epoch": 0.36540532511418916, + "grad_norm": 0.9485448598861694, + "learning_rate": 4.70042947753321e-05, + "loss": 0.3904, + "step": 4920 + }, + { + "epoch": 0.3657766719893052, + "grad_norm": 1.0159146785736084, + "learning_rate": 4.6997154629648743e-05, + "loss": 0.4085, + "step": 4925 + }, + { + "epoch": 0.3661480188644213, + "grad_norm": 1.6333436965942383, + "learning_rate": 4.699000652860418e-05, + "loss": 0.404, + "step": 4930 + }, + { + "epoch": 0.3665193657395373, + "grad_norm": 0.9616270065307617, + "learning_rate": 4.6982850474783546e-05, + 
"loss": 0.4199, + "step": 4935 + }, + { + "epoch": 0.36689071261465334, + "grad_norm": 0.8749721646308899, + "learning_rate": 4.6975686470774854e-05, + "loss": 0.4358, + "step": 4940 + }, + { + "epoch": 0.3672620594897694, + "grad_norm": 0.7861535549163818, + "learning_rate": 4.6968514519169e-05, + "loss": 0.3966, + "step": 4945 + }, + { + "epoch": 0.36763340636488545, + "grad_norm": 0.9194499254226685, + "learning_rate": 4.696133462255975e-05, + "loss": 0.4247, + "step": 4950 + }, + { + "epoch": 0.3680047532400015, + "grad_norm": 0.8600870370864868, + "learning_rate": 4.695414678354374e-05, + "loss": 0.3944, + "step": 4955 + }, + { + "epoch": 0.3683761001151175, + "grad_norm": 0.7348113059997559, + "learning_rate": 4.6946951004720476e-05, + "loss": 0.416, + "step": 4960 + }, + { + "epoch": 0.3687474469902336, + "grad_norm": 0.9184551239013672, + "learning_rate": 4.693974728869235e-05, + "loss": 0.3881, + "step": 4965 + }, + { + "epoch": 0.3691187938653496, + "grad_norm": 0.8617504239082336, + "learning_rate": 4.693253563806462e-05, + "loss": 0.4277, + "step": 4970 + }, + { + "epoch": 0.36949014074046566, + "grad_norm": 0.8256643414497375, + "learning_rate": 4.69253160554454e-05, + "loss": 0.4238, + "step": 4975 + }, + { + "epoch": 0.36986148761558174, + "grad_norm": 0.9334145188331604, + "learning_rate": 4.691808854344568e-05, + "loss": 0.4065, + "step": 4980 + }, + { + "epoch": 0.37023283449069777, + "grad_norm": 1.2154117822647095, + "learning_rate": 4.691085310467932e-05, + "loss": 0.4309, + "step": 4985 + }, + { + "epoch": 0.3706041813658138, + "grad_norm": 1.2641429901123047, + "learning_rate": 4.690360974176306e-05, + "loss": 0.4277, + "step": 4990 + }, + { + "epoch": 0.37097552824092983, + "grad_norm": 1.158447265625, + "learning_rate": 4.6896358457316475e-05, + "loss": 0.4309, + "step": 4995 + }, + { + "epoch": 0.3713468751160459, + "grad_norm": 0.7376827001571655, + "learning_rate": 4.6889099253962035e-05, + "loss": 0.4129, + "step": 5000 + }, + { + "epoch": 0.37171822199116195, + "grad_norm": 0.9000616669654846, + "learning_rate": 4.688183213432504e-05, + "loss": 0.4151, + "step": 5005 + }, + { + "epoch": 0.372089568866278, + "grad_norm": 0.8609789609909058, + "learning_rate": 4.687455710103369e-05, + "loss": 0.4264, + "step": 5010 + }, + { + "epoch": 0.37246091574139406, + "grad_norm": 0.9813238978385925, + "learning_rate": 4.686727415671903e-05, + "loss": 0.4125, + "step": 5015 + }, + { + "epoch": 0.3728322626165101, + "grad_norm": 0.7004589438438416, + "learning_rate": 4.6859983304014974e-05, + "loss": 0.4049, + "step": 5020 + }, + { + "epoch": 0.3732036094916261, + "grad_norm": 0.7324821949005127, + "learning_rate": 4.685268454555827e-05, + "loss": 0.4229, + "step": 5025 + }, + { + "epoch": 0.37357495636674215, + "grad_norm": 0.8856626152992249, + "learning_rate": 4.684537788398855e-05, + "loss": 0.4101, + "step": 5030 + }, + { + "epoch": 0.37394630324185824, + "grad_norm": 1.6421722173690796, + "learning_rate": 4.683806332194831e-05, + "loss": 0.3939, + "step": 5035 + }, + { + "epoch": 0.37431765011697427, + "grad_norm": 0.740895688533783, + "learning_rate": 4.683074086208288e-05, + "loss": 0.4111, + "step": 5040 + }, + { + "epoch": 0.3746889969920903, + "grad_norm": 1.21027410030365, + "learning_rate": 4.682341050704046e-05, + "loss": 0.4195, + "step": 5045 + }, + { + "epoch": 0.3750603438672064, + "grad_norm": 0.8727134466171265, + "learning_rate": 4.68160722594721e-05, + "loss": 0.4091, + "step": 5050 + }, + { + "epoch": 0.3754316907423224, + "grad_norm": 
0.8802042603492737, + "learning_rate": 4.680872612203171e-05, + "loss": 0.3945, + "step": 5055 + }, + { + "epoch": 0.37580303761743844, + "grad_norm": 0.8701076507568359, + "learning_rate": 4.680137209737606e-05, + "loss": 0.3914, + "step": 5060 + }, + { + "epoch": 0.37617438449255447, + "grad_norm": 0.9685407280921936, + "learning_rate": 4.679401018816475e-05, + "loss": 0.4192, + "step": 5065 + }, + { + "epoch": 0.37654573136767056, + "grad_norm": 0.839486300945282, + "learning_rate": 4.6786640397060246e-05, + "loss": 0.4029, + "step": 5070 + }, + { + "epoch": 0.3769170782427866, + "grad_norm": 0.8863376975059509, + "learning_rate": 4.6779262726727866e-05, + "loss": 0.3989, + "step": 5075 + }, + { + "epoch": 0.3772884251179026, + "grad_norm": 0.9725920557975769, + "learning_rate": 4.677187717983578e-05, + "loss": 0.4042, + "step": 5080 + }, + { + "epoch": 0.3776597719930187, + "grad_norm": 0.9129449129104614, + "learning_rate": 4.6764483759055e-05, + "loss": 0.4086, + "step": 5085 + }, + { + "epoch": 0.37803111886813473, + "grad_norm": 1.042917251586914, + "learning_rate": 4.675708246705938e-05, + "loss": 0.3807, + "step": 5090 + }, + { + "epoch": 0.37840246574325076, + "grad_norm": 0.8052343130111694, + "learning_rate": 4.674967330652562e-05, + "loss": 0.4202, + "step": 5095 + }, + { + "epoch": 0.3787738126183668, + "grad_norm": 0.8487399220466614, + "learning_rate": 4.67422562801333e-05, + "loss": 0.4092, + "step": 5100 + }, + { + "epoch": 0.3791451594934829, + "grad_norm": 1.2970117330551147, + "learning_rate": 4.6734831390564795e-05, + "loss": 0.4255, + "step": 5105 + }, + { + "epoch": 0.3795165063685989, + "grad_norm": 0.9775636196136475, + "learning_rate": 4.672739864050535e-05, + "loss": 0.4067, + "step": 5110 + }, + { + "epoch": 0.37988785324371493, + "grad_norm": 1.285401463508606, + "learning_rate": 4.671995803264306e-05, + "loss": 0.4423, + "step": 5115 + }, + { + "epoch": 0.380259200118831, + "grad_norm": 0.7425358891487122, + "learning_rate": 4.671250956966885e-05, + "loss": 0.4119, + "step": 5120 + }, + { + "epoch": 0.38063054699394705, + "grad_norm": 0.8824949860572815, + "learning_rate": 4.670505325427648e-05, + "loss": 0.4094, + "step": 5125 + }, + { + "epoch": 0.3810018938690631, + "grad_norm": 0.8295382261276245, + "learning_rate": 4.669758908916255e-05, + "loss": 0.3983, + "step": 5130 + }, + { + "epoch": 0.38137324074417916, + "grad_norm": 0.7965675592422485, + "learning_rate": 4.6690117077026515e-05, + "loss": 0.4033, + "step": 5135 + }, + { + "epoch": 0.3817445876192952, + "grad_norm": 0.7937360405921936, + "learning_rate": 4.6682637220570666e-05, + "loss": 0.3942, + "step": 5140 + }, + { + "epoch": 0.3821159344944112, + "grad_norm": 0.892683744430542, + "learning_rate": 4.6675149522500124e-05, + "loss": 0.4026, + "step": 5145 + }, + { + "epoch": 0.38248728136952725, + "grad_norm": 0.8494230508804321, + "learning_rate": 4.666765398552283e-05, + "loss": 0.4035, + "step": 5150 + }, + { + "epoch": 0.38285862824464334, + "grad_norm": 1.0645548105239868, + "learning_rate": 4.666015061234958e-05, + "loss": 0.4324, + "step": 5155 + }, + { + "epoch": 0.38322997511975937, + "grad_norm": 1.1099061965942383, + "learning_rate": 4.665263940569401e-05, + "loss": 0.4147, + "step": 5160 + }, + { + "epoch": 0.3836013219948754, + "grad_norm": 0.864147961139679, + "learning_rate": 4.664512036827258e-05, + "loss": 0.4311, + "step": 5165 + }, + { + "epoch": 0.3839726688699915, + "grad_norm": 0.6989151835441589, + "learning_rate": 4.663759350280456e-05, + "loss": 0.4137, + "step": 5170 + 
}, + { + "epoch": 0.3843440157451075, + "grad_norm": 0.997937798500061, + "learning_rate": 4.663005881201209e-05, + "loss": 0.3688, + "step": 5175 + }, + { + "epoch": 0.38471536262022354, + "grad_norm": 0.9270450472831726, + "learning_rate": 4.6622516298620124e-05, + "loss": 0.4192, + "step": 5180 + }, + { + "epoch": 0.3850867094953396, + "grad_norm": 0.8299030661582947, + "learning_rate": 4.661496596535643e-05, + "loss": 0.4072, + "step": 5185 + }, + { + "epoch": 0.38545805637045566, + "grad_norm": 1.146391749382019, + "learning_rate": 4.660740781495162e-05, + "loss": 0.3983, + "step": 5190 + }, + { + "epoch": 0.3858294032455717, + "grad_norm": 0.8312609791755676, + "learning_rate": 4.659984185013913e-05, + "loss": 0.4204, + "step": 5195 + }, + { + "epoch": 0.3862007501206877, + "grad_norm": 0.8791617751121521, + "learning_rate": 4.659226807365523e-05, + "loss": 0.4005, + "step": 5200 + }, + { + "epoch": 0.3865720969958038, + "grad_norm": 0.8427607417106628, + "learning_rate": 4.6584686488239e-05, + "loss": 0.3986, + "step": 5205 + }, + { + "epoch": 0.38694344387091983, + "grad_norm": 0.7755858302116394, + "learning_rate": 4.657709709663237e-05, + "loss": 0.41, + "step": 5210 + }, + { + "epoch": 0.38731479074603586, + "grad_norm": 0.9663941264152527, + "learning_rate": 4.656949990158004e-05, + "loss": 0.3973, + "step": 5215 + }, + { + "epoch": 0.3876861376211519, + "grad_norm": 0.8808470964431763, + "learning_rate": 4.656189490582959e-05, + "loss": 0.4142, + "step": 5220 + }, + { + "epoch": 0.388057484496268, + "grad_norm": 0.835097074508667, + "learning_rate": 4.65542821121314e-05, + "loss": 0.3907, + "step": 5225 + }, + { + "epoch": 0.388428831371384, + "grad_norm": 1.165091872215271, + "learning_rate": 4.654666152323866e-05, + "loss": 0.4102, + "step": 5230 + }, + { + "epoch": 0.38880017824650004, + "grad_norm": 0.9790161848068237, + "learning_rate": 4.653903314190738e-05, + "loss": 0.4125, + "step": 5235 + }, + { + "epoch": 0.3891715251216161, + "grad_norm": 0.8133743405342102, + "learning_rate": 4.653139697089641e-05, + "loss": 0.417, + "step": 5240 + }, + { + "epoch": 0.38954287199673215, + "grad_norm": 0.9274176955223083, + "learning_rate": 4.652375301296739e-05, + "loss": 0.4218, + "step": 5245 + }, + { + "epoch": 0.3899142188718482, + "grad_norm": 0.8492559194564819, + "learning_rate": 4.6516101270884805e-05, + "loss": 0.3966, + "step": 5250 + }, + { + "epoch": 0.3902855657469642, + "grad_norm": 0.9852241277694702, + "learning_rate": 4.650844174741592e-05, + "loss": 0.4035, + "step": 5255 + }, + { + "epoch": 0.3906569126220803, + "grad_norm": 0.9053683876991272, + "learning_rate": 4.6500774445330854e-05, + "loss": 0.3954, + "step": 5260 + }, + { + "epoch": 0.39102825949719633, + "grad_norm": 1.2327396869659424, + "learning_rate": 4.64930993674025e-05, + "loss": 0.3793, + "step": 5265 + }, + { + "epoch": 0.39139960637231236, + "grad_norm": 0.9179367423057556, + "learning_rate": 4.648541651640659e-05, + "loss": 0.4146, + "step": 5270 + }, + { + "epoch": 0.39177095324742844, + "grad_norm": 0.8724064826965332, + "learning_rate": 4.647772589512165e-05, + "loss": 0.4214, + "step": 5275 + }, + { + "epoch": 0.3921423001225445, + "grad_norm": 0.8555846810340881, + "learning_rate": 4.6470027506329036e-05, + "loss": 0.4041, + "step": 5280 + }, + { + "epoch": 0.3925136469976605, + "grad_norm": 0.804630696773529, + "learning_rate": 4.64623213528129e-05, + "loss": 0.4092, + "step": 5285 + }, + { + "epoch": 0.39288499387277653, + "grad_norm": 1.16206693649292, + "learning_rate": 
4.64546074373602e-05, + "loss": 0.4095, + "step": 5290 + }, + { + "epoch": 0.3932563407478926, + "grad_norm": 1.0643155574798584, + "learning_rate": 4.644688576276071e-05, + "loss": 0.4197, + "step": 5295 + }, + { + "epoch": 0.39362768762300865, + "grad_norm": 0.9959713220596313, + "learning_rate": 4.643915633180699e-05, + "loss": 0.4022, + "step": 5300 + }, + { + "epoch": 0.3939990344981247, + "grad_norm": 0.8475741744041443, + "learning_rate": 4.6431419147294445e-05, + "loss": 0.4099, + "step": 5305 + }, + { + "epoch": 0.39437038137324076, + "grad_norm": 0.7796865701675415, + "learning_rate": 4.642367421202124e-05, + "loss": 0.4044, + "step": 5310 + }, + { + "epoch": 0.3947417282483568, + "grad_norm": 0.9301022291183472, + "learning_rate": 4.641592152878837e-05, + "loss": 0.4104, + "step": 5315 + }, + { + "epoch": 0.3951130751234728, + "grad_norm": 1.0044193267822266, + "learning_rate": 4.640816110039962e-05, + "loss": 0.4028, + "step": 5320 + }, + { + "epoch": 0.3954844219985889, + "grad_norm": 1.0422751903533936, + "learning_rate": 4.6400392929661596e-05, + "loss": 0.3784, + "step": 5325 + }, + { + "epoch": 0.39585576887370494, + "grad_norm": 0.7252130508422852, + "learning_rate": 4.6392617019383665e-05, + "loss": 0.4118, + "step": 5330 + }, + { + "epoch": 0.39622711574882097, + "grad_norm": 1.1372671127319336, + "learning_rate": 4.638483337237803e-05, + "loss": 0.4297, + "step": 5335 + }, + { + "epoch": 0.396598462623937, + "grad_norm": 0.917350709438324, + "learning_rate": 4.6377041991459694e-05, + "loss": 0.4082, + "step": 5340 + }, + { + "epoch": 0.3969698094990531, + "grad_norm": 0.8461329340934753, + "learning_rate": 4.636924287944641e-05, + "loss": 0.4239, + "step": 5345 + }, + { + "epoch": 0.3973411563741691, + "grad_norm": 0.823368489742279, + "learning_rate": 4.636143603915878e-05, + "loss": 0.4517, + "step": 5350 + }, + { + "epoch": 0.39771250324928514, + "grad_norm": 0.99635910987854, + "learning_rate": 4.635362147342018e-05, + "loss": 0.4001, + "step": 5355 + }, + { + "epoch": 0.39808385012440123, + "grad_norm": 0.8141667246818542, + "learning_rate": 4.634579918505677e-05, + "loss": 0.4269, + "step": 5360 + }, + { + "epoch": 0.39845519699951726, + "grad_norm": 0.9809877276420593, + "learning_rate": 4.633796917689751e-05, + "loss": 0.4372, + "step": 5365 + }, + { + "epoch": 0.3988265438746333, + "grad_norm": 0.8991804122924805, + "learning_rate": 4.6330131451774174e-05, + "loss": 0.4092, + "step": 5370 + }, + { + "epoch": 0.3991978907497493, + "grad_norm": 0.8061063885688782, + "learning_rate": 4.6322286012521285e-05, + "loss": 0.4262, + "step": 5375 + }, + { + "epoch": 0.3995692376248654, + "grad_norm": 0.8821228742599487, + "learning_rate": 4.63144328619762e-05, + "loss": 0.4151, + "step": 5380 + }, + { + "epoch": 0.39994058449998143, + "grad_norm": 1.0889192819595337, + "learning_rate": 4.630657200297902e-05, + "loss": 0.4131, + "step": 5385 + }, + { + "epoch": 0.40031193137509746, + "grad_norm": 1.7080798149108887, + "learning_rate": 4.629870343837268e-05, + "loss": 0.409, + "step": 5390 + }, + { + "epoch": 0.40068327825021355, + "grad_norm": 1.1223076581954956, + "learning_rate": 4.629082717100286e-05, + "loss": 0.4065, + "step": 5395 + }, + { + "epoch": 0.4010546251253296, + "grad_norm": 0.7243461608886719, + "learning_rate": 4.628294320371805e-05, + "loss": 0.3867, + "step": 5400 + }, + { + "epoch": 0.4014259720004456, + "grad_norm": 0.7852326035499573, + "learning_rate": 4.6275051539369526e-05, + "loss": 0.3952, + "step": 5405 + }, + { + "epoch": 
0.40179731887556164, + "grad_norm": 0.8796769976615906, + "learning_rate": 4.626715218081134e-05, + "loss": 0.4228, + "step": 5410 + }, + { + "epoch": 0.4021686657506777, + "grad_norm": 0.9907186031341553, + "learning_rate": 4.625924513090031e-05, + "loss": 0.4161, + "step": 5415 + }, + { + "epoch": 0.40254001262579375, + "grad_norm": 1.3190866708755493, + "learning_rate": 4.625133039249607e-05, + "loss": 0.4085, + "step": 5420 + }, + { + "epoch": 0.4029113595009098, + "grad_norm": 1.0041900873184204, + "learning_rate": 4.624340796846102e-05, + "loss": 0.4146, + "step": 5425 + }, + { + "epoch": 0.40328270637602587, + "grad_norm": 0.9058469533920288, + "learning_rate": 4.6235477861660324e-05, + "loss": 0.4104, + "step": 5430 + }, + { + "epoch": 0.4036540532511419, + "grad_norm": 0.8792639970779419, + "learning_rate": 4.622754007496195e-05, + "loss": 0.4051, + "step": 5435 + }, + { + "epoch": 0.4040254001262579, + "grad_norm": 1.6717143058776855, + "learning_rate": 4.6219594611236624e-05, + "loss": 0.4286, + "step": 5440 + }, + { + "epoch": 0.40439674700137396, + "grad_norm": 3.575216293334961, + "learning_rate": 4.621164147335786e-05, + "loss": 0.4006, + "step": 5445 + }, + { + "epoch": 0.40476809387649004, + "grad_norm": 1.1020781993865967, + "learning_rate": 4.6203680664201934e-05, + "loss": 0.4101, + "step": 5450 + }, + { + "epoch": 0.4051394407516061, + "grad_norm": 0.995681643486023, + "learning_rate": 4.619571218664791e-05, + "loss": 0.4096, + "step": 5455 + }, + { + "epoch": 0.4055107876267221, + "grad_norm": 1.0068079233169556, + "learning_rate": 4.6187736043577626e-05, + "loss": 0.4048, + "step": 5460 + }, + { + "epoch": 0.4058821345018382, + "grad_norm": 0.9563468098640442, + "learning_rate": 4.617975223787568e-05, + "loss": 0.4009, + "step": 5465 + }, + { + "epoch": 0.4062534813769542, + "grad_norm": 1.1841391324996948, + "learning_rate": 4.617176077242945e-05, + "loss": 0.4124, + "step": 5470 + }, + { + "epoch": 0.40662482825207025, + "grad_norm": 0.7610587477684021, + "learning_rate": 4.616376165012909e-05, + "loss": 0.3878, + "step": 5475 + }, + { + "epoch": 0.4069961751271863, + "grad_norm": 0.8032383918762207, + "learning_rate": 4.615575487386751e-05, + "loss": 0.393, + "step": 5480 + }, + { + "epoch": 0.40736752200230236, + "grad_norm": 1.00172758102417, + "learning_rate": 4.614774044654038e-05, + "loss": 0.4099, + "step": 5485 + }, + { + "epoch": 0.4077388688774184, + "grad_norm": 0.9631612300872803, + "learning_rate": 4.613971837104617e-05, + "loss": 0.4009, + "step": 5490 + }, + { + "epoch": 0.4081102157525344, + "grad_norm": 0.8837024569511414, + "learning_rate": 4.6131688650286084e-05, + "loss": 0.4009, + "step": 5495 + }, + { + "epoch": 0.4084815626276505, + "grad_norm": 1.4335302114486694, + "learning_rate": 4.612365128716412e-05, + "loss": 0.4, + "step": 5500 + }, + { + "epoch": 0.40885290950276654, + "grad_norm": 0.8482183814048767, + "learning_rate": 4.6115606284587e-05, + "loss": 0.4027, + "step": 5505 + }, + { + "epoch": 0.40922425637788257, + "grad_norm": 0.9031336903572083, + "learning_rate": 4.610755364546425e-05, + "loss": 0.4131, + "step": 5510 + }, + { + "epoch": 0.40959560325299865, + "grad_norm": 0.7115298509597778, + "learning_rate": 4.609949337270813e-05, + "loss": 0.408, + "step": 5515 + }, + { + "epoch": 0.4099669501281147, + "grad_norm": 0.9542083144187927, + "learning_rate": 4.609142546923369e-05, + "loss": 0.3955, + "step": 5520 + }, + { + "epoch": 0.4103382970032307, + "grad_norm": 0.8037524223327637, + "learning_rate": 4.608334993795869e-05, + 
"loss": 0.4065, + "step": 5525 + }, + { + "epoch": 0.41070964387834674, + "grad_norm": 0.8711167573928833, + "learning_rate": 4.6075266781803695e-05, + "loss": 0.4007, + "step": 5530 + }, + { + "epoch": 0.4110809907534628, + "grad_norm": 1.1807262897491455, + "learning_rate": 4.6067176003692006e-05, + "loss": 0.3878, + "step": 5535 + }, + { + "epoch": 0.41145233762857886, + "grad_norm": 1.0701298713684082, + "learning_rate": 4.60590776065497e-05, + "loss": 0.3864, + "step": 5540 + }, + { + "epoch": 0.4118236845036949, + "grad_norm": 1.0017831325531006, + "learning_rate": 4.605097159330558e-05, + "loss": 0.4227, + "step": 5545 + }, + { + "epoch": 0.41219503137881097, + "grad_norm": 0.8837273716926575, + "learning_rate": 4.6042857966891235e-05, + "loss": 0.4079, + "step": 5550 + }, + { + "epoch": 0.412566378253927, + "grad_norm": 0.8195706605911255, + "learning_rate": 4.6034736730240975e-05, + "loss": 0.4209, + "step": 5555 + }, + { + "epoch": 0.41293772512904303, + "grad_norm": 0.9010388255119324, + "learning_rate": 4.6026607886291884e-05, + "loss": 0.3947, + "step": 5560 + }, + { + "epoch": 0.41330907200415906, + "grad_norm": 0.9195128679275513, + "learning_rate": 4.60184714379838e-05, + "loss": 0.4229, + "step": 5565 + }, + { + "epoch": 0.41368041887927515, + "grad_norm": 1.1202746629714966, + "learning_rate": 4.6010327388259286e-05, + "loss": 0.4052, + "step": 5570 + }, + { + "epoch": 0.4140517657543912, + "grad_norm": 0.7508527636528015, + "learning_rate": 4.600217574006369e-05, + "loss": 0.394, + "step": 5575 + }, + { + "epoch": 0.4144231126295072, + "grad_norm": 1.0335317850112915, + "learning_rate": 4.599401649634507e-05, + "loss": 0.4025, + "step": 5580 + }, + { + "epoch": 0.4147944595046233, + "grad_norm": 0.7544407844543457, + "learning_rate": 4.598584966005426e-05, + "loss": 0.4082, + "step": 5585 + }, + { + "epoch": 0.4151658063797393, + "grad_norm": 1.0011963844299316, + "learning_rate": 4.597767523414483e-05, + "loss": 0.4141, + "step": 5590 + }, + { + "epoch": 0.41553715325485535, + "grad_norm": 0.7946156859397888, + "learning_rate": 4.5969493221573114e-05, + "loss": 0.4134, + "step": 5595 + }, + { + "epoch": 0.4159085001299714, + "grad_norm": 0.9330390691757202, + "learning_rate": 4.596130362529813e-05, + "loss": 0.4074, + "step": 5600 + }, + { + "epoch": 0.41627984700508747, + "grad_norm": 0.9563977718353271, + "learning_rate": 4.59531064482817e-05, + "loss": 0.4138, + "step": 5605 + }, + { + "epoch": 0.4166511938802035, + "grad_norm": 0.8964327573776245, + "learning_rate": 4.5944901693488375e-05, + "loss": 0.3875, + "step": 5610 + }, + { + "epoch": 0.4170225407553195, + "grad_norm": 0.9961819052696228, + "learning_rate": 4.593668936388543e-05, + "loss": 0.4063, + "step": 5615 + }, + { + "epoch": 0.4173938876304356, + "grad_norm": 1.026484489440918, + "learning_rate": 4.5928469462442883e-05, + "loss": 0.4261, + "step": 5620 + }, + { + "epoch": 0.41776523450555164, + "grad_norm": 0.7321872711181641, + "learning_rate": 4.592024199213349e-05, + "loss": 0.4019, + "step": 5625 + }, + { + "epoch": 0.41813658138066767, + "grad_norm": 1.0808043479919434, + "learning_rate": 4.5912006955932775e-05, + "loss": 0.4064, + "step": 5630 + }, + { + "epoch": 0.4185079282557837, + "grad_norm": 0.987091600894928, + "learning_rate": 4.590376435681895e-05, + "loss": 0.4116, + "step": 5635 + }, + { + "epoch": 0.4188792751308998, + "grad_norm": 0.9258447289466858, + "learning_rate": 4.589551419777299e-05, + "loss": 0.4051, + "step": 5640 + }, + { + "epoch": 0.4192506220060158, + "grad_norm": 
0.9010351896286011, + "learning_rate": 4.588725648177861e-05, + "loss": 0.4052, + "step": 5645 + }, + { + "epoch": 0.41962196888113185, + "grad_norm": 1.3273558616638184, + "learning_rate": 4.5878991211822226e-05, + "loss": 0.4098, + "step": 5650 + }, + { + "epoch": 0.41999331575624793, + "grad_norm": 1.0378541946411133, + "learning_rate": 4.587071839089302e-05, + "loss": 0.4009, + "step": 5655 + }, + { + "epoch": 0.42036466263136396, + "grad_norm": 1.0806586742401123, + "learning_rate": 4.586243802198289e-05, + "loss": 0.3911, + "step": 5660 + }, + { + "epoch": 0.42073600950648, + "grad_norm": 1.4784213304519653, + "learning_rate": 4.585415010808646e-05, + "loss": 0.3955, + "step": 5665 + }, + { + "epoch": 0.4211073563815961, + "grad_norm": 1.1250252723693848, + "learning_rate": 4.58458546522011e-05, + "loss": 0.4094, + "step": 5670 + }, + { + "epoch": 0.4214787032567121, + "grad_norm": 1.0189043283462524, + "learning_rate": 4.583755165732689e-05, + "loss": 0.4241, + "step": 5675 + }, + { + "epoch": 0.42185005013182814, + "grad_norm": 1.0218403339385986, + "learning_rate": 4.582924112646664e-05, + "loss": 0.3957, + "step": 5680 + }, + { + "epoch": 0.42222139700694417, + "grad_norm": 1.075886607170105, + "learning_rate": 4.582092306262588e-05, + "loss": 0.3923, + "step": 5685 + }, + { + "epoch": 0.42259274388206025, + "grad_norm": 1.131255030632019, + "learning_rate": 4.581259746881288e-05, + "loss": 0.4157, + "step": 5690 + }, + { + "epoch": 0.4229640907571763, + "grad_norm": 0.8696762323379517, + "learning_rate": 4.580426434803864e-05, + "loss": 0.3969, + "step": 5695 + }, + { + "epoch": 0.4233354376322923, + "grad_norm": 0.8163397312164307, + "learning_rate": 4.579592370331684e-05, + "loss": 0.3903, + "step": 5700 + }, + { + "epoch": 0.4237067845074084, + "grad_norm": 0.7770934700965881, + "learning_rate": 4.578757553766392e-05, + "loss": 0.3879, + "step": 5705 + }, + { + "epoch": 0.4240781313825244, + "grad_norm": 1.1027836799621582, + "learning_rate": 4.5779219854099034e-05, + "loss": 0.3994, + "step": 5710 + }, + { + "epoch": 0.42444947825764046, + "grad_norm": 1.2394556999206543, + "learning_rate": 4.577085665564404e-05, + "loss": 0.4253, + "step": 5715 + }, + { + "epoch": 0.4248208251327565, + "grad_norm": 0.9389461874961853, + "learning_rate": 4.576248594532354e-05, + "loss": 0.4327, + "step": 5720 + }, + { + "epoch": 0.42519217200787257, + "grad_norm": 1.0077425241470337, + "learning_rate": 4.575410772616481e-05, + "loss": 0.4028, + "step": 5725 + }, + { + "epoch": 0.4255635188829886, + "grad_norm": 1.0668855905532837, + "learning_rate": 4.5745722001197886e-05, + "loss": 0.3972, + "step": 5730 + }, + { + "epoch": 0.42593486575810463, + "grad_norm": 0.8116600513458252, + "learning_rate": 4.57373287734555e-05, + "loss": 0.4024, + "step": 5735 + }, + { + "epoch": 0.4263062126332207, + "grad_norm": 0.8789268136024475, + "learning_rate": 4.5728928045973084e-05, + "loss": 0.424, + "step": 5740 + }, + { + "epoch": 0.42667755950833675, + "grad_norm": 1.0181138515472412, + "learning_rate": 4.5720519821788806e-05, + "loss": 0.4101, + "step": 5745 + }, + { + "epoch": 0.4270489063834528, + "grad_norm": 0.6793472170829773, + "learning_rate": 4.571210410394353e-05, + "loss": 0.3875, + "step": 5750 + }, + { + "epoch": 0.4274202532585688, + "grad_norm": 0.7746472954750061, + "learning_rate": 4.570368089548084e-05, + "loss": 0.4065, + "step": 5755 + }, + { + "epoch": 0.4277916001336849, + "grad_norm": 0.8889456391334534, + "learning_rate": 4.569525019944701e-05, + "loss": 0.4063, + "step": 5760 + 
}, + { + "epoch": 0.4281629470088009, + "grad_norm": 0.8817050457000732, + "learning_rate": 4.568681201889107e-05, + "loss": 0.4093, + "step": 5765 + }, + { + "epoch": 0.42853429388391695, + "grad_norm": 0.9229766726493835, + "learning_rate": 4.567836635686468e-05, + "loss": 0.3724, + "step": 5770 + }, + { + "epoch": 0.42890564075903304, + "grad_norm": 1.0542956590652466, + "learning_rate": 4.5669913216422275e-05, + "loss": 0.408, + "step": 5775 + }, + { + "epoch": 0.42927698763414907, + "grad_norm": 0.7305135130882263, + "learning_rate": 4.566145260062096e-05, + "loss": 0.3989, + "step": 5780 + }, + { + "epoch": 0.4296483345092651, + "grad_norm": 1.04397451877594, + "learning_rate": 4.565298451252055e-05, + "loss": 0.4075, + "step": 5785 + }, + { + "epoch": 0.4300196813843811, + "grad_norm": 1.1163504123687744, + "learning_rate": 4.5644508955183566e-05, + "loss": 0.3953, + "step": 5790 + }, + { + "epoch": 0.4303910282594972, + "grad_norm": 0.823851466178894, + "learning_rate": 4.5636025931675225e-05, + "loss": 0.4168, + "step": 5795 + }, + { + "epoch": 0.43076237513461324, + "grad_norm": 0.857819676399231, + "learning_rate": 4.562753544506345e-05, + "loss": 0.4047, + "step": 5800 + }, + { + "epoch": 0.43113372200972927, + "grad_norm": 0.8693041205406189, + "learning_rate": 4.5619037498418865e-05, + "loss": 0.4103, + "step": 5805 + }, + { + "epoch": 0.43150506888484536, + "grad_norm": 1.0778625011444092, + "learning_rate": 4.561053209481478e-05, + "loss": 0.4136, + "step": 5810 + }, + { + "epoch": 0.4318764157599614, + "grad_norm": 0.9446026086807251, + "learning_rate": 4.5602019237327214e-05, + "loss": 0.4157, + "step": 5815 + }, + { + "epoch": 0.4322477626350774, + "grad_norm": 1.0272636413574219, + "learning_rate": 4.5593498929034875e-05, + "loss": 0.3854, + "step": 5820 + }, + { + "epoch": 0.43261910951019344, + "grad_norm": 1.2057569026947021, + "learning_rate": 4.5584971173019163e-05, + "loss": 0.3711, + "step": 5825 + }, + { + "epoch": 0.43299045638530953, + "grad_norm": 0.9038252234458923, + "learning_rate": 4.5576435972364186e-05, + "loss": 0.4098, + "step": 5830 + }, + { + "epoch": 0.43336180326042556, + "grad_norm": 0.7703349590301514, + "learning_rate": 4.556789333015673e-05, + "loss": 0.4069, + "step": 5835 + }, + { + "epoch": 0.4337331501355416, + "grad_norm": 0.9097495079040527, + "learning_rate": 4.5559343249486264e-05, + "loss": 0.4208, + "step": 5840 + }, + { + "epoch": 0.4341044970106577, + "grad_norm": 1.0321422815322876, + "learning_rate": 4.555078573344498e-05, + "loss": 0.3767, + "step": 5845 + }, + { + "epoch": 0.4344758438857737, + "grad_norm": 0.9304498434066772, + "learning_rate": 4.554222078512772e-05, + "loss": 0.403, + "step": 5850 + }, + { + "epoch": 0.43484719076088973, + "grad_norm": 0.9338169693946838, + "learning_rate": 4.553364840763204e-05, + "loss": 0.4111, + "step": 5855 + }, + { + "epoch": 0.4352185376360058, + "grad_norm": 0.8401198387145996, + "learning_rate": 4.5525068604058193e-05, + "loss": 0.3897, + "step": 5860 + }, + { + "epoch": 0.43558988451112185, + "grad_norm": 1.1248438358306885, + "learning_rate": 4.551648137750908e-05, + "loss": 0.3891, + "step": 5865 + }, + { + "epoch": 0.4359612313862379, + "grad_norm": 0.7891606092453003, + "learning_rate": 4.550788673109031e-05, + "loss": 0.399, + "step": 5870 + }, + { + "epoch": 0.4363325782613539, + "grad_norm": 0.7357394695281982, + "learning_rate": 4.549928466791017e-05, + "loss": 0.4071, + "step": 5875 + }, + { + "epoch": 0.43670392513647, + "grad_norm": 1.5309947729110718, + "learning_rate": 
4.5490675191079635e-05, + "loss": 0.4035, + "step": 5880 + }, + { + "epoch": 0.437075272011586, + "grad_norm": 0.8977563977241516, + "learning_rate": 4.548205830371236e-05, + "loss": 0.4206, + "step": 5885 + }, + { + "epoch": 0.43744661888670205, + "grad_norm": 0.9142330288887024, + "learning_rate": 4.5473434008924675e-05, + "loss": 0.4124, + "step": 5890 + }, + { + "epoch": 0.43781796576181814, + "grad_norm": 0.9033256769180298, + "learning_rate": 4.5464802309835597e-05, + "loss": 0.393, + "step": 5895 + }, + { + "epoch": 0.43818931263693417, + "grad_norm": 1.1107105016708374, + "learning_rate": 4.5456163209566806e-05, + "loss": 0.4233, + "step": 5900 + }, + { + "epoch": 0.4385606595120502, + "grad_norm": 1.1025406122207642, + "learning_rate": 4.544751671124267e-05, + "loss": 0.4062, + "step": 5905 + }, + { + "epoch": 0.43893200638716623, + "grad_norm": 0.7960245013237, + "learning_rate": 4.543886281799023e-05, + "loss": 0.4196, + "step": 5910 + }, + { + "epoch": 0.4393033532622823, + "grad_norm": 0.8743290901184082, + "learning_rate": 4.543020153293922e-05, + "loss": 0.3964, + "step": 5915 + }, + { + "epoch": 0.43967470013739834, + "grad_norm": 0.8431464433670044, + "learning_rate": 4.5421532859222e-05, + "loss": 0.4154, + "step": 5920 + }, + { + "epoch": 0.4400460470125144, + "grad_norm": 1.0902396440505981, + "learning_rate": 4.541285679997365e-05, + "loss": 0.4144, + "step": 5925 + }, + { + "epoch": 0.44041739388763046, + "grad_norm": 0.6638891100883484, + "learning_rate": 4.54041733583319e-05, + "loss": 0.3975, + "step": 5930 + }, + { + "epoch": 0.4407887407627465, + "grad_norm": 0.7656953930854797, + "learning_rate": 4.5395482537437154e-05, + "loss": 0.3902, + "step": 5935 + }, + { + "epoch": 0.4411600876378625, + "grad_norm": 0.8354926109313965, + "learning_rate": 4.538678434043246e-05, + "loss": 0.3898, + "step": 5940 + }, + { + "epoch": 0.44153143451297855, + "grad_norm": 1.043817400932312, + "learning_rate": 4.537807877046359e-05, + "loss": 0.399, + "step": 5945 + }, + { + "epoch": 0.44190278138809463, + "grad_norm": 1.3780771493911743, + "learning_rate": 4.536936583067895e-05, + "loss": 0.412, + "step": 5950 + }, + { + "epoch": 0.44227412826321066, + "grad_norm": 0.8868488669395447, + "learning_rate": 4.5360645524229576e-05, + "loss": 0.4185, + "step": 5955 + }, + { + "epoch": 0.4426454751383267, + "grad_norm": 0.7088744044303894, + "learning_rate": 4.535191785426922e-05, + "loss": 0.3828, + "step": 5960 + }, + { + "epoch": 0.4430168220134428, + "grad_norm": 0.7177873849868774, + "learning_rate": 4.534318282395429e-05, + "loss": 0.3717, + "step": 5965 + }, + { + "epoch": 0.4433881688885588, + "grad_norm": 0.8855662941932678, + "learning_rate": 4.5334440436443837e-05, + "loss": 0.3969, + "step": 5970 + }, + { + "epoch": 0.44375951576367484, + "grad_norm": 0.7701443433761597, + "learning_rate": 4.532569069489958e-05, + "loss": 0.4083, + "step": 5975 + }, + { + "epoch": 0.44413086263879087, + "grad_norm": 1.0092111825942993, + "learning_rate": 4.53169336024859e-05, + "loss": 0.3955, + "step": 5980 + }, + { + "epoch": 0.44450220951390695, + "grad_norm": 1.2627179622650146, + "learning_rate": 4.530816916236984e-05, + "loss": 0.4155, + "step": 5985 + }, + { + "epoch": 0.444873556389023, + "grad_norm": 1.103607177734375, + "learning_rate": 4.529939737772109e-05, + "loss": 0.3959, + "step": 5990 + }, + { + "epoch": 0.445244903264139, + "grad_norm": 0.8310053944587708, + "learning_rate": 4.5290618251711994e-05, + "loss": 0.4192, + "step": 5995 + }, + { + "epoch": 0.4456162501392551, + 
"grad_norm": 0.9397602677345276, + "learning_rate": 4.528183178751758e-05, + "loss": 0.3736, + "step": 6000 + }, + { + "epoch": 0.44598759701437113, + "grad_norm": 0.8605632185935974, + "learning_rate": 4.5273037988315495e-05, + "loss": 0.3898, + "step": 6005 + }, + { + "epoch": 0.44635894388948716, + "grad_norm": 0.933535635471344, + "learning_rate": 4.5264236857286055e-05, + "loss": 0.4008, + "step": 6010 + }, + { + "epoch": 0.4467302907646032, + "grad_norm": 0.8004394769668579, + "learning_rate": 4.525542839761222e-05, + "loss": 0.4177, + "step": 6015 + }, + { + "epoch": 0.4471016376397193, + "grad_norm": 1.1060221195220947, + "learning_rate": 4.5246612612479625e-05, + "loss": 0.4068, + "step": 6020 + }, + { + "epoch": 0.4474729845148353, + "grad_norm": 0.959685742855072, + "learning_rate": 4.52377895050765e-05, + "loss": 0.4194, + "step": 6025 + }, + { + "epoch": 0.44784433138995133, + "grad_norm": 0.8318073749542236, + "learning_rate": 4.522895907859379e-05, + "loss": 0.3829, + "step": 6030 + }, + { + "epoch": 0.4482156782650674, + "grad_norm": 0.7173587679862976, + "learning_rate": 4.522012133622504e-05, + "loss": 0.3829, + "step": 6035 + }, + { + "epoch": 0.44858702514018345, + "grad_norm": 0.8138423562049866, + "learning_rate": 4.521127628116646e-05, + "loss": 0.3977, + "step": 6040 + }, + { + "epoch": 0.4489583720152995, + "grad_norm": 0.7958429455757141, + "learning_rate": 4.52024239166169e-05, + "loss": 0.3835, + "step": 6045 + }, + { + "epoch": 0.44932971889041556, + "grad_norm": 0.779323399066925, + "learning_rate": 4.519356424577786e-05, + "loss": 0.3854, + "step": 6050 + }, + { + "epoch": 0.4497010657655316, + "grad_norm": 0.73027503490448, + "learning_rate": 4.518469727185346e-05, + "loss": 0.3968, + "step": 6055 + }, + { + "epoch": 0.4500724126406476, + "grad_norm": 0.8514652252197266, + "learning_rate": 4.5175822998050494e-05, + "loss": 0.3736, + "step": 6060 + }, + { + "epoch": 0.45044375951576365, + "grad_norm": 0.7881259322166443, + "learning_rate": 4.516694142757837e-05, + "loss": 0.4076, + "step": 6065 + }, + { + "epoch": 0.45081510639087974, + "grad_norm": 0.7900510430335999, + "learning_rate": 4.515805256364914e-05, + "loss": 0.3974, + "step": 6070 + }, + { + "epoch": 0.45118645326599577, + "grad_norm": 0.983288586139679, + "learning_rate": 4.5149156409477524e-05, + "loss": 0.3985, + "step": 6075 + }, + { + "epoch": 0.4515578001411118, + "grad_norm": 1.7363369464874268, + "learning_rate": 4.514025296828082e-05, + "loss": 0.3949, + "step": 6080 + }, + { + "epoch": 0.4519291470162279, + "grad_norm": 0.9478625655174255, + "learning_rate": 4.513134224327901e-05, + "loss": 0.4051, + "step": 6085 + }, + { + "epoch": 0.4523004938913439, + "grad_norm": 0.7871227264404297, + "learning_rate": 4.5122424237694697e-05, + "loss": 0.4028, + "step": 6090 + }, + { + "epoch": 0.45267184076645994, + "grad_norm": 1.424820899963379, + "learning_rate": 4.5113498954753104e-05, + "loss": 0.4083, + "step": 6095 + }, + { + "epoch": 0.453043187641576, + "grad_norm": 1.3722267150878906, + "learning_rate": 4.51045663976821e-05, + "loss": 0.4084, + "step": 6100 + }, + { + "epoch": 0.45341453451669206, + "grad_norm": 0.8124660849571228, + "learning_rate": 4.509562656971219e-05, + "loss": 0.3827, + "step": 6105 + }, + { + "epoch": 0.4537858813918081, + "grad_norm": 0.9037453532218933, + "learning_rate": 4.5086679474076486e-05, + "loss": 0.3787, + "step": 6110 + }, + { + "epoch": 0.4541572282669241, + "grad_norm": 0.808563768863678, + "learning_rate": 4.5077725114010755e-05, + "loss": 0.3986, + 
"step": 6115 + }, + { + "epoch": 0.4545285751420402, + "grad_norm": 0.8395090103149414, + "learning_rate": 4.506876349275337e-05, + "loss": 0.3942, + "step": 6120 + }, + { + "epoch": 0.45489992201715623, + "grad_norm": 0.9864828586578369, + "learning_rate": 4.5059794613545336e-05, + "loss": 0.3934, + "step": 6125 + }, + { + "epoch": 0.45527126889227226, + "grad_norm": 0.7316352128982544, + "learning_rate": 4.505081847963029e-05, + "loss": 0.3583, + "step": 6130 + }, + { + "epoch": 0.4556426157673883, + "grad_norm": 0.8625379800796509, + "learning_rate": 4.504183509425448e-05, + "loss": 0.3955, + "step": 6135 + }, + { + "epoch": 0.4560139626425044, + "grad_norm": 1.1295115947723389, + "learning_rate": 4.50328444606668e-05, + "loss": 0.3918, + "step": 6140 + }, + { + "epoch": 0.4563853095176204, + "grad_norm": 0.8367231488227844, + "learning_rate": 4.502384658211873e-05, + "loss": 0.4002, + "step": 6145 + }, + { + "epoch": 0.45675665639273644, + "grad_norm": 2.5678558349609375, + "learning_rate": 4.5014841461864407e-05, + "loss": 0.4113, + "step": 6150 + }, + { + "epoch": 0.4571280032678525, + "grad_norm": 1.087945580482483, + "learning_rate": 4.500582910316056e-05, + "loss": 0.4016, + "step": 6155 + }, + { + "epoch": 0.45749935014296855, + "grad_norm": 0.8971551060676575, + "learning_rate": 4.499680950926654e-05, + "loss": 0.4118, + "step": 6160 + }, + { + "epoch": 0.4578706970180846, + "grad_norm": 0.935817539691925, + "learning_rate": 4.4987782683444336e-05, + "loss": 0.3853, + "step": 6165 + }, + { + "epoch": 0.4582420438932006, + "grad_norm": 2.3204643726348877, + "learning_rate": 4.497874862895852e-05, + "loss": 0.3958, + "step": 6170 + }, + { + "epoch": 0.4586133907683167, + "grad_norm": 0.8300501108169556, + "learning_rate": 4.4969707349076314e-05, + "loss": 0.4185, + "step": 6175 + }, + { + "epoch": 0.4589847376434327, + "grad_norm": 0.8631926774978638, + "learning_rate": 4.4960658847067516e-05, + "loss": 0.4038, + "step": 6180 + }, + { + "epoch": 0.45935608451854876, + "grad_norm": 0.9046585559844971, + "learning_rate": 4.4951603126204555e-05, + "loss": 0.3812, + "step": 6185 + }, + { + "epoch": 0.45972743139366484, + "grad_norm": 1.0047301054000854, + "learning_rate": 4.494254018976248e-05, + "loss": 0.4147, + "step": 6190 + }, + { + "epoch": 0.4600987782687809, + "grad_norm": 0.6368455290794373, + "learning_rate": 4.493347004101894e-05, + "loss": 0.398, + "step": 6195 + }, + { + "epoch": 0.4604701251438969, + "grad_norm": 0.8941465616226196, + "learning_rate": 4.4924392683254185e-05, + "loss": 0.4039, + "step": 6200 + }, + { + "epoch": 0.460841472019013, + "grad_norm": 0.8189155459403992, + "learning_rate": 4.4915308119751074e-05, + "loss": 0.4083, + "step": 6205 + }, + { + "epoch": 0.461212818894129, + "grad_norm": 1.35800302028656, + "learning_rate": 4.490621635379509e-05, + "loss": 0.3816, + "step": 6210 + }, + { + "epoch": 0.46158416576924505, + "grad_norm": 1.2649494409561157, + "learning_rate": 4.4897117388674296e-05, + "loss": 0.3959, + "step": 6215 + }, + { + "epoch": 0.4619555126443611, + "grad_norm": 0.909601628780365, + "learning_rate": 4.488801122767937e-05, + "loss": 0.3561, + "step": 6220 + }, + { + "epoch": 0.46232685951947716, + "grad_norm": 0.8168251514434814, + "learning_rate": 4.4878897874103605e-05, + "loss": 0.3912, + "step": 6225 + }, + { + "epoch": 0.4626982063945932, + "grad_norm": 0.7770623564720154, + "learning_rate": 4.4869777331242865e-05, + "loss": 0.4195, + "step": 6230 + }, + { + "epoch": 0.4630695532697092, + "grad_norm": 0.8942532539367676, + 
"learning_rate": 4.4860649602395645e-05, + "loss": 0.4062, + "step": 6235 + }, + { + "epoch": 0.4634409001448253, + "grad_norm": 0.7446513175964355, + "learning_rate": 4.485151469086303e-05, + "loss": 0.3851, + "step": 6240 + }, + { + "epoch": 0.46381224701994134, + "grad_norm": 1.5065016746520996, + "learning_rate": 4.484237259994867e-05, + "loss": 0.3893, + "step": 6245 + }, + { + "epoch": 0.46418359389505737, + "grad_norm": 0.8029677271842957, + "learning_rate": 4.483322333295887e-05, + "loss": 0.4138, + "step": 6250 + }, + { + "epoch": 0.4645549407701734, + "grad_norm": 1.67062246799469, + "learning_rate": 4.4824066893202494e-05, + "loss": 0.3852, + "step": 6255 + }, + { + "epoch": 0.4649262876452895, + "grad_norm": 0.7975144386291504, + "learning_rate": 4.4814903283990996e-05, + "loss": 0.374, + "step": 6260 + }, + { + "epoch": 0.4652976345204055, + "grad_norm": 1.0474958419799805, + "learning_rate": 4.4805732508638444e-05, + "loss": 0.3746, + "step": 6265 + }, + { + "epoch": 0.46566898139552154, + "grad_norm": 1.08444082736969, + "learning_rate": 4.479655457046147e-05, + "loss": 0.3937, + "step": 6270 + }, + { + "epoch": 0.4660403282706376, + "grad_norm": 0.8867533802986145, + "learning_rate": 4.4787369472779326e-05, + "loss": 0.4087, + "step": 6275 + }, + { + "epoch": 0.46641167514575366, + "grad_norm": 0.954857349395752, + "learning_rate": 4.4778177218913844e-05, + "loss": 0.4139, + "step": 6280 + }, + { + "epoch": 0.4667830220208697, + "grad_norm": 1.2051773071289062, + "learning_rate": 4.476897781218943e-05, + "loss": 0.3953, + "step": 6285 + }, + { + "epoch": 0.4671543688959857, + "grad_norm": 0.8374889492988586, + "learning_rate": 4.475977125593309e-05, + "loss": 0.3984, + "step": 6290 + }, + { + "epoch": 0.4675257157711018, + "grad_norm": 0.756777286529541, + "learning_rate": 4.475055755347443e-05, + "loss": 0.3882, + "step": 6295 + }, + { + "epoch": 0.46789706264621783, + "grad_norm": 0.6392567753791809, + "learning_rate": 4.474133670814559e-05, + "loss": 0.4093, + "step": 6300 + }, + { + "epoch": 0.46826840952133386, + "grad_norm": 2.0606844425201416, + "learning_rate": 4.4732108723281346e-05, + "loss": 0.3742, + "step": 6305 + }, + { + "epoch": 0.46863975639644995, + "grad_norm": 0.7938241958618164, + "learning_rate": 4.472287360221904e-05, + "loss": 0.3869, + "step": 6310 + }, + { + "epoch": 0.469011103271566, + "grad_norm": 1.1123112440109253, + "learning_rate": 4.471363134829858e-05, + "loss": 0.4054, + "step": 6315 + }, + { + "epoch": 0.469382450146682, + "grad_norm": 0.8488892316818237, + "learning_rate": 4.4704381964862474e-05, + "loss": 0.4049, + "step": 6320 + }, + { + "epoch": 0.46975379702179804, + "grad_norm": 0.8527318239212036, + "learning_rate": 4.4695125455255794e-05, + "loss": 0.4112, + "step": 6325 + }, + { + "epoch": 0.4701251438969141, + "grad_norm": 0.9174249172210693, + "learning_rate": 4.46858618228262e-05, + "loss": 0.3904, + "step": 6330 + }, + { + "epoch": 0.47049649077203015, + "grad_norm": 1.0107132196426392, + "learning_rate": 4.4676591070923926e-05, + "loss": 0.389, + "step": 6335 + }, + { + "epoch": 0.4708678376471462, + "grad_norm": 1.0217775106430054, + "learning_rate": 4.466731320290176e-05, + "loss": 0.405, + "step": 6340 + }, + { + "epoch": 0.47123918452226227, + "grad_norm": 1.0906291007995605, + "learning_rate": 4.46580282221151e-05, + "loss": 0.4112, + "step": 6345 + }, + { + "epoch": 0.4716105313973783, + "grad_norm": 0.6523454785346985, + "learning_rate": 4.464873613192189e-05, + "loss": 0.3853, + "step": 6350 + }, + { + "epoch": 
0.4719818782724943, + "grad_norm": 1.7757562398910522, + "learning_rate": 4.4639436935682646e-05, + "loss": 0.3861, + "step": 6355 + }, + { + "epoch": 0.47235322514761036, + "grad_norm": 0.9793681502342224, + "learning_rate": 4.463013063676047e-05, + "loss": 0.4017, + "step": 6360 + }, + { + "epoch": 0.47272457202272644, + "grad_norm": 0.6784321069717407, + "learning_rate": 4.462081723852102e-05, + "loss": 0.3991, + "step": 6365 + }, + { + "epoch": 0.47309591889784247, + "grad_norm": 0.7017379999160767, + "learning_rate": 4.461149674433253e-05, + "loss": 0.4008, + "step": 6370 + }, + { + "epoch": 0.4734672657729585, + "grad_norm": 0.8638973832130432, + "learning_rate": 4.460216915756578e-05, + "loss": 0.4009, + "step": 6375 + }, + { + "epoch": 0.4738386126480746, + "grad_norm": 0.7157192826271057, + "learning_rate": 4.4592834481594146e-05, + "loss": 0.3994, + "step": 6380 + }, + { + "epoch": 0.4742099595231906, + "grad_norm": 0.721689760684967, + "learning_rate": 4.4583492719793544e-05, + "loss": 0.4016, + "step": 6385 + }, + { + "epoch": 0.47458130639830665, + "grad_norm": 0.8904692530632019, + "learning_rate": 4.4574143875542465e-05, + "loss": 0.4127, + "step": 6390 + }, + { + "epoch": 0.47495265327342273, + "grad_norm": 1.060440182685852, + "learning_rate": 4.456478795222195e-05, + "loss": 0.4021, + "step": 6395 + }, + { + "epoch": 0.47532400014853876, + "grad_norm": 0.7294143438339233, + "learning_rate": 4.455542495321561e-05, + "loss": 0.3877, + "step": 6400 + }, + { + "epoch": 0.4756953470236548, + "grad_norm": 0.9109171628952026, + "learning_rate": 4.4546054881909615e-05, + "loss": 0.4181, + "step": 6405 + }, + { + "epoch": 0.4760666938987708, + "grad_norm": 0.8929049372673035, + "learning_rate": 4.453667774169269e-05, + "loss": 0.3796, + "step": 6410 + }, + { + "epoch": 0.4764380407738869, + "grad_norm": 0.9146767258644104, + "learning_rate": 4.452729353595611e-05, + "loss": 0.3882, + "step": 6415 + }, + { + "epoch": 0.47680938764900294, + "grad_norm": 0.8753120303153992, + "learning_rate": 4.4517902268093715e-05, + "loss": 0.3875, + "step": 6420 + }, + { + "epoch": 0.47718073452411897, + "grad_norm": 0.7614051103591919, + "learning_rate": 4.45085039415019e-05, + "loss": 0.4054, + "step": 6425 + }, + { + "epoch": 0.47755208139923505, + "grad_norm": 1.0136183500289917, + "learning_rate": 4.44990985595796e-05, + "loss": 0.3927, + "step": 6430 + }, + { + "epoch": 0.4779234282743511, + "grad_norm": 0.9799590706825256, + "learning_rate": 4.4489686125728316e-05, + "loss": 0.3908, + "step": 6435 + }, + { + "epoch": 0.4782947751494671, + "grad_norm": 0.7973908185958862, + "learning_rate": 4.44802666433521e-05, + "loss": 0.3715, + "step": 6440 + }, + { + "epoch": 0.47866612202458314, + "grad_norm": 0.8001505732536316, + "learning_rate": 4.447084011585752e-05, + "loss": 0.4111, + "step": 6445 + }, + { + "epoch": 0.4790374688996992, + "grad_norm": 0.6913841366767883, + "learning_rate": 4.446140654665375e-05, + "loss": 0.3877, + "step": 6450 + }, + { + "epoch": 0.47940881577481526, + "grad_norm": 1.518284797668457, + "learning_rate": 4.445196593915246e-05, + "loss": 0.4072, + "step": 6455 + }, + { + "epoch": 0.4797801626499313, + "grad_norm": 0.6852445006370544, + "learning_rate": 4.4442518296767896e-05, + "loss": 0.4089, + "step": 6460 + }, + { + "epoch": 0.48015150952504737, + "grad_norm": 0.8990404605865479, + "learning_rate": 4.4433063622916824e-05, + "loss": 0.3974, + "step": 6465 + }, + { + "epoch": 0.4805228564001634, + "grad_norm": 0.9696192741394043, + "learning_rate": 
4.4423601921018574e-05, + "loss": 0.398, + "step": 6470 + }, + { + "epoch": 0.48089420327527943, + "grad_norm": 0.8919362425804138, + "learning_rate": 4.4414133194495e-05, + "loss": 0.3827, + "step": 6475 + }, + { + "epoch": 0.48126555015039546, + "grad_norm": 0.8759976625442505, + "learning_rate": 4.4404657446770524e-05, + "loss": 0.3855, + "step": 6480 + }, + { + "epoch": 0.48163689702551155, + "grad_norm": 0.8538511395454407, + "learning_rate": 4.4395174681272076e-05, + "loss": 0.3788, + "step": 6485 + }, + { + "epoch": 0.4820082439006276, + "grad_norm": 0.8503746390342712, + "learning_rate": 4.438568490142914e-05, + "loss": 0.3966, + "step": 6490 + }, + { + "epoch": 0.4823795907757436, + "grad_norm": 1.078658103942871, + "learning_rate": 4.437618811067373e-05, + "loss": 0.4003, + "step": 6495 + }, + { + "epoch": 0.4827509376508597, + "grad_norm": 0.9684752225875854, + "learning_rate": 4.43666843124404e-05, + "loss": 0.3998, + "step": 6500 + }, + { + "epoch": 0.4831222845259757, + "grad_norm": 3.1359939575195312, + "learning_rate": 4.435717351016624e-05, + "loss": 0.4117, + "step": 6505 + }, + { + "epoch": 0.48349363140109175, + "grad_norm": 0.8860180974006653, + "learning_rate": 4.4347655707290874e-05, + "loss": 0.3987, + "step": 6510 + }, + { + "epoch": 0.4838649782762078, + "grad_norm": 0.9404197931289673, + "learning_rate": 4.4338130907256445e-05, + "loss": 0.3865, + "step": 6515 + }, + { + "epoch": 0.48423632515132387, + "grad_norm": 1.2225645780563354, + "learning_rate": 4.432859911350765e-05, + "loss": 0.3799, + "step": 6520 + }, + { + "epoch": 0.4846076720264399, + "grad_norm": 1.038920521736145, + "learning_rate": 4.431906032949169e-05, + "loss": 0.3978, + "step": 6525 + }, + { + "epoch": 0.4849790189015559, + "grad_norm": 1.3525471687316895, + "learning_rate": 4.4309514558658315e-05, + "loss": 0.4046, + "step": 6530 + }, + { + "epoch": 0.485350365776672, + "grad_norm": 0.8096221685409546, + "learning_rate": 4.4299961804459776e-05, + "loss": 0.4029, + "step": 6535 + }, + { + "epoch": 0.48572171265178804, + "grad_norm": 0.9344236850738525, + "learning_rate": 4.429040207035088e-05, + "loss": 0.41, + "step": 6540 + }, + { + "epoch": 0.48609305952690407, + "grad_norm": 0.9042350053787231, + "learning_rate": 4.428083535978894e-05, + "loss": 0.4128, + "step": 6545 + }, + { + "epoch": 0.4864644064020201, + "grad_norm": 1.1597720384597778, + "learning_rate": 4.42712616762338e-05, + "loss": 0.3787, + "step": 6550 + }, + { + "epoch": 0.4868357532771362, + "grad_norm": 1.200708270072937, + "learning_rate": 4.426168102314781e-05, + "loss": 0.3972, + "step": 6555 + }, + { + "epoch": 0.4872071001522522, + "grad_norm": 1.0330638885498047, + "learning_rate": 4.4252093403995864e-05, + "loss": 0.3953, + "step": 6560 + }, + { + "epoch": 0.48757844702736824, + "grad_norm": 1.0347399711608887, + "learning_rate": 4.424249882224536e-05, + "loss": 0.4076, + "step": 6565 + }, + { + "epoch": 0.48794979390248433, + "grad_norm": 0.6727543473243713, + "learning_rate": 4.423289728136621e-05, + "loss": 0.4036, + "step": 6570 + }, + { + "epoch": 0.48832114077760036, + "grad_norm": 1.5079855918884277, + "learning_rate": 4.422328878483087e-05, + "loss": 0.3964, + "step": 6575 + }, + { + "epoch": 0.4886924876527164, + "grad_norm": 0.678868293762207, + "learning_rate": 4.421367333611428e-05, + "loss": 0.3871, + "step": 6580 + }, + { + "epoch": 0.4890638345278325, + "grad_norm": 1.0084245204925537, + "learning_rate": 4.42040509386939e-05, + "loss": 0.3853, + "step": 6585 + }, + { + "epoch": 0.4894351814029485, + 
"grad_norm": 0.8365580439567566, + "learning_rate": 4.4194421596049715e-05, + "loss": 0.4266, + "step": 6590 + }, + { + "epoch": 0.48980652827806453, + "grad_norm": 1.2573304176330566, + "learning_rate": 4.418478531166423e-05, + "loss": 0.3802, + "step": 6595 + }, + { + "epoch": 0.49017787515318056, + "grad_norm": 0.9323030114173889, + "learning_rate": 4.417514208902242e-05, + "loss": 0.4013, + "step": 6600 + }, + { + "epoch": 0.49054922202829665, + "grad_norm": 0.7535553574562073, + "learning_rate": 4.416549193161182e-05, + "loss": 0.3918, + "step": 6605 + }, + { + "epoch": 0.4909205689034127, + "grad_norm": 0.871611475944519, + "learning_rate": 4.415583484292245e-05, + "loss": 0.3729, + "step": 6610 + }, + { + "epoch": 0.4912919157785287, + "grad_norm": 0.9813038110733032, + "learning_rate": 4.4146170826446806e-05, + "loss": 0.3681, + "step": 6615 + }, + { + "epoch": 0.4916632626536448, + "grad_norm": 0.9408519268035889, + "learning_rate": 4.413649988567995e-05, + "loss": 0.4136, + "step": 6620 + }, + { + "epoch": 0.4920346095287608, + "grad_norm": 0.8955950140953064, + "learning_rate": 4.4126822024119405e-05, + "loss": 0.3629, + "step": 6625 + }, + { + "epoch": 0.49240595640387685, + "grad_norm": 1.0907254219055176, + "learning_rate": 4.4117137245265206e-05, + "loss": 0.401, + "step": 6630 + }, + { + "epoch": 0.4927773032789929, + "grad_norm": 0.7147353291511536, + "learning_rate": 4.41074455526199e-05, + "loss": 0.3912, + "step": 6635 + }, + { + "epoch": 0.49314865015410897, + "grad_norm": 1.297784447669983, + "learning_rate": 4.409774694968853e-05, + "loss": 0.4182, + "step": 6640 + }, + { + "epoch": 0.493519997029225, + "grad_norm": 0.7881303429603577, + "learning_rate": 4.408804143997862e-05, + "loss": 0.3971, + "step": 6645 + }, + { + "epoch": 0.49389134390434103, + "grad_norm": 1.1137521266937256, + "learning_rate": 4.4078329027000226e-05, + "loss": 0.3955, + "step": 6650 + }, + { + "epoch": 0.4942626907794571, + "grad_norm": 0.8524259328842163, + "learning_rate": 4.406860971426587e-05, + "loss": 0.3625, + "step": 6655 + }, + { + "epoch": 0.49463403765457314, + "grad_norm": 0.8187363743782043, + "learning_rate": 4.4058883505290586e-05, + "loss": 0.4043, + "step": 6660 + }, + { + "epoch": 0.4950053845296892, + "grad_norm": 1.0168330669403076, + "learning_rate": 4.40491504035919e-05, + "loss": 0.3851, + "step": 6665 + }, + { + "epoch": 0.4953767314048052, + "grad_norm": 0.7828294038772583, + "learning_rate": 4.403941041268982e-05, + "loss": 0.4131, + "step": 6670 + }, + { + "epoch": 0.4957480782799213, + "grad_norm": 0.966033399105072, + "learning_rate": 4.402966353610686e-05, + "loss": 0.3716, + "step": 6675 + }, + { + "epoch": 0.4961194251550373, + "grad_norm": 2.529012680053711, + "learning_rate": 4.401990977736802e-05, + "loss": 0.3974, + "step": 6680 + }, + { + "epoch": 0.49649077203015335, + "grad_norm": 0.6573621034622192, + "learning_rate": 4.401014914000078e-05, + "loss": 0.3743, + "step": 6685 + }, + { + "epoch": 0.49686211890526943, + "grad_norm": 0.9817414879798889, + "learning_rate": 4.4000381627535114e-05, + "loss": 0.3809, + "step": 6690 + }, + { + "epoch": 0.49723346578038546, + "grad_norm": 0.7202504873275757, + "learning_rate": 4.399060724350349e-05, + "loss": 0.4057, + "step": 6695 + }, + { + "epoch": 0.4976048126555015, + "grad_norm": 0.9702993035316467, + "learning_rate": 4.398082599144085e-05, + "loss": 0.3853, + "step": 6700 + }, + { + "epoch": 0.4979761595306175, + "grad_norm": 0.7524059414863586, + "learning_rate": 4.397103787488462e-05, + "loss": 0.3919, + 
"step": 6705 + }, + { + "epoch": 0.4983475064057336, + "grad_norm": 0.7320048213005066, + "learning_rate": 4.396124289737472e-05, + "loss": 0.3848, + "step": 6710 + }, + { + "epoch": 0.49871885328084964, + "grad_norm": 2.150871753692627, + "learning_rate": 4.395144106245355e-05, + "loss": 0.3875, + "step": 6715 + }, + { + "epoch": 0.49909020015596567, + "grad_norm": 0.8532472252845764, + "learning_rate": 4.394163237366596e-05, + "loss": 0.4054, + "step": 6720 + }, + { + "epoch": 0.49946154703108175, + "grad_norm": 0.9899107813835144, + "learning_rate": 4.393181683455933e-05, + "loss": 0.3891, + "step": 6725 + }, + { + "epoch": 0.4998328939061978, + "grad_norm": 1.153525471687317, + "learning_rate": 4.392199444868347e-05, + "loss": 0.4071, + "step": 6730 + }, + { + "epoch": 0.5002042407813139, + "grad_norm": 0.8968061208724976, + "learning_rate": 4.3912165219590703e-05, + "loss": 0.3756, + "step": 6735 + }, + { + "epoch": 0.5005755876564298, + "grad_norm": 0.8012394905090332, + "learning_rate": 4.390232915083579e-05, + "loss": 0.4128, + "step": 6740 + }, + { + "epoch": 0.5009469345315459, + "grad_norm": 1.1018304824829102, + "learning_rate": 4.389248624597599e-05, + "loss": 0.3938, + "step": 6745 + }, + { + "epoch": 0.501318281406662, + "grad_norm": 0.9241147637367249, + "learning_rate": 4.3882636508571054e-05, + "loss": 0.3906, + "step": 6750 + }, + { + "epoch": 0.501689628281778, + "grad_norm": 0.9045403599739075, + "learning_rate": 4.3872779942183154e-05, + "loss": 0.3939, + "step": 6755 + }, + { + "epoch": 0.5020609751568941, + "grad_norm": 0.6724326014518738, + "learning_rate": 4.3862916550376964e-05, + "loss": 0.4002, + "step": 6760 + }, + { + "epoch": 0.50243232203201, + "grad_norm": 0.8588703870773315, + "learning_rate": 4.385304633671962e-05, + "loss": 0.3885, + "step": 6765 + }, + { + "epoch": 0.5028036689071261, + "grad_norm": 0.9330534934997559, + "learning_rate": 4.3843169304780726e-05, + "loss": 0.3891, + "step": 6770 + }, + { + "epoch": 0.5031750157822422, + "grad_norm": 1.3435461521148682, + "learning_rate": 4.3833285458132364e-05, + "loss": 0.3868, + "step": 6775 + }, + { + "epoch": 0.5035463626573582, + "grad_norm": 0.771759033203125, + "learning_rate": 4.3823394800349046e-05, + "loss": 0.3999, + "step": 6780 + }, + { + "epoch": 0.5039177095324743, + "grad_norm": 0.7695342302322388, + "learning_rate": 4.381349733500778e-05, + "loss": 0.3824, + "step": 6785 + }, + { + "epoch": 0.5042890564075904, + "grad_norm": 0.8571486473083496, + "learning_rate": 4.380359306568802e-05, + "loss": 0.3814, + "step": 6790 + }, + { + "epoch": 0.5046604032827063, + "grad_norm": 0.8740094304084778, + "learning_rate": 4.379368199597169e-05, + "loss": 0.3879, + "step": 6795 + }, + { + "epoch": 0.5050317501578224, + "grad_norm": 0.8141562342643738, + "learning_rate": 4.378376412944317e-05, + "loss": 0.378, + "step": 6800 + }, + { + "epoch": 0.5054030970329385, + "grad_norm": 1.1338756084442139, + "learning_rate": 4.37738394696893e-05, + "loss": 0.4075, + "step": 6805 + }, + { + "epoch": 0.5057744439080545, + "grad_norm": 1.0660085678100586, + "learning_rate": 4.376390802029937e-05, + "loss": 0.3672, + "step": 6810 + }, + { + "epoch": 0.5061457907831706, + "grad_norm": 1.1899162530899048, + "learning_rate": 4.3753969784865123e-05, + "loss": 0.3949, + "step": 6815 + }, + { + "epoch": 0.5065171376582867, + "grad_norm": 0.7957924008369446, + "learning_rate": 4.374402476698077e-05, + "loss": 0.3909, + "step": 6820 + }, + { + "epoch": 0.5068884845334026, + "grad_norm": 0.8344728350639343, + 
"learning_rate": 4.3734072970242965e-05, + "loss": 0.3837, + "step": 6825 + }, + { + "epoch": 0.5072598314085187, + "grad_norm": 0.9248842000961304, + "learning_rate": 4.372411439825082e-05, + "loss": 0.3793, + "step": 6830 + }, + { + "epoch": 0.5076311782836348, + "grad_norm": 0.811426043510437, + "learning_rate": 4.371414905460589e-05, + "loss": 0.4047, + "step": 6835 + }, + { + "epoch": 0.5080025251587508, + "grad_norm": 2.3013105392456055, + "learning_rate": 4.370417694291218e-05, + "loss": 0.38, + "step": 6840 + }, + { + "epoch": 0.5083738720338669, + "grad_norm": 0.911200225353241, + "learning_rate": 4.369419806677615e-05, + "loss": 0.4152, + "step": 6845 + }, + { + "epoch": 0.5087452189089828, + "grad_norm": 1.1220375299453735, + "learning_rate": 4.368421242980669e-05, + "loss": 0.3996, + "step": 6850 + }, + { + "epoch": 0.5091165657840989, + "grad_norm": 0.8643706440925598, + "learning_rate": 4.3674220035615165e-05, + "loss": 0.3748, + "step": 6855 + }, + { + "epoch": 0.509487912659215, + "grad_norm": 0.8339560627937317, + "learning_rate": 4.3664220887815364e-05, + "loss": 0.3778, + "step": 6860 + }, + { + "epoch": 0.509859259534331, + "grad_norm": 0.9615778923034668, + "learning_rate": 4.3654214990023504e-05, + "loss": 0.3862, + "step": 6865 + }, + { + "epoch": 0.5102306064094471, + "grad_norm": 1.6474332809448242, + "learning_rate": 4.364420234585827e-05, + "loss": 0.4069, + "step": 6870 + }, + { + "epoch": 0.5106019532845631, + "grad_norm": 0.6933843493461609, + "learning_rate": 4.3634182958940766e-05, + "loss": 0.3902, + "step": 6875 + }, + { + "epoch": 0.5109733001596791, + "grad_norm": 0.7939101457595825, + "learning_rate": 4.362415683289456e-05, + "loss": 0.3915, + "step": 6880 + }, + { + "epoch": 0.5113446470347952, + "grad_norm": 0.9869307279586792, + "learning_rate": 4.361412397134562e-05, + "loss": 0.3807, + "step": 6885 + }, + { + "epoch": 0.5117159939099113, + "grad_norm": 0.9772399663925171, + "learning_rate": 4.360408437792239e-05, + "loss": 0.3938, + "step": 6890 + }, + { + "epoch": 0.5120873407850273, + "grad_norm": 0.8522541522979736, + "learning_rate": 4.3594038056255734e-05, + "loss": 0.3959, + "step": 6895 + }, + { + "epoch": 0.5124586876601434, + "grad_norm": 0.6819554567337036, + "learning_rate": 4.358398500997893e-05, + "loss": 0.3797, + "step": 6900 + }, + { + "epoch": 0.5128300345352594, + "grad_norm": 0.656989574432373, + "learning_rate": 4.357392524272771e-05, + "loss": 0.3808, + "step": 6905 + }, + { + "epoch": 0.5132013814103754, + "grad_norm": 1.0240429639816284, + "learning_rate": 4.356385875814023e-05, + "loss": 0.391, + "step": 6910 + }, + { + "epoch": 0.5135727282854915, + "grad_norm": 0.9838010668754578, + "learning_rate": 4.3553785559857074e-05, + "loss": 0.3858, + "step": 6915 + }, + { + "epoch": 0.5139440751606075, + "grad_norm": 0.9715674519538879, + "learning_rate": 4.354370565152126e-05, + "loss": 0.3989, + "step": 6920 + }, + { + "epoch": 0.5143154220357236, + "grad_norm": 0.8658883571624756, + "learning_rate": 4.3533619036778216e-05, + "loss": 0.3905, + "step": 6925 + }, + { + "epoch": 0.5146867689108396, + "grad_norm": 1.0177209377288818, + "learning_rate": 4.352352571927583e-05, + "loss": 0.3768, + "step": 6930 + }, + { + "epoch": 0.5150581157859556, + "grad_norm": 0.9606013298034668, + "learning_rate": 4.3513425702664365e-05, + "loss": 0.3922, + "step": 6935 + }, + { + "epoch": 0.5154294626610717, + "grad_norm": 1.0270155668258667, + "learning_rate": 4.350331899059655e-05, + "loss": 0.3806, + "step": 6940 + }, + { + "epoch": 
0.5158008095361878, + "grad_norm": 0.8382008671760559, + "learning_rate": 4.349320558672752e-05, + "loss": 0.3759, + "step": 6945 + }, + { + "epoch": 0.5161721564113038, + "grad_norm": 0.8334744572639465, + "learning_rate": 4.348308549471481e-05, + "loss": 0.3932, + "step": 6950 + }, + { + "epoch": 0.5165435032864198, + "grad_norm": 0.9420961737632751, + "learning_rate": 4.347295871821843e-05, + "loss": 0.3849, + "step": 6955 + }, + { + "epoch": 0.5169148501615359, + "grad_norm": 1.008453130722046, + "learning_rate": 4.346282526090073e-05, + "loss": 0.3829, + "step": 6960 + }, + { + "epoch": 0.5172861970366519, + "grad_norm": 0.9556795358657837, + "learning_rate": 4.345268512642654e-05, + "loss": 0.4101, + "step": 6965 + }, + { + "epoch": 0.517657543911768, + "grad_norm": 0.8188214302062988, + "learning_rate": 4.344253831846308e-05, + "loss": 0.3921, + "step": 6970 + }, + { + "epoch": 0.5180288907868841, + "grad_norm": 0.8770802617073059, + "learning_rate": 4.3432384840679974e-05, + "loss": 0.3884, + "step": 6975 + }, + { + "epoch": 0.518400237662, + "grad_norm": 0.8020485639572144, + "learning_rate": 4.342222469674927e-05, + "loss": 0.3975, + "step": 6980 + }, + { + "epoch": 0.5187715845371161, + "grad_norm": 0.8058055639266968, + "learning_rate": 4.3412057890345435e-05, + "loss": 0.3952, + "step": 6985 + }, + { + "epoch": 0.5191429314122321, + "grad_norm": 1.0167996883392334, + "learning_rate": 4.3401884425145336e-05, + "loss": 0.3838, + "step": 6990 + }, + { + "epoch": 0.5195142782873482, + "grad_norm": 0.8972017168998718, + "learning_rate": 4.339170430482824e-05, + "loss": 0.3993, + "step": 6995 + }, + { + "epoch": 0.5198856251624643, + "grad_norm": 0.9788426756858826, + "learning_rate": 4.338151753307583e-05, + "loss": 0.3954, + "step": 7000 + }, + { + "epoch": 0.5202569720375803, + "grad_norm": 0.8452064990997314, + "learning_rate": 4.3371324113572205e-05, + "loss": 0.3961, + "step": 7005 + }, + { + "epoch": 0.5206283189126963, + "grad_norm": 0.7179628014564514, + "learning_rate": 4.3361124050003844e-05, + "loss": 0.394, + "step": 7010 + }, + { + "epoch": 0.5209996657878124, + "grad_norm": 0.9873393177986145, + "learning_rate": 4.335091734605965e-05, + "loss": 0.4077, + "step": 7015 + }, + { + "epoch": 0.5213710126629284, + "grad_norm": 0.9680303931236267, + "learning_rate": 4.334070400543091e-05, + "loss": 0.4056, + "step": 7020 + }, + { + "epoch": 0.5217423595380445, + "grad_norm": 1.9106343984603882, + "learning_rate": 4.3330484031811336e-05, + "loss": 0.3935, + "step": 7025 + }, + { + "epoch": 0.5221137064131606, + "grad_norm": 0.9799425601959229, + "learning_rate": 4.332025742889701e-05, + "loss": 0.3763, + "step": 7030 + }, + { + "epoch": 0.5224850532882765, + "grad_norm": 0.9366163611412048, + "learning_rate": 4.331002420038642e-05, + "loss": 0.3861, + "step": 7035 + }, + { + "epoch": 0.5228564001633926, + "grad_norm": 0.8720141053199768, + "learning_rate": 4.329978434998048e-05, + "loss": 0.3852, + "step": 7040 + }, + { + "epoch": 0.5232277470385087, + "grad_norm": 0.8284469842910767, + "learning_rate": 4.3289537881382445e-05, + "loss": 0.3941, + "step": 7045 + }, + { + "epoch": 0.5235990939136247, + "grad_norm": 0.9448261857032776, + "learning_rate": 4.327928479829801e-05, + "loss": 0.3827, + "step": 7050 + }, + { + "epoch": 0.5239704407887408, + "grad_norm": 0.9553066492080688, + "learning_rate": 4.326902510443524e-05, + "loss": 0.3927, + "step": 7055 + }, + { + "epoch": 0.5243417876638569, + "grad_norm": 0.9436995983123779, + "learning_rate": 4.325875880350461e-05, + 
"loss": 0.4134, + "step": 7060 + }, + { + "epoch": 0.5247131345389728, + "grad_norm": 2.108335494995117, + "learning_rate": 4.3248485899218935e-05, + "loss": 0.3881, + "step": 7065 + }, + { + "epoch": 0.5250844814140889, + "grad_norm": 1.2432762384414673, + "learning_rate": 4.323820639529348e-05, + "loss": 0.3905, + "step": 7070 + }, + { + "epoch": 0.5254558282892049, + "grad_norm": 0.7592414021492004, + "learning_rate": 4.3227920295445866e-05, + "loss": 0.3824, + "step": 7075 + }, + { + "epoch": 0.525827175164321, + "grad_norm": 1.110966682434082, + "learning_rate": 4.32176276033961e-05, + "loss": 0.3664, + "step": 7080 + }, + { + "epoch": 0.5261985220394371, + "grad_norm": 0.959549069404602, + "learning_rate": 4.320732832286657e-05, + "loss": 0.3825, + "step": 7085 + }, + { + "epoch": 0.526569868914553, + "grad_norm": 0.8384568691253662, + "learning_rate": 4.3197022457582065e-05, + "loss": 0.4071, + "step": 7090 + }, + { + "epoch": 0.5269412157896691, + "grad_norm": 0.8591688275337219, + "learning_rate": 4.318671001126974e-05, + "loss": 0.3991, + "step": 7095 + }, + { + "epoch": 0.5273125626647852, + "grad_norm": 0.9322898983955383, + "learning_rate": 4.3176390987659134e-05, + "loss": 0.3788, + "step": 7100 + }, + { + "epoch": 0.5276839095399012, + "grad_norm": 0.8797940611839294, + "learning_rate": 4.3166065390482146e-05, + "loss": 0.4064, + "step": 7105 + }, + { + "epoch": 0.5280552564150173, + "grad_norm": 0.886881411075592, + "learning_rate": 4.3155733223473103e-05, + "loss": 0.3957, + "step": 7110 + }, + { + "epoch": 0.5284266032901334, + "grad_norm": 0.8280029296875, + "learning_rate": 4.314539449036865e-05, + "loss": 0.4041, + "step": 7115 + }, + { + "epoch": 0.5287979501652493, + "grad_norm": 0.8642339706420898, + "learning_rate": 4.313504919490785e-05, + "loss": 0.3824, + "step": 7120 + }, + { + "epoch": 0.5291692970403654, + "grad_norm": 0.8907843232154846, + "learning_rate": 4.31246973408321e-05, + "loss": 0.3912, + "step": 7125 + }, + { + "epoch": 0.5295406439154815, + "grad_norm": 0.8841026425361633, + "learning_rate": 4.311433893188521e-05, + "loss": 0.3776, + "step": 7130 + }, + { + "epoch": 0.5299119907905975, + "grad_norm": 0.8547298312187195, + "learning_rate": 4.310397397181334e-05, + "loss": 0.3878, + "step": 7135 + }, + { + "epoch": 0.5302833376657136, + "grad_norm": 1.2953871488571167, + "learning_rate": 4.3093602464365e-05, + "loss": 0.3881, + "step": 7140 + }, + { + "epoch": 0.5306546845408295, + "grad_norm": 0.9269512295722961, + "learning_rate": 4.308322441329112e-05, + "loss": 0.3754, + "step": 7145 + }, + { + "epoch": 0.5310260314159456, + "grad_norm": 1.1527509689331055, + "learning_rate": 4.307283982234494e-05, + "loss": 0.3692, + "step": 7150 + }, + { + "epoch": 0.5313973782910617, + "grad_norm": 0.9051923751831055, + "learning_rate": 4.306244869528209e-05, + "loss": 0.4169, + "step": 7155 + }, + { + "epoch": 0.5317687251661777, + "grad_norm": 0.8554239869117737, + "learning_rate": 4.305205103586058e-05, + "loss": 0.3821, + "step": 7160 + }, + { + "epoch": 0.5321400720412938, + "grad_norm": 1.063449501991272, + "learning_rate": 4.304164684784076e-05, + "loss": 0.3977, + "step": 7165 + }, + { + "epoch": 0.5325114189164099, + "grad_norm": 0.9604543447494507, + "learning_rate": 4.303123613498534e-05, + "loss": 0.3864, + "step": 7170 + }, + { + "epoch": 0.5328827657915258, + "grad_norm": 1.005906581878662, + "learning_rate": 4.30208189010594e-05, + "loss": 0.3756, + "step": 7175 + }, + { + "epoch": 0.5332541126666419, + "grad_norm": 0.8495503067970276, + 
"learning_rate": 4.301039514983038e-05, + "loss": 0.3763, + "step": 7180 + }, + { + "epoch": 0.533625459541758, + "grad_norm": 0.8146620988845825, + "learning_rate": 4.2999964885068064e-05, + "loss": 0.3729, + "step": 7185 + }, + { + "epoch": 0.533996806416874, + "grad_norm": 0.6939537525177002, + "learning_rate": 4.298952811054462e-05, + "loss": 0.38, + "step": 7190 + }, + { + "epoch": 0.5343681532919901, + "grad_norm": 0.955191433429718, + "learning_rate": 4.297908483003452e-05, + "loss": 0.3648, + "step": 7195 + }, + { + "epoch": 0.5347395001671061, + "grad_norm": 0.9314118027687073, + "learning_rate": 4.296863504731464e-05, + "loss": 0.3953, + "step": 7200 + }, + { + "epoch": 0.5351108470422221, + "grad_norm": 1.5681297779083252, + "learning_rate": 4.295817876616419e-05, + "loss": 0.4045, + "step": 7205 + }, + { + "epoch": 0.5354821939173382, + "grad_norm": 0.8491237163543701, + "learning_rate": 4.2947715990364715e-05, + "loss": 0.4016, + "step": 7210 + }, + { + "epoch": 0.5358535407924543, + "grad_norm": 0.940566897392273, + "learning_rate": 4.293724672370012e-05, + "loss": 0.3824, + "step": 7215 + }, + { + "epoch": 0.5362248876675703, + "grad_norm": 1.5019309520721436, + "learning_rate": 4.292677096995667e-05, + "loss": 0.3823, + "step": 7220 + }, + { + "epoch": 0.5365962345426863, + "grad_norm": 0.7174615263938904, + "learning_rate": 4.291628873292294e-05, + "loss": 0.3854, + "step": 7225 + }, + { + "epoch": 0.5369675814178023, + "grad_norm": 0.901176929473877, + "learning_rate": 4.290580001638991e-05, + "loss": 0.3872, + "step": 7230 + }, + { + "epoch": 0.5373389282929184, + "grad_norm": 0.8742217421531677, + "learning_rate": 4.289530482415084e-05, + "loss": 0.3791, + "step": 7235 + }, + { + "epoch": 0.5377102751680345, + "grad_norm": 0.939732551574707, + "learning_rate": 4.288480316000137e-05, + "loss": 0.3676, + "step": 7240 + }, + { + "epoch": 0.5380816220431505, + "grad_norm": 0.8793368339538574, + "learning_rate": 4.2874295027739465e-05, + "loss": 0.3813, + "step": 7245 + }, + { + "epoch": 0.5384529689182666, + "grad_norm": 1.093706488609314, + "learning_rate": 4.2863780431165445e-05, + "loss": 0.3734, + "step": 7250 + }, + { + "epoch": 0.5388243157933826, + "grad_norm": 0.855695903301239, + "learning_rate": 4.285325937408194e-05, + "loss": 0.3642, + "step": 7255 + }, + { + "epoch": 0.5391956626684986, + "grad_norm": 0.9116891026496887, + "learning_rate": 4.2842731860293946e-05, + "loss": 0.3662, + "step": 7260 + }, + { + "epoch": 0.5395670095436147, + "grad_norm": 0.9691513776779175, + "learning_rate": 4.283219789360878e-05, + "loss": 0.3793, + "step": 7265 + }, + { + "epoch": 0.5399383564187308, + "grad_norm": 0.7580130696296692, + "learning_rate": 4.2821657477836086e-05, + "loss": 0.3971, + "step": 7270 + }, + { + "epoch": 0.5403097032938468, + "grad_norm": 0.8524672389030457, + "learning_rate": 4.2811110616787866e-05, + "loss": 0.3985, + "step": 7275 + }, + { + "epoch": 0.5406810501689628, + "grad_norm": 0.7902607917785645, + "learning_rate": 4.280055731427841e-05, + "loss": 0.3844, + "step": 7280 + }, + { + "epoch": 0.5410523970440789, + "grad_norm": 0.7604623436927795, + "learning_rate": 4.278999757412439e-05, + "loss": 0.3699, + "step": 7285 + }, + { + "epoch": 0.5414237439191949, + "grad_norm": 0.951298713684082, + "learning_rate": 4.2779431400144765e-05, + "loss": 0.4153, + "step": 7290 + }, + { + "epoch": 0.541795090794311, + "grad_norm": 1.0854779481887817, + "learning_rate": 4.2768858796160834e-05, + "loss": 0.4008, + "step": 7295 + }, + { + "epoch": 
0.542166437669427, + "grad_norm": 0.7753530740737915, + "learning_rate": 4.2758279765996224e-05, + "loss": 0.3824, + "step": 7300 + }, + { + "epoch": 0.542537784544543, + "grad_norm": 0.8362932801246643, + "learning_rate": 4.2747694313476886e-05, + "loss": 0.3856, + "step": 7305 + }, + { + "epoch": 0.5429091314196591, + "grad_norm": 1.1193138360977173, + "learning_rate": 4.27371024424311e-05, + "loss": 0.3749, + "step": 7310 + }, + { + "epoch": 0.5432804782947751, + "grad_norm": 0.9703423380851746, + "learning_rate": 4.272650415668944e-05, + "loss": 0.3666, + "step": 7315 + }, + { + "epoch": 0.5436518251698912, + "grad_norm": 0.6996259093284607, + "learning_rate": 4.2715899460084844e-05, + "loss": 0.3828, + "step": 7320 + }, + { + "epoch": 0.5440231720450073, + "grad_norm": 1.0184882879257202, + "learning_rate": 4.2705288356452525e-05, + "loss": 0.376, + "step": 7325 + }, + { + "epoch": 0.5443945189201232, + "grad_norm": 1.1657757759094238, + "learning_rate": 4.269467084963004e-05, + "loss": 0.3993, + "step": 7330 + }, + { + "epoch": 0.5447658657952393, + "grad_norm": 0.9135143160820007, + "learning_rate": 4.268404694345726e-05, + "loss": 0.3784, + "step": 7335 + }, + { + "epoch": 0.5451372126703554, + "grad_norm": 0.8046631217002869, + "learning_rate": 4.2673416641776346e-05, + "loss": 0.3873, + "step": 7340 + }, + { + "epoch": 0.5455085595454714, + "grad_norm": 1.1384739875793457, + "learning_rate": 4.266277994843181e-05, + "loss": 0.3937, + "step": 7345 + }, + { + "epoch": 0.5458799064205875, + "grad_norm": 0.7930871248245239, + "learning_rate": 4.2652136867270444e-05, + "loss": 0.3867, + "step": 7350 + }, + { + "epoch": 0.5462512532957036, + "grad_norm": 0.9256702661514282, + "learning_rate": 4.264148740214136e-05, + "loss": 0.3996, + "step": 7355 + }, + { + "epoch": 0.5466226001708195, + "grad_norm": 1.0180842876434326, + "learning_rate": 4.2630831556896e-05, + "loss": 0.3955, + "step": 7360 + }, + { + "epoch": 0.5469939470459356, + "grad_norm": 1.0560667514801025, + "learning_rate": 4.262016933538806e-05, + "loss": 0.3968, + "step": 7365 + }, + { + "epoch": 0.5473652939210516, + "grad_norm": 0.8952111005783081, + "learning_rate": 4.260950074147361e-05, + "loss": 0.3786, + "step": 7370 + }, + { + "epoch": 0.5477366407961677, + "grad_norm": 0.8378480672836304, + "learning_rate": 4.2598825779010975e-05, + "loss": 0.3595, + "step": 7375 + }, + { + "epoch": 0.5481079876712838, + "grad_norm": 0.9558596014976501, + "learning_rate": 4.25881444518608e-05, + "loss": 0.3788, + "step": 7380 + }, + { + "epoch": 0.5484793345463997, + "grad_norm": 1.0218459367752075, + "learning_rate": 4.257745676388603e-05, + "loss": 0.3883, + "step": 7385 + }, + { + "epoch": 0.5488506814215158, + "grad_norm": 0.7036837339401245, + "learning_rate": 4.256676271895191e-05, + "loss": 0.378, + "step": 7390 + }, + { + "epoch": 0.5492220282966319, + "grad_norm": 0.8147867321968079, + "learning_rate": 4.2556062320926e-05, + "loss": 0.3931, + "step": 7395 + }, + { + "epoch": 0.5495933751717479, + "grad_norm": 0.9215070009231567, + "learning_rate": 4.254535557367811e-05, + "loss": 0.3993, + "step": 7400 + }, + { + "epoch": 0.549964722046864, + "grad_norm": 0.9754782319068909, + "learning_rate": 4.2534642481080425e-05, + "loss": 0.3986, + "step": 7405 + }, + { + "epoch": 0.5503360689219801, + "grad_norm": 0.9843300580978394, + "learning_rate": 4.2523923047007344e-05, + "loss": 0.3794, + "step": 7410 + }, + { + "epoch": 0.550707415797096, + "grad_norm": 0.6864088773727417, + "learning_rate": 4.25131972753356e-05, + "loss": 
0.3876, + "step": 7415 + }, + { + "epoch": 0.5510787626722121, + "grad_norm": 0.8335707187652588, + "learning_rate": 4.250246516994422e-05, + "loss": 0.3679, + "step": 7420 + }, + { + "epoch": 0.5514501095473282, + "grad_norm": 0.8459278345108032, + "learning_rate": 4.2491726734714504e-05, + "loss": 0.3871, + "step": 7425 + }, + { + "epoch": 0.5518214564224442, + "grad_norm": 0.9613873362541199, + "learning_rate": 4.2480981973530064e-05, + "loss": 0.3964, + "step": 7430 + }, + { + "epoch": 0.5521928032975603, + "grad_norm": 0.766555666923523, + "learning_rate": 4.247023089027678e-05, + "loss": 0.3756, + "step": 7435 + }, + { + "epoch": 0.5525641501726763, + "grad_norm": 0.7106664180755615, + "learning_rate": 4.245947348884282e-05, + "loss": 0.3887, + "step": 7440 + }, + { + "epoch": 0.5529354970477923, + "grad_norm": 0.6389920711517334, + "learning_rate": 4.2448709773118654e-05, + "loss": 0.3658, + "step": 7445 + }, + { + "epoch": 0.5533068439229084, + "grad_norm": 1.4300941228866577, + "learning_rate": 4.2437939746997024e-05, + "loss": 0.3866, + "step": 7450 + }, + { + "epoch": 0.5536781907980244, + "grad_norm": 0.7473590970039368, + "learning_rate": 4.242716341437294e-05, + "loss": 0.3692, + "step": 7455 + }, + { + "epoch": 0.5540495376731405, + "grad_norm": 0.8290305137634277, + "learning_rate": 4.241638077914372e-05, + "loss": 0.3846, + "step": 7460 + }, + { + "epoch": 0.5544208845482566, + "grad_norm": 1.0392992496490479, + "learning_rate": 4.2405591845208934e-05, + "loss": 0.3814, + "step": 7465 + }, + { + "epoch": 0.5547922314233725, + "grad_norm": 0.7894637584686279, + "learning_rate": 4.239479661647046e-05, + "loss": 0.3628, + "step": 7470 + }, + { + "epoch": 0.5551635782984886, + "grad_norm": 0.9801110029220581, + "learning_rate": 4.238399509683243e-05, + "loss": 0.3801, + "step": 7475 + }, + { + "epoch": 0.5555349251736047, + "grad_norm": 0.751207172870636, + "learning_rate": 4.237318729020127e-05, + "loss": 0.3844, + "step": 7480 + }, + { + "epoch": 0.5559062720487207, + "grad_norm": 1.1230332851409912, + "learning_rate": 4.236237320048564e-05, + "loss": 0.4014, + "step": 7485 + }, + { + "epoch": 0.5562776189238368, + "grad_norm": 1.1641552448272705, + "learning_rate": 4.235155283159652e-05, + "loss": 0.3794, + "step": 7490 + }, + { + "epoch": 0.5566489657989528, + "grad_norm": 1.1932958364486694, + "learning_rate": 4.234072618744714e-05, + "loss": 0.3856, + "step": 7495 + }, + { + "epoch": 0.5570203126740688, + "grad_norm": 1.112725019454956, + "learning_rate": 4.232989327195299e-05, + "loss": 0.3993, + "step": 7500 + }, + { + "epoch": 0.5573916595491849, + "grad_norm": 1.0324194431304932, + "learning_rate": 4.231905408903185e-05, + "loss": 0.3835, + "step": 7505 + }, + { + "epoch": 0.557763006424301, + "grad_norm": 0.749826192855835, + "learning_rate": 4.230820864260375e-05, + "loss": 0.3364, + "step": 7510 + }, + { + "epoch": 0.558134353299417, + "grad_norm": 1.167183756828308, + "learning_rate": 4.229735693659099e-05, + "loss": 0.3906, + "step": 7515 + }, + { + "epoch": 0.558505700174533, + "grad_norm": 0.7182947993278503, + "learning_rate": 4.228649897491814e-05, + "loss": 0.3719, + "step": 7520 + }, + { + "epoch": 0.558877047049649, + "grad_norm": 1.1357501745224, + "learning_rate": 4.2275634761512014e-05, + "loss": 0.3986, + "step": 7525 + }, + { + "epoch": 0.5592483939247651, + "grad_norm": 0.790942370891571, + "learning_rate": 4.2264764300301705e-05, + "loss": 0.3771, + "step": 7530 + }, + { + "epoch": 0.5596197407998812, + "grad_norm": 0.7060715556144714, + 
"learning_rate": 4.2253887595218566e-05, + "loss": 0.3708, + "step": 7535 + }, + { + "epoch": 0.5599910876749972, + "grad_norm": 0.9133544564247131, + "learning_rate": 4.22430046501962e-05, + "loss": 0.3865, + "step": 7540 + }, + { + "epoch": 0.5603624345501133, + "grad_norm": 0.8340880870819092, + "learning_rate": 4.2232115469170465e-05, + "loss": 0.3893, + "step": 7545 + }, + { + "epoch": 0.5607337814252293, + "grad_norm": 0.8766905069351196, + "learning_rate": 4.2221220056079466e-05, + "loss": 0.3837, + "step": 7550 + }, + { + "epoch": 0.5611051283003453, + "grad_norm": 0.7138217091560364, + "learning_rate": 4.22103184148636e-05, + "loss": 0.3927, + "step": 7555 + }, + { + "epoch": 0.5614764751754614, + "grad_norm": 1.0313303470611572, + "learning_rate": 4.219941054946547e-05, + "loss": 0.3939, + "step": 7560 + }, + { + "epoch": 0.5618478220505775, + "grad_norm": 0.6473879218101501, + "learning_rate": 4.218849646382996e-05, + "loss": 0.3788, + "step": 7565 + }, + { + "epoch": 0.5622191689256935, + "grad_norm": 0.748626172542572, + "learning_rate": 4.217757616190419e-05, + "loss": 0.3893, + "step": 7570 + }, + { + "epoch": 0.5625905158008095, + "grad_norm": 0.7278555035591125, + "learning_rate": 4.2166649647637534e-05, + "loss": 0.3706, + "step": 7575 + }, + { + "epoch": 0.5629618626759256, + "grad_norm": 0.8890153169631958, + "learning_rate": 4.215571692498161e-05, + "loss": 0.3737, + "step": 7580 + }, + { + "epoch": 0.5633332095510416, + "grad_norm": 0.92683345079422, + "learning_rate": 4.214477799789028e-05, + "loss": 0.3809, + "step": 7585 + }, + { + "epoch": 0.5637045564261577, + "grad_norm": 0.8576033711433411, + "learning_rate": 4.213383287031966e-05, + "loss": 0.3891, + "step": 7590 + }, + { + "epoch": 0.5640759033012738, + "grad_norm": 0.9028112888336182, + "learning_rate": 4.2122881546228086e-05, + "loss": 0.3653, + "step": 7595 + }, + { + "epoch": 0.5644472501763897, + "grad_norm": 0.6952714920043945, + "learning_rate": 4.211192402957616e-05, + "loss": 0.3993, + "step": 7600 + }, + { + "epoch": 0.5648185970515058, + "grad_norm": 0.8100974559783936, + "learning_rate": 4.210096032432671e-05, + "loss": 0.3873, + "step": 7605 + }, + { + "epoch": 0.5651899439266218, + "grad_norm": 1.0961740016937256, + "learning_rate": 4.20899904344448e-05, + "loss": 0.3911, + "step": 7610 + }, + { + "epoch": 0.5655612908017379, + "grad_norm": 0.9107250571250916, + "learning_rate": 4.207901436389775e-05, + "loss": 0.3912, + "step": 7615 + }, + { + "epoch": 0.565932637676854, + "grad_norm": 0.7377946376800537, + "learning_rate": 4.206803211665509e-05, + "loss": 0.3896, + "step": 7620 + }, + { + "epoch": 0.56630398455197, + "grad_norm": 0.9254267811775208, + "learning_rate": 4.205704369668859e-05, + "loss": 0.3793, + "step": 7625 + }, + { + "epoch": 0.566675331427086, + "grad_norm": 0.8499208092689514, + "learning_rate": 4.204604910797228e-05, + "loss": 0.3603, + "step": 7630 + }, + { + "epoch": 0.5670466783022021, + "grad_norm": 0.9021883606910706, + "learning_rate": 4.203504835448235e-05, + "loss": 0.3699, + "step": 7635 + }, + { + "epoch": 0.5674180251773181, + "grad_norm": 1.3291375637054443, + "learning_rate": 4.2024041440197326e-05, + "loss": 0.4044, + "step": 7640 + }, + { + "epoch": 0.5677893720524342, + "grad_norm": 0.8598208427429199, + "learning_rate": 4.201302836909787e-05, + "loss": 0.3752, + "step": 7645 + }, + { + "epoch": 0.5681607189275503, + "grad_norm": 1.3929486274719238, + "learning_rate": 4.200200914516691e-05, + "loss": 0.4129, + "step": 7650 + }, + { + "epoch": 
0.5685320658026662, + "grad_norm": 0.9411462545394897, + "learning_rate": 4.199098377238959e-05, + "loss": 0.3784, + "step": 7655 + }, + { + "epoch": 0.5689034126777823, + "grad_norm": 0.9103113412857056, + "learning_rate": 4.1979952254753294e-05, + "loss": 0.3861, + "step": 7660 + }, + { + "epoch": 0.5692747595528984, + "grad_norm": 0.8931922316551208, + "learning_rate": 4.19689145962476e-05, + "loss": 0.3887, + "step": 7665 + }, + { + "epoch": 0.5696461064280144, + "grad_norm": 0.7689836621284485, + "learning_rate": 4.1957870800864344e-05, + "loss": 0.3652, + "step": 7670 + }, + { + "epoch": 0.5700174533031305, + "grad_norm": 0.8733877539634705, + "learning_rate": 4.194682087259754e-05, + "loss": 0.3929, + "step": 7675 + }, + { + "epoch": 0.5703888001782464, + "grad_norm": 1.7723708152770996, + "learning_rate": 4.193576481544344e-05, + "loss": 0.3832, + "step": 7680 + }, + { + "epoch": 0.5707601470533625, + "grad_norm": 1.078263759613037, + "learning_rate": 4.192470263340051e-05, + "loss": 0.4004, + "step": 7685 + }, + { + "epoch": 0.5711314939284786, + "grad_norm": 1.061034083366394, + "learning_rate": 4.191363433046946e-05, + "loss": 0.385, + "step": 7690 + }, + { + "epoch": 0.5715028408035946, + "grad_norm": 0.9127119183540344, + "learning_rate": 4.190255991065316e-05, + "loss": 0.369, + "step": 7695 + }, + { + "epoch": 0.5718741876787107, + "grad_norm": 1.017187237739563, + "learning_rate": 4.189147937795673e-05, + "loss": 0.3792, + "step": 7700 + }, + { + "epoch": 0.5722455345538268, + "grad_norm": 0.9224918484687805, + "learning_rate": 4.188039273638749e-05, + "loss": 0.3978, + "step": 7705 + }, + { + "epoch": 0.5726168814289427, + "grad_norm": 1.0088601112365723, + "learning_rate": 4.186929998995497e-05, + "loss": 0.3739, + "step": 7710 + }, + { + "epoch": 0.5729882283040588, + "grad_norm": 1.0696107149124146, + "learning_rate": 4.1858201142670904e-05, + "loss": 0.3929, + "step": 7715 + }, + { + "epoch": 0.5733595751791749, + "grad_norm": 0.8687728047370911, + "learning_rate": 4.1847096198549236e-05, + "loss": 0.3683, + "step": 7720 + }, + { + "epoch": 0.5737309220542909, + "grad_norm": 0.7542261481285095, + "learning_rate": 4.183598516160612e-05, + "loss": 0.3854, + "step": 7725 + }, + { + "epoch": 0.574102268929407, + "grad_norm": 0.8737427592277527, + "learning_rate": 4.18248680358599e-05, + "loss": 0.3918, + "step": 7730 + }, + { + "epoch": 0.574473615804523, + "grad_norm": 0.7520382404327393, + "learning_rate": 4.1813744825331126e-05, + "loss": 0.3836, + "step": 7735 + }, + { + "epoch": 0.574844962679639, + "grad_norm": 1.4893221855163574, + "learning_rate": 4.1802615534042563e-05, + "loss": 0.3679, + "step": 7740 + }, + { + "epoch": 0.5752163095547551, + "grad_norm": 1.0394209623336792, + "learning_rate": 4.179148016601917e-05, + "loss": 0.3835, + "step": 7745 + }, + { + "epoch": 0.5755876564298712, + "grad_norm": 0.9864911437034607, + "learning_rate": 4.178033872528809e-05, + "loss": 0.3737, + "step": 7750 + }, + { + "epoch": 0.5759590033049872, + "grad_norm": 0.7848846316337585, + "learning_rate": 4.176919121587867e-05, + "loss": 0.3839, + "step": 7755 + }, + { + "epoch": 0.5763303501801033, + "grad_norm": 0.6822853088378906, + "learning_rate": 4.175803764182245e-05, + "loss": 0.3695, + "step": 7760 + }, + { + "epoch": 0.5767016970552192, + "grad_norm": 1.83903169631958, + "learning_rate": 4.1746878007153164e-05, + "loss": 0.3493, + "step": 7765 + }, + { + "epoch": 0.5770730439303353, + "grad_norm": 0.9079355001449585, + "learning_rate": 4.173571231590676e-05, + "loss": 
0.3889, + "step": 7770 + }, + { + "epoch": 0.5774443908054514, + "grad_norm": 0.8775737285614014, + "learning_rate": 4.172454057212134e-05, + "loss": 0.3736, + "step": 7775 + }, + { + "epoch": 0.5778157376805674, + "grad_norm": 0.8449175953865051, + "learning_rate": 4.171336277983721e-05, + "loss": 0.3912, + "step": 7780 + }, + { + "epoch": 0.5781870845556835, + "grad_norm": 0.9498149752616882, + "learning_rate": 4.170217894309686e-05, + "loss": 0.3749, + "step": 7785 + }, + { + "epoch": 0.5785584314307995, + "grad_norm": 0.8782013058662415, + "learning_rate": 4.1690989065944985e-05, + "loss": 0.3888, + "step": 7790 + }, + { + "epoch": 0.5789297783059155, + "grad_norm": 0.9666025042533875, + "learning_rate": 4.167979315242844e-05, + "loss": 0.3881, + "step": 7795 + }, + { + "epoch": 0.5793011251810316, + "grad_norm": 1.0839999914169312, + "learning_rate": 4.1668591206596274e-05, + "loss": 0.3891, + "step": 7800 + }, + { + "epoch": 0.5796724720561477, + "grad_norm": 0.8281628489494324, + "learning_rate": 4.165738323249972e-05, + "loss": 0.3787, + "step": 7805 + }, + { + "epoch": 0.5800438189312637, + "grad_norm": 0.7159334421157837, + "learning_rate": 4.1646169234192185e-05, + "loss": 0.4055, + "step": 7810 + }, + { + "epoch": 0.5804151658063798, + "grad_norm": 0.9558557868003845, + "learning_rate": 4.163494921572926e-05, + "loss": 0.398, + "step": 7815 + }, + { + "epoch": 0.5807865126814958, + "grad_norm": 0.7024179100990295, + "learning_rate": 4.162372318116871e-05, + "loss": 0.3903, + "step": 7820 + }, + { + "epoch": 0.5811578595566118, + "grad_norm": 0.9543454647064209, + "learning_rate": 4.161249113457047e-05, + "loss": 0.3796, + "step": 7825 + }, + { + "epoch": 0.5815292064317279, + "grad_norm": 0.8931434750556946, + "learning_rate": 4.160125307999667e-05, + "loss": 0.3798, + "step": 7830 + }, + { + "epoch": 0.5819005533068439, + "grad_norm": 0.9350513219833374, + "learning_rate": 4.1590009021511585e-05, + "loss": 0.3922, + "step": 7835 + }, + { + "epoch": 0.58227190018196, + "grad_norm": 1.3297828435897827, + "learning_rate": 4.157875896318168e-05, + "loss": 0.384, + "step": 7840 + }, + { + "epoch": 0.582643247057076, + "grad_norm": 0.8016966581344604, + "learning_rate": 4.156750290907559e-05, + "loss": 0.3897, + "step": 7845 + }, + { + "epoch": 0.583014593932192, + "grad_norm": 0.7308679223060608, + "learning_rate": 4.155624086326409e-05, + "loss": 0.384, + "step": 7850 + }, + { + "epoch": 0.5833859408073081, + "grad_norm": 0.9664947390556335, + "learning_rate": 4.1544972829820173e-05, + "loss": 0.3586, + "step": 7855 + }, + { + "epoch": 0.5837572876824242, + "grad_norm": 0.7831148505210876, + "learning_rate": 4.153369881281897e-05, + "loss": 0.3751, + "step": 7860 + }, + { + "epoch": 0.5841286345575402, + "grad_norm": 0.8598526120185852, + "learning_rate": 4.152241881633775e-05, + "loss": 0.3967, + "step": 7865 + }, + { + "epoch": 0.5844999814326562, + "grad_norm": 0.9544880986213684, + "learning_rate": 4.151113284445599e-05, + "loss": 0.3772, + "step": 7870 + }, + { + "epoch": 0.5848713283077723, + "grad_norm": 0.9176838397979736, + "learning_rate": 4.14998409012553e-05, + "loss": 0.3836, + "step": 7875 + }, + { + "epoch": 0.5852426751828883, + "grad_norm": 0.7144293785095215, + "learning_rate": 4.148854299081947e-05, + "loss": 0.3762, + "step": 7880 + }, + { + "epoch": 0.5856140220580044, + "grad_norm": 0.9049531817436218, + "learning_rate": 4.147723911723441e-05, + "loss": 0.3935, + "step": 7885 + }, + { + "epoch": 0.5859853689331205, + "grad_norm": 1.0131149291992188, + 
"learning_rate": 4.146592928458824e-05, + "loss": 0.3899, + "step": 7890 + }, + { + "epoch": 0.5863567158082365, + "grad_norm": 0.9420373439788818, + "learning_rate": 4.1454613496971195e-05, + "loss": 0.3769, + "step": 7895 + }, + { + "epoch": 0.5867280626833525, + "grad_norm": 1.0125765800476074, + "learning_rate": 4.144329175847567e-05, + "loss": 0.3655, + "step": 7900 + }, + { + "epoch": 0.5870994095584685, + "grad_norm": 0.810465395450592, + "learning_rate": 4.143196407319623e-05, + "loss": 0.3802, + "step": 7905 + }, + { + "epoch": 0.5874707564335846, + "grad_norm": 0.9369770884513855, + "learning_rate": 4.142063044522958e-05, + "loss": 0.3688, + "step": 7910 + }, + { + "epoch": 0.5878421033087007, + "grad_norm": 0.9551800489425659, + "learning_rate": 4.140929087867456e-05, + "loss": 0.3758, + "step": 7915 + }, + { + "epoch": 0.5882134501838167, + "grad_norm": 1.0710747241973877, + "learning_rate": 4.139794537763217e-05, + "loss": 0.3665, + "step": 7920 + }, + { + "epoch": 0.5885847970589327, + "grad_norm": 1.1802254915237427, + "learning_rate": 4.138659394620558e-05, + "loss": 0.3828, + "step": 7925 + }, + { + "epoch": 0.5889561439340488, + "grad_norm": 0.9443197846412659, + "learning_rate": 4.137523658850005e-05, + "loss": 0.3853, + "step": 7930 + }, + { + "epoch": 0.5893274908091648, + "grad_norm": 0.7773070931434631, + "learning_rate": 4.136387330862303e-05, + "loss": 0.3952, + "step": 7935 + }, + { + "epoch": 0.5896988376842809, + "grad_norm": 0.8189634084701538, + "learning_rate": 4.135250411068411e-05, + "loss": 0.3939, + "step": 7940 + }, + { + "epoch": 0.590070184559397, + "grad_norm": 0.9138241410255432, + "learning_rate": 4.1341128998795e-05, + "loss": 0.4005, + "step": 7945 + }, + { + "epoch": 0.590441531434513, + "grad_norm": 0.9136779308319092, + "learning_rate": 4.132974797706954e-05, + "loss": 0.3808, + "step": 7950 + }, + { + "epoch": 0.590812878309629, + "grad_norm": 1.363004446029663, + "learning_rate": 4.131836104962373e-05, + "loss": 0.3793, + "step": 7955 + }, + { + "epoch": 0.5911842251847451, + "grad_norm": 0.699315071105957, + "learning_rate": 4.1306968220575716e-05, + "loss": 0.3825, + "step": 7960 + }, + { + "epoch": 0.5915555720598611, + "grad_norm": 0.9782199859619141, + "learning_rate": 4.1295569494045736e-05, + "loss": 0.3861, + "step": 7965 + }, + { + "epoch": 0.5919269189349772, + "grad_norm": 0.991134524345398, + "learning_rate": 4.128416487415621e-05, + "loss": 0.374, + "step": 7970 + }, + { + "epoch": 0.5922982658100933, + "grad_norm": 0.8225498199462891, + "learning_rate": 4.127275436503165e-05, + "loss": 0.3808, + "step": 7975 + }, + { + "epoch": 0.5926696126852092, + "grad_norm": 1.1791242361068726, + "learning_rate": 4.126133797079872e-05, + "loss": 0.3901, + "step": 7980 + }, + { + "epoch": 0.5930409595603253, + "grad_norm": 0.8043322563171387, + "learning_rate": 4.1249915695586195e-05, + "loss": 0.3776, + "step": 7985 + }, + { + "epoch": 0.5934123064354413, + "grad_norm": 0.9959288835525513, + "learning_rate": 4.123848754352501e-05, + "loss": 0.3846, + "step": 7990 + }, + { + "epoch": 0.5937836533105574, + "grad_norm": 1.1933143138885498, + "learning_rate": 4.122705351874818e-05, + "loss": 0.3712, + "step": 7995 + }, + { + "epoch": 0.5941550001856735, + "grad_norm": 1.0833144187927246, + "learning_rate": 4.121561362539087e-05, + "loss": 0.397, + "step": 8000 + }, + { + "epoch": 0.5945263470607894, + "grad_norm": 0.8315593600273132, + "learning_rate": 4.120416786759038e-05, + "loss": 0.3924, + "step": 8005 + }, + { + "epoch": 
0.5948976939359055, + "grad_norm": 0.6777327656745911, + "learning_rate": 4.119271624948611e-05, + "loss": 0.3754, + "step": 8010 + }, + { + "epoch": 0.5952690408110216, + "grad_norm": 0.9331532120704651, + "learning_rate": 4.1181258775219566e-05, + "loss": 0.3777, + "step": 8015 + }, + { + "epoch": 0.5956403876861376, + "grad_norm": 0.8980804085731506, + "learning_rate": 4.1169795448934414e-05, + "loss": 0.4035, + "step": 8020 + }, + { + "epoch": 0.5960117345612537, + "grad_norm": 0.9153735041618347, + "learning_rate": 4.1158326274776404e-05, + "loss": 0.3697, + "step": 8025 + }, + { + "epoch": 0.5963830814363698, + "grad_norm": 0.8011136651039124, + "learning_rate": 4.114685125689341e-05, + "loss": 0.3739, + "step": 8030 + }, + { + "epoch": 0.5967544283114857, + "grad_norm": 0.7510383725166321, + "learning_rate": 4.113537039943542e-05, + "loss": 0.3825, + "step": 8035 + }, + { + "epoch": 0.5971257751866018, + "grad_norm": 0.9823917746543884, + "learning_rate": 4.112388370655453e-05, + "loss": 0.3692, + "step": 8040 + }, + { + "epoch": 0.5974971220617179, + "grad_norm": 0.7274696826934814, + "learning_rate": 4.111239118240495e-05, + "loss": 0.3599, + "step": 8045 + }, + { + "epoch": 0.5978684689368339, + "grad_norm": 0.8090460300445557, + "learning_rate": 4.110089283114301e-05, + "loss": 0.3787, + "step": 8050 + }, + { + "epoch": 0.59823981581195, + "grad_norm": 0.7262414693832397, + "learning_rate": 4.1089388656927116e-05, + "loss": 0.3765, + "step": 8055 + }, + { + "epoch": 0.5986111626870659, + "grad_norm": 0.8816661834716797, + "learning_rate": 4.107787866391782e-05, + "loss": 0.3741, + "step": 8060 + }, + { + "epoch": 0.598982509562182, + "grad_norm": 0.8424727916717529, + "learning_rate": 4.106636285627774e-05, + "loss": 0.3892, + "step": 8065 + }, + { + "epoch": 0.5993538564372981, + "grad_norm": 0.7614096999168396, + "learning_rate": 4.105484123817163e-05, + "loss": 0.4107, + "step": 8070 + }, + { + "epoch": 0.5997252033124141, + "grad_norm": 0.758870005607605, + "learning_rate": 4.1043313813766314e-05, + "loss": 0.3829, + "step": 8075 + }, + { + "epoch": 0.6000965501875302, + "grad_norm": 0.8257849812507629, + "learning_rate": 4.103178058723075e-05, + "loss": 0.3938, + "step": 8080 + }, + { + "epoch": 0.6004678970626462, + "grad_norm": 2.6828434467315674, + "learning_rate": 4.102024156273597e-05, + "loss": 0.4119, + "step": 8085 + }, + { + "epoch": 0.6008392439377622, + "grad_norm": 0.8033463954925537, + "learning_rate": 4.100869674445511e-05, + "loss": 0.3991, + "step": 8090 + }, + { + "epoch": 0.6012105908128783, + "grad_norm": 0.9146053791046143, + "learning_rate": 4.099714613656339e-05, + "loss": 0.3687, + "step": 8095 + }, + { + "epoch": 0.6015819376879944, + "grad_norm": 0.7598369121551514, + "learning_rate": 4.098558974323816e-05, + "loss": 0.3657, + "step": 8100 + }, + { + "epoch": 0.6019532845631104, + "grad_norm": 1.4062983989715576, + "learning_rate": 4.09740275686588e-05, + "loss": 0.4008, + "step": 8105 + }, + { + "epoch": 0.6023246314382265, + "grad_norm": 0.8199751973152161, + "learning_rate": 4.096245961700685e-05, + "loss": 0.3914, + "step": 8110 + }, + { + "epoch": 0.6026959783133425, + "grad_norm": 0.7408627271652222, + "learning_rate": 4.0950885892465904e-05, + "loss": 0.3501, + "step": 8115 + }, + { + "epoch": 0.6030673251884585, + "grad_norm": 0.9060487151145935, + "learning_rate": 4.093930639922163e-05, + "loss": 0.3818, + "step": 8120 + }, + { + "epoch": 0.6034386720635746, + "grad_norm": 0.8455791473388672, + "learning_rate": 4.0927721141461816e-05, + 
"loss": 0.3913, + "step": 8125 + }, + { + "epoch": 0.6038100189386907, + "grad_norm": 0.8929671049118042, + "learning_rate": 4.091613012337629e-05, + "loss": 0.367, + "step": 8130 + }, + { + "epoch": 0.6041813658138067, + "grad_norm": 0.8208998441696167, + "learning_rate": 4.0904533349157035e-05, + "loss": 0.3663, + "step": 8135 + }, + { + "epoch": 0.6045527126889227, + "grad_norm": 0.7038164138793945, + "learning_rate": 4.089293082299805e-05, + "loss": 0.3583, + "step": 8140 + }, + { + "epoch": 0.6049240595640387, + "grad_norm": 0.8460705876350403, + "learning_rate": 4.088132254909542e-05, + "loss": 0.3791, + "step": 8145 + }, + { + "epoch": 0.6052954064391548, + "grad_norm": 0.8331727981567383, + "learning_rate": 4.0869708531647346e-05, + "loss": 0.3842, + "step": 8150 + }, + { + "epoch": 0.6056667533142709, + "grad_norm": 0.7929322123527527, + "learning_rate": 4.085808877485407e-05, + "loss": 0.3791, + "step": 8155 + }, + { + "epoch": 0.6060381001893869, + "grad_norm": 0.850964367389679, + "learning_rate": 4.0846463282917944e-05, + "loss": 0.3855, + "step": 8160 + }, + { + "epoch": 0.606409447064503, + "grad_norm": 0.7293786406517029, + "learning_rate": 4.083483206004336e-05, + "loss": 0.3748, + "step": 8165 + }, + { + "epoch": 0.606780793939619, + "grad_norm": 0.9626237750053406, + "learning_rate": 4.0823195110436805e-05, + "loss": 0.367, + "step": 8170 + }, + { + "epoch": 0.607152140814735, + "grad_norm": 0.9357216954231262, + "learning_rate": 4.0811552438306824e-05, + "loss": 0.387, + "step": 8175 + }, + { + "epoch": 0.6075234876898511, + "grad_norm": 0.8385303616523743, + "learning_rate": 4.079990404786404e-05, + "loss": 0.3803, + "step": 8180 + }, + { + "epoch": 0.6078948345649672, + "grad_norm": 0.8900553584098816, + "learning_rate": 4.078824994332114e-05, + "loss": 0.4066, + "step": 8185 + }, + { + "epoch": 0.6082661814400832, + "grad_norm": 0.906819760799408, + "learning_rate": 4.0776590128892886e-05, + "loss": 0.3848, + "step": 8190 + }, + { + "epoch": 0.6086375283151992, + "grad_norm": 0.7580803632736206, + "learning_rate": 4.076492460879609e-05, + "loss": 0.3857, + "step": 8195 + }, + { + "epoch": 0.6090088751903153, + "grad_norm": 0.9620898365974426, + "learning_rate": 4.0753253387249626e-05, + "loss": 0.3959, + "step": 8200 + }, + { + "epoch": 0.6093802220654313, + "grad_norm": 0.7553649544715881, + "learning_rate": 4.074157646847446e-05, + "loss": 0.3752, + "step": 8205 + }, + { + "epoch": 0.6097515689405474, + "grad_norm": 1.101928472518921, + "learning_rate": 4.072989385669358e-05, + "loss": 0.3945, + "step": 8210 + }, + { + "epoch": 0.6101229158156634, + "grad_norm": 0.6902221441268921, + "learning_rate": 4.071820555613206e-05, + "loss": 0.3756, + "step": 8215 + }, + { + "epoch": 0.6104942626907794, + "grad_norm": 0.7055659294128418, + "learning_rate": 4.070651157101701e-05, + "loss": 0.3882, + "step": 8220 + }, + { + "epoch": 0.6108656095658955, + "grad_norm": 0.929861843585968, + "learning_rate": 4.0694811905577634e-05, + "loss": 0.4055, + "step": 8225 + }, + { + "epoch": 0.6112369564410115, + "grad_norm": 1.9215669631958008, + "learning_rate": 4.0683106564045135e-05, + "loss": 0.3598, + "step": 8230 + }, + { + "epoch": 0.6116083033161276, + "grad_norm": 0.7936436533927917, + "learning_rate": 4.067139555065281e-05, + "loss": 0.3895, + "step": 8235 + }, + { + "epoch": 0.6119796501912437, + "grad_norm": 0.8062562346458435, + "learning_rate": 4.065967886963601e-05, + "loss": 0.3681, + "step": 8240 + }, + { + "epoch": 0.6123509970663596, + "grad_norm": 0.983604371547699, 
+ "learning_rate": 4.0647956525232094e-05, + "loss": 0.3759, + "step": 8245 + }, + { + "epoch": 0.6127223439414757, + "grad_norm": 1.1448338031768799, + "learning_rate": 4.063622852168051e-05, + "loss": 0.3793, + "step": 8250 + }, + { + "epoch": 0.6130936908165918, + "grad_norm": 0.9411412477493286, + "learning_rate": 4.0624494863222736e-05, + "loss": 0.3812, + "step": 8255 + }, + { + "epoch": 0.6134650376917078, + "grad_norm": 0.7395769953727722, + "learning_rate": 4.061275555410229e-05, + "loss": 0.3808, + "step": 8260 + }, + { + "epoch": 0.6138363845668239, + "grad_norm": 0.8813555240631104, + "learning_rate": 4.060101059856476e-05, + "loss": 0.3736, + "step": 8265 + }, + { + "epoch": 0.61420773144194, + "grad_norm": 0.7604705095291138, + "learning_rate": 4.0589260000857745e-05, + "loss": 0.3914, + "step": 8270 + }, + { + "epoch": 0.6145790783170559, + "grad_norm": 1.0517133474349976, + "learning_rate": 4.0577503765230893e-05, + "loss": 0.3586, + "step": 8275 + }, + { + "epoch": 0.614950425192172, + "grad_norm": 0.8700730204582214, + "learning_rate": 4.05657418959359e-05, + "loss": 0.383, + "step": 8280 + }, + { + "epoch": 0.6153217720672881, + "grad_norm": 0.8465512990951538, + "learning_rate": 4.05539743972265e-05, + "loss": 0.3552, + "step": 8285 + }, + { + "epoch": 0.6156931189424041, + "grad_norm": 1.0167089700698853, + "learning_rate": 4.0542201273358446e-05, + "loss": 0.4038, + "step": 8290 + }, + { + "epoch": 0.6160644658175202, + "grad_norm": 0.9293932914733887, + "learning_rate": 4.053042252858955e-05, + "loss": 0.3929, + "step": 8295 + }, + { + "epoch": 0.6164358126926361, + "grad_norm": 0.8165609240531921, + "learning_rate": 4.051863816717964e-05, + "loss": 0.3744, + "step": 8300 + }, + { + "epoch": 0.6168071595677522, + "grad_norm": 0.885839581489563, + "learning_rate": 4.050684819339056e-05, + "loss": 0.3824, + "step": 8305 + }, + { + "epoch": 0.6171785064428683, + "grad_norm": 1.0093748569488525, + "learning_rate": 4.049505261148622e-05, + "loss": 0.3811, + "step": 8310 + }, + { + "epoch": 0.6175498533179843, + "grad_norm": 0.9099218845367432, + "learning_rate": 4.048325142573254e-05, + "loss": 0.3781, + "step": 8315 + }, + { + "epoch": 0.6179212001931004, + "grad_norm": 0.7306044101715088, + "learning_rate": 4.047144464039746e-05, + "loss": 0.3505, + "step": 8320 + }, + { + "epoch": 0.6182925470682165, + "grad_norm": 0.8570384383201599, + "learning_rate": 4.045963225975097e-05, + "loss": 0.3927, + "step": 8325 + }, + { + "epoch": 0.6186638939433324, + "grad_norm": 0.8275445699691772, + "learning_rate": 4.0447814288065044e-05, + "loss": 0.3814, + "step": 8330 + }, + { + "epoch": 0.6190352408184485, + "grad_norm": 0.906373918056488, + "learning_rate": 4.0435990729613706e-05, + "loss": 0.365, + "step": 8335 + }, + { + "epoch": 0.6194065876935646, + "grad_norm": 0.8798776865005493, + "learning_rate": 4.0424161588673e-05, + "loss": 0.3808, + "step": 8340 + }, + { + "epoch": 0.6197779345686806, + "grad_norm": 1.0505601167678833, + "learning_rate": 4.041232686952098e-05, + "loss": 0.4154, + "step": 8345 + }, + { + "epoch": 0.6201492814437967, + "grad_norm": 0.8700944185256958, + "learning_rate": 4.0400486576437724e-05, + "loss": 0.402, + "step": 8350 + }, + { + "epoch": 0.6205206283189127, + "grad_norm": 0.9342040419578552, + "learning_rate": 4.038864071370532e-05, + "loss": 0.3918, + "step": 8355 + }, + { + "epoch": 0.6208919751940287, + "grad_norm": 0.7223671674728394, + "learning_rate": 4.037678928560786e-05, + "loss": 0.3933, + "step": 8360 + }, + { + "epoch": 
0.6212633220691448, + "grad_norm": 1.1166205406188965, + "learning_rate": 4.036493229643149e-05, + "loss": 0.408, + "step": 8365 + }, + { + "epoch": 0.6216346689442608, + "grad_norm": 1.0232312679290771, + "learning_rate": 4.0353069750464314e-05, + "loss": 0.3843, + "step": 8370 + }, + { + "epoch": 0.6220060158193769, + "grad_norm": 0.9179908037185669, + "learning_rate": 4.0341201651996485e-05, + "loss": 0.4008, + "step": 8375 + }, + { + "epoch": 0.622377362694493, + "grad_norm": 0.725074291229248, + "learning_rate": 4.032932800532014e-05, + "loss": 0.3791, + "step": 8380 + }, + { + "epoch": 0.6227487095696089, + "grad_norm": 0.7987901568412781, + "learning_rate": 4.031744881472944e-05, + "loss": 0.3634, + "step": 8385 + }, + { + "epoch": 0.623120056444725, + "grad_norm": 0.7891753315925598, + "learning_rate": 4.0305564084520536e-05, + "loss": 0.3629, + "step": 8390 + }, + { + "epoch": 0.6234914033198411, + "grad_norm": 0.9482506513595581, + "learning_rate": 4.029367381899159e-05, + "loss": 0.3832, + "step": 8395 + }, + { + "epoch": 0.6238627501949571, + "grad_norm": 0.7741977572441101, + "learning_rate": 4.028177802244277e-05, + "loss": 0.3547, + "step": 8400 + }, + { + "epoch": 0.6242340970700732, + "grad_norm": 0.6711797714233398, + "learning_rate": 4.0269876699176245e-05, + "loss": 0.394, + "step": 8405 + }, + { + "epoch": 0.6246054439451892, + "grad_norm": 1.1592880487442017, + "learning_rate": 4.0257969853496156e-05, + "loss": 0.407, + "step": 8410 + }, + { + "epoch": 0.6249767908203052, + "grad_norm": 0.9707858562469482, + "learning_rate": 4.0246057489708686e-05, + "loss": 0.3682, + "step": 8415 + }, + { + "epoch": 0.6253481376954213, + "grad_norm": 0.9577823281288147, + "learning_rate": 4.023413961212198e-05, + "loss": 0.3788, + "step": 8420 + }, + { + "epoch": 0.6257194845705374, + "grad_norm": 0.9387092590332031, + "learning_rate": 4.0222216225046184e-05, + "loss": 0.3768, + "step": 8425 + }, + { + "epoch": 0.6260908314456534, + "grad_norm": 1.0328980684280396, + "learning_rate": 4.021028733279344e-05, + "loss": 0.3651, + "step": 8430 + }, + { + "epoch": 0.6264621783207694, + "grad_norm": 1.5044817924499512, + "learning_rate": 4.0198352939677894e-05, + "loss": 0.3812, + "step": 8435 + }, + { + "epoch": 0.6268335251958854, + "grad_norm": 0.967638373374939, + "learning_rate": 4.0186413050015657e-05, + "loss": 0.3899, + "step": 8440 + }, + { + "epoch": 0.6272048720710015, + "grad_norm": 0.8514472246170044, + "learning_rate": 4.017446766812484e-05, + "loss": 0.3891, + "step": 8445 + }, + { + "epoch": 0.6275762189461176, + "grad_norm": 0.7428901791572571, + "learning_rate": 4.0162516798325535e-05, + "loss": 0.3713, + "step": 8450 + }, + { + "epoch": 0.6279475658212336, + "grad_norm": 0.8475235104560852, + "learning_rate": 4.0150560444939826e-05, + "loss": 0.3736, + "step": 8455 + }, + { + "epoch": 0.6283189126963497, + "grad_norm": 0.8051154017448425, + "learning_rate": 4.013859861229179e-05, + "loss": 0.3801, + "step": 8460 + }, + { + "epoch": 0.6286902595714657, + "grad_norm": 0.7596623301506042, + "learning_rate": 4.012663130470746e-05, + "loss": 0.382, + "step": 8465 + }, + { + "epoch": 0.6290616064465817, + "grad_norm": 0.9572256803512573, + "learning_rate": 4.0114658526514856e-05, + "loss": 0.3771, + "step": 8470 + }, + { + "epoch": 0.6294329533216978, + "grad_norm": 0.9228476285934448, + "learning_rate": 4.0102680282044e-05, + "loss": 0.3848, + "step": 8475 + }, + { + "epoch": 0.6298043001968139, + "grad_norm": 0.8364796042442322, + "learning_rate": 4.009069657562686e-05, + 
"loss": 0.3729, + "step": 8480 + }, + { + "epoch": 0.6301756470719299, + "grad_norm": 1.0168267488479614, + "learning_rate": 4.007870741159739e-05, + "loss": 0.3632, + "step": 8485 + }, + { + "epoch": 0.6305469939470459, + "grad_norm": 0.8414918780326843, + "learning_rate": 4.006671279429154e-05, + "loss": 0.3705, + "step": 8490 + }, + { + "epoch": 0.630918340822162, + "grad_norm": 0.9206081032752991, + "learning_rate": 4.0054712728047194e-05, + "loss": 0.3769, + "step": 8495 + }, + { + "epoch": 0.631289687697278, + "grad_norm": 0.93706214427948, + "learning_rate": 4.0042707217204226e-05, + "loss": 0.3867, + "step": 8500 + }, + { + "epoch": 0.6316610345723941, + "grad_norm": 1.0076346397399902, + "learning_rate": 4.0030696266104486e-05, + "loss": 0.391, + "step": 8505 + }, + { + "epoch": 0.6320323814475102, + "grad_norm": 0.9548998475074768, + "learning_rate": 4.001867987909178e-05, + "loss": 0.3764, + "step": 8510 + }, + { + "epoch": 0.6324037283226261, + "grad_norm": 1.0261924266815186, + "learning_rate": 4.000665806051188e-05, + "loss": 0.3743, + "step": 8515 + }, + { + "epoch": 0.6327750751977422, + "grad_norm": 1.048486351966858, + "learning_rate": 3.999463081471254e-05, + "loss": 0.3913, + "step": 8520 + }, + { + "epoch": 0.6331464220728582, + "grad_norm": 1.449408769607544, + "learning_rate": 3.998259814604345e-05, + "loss": 0.3814, + "step": 8525 + }, + { + "epoch": 0.6335177689479743, + "grad_norm": 0.6910377144813538, + "learning_rate": 3.997056005885628e-05, + "loss": 0.3701, + "step": 8530 + }, + { + "epoch": 0.6338891158230904, + "grad_norm": 2.5074048042297363, + "learning_rate": 3.9958516557504635e-05, + "loss": 0.4016, + "step": 8535 + }, + { + "epoch": 0.6342604626982064, + "grad_norm": 0.9100797772407532, + "learning_rate": 3.994646764634413e-05, + "loss": 0.368, + "step": 8540 + }, + { + "epoch": 0.6346318095733224, + "grad_norm": 0.9319835901260376, + "learning_rate": 3.993441332973228e-05, + "loss": 0.3704, + "step": 8545 + }, + { + "epoch": 0.6350031564484385, + "grad_norm": 1.1303112506866455, + "learning_rate": 3.9922353612028576e-05, + "loss": 0.3715, + "step": 8550 + }, + { + "epoch": 0.6353745033235545, + "grad_norm": 0.8505479693412781, + "learning_rate": 3.991028849759448e-05, + "loss": 0.3563, + "step": 8555 + }, + { + "epoch": 0.6357458501986706, + "grad_norm": 0.7193156480789185, + "learning_rate": 3.9898217990793375e-05, + "loss": 0.3668, + "step": 8560 + }, + { + "epoch": 0.6361171970737867, + "grad_norm": 0.8877584934234619, + "learning_rate": 3.9886142095990624e-05, + "loss": 0.3501, + "step": 8565 + }, + { + "epoch": 0.6364885439489026, + "grad_norm": 0.8598123788833618, + "learning_rate": 3.987406081755351e-05, + "loss": 0.3898, + "step": 8570 + }, + { + "epoch": 0.6368598908240187, + "grad_norm": 0.7827747464179993, + "learning_rate": 3.9861974159851286e-05, + "loss": 0.3747, + "step": 8575 + }, + { + "epoch": 0.6372312376991348, + "grad_norm": 1.0073925256729126, + "learning_rate": 3.984988212725514e-05, + "loss": 0.3731, + "step": 8580 + }, + { + "epoch": 0.6376025845742508, + "grad_norm": 0.9102515578269958, + "learning_rate": 3.983778472413821e-05, + "loss": 0.3815, + "step": 8585 + }, + { + "epoch": 0.6379739314493669, + "grad_norm": 0.9365094900131226, + "learning_rate": 3.982568195487555e-05, + "loss": 0.3836, + "step": 8590 + }, + { + "epoch": 0.6383452783244828, + "grad_norm": 0.9244111776351929, + "learning_rate": 3.9813573823844194e-05, + "loss": 0.3957, + "step": 8595 + }, + { + "epoch": 0.6387166251995989, + "grad_norm": 
1.1187372207641602, + "learning_rate": 3.98014603354231e-05, + "loss": 0.4007, + "step": 8600 + }, + { + "epoch": 0.639087972074715, + "grad_norm": 0.9645322561264038, + "learning_rate": 3.978934149399315e-05, + "loss": 0.3768, + "step": 8605 + }, + { + "epoch": 0.639459318949831, + "grad_norm": 1.0242455005645752, + "learning_rate": 3.977721730393719e-05, + "loss": 0.3664, + "step": 8610 + }, + { + "epoch": 0.6398306658249471, + "grad_norm": 0.9759249687194824, + "learning_rate": 3.976508776963996e-05, + "loss": 0.3875, + "step": 8615 + }, + { + "epoch": 0.6402020127000632, + "grad_norm": 0.7980156540870667, + "learning_rate": 3.975295289548817e-05, + "loss": 0.3865, + "step": 8620 + }, + { + "epoch": 0.6405733595751791, + "grad_norm": 0.6709896326065063, + "learning_rate": 3.9740812685870445e-05, + "loss": 0.3523, + "step": 8625 + }, + { + "epoch": 0.6409447064502952, + "grad_norm": 0.8203238248825073, + "learning_rate": 3.972866714517734e-05, + "loss": 0.3508, + "step": 8630 + }, + { + "epoch": 0.6413160533254113, + "grad_norm": 1.0209792852401733, + "learning_rate": 3.9716516277801344e-05, + "loss": 0.358, + "step": 8635 + }, + { + "epoch": 0.6416874002005273, + "grad_norm": 1.0689033269882202, + "learning_rate": 3.970436008813686e-05, + "loss": 0.3918, + "step": 8640 + }, + { + "epoch": 0.6420587470756434, + "grad_norm": 1.2434110641479492, + "learning_rate": 3.969219858058022e-05, + "loss": 0.382, + "step": 8645 + }, + { + "epoch": 0.6424300939507595, + "grad_norm": 0.8145067691802979, + "learning_rate": 3.96800317595297e-05, + "loss": 0.3613, + "step": 8650 + }, + { + "epoch": 0.6428014408258754, + "grad_norm": 1.0757936239242554, + "learning_rate": 3.9667859629385475e-05, + "loss": 0.3759, + "step": 8655 + }, + { + "epoch": 0.6431727877009915, + "grad_norm": 1.010427713394165, + "learning_rate": 3.9655682194549636e-05, + "loss": 0.3739, + "step": 8660 + }, + { + "epoch": 0.6435441345761076, + "grad_norm": 0.7364680767059326, + "learning_rate": 3.96434994594262e-05, + "loss": 0.348, + "step": 8665 + }, + { + "epoch": 0.6439154814512236, + "grad_norm": 0.7643406391143799, + "learning_rate": 3.963131142842113e-05, + "loss": 0.3514, + "step": 8670 + }, + { + "epoch": 0.6442868283263397, + "grad_norm": 0.9164127707481384, + "learning_rate": 3.961911810594226e-05, + "loss": 0.364, + "step": 8675 + }, + { + "epoch": 0.6446581752014556, + "grad_norm": 0.7336118221282959, + "learning_rate": 3.960691949639934e-05, + "loss": 0.3876, + "step": 8680 + }, + { + "epoch": 0.6450295220765717, + "grad_norm": 0.728316605091095, + "learning_rate": 3.959471560420407e-05, + "loss": 0.3902, + "step": 8685 + }, + { + "epoch": 0.6454008689516878, + "grad_norm": 0.73708575963974, + "learning_rate": 3.958250643377003e-05, + "loss": 0.3751, + "step": 8690 + }, + { + "epoch": 0.6457722158268038, + "grad_norm": 0.708919107913971, + "learning_rate": 3.957029198951271e-05, + "loss": 0.3802, + "step": 8695 + }, + { + "epoch": 0.6461435627019199, + "grad_norm": 0.9214386343955994, + "learning_rate": 3.955807227584953e-05, + "loss": 0.3688, + "step": 8700 + }, + { + "epoch": 0.646514909577036, + "grad_norm": 0.8379608392715454, + "learning_rate": 3.9545847297199784e-05, + "loss": 0.3778, + "step": 8705 + }, + { + "epoch": 0.6468862564521519, + "grad_norm": 0.7550622820854187, + "learning_rate": 3.953361705798469e-05, + "loss": 0.3723, + "step": 8710 + }, + { + "epoch": 0.647257603327268, + "grad_norm": 0.9238272309303284, + "learning_rate": 3.952138156262736e-05, + "loss": 0.3716, + "step": 8715 + }, + { + "epoch": 
0.6476289502023841, + "grad_norm": 0.844140887260437, + "learning_rate": 3.9509140815552826e-05, + "loss": 0.3875, + "step": 8720 + }, + { + "epoch": 0.6480002970775001, + "grad_norm": 0.643217146396637, + "learning_rate": 3.9496894821187995e-05, + "loss": 0.3714, + "step": 8725 + }, + { + "epoch": 0.6483716439526162, + "grad_norm": 0.7092486619949341, + "learning_rate": 3.948464358396168e-05, + "loss": 0.3826, + "step": 8730 + }, + { + "epoch": 0.6487429908277322, + "grad_norm": 0.761620044708252, + "learning_rate": 3.947238710830459e-05, + "loss": 0.3705, + "step": 8735 + }, + { + "epoch": 0.6491143377028482, + "grad_norm": 0.7885283827781677, + "learning_rate": 3.946012539864933e-05, + "loss": 0.3824, + "step": 8740 + }, + { + "epoch": 0.6494856845779643, + "grad_norm": 0.7201789021492004, + "learning_rate": 3.94478584594304e-05, + "loss": 0.3892, + "step": 8745 + }, + { + "epoch": 0.6498570314530803, + "grad_norm": 0.8360629081726074, + "learning_rate": 3.943558629508418e-05, + "loss": 0.3712, + "step": 8750 + }, + { + "epoch": 0.6502283783281964, + "grad_norm": 0.5975351333618164, + "learning_rate": 3.9423308910048974e-05, + "loss": 0.373, + "step": 8755 + }, + { + "epoch": 0.6505997252033124, + "grad_norm": 0.7709315419197083, + "learning_rate": 3.941102630876492e-05, + "loss": 0.3562, + "step": 8760 + }, + { + "epoch": 0.6509710720784284, + "grad_norm": 0.7547560930252075, + "learning_rate": 3.9398738495674096e-05, + "loss": 0.3672, + "step": 8765 + }, + { + "epoch": 0.6513424189535445, + "grad_norm": 0.7901609539985657, + "learning_rate": 3.938644547522041e-05, + "loss": 0.3691, + "step": 8770 + }, + { + "epoch": 0.6517137658286606, + "grad_norm": 0.9239474534988403, + "learning_rate": 3.9374147251849704e-05, + "loss": 0.3717, + "step": 8775 + }, + { + "epoch": 0.6520851127037766, + "grad_norm": 0.9300574064254761, + "learning_rate": 3.936184383000968e-05, + "loss": 0.3667, + "step": 8780 + }, + { + "epoch": 0.6524564595788926, + "grad_norm": 0.8764330148696899, + "learning_rate": 3.934953521414992e-05, + "loss": 0.3529, + "step": 8785 + }, + { + "epoch": 0.6528278064540087, + "grad_norm": 0.8157888650894165, + "learning_rate": 3.933722140872188e-05, + "loss": 0.3712, + "step": 8790 + }, + { + "epoch": 0.6531991533291247, + "grad_norm": 0.8766961097717285, + "learning_rate": 3.93249024181789e-05, + "loss": 0.3557, + "step": 8795 + }, + { + "epoch": 0.6535705002042408, + "grad_norm": 0.8514896631240845, + "learning_rate": 3.931257824697619e-05, + "loss": 0.3978, + "step": 8800 + }, + { + "epoch": 0.6539418470793569, + "grad_norm": 1.4988986253738403, + "learning_rate": 3.9300248899570855e-05, + "loss": 0.3756, + "step": 8805 + }, + { + "epoch": 0.6543131939544728, + "grad_norm": 1.069677710533142, + "learning_rate": 3.928791438042184e-05, + "loss": 0.3773, + "step": 8810 + }, + { + "epoch": 0.6546845408295889, + "grad_norm": 0.7891872525215149, + "learning_rate": 3.9275574693989966e-05, + "loss": 0.3627, + "step": 8815 + }, + { + "epoch": 0.655055887704705, + "grad_norm": 1.4193977117538452, + "learning_rate": 3.926322984473795e-05, + "loss": 0.3792, + "step": 8820 + }, + { + "epoch": 0.655427234579821, + "grad_norm": 0.7989646792411804, + "learning_rate": 3.925087983713034e-05, + "loss": 0.3705, + "step": 8825 + }, + { + "epoch": 0.6557985814549371, + "grad_norm": 0.8563501238822937, + "learning_rate": 3.923852467563358e-05, + "loss": 0.3847, + "step": 8830 + }, + { + "epoch": 0.656169928330053, + "grad_norm": 0.8602546453475952, + "learning_rate": 3.922616436471597e-05, + "loss": 
0.3738, + "step": 8835 + }, + { + "epoch": 0.6565412752051691, + "grad_norm": 0.9216306209564209, + "learning_rate": 3.921379890884764e-05, + "loss": 0.3612, + "step": 8840 + }, + { + "epoch": 0.6569126220802852, + "grad_norm": 0.8325752019882202, + "learning_rate": 3.920142831250063e-05, + "loss": 0.3593, + "step": 8845 + }, + { + "epoch": 0.6572839689554012, + "grad_norm": 0.9701927304267883, + "learning_rate": 3.918905258014882e-05, + "loss": 0.3981, + "step": 8850 + }, + { + "epoch": 0.6576553158305173, + "grad_norm": 0.781143069267273, + "learning_rate": 3.917667171626793e-05, + "loss": 0.3792, + "step": 8855 + }, + { + "epoch": 0.6580266627056334, + "grad_norm": 0.7671363353729248, + "learning_rate": 3.916428572533555e-05, + "loss": 0.3833, + "step": 8860 + }, + { + "epoch": 0.6583980095807493, + "grad_norm": 0.7129806876182556, + "learning_rate": 3.915189461183114e-05, + "loss": 0.3648, + "step": 8865 + }, + { + "epoch": 0.6587693564558654, + "grad_norm": 0.9245615005493164, + "learning_rate": 3.9139498380235985e-05, + "loss": 0.3976, + "step": 8870 + }, + { + "epoch": 0.6591407033309815, + "grad_norm": 0.8696265816688538, + "learning_rate": 3.9127097035033234e-05, + "loss": 0.3717, + "step": 8875 + }, + { + "epoch": 0.6595120502060975, + "grad_norm": 0.9935540556907654, + "learning_rate": 3.911469058070789e-05, + "loss": 0.3638, + "step": 8880 + }, + { + "epoch": 0.6598833970812136, + "grad_norm": 0.8621585369110107, + "learning_rate": 3.910227902174679e-05, + "loss": 0.3765, + "step": 8885 + }, + { + "epoch": 0.6602547439563297, + "grad_norm": 0.7071739435195923, + "learning_rate": 3.908986236263863e-05, + "loss": 0.3521, + "step": 8890 + }, + { + "epoch": 0.6606260908314456, + "grad_norm": 1.5234918594360352, + "learning_rate": 3.907744060787395e-05, + "loss": 0.3993, + "step": 8895 + }, + { + "epoch": 0.6609974377065617, + "grad_norm": 1.445349097251892, + "learning_rate": 3.906501376194512e-05, + "loss": 0.371, + "step": 8900 + }, + { + "epoch": 0.6613687845816777, + "grad_norm": 0.8245334625244141, + "learning_rate": 3.905258182934637e-05, + "loss": 0.4072, + "step": 8905 + }, + { + "epoch": 0.6617401314567938, + "grad_norm": 0.80816251039505, + "learning_rate": 3.904014481457374e-05, + "loss": 0.3406, + "step": 8910 + }, + { + "epoch": 0.6621114783319099, + "grad_norm": 0.8921861052513123, + "learning_rate": 3.902770272212516e-05, + "loss": 0.3795, + "step": 8915 + }, + { + "epoch": 0.6624828252070258, + "grad_norm": 1.2531591653823853, + "learning_rate": 3.901525555650033e-05, + "loss": 0.3854, + "step": 8920 + }, + { + "epoch": 0.6628541720821419, + "grad_norm": 0.9966285824775696, + "learning_rate": 3.900280332220083e-05, + "loss": 0.3734, + "step": 8925 + }, + { + "epoch": 0.663225518957258, + "grad_norm": 0.8290938138961792, + "learning_rate": 3.899034602373007e-05, + "loss": 0.3667, + "step": 8930 + }, + { + "epoch": 0.663596865832374, + "grad_norm": 0.7914450764656067, + "learning_rate": 3.8977883665593284e-05, + "loss": 0.387, + "step": 8935 + }, + { + "epoch": 0.6639682127074901, + "grad_norm": 0.899835467338562, + "learning_rate": 3.896541625229752e-05, + "loss": 0.364, + "step": 8940 + }, + { + "epoch": 0.6643395595826062, + "grad_norm": 0.8077391982078552, + "learning_rate": 3.895294378835168e-05, + "loss": 0.3782, + "step": 8945 + }, + { + "epoch": 0.6647109064577221, + "grad_norm": 0.8813906908035278, + "learning_rate": 3.8940466278266476e-05, + "loss": 0.3549, + "step": 8950 + }, + { + "epoch": 0.6650822533328382, + "grad_norm": 0.7896718978881836, + 
"learning_rate": 3.892798372655446e-05, + "loss": 0.3783, + "step": 8955 + }, + { + "epoch": 0.6654536002079543, + "grad_norm": 1.244232416152954, + "learning_rate": 3.891549613772999e-05, + "loss": 0.372, + "step": 8960 + }, + { + "epoch": 0.6658249470830703, + "grad_norm": 0.776960015296936, + "learning_rate": 3.8903003516309256e-05, + "loss": 0.3989, + "step": 8965 + }, + { + "epoch": 0.6661962939581864, + "grad_norm": 0.8111973404884338, + "learning_rate": 3.889050586681027e-05, + "loss": 0.385, + "step": 8970 + }, + { + "epoch": 0.6665676408333023, + "grad_norm": 0.7727485299110413, + "learning_rate": 3.887800319375284e-05, + "loss": 0.358, + "step": 8975 + }, + { + "epoch": 0.6669389877084184, + "grad_norm": 0.8484100103378296, + "learning_rate": 3.886549550165863e-05, + "loss": 0.3947, + "step": 8980 + }, + { + "epoch": 0.6673103345835345, + "grad_norm": 0.9307198524475098, + "learning_rate": 3.88529827950511e-05, + "loss": 0.3645, + "step": 8985 + }, + { + "epoch": 0.6676816814586505, + "grad_norm": 0.9293243288993835, + "learning_rate": 3.8840465078455505e-05, + "loss": 0.3702, + "step": 8990 + }, + { + "epoch": 0.6680530283337666, + "grad_norm": 0.8592212796211243, + "learning_rate": 3.882794235639894e-05, + "loss": 0.3785, + "step": 8995 + }, + { + "epoch": 0.6684243752088826, + "grad_norm": 0.7336313128471375, + "learning_rate": 3.881541463341029e-05, + "loss": 0.3673, + "step": 9000 + }, + { + "epoch": 0.6687957220839986, + "grad_norm": 0.9456973671913147, + "learning_rate": 3.880288191402026e-05, + "loss": 0.3889, + "step": 9005 + }, + { + "epoch": 0.6691670689591147, + "grad_norm": 0.9240425825119019, + "learning_rate": 3.8790344202761365e-05, + "loss": 0.3637, + "step": 9010 + }, + { + "epoch": 0.6695384158342308, + "grad_norm": 1.105124831199646, + "learning_rate": 3.877780150416792e-05, + "loss": 0.3795, + "step": 9015 + }, + { + "epoch": 0.6699097627093468, + "grad_norm": 0.8785472512245178, + "learning_rate": 3.8765253822776036e-05, + "loss": 0.3639, + "step": 9020 + }, + { + "epoch": 0.6702811095844629, + "grad_norm": 1.0078473091125488, + "learning_rate": 3.875270116312364e-05, + "loss": 0.3979, + "step": 9025 + }, + { + "epoch": 0.6706524564595789, + "grad_norm": 0.7979200482368469, + "learning_rate": 3.874014352975045e-05, + "loss": 0.3602, + "step": 9030 + }, + { + "epoch": 0.6710238033346949, + "grad_norm": 0.7981281876564026, + "learning_rate": 3.872758092719798e-05, + "loss": 0.36, + "step": 9035 + }, + { + "epoch": 0.671395150209811, + "grad_norm": 2.0684547424316406, + "learning_rate": 3.871501336000957e-05, + "loss": 0.3873, + "step": 9040 + }, + { + "epoch": 0.6717664970849271, + "grad_norm": 0.6863536238670349, + "learning_rate": 3.87024408327303e-05, + "loss": 0.3738, + "step": 9045 + }, + { + "epoch": 0.6721378439600431, + "grad_norm": 0.7692870497703552, + "learning_rate": 3.8689863349907084e-05, + "loss": 0.3639, + "step": 9050 + }, + { + "epoch": 0.6725091908351591, + "grad_norm": 0.8045924305915833, + "learning_rate": 3.8677280916088645e-05, + "loss": 0.3557, + "step": 9055 + }, + { + "epoch": 0.6728805377102751, + "grad_norm": 1.2326745986938477, + "learning_rate": 3.866469353582545e-05, + "loss": 0.3831, + "step": 9060 + }, + { + "epoch": 0.6732518845853912, + "grad_norm": 0.9144732356071472, + "learning_rate": 3.865210121366977e-05, + "loss": 0.3797, + "step": 9065 + }, + { + "epoch": 0.6736232314605073, + "grad_norm": 1.0086545944213867, + "learning_rate": 3.863950395417569e-05, + "loss": 0.3989, + "step": 9070 + }, + { + "epoch": 
0.6739945783356233, + "grad_norm": 1.0512826442718506, + "learning_rate": 3.8626901761899045e-05, + "loss": 0.3837, + "step": 9075 + }, + { + "epoch": 0.6743659252107393, + "grad_norm": 0.9185574054718018, + "learning_rate": 3.861429464139748e-05, + "loss": 0.369, + "step": 9080 + }, + { + "epoch": 0.6747372720858554, + "grad_norm": 0.9505536556243896, + "learning_rate": 3.860168259723041e-05, + "loss": 0.365, + "step": 9085 + }, + { + "epoch": 0.6751086189609714, + "grad_norm": 0.8058850169181824, + "learning_rate": 3.8589065633959035e-05, + "loss": 0.361, + "step": 9090 + }, + { + "epoch": 0.6754799658360875, + "grad_norm": 0.8129885792732239, + "learning_rate": 3.857644375614631e-05, + "loss": 0.3766, + "step": 9095 + }, + { + "epoch": 0.6758513127112036, + "grad_norm": 0.641776978969574, + "learning_rate": 3.8563816968357016e-05, + "loss": 0.3641, + "step": 9100 + }, + { + "epoch": 0.6762226595863196, + "grad_norm": 0.8404680490493774, + "learning_rate": 3.855118527515767e-05, + "loss": 0.361, + "step": 9105 + }, + { + "epoch": 0.6765940064614356, + "grad_norm": 0.8596828579902649, + "learning_rate": 3.853854868111658e-05, + "loss": 0.3523, + "step": 9110 + }, + { + "epoch": 0.6769653533365517, + "grad_norm": 0.7938897013664246, + "learning_rate": 3.8525907190803816e-05, + "loss": 0.3705, + "step": 9115 + }, + { + "epoch": 0.6773367002116677, + "grad_norm": 0.9502366781234741, + "learning_rate": 3.851326080879123e-05, + "loss": 0.3702, + "step": 9120 + }, + { + "epoch": 0.6777080470867838, + "grad_norm": 0.7413920760154724, + "learning_rate": 3.850060953965244e-05, + "loss": 0.369, + "step": 9125 + }, + { + "epoch": 0.6780793939618998, + "grad_norm": 0.9958828687667847, + "learning_rate": 3.848795338796283e-05, + "loss": 0.3825, + "step": 9130 + }, + { + "epoch": 0.6784507408370158, + "grad_norm": 0.7100253105163574, + "learning_rate": 3.847529235829953e-05, + "loss": 0.352, + "step": 9135 + }, + { + "epoch": 0.6788220877121319, + "grad_norm": 1.3226805925369263, + "learning_rate": 3.846262645524148e-05, + "loss": 0.3683, + "step": 9140 + }, + { + "epoch": 0.6791934345872479, + "grad_norm": 0.9204307198524475, + "learning_rate": 3.8449955683369346e-05, + "loss": 0.3678, + "step": 9145 + }, + { + "epoch": 0.679564781462364, + "grad_norm": 1.0092697143554688, + "learning_rate": 3.8437280047265554e-05, + "loss": 0.3743, + "step": 9150 + }, + { + "epoch": 0.6799361283374801, + "grad_norm": 1.3019070625305176, + "learning_rate": 3.8424599551514314e-05, + "loss": 0.3702, + "step": 9155 + }, + { + "epoch": 0.680307475212596, + "grad_norm": 0.7268680334091187, + "learning_rate": 3.841191420070158e-05, + "loss": 0.3635, + "step": 9160 + }, + { + "epoch": 0.6806788220877121, + "grad_norm": 0.938025176525116, + "learning_rate": 3.839922399941505e-05, + "loss": 0.3854, + "step": 9165 + }, + { + "epoch": 0.6810501689628282, + "grad_norm": 0.8636729717254639, + "learning_rate": 3.838652895224419e-05, + "loss": 0.3751, + "step": 9170 + }, + { + "epoch": 0.6814215158379442, + "grad_norm": 0.7120767831802368, + "learning_rate": 3.837382906378022e-05, + "loss": 0.3751, + "step": 9175 + }, + { + "epoch": 0.6817928627130603, + "grad_norm": 0.9305611848831177, + "learning_rate": 3.8361124338616105e-05, + "loss": 0.3635, + "step": 9180 + }, + { + "epoch": 0.6821642095881764, + "grad_norm": 0.6582894325256348, + "learning_rate": 3.834841478134656e-05, + "loss": 0.3625, + "step": 9185 + }, + { + "epoch": 0.6825355564632923, + "grad_norm": 0.856611967086792, + "learning_rate": 3.833570039656805e-05, + "loss": 
0.3726, + "step": 9190 + }, + { + "epoch": 0.6829069033384084, + "grad_norm": 0.9939899444580078, + "learning_rate": 3.8322981188878786e-05, + "loss": 0.3685, + "step": 9195 + }, + { + "epoch": 0.6832782502135245, + "grad_norm": 0.9134172797203064, + "learning_rate": 3.8310257162878724e-05, + "loss": 0.3914, + "step": 9200 + }, + { + "epoch": 0.6836495970886405, + "grad_norm": 0.9030920267105103, + "learning_rate": 3.829752832316955e-05, + "loss": 0.3782, + "step": 9205 + }, + { + "epoch": 0.6840209439637566, + "grad_norm": 0.9032113552093506, + "learning_rate": 3.828479467435471e-05, + "loss": 0.3646, + "step": 9210 + }, + { + "epoch": 0.6843922908388725, + "grad_norm": 1.0852000713348389, + "learning_rate": 3.8272056221039375e-05, + "loss": 0.3481, + "step": 9215 + }, + { + "epoch": 0.6847636377139886, + "grad_norm": 0.8781152367591858, + "learning_rate": 3.8259312967830465e-05, + "loss": 0.3631, + "step": 9220 + }, + { + "epoch": 0.6851349845891047, + "grad_norm": 0.7677597999572754, + "learning_rate": 3.824656491933662e-05, + "loss": 0.3524, + "step": 9225 + }, + { + "epoch": 0.6855063314642207, + "grad_norm": 0.7801468968391418, + "learning_rate": 3.823381208016823e-05, + "loss": 0.3917, + "step": 9230 + }, + { + "epoch": 0.6858776783393368, + "grad_norm": 0.7930712103843689, + "learning_rate": 3.822105445493742e-05, + "loss": 0.391, + "step": 9235 + }, + { + "epoch": 0.6862490252144529, + "grad_norm": 0.8240649104118347, + "learning_rate": 3.820829204825802e-05, + "loss": 0.3703, + "step": 9240 + }, + { + "epoch": 0.6866203720895688, + "grad_norm": 0.7643125057220459, + "learning_rate": 3.819552486474562e-05, + "loss": 0.3791, + "step": 9245 + }, + { + "epoch": 0.6869917189646849, + "grad_norm": 0.9027881622314453, + "learning_rate": 3.818275290901753e-05, + "loss": 0.3388, + "step": 9250 + }, + { + "epoch": 0.687363065839801, + "grad_norm": 0.9473441243171692, + "learning_rate": 3.816997618569276e-05, + "loss": 0.3743, + "step": 9255 + }, + { + "epoch": 0.687734412714917, + "grad_norm": 0.8707664012908936, + "learning_rate": 3.8157194699392074e-05, + "loss": 0.3624, + "step": 9260 + }, + { + "epoch": 0.6881057595900331, + "grad_norm": 0.9271723628044128, + "learning_rate": 3.814440845473795e-05, + "loss": 0.3659, + "step": 9265 + }, + { + "epoch": 0.6884771064651491, + "grad_norm": 0.7871653437614441, + "learning_rate": 3.813161745635459e-05, + "loss": 0.3624, + "step": 9270 + }, + { + "epoch": 0.6888484533402651, + "grad_norm": 0.8163115978240967, + "learning_rate": 3.8118821708867903e-05, + "loss": 0.3694, + "step": 9275 + }, + { + "epoch": 0.6892198002153812, + "grad_norm": 1.2728971242904663, + "learning_rate": 3.810602121690553e-05, + "loss": 0.3728, + "step": 9280 + }, + { + "epoch": 0.6895911470904972, + "grad_norm": 0.8345783352851868, + "learning_rate": 3.809321598509682e-05, + "loss": 0.3616, + "step": 9285 + }, + { + "epoch": 0.6899624939656133, + "grad_norm": 0.7567407488822937, + "learning_rate": 3.808040601807283e-05, + "loss": 0.3727, + "step": 9290 + }, + { + "epoch": 0.6903338408407294, + "grad_norm": 0.8576236963272095, + "learning_rate": 3.806759132046635e-05, + "loss": 0.3884, + "step": 9295 + }, + { + "epoch": 0.6907051877158453, + "grad_norm": 0.7449585199356079, + "learning_rate": 3.805477189691186e-05, + "loss": 0.3729, + "step": 9300 + }, + { + "epoch": 0.6910765345909614, + "grad_norm": 0.9378268122673035, + "learning_rate": 3.804194775204556e-05, + "loss": 0.36, + "step": 9305 + }, + { + "epoch": 0.6914478814660775, + "grad_norm": 1.0872933864593506, + 
"learning_rate": 3.802911889050534e-05, + "loss": 0.3761, + "step": 9310 + }, + { + "epoch": 0.6918192283411935, + "grad_norm": 0.8560049533843994, + "learning_rate": 3.8016285316930845e-05, + "loss": 0.3716, + "step": 9315 + }, + { + "epoch": 0.6921905752163096, + "grad_norm": 1.188044786453247, + "learning_rate": 3.800344703596335e-05, + "loss": 0.3558, + "step": 9320 + }, + { + "epoch": 0.6925619220914256, + "grad_norm": 0.8799236416816711, + "learning_rate": 3.79906040522459e-05, + "loss": 0.3653, + "step": 9325 + }, + { + "epoch": 0.6929332689665416, + "grad_norm": 1.0614551305770874, + "learning_rate": 3.79777563704232e-05, + "loss": 0.3722, + "step": 9330 + }, + { + "epoch": 0.6933046158416577, + "grad_norm": 0.8129174113273621, + "learning_rate": 3.796490399514167e-05, + "loss": 0.3597, + "step": 9335 + }, + { + "epoch": 0.6936759627167738, + "grad_norm": 0.9150453209877014, + "learning_rate": 3.7952046931049425e-05, + "loss": 0.375, + "step": 9340 + }, + { + "epoch": 0.6940473095918898, + "grad_norm": 0.8379879593849182, + "learning_rate": 3.7939185182796266e-05, + "loss": 0.3811, + "step": 9345 + }, + { + "epoch": 0.6944186564670058, + "grad_norm": 1.0485833883285522, + "learning_rate": 3.7926318755033716e-05, + "loss": 0.3758, + "step": 9350 + }, + { + "epoch": 0.6947900033421219, + "grad_norm": 0.7375094890594482, + "learning_rate": 3.7913447652414955e-05, + "loss": 0.4004, + "step": 9355 + }, + { + "epoch": 0.6951613502172379, + "grad_norm": 0.8998247385025024, + "learning_rate": 3.7900571879594874e-05, + "loss": 0.3741, + "step": 9360 + }, + { + "epoch": 0.695532697092354, + "grad_norm": 0.7974969148635864, + "learning_rate": 3.7887691441230055e-05, + "loss": 0.3839, + "step": 9365 + }, + { + "epoch": 0.69590404396747, + "grad_norm": 0.7805307507514954, + "learning_rate": 3.787480634197875e-05, + "loss": 0.3626, + "step": 9370 + }, + { + "epoch": 0.696275390842586, + "grad_norm": 0.8351677656173706, + "learning_rate": 3.786191658650092e-05, + "loss": 0.368, + "step": 9375 + }, + { + "epoch": 0.6966467377177021, + "grad_norm": 0.9680357575416565, + "learning_rate": 3.784902217945819e-05, + "loss": 0.4092, + "step": 9380 + }, + { + "epoch": 0.6970180845928181, + "grad_norm": 0.7669398784637451, + "learning_rate": 3.7836123125513875e-05, + "loss": 0.3497, + "step": 9385 + }, + { + "epoch": 0.6973894314679342, + "grad_norm": 0.7332344055175781, + "learning_rate": 3.782321942933299e-05, + "loss": 0.3643, + "step": 9390 + }, + { + "epoch": 0.6977607783430503, + "grad_norm": 0.7292071580886841, + "learning_rate": 3.7810311095582176e-05, + "loss": 0.3805, + "step": 9395 + }, + { + "epoch": 0.6981321252181663, + "grad_norm": 0.8352912664413452, + "learning_rate": 3.7797398128929805e-05, + "loss": 0.3792, + "step": 9400 + }, + { + "epoch": 0.6985034720932823, + "grad_norm": 0.7182777523994446, + "learning_rate": 3.778448053404591e-05, + "loss": 0.3643, + "step": 9405 + }, + { + "epoch": 0.6988748189683984, + "grad_norm": 1.0204569101333618, + "learning_rate": 3.777155831560217e-05, + "loss": 0.3696, + "step": 9410 + }, + { + "epoch": 0.6992461658435144, + "grad_norm": 1.1198655366897583, + "learning_rate": 3.775863147827198e-05, + "loss": 0.3755, + "step": 9415 + }, + { + "epoch": 0.6996175127186305, + "grad_norm": 0.8152322173118591, + "learning_rate": 3.774570002673038e-05, + "loss": 0.3673, + "step": 9420 + }, + { + "epoch": 0.6999888595937466, + "grad_norm": 0.6712133288383484, + "learning_rate": 3.7732763965654075e-05, + "loss": 0.3435, + "step": 9425 + }, + { + "epoch": 
0.7003602064688625, + "grad_norm": 1.092829704284668, + "learning_rate": 3.7719823299721443e-05, + "loss": 0.3627, + "step": 9430 + }, + { + "epoch": 0.7007315533439786, + "grad_norm": 1.0296307802200317, + "learning_rate": 3.7706878033612534e-05, + "loss": 0.3787, + "step": 9435 + }, + { + "epoch": 0.7011029002190946, + "grad_norm": 0.8601410388946533, + "learning_rate": 3.7693928172009066e-05, + "loss": 0.3877, + "step": 9440 + }, + { + "epoch": 0.7014742470942107, + "grad_norm": 1.8063080310821533, + "learning_rate": 3.7680973719594394e-05, + "loss": 0.3596, + "step": 9445 + }, + { + "epoch": 0.7018455939693268, + "grad_norm": 1.0898480415344238, + "learning_rate": 3.766801468105355e-05, + "loss": 0.3585, + "step": 9450 + }, + { + "epoch": 0.7022169408444427, + "grad_norm": 0.840470552444458, + "learning_rate": 3.7655051061073234e-05, + "loss": 0.3789, + "step": 9455 + }, + { + "epoch": 0.7025882877195588, + "grad_norm": 0.8825913667678833, + "learning_rate": 3.764208286434179e-05, + "loss": 0.375, + "step": 9460 + }, + { + "epoch": 0.7029596345946749, + "grad_norm": 0.7693654894828796, + "learning_rate": 3.762911009554921e-05, + "loss": 0.3469, + "step": 9465 + }, + { + "epoch": 0.7033309814697909, + "grad_norm": 0.7990744113922119, + "learning_rate": 3.7616132759387156e-05, + "loss": 0.3498, + "step": 9470 + }, + { + "epoch": 0.703702328344907, + "grad_norm": 0.8645015954971313, + "learning_rate": 3.760315086054893e-05, + "loss": 0.3721, + "step": 9475 + }, + { + "epoch": 0.7040736752200231, + "grad_norm": 0.8150882720947266, + "learning_rate": 3.75901644037295e-05, + "loss": 0.369, + "step": 9480 + }, + { + "epoch": 0.704445022095139, + "grad_norm": 0.7890775203704834, + "learning_rate": 3.757717339362546e-05, + "loss": 0.3891, + "step": 9485 + }, + { + "epoch": 0.7048163689702551, + "grad_norm": 0.7995883226394653, + "learning_rate": 3.756417783493507e-05, + "loss": 0.3625, + "step": 9490 + }, + { + "epoch": 0.7051877158453712, + "grad_norm": 1.114989161491394, + "learning_rate": 3.755117773235823e-05, + "loss": 0.3743, + "step": 9495 + }, + { + "epoch": 0.7055590627204872, + "grad_norm": 1.1512959003448486, + "learning_rate": 3.753817309059646e-05, + "loss": 0.3785, + "step": 9500 + }, + { + "epoch": 0.7059304095956033, + "grad_norm": 0.7948722839355469, + "learning_rate": 3.752516391435297e-05, + "loss": 0.3863, + "step": 9505 + }, + { + "epoch": 0.7063017564707192, + "grad_norm": 0.9194521307945251, + "learning_rate": 3.751215020833255e-05, + "loss": 0.3619, + "step": 9510 + }, + { + "epoch": 0.7066731033458353, + "grad_norm": 0.8141660690307617, + "learning_rate": 3.749913197724169e-05, + "loss": 0.3721, + "step": 9515 + }, + { + "epoch": 0.7070444502209514, + "grad_norm": 0.7911691665649414, + "learning_rate": 3.748610922578848e-05, + "loss": 0.3782, + "step": 9520 + }, + { + "epoch": 0.7074157970960674, + "grad_norm": 0.9009805917739868, + "learning_rate": 3.747308195868263e-05, + "loss": 0.3323, + "step": 9525 + }, + { + "epoch": 0.7077871439711835, + "grad_norm": 1.0365558862686157, + "learning_rate": 3.746005018063554e-05, + "loss": 0.368, + "step": 9530 + }, + { + "epoch": 0.7081584908462996, + "grad_norm": 0.8006228804588318, + "learning_rate": 3.744701389636018e-05, + "loss": 0.3686, + "step": 9535 + }, + { + "epoch": 0.7085298377214155, + "grad_norm": 0.87027907371521, + "learning_rate": 3.743397311057118e-05, + "loss": 0.349, + "step": 9540 + }, + { + "epoch": 0.7089011845965316, + "grad_norm": 0.8917665481567383, + "learning_rate": 3.742092782798479e-05, + "loss": 
0.3655, + "step": 9545 + }, + { + "epoch": 0.7092725314716477, + "grad_norm": 1.0651155710220337, + "learning_rate": 3.740787805331891e-05, + "loss": 0.3842, + "step": 9550 + }, + { + "epoch": 0.7096438783467637, + "grad_norm": 1.054017424583435, + "learning_rate": 3.739482379129302e-05, + "loss": 0.3916, + "step": 9555 + }, + { + "epoch": 0.7100152252218798, + "grad_norm": 0.6938828229904175, + "learning_rate": 3.738176504662826e-05, + "loss": 0.3645, + "step": 9560 + }, + { + "epoch": 0.7103865720969958, + "grad_norm": 0.7110164165496826, + "learning_rate": 3.736870182404738e-05, + "loss": 0.356, + "step": 9565 + }, + { + "epoch": 0.7107579189721118, + "grad_norm": 1.1128145456314087, + "learning_rate": 3.735563412827475e-05, + "loss": 0.3733, + "step": 9570 + }, + { + "epoch": 0.7111292658472279, + "grad_norm": 0.9033927917480469, + "learning_rate": 3.734256196403634e-05, + "loss": 0.3504, + "step": 9575 + }, + { + "epoch": 0.711500612722344, + "grad_norm": 0.88112473487854, + "learning_rate": 3.7329485336059774e-05, + "loss": 0.3631, + "step": 9580 + }, + { + "epoch": 0.71187195959746, + "grad_norm": 0.7752759456634521, + "learning_rate": 3.731640424907425e-05, + "loss": 0.3491, + "step": 9585 + }, + { + "epoch": 0.712243306472576, + "grad_norm": 1.040805697441101, + "learning_rate": 3.7303318707810595e-05, + "loss": 0.3647, + "step": 9590 + }, + { + "epoch": 0.712614653347692, + "grad_norm": 0.7947229146957397, + "learning_rate": 3.7290228717001275e-05, + "loss": 0.3535, + "step": 9595 + }, + { + "epoch": 0.7129860002228081, + "grad_norm": 0.8026042580604553, + "learning_rate": 3.72771342813803e-05, + "loss": 0.3818, + "step": 9600 + }, + { + "epoch": 0.7133573470979242, + "grad_norm": 0.9147016406059265, + "learning_rate": 3.7264035405683364e-05, + "loss": 0.3651, + "step": 9605 + }, + { + "epoch": 0.7137286939730402, + "grad_norm": 1.2373228073120117, + "learning_rate": 3.7250932094647706e-05, + "loss": 0.3719, + "step": 9610 + }, + { + "epoch": 0.7141000408481563, + "grad_norm": 0.8862619996070862, + "learning_rate": 3.72378243530122e-05, + "loss": 0.364, + "step": 9615 + }, + { + "epoch": 0.7144713877232723, + "grad_norm": 0.8791931867599487, + "learning_rate": 3.722471218551732e-05, + "loss": 0.3795, + "step": 9620 + }, + { + "epoch": 0.7148427345983883, + "grad_norm": 0.9781063199043274, + "learning_rate": 3.7211595596905115e-05, + "loss": 0.3658, + "step": 9625 + }, + { + "epoch": 0.7152140814735044, + "grad_norm": 0.7280610799789429, + "learning_rate": 3.719847459191928e-05, + "loss": 0.3742, + "step": 9630 + }, + { + "epoch": 0.7155854283486205, + "grad_norm": 0.8904832601547241, + "learning_rate": 3.718534917530505e-05, + "loss": 0.3596, + "step": 9635 + }, + { + "epoch": 0.7159567752237365, + "grad_norm": 1.0527021884918213, + "learning_rate": 3.7172219351809315e-05, + "loss": 0.3633, + "step": 9640 + }, + { + "epoch": 0.7163281220988525, + "grad_norm": 1.0968849658966064, + "learning_rate": 3.715908512618051e-05, + "loss": 0.3743, + "step": 9645 + }, + { + "epoch": 0.7166994689739686, + "grad_norm": 0.9742524027824402, + "learning_rate": 3.714594650316869e-05, + "loss": 0.3759, + "step": 9650 + }, + { + "epoch": 0.7170708158490846, + "grad_norm": 0.8875055909156799, + "learning_rate": 3.71328034875255e-05, + "loss": 0.358, + "step": 9655 + }, + { + "epoch": 0.7174421627242007, + "grad_norm": 0.8786531686782837, + "learning_rate": 3.711965608400414e-05, + "loss": 0.3813, + "step": 9660 + }, + { + "epoch": 0.7178135095993167, + "grad_norm": 0.9650137424468994, + 
"learning_rate": 3.710650429735946e-05, + "loss": 0.3534, + "step": 9665 + }, + { + "epoch": 0.7181848564744328, + "grad_norm": 0.7877172231674194, + "learning_rate": 3.709334813234782e-05, + "loss": 0.3575, + "step": 9670 + }, + { + "epoch": 0.7185562033495488, + "grad_norm": 1.950656533241272, + "learning_rate": 3.708018759372722e-05, + "loss": 0.3697, + "step": 9675 + }, + { + "epoch": 0.7189275502246648, + "grad_norm": 0.8109143376350403, + "learning_rate": 3.706702268625721e-05, + "loss": 0.3487, + "step": 9680 + }, + { + "epoch": 0.7192988970997809, + "grad_norm": 0.8184154033660889, + "learning_rate": 3.705385341469896e-05, + "loss": 0.3634, + "step": 9685 + }, + { + "epoch": 0.719670243974897, + "grad_norm": 0.960198700428009, + "learning_rate": 3.704067978381517e-05, + "loss": 0.3768, + "step": 9690 + }, + { + "epoch": 0.720041590850013, + "grad_norm": 0.9191488027572632, + "learning_rate": 3.702750179837013e-05, + "loss": 0.3566, + "step": 9695 + }, + { + "epoch": 0.720412937725129, + "grad_norm": 1.1562496423721313, + "learning_rate": 3.701431946312974e-05, + "loss": 0.3661, + "step": 9700 + }, + { + "epoch": 0.7207842846002451, + "grad_norm": 0.9678395390510559, + "learning_rate": 3.7001132782861414e-05, + "loss": 0.3659, + "step": 9705 + }, + { + "epoch": 0.7211556314753611, + "grad_norm": 0.8770809769630432, + "learning_rate": 3.69879417623342e-05, + "loss": 0.3867, + "step": 9710 + }, + { + "epoch": 0.7215269783504772, + "grad_norm": 0.9320460557937622, + "learning_rate": 3.697474640631866e-05, + "loss": 0.358, + "step": 9715 + }, + { + "epoch": 0.7218983252255933, + "grad_norm": 0.8323056101799011, + "learning_rate": 3.6961546719586955e-05, + "loss": 0.3941, + "step": 9720 + }, + { + "epoch": 0.7222696721007092, + "grad_norm": 0.7591274976730347, + "learning_rate": 3.6948342706912815e-05, + "loss": 0.3754, + "step": 9725 + }, + { + "epoch": 0.7226410189758253, + "grad_norm": 0.9951517581939697, + "learning_rate": 3.693513437307152e-05, + "loss": 0.3804, + "step": 9730 + }, + { + "epoch": 0.7230123658509414, + "grad_norm": 1.128585696220398, + "learning_rate": 3.6921921722839906e-05, + "loss": 0.3622, + "step": 9735 + }, + { + "epoch": 0.7233837127260574, + "grad_norm": 1.0597444772720337, + "learning_rate": 3.69087047609964e-05, + "loss": 0.3773, + "step": 9740 + }, + { + "epoch": 0.7237550596011735, + "grad_norm": 2.1655824184417725, + "learning_rate": 3.689548349232097e-05, + "loss": 0.3649, + "step": 9745 + }, + { + "epoch": 0.7241264064762895, + "grad_norm": 0.9280144572257996, + "learning_rate": 3.688225792159512e-05, + "loss": 0.3645, + "step": 9750 + }, + { + "epoch": 0.7244977533514055, + "grad_norm": 0.9158397912979126, + "learning_rate": 3.6869028053601965e-05, + "loss": 0.3593, + "step": 9755 + }, + { + "epoch": 0.7248691002265216, + "grad_norm": 0.9126660823822021, + "learning_rate": 3.685579389312612e-05, + "loss": 0.3907, + "step": 9760 + }, + { + "epoch": 0.7252404471016376, + "grad_norm": 0.7070202827453613, + "learning_rate": 3.6842555444953775e-05, + "loss": 0.3507, + "step": 9765 + }, + { + "epoch": 0.7256117939767537, + "grad_norm": 0.9516451954841614, + "learning_rate": 3.682931271387267e-05, + "loss": 0.3622, + "step": 9770 + }, + { + "epoch": 0.7259831408518698, + "grad_norm": 0.8496583700180054, + "learning_rate": 3.68160657046721e-05, + "loss": 0.3653, + "step": 9775 + }, + { + "epoch": 0.7263544877269857, + "grad_norm": 0.7504469156265259, + "learning_rate": 3.680281442214288e-05, + "loss": 0.3347, + "step": 9780 + }, + { + "epoch": 
0.7267258346021018, + "grad_norm": 0.9300464391708374, + "learning_rate": 3.678955887107742e-05, + "loss": 0.3443, + "step": 9785 + }, + { + "epoch": 0.7270971814772179, + "grad_norm": 0.9748815298080444, + "learning_rate": 3.677629905626962e-05, + "loss": 0.3651, + "step": 9790 + }, + { + "epoch": 0.7274685283523339, + "grad_norm": 1.010619878768921, + "learning_rate": 3.676303498251496e-05, + "loss": 0.3774, + "step": 9795 + }, + { + "epoch": 0.72783987522745, + "grad_norm": 0.8692498803138733, + "learning_rate": 3.6749766654610445e-05, + "loss": 0.3602, + "step": 9800 + }, + { + "epoch": 0.7282112221025661, + "grad_norm": 1.0647649765014648, + "learning_rate": 3.673649407735461e-05, + "loss": 0.3669, + "step": 9805 + }, + { + "epoch": 0.728582568977682, + "grad_norm": 0.9087816476821899, + "learning_rate": 3.672321725554756e-05, + "loss": 0.3725, + "step": 9810 + }, + { + "epoch": 0.7289539158527981, + "grad_norm": 0.9249609708786011, + "learning_rate": 3.6709936193990885e-05, + "loss": 0.3845, + "step": 9815 + }, + { + "epoch": 0.7293252627279141, + "grad_norm": 0.8808562159538269, + "learning_rate": 3.669665089748775e-05, + "loss": 0.3579, + "step": 9820 + }, + { + "epoch": 0.7296966096030302, + "grad_norm": 0.9480738639831543, + "learning_rate": 3.668336137084283e-05, + "loss": 0.3585, + "step": 9825 + }, + { + "epoch": 0.7300679564781463, + "grad_norm": 1.0477834939956665, + "learning_rate": 3.667006761886235e-05, + "loss": 0.3685, + "step": 9830 + }, + { + "epoch": 0.7304393033532622, + "grad_norm": 0.8944914937019348, + "learning_rate": 3.6656769646354037e-05, + "loss": 0.3764, + "step": 9835 + }, + { + "epoch": 0.7308106502283783, + "grad_norm": 0.7739643454551697, + "learning_rate": 3.664346745812717e-05, + "loss": 0.3564, + "step": 9840 + }, + { + "epoch": 0.7311819971034944, + "grad_norm": 0.7765239477157593, + "learning_rate": 3.663016105899252e-05, + "loss": 0.3697, + "step": 9845 + }, + { + "epoch": 0.7315533439786104, + "grad_norm": 0.8198454976081848, + "learning_rate": 3.6616850453762416e-05, + "loss": 0.3508, + "step": 9850 + }, + { + "epoch": 0.7319246908537265, + "grad_norm": 0.7845231294631958, + "learning_rate": 3.6603535647250684e-05, + "loss": 0.3629, + "step": 9855 + }, + { + "epoch": 0.7322960377288426, + "grad_norm": 0.9282823204994202, + "learning_rate": 3.659021664427269e-05, + "loss": 0.3622, + "step": 9860 + }, + { + "epoch": 0.7326673846039585, + "grad_norm": 0.9496755599975586, + "learning_rate": 3.65768934496453e-05, + "loss": 0.3445, + "step": 9865 + }, + { + "epoch": 0.7330387314790746, + "grad_norm": 0.7232083082199097, + "learning_rate": 3.6563566068186895e-05, + "loss": 0.3561, + "step": 9870 + }, + { + "epoch": 0.7334100783541907, + "grad_norm": 1.0331436395645142, + "learning_rate": 3.6550234504717396e-05, + "loss": 0.3706, + "step": 9875 + }, + { + "epoch": 0.7337814252293067, + "grad_norm": 0.9413464069366455, + "learning_rate": 3.6536898764058196e-05, + "loss": 0.3433, + "step": 9880 + }, + { + "epoch": 0.7341527721044228, + "grad_norm": 0.9857883453369141, + "learning_rate": 3.652355885103223e-05, + "loss": 0.3581, + "step": 9885 + }, + { + "epoch": 0.7345241189795388, + "grad_norm": 0.8081353306770325, + "learning_rate": 3.651021477046393e-05, + "loss": 0.3914, + "step": 9890 + }, + { + "epoch": 0.7348954658546548, + "grad_norm": 1.0811405181884766, + "learning_rate": 3.649686652717925e-05, + "loss": 0.3712, + "step": 9895 + }, + { + "epoch": 0.7352668127297709, + "grad_norm": 0.816540002822876, + "learning_rate": 3.6483514126005626e-05, + 
"loss": 0.3834, + "step": 9900 + }, + { + "epoch": 0.7356381596048869, + "grad_norm": 0.7943872213363647, + "learning_rate": 3.6470157571772004e-05, + "loss": 0.3805, + "step": 9905 + }, + { + "epoch": 0.736009506480003, + "grad_norm": 0.9035302400588989, + "learning_rate": 3.645679686930883e-05, + "loss": 0.3471, + "step": 9910 + }, + { + "epoch": 0.736380853355119, + "grad_norm": 0.8763010501861572, + "learning_rate": 3.644343202344808e-05, + "loss": 0.3741, + "step": 9915 + }, + { + "epoch": 0.736752200230235, + "grad_norm": 0.7111252546310425, + "learning_rate": 3.64300630390232e-05, + "loss": 0.3737, + "step": 9920 + }, + { + "epoch": 0.7371235471053511, + "grad_norm": 1.0100241899490356, + "learning_rate": 3.641668992086913e-05, + "loss": 0.3761, + "step": 9925 + }, + { + "epoch": 0.7374948939804672, + "grad_norm": 1.498679757118225, + "learning_rate": 3.640331267382231e-05, + "loss": 0.3887, + "step": 9930 + }, + { + "epoch": 0.7378662408555832, + "grad_norm": 1.0976613759994507, + "learning_rate": 3.638993130272068e-05, + "loss": 0.357, + "step": 9935 + }, + { + "epoch": 0.7382375877306993, + "grad_norm": 0.8685204982757568, + "learning_rate": 3.6376545812403675e-05, + "loss": 0.3666, + "step": 9940 + }, + { + "epoch": 0.7386089346058153, + "grad_norm": 0.7913703322410583, + "learning_rate": 3.63631562077122e-05, + "loss": 0.3803, + "step": 9945 + }, + { + "epoch": 0.7389802814809313, + "grad_norm": 0.8833486437797546, + "learning_rate": 3.634976249348867e-05, + "loss": 0.3465, + "step": 9950 + }, + { + "epoch": 0.7393516283560474, + "grad_norm": 1.0097178220748901, + "learning_rate": 3.633636467457697e-05, + "loss": 0.3666, + "step": 9955 + }, + { + "epoch": 0.7397229752311635, + "grad_norm": 0.7995867133140564, + "learning_rate": 3.632296275582249e-05, + "loss": 0.3568, + "step": 9960 + }, + { + "epoch": 0.7400943221062795, + "grad_norm": 0.8641396760940552, + "learning_rate": 3.6309556742072075e-05, + "loss": 0.3428, + "step": 9965 + }, + { + "epoch": 0.7404656689813955, + "grad_norm": 0.9114529490470886, + "learning_rate": 3.629614663817407e-05, + "loss": 0.3701, + "step": 9970 + }, + { + "epoch": 0.7408370158565115, + "grad_norm": 0.8092938661575317, + "learning_rate": 3.628273244897828e-05, + "loss": 0.3676, + "step": 9975 + }, + { + "epoch": 0.7412083627316276, + "grad_norm": 0.8112850785255432, + "learning_rate": 3.6269314179336025e-05, + "loss": 0.3839, + "step": 9980 + }, + { + "epoch": 0.7415797096067437, + "grad_norm": 0.7114982604980469, + "learning_rate": 3.625589183410008e-05, + "loss": 0.3701, + "step": 9985 + }, + { + "epoch": 0.7419510564818597, + "grad_norm": 1.0398838520050049, + "learning_rate": 3.6242465418124657e-05, + "loss": 0.3586, + "step": 9990 + }, + { + "epoch": 0.7423224033569757, + "grad_norm": 0.7869742512702942, + "learning_rate": 3.62290349362655e-05, + "loss": 0.3749, + "step": 9995 + }, + { + "epoch": 0.7426937502320918, + "grad_norm": 1.0306918621063232, + "learning_rate": 3.621560039337979e-05, + "loss": 0.3514, + "step": 10000 + }, + { + "epoch": 0.7430650971072078, + "grad_norm": 1.0042853355407715, + "learning_rate": 3.62021617943262e-05, + "loss": 0.3929, + "step": 10005 + }, + { + "epoch": 0.7434364439823239, + "grad_norm": 0.8658770322799683, + "learning_rate": 3.618871914396483e-05, + "loss": 0.3775, + "step": 10010 + }, + { + "epoch": 0.74380779085744, + "grad_norm": 0.8654706478118896, + "learning_rate": 3.6175272447157294e-05, + "loss": 0.3726, + "step": 10015 + }, + { + "epoch": 0.744179137732556, + "grad_norm": 1.4344186782836914, 
+ "learning_rate": 3.616182170876662e-05, + "loss": 0.3462, + "step": 10020 + }, + { + "epoch": 0.744550484607672, + "grad_norm": 0.9275874495506287, + "learning_rate": 3.614836693365734e-05, + "loss": 0.3717, + "step": 10025 + }, + { + "epoch": 0.7449218314827881, + "grad_norm": 0.8803759813308716, + "learning_rate": 3.6134908126695435e-05, + "loss": 0.3425, + "step": 10030 + }, + { + "epoch": 0.7452931783579041, + "grad_norm": 0.8547605872154236, + "learning_rate": 3.6121445292748314e-05, + "loss": 0.3505, + "step": 10035 + }, + { + "epoch": 0.7456645252330202, + "grad_norm": 0.7750731110572815, + "learning_rate": 3.6107978436684875e-05, + "loss": 0.3654, + "step": 10040 + }, + { + "epoch": 0.7460358721081362, + "grad_norm": 0.8395034670829773, + "learning_rate": 3.6094507563375475e-05, + "loss": 0.3687, + "step": 10045 + }, + { + "epoch": 0.7464072189832522, + "grad_norm": 0.6695213913917542, + "learning_rate": 3.60810326776919e-05, + "loss": 0.3758, + "step": 10050 + }, + { + "epoch": 0.7467785658583683, + "grad_norm": 0.8508323431015015, + "learning_rate": 3.60675537845074e-05, + "loss": 0.3948, + "step": 10055 + }, + { + "epoch": 0.7471499127334843, + "grad_norm": 1.0589313507080078, + "learning_rate": 3.6054070888696664e-05, + "loss": 0.3818, + "step": 10060 + }, + { + "epoch": 0.7475212596086004, + "grad_norm": 0.6998316049575806, + "learning_rate": 3.6040583995135854e-05, + "loss": 0.3619, + "step": 10065 + }, + { + "epoch": 0.7478926064837165, + "grad_norm": 0.9173612594604492, + "learning_rate": 3.602709310870255e-05, + "loss": 0.3511, + "step": 10070 + }, + { + "epoch": 0.7482639533588324, + "grad_norm": 0.8886153697967529, + "learning_rate": 3.601359823427578e-05, + "loss": 0.3543, + "step": 10075 + }, + { + "epoch": 0.7486353002339485, + "grad_norm": 0.9810105562210083, + "learning_rate": 3.600009937673603e-05, + "loss": 0.3851, + "step": 10080 + }, + { + "epoch": 0.7490066471090646, + "grad_norm": 0.9224115014076233, + "learning_rate": 3.598659654096521e-05, + "loss": 0.3566, + "step": 10085 + }, + { + "epoch": 0.7493779939841806, + "grad_norm": 0.849329948425293, + "learning_rate": 3.5973089731846684e-05, + "loss": 0.3489, + "step": 10090 + }, + { + "epoch": 0.7497493408592967, + "grad_norm": 0.952434241771698, + "learning_rate": 3.595957895426524e-05, + "loss": 0.3836, + "step": 10095 + }, + { + "epoch": 0.7501206877344128, + "grad_norm": 1.2084769010543823, + "learning_rate": 3.594606421310711e-05, + "loss": 0.3452, + "step": 10100 + }, + { + "epoch": 0.7504920346095287, + "grad_norm": 0.6959002017974854, + "learning_rate": 3.5932545513259945e-05, + "loss": 0.3544, + "step": 10105 + }, + { + "epoch": 0.7508633814846448, + "grad_norm": 1.344814658164978, + "learning_rate": 3.5919022859612863e-05, + "loss": 0.3726, + "step": 10110 + }, + { + "epoch": 0.7512347283597609, + "grad_norm": 0.7053912281990051, + "learning_rate": 3.590549625705635e-05, + "loss": 0.3885, + "step": 10115 + }, + { + "epoch": 0.7516060752348769, + "grad_norm": 0.876451849937439, + "learning_rate": 3.58919657104824e-05, + "loss": 0.3642, + "step": 10120 + }, + { + "epoch": 0.751977422109993, + "grad_norm": 0.8557884097099304, + "learning_rate": 3.5878431224784347e-05, + "loss": 0.3821, + "step": 10125 + }, + { + "epoch": 0.7523487689851089, + "grad_norm": 0.8507999181747437, + "learning_rate": 3.586489280485703e-05, + "loss": 0.3623, + "step": 10130 + }, + { + "epoch": 0.752720115860225, + "grad_norm": 0.8015534281730652, + "learning_rate": 3.585135045559665e-05, + "loss": 0.3374, + "step": 10135 + }, 
+ { + "epoch": 0.7530914627353411, + "grad_norm": 2.207430124282837, + "learning_rate": 3.583780418190086e-05, + "loss": 0.3622, + "step": 10140 + }, + { + "epoch": 0.7534628096104571, + "grad_norm": 0.9500503540039062, + "learning_rate": 3.5824253988668735e-05, + "loss": 0.3705, + "step": 10145 + }, + { + "epoch": 0.7538341564855732, + "grad_norm": 0.865135908126831, + "learning_rate": 3.5810699880800746e-05, + "loss": 0.3576, + "step": 10150 + }, + { + "epoch": 0.7542055033606893, + "grad_norm": 0.8576253652572632, + "learning_rate": 3.5797141863198803e-05, + "loss": 0.3718, + "step": 10155 + }, + { + "epoch": 0.7545768502358052, + "grad_norm": 0.9014632105827332, + "learning_rate": 3.5783579940766206e-05, + "loss": 0.3571, + "step": 10160 + }, + { + "epoch": 0.7549481971109213, + "grad_norm": 0.9770298004150391, + "learning_rate": 3.577001411840769e-05, + "loss": 0.3772, + "step": 10165 + }, + { + "epoch": 0.7553195439860374, + "grad_norm": 0.7506251931190491, + "learning_rate": 3.575644440102937e-05, + "loss": 0.3662, + "step": 10170 + }, + { + "epoch": 0.7556908908611534, + "grad_norm": 0.8840534090995789, + "learning_rate": 3.574287079353882e-05, + "loss": 0.3676, + "step": 10175 + }, + { + "epoch": 0.7560622377362695, + "grad_norm": 0.7646118998527527, + "learning_rate": 3.572929330084498e-05, + "loss": 0.3312, + "step": 10180 + }, + { + "epoch": 0.7564335846113855, + "grad_norm": 0.7132297158241272, + "learning_rate": 3.57157119278582e-05, + "loss": 0.3462, + "step": 10185 + }, + { + "epoch": 0.7568049314865015, + "grad_norm": 1.2096678018569946, + "learning_rate": 3.570212667949023e-05, + "loss": 0.3675, + "step": 10190 + }, + { + "epoch": 0.7571762783616176, + "grad_norm": 0.8162215948104858, + "learning_rate": 3.5688537560654255e-05, + "loss": 0.353, + "step": 10195 + }, + { + "epoch": 0.7575476252367336, + "grad_norm": 1.0601283311843872, + "learning_rate": 3.5674944576264825e-05, + "loss": 0.3683, + "step": 10200 + }, + { + "epoch": 0.7579189721118497, + "grad_norm": 1.0272165536880493, + "learning_rate": 3.566134773123789e-05, + "loss": 0.364, + "step": 10205 + }, + { + "epoch": 0.7582903189869658, + "grad_norm": 0.7758386731147766, + "learning_rate": 3.564774703049081e-05, + "loss": 0.3654, + "step": 10210 + }, + { + "epoch": 0.7586616658620817, + "grad_norm": 0.952804684638977, + "learning_rate": 3.563414247894234e-05, + "loss": 0.3627, + "step": 10215 + }, + { + "epoch": 0.7590330127371978, + "grad_norm": 0.8867418169975281, + "learning_rate": 3.562053408151262e-05, + "loss": 0.3566, + "step": 10220 + }, + { + "epoch": 0.7594043596123139, + "grad_norm": 1.061828851699829, + "learning_rate": 3.5606921843123176e-05, + "loss": 0.3703, + "step": 10225 + }, + { + "epoch": 0.7597757064874299, + "grad_norm": 0.7243791818618774, + "learning_rate": 3.559330576869694e-05, + "loss": 0.366, + "step": 10230 + }, + { + "epoch": 0.760147053362546, + "grad_norm": 0.8533843159675598, + "learning_rate": 3.5579685863158205e-05, + "loss": 0.3852, + "step": 10235 + }, + { + "epoch": 0.760518400237662, + "grad_norm": 0.939856767654419, + "learning_rate": 3.556606213143269e-05, + "loss": 0.376, + "step": 10240 + }, + { + "epoch": 0.760889747112778, + "grad_norm": 0.6458791494369507, + "learning_rate": 3.5552434578447455e-05, + "loss": 0.3485, + "step": 10245 + }, + { + "epoch": 0.7612610939878941, + "grad_norm": 0.7483659386634827, + "learning_rate": 3.553880320913096e-05, + "loss": 0.3484, + "step": 10250 + }, + { + "epoch": 0.7616324408630102, + "grad_norm": 0.778672993183136, + 
"learning_rate": 3.5525168028413045e-05, + "loss": 0.36, + "step": 10255 + }, + { + "epoch": 0.7620037877381262, + "grad_norm": 0.8349382877349854, + "learning_rate": 3.5511529041224944e-05, + "loss": 0.3571, + "step": 10260 + }, + { + "epoch": 0.7623751346132422, + "grad_norm": 3.6593878269195557, + "learning_rate": 3.5497886252499254e-05, + "loss": 0.3588, + "step": 10265 + }, + { + "epoch": 0.7627464814883583, + "grad_norm": 0.8269204497337341, + "learning_rate": 3.5484239667169914e-05, + "loss": 0.3642, + "step": 10270 + }, + { + "epoch": 0.7631178283634743, + "grad_norm": 0.8218369483947754, + "learning_rate": 3.54705892901723e-05, + "loss": 0.3452, + "step": 10275 + }, + { + "epoch": 0.7634891752385904, + "grad_norm": 0.901314377784729, + "learning_rate": 3.545693512644311e-05, + "loss": 0.3567, + "step": 10280 + }, + { + "epoch": 0.7638605221137064, + "grad_norm": 0.8213258981704712, + "learning_rate": 3.544327718092044e-05, + "loss": 0.3669, + "step": 10285 + }, + { + "epoch": 0.7642318689888224, + "grad_norm": 0.8209224343299866, + "learning_rate": 3.5429615458543744e-05, + "loss": 0.3553, + "step": 10290 + }, + { + "epoch": 0.7646032158639385, + "grad_norm": 0.7939985990524292, + "learning_rate": 3.541594996425381e-05, + "loss": 0.3555, + "step": 10295 + }, + { + "epoch": 0.7649745627390545, + "grad_norm": 1.008037805557251, + "learning_rate": 3.540228070299286e-05, + "loss": 0.38, + "step": 10300 + }, + { + "epoch": 0.7653459096141706, + "grad_norm": 0.847432553768158, + "learning_rate": 3.538860767970441e-05, + "loss": 0.3463, + "step": 10305 + }, + { + "epoch": 0.7657172564892867, + "grad_norm": 0.7961945533752441, + "learning_rate": 3.537493089933338e-05, + "loss": 0.3802, + "step": 10310 + }, + { + "epoch": 0.7660886033644027, + "grad_norm": 0.9736582636833191, + "learning_rate": 3.536125036682603e-05, + "loss": 0.3424, + "step": 10315 + }, + { + "epoch": 0.7664599502395187, + "grad_norm": 0.9529137015342712, + "learning_rate": 3.534756608712998e-05, + "loss": 0.3605, + "step": 10320 + }, + { + "epoch": 0.7668312971146348, + "grad_norm": 0.9066136479377747, + "learning_rate": 3.533387806519419e-05, + "loss": 0.3548, + "step": 10325 + }, + { + "epoch": 0.7672026439897508, + "grad_norm": 1.0122712850570679, + "learning_rate": 3.5320186305969025e-05, + "loss": 0.3543, + "step": 10330 + }, + { + "epoch": 0.7675739908648669, + "grad_norm": 0.9614221453666687, + "learning_rate": 3.530649081440614e-05, + "loss": 0.37, + "step": 10335 + }, + { + "epoch": 0.767945337739983, + "grad_norm": 0.8859171271324158, + "learning_rate": 3.529279159545856e-05, + "loss": 0.3566, + "step": 10340 + }, + { + "epoch": 0.7683166846150989, + "grad_norm": 0.7675598859786987, + "learning_rate": 3.527908865408069e-05, + "loss": 0.3432, + "step": 10345 + }, + { + "epoch": 0.768688031490215, + "grad_norm": 0.7545421719551086, + "learning_rate": 3.526538199522823e-05, + "loss": 0.3661, + "step": 10350 + }, + { + "epoch": 0.769059378365331, + "grad_norm": 0.7817773818969727, + "learning_rate": 3.525167162385827e-05, + "loss": 0.3485, + "step": 10355 + }, + { + "epoch": 0.7694307252404471, + "grad_norm": 0.9702377915382385, + "learning_rate": 3.523795754492921e-05, + "loss": 0.3737, + "step": 10360 + }, + { + "epoch": 0.7698020721155632, + "grad_norm": 0.8555597066879272, + "learning_rate": 3.522423976340079e-05, + "loss": 0.3569, + "step": 10365 + }, + { + "epoch": 0.7701734189906791, + "grad_norm": 0.8786563277244568, + "learning_rate": 3.521051828423413e-05, + "loss": 0.3417, + "step": 10370 + }, + { + 
"epoch": 0.7705447658657952, + "grad_norm": 0.9140200018882751, + "learning_rate": 3.519679311239164e-05, + "loss": 0.352, + "step": 10375 + }, + { + "epoch": 0.7709161127409113, + "grad_norm": 0.8808820843696594, + "learning_rate": 3.518306425283709e-05, + "loss": 0.3401, + "step": 10380 + }, + { + "epoch": 0.7712874596160273, + "grad_norm": 0.671355664730072, + "learning_rate": 3.516933171053558e-05, + "loss": 0.352, + "step": 10385 + }, + { + "epoch": 0.7716588064911434, + "grad_norm": 0.9198750853538513, + "learning_rate": 3.515559549045354e-05, + "loss": 0.3553, + "step": 10390 + }, + { + "epoch": 0.7720301533662595, + "grad_norm": 1.168434500694275, + "learning_rate": 3.514185559755873e-05, + "loss": 0.3712, + "step": 10395 + }, + { + "epoch": 0.7724015002413754, + "grad_norm": 0.8529664278030396, + "learning_rate": 3.5128112036820244e-05, + "loss": 0.3673, + "step": 10400 + }, + { + "epoch": 0.7727728471164915, + "grad_norm": 0.7546283006668091, + "learning_rate": 3.511436481320848e-05, + "loss": 0.3659, + "step": 10405 + }, + { + "epoch": 0.7731441939916076, + "grad_norm": 1.0193859338760376, + "learning_rate": 3.5100613931695194e-05, + "loss": 0.3635, + "step": 10410 + }, + { + "epoch": 0.7735155408667236, + "grad_norm": 0.854232132434845, + "learning_rate": 3.5086859397253444e-05, + "loss": 0.3754, + "step": 10415 + }, + { + "epoch": 0.7738868877418397, + "grad_norm": 1.3594534397125244, + "learning_rate": 3.507310121485761e-05, + "loss": 0.3633, + "step": 10420 + }, + { + "epoch": 0.7742582346169556, + "grad_norm": 0.8981972336769104, + "learning_rate": 3.50593393894834e-05, + "loss": 0.3307, + "step": 10425 + }, + { + "epoch": 0.7746295814920717, + "grad_norm": 0.9904119372367859, + "learning_rate": 3.5045573926107825e-05, + "loss": 0.3626, + "step": 10430 + }, + { + "epoch": 0.7750009283671878, + "grad_norm": 0.9676375985145569, + "learning_rate": 3.503180482970925e-05, + "loss": 0.3656, + "step": 10435 + }, + { + "epoch": 0.7753722752423038, + "grad_norm": 0.6548202037811279, + "learning_rate": 3.5018032105267286e-05, + "loss": 0.3438, + "step": 10440 + }, + { + "epoch": 0.7757436221174199, + "grad_norm": 0.964228093624115, + "learning_rate": 3.500425575776293e-05, + "loss": 0.3788, + "step": 10445 + }, + { + "epoch": 0.776114968992536, + "grad_norm": 0.8802219033241272, + "learning_rate": 3.499047579217842e-05, + "loss": 0.3342, + "step": 10450 + }, + { + "epoch": 0.7764863158676519, + "grad_norm": 0.9338513612747192, + "learning_rate": 3.497669221349736e-05, + "loss": 0.3521, + "step": 10455 + }, + { + "epoch": 0.776857662742768, + "grad_norm": 0.8177879452705383, + "learning_rate": 3.496290502670464e-05, + "loss": 0.3687, + "step": 10460 + }, + { + "epoch": 0.7772290096178841, + "grad_norm": 1.0469391345977783, + "learning_rate": 3.4949114236786446e-05, + "loss": 0.3767, + "step": 10465 + }, + { + "epoch": 0.7776003564930001, + "grad_norm": 0.7534080147743225, + "learning_rate": 3.493531984873027e-05, + "loss": 0.3473, + "step": 10470 + }, + { + "epoch": 0.7779717033681162, + "grad_norm": 0.7704299092292786, + "learning_rate": 3.4921521867524916e-05, + "loss": 0.3758, + "step": 10475 + }, + { + "epoch": 0.7783430502432322, + "grad_norm": 0.7549129128456116, + "learning_rate": 3.4907720298160476e-05, + "loss": 0.3571, + "step": 10480 + }, + { + "epoch": 0.7787143971183482, + "grad_norm": 0.9326322674751282, + "learning_rate": 3.4893915145628344e-05, + "loss": 0.387, + "step": 10485 + }, + { + "epoch": 0.7790857439934643, + "grad_norm": 0.7050992250442505, + "learning_rate": 
3.488010641492121e-05, + "loss": 0.3553, + "step": 10490 + }, + { + "epoch": 0.7794570908685804, + "grad_norm": 0.6863691806793213, + "learning_rate": 3.486629411103306e-05, + "loss": 0.3583, + "step": 10495 + }, + { + "epoch": 0.7798284377436964, + "grad_norm": 1.0876379013061523, + "learning_rate": 3.485247823895918e-05, + "loss": 0.3854, + "step": 10500 + }, + { + "epoch": 0.7801997846188125, + "grad_norm": 0.8426483273506165, + "learning_rate": 3.483865880369612e-05, + "loss": 0.3433, + "step": 10505 + }, + { + "epoch": 0.7805711314939284, + "grad_norm": 0.9481852650642395, + "learning_rate": 3.4824835810241734e-05, + "loss": 0.3431, + "step": 10510 + }, + { + "epoch": 0.7809424783690445, + "grad_norm": 0.8300859332084656, + "learning_rate": 3.4811009263595177e-05, + "loss": 0.3626, + "step": 10515 + }, + { + "epoch": 0.7813138252441606, + "grad_norm": 0.8380314707756042, + "learning_rate": 3.4797179168756877e-05, + "loss": 0.3528, + "step": 10520 + }, + { + "epoch": 0.7816851721192766, + "grad_norm": 1.0434871912002563, + "learning_rate": 3.4783345530728537e-05, + "loss": 0.3725, + "step": 10525 + }, + { + "epoch": 0.7820565189943927, + "grad_norm": 0.7483533024787903, + "learning_rate": 3.476950835451315e-05, + "loss": 0.3664, + "step": 10530 + }, + { + "epoch": 0.7824278658695087, + "grad_norm": 0.7801994681358337, + "learning_rate": 3.475566764511499e-05, + "loss": 0.3779, + "step": 10535 + }, + { + "epoch": 0.7827992127446247, + "grad_norm": 0.6180443167686462, + "learning_rate": 3.4741823407539606e-05, + "loss": 0.3626, + "step": 10540 + }, + { + "epoch": 0.7831705596197408, + "grad_norm": 0.8744031190872192, + "learning_rate": 3.4727975646793825e-05, + "loss": 0.3513, + "step": 10545 + }, + { + "epoch": 0.7835419064948569, + "grad_norm": 0.8767526745796204, + "learning_rate": 3.4714124367885734e-05, + "loss": 0.3712, + "step": 10550 + }, + { + "epoch": 0.7839132533699729, + "grad_norm": 0.7508691549301147, + "learning_rate": 3.4700269575824726e-05, + "loss": 0.349, + "step": 10555 + }, + { + "epoch": 0.784284600245089, + "grad_norm": 0.9903153777122498, + "learning_rate": 3.468641127562143e-05, + "loss": 0.37, + "step": 10560 + }, + { + "epoch": 0.784655947120205, + "grad_norm": 0.978623628616333, + "learning_rate": 3.4672549472287754e-05, + "loss": 0.3566, + "step": 10565 + }, + { + "epoch": 0.785027293995321, + "grad_norm": 1.4273784160614014, + "learning_rate": 3.4658684170836886e-05, + "loss": 0.3743, + "step": 10570 + }, + { + "epoch": 0.7853986408704371, + "grad_norm": 1.0664880275726318, + "learning_rate": 3.4644815376283265e-05, + "loss": 0.3281, + "step": 10575 + }, + { + "epoch": 0.7857699877455531, + "grad_norm": 0.9014327526092529, + "learning_rate": 3.4630943093642594e-05, + "loss": 0.3581, + "step": 10580 + }, + { + "epoch": 0.7861413346206692, + "grad_norm": 0.755337119102478, + "learning_rate": 3.461706732793184e-05, + "loss": 0.3556, + "step": 10585 + }, + { + "epoch": 0.7865126814957852, + "grad_norm": 0.9834188222885132, + "learning_rate": 3.4603188084169245e-05, + "loss": 0.3587, + "step": 10590 + }, + { + "epoch": 0.7868840283709012, + "grad_norm": 1.2598179578781128, + "learning_rate": 3.4589305367374274e-05, + "loss": 0.3572, + "step": 10595 + }, + { + "epoch": 0.7872553752460173, + "grad_norm": 1.7740238904953003, + "learning_rate": 3.457541918256768e-05, + "loss": 0.3806, + "step": 10600 + }, + { + "epoch": 0.7876267221211334, + "grad_norm": 0.8469778895378113, + "learning_rate": 3.456152953477146e-05, + "loss": 0.365, + "step": 10605 + }, + { + "epoch": 
0.7879980689962494, + "grad_norm": 0.807431697845459, + "learning_rate": 3.454763642900886e-05, + "loss": 0.3728, + "step": 10610 + }, + { + "epoch": 0.7883694158713654, + "grad_norm": 0.8008100986480713, + "learning_rate": 3.453373987030438e-05, + "loss": 0.3608, + "step": 10615 + }, + { + "epoch": 0.7887407627464815, + "grad_norm": 0.8345623016357422, + "learning_rate": 3.451983986368376e-05, + "loss": 0.3362, + "step": 10620 + }, + { + "epoch": 0.7891121096215975, + "grad_norm": 1.0271682739257812, + "learning_rate": 3.4505936414174e-05, + "loss": 0.3582, + "step": 10625 + }, + { + "epoch": 0.7894834564967136, + "grad_norm": 1.199553370475769, + "learning_rate": 3.4492029526803346e-05, + "loss": 0.3765, + "step": 10630 + }, + { + "epoch": 0.7898548033718297, + "grad_norm": 0.9158863425254822, + "learning_rate": 3.447811920660127e-05, + "loss": 0.3664, + "step": 10635 + }, + { + "epoch": 0.7902261502469456, + "grad_norm": 0.8040124773979187, + "learning_rate": 3.44642054585985e-05, + "loss": 0.3441, + "step": 10640 + }, + { + "epoch": 0.7905974971220617, + "grad_norm": 0.8067423105239868, + "learning_rate": 3.4450288287827005e-05, + "loss": 0.3788, + "step": 10645 + }, + { + "epoch": 0.7909688439971778, + "grad_norm": 2.4240920543670654, + "learning_rate": 3.4436367699319974e-05, + "loss": 0.3677, + "step": 10650 + }, + { + "epoch": 0.7913401908722938, + "grad_norm": 1.0094491243362427, + "learning_rate": 3.442244369811186e-05, + "loss": 0.3594, + "step": 10655 + }, + { + "epoch": 0.7917115377474099, + "grad_norm": 0.9552943706512451, + "learning_rate": 3.4408516289238324e-05, + "loss": 0.36, + "step": 10660 + }, + { + "epoch": 0.7920828846225259, + "grad_norm": 1.0734007358551025, + "learning_rate": 3.4394585477736276e-05, + "loss": 0.3355, + "step": 10665 + }, + { + "epoch": 0.7924542314976419, + "grad_norm": 0.8186981678009033, + "learning_rate": 3.438065126864385e-05, + "loss": 0.3445, + "step": 10670 + }, + { + "epoch": 0.792825578372758, + "grad_norm": 0.7257325053215027, + "learning_rate": 3.436671366700042e-05, + "loss": 0.3586, + "step": 10675 + }, + { + "epoch": 0.793196925247874, + "grad_norm": 0.977857768535614, + "learning_rate": 3.435277267784656e-05, + "loss": 0.3387, + "step": 10680 + }, + { + "epoch": 0.7935682721229901, + "grad_norm": 0.7728844285011292, + "learning_rate": 3.433882830622409e-05, + "loss": 0.3505, + "step": 10685 + }, + { + "epoch": 0.7939396189981062, + "grad_norm": 0.8265921473503113, + "learning_rate": 3.4324880557176056e-05, + "loss": 0.364, + "step": 10690 + }, + { + "epoch": 0.7943109658732221, + "grad_norm": 0.9378109574317932, + "learning_rate": 3.431092943574671e-05, + "loss": 0.3594, + "step": 10695 + }, + { + "epoch": 0.7946823127483382, + "grad_norm": 0.8733883500099182, + "learning_rate": 3.429697494698154e-05, + "loss": 0.3641, + "step": 10700 + }, + { + "epoch": 0.7950536596234543, + "grad_norm": 1.0986816883087158, + "learning_rate": 3.428301709592724e-05, + "loss": 0.3487, + "step": 10705 + }, + { + "epoch": 0.7954250064985703, + "grad_norm": 0.8749418258666992, + "learning_rate": 3.426905588763172e-05, + "loss": 0.3728, + "step": 10710 + }, + { + "epoch": 0.7957963533736864, + "grad_norm": 1.04216730594635, + "learning_rate": 3.4255091327144127e-05, + "loss": 0.3761, + "step": 10715 + }, + { + "epoch": 0.7961677002488025, + "grad_norm": 0.9614870548248291, + "learning_rate": 3.424112341951478e-05, + "loss": 0.3657, + "step": 10720 + }, + { + "epoch": 0.7965390471239184, + "grad_norm": 0.7303864359855652, + "learning_rate": 
3.422715216979524e-05, + "loss": 0.3684, + "step": 10725 + }, + { + "epoch": 0.7969103939990345, + "grad_norm": 0.6995201706886292, + "learning_rate": 3.421317758303826e-05, + "loss": 0.3465, + "step": 10730 + }, + { + "epoch": 0.7972817408741505, + "grad_norm": 0.9378324747085571, + "learning_rate": 3.419919966429782e-05, + "loss": 0.3479, + "step": 10735 + }, + { + "epoch": 0.7976530877492666, + "grad_norm": 0.9413684010505676, + "learning_rate": 3.4185218418629075e-05, + "loss": 0.3701, + "step": 10740 + }, + { + "epoch": 0.7980244346243827, + "grad_norm": 0.8533821105957031, + "learning_rate": 3.417123385108841e-05, + "loss": 0.3616, + "step": 10745 + }, + { + "epoch": 0.7983957814994986, + "grad_norm": 0.814765453338623, + "learning_rate": 3.415724596673341e-05, + "loss": 0.3559, + "step": 10750 + }, + { + "epoch": 0.7987671283746147, + "grad_norm": 0.8759481310844421, + "learning_rate": 3.4143254770622826e-05, + "loss": 0.3601, + "step": 10755 + }, + { + "epoch": 0.7991384752497308, + "grad_norm": 1.0086429119110107, + "learning_rate": 3.4129260267816655e-05, + "loss": 0.3771, + "step": 10760 + }, + { + "epoch": 0.7995098221248468, + "grad_norm": 1.0653283596038818, + "learning_rate": 3.411526246337605e-05, + "loss": 0.3615, + "step": 10765 + }, + { + "epoch": 0.7998811689999629, + "grad_norm": 0.8258479237556458, + "learning_rate": 3.410126136236339e-05, + "loss": 0.3426, + "step": 10770 + }, + { + "epoch": 0.800252515875079, + "grad_norm": 0.8926478624343872, + "learning_rate": 3.4087256969842216e-05, + "loss": 0.344, + "step": 10775 + }, + { + "epoch": 0.8006238627501949, + "grad_norm": 0.9824770092964172, + "learning_rate": 3.4073249290877286e-05, + "loss": 0.364, + "step": 10780 + }, + { + "epoch": 0.800995209625311, + "grad_norm": 1.6393191814422607, + "learning_rate": 3.4059238330534514e-05, + "loss": 0.3641, + "step": 10785 + }, + { + "epoch": 0.8013665565004271, + "grad_norm": 0.7166811227798462, + "learning_rate": 3.404522409388104e-05, + "loss": 0.3355, + "step": 10790 + }, + { + "epoch": 0.8017379033755431, + "grad_norm": 0.9480621218681335, + "learning_rate": 3.4031206585985176e-05, + "loss": 0.3473, + "step": 10795 + }, + { + "epoch": 0.8021092502506592, + "grad_norm": 1.0220279693603516, + "learning_rate": 3.4017185811916394e-05, + "loss": 0.3558, + "step": 10800 + }, + { + "epoch": 0.8024805971257752, + "grad_norm": 0.7704471945762634, + "learning_rate": 3.400316177674538e-05, + "loss": 0.3577, + "step": 10805 + }, + { + "epoch": 0.8028519440008912, + "grad_norm": 0.6855241656303406, + "learning_rate": 3.3989134485543974e-05, + "loss": 0.3412, + "step": 10810 + }, + { + "epoch": 0.8032232908760073, + "grad_norm": 0.973976194858551, + "learning_rate": 3.3975103943385196e-05, + "loss": 0.3509, + "step": 10815 + }, + { + "epoch": 0.8035946377511233, + "grad_norm": 0.7488546371459961, + "learning_rate": 3.396107015534326e-05, + "loss": 0.3363, + "step": 10820 + }, + { + "epoch": 0.8039659846262394, + "grad_norm": 0.8770419359207153, + "learning_rate": 3.3947033126493545e-05, + "loss": 0.3624, + "step": 10825 + }, + { + "epoch": 0.8043373315013554, + "grad_norm": 0.8518621921539307, + "learning_rate": 3.393299286191259e-05, + "loss": 0.3495, + "step": 10830 + }, + { + "epoch": 0.8047086783764714, + "grad_norm": 1.432323932647705, + "learning_rate": 3.391894936667813e-05, + "loss": 0.3705, + "step": 10835 + }, + { + "epoch": 0.8050800252515875, + "grad_norm": 1.0123671293258667, + "learning_rate": 3.390490264586903e-05, + "loss": 0.3776, + "step": 10840 + }, + { + "epoch": 
0.8054513721267036, + "grad_norm": 1.0251203775405884, + "learning_rate": 3.3890852704565366e-05, + "loss": 0.3833, + "step": 10845 + }, + { + "epoch": 0.8058227190018196, + "grad_norm": 0.7330724000930786, + "learning_rate": 3.387679954784836e-05, + "loss": 0.3492, + "step": 10850 + }, + { + "epoch": 0.8061940658769357, + "grad_norm": 1.0848844051361084, + "learning_rate": 3.386274318080036e-05, + "loss": 0.368, + "step": 10855 + }, + { + "epoch": 0.8065654127520517, + "grad_norm": 1.0718929767608643, + "learning_rate": 3.3848683608504946e-05, + "loss": 0.3608, + "step": 10860 + }, + { + "epoch": 0.8069367596271677, + "grad_norm": 1.0632661581039429, + "learning_rate": 3.383462083604679e-05, + "loss": 0.336, + "step": 10865 + }, + { + "epoch": 0.8073081065022838, + "grad_norm": 0.7139385342597961, + "learning_rate": 3.3820554868511785e-05, + "loss": 0.3594, + "step": 10870 + }, + { + "epoch": 0.8076794533773999, + "grad_norm": 0.798427402973175, + "learning_rate": 3.380648571098692e-05, + "loss": 0.3541, + "step": 10875 + }, + { + "epoch": 0.8080508002525159, + "grad_norm": 0.9843793511390686, + "learning_rate": 3.379241336856036e-05, + "loss": 0.3692, + "step": 10880 + }, + { + "epoch": 0.8084221471276319, + "grad_norm": 0.8368058800697327, + "learning_rate": 3.377833784632144e-05, + "loss": 0.351, + "step": 10885 + }, + { + "epoch": 0.8087934940027479, + "grad_norm": 0.5817992687225342, + "learning_rate": 3.376425914936063e-05, + "loss": 0.3451, + "step": 10890 + }, + { + "epoch": 0.809164840877864, + "grad_norm": 0.9955894351005554, + "learning_rate": 3.3750177282769554e-05, + "loss": 0.3556, + "step": 10895 + }, + { + "epoch": 0.8095361877529801, + "grad_norm": 0.924485445022583, + "learning_rate": 3.373609225164095e-05, + "loss": 0.356, + "step": 10900 + }, + { + "epoch": 0.8099075346280961, + "grad_norm": 0.7800727486610413, + "learning_rate": 3.3722004061068756e-05, + "loss": 0.3646, + "step": 10905 + }, + { + "epoch": 0.8102788815032121, + "grad_norm": 0.8341777920722961, + "learning_rate": 3.370791271614801e-05, + "loss": 0.3562, + "step": 10910 + }, + { + "epoch": 0.8106502283783282, + "grad_norm": 0.8623719215393066, + "learning_rate": 3.369381822197491e-05, + "loss": 0.3579, + "step": 10915 + }, + { + "epoch": 0.8110215752534442, + "grad_norm": 0.9438233375549316, + "learning_rate": 3.367972058364678e-05, + "loss": 0.3598, + "step": 10920 + }, + { + "epoch": 0.8113929221285603, + "grad_norm": 0.8673635721206665, + "learning_rate": 3.36656198062621e-05, + "loss": 0.376, + "step": 10925 + }, + { + "epoch": 0.8117642690036764, + "grad_norm": 0.9977436661720276, + "learning_rate": 3.365151589492046e-05, + "loss": 0.36, + "step": 10930 + }, + { + "epoch": 0.8121356158787923, + "grad_norm": 2.1114859580993652, + "learning_rate": 3.363740885472261e-05, + "loss": 0.3643, + "step": 10935 + }, + { + "epoch": 0.8125069627539084, + "grad_norm": 0.8585381507873535, + "learning_rate": 3.362329869077042e-05, + "loss": 0.3441, + "step": 10940 + }, + { + "epoch": 0.8128783096290245, + "grad_norm": 1.0983209609985352, + "learning_rate": 3.360918540816687e-05, + "loss": 0.3689, + "step": 10945 + }, + { + "epoch": 0.8132496565041405, + "grad_norm": 0.9772186875343323, + "learning_rate": 3.359506901201611e-05, + "loss": 0.3568, + "step": 10950 + }, + { + "epoch": 0.8136210033792566, + "grad_norm": 0.8578617572784424, + "learning_rate": 3.358094950742338e-05, + "loss": 0.3572, + "step": 10955 + }, + { + "epoch": 0.8139923502543726, + "grad_norm": 0.8567529916763306, + "learning_rate": 
3.356682689949506e-05, + "loss": 0.3464, + "step": 10960 + }, + { + "epoch": 0.8143636971294886, + "grad_norm": 1.8338557481765747, + "learning_rate": 3.3552701193338636e-05, + "loss": 0.3542, + "step": 10965 + }, + { + "epoch": 0.8147350440046047, + "grad_norm": 1.243157148361206, + "learning_rate": 3.353857239406275e-05, + "loss": 0.3616, + "step": 10970 + }, + { + "epoch": 0.8151063908797207, + "grad_norm": 1.1987615823745728, + "learning_rate": 3.3524440506777125e-05, + "loss": 0.3598, + "step": 10975 + }, + { + "epoch": 0.8154777377548368, + "grad_norm": 0.8914827704429626, + "learning_rate": 3.3510305536592625e-05, + "loss": 0.3743, + "step": 10980 + }, + { + "epoch": 0.8158490846299529, + "grad_norm": 0.7617464661598206, + "learning_rate": 3.349616748862122e-05, + "loss": 0.3407, + "step": 10985 + }, + { + "epoch": 0.8162204315050688, + "grad_norm": 0.8160666227340698, + "learning_rate": 3.3482026367975974e-05, + "loss": 0.35, + "step": 10990 + }, + { + "epoch": 0.8165917783801849, + "grad_norm": 0.81715327501297, + "learning_rate": 3.346788217977111e-05, + "loss": 0.3788, + "step": 10995 + }, + { + "epoch": 0.816963125255301, + "grad_norm": 0.7353161573410034, + "learning_rate": 3.345373492912191e-05, + "loss": 0.3429, + "step": 11000 + }, + { + "epoch": 0.817334472130417, + "grad_norm": 0.8322049975395203, + "learning_rate": 3.34395846211448e-05, + "loss": 0.3609, + "step": 11005 + }, + { + "epoch": 0.8177058190055331, + "grad_norm": 0.7142672538757324, + "learning_rate": 3.3425431260957276e-05, + "loss": 0.3516, + "step": 11010 + }, + { + "epoch": 0.8180771658806492, + "grad_norm": 0.8758529424667358, + "learning_rate": 3.3411274853677994e-05, + "loss": 0.3598, + "step": 11015 + }, + { + "epoch": 0.8184485127557651, + "grad_norm": 1.3628263473510742, + "learning_rate": 3.3397115404426635e-05, + "loss": 0.3707, + "step": 11020 + }, + { + "epoch": 0.8188198596308812, + "grad_norm": 1.0877119302749634, + "learning_rate": 3.338295291832405e-05, + "loss": 0.3616, + "step": 11025 + }, + { + "epoch": 0.8191912065059973, + "grad_norm": 0.7904345393180847, + "learning_rate": 3.336878740049215e-05, + "loss": 0.3661, + "step": 11030 + }, + { + "epoch": 0.8195625533811133, + "grad_norm": 0.85951167345047, + "learning_rate": 3.335461885605397e-05, + "loss": 0.3643, + "step": 11035 + }, + { + "epoch": 0.8199339002562294, + "grad_norm": 1.0981451272964478, + "learning_rate": 3.334044729013359e-05, + "loss": 0.3426, + "step": 11040 + }, + { + "epoch": 0.8203052471313453, + "grad_norm": 0.7961001396179199, + "learning_rate": 3.3326272707856235e-05, + "loss": 0.3458, + "step": 11045 + }, + { + "epoch": 0.8206765940064614, + "grad_norm": 0.8995984196662903, + "learning_rate": 3.33120951143482e-05, + "loss": 0.3545, + "step": 11050 + }, + { + "epoch": 0.8210479408815775, + "grad_norm": 0.6880924701690674, + "learning_rate": 3.329791451473687e-05, + "loss": 0.3446, + "step": 11055 + }, + { + "epoch": 0.8214192877566935, + "grad_norm": 0.7782796621322632, + "learning_rate": 3.3283730914150716e-05, + "loss": 0.3611, + "step": 11060 + }, + { + "epoch": 0.8217906346318096, + "grad_norm": 1.008087158203125, + "learning_rate": 3.326954431771929e-05, + "loss": 0.3652, + "step": 11065 + }, + { + "epoch": 0.8221619815069257, + "grad_norm": 0.6520247459411621, + "learning_rate": 3.325535473057324e-05, + "loss": 0.3516, + "step": 11070 + }, + { + "epoch": 0.8225333283820416, + "grad_norm": 0.7982839345932007, + "learning_rate": 3.3241162157844284e-05, + "loss": 0.3556, + "step": 11075 + }, + { + "epoch": 
0.8229046752571577, + "grad_norm": 3.1263599395751953, + "learning_rate": 3.322696660466523e-05, + "loss": 0.3483, + "step": 11080 + }, + { + "epoch": 0.8232760221322738, + "grad_norm": 0.9678716659545898, + "learning_rate": 3.321276807616995e-05, + "loss": 0.3566, + "step": 11085 + }, + { + "epoch": 0.8236473690073898, + "grad_norm": 0.8444531559944153, + "learning_rate": 3.31985665774934e-05, + "loss": 0.3521, + "step": 11090 + }, + { + "epoch": 0.8240187158825059, + "grad_norm": 1.0266430377960205, + "learning_rate": 3.318436211377162e-05, + "loss": 0.3575, + "step": 11095 + }, + { + "epoch": 0.8243900627576219, + "grad_norm": 0.8156036734580994, + "learning_rate": 3.31701546901417e-05, + "loss": 0.3489, + "step": 11100 + }, + { + "epoch": 0.8247614096327379, + "grad_norm": 0.7811394333839417, + "learning_rate": 3.315594431174183e-05, + "loss": 0.3729, + "step": 11105 + }, + { + "epoch": 0.825132756507854, + "grad_norm": 0.8737289905548096, + "learning_rate": 3.314173098371123e-05, + "loss": 0.3448, + "step": 11110 + }, + { + "epoch": 0.82550410338297, + "grad_norm": 0.9332969188690186, + "learning_rate": 3.312751471119023e-05, + "loss": 0.3762, + "step": 11115 + }, + { + "epoch": 0.8258754502580861, + "grad_norm": 1.149572730064392, + "learning_rate": 3.311329549932018e-05, + "loss": 0.3539, + "step": 11120 + }, + { + "epoch": 0.8262467971332021, + "grad_norm": 0.8576069474220276, + "learning_rate": 3.309907335324354e-05, + "loss": 0.3521, + "step": 11125 + }, + { + "epoch": 0.8266181440083181, + "grad_norm": 1.0628324747085571, + "learning_rate": 3.30848482781038e-05, + "loss": 0.3798, + "step": 11130 + }, + { + "epoch": 0.8269894908834342, + "grad_norm": 0.8770873546600342, + "learning_rate": 3.30706202790455e-05, + "loss": 0.3445, + "step": 11135 + }, + { + "epoch": 0.8273608377585503, + "grad_norm": 0.8677864074707031, + "learning_rate": 3.305638936121428e-05, + "loss": 0.3895, + "step": 11140 + }, + { + "epoch": 0.8277321846336663, + "grad_norm": 0.8235073089599609, + "learning_rate": 3.304215552975679e-05, + "loss": 0.3337, + "step": 11145 + }, + { + "epoch": 0.8281035315087824, + "grad_norm": 0.9545353055000305, + "learning_rate": 3.302791878982077e-05, + "loss": 0.3528, + "step": 11150 + }, + { + "epoch": 0.8284748783838984, + "grad_norm": 0.8804544806480408, + "learning_rate": 3.3013679146554984e-05, + "loss": 0.3609, + "step": 11155 + }, + { + "epoch": 0.8288462252590144, + "grad_norm": 0.8000383973121643, + "learning_rate": 3.299943660510925e-05, + "loss": 0.3584, + "step": 11160 + }, + { + "epoch": 0.8292175721341305, + "grad_norm": 1.1502083539962769, + "learning_rate": 3.2985191170634454e-05, + "loss": 0.3894, + "step": 11165 + }, + { + "epoch": 0.8295889190092466, + "grad_norm": 0.7234513759613037, + "learning_rate": 3.297094284828252e-05, + "loss": 0.3795, + "step": 11170 + }, + { + "epoch": 0.8299602658843626, + "grad_norm": 0.9550766348838806, + "learning_rate": 3.2956691643206406e-05, + "loss": 0.3402, + "step": 11175 + }, + { + "epoch": 0.8303316127594786, + "grad_norm": 0.9498189687728882, + "learning_rate": 3.2942437560560116e-05, + "loss": 0.3531, + "step": 11180 + }, + { + "epoch": 0.8307029596345947, + "grad_norm": 0.9834123253822327, + "learning_rate": 3.2928180605498697e-05, + "loss": 0.3597, + "step": 11185 + }, + { + "epoch": 0.8310743065097107, + "grad_norm": 0.7517110109329224, + "learning_rate": 3.2913920783178235e-05, + "loss": 0.35, + "step": 11190 + }, + { + "epoch": 0.8314456533848268, + "grad_norm": 0.841060221195221, + "learning_rate": 
3.289965809875587e-05, + "loss": 0.3565, + "step": 11195 + }, + { + "epoch": 0.8318170002599428, + "grad_norm": 0.7667614817619324, + "learning_rate": 3.2885392557389726e-05, + "loss": 0.346, + "step": 11200 + }, + { + "epoch": 0.8321883471350588, + "grad_norm": 3.239955186843872, + "learning_rate": 3.2871124164239035e-05, + "loss": 0.359, + "step": 11205 + }, + { + "epoch": 0.8325596940101749, + "grad_norm": 1.0403164625167847, + "learning_rate": 3.2856852924463986e-05, + "loss": 0.3428, + "step": 11210 + }, + { + "epoch": 0.8329310408852909, + "grad_norm": 0.7579136490821838, + "learning_rate": 3.2842578843225855e-05, + "loss": 0.3331, + "step": 11215 + }, + { + "epoch": 0.833302387760407, + "grad_norm": 0.9784735441207886, + "learning_rate": 3.282830192568691e-05, + "loss": 0.3483, + "step": 11220 + }, + { + "epoch": 0.8336737346355231, + "grad_norm": 1.0152697563171387, + "learning_rate": 3.281402217701045e-05, + "loss": 0.3629, + "step": 11225 + }, + { + "epoch": 0.834045081510639, + "grad_norm": 0.6538757085800171, + "learning_rate": 3.279973960236083e-05, + "loss": 0.3553, + "step": 11230 + }, + { + "epoch": 0.8344164283857551, + "grad_norm": 1.0720804929733276, + "learning_rate": 3.2785454206903377e-05, + "loss": 0.348, + "step": 11235 + }, + { + "epoch": 0.8347877752608712, + "grad_norm": 0.8411654829978943, + "learning_rate": 3.277116599580448e-05, + "loss": 0.3688, + "step": 11240 + }, + { + "epoch": 0.8351591221359872, + "grad_norm": 0.9355876445770264, + "learning_rate": 3.2756874974231506e-05, + "loss": 0.3592, + "step": 11245 + }, + { + "epoch": 0.8355304690111033, + "grad_norm": 0.9845572710037231, + "learning_rate": 3.274258114735288e-05, + "loss": 0.3513, + "step": 11250 + }, + { + "epoch": 0.8359018158862194, + "grad_norm": 0.9073336720466614, + "learning_rate": 3.2728284520338024e-05, + "loss": 0.3639, + "step": 11255 + }, + { + "epoch": 0.8362731627613353, + "grad_norm": 0.9223979115486145, + "learning_rate": 3.271398509835737e-05, + "loss": 0.3617, + "step": 11260 + }, + { + "epoch": 0.8366445096364514, + "grad_norm": 0.7728599309921265, + "learning_rate": 3.269968288658236e-05, + "loss": 0.3576, + "step": 11265 + }, + { + "epoch": 0.8370158565115674, + "grad_norm": 0.7334360480308533, + "learning_rate": 3.2685377890185435e-05, + "loss": 0.3591, + "step": 11270 + }, + { + "epoch": 0.8373872033866835, + "grad_norm": 0.9434806108474731, + "learning_rate": 3.267107011434009e-05, + "loss": 0.3894, + "step": 11275 + }, + { + "epoch": 0.8377585502617996, + "grad_norm": 0.7240009307861328, + "learning_rate": 3.2656759564220745e-05, + "loss": 0.3637, + "step": 11280 + }, + { + "epoch": 0.8381298971369155, + "grad_norm": 1.0840269327163696, + "learning_rate": 3.264244624500291e-05, + "loss": 0.3681, + "step": 11285 + }, + { + "epoch": 0.8385012440120316, + "grad_norm": 0.882512629032135, + "learning_rate": 3.2628130161863025e-05, + "loss": 0.3489, + "step": 11290 + }, + { + "epoch": 0.8388725908871477, + "grad_norm": 0.8570889234542847, + "learning_rate": 3.261381131997859e-05, + "loss": 0.3558, + "step": 11295 + }, + { + "epoch": 0.8392439377622637, + "grad_norm": 0.7707122564315796, + "learning_rate": 3.2599489724528044e-05, + "loss": 0.3436, + "step": 11300 + }, + { + "epoch": 0.8396152846373798, + "grad_norm": 0.9872932434082031, + "learning_rate": 3.2585165380690866e-05, + "loss": 0.3636, + "step": 11305 + }, + { + "epoch": 0.8399866315124959, + "grad_norm": 0.8050171136856079, + "learning_rate": 3.257083829364751e-05, + "loss": 0.3381, + "step": 11310 + }, + { + "epoch": 
0.8403579783876118, + "grad_norm": 0.7836553454399109, + "learning_rate": 3.255650846857943e-05, + "loss": 0.3601, + "step": 11315 + }, + { + "epoch": 0.8407293252627279, + "grad_norm": 1.6859378814697266, + "learning_rate": 3.254217591066906e-05, + "loss": 0.3414, + "step": 11320 + }, + { + "epoch": 0.841100672137844, + "grad_norm": 0.9329738616943359, + "learning_rate": 3.252784062509984e-05, + "loss": 0.3492, + "step": 11325 + }, + { + "epoch": 0.84147201901296, + "grad_norm": 1.1672630310058594, + "learning_rate": 3.251350261705617e-05, + "loss": 0.3535, + "step": 11330 + }, + { + "epoch": 0.8418433658880761, + "grad_norm": 0.9215666055679321, + "learning_rate": 3.249916189172345e-05, + "loss": 0.3356, + "step": 11335 + }, + { + "epoch": 0.8422147127631922, + "grad_norm": 0.7470521330833435, + "learning_rate": 3.2484818454288076e-05, + "loss": 0.3672, + "step": 11340 + }, + { + "epoch": 0.8425860596383081, + "grad_norm": 1.0789822340011597, + "learning_rate": 3.24704723099374e-05, + "loss": 0.39, + "step": 11345 + }, + { + "epoch": 0.8429574065134242, + "grad_norm": 0.7468339204788208, + "learning_rate": 3.245612346385977e-05, + "loss": 0.3556, + "step": 11350 + }, + { + "epoch": 0.8433287533885402, + "grad_norm": 0.8107490539550781, + "learning_rate": 3.2441771921244505e-05, + "loss": 0.3363, + "step": 11355 + }, + { + "epoch": 0.8437001002636563, + "grad_norm": 0.977986752986908, + "learning_rate": 3.242741768728188e-05, + "loss": 0.3455, + "step": 11360 + }, + { + "epoch": 0.8440714471387724, + "grad_norm": 0.829646110534668, + "learning_rate": 3.24130607671632e-05, + "loss": 0.3369, + "step": 11365 + }, + { + "epoch": 0.8444427940138883, + "grad_norm": 0.96271812915802, + "learning_rate": 3.239870116608067e-05, + "loss": 0.3568, + "step": 11370 + }, + { + "epoch": 0.8448141408890044, + "grad_norm": 0.8084895014762878, + "learning_rate": 3.238433888922751e-05, + "loss": 0.3448, + "step": 11375 + }, + { + "epoch": 0.8451854877641205, + "grad_norm": 0.7734755873680115, + "learning_rate": 3.2369973941797905e-05, + "loss": 0.3631, + "step": 11380 + }, + { + "epoch": 0.8455568346392365, + "grad_norm": 0.798600971698761, + "learning_rate": 3.235560632898699e-05, + "loss": 0.3513, + "step": 11385 + }, + { + "epoch": 0.8459281815143526, + "grad_norm": 0.9249498248100281, + "learning_rate": 3.234123605599086e-05, + "loss": 0.3686, + "step": 11390 + }, + { + "epoch": 0.8462995283894686, + "grad_norm": 0.7547146081924438, + "learning_rate": 3.23268631280066e-05, + "loss": 0.346, + "step": 11395 + }, + { + "epoch": 0.8466708752645846, + "grad_norm": 0.8783119916915894, + "learning_rate": 3.231248755023224e-05, + "loss": 0.3561, + "step": 11400 + }, + { + "epoch": 0.8470422221397007, + "grad_norm": 0.72531658411026, + "learning_rate": 3.229810932786675e-05, + "loss": 0.3535, + "step": 11405 + }, + { + "epoch": 0.8474135690148168, + "grad_norm": 0.8411027193069458, + "learning_rate": 3.228372846611009e-05, + "loss": 0.3365, + "step": 11410 + }, + { + "epoch": 0.8477849158899328, + "grad_norm": 1.3037042617797852, + "learning_rate": 3.226934497016314e-05, + "loss": 0.3557, + "step": 11415 + }, + { + "epoch": 0.8481562627650489, + "grad_norm": 0.9049678444862366, + "learning_rate": 3.2254958845227765e-05, + "loss": 0.3461, + "step": 11420 + }, + { + "epoch": 0.8485276096401648, + "grad_norm": 0.9194682836532593, + "learning_rate": 3.224057009650676e-05, + "loss": 0.3731, + "step": 11425 + }, + { + "epoch": 0.8488989565152809, + "grad_norm": 0.8797911405563354, + "learning_rate": 
3.222617872920389e-05, + "loss": 0.3695, + "step": 11430 + }, + { + "epoch": 0.849270303390397, + "grad_norm": 0.9936988353729248, + "learning_rate": 3.2211784748523813e-05, + "loss": 0.354, + "step": 11435 + }, + { + "epoch": 0.849641650265513, + "grad_norm": 0.8593586683273315, + "learning_rate": 3.219738815967222e-05, + "loss": 0.3863, + "step": 11440 + }, + { + "epoch": 0.8500129971406291, + "grad_norm": 1.028411865234375, + "learning_rate": 3.218298896785566e-05, + "loss": 0.3556, + "step": 11445 + }, + { + "epoch": 0.8503843440157451, + "grad_norm": 1.2228703498840332, + "learning_rate": 3.2168587178281686e-05, + "loss": 0.344, + "step": 11450 + }, + { + "epoch": 0.8507556908908611, + "grad_norm": 0.8291895985603333, + "learning_rate": 3.215418279615874e-05, + "loss": 0.3277, + "step": 11455 + }, + { + "epoch": 0.8511270377659772, + "grad_norm": 0.8701698780059814, + "learning_rate": 3.2139775826696233e-05, + "loss": 0.3926, + "step": 11460 + }, + { + "epoch": 0.8514983846410933, + "grad_norm": 0.8717911243438721, + "learning_rate": 3.212536627510451e-05, + "loss": 0.3421, + "step": 11465 + }, + { + "epoch": 0.8518697315162093, + "grad_norm": 1.0229847431182861, + "learning_rate": 3.211095414659485e-05, + "loss": 0.3591, + "step": 11470 + }, + { + "epoch": 0.8522410783913253, + "grad_norm": 0.666601300239563, + "learning_rate": 3.209653944637945e-05, + "loss": 0.3442, + "step": 11475 + }, + { + "epoch": 0.8526124252664414, + "grad_norm": 0.7465684413909912, + "learning_rate": 3.2082122179671445e-05, + "loss": 0.3585, + "step": 11480 + }, + { + "epoch": 0.8529837721415574, + "grad_norm": 0.9072179794311523, + "learning_rate": 3.2067702351684905e-05, + "loss": 0.3356, + "step": 11485 + }, + { + "epoch": 0.8533551190166735, + "grad_norm": 0.8667060136795044, + "learning_rate": 3.205327996763481e-05, + "loss": 0.355, + "step": 11490 + }, + { + "epoch": 0.8537264658917895, + "grad_norm": 0.9172279238700867, + "learning_rate": 3.203885503273708e-05, + "loss": 0.3482, + "step": 11495 + }, + { + "epoch": 0.8540978127669056, + "grad_norm": 0.8449558615684509, + "learning_rate": 3.2024427552208546e-05, + "loss": 0.362, + "step": 11500 + }, + { + "epoch": 0.8544691596420216, + "grad_norm": 0.8456041216850281, + "learning_rate": 3.2009997531266985e-05, + "loss": 0.3489, + "step": 11505 + }, + { + "epoch": 0.8548405065171376, + "grad_norm": 0.8721354007720947, + "learning_rate": 3.199556497513105e-05, + "loss": 0.348, + "step": 11510 + }, + { + "epoch": 0.8552118533922537, + "grad_norm": 0.7469504475593567, + "learning_rate": 3.198112988902035e-05, + "loss": 0.3365, + "step": 11515 + }, + { + "epoch": 0.8555832002673698, + "grad_norm": 0.8014267086982727, + "learning_rate": 3.1966692278155386e-05, + "loss": 0.3464, + "step": 11520 + }, + { + "epoch": 0.8559545471424858, + "grad_norm": 0.6369099617004395, + "learning_rate": 3.1952252147757563e-05, + "loss": 0.3464, + "step": 11525 + }, + { + "epoch": 0.8563258940176018, + "grad_norm": 0.9981203079223633, + "learning_rate": 3.193780950304925e-05, + "loss": 0.3571, + "step": 11530 + }, + { + "epoch": 0.8566972408927179, + "grad_norm": 0.6954818964004517, + "learning_rate": 3.192336434925367e-05, + "loss": 0.3616, + "step": 11535 + }, + { + "epoch": 0.8570685877678339, + "grad_norm": 1.202691674232483, + "learning_rate": 3.190891669159497e-05, + "loss": 0.3253, + "step": 11540 + }, + { + "epoch": 0.85743993464295, + "grad_norm": 0.6270232200622559, + "learning_rate": 3.189446653529819e-05, + "loss": 0.3458, + "step": 11545 + }, + { + "epoch": 
0.8578112815180661, + "grad_norm": 1.0812908411026, + "learning_rate": 3.188001388558932e-05, + "loss": 0.3713, + "step": 11550 + }, + { + "epoch": 0.858182628393182, + "grad_norm": 1.0152004957199097, + "learning_rate": 3.1865558747695194e-05, + "loss": 0.3417, + "step": 11555 + }, + { + "epoch": 0.8585539752682981, + "grad_norm": 0.9745574593544006, + "learning_rate": 3.1851101126843584e-05, + "loss": 0.3675, + "step": 11560 + }, + { + "epoch": 0.8589253221434142, + "grad_norm": 0.8742450475692749, + "learning_rate": 3.183664102826316e-05, + "loss": 0.3524, + "step": 11565 + }, + { + "epoch": 0.8592966690185302, + "grad_norm": 0.8508179783821106, + "learning_rate": 3.182217845718344e-05, + "loss": 0.3647, + "step": 11570 + }, + { + "epoch": 0.8596680158936463, + "grad_norm": 0.8023908734321594, + "learning_rate": 3.180771341883491e-05, + "loss": 0.3531, + "step": 11575 + }, + { + "epoch": 0.8600393627687622, + "grad_norm": 1.2750799655914307, + "learning_rate": 3.1793245918448886e-05, + "loss": 0.3619, + "step": 11580 + }, + { + "epoch": 0.8604107096438783, + "grad_norm": 0.8956974744796753, + "learning_rate": 3.177877596125761e-05, + "loss": 0.3629, + "step": 11585 + }, + { + "epoch": 0.8607820565189944, + "grad_norm": 0.8610901236534119, + "learning_rate": 3.17643035524942e-05, + "loss": 0.348, + "step": 11590 + }, + { + "epoch": 0.8611534033941104, + "grad_norm": 0.8622403144836426, + "learning_rate": 3.1749828697392667e-05, + "loss": 0.3444, + "step": 11595 + }, + { + "epoch": 0.8615247502692265, + "grad_norm": 0.9162015318870544, + "learning_rate": 3.173535140118789e-05, + "loss": 0.362, + "step": 11600 + }, + { + "epoch": 0.8618960971443426, + "grad_norm": 0.7178031802177429, + "learning_rate": 3.1720871669115646e-05, + "loss": 0.3558, + "step": 11605 + }, + { + "epoch": 0.8622674440194585, + "grad_norm": 0.9224133491516113, + "learning_rate": 3.1706389506412595e-05, + "loss": 0.3828, + "step": 11610 + }, + { + "epoch": 0.8626387908945746, + "grad_norm": 0.7860289812088013, + "learning_rate": 3.1691904918316263e-05, + "loss": 0.3344, + "step": 11615 + }, + { + "epoch": 0.8630101377696907, + "grad_norm": 0.9802098274230957, + "learning_rate": 3.167741791006507e-05, + "loss": 0.3834, + "step": 11620 + }, + { + "epoch": 0.8633814846448067, + "grad_norm": 0.9482443332672119, + "learning_rate": 3.166292848689829e-05, + "loss": 0.3722, + "step": 11625 + }, + { + "epoch": 0.8637528315199228, + "grad_norm": 1.1732617616653442, + "learning_rate": 3.16484366540561e-05, + "loss": 0.3598, + "step": 11630 + }, + { + "epoch": 0.8641241783950389, + "grad_norm": 1.0246295928955078, + "learning_rate": 3.16339424167795e-05, + "loss": 0.3534, + "step": 11635 + }, + { + "epoch": 0.8644955252701548, + "grad_norm": 0.8778313994407654, + "learning_rate": 3.1619445780310435e-05, + "loss": 0.3376, + "step": 11640 + }, + { + "epoch": 0.8648668721452709, + "grad_norm": 1.0736422538757324, + "learning_rate": 3.1604946749891636e-05, + "loss": 0.3479, + "step": 11645 + }, + { + "epoch": 0.8652382190203869, + "grad_norm": 0.9888156056404114, + "learning_rate": 3.159044533076675e-05, + "loss": 0.3558, + "step": 11650 + }, + { + "epoch": 0.865609565895503, + "grad_norm": 0.7955016493797302, + "learning_rate": 3.157594152818027e-05, + "loss": 0.3526, + "step": 11655 + }, + { + "epoch": 0.8659809127706191, + "grad_norm": 0.8220962882041931, + "learning_rate": 3.1561435347377564e-05, + "loss": 0.3507, + "step": 11660 + }, + { + "epoch": 0.866352259645735, + "grad_norm": 0.8022737503051758, + "learning_rate": 
3.154692679360485e-05, + "loss": 0.3493, + "step": 11665 + }, + { + "epoch": 0.8667236065208511, + "grad_norm": 1.158897042274475, + "learning_rate": 3.1532415872109206e-05, + "loss": 0.3636, + "step": 11670 + }, + { + "epoch": 0.8670949533959672, + "grad_norm": 0.8414539098739624, + "learning_rate": 3.151790258813855e-05, + "loss": 0.3466, + "step": 11675 + }, + { + "epoch": 0.8674663002710832, + "grad_norm": 0.7647371888160706, + "learning_rate": 3.15033869469417e-05, + "loss": 0.3444, + "step": 11680 + }, + { + "epoch": 0.8678376471461993, + "grad_norm": 0.8496689796447754, + "learning_rate": 3.1488868953768276e-05, + "loss": 0.3827, + "step": 11685 + }, + { + "epoch": 0.8682089940213154, + "grad_norm": 0.9408618211746216, + "learning_rate": 3.1474348613868774e-05, + "loss": 0.3402, + "step": 11690 + }, + { + "epoch": 0.8685803408964313, + "grad_norm": 0.9130736589431763, + "learning_rate": 3.1459825932494535e-05, + "loss": 0.3402, + "step": 11695 + }, + { + "epoch": 0.8689516877715474, + "grad_norm": 0.8409910798072815, + "learning_rate": 3.1445300914897744e-05, + "loss": 0.3453, + "step": 11700 + }, + { + "epoch": 0.8693230346466635, + "grad_norm": 0.8987837433815002, + "learning_rate": 3.1430773566331436e-05, + "loss": 0.3364, + "step": 11705 + }, + { + "epoch": 0.8696943815217795, + "grad_norm": 2.895118474960327, + "learning_rate": 3.14162438920495e-05, + "loss": 0.3431, + "step": 11710 + }, + { + "epoch": 0.8700657283968956, + "grad_norm": 0.8010272979736328, + "learning_rate": 3.140171189730662e-05, + "loss": 0.3428, + "step": 11715 + }, + { + "epoch": 0.8704370752720116, + "grad_norm": 1.2577528953552246, + "learning_rate": 3.138717758735839e-05, + "loss": 0.3701, + "step": 11720 + }, + { + "epoch": 0.8708084221471276, + "grad_norm": 0.8167540431022644, + "learning_rate": 3.137264096746117e-05, + "loss": 0.36, + "step": 11725 + }, + { + "epoch": 0.8711797690222437, + "grad_norm": 0.7406752705574036, + "learning_rate": 3.135810204287221e-05, + "loss": 0.3473, + "step": 11730 + }, + { + "epoch": 0.8715511158973597, + "grad_norm": 0.8588741421699524, + "learning_rate": 3.134356081884956e-05, + "loss": 0.3361, + "step": 11735 + }, + { + "epoch": 0.8719224627724758, + "grad_norm": 0.8938350677490234, + "learning_rate": 3.132901730065211e-05, + "loss": 0.3431, + "step": 11740 + }, + { + "epoch": 0.8722938096475918, + "grad_norm": 0.9436230063438416, + "learning_rate": 3.13144714935396e-05, + "loss": 0.357, + "step": 11745 + }, + { + "epoch": 0.8726651565227078, + "grad_norm": 1.0733808279037476, + "learning_rate": 3.129992340277256e-05, + "loss": 0.3459, + "step": 11750 + }, + { + "epoch": 0.8730365033978239, + "grad_norm": 0.7322360873222351, + "learning_rate": 3.1285373033612385e-05, + "loss": 0.3645, + "step": 11755 + }, + { + "epoch": 0.87340785027294, + "grad_norm": 1.1299097537994385, + "learning_rate": 3.127082039132126e-05, + "loss": 0.3319, + "step": 11760 + }, + { + "epoch": 0.873779197148056, + "grad_norm": 0.7460024952888489, + "learning_rate": 3.125626548116222e-05, + "loss": 0.3266, + "step": 11765 + }, + { + "epoch": 0.874150544023172, + "grad_norm": 0.807644248008728, + "learning_rate": 3.124170830839911e-05, + "loss": 0.3627, + "step": 11770 + }, + { + "epoch": 0.8745218908982881, + "grad_norm": 0.7130464315414429, + "learning_rate": 3.1227148878296584e-05, + "loss": 0.3616, + "step": 11775 + }, + { + "epoch": 0.8748932377734041, + "grad_norm": 0.840950608253479, + "learning_rate": 3.12125871961201e-05, + "loss": 0.3755, + "step": 11780 + }, + { + "epoch": 
0.8752645846485202, + "grad_norm": 0.8666706681251526, + "learning_rate": 3.1198023267135995e-05, + "loss": 0.3671, + "step": 11785 + }, + { + "epoch": 0.8756359315236363, + "grad_norm": 0.848078727722168, + "learning_rate": 3.118345709661134e-05, + "loss": 0.3648, + "step": 11790 + }, + { + "epoch": 0.8760072783987523, + "grad_norm": 0.7801305651664734, + "learning_rate": 3.116888868981405e-05, + "loss": 0.3276, + "step": 11795 + }, + { + "epoch": 0.8763786252738683, + "grad_norm": 0.855327844619751, + "learning_rate": 3.115431805201287e-05, + "loss": 0.361, + "step": 11800 + }, + { + "epoch": 0.8767499721489843, + "grad_norm": 0.7050216794013977, + "learning_rate": 3.113974518847731e-05, + "loss": 0.3358, + "step": 11805 + }, + { + "epoch": 0.8771213190241004, + "grad_norm": 0.7866954207420349, + "learning_rate": 3.112517010447773e-05, + "loss": 0.3489, + "step": 11810 + }, + { + "epoch": 0.8774926658992165, + "grad_norm": 1.1569349765777588, + "learning_rate": 3.1110592805285246e-05, + "loss": 0.3434, + "step": 11815 + }, + { + "epoch": 0.8778640127743325, + "grad_norm": 1.1610534191131592, + "learning_rate": 3.109601329617181e-05, + "loss": 0.3598, + "step": 11820 + }, + { + "epoch": 0.8782353596494485, + "grad_norm": 1.4828381538391113, + "learning_rate": 3.1081431582410146e-05, + "loss": 0.3625, + "step": 11825 + }, + { + "epoch": 0.8786067065245646, + "grad_norm": 0.9669677019119263, + "learning_rate": 3.106684766927382e-05, + "loss": 0.386, + "step": 11830 + }, + { + "epoch": 0.8789780533996806, + "grad_norm": 0.7989631295204163, + "learning_rate": 3.105226156203713e-05, + "loss": 0.3315, + "step": 11835 + }, + { + "epoch": 0.8793494002747967, + "grad_norm": 0.7316701412200928, + "learning_rate": 3.1037673265975226e-05, + "loss": 0.3785, + "step": 11840 + }, + { + "epoch": 0.8797207471499128, + "grad_norm": 2.4976911544799805, + "learning_rate": 3.102308278636402e-05, + "loss": 0.3629, + "step": 11845 + }, + { + "epoch": 0.8800920940250287, + "grad_norm": 0.879723846912384, + "learning_rate": 3.1008490128480206e-05, + "loss": 0.3411, + "step": 11850 + }, + { + "epoch": 0.8804634409001448, + "grad_norm": 0.8403365612030029, + "learning_rate": 3.0993895297601293e-05, + "loss": 0.3689, + "step": 11855 + }, + { + "epoch": 0.8808347877752609, + "grad_norm": 0.8075715899467468, + "learning_rate": 3.097929829900556e-05, + "loss": 0.3557, + "step": 11860 + }, + { + "epoch": 0.8812061346503769, + "grad_norm": 1.033172845840454, + "learning_rate": 3.096469913797206e-05, + "loss": 0.3461, + "step": 11865 + }, + { + "epoch": 0.881577481525493, + "grad_norm": 0.8531133532524109, + "learning_rate": 3.095009781978066e-05, + "loss": 0.3493, + "step": 11870 + }, + { + "epoch": 0.8819488284006091, + "grad_norm": 0.8683094382286072, + "learning_rate": 3.0935494349711964e-05, + "loss": 0.3624, + "step": 11875 + }, + { + "epoch": 0.882320175275725, + "grad_norm": 1.5701484680175781, + "learning_rate": 3.092088873304739e-05, + "loss": 0.3572, + "step": 11880 + }, + { + "epoch": 0.8826915221508411, + "grad_norm": 0.9219105243682861, + "learning_rate": 3.090628097506911e-05, + "loss": 0.3525, + "step": 11885 + }, + { + "epoch": 0.8830628690259571, + "grad_norm": 0.8252973556518555, + "learning_rate": 3.08916710810601e-05, + "loss": 0.3462, + "step": 11890 + }, + { + "epoch": 0.8834342159010732, + "grad_norm": 0.8419556617736816, + "learning_rate": 3.087705905630406e-05, + "loss": 0.3503, + "step": 11895 + }, + { + "epoch": 0.8838055627761893, + "grad_norm": 0.8596616387367249, + "learning_rate": 
3.0862444906085514e-05, + "loss": 0.3658, + "step": 11900 + }, + { + "epoch": 0.8841769096513052, + "grad_norm": 1.017411231994629, + "learning_rate": 3.084782863568971e-05, + "loss": 0.3496, + "step": 11905 + }, + { + "epoch": 0.8845482565264213, + "grad_norm": 0.9587264060974121, + "learning_rate": 3.0833210250402694e-05, + "loss": 0.3452, + "step": 11910 + }, + { + "epoch": 0.8849196034015374, + "grad_norm": 1.0870927572250366, + "learning_rate": 3.0818589755511254e-05, + "loss": 0.3624, + "step": 11915 + }, + { + "epoch": 0.8852909502766534, + "grad_norm": 0.9153867959976196, + "learning_rate": 3.080396715630297e-05, + "loss": 0.3623, + "step": 11920 + }, + { + "epoch": 0.8856622971517695, + "grad_norm": 0.8542145490646362, + "learning_rate": 3.078934245806615e-05, + "loss": 0.378, + "step": 11925 + }, + { + "epoch": 0.8860336440268856, + "grad_norm": 0.6727744340896606, + "learning_rate": 3.077471566608988e-05, + "loss": 0.3439, + "step": 11930 + }, + { + "epoch": 0.8864049909020015, + "grad_norm": 0.9169203042984009, + "learning_rate": 3.0760086785664e-05, + "loss": 0.3537, + "step": 11935 + }, + { + "epoch": 0.8867763377771176, + "grad_norm": 0.7953917980194092, + "learning_rate": 3.07454558220791e-05, + "loss": 0.3408, + "step": 11940 + }, + { + "epoch": 0.8871476846522337, + "grad_norm": 0.7179327607154846, + "learning_rate": 3.073082278062654e-05, + "loss": 0.34, + "step": 11945 + }, + { + "epoch": 0.8875190315273497, + "grad_norm": 0.9073119163513184, + "learning_rate": 3.07161876665984e-05, + "loss": 0.357, + "step": 11950 + }, + { + "epoch": 0.8878903784024658, + "grad_norm": 0.7440044283866882, + "learning_rate": 3.070155048528756e-05, + "loss": 0.3719, + "step": 11955 + }, + { + "epoch": 0.8882617252775817, + "grad_norm": 0.7663770914077759, + "learning_rate": 3.068691124198758e-05, + "loss": 0.3634, + "step": 11960 + }, + { + "epoch": 0.8886330721526978, + "grad_norm": 0.795628011226654, + "learning_rate": 3.0672269941992825e-05, + "loss": 0.3358, + "step": 11965 + }, + { + "epoch": 0.8890044190278139, + "grad_norm": 0.9965798854827881, + "learning_rate": 3.065762659059838e-05, + "loss": 0.3331, + "step": 11970 + }, + { + "epoch": 0.8893757659029299, + "grad_norm": 0.8984972238540649, + "learning_rate": 3.0642981193100065e-05, + "loss": 0.3521, + "step": 11975 + }, + { + "epoch": 0.889747112778046, + "grad_norm": 0.859411358833313, + "learning_rate": 3.0628333754794444e-05, + "loss": 0.3375, + "step": 11980 + }, + { + "epoch": 0.890118459653162, + "grad_norm": 1.0164400339126587, + "learning_rate": 3.061368428097884e-05, + "loss": 0.3496, + "step": 11985 + }, + { + "epoch": 0.890489806528278, + "grad_norm": 1.0231250524520874, + "learning_rate": 3.0599032776951295e-05, + "loss": 0.3414, + "step": 11990 + }, + { + "epoch": 0.8908611534033941, + "grad_norm": 0.7204227447509766, + "learning_rate": 3.0584379248010556e-05, + "loss": 0.3552, + "step": 11995 + }, + { + "epoch": 0.8912325002785102, + "grad_norm": 0.8453019261360168, + "learning_rate": 3.056972369945617e-05, + "loss": 0.3494, + "step": 12000 + }, + { + "epoch": 0.8916038471536262, + "grad_norm": 0.8141784071922302, + "learning_rate": 3.0555066136588354e-05, + "loss": 0.3554, + "step": 12005 + }, + { + "epoch": 0.8919751940287423, + "grad_norm": 1.678991436958313, + "learning_rate": 3.054040656470809e-05, + "loss": 0.3733, + "step": 12010 + }, + { + "epoch": 0.8923465409038583, + "grad_norm": 0.8809152841567993, + "learning_rate": 3.0525744989117044e-05, + "loss": 0.3491, + "step": 12015 + }, + { + "epoch": 
0.8927178877789743, + "grad_norm": 0.784815788269043, + "learning_rate": 3.051108141511767e-05, + "loss": 0.3444, + "step": 12020 + }, + { + "epoch": 0.8930892346540904, + "grad_norm": 0.919638991355896, + "learning_rate": 3.049641584801309e-05, + "loss": 0.3502, + "step": 12025 + }, + { + "epoch": 0.8934605815292064, + "grad_norm": 0.757986307144165, + "learning_rate": 3.048174829310717e-05, + "loss": 0.3472, + "step": 12030 + }, + { + "epoch": 0.8938319284043225, + "grad_norm": 0.7388538122177124, + "learning_rate": 3.046707875570449e-05, + "loss": 0.3509, + "step": 12035 + }, + { + "epoch": 0.8942032752794385, + "grad_norm": 2.4140920639038086, + "learning_rate": 3.0452407241110347e-05, + "loss": 0.3668, + "step": 12040 + }, + { + "epoch": 0.8945746221545545, + "grad_norm": 0.9365836381912231, + "learning_rate": 3.0437733754630772e-05, + "loss": 0.3037, + "step": 12045 + }, + { + "epoch": 0.8949459690296706, + "grad_norm": 0.6372706294059753, + "learning_rate": 3.0423058301572466e-05, + "loss": 0.3356, + "step": 12050 + }, + { + "epoch": 0.8953173159047867, + "grad_norm": 0.8278931975364685, + "learning_rate": 3.0408380887242882e-05, + "loss": 0.3504, + "step": 12055 + }, + { + "epoch": 0.8956886627799027, + "grad_norm": 1.124632716178894, + "learning_rate": 3.0393701516950157e-05, + "loss": 0.373, + "step": 12060 + }, + { + "epoch": 0.8960600096550188, + "grad_norm": 0.8332802653312683, + "learning_rate": 3.0379020196003156e-05, + "loss": 0.3443, + "step": 12065 + }, + { + "epoch": 0.8964313565301348, + "grad_norm": 0.7034924030303955, + "learning_rate": 3.0364336929711435e-05, + "loss": 0.3516, + "step": 12070 + }, + { + "epoch": 0.8968027034052508, + "grad_norm": 0.9156379699707031, + "learning_rate": 3.0349651723385254e-05, + "loss": 0.3379, + "step": 12075 + }, + { + "epoch": 0.8971740502803669, + "grad_norm": 0.7082470655441284, + "learning_rate": 3.0334964582335583e-05, + "loss": 0.3254, + "step": 12080 + }, + { + "epoch": 0.897545397155483, + "grad_norm": 0.9504019618034363, + "learning_rate": 3.0320275511874085e-05, + "loss": 0.3355, + "step": 12085 + }, + { + "epoch": 0.897916744030599, + "grad_norm": 1.6386775970458984, + "learning_rate": 3.030558451731313e-05, + "loss": 0.3538, + "step": 12090 + }, + { + "epoch": 0.898288090905715, + "grad_norm": 0.7984548807144165, + "learning_rate": 3.029089160396576e-05, + "loss": 0.344, + "step": 12095 + }, + { + "epoch": 0.8986594377808311, + "grad_norm": 0.9346609115600586, + "learning_rate": 3.0276196777145742e-05, + "loss": 0.3587, + "step": 12100 + }, + { + "epoch": 0.8990307846559471, + "grad_norm": 0.9744558334350586, + "learning_rate": 3.0261500042167507e-05, + "loss": 0.3305, + "step": 12105 + }, + { + "epoch": 0.8994021315310632, + "grad_norm": 1.0569168329238892, + "learning_rate": 3.0246801404346203e-05, + "loss": 0.3552, + "step": 12110 + }, + { + "epoch": 0.8997734784061792, + "grad_norm": 1.1052289009094238, + "learning_rate": 3.0232100868997648e-05, + "loss": 0.3634, + "step": 12115 + }, + { + "epoch": 0.9001448252812952, + "grad_norm": 0.998342752456665, + "learning_rate": 3.0217398441438343e-05, + "loss": 0.358, + "step": 12120 + }, + { + "epoch": 0.9005161721564113, + "grad_norm": 0.9586290717124939, + "learning_rate": 3.0202694126985494e-05, + "loss": 0.3349, + "step": 12125 + }, + { + "epoch": 0.9008875190315273, + "grad_norm": 0.7554903030395508, + "learning_rate": 3.018798793095697e-05, + "loss": 0.3456, + "step": 12130 + }, + { + "epoch": 0.9012588659066434, + "grad_norm": 0.7605744004249573, + "learning_rate": 
3.0173279858671333e-05, + "loss": 0.3497, + "step": 12135 + }, + { + "epoch": 0.9016302127817595, + "grad_norm": 0.8068292737007141, + "learning_rate": 3.0158569915447803e-05, + "loss": 0.3484, + "step": 12140 + }, + { + "epoch": 0.9020015596568755, + "grad_norm": 0.8266493082046509, + "learning_rate": 3.0143858106606303e-05, + "loss": 0.3199, + "step": 12145 + }, + { + "epoch": 0.9023729065319915, + "grad_norm": 0.8617991209030151, + "learning_rate": 3.012914443746742e-05, + "loss": 0.3528, + "step": 12150 + }, + { + "epoch": 0.9027442534071076, + "grad_norm": 0.800369918346405, + "learning_rate": 3.011442891335241e-05, + "loss": 0.3501, + "step": 12155 + }, + { + "epoch": 0.9031156002822236, + "grad_norm": 0.5995765328407288, + "learning_rate": 3.0099711539583202e-05, + "loss": 0.3415, + "step": 12160 + }, + { + "epoch": 0.9034869471573397, + "grad_norm": 0.892348051071167, + "learning_rate": 3.0084992321482396e-05, + "loss": 0.3692, + "step": 12165 + }, + { + "epoch": 0.9038582940324558, + "grad_norm": 0.904771089553833, + "learning_rate": 3.007027126437325e-05, + "loss": 0.3472, + "step": 12170 + }, + { + "epoch": 0.9042296409075717, + "grad_norm": 0.8912681341171265, + "learning_rate": 3.0055548373579712e-05, + "loss": 0.3379, + "step": 12175 + }, + { + "epoch": 0.9046009877826878, + "grad_norm": 0.6829692125320435, + "learning_rate": 3.0040823654426357e-05, + "loss": 0.3496, + "step": 12180 + }, + { + "epoch": 0.9049723346578038, + "grad_norm": 0.8899698853492737, + "learning_rate": 3.0026097112238454e-05, + "loss": 0.3404, + "step": 12185 + }, + { + "epoch": 0.9053436815329199, + "grad_norm": 0.7954959273338318, + "learning_rate": 3.0011368752341905e-05, + "loss": 0.3657, + "step": 12190 + }, + { + "epoch": 0.905715028408036, + "grad_norm": 0.8103697896003723, + "learning_rate": 2.999663858006329e-05, + "loss": 0.325, + "step": 12195 + }, + { + "epoch": 0.906086375283152, + "grad_norm": 0.7728240489959717, + "learning_rate": 2.9981906600729842e-05, + "loss": 0.3404, + "step": 12200 + }, + { + "epoch": 0.906457722158268, + "grad_norm": 0.873102605342865, + "learning_rate": 2.9967172819669433e-05, + "loss": 0.3357, + "step": 12205 + }, + { + "epoch": 0.9068290690333841, + "grad_norm": 1.4603787660598755, + "learning_rate": 2.9952437242210592e-05, + "loss": 0.349, + "step": 12210 + }, + { + "epoch": 0.9072004159085001, + "grad_norm": 0.7289944887161255, + "learning_rate": 2.99376998736825e-05, + "loss": 0.3354, + "step": 12215 + }, + { + "epoch": 0.9075717627836162, + "grad_norm": 0.7822965383529663, + "learning_rate": 2.9922960719415e-05, + "loss": 0.3467, + "step": 12220 + }, + { + "epoch": 0.9079431096587323, + "grad_norm": 0.859673023223877, + "learning_rate": 2.9908219784738568e-05, + "loss": 0.3443, + "step": 12225 + }, + { + "epoch": 0.9083144565338482, + "grad_norm": 0.7056594491004944, + "learning_rate": 2.98934770749843e-05, + "loss": 0.3503, + "step": 12230 + }, + { + "epoch": 0.9086858034089643, + "grad_norm": 0.8911407589912415, + "learning_rate": 2.9878732595483982e-05, + "loss": 0.3383, + "step": 12235 + }, + { + "epoch": 0.9090571502840804, + "grad_norm": 0.878011167049408, + "learning_rate": 2.9863986351570006e-05, + "loss": 0.3527, + "step": 12240 + }, + { + "epoch": 0.9094284971591964, + "grad_norm": 0.8897885680198669, + "learning_rate": 2.9849238348575416e-05, + "loss": 0.3577, + "step": 12245 + }, + { + "epoch": 0.9097998440343125, + "grad_norm": 1.0172691345214844, + "learning_rate": 2.9834488591833874e-05, + "loss": 0.3301, + "step": 12250 + }, + { + "epoch": 
0.9101711909094286, + "grad_norm": 0.8597069978713989, + "learning_rate": 2.981973708667971e-05, + "loss": 0.3324, + "step": 12255 + }, + { + "epoch": 0.9105425377845445, + "grad_norm": 0.8964053392410278, + "learning_rate": 2.980498383844785e-05, + "loss": 0.3381, + "step": 12260 + }, + { + "epoch": 0.9109138846596606, + "grad_norm": 0.81219881772995, + "learning_rate": 2.979022885247387e-05, + "loss": 0.3557, + "step": 12265 + }, + { + "epoch": 0.9112852315347766, + "grad_norm": 0.7895968556404114, + "learning_rate": 2.9775472134093986e-05, + "loss": 0.34, + "step": 12270 + }, + { + "epoch": 0.9116565784098927, + "grad_norm": 1.065226674079895, + "learning_rate": 2.9760713688645003e-05, + "loss": 0.3473, + "step": 12275 + }, + { + "epoch": 0.9120279252850088, + "grad_norm": 1.1710779666900635, + "learning_rate": 2.9745953521464393e-05, + "loss": 0.3425, + "step": 12280 + }, + { + "epoch": 0.9123992721601247, + "grad_norm": 0.6871535778045654, + "learning_rate": 2.9731191637890222e-05, + "loss": 0.352, + "step": 12285 + }, + { + "epoch": 0.9127706190352408, + "grad_norm": 0.8434427380561829, + "learning_rate": 2.9716428043261187e-05, + "loss": 0.3521, + "step": 12290 + }, + { + "epoch": 0.9131419659103569, + "grad_norm": 0.8964160680770874, + "learning_rate": 2.9701662742916586e-05, + "loss": 0.3217, + "step": 12295 + }, + { + "epoch": 0.9135133127854729, + "grad_norm": 1.7727447748184204, + "learning_rate": 2.9686895742196386e-05, + "loss": 0.3556, + "step": 12300 + }, + { + "epoch": 0.913884659660589, + "grad_norm": 0.7401552796363831, + "learning_rate": 2.9672127046441106e-05, + "loss": 0.3542, + "step": 12305 + }, + { + "epoch": 0.914256006535705, + "grad_norm": 0.8029780983924866, + "learning_rate": 2.965735666099191e-05, + "loss": 0.335, + "step": 12310 + }, + { + "epoch": 0.914627353410821, + "grad_norm": 1.061109185218811, + "learning_rate": 2.9642584591190575e-05, + "loss": 0.3502, + "step": 12315 + }, + { + "epoch": 0.9149987002859371, + "grad_norm": 0.9626049399375916, + "learning_rate": 2.9627810842379472e-05, + "loss": 0.3564, + "step": 12320 + }, + { + "epoch": 0.9153700471610532, + "grad_norm": 0.9086316227912903, + "learning_rate": 2.9613035419901603e-05, + "loss": 0.3414, + "step": 12325 + }, + { + "epoch": 0.9157413940361692, + "grad_norm": 1.798221230506897, + "learning_rate": 2.9598258329100546e-05, + "loss": 0.362, + "step": 12330 + }, + { + "epoch": 0.9161127409112853, + "grad_norm": 0.7969675064086914, + "learning_rate": 2.9583479575320504e-05, + "loss": 0.3382, + "step": 12335 + }, + { + "epoch": 0.9164840877864012, + "grad_norm": 0.6091347336769104, + "learning_rate": 2.956869916390626e-05, + "loss": 0.3676, + "step": 12340 + }, + { + "epoch": 0.9168554346615173, + "grad_norm": 0.639950156211853, + "learning_rate": 2.9553917100203234e-05, + "loss": 0.3228, + "step": 12345 + }, + { + "epoch": 0.9172267815366334, + "grad_norm": 0.7365955114364624, + "learning_rate": 2.9539133389557395e-05, + "loss": 0.3592, + "step": 12350 + }, + { + "epoch": 0.9175981284117494, + "grad_norm": 0.8247587084770203, + "learning_rate": 2.952434803731535e-05, + "loss": 0.3462, + "step": 12355 + }, + { + "epoch": 0.9179694752868655, + "grad_norm": 1.6064798831939697, + "learning_rate": 2.9509561048824276e-05, + "loss": 0.3507, + "step": 12360 + }, + { + "epoch": 0.9183408221619815, + "grad_norm": 0.7756306529045105, + "learning_rate": 2.949477242943195e-05, + "loss": 0.3549, + "step": 12365 + }, + { + "epoch": 0.9187121690370975, + "grad_norm": 1.0088223218917847, + "learning_rate": 
2.9479982184486737e-05, + "loss": 0.3677, + "step": 12370 + }, + { + "epoch": 0.9190835159122136, + "grad_norm": 0.9376280307769775, + "learning_rate": 2.9465190319337587e-05, + "loss": 0.3292, + "step": 12375 + }, + { + "epoch": 0.9194548627873297, + "grad_norm": 0.7487239837646484, + "learning_rate": 2.9450396839334042e-05, + "loss": 0.3398, + "step": 12380 + }, + { + "epoch": 0.9198262096624457, + "grad_norm": 0.8737534284591675, + "learning_rate": 2.943560174982622e-05, + "loss": 0.3441, + "step": 12385 + }, + { + "epoch": 0.9201975565375617, + "grad_norm": 0.8645554780960083, + "learning_rate": 2.9420805056164835e-05, + "loss": 0.3467, + "step": 12390 + }, + { + "epoch": 0.9205689034126778, + "grad_norm": 0.8328379392623901, + "learning_rate": 2.940600676370116e-05, + "loss": 0.3392, + "step": 12395 + }, + { + "epoch": 0.9209402502877938, + "grad_norm": 0.6697943210601807, + "learning_rate": 2.9391206877787064e-05, + "loss": 0.3419, + "step": 12400 + }, + { + "epoch": 0.9213115971629099, + "grad_norm": 0.9482589364051819, + "learning_rate": 2.9376405403774988e-05, + "loss": 0.3467, + "step": 12405 + }, + { + "epoch": 0.921682944038026, + "grad_norm": 0.9211759567260742, + "learning_rate": 2.9361602347017946e-05, + "loss": 0.3485, + "step": 12410 + }, + { + "epoch": 0.922054290913142, + "grad_norm": 1.0176403522491455, + "learning_rate": 2.9346797712869522e-05, + "loss": 0.36, + "step": 12415 + }, + { + "epoch": 0.922425637788258, + "grad_norm": 0.8227538466453552, + "learning_rate": 2.9331991506683876e-05, + "loss": 0.3462, + "step": 12420 + }, + { + "epoch": 0.922796984663374, + "grad_norm": 0.8057060241699219, + "learning_rate": 2.9317183733815723e-05, + "loss": 0.3587, + "step": 12425 + }, + { + "epoch": 0.9231683315384901, + "grad_norm": 0.8205479383468628, + "learning_rate": 2.9302374399620364e-05, + "loss": 0.3842, + "step": 12430 + }, + { + "epoch": 0.9235396784136062, + "grad_norm": 0.9048784375190735, + "learning_rate": 2.9287563509453658e-05, + "loss": 0.3531, + "step": 12435 + }, + { + "epoch": 0.9239110252887222, + "grad_norm": 0.8701086044311523, + "learning_rate": 2.9272751068672015e-05, + "loss": 0.3555, + "step": 12440 + }, + { + "epoch": 0.9242823721638382, + "grad_norm": 1.1633310317993164, + "learning_rate": 2.9257937082632426e-05, + "loss": 0.349, + "step": 12445 + }, + { + "epoch": 0.9246537190389543, + "grad_norm": 0.9627925157546997, + "learning_rate": 2.9243121556692417e-05, + "loss": 0.3371, + "step": 12450 + }, + { + "epoch": 0.9250250659140703, + "grad_norm": 0.828937828540802, + "learning_rate": 2.922830449621009e-05, + "loss": 0.3361, + "step": 12455 + }, + { + "epoch": 0.9253964127891864, + "grad_norm": 1.0228971242904663, + "learning_rate": 2.9213485906544103e-05, + "loss": 0.3445, + "step": 12460 + }, + { + "epoch": 0.9257677596643025, + "grad_norm": 1.1212095022201538, + "learning_rate": 2.9198665793053648e-05, + "loss": 0.3366, + "step": 12465 + }, + { + "epoch": 0.9261391065394184, + "grad_norm": 1.1539406776428223, + "learning_rate": 2.9183844161098484e-05, + "loss": 0.3714, + "step": 12470 + }, + { + "epoch": 0.9265104534145345, + "grad_norm": 0.8389959335327148, + "learning_rate": 2.9169021016038917e-05, + "loss": 0.3501, + "step": 12475 + }, + { + "epoch": 0.9268818002896506, + "grad_norm": 0.9025770425796509, + "learning_rate": 2.9154196363235804e-05, + "loss": 0.3364, + "step": 12480 + }, + { + "epoch": 0.9272531471647666, + "grad_norm": 0.9710972309112549, + "learning_rate": 2.913937020805052e-05, + "loss": 0.3577, + "step": 12485 + }, + { + 
"epoch": 0.9276244940398827, + "grad_norm": 1.1810095310211182, + "learning_rate": 2.912454255584503e-05, + "loss": 0.3565, + "step": 12490 + }, + { + "epoch": 0.9279958409149986, + "grad_norm": 1.0007275342941284, + "learning_rate": 2.9109713411981798e-05, + "loss": 0.3399, + "step": 12495 + }, + { + "epoch": 0.9283671877901147, + "grad_norm": 1.5194200277328491, + "learning_rate": 2.909488278182385e-05, + "loss": 0.3367, + "step": 12500 + }, + { + "epoch": 0.9287385346652308, + "grad_norm": 0.5930137038230896, + "learning_rate": 2.908005067073476e-05, + "loss": 0.3312, + "step": 12505 + }, + { + "epoch": 0.9291098815403468, + "grad_norm": 0.9861831665039062, + "learning_rate": 2.9065217084078588e-05, + "loss": 0.3504, + "step": 12510 + }, + { + "epoch": 0.9294812284154629, + "grad_norm": 3.8240902423858643, + "learning_rate": 2.905038202721999e-05, + "loss": 0.3466, + "step": 12515 + }, + { + "epoch": 0.929852575290579, + "grad_norm": 1.0071384906768799, + "learning_rate": 2.903554550552412e-05, + "loss": 0.333, + "step": 12520 + }, + { + "epoch": 0.9302239221656949, + "grad_norm": 0.8163401484489441, + "learning_rate": 2.902070752435666e-05, + "loss": 0.3312, + "step": 12525 + }, + { + "epoch": 0.930595269040811, + "grad_norm": 0.9900504350662231, + "learning_rate": 2.900586808908382e-05, + "loss": 0.3588, + "step": 12530 + }, + { + "epoch": 0.9309666159159271, + "grad_norm": 0.9772970676422119, + "learning_rate": 2.8991027205072363e-05, + "loss": 0.3598, + "step": 12535 + }, + { + "epoch": 0.9313379627910431, + "grad_norm": 1.0091187953948975, + "learning_rate": 2.8976184877689545e-05, + "loss": 0.3455, + "step": 12540 + }, + { + "epoch": 0.9317093096661592, + "grad_norm": 3.244717836380005, + "learning_rate": 2.8961341112303147e-05, + "loss": 0.3511, + "step": 12545 + }, + { + "epoch": 0.9320806565412753, + "grad_norm": 0.766475260257721, + "learning_rate": 2.894649591428149e-05, + "loss": 0.336, + "step": 12550 + }, + { + "epoch": 0.9324520034163912, + "grad_norm": 0.9685466885566711, + "learning_rate": 2.893164928899339e-05, + "loss": 0.341, + "step": 12555 + }, + { + "epoch": 0.9328233502915073, + "grad_norm": 0.8565616607666016, + "learning_rate": 2.8916801241808205e-05, + "loss": 0.3481, + "step": 12560 + }, + { + "epoch": 0.9331946971666233, + "grad_norm": 0.896621823310852, + "learning_rate": 2.8901951778095783e-05, + "loss": 0.3516, + "step": 12565 + }, + { + "epoch": 0.9335660440417394, + "grad_norm": 0.9232330322265625, + "learning_rate": 2.888710090322649e-05, + "loss": 0.3601, + "step": 12570 + }, + { + "epoch": 0.9339373909168555, + "grad_norm": 0.8799304962158203, + "learning_rate": 2.88722486225712e-05, + "loss": 0.337, + "step": 12575 + }, + { + "epoch": 0.9343087377919714, + "grad_norm": 1.2841633558273315, + "learning_rate": 2.8857394941501325e-05, + "loss": 0.3623, + "step": 12580 + }, + { + "epoch": 0.9346800846670875, + "grad_norm": 0.7310310006141663, + "learning_rate": 2.8842539865388734e-05, + "loss": 0.3434, + "step": 12585 + }, + { + "epoch": 0.9350514315422036, + "grad_norm": 0.8738952875137329, + "learning_rate": 2.8827683399605843e-05, + "loss": 0.3628, + "step": 12590 + }, + { + "epoch": 0.9354227784173196, + "grad_norm": 0.8512542247772217, + "learning_rate": 2.8812825549525545e-05, + "loss": 0.3302, + "step": 12595 + }, + { + "epoch": 0.9357941252924357, + "grad_norm": 0.8368507027626038, + "learning_rate": 2.8797966320521246e-05, + "loss": 0.3378, + "step": 12600 + }, + { + "epoch": 0.9361654721675517, + "grad_norm": 0.7401739954948425, + 
"learning_rate": 2.878310571796685e-05, + "loss": 0.3359, + "step": 12605 + }, + { + "epoch": 0.9365368190426677, + "grad_norm": 0.7290681004524231, + "learning_rate": 2.876824374723675e-05, + "loss": 0.3379, + "step": 12610 + }, + { + "epoch": 0.9369081659177838, + "grad_norm": 0.8211921453475952, + "learning_rate": 2.8753380413705843e-05, + "loss": 0.3525, + "step": 12615 + }, + { + "epoch": 0.9372795127928999, + "grad_norm": 0.9918500781059265, + "learning_rate": 2.8738515722749504e-05, + "loss": 0.3552, + "step": 12620 + }, + { + "epoch": 0.9376508596680159, + "grad_norm": 0.908306360244751, + "learning_rate": 2.8723649679743626e-05, + "loss": 0.3535, + "step": 12625 + }, + { + "epoch": 0.938022206543132, + "grad_norm": 0.9392675161361694, + "learning_rate": 2.8708782290064562e-05, + "loss": 0.3341, + "step": 12630 + }, + { + "epoch": 0.938393553418248, + "grad_norm": 0.6768072247505188, + "learning_rate": 2.8693913559089164e-05, + "loss": 0.3272, + "step": 12635 + }, + { + "epoch": 0.938764900293364, + "grad_norm": 0.8711260557174683, + "learning_rate": 2.8679043492194778e-05, + "loss": 0.3469, + "step": 12640 + }, + { + "epoch": 0.9391362471684801, + "grad_norm": 0.9255607724189758, + "learning_rate": 2.8664172094759218e-05, + "loss": 0.3684, + "step": 12645 + }, + { + "epoch": 0.9395075940435961, + "grad_norm": 1.1237608194351196, + "learning_rate": 2.86492993721608e-05, + "loss": 0.3568, + "step": 12650 + }, + { + "epoch": 0.9398789409187122, + "grad_norm": 0.95046466588974, + "learning_rate": 2.863442532977828e-05, + "loss": 0.3538, + "step": 12655 + }, + { + "epoch": 0.9402502877938282, + "grad_norm": 0.8607051968574524, + "learning_rate": 2.8619549972990934e-05, + "loss": 0.3388, + "step": 12660 + }, + { + "epoch": 0.9406216346689442, + "grad_norm": 0.989203929901123, + "learning_rate": 2.8604673307178487e-05, + "loss": 0.3608, + "step": 12665 + }, + { + "epoch": 0.9409929815440603, + "grad_norm": 0.9122014045715332, + "learning_rate": 2.858979533772116e-05, + "loss": 0.3398, + "step": 12670 + }, + { + "epoch": 0.9413643284191764, + "grad_norm": 1.0959396362304688, + "learning_rate": 2.8574916069999613e-05, + "loss": 0.3634, + "step": 12675 + }, + { + "epoch": 0.9417356752942924, + "grad_norm": 1.000762939453125, + "learning_rate": 2.856003550939501e-05, + "loss": 0.3404, + "step": 12680 + }, + { + "epoch": 0.9421070221694084, + "grad_norm": 0.8134162425994873, + "learning_rate": 2.854515366128896e-05, + "loss": 0.3614, + "step": 12685 + }, + { + "epoch": 0.9424783690445245, + "grad_norm": 0.659251868724823, + "learning_rate": 2.853027053106354e-05, + "loss": 0.3293, + "step": 12690 + }, + { + "epoch": 0.9428497159196405, + "grad_norm": 1.0323694944381714, + "learning_rate": 2.851538612410131e-05, + "loss": 0.3517, + "step": 12695 + }, + { + "epoch": 0.9432210627947566, + "grad_norm": 1.0935428142547607, + "learning_rate": 2.850050044578526e-05, + "loss": 0.3391, + "step": 12700 + }, + { + "epoch": 0.9435924096698727, + "grad_norm": 1.014157772064209, + "learning_rate": 2.8485613501498866e-05, + "loss": 0.3281, + "step": 12705 + }, + { + "epoch": 0.9439637565449887, + "grad_norm": 0.9155251383781433, + "learning_rate": 2.8470725296626046e-05, + "loss": 0.3377, + "step": 12710 + }, + { + "epoch": 0.9443351034201047, + "grad_norm": 0.9557784795761108, + "learning_rate": 2.845583583655119e-05, + "loss": 0.3406, + "step": 12715 + }, + { + "epoch": 0.9447064502952207, + "grad_norm": 0.9360071420669556, + "learning_rate": 2.8440945126659124e-05, + "loss": 0.3551, + "step": 12720 + }, 
+ { + "epoch": 0.9450777971703368, + "grad_norm": 1.0895956754684448, + "learning_rate": 2.842605317233514e-05, + "loss": 0.3259, + "step": 12725 + }, + { + "epoch": 0.9454491440454529, + "grad_norm": 0.8507657051086426, + "learning_rate": 2.8411159978964964e-05, + "loss": 0.3472, + "step": 12730 + }, + { + "epoch": 0.9458204909205689, + "grad_norm": 0.7691348791122437, + "learning_rate": 2.8396265551934792e-05, + "loss": 0.3593, + "step": 12735 + }, + { + "epoch": 0.9461918377956849, + "grad_norm": 0.6096264123916626, + "learning_rate": 2.8381369896631254e-05, + "loss": 0.3183, + "step": 12740 + }, + { + "epoch": 0.946563184670801, + "grad_norm": 0.7602872848510742, + "learning_rate": 2.8366473018441415e-05, + "loss": 0.3286, + "step": 12745 + }, + { + "epoch": 0.946934531545917, + "grad_norm": 0.9890530705451965, + "learning_rate": 2.8351574922752804e-05, + "loss": 0.324, + "step": 12750 + }, + { + "epoch": 0.9473058784210331, + "grad_norm": 0.8396008014678955, + "learning_rate": 2.8336675614953368e-05, + "loss": 0.3404, + "step": 12755 + }, + { + "epoch": 0.9476772252961492, + "grad_norm": 1.0501019954681396, + "learning_rate": 2.8321775100431513e-05, + "loss": 0.3322, + "step": 12760 + }, + { + "epoch": 0.9480485721712651, + "grad_norm": 1.1446385383605957, + "learning_rate": 2.8306873384576066e-05, + "loss": 0.3323, + "step": 12765 + }, + { + "epoch": 0.9484199190463812, + "grad_norm": 0.8426380753517151, + "learning_rate": 2.8291970472776292e-05, + "loss": 0.3485, + "step": 12770 + }, + { + "epoch": 0.9487912659214973, + "grad_norm": 0.9239525198936462, + "learning_rate": 2.8277066370421888e-05, + "loss": 0.3329, + "step": 12775 + }, + { + "epoch": 0.9491626127966133, + "grad_norm": 1.0131057500839233, + "learning_rate": 2.826216108290299e-05, + "loss": 0.3416, + "step": 12780 + }, + { + "epoch": 0.9495339596717294, + "grad_norm": 1.0407359600067139, + "learning_rate": 2.8247254615610157e-05, + "loss": 0.3636, + "step": 12785 + }, + { + "epoch": 0.9499053065468455, + "grad_norm": 1.0886719226837158, + "learning_rate": 2.823234697393436e-05, + "loss": 0.3371, + "step": 12790 + }, + { + "epoch": 0.9502766534219614, + "grad_norm": 0.6153767704963684, + "learning_rate": 2.8217438163267036e-05, + "loss": 0.3319, + "step": 12795 + }, + { + "epoch": 0.9506480002970775, + "grad_norm": 0.9892891049385071, + "learning_rate": 2.820252818899999e-05, + "loss": 0.353, + "step": 12800 + }, + { + "epoch": 0.9510193471721935, + "grad_norm": 0.9184731841087341, + "learning_rate": 2.8187617056525494e-05, + "loss": 0.3163, + "step": 12805 + }, + { + "epoch": 0.9513906940473096, + "grad_norm": 1.0722240209579468, + "learning_rate": 2.8172704771236204e-05, + "loss": 0.3558, + "step": 12810 + }, + { + "epoch": 0.9517620409224257, + "grad_norm": 1.0215551853179932, + "learning_rate": 2.8157791338525224e-05, + "loss": 0.3558, + "step": 12815 + }, + { + "epoch": 0.9521333877975416, + "grad_norm": 1.0462008714675903, + "learning_rate": 2.8142876763786053e-05, + "loss": 0.3504, + "step": 12820 + }, + { + "epoch": 0.9525047346726577, + "grad_norm": 0.8154160380363464, + "learning_rate": 2.81279610524126e-05, + "loss": 0.3436, + "step": 12825 + }, + { + "epoch": 0.9528760815477738, + "grad_norm": 0.8203123807907104, + "learning_rate": 2.811304420979921e-05, + "loss": 0.3453, + "step": 12830 + }, + { + "epoch": 0.9532474284228898, + "grad_norm": 1.075543999671936, + "learning_rate": 2.80981262413406e-05, + "loss": 0.3419, + "step": 12835 + }, + { + "epoch": 0.9536187752980059, + "grad_norm": 0.8190999627113342, + 
"learning_rate": 2.808320715243194e-05, + "loss": 0.3439, + "step": 12840 + }, + { + "epoch": 0.953990122173122, + "grad_norm": 0.8523265719413757, + "learning_rate": 2.8068286948468753e-05, + "loss": 0.343, + "step": 12845 + }, + { + "epoch": 0.9543614690482379, + "grad_norm": 0.9448350071907043, + "learning_rate": 2.8053365634847012e-05, + "loss": 0.3251, + "step": 12850 + }, + { + "epoch": 0.954732815923354, + "grad_norm": 0.7018768191337585, + "learning_rate": 2.803844321696305e-05, + "loss": 0.3526, + "step": 12855 + }, + { + "epoch": 0.9551041627984701, + "grad_norm": 0.7716934680938721, + "learning_rate": 2.8023519700213652e-05, + "loss": 0.3447, + "step": 12860 + }, + { + "epoch": 0.9554755096735861, + "grad_norm": 0.7590059638023376, + "learning_rate": 2.8008595089995937e-05, + "loss": 0.3506, + "step": 12865 + }, + { + "epoch": 0.9558468565487022, + "grad_norm": 1.1159205436706543, + "learning_rate": 2.799366939170747e-05, + "loss": 0.3472, + "step": 12870 + }, + { + "epoch": 0.9562182034238181, + "grad_norm": 1.0658485889434814, + "learning_rate": 2.797874261074619e-05, + "loss": 0.3587, + "step": 12875 + }, + { + "epoch": 0.9565895502989342, + "grad_norm": 0.8533363342285156, + "learning_rate": 2.7963814752510413e-05, + "loss": 0.3248, + "step": 12880 + }, + { + "epoch": 0.9569608971740503, + "grad_norm": 0.8562402129173279, + "learning_rate": 2.794888582239889e-05, + "loss": 0.3433, + "step": 12885 + }, + { + "epoch": 0.9573322440491663, + "grad_norm": 0.9736987948417664, + "learning_rate": 2.7933955825810704e-05, + "loss": 0.3518, + "step": 12890 + }, + { + "epoch": 0.9577035909242824, + "grad_norm": 0.722210705280304, + "learning_rate": 2.791902476814535e-05, + "loss": 0.3424, + "step": 12895 + }, + { + "epoch": 0.9580749377993985, + "grad_norm": 3.026717185974121, + "learning_rate": 2.790409265480272e-05, + "loss": 0.3496, + "step": 12900 + }, + { + "epoch": 0.9584462846745144, + "grad_norm": 0.774347722530365, + "learning_rate": 2.7889159491183075e-05, + "loss": 0.3407, + "step": 12905 + }, + { + "epoch": 0.9588176315496305, + "grad_norm": 0.7414963245391846, + "learning_rate": 2.7874225282687038e-05, + "loss": 0.3331, + "step": 12910 + }, + { + "epoch": 0.9591889784247466, + "grad_norm": 0.8461934924125671, + "learning_rate": 2.785929003471564e-05, + "loss": 0.3518, + "step": 12915 + }, + { + "epoch": 0.9595603252998626, + "grad_norm": 0.9420278668403625, + "learning_rate": 2.784435375267027e-05, + "loss": 0.3518, + "step": 12920 + }, + { + "epoch": 0.9599316721749787, + "grad_norm": 0.6645112037658691, + "learning_rate": 2.7829416441952693e-05, + "loss": 0.3604, + "step": 12925 + }, + { + "epoch": 0.9603030190500947, + "grad_norm": 0.8086858987808228, + "learning_rate": 2.7814478107965065e-05, + "loss": 0.3518, + "step": 12930 + }, + { + "epoch": 0.9606743659252107, + "grad_norm": 0.7968643307685852, + "learning_rate": 2.779953875610987e-05, + "loss": 0.344, + "step": 12935 + }, + { + "epoch": 0.9610457128003268, + "grad_norm": 1.1970291137695312, + "learning_rate": 2.7784598391790007e-05, + "loss": 0.3269, + "step": 12940 + }, + { + "epoch": 0.9614170596754429, + "grad_norm": 0.9078313708305359, + "learning_rate": 2.776965702040871e-05, + "loss": 0.324, + "step": 12945 + }, + { + "epoch": 0.9617884065505589, + "grad_norm": 0.7471829056739807, + "learning_rate": 2.7754714647369594e-05, + "loss": 0.3151, + "step": 12950 + }, + { + "epoch": 0.962159753425675, + "grad_norm": 1.185411810874939, + "learning_rate": 2.7739771278076616e-05, + "loss": 0.3452, + "step": 12955 + }, 
+ { + "epoch": 0.9625311003007909, + "grad_norm": 0.7399412393569946, + "learning_rate": 2.772482691793412e-05, + "loss": 0.3411, + "step": 12960 + }, + { + "epoch": 0.962902447175907, + "grad_norm": 0.9101039171218872, + "learning_rate": 2.770988157234679e-05, + "loss": 0.3404, + "step": 12965 + }, + { + "epoch": 0.9632737940510231, + "grad_norm": 0.9467521905899048, + "learning_rate": 2.769493524671967e-05, + "loss": 0.3403, + "step": 12970 + }, + { + "epoch": 0.9636451409261391, + "grad_norm": 2.10199236869812, + "learning_rate": 2.7679987946458173e-05, + "loss": 0.3473, + "step": 12975 + }, + { + "epoch": 0.9640164878012552, + "grad_norm": 0.9057947397232056, + "learning_rate": 2.7665039676968034e-05, + "loss": 0.3513, + "step": 12980 + }, + { + "epoch": 0.9643878346763712, + "grad_norm": 1.0547771453857422, + "learning_rate": 2.7650090443655364e-05, + "loss": 0.35, + "step": 12985 + }, + { + "epoch": 0.9647591815514872, + "grad_norm": 0.9510254263877869, + "learning_rate": 2.763514025192661e-05, + "loss": 0.3523, + "step": 12990 + }, + { + "epoch": 0.9651305284266033, + "grad_norm": 0.816395103931427, + "learning_rate": 2.762018910718858e-05, + "loss": 0.3532, + "step": 12995 + }, + { + "epoch": 0.9655018753017194, + "grad_norm": 1.1673964262008667, + "learning_rate": 2.7605237014848405e-05, + "loss": 0.3483, + "step": 13000 + }, + { + "epoch": 0.9658732221768354, + "grad_norm": 0.9841897487640381, + "learning_rate": 2.759028398031358e-05, + "loss": 0.3267, + "step": 13005 + }, + { + "epoch": 0.9662445690519514, + "grad_norm": 0.8482001423835754, + "learning_rate": 2.7575330008991924e-05, + "loss": 0.3519, + "step": 13010 + }, + { + "epoch": 0.9666159159270675, + "grad_norm": 0.7443394660949707, + "learning_rate": 2.7560375106291603e-05, + "loss": 0.3301, + "step": 13015 + }, + { + "epoch": 0.9669872628021835, + "grad_norm": 0.9038524627685547, + "learning_rate": 2.754541927762113e-05, + "loss": 0.3627, + "step": 13020 + }, + { + "epoch": 0.9673586096772996, + "grad_norm": 0.995887279510498, + "learning_rate": 2.7530462528389327e-05, + "loss": 0.3343, + "step": 13025 + }, + { + "epoch": 0.9677299565524156, + "grad_norm": 0.9123496413230896, + "learning_rate": 2.7515504864005372e-05, + "loss": 0.2947, + "step": 13030 + }, + { + "epoch": 0.9681013034275316, + "grad_norm": 0.6203708052635193, + "learning_rate": 2.750054628987876e-05, + "loss": 0.3362, + "step": 13035 + }, + { + "epoch": 0.9684726503026477, + "grad_norm": 0.9952647686004639, + "learning_rate": 2.7485586811419333e-05, + "loss": 0.3443, + "step": 13040 + }, + { + "epoch": 0.9688439971777637, + "grad_norm": 1.0715820789337158, + "learning_rate": 2.7470626434037235e-05, + "loss": 0.3621, + "step": 13045 + }, + { + "epoch": 0.9692153440528798, + "grad_norm": 0.9002212285995483, + "learning_rate": 2.7455665163142946e-05, + "loss": 0.3603, + "step": 13050 + }, + { + "epoch": 0.9695866909279959, + "grad_norm": 0.8712630271911621, + "learning_rate": 2.7440703004147278e-05, + "loss": 0.3416, + "step": 13055 + }, + { + "epoch": 0.9699580378031118, + "grad_norm": 1.128656029701233, + "learning_rate": 2.7425739962461357e-05, + "loss": 0.367, + "step": 13060 + }, + { + "epoch": 0.9703293846782279, + "grad_norm": 0.9522350430488586, + "learning_rate": 2.7410776043496632e-05, + "loss": 0.3442, + "step": 13065 + }, + { + "epoch": 0.970700731553344, + "grad_norm": 0.844645619392395, + "learning_rate": 2.739581125266486e-05, + "loss": 0.3489, + "step": 13070 + }, + { + "epoch": 0.97107207842846, + "grad_norm": 0.7419381141662598, + 
"learning_rate": 2.7380845595378114e-05, + "loss": 0.3405, + "step": 13075 + }, + { + "epoch": 0.9714434253035761, + "grad_norm": 0.6905413269996643, + "learning_rate": 2.7365879077048794e-05, + "loss": 0.3489, + "step": 13080 + }, + { + "epoch": 0.9718147721786922, + "grad_norm": 0.9001625776290894, + "learning_rate": 2.7350911703089604e-05, + "loss": 0.3444, + "step": 13085 + }, + { + "epoch": 0.9721861190538081, + "grad_norm": 0.9108909368515015, + "learning_rate": 2.7335943478913544e-05, + "loss": 0.3484, + "step": 13090 + }, + { + "epoch": 0.9725574659289242, + "grad_norm": 1.0819025039672852, + "learning_rate": 2.7320974409933952e-05, + "loss": 0.3543, + "step": 13095 + }, + { + "epoch": 0.9729288128040402, + "grad_norm": 0.7084153890609741, + "learning_rate": 2.730600450156444e-05, + "loss": 0.3328, + "step": 13100 + }, + { + "epoch": 0.9733001596791563, + "grad_norm": 0.8965533971786499, + "learning_rate": 2.729103375921894e-05, + "loss": 0.3462, + "step": 13105 + }, + { + "epoch": 0.9736715065542724, + "grad_norm": 0.8042412996292114, + "learning_rate": 2.727606218831168e-05, + "loss": 0.3398, + "step": 13110 + }, + { + "epoch": 0.9740428534293883, + "grad_norm": 0.8511387705802917, + "learning_rate": 2.7261089794257193e-05, + "loss": 0.3338, + "step": 13115 + }, + { + "epoch": 0.9744142003045044, + "grad_norm": 0.7709136605262756, + "learning_rate": 2.724611658247031e-05, + "loss": 0.3528, + "step": 13120 + }, + { + "epoch": 0.9747855471796205, + "grad_norm": 0.8876646161079407, + "learning_rate": 2.7231142558366147e-05, + "loss": 0.3516, + "step": 13125 + }, + { + "epoch": 0.9751568940547365, + "grad_norm": 0.7100489735603333, + "learning_rate": 2.721616772736013e-05, + "loss": 0.3352, + "step": 13130 + }, + { + "epoch": 0.9755282409298526, + "grad_norm": 1.010686993598938, + "learning_rate": 2.720119209486795e-05, + "loss": 0.3296, + "step": 13135 + }, + { + "epoch": 0.9758995878049687, + "grad_norm": 0.8019929528236389, + "learning_rate": 2.7186215666305632e-05, + "loss": 0.3338, + "step": 13140 + }, + { + "epoch": 0.9762709346800846, + "grad_norm": 0.9189656376838684, + "learning_rate": 2.7171238447089447e-05, + "loss": 0.3603, + "step": 13145 + }, + { + "epoch": 0.9766422815552007, + "grad_norm": 0.7699278593063354, + "learning_rate": 2.7156260442635972e-05, + "loss": 0.3331, + "step": 13150 + }, + { + "epoch": 0.9770136284303168, + "grad_norm": 1.2535781860351562, + "learning_rate": 2.714128165836205e-05, + "loss": 0.3477, + "step": 13155 + }, + { + "epoch": 0.9773849753054328, + "grad_norm": 0.8146220445632935, + "learning_rate": 2.7126302099684848e-05, + "loss": 0.3317, + "step": 13160 + }, + { + "epoch": 0.9777563221805489, + "grad_norm": 0.9014816880226135, + "learning_rate": 2.711132177202176e-05, + "loss": 0.3539, + "step": 13165 + }, + { + "epoch": 0.978127669055665, + "grad_norm": 0.7516365647315979, + "learning_rate": 2.7096340680790493e-05, + "loss": 0.3425, + "step": 13170 + }, + { + "epoch": 0.9784990159307809, + "grad_norm": 0.7220563292503357, + "learning_rate": 2.708135883140902e-05, + "loss": 0.3408, + "step": 13175 + }, + { + "epoch": 0.978870362805897, + "grad_norm": 0.8894364237785339, + "learning_rate": 2.7066376229295583e-05, + "loss": 0.341, + "step": 13180 + }, + { + "epoch": 0.979241709681013, + "grad_norm": 0.8864320516586304, + "learning_rate": 2.7051392879868713e-05, + "loss": 0.3284, + "step": 13185 + }, + { + "epoch": 0.9796130565561291, + "grad_norm": 0.9717690944671631, + "learning_rate": 2.7036408788547192e-05, + "loss": 0.3369, + "step": 
13190 + }, + { + "epoch": 0.9799844034312452, + "grad_norm": 0.8697006106376648, + "learning_rate": 2.7021423960750076e-05, + "loss": 0.3451, + "step": 13195 + }, + { + "epoch": 0.9803557503063611, + "grad_norm": 0.8000594973564148, + "learning_rate": 2.700643840189669e-05, + "loss": 0.3215, + "step": 13200 + }, + { + "epoch": 0.9807270971814772, + "grad_norm": 1.589208960533142, + "learning_rate": 2.699145211740663e-05, + "loss": 0.3277, + "step": 13205 + }, + { + "epoch": 0.9810984440565933, + "grad_norm": 1.129319429397583, + "learning_rate": 2.6976465112699735e-05, + "loss": 0.3415, + "step": 13210 + }, + { + "epoch": 0.9814697909317093, + "grad_norm": 1.2226186990737915, + "learning_rate": 2.6961477393196126e-05, + "loss": 0.3737, + "step": 13215 + }, + { + "epoch": 0.9818411378068254, + "grad_norm": 0.8157563209533691, + "learning_rate": 2.6946488964316176e-05, + "loss": 0.3411, + "step": 13220 + }, + { + "epoch": 0.9822124846819414, + "grad_norm": 0.9670863747596741, + "learning_rate": 2.69314998314805e-05, + "loss": 0.3426, + "step": 13225 + }, + { + "epoch": 0.9825838315570574, + "grad_norm": 1.0690017938613892, + "learning_rate": 2.6916510000109995e-05, + "loss": 0.3566, + "step": 13230 + }, + { + "epoch": 0.9829551784321735, + "grad_norm": 0.968449056148529, + "learning_rate": 2.690151947562578e-05, + "loss": 0.3323, + "step": 13235 + }, + { + "epoch": 0.9833265253072896, + "grad_norm": 0.9085156321525574, + "learning_rate": 2.6886528263449247e-05, + "loss": 0.3504, + "step": 13240 + }, + { + "epoch": 0.9836978721824056, + "grad_norm": 0.8193844556808472, + "learning_rate": 2.6871536369002026e-05, + "loss": 0.3132, + "step": 13245 + }, + { + "epoch": 0.9840692190575216, + "grad_norm": 0.6981024146080017, + "learning_rate": 2.6856543797706008e-05, + "loss": 0.3433, + "step": 13250 + }, + { + "epoch": 0.9844405659326376, + "grad_norm": 1.0682088136672974, + "learning_rate": 2.6841550554983296e-05, + "loss": 0.3288, + "step": 13255 + }, + { + "epoch": 0.9848119128077537, + "grad_norm": 1.944576382637024, + "learning_rate": 2.682655664625628e-05, + "loss": 0.3428, + "step": 13260 + }, + { + "epoch": 0.9851832596828698, + "grad_norm": 0.9447241425514221, + "learning_rate": 2.681156207694756e-05, + "loss": 0.3309, + "step": 13265 + }, + { + "epoch": 0.9855546065579858, + "grad_norm": 0.6979337334632874, + "learning_rate": 2.6796566852479976e-05, + "loss": 0.3218, + "step": 13270 + }, + { + "epoch": 0.9859259534331019, + "grad_norm": 0.914198637008667, + "learning_rate": 2.6781570978276632e-05, + "loss": 0.3447, + "step": 13275 + }, + { + "epoch": 0.9862973003082179, + "grad_norm": 0.8899309039115906, + "learning_rate": 2.6766574459760828e-05, + "loss": 0.3661, + "step": 13280 + }, + { + "epoch": 0.9866686471833339, + "grad_norm": 1.063792109489441, + "learning_rate": 2.6751577302356122e-05, + "loss": 0.3394, + "step": 13285 + }, + { + "epoch": 0.98703999405845, + "grad_norm": 0.8097315430641174, + "learning_rate": 2.6736579511486297e-05, + "loss": 0.3542, + "step": 13290 + }, + { + "epoch": 0.9874113409335661, + "grad_norm": 0.6995810270309448, + "learning_rate": 2.672158109257538e-05, + "loss": 0.3578, + "step": 13295 + }, + { + "epoch": 0.9877826878086821, + "grad_norm": 0.7139712572097778, + "learning_rate": 2.670658205104759e-05, + "loss": 0.3285, + "step": 13300 + }, + { + "epoch": 0.9881540346837981, + "grad_norm": 0.9237804412841797, + "learning_rate": 2.6691582392327397e-05, + "loss": 0.3341, + "step": 13305 + }, + { + "epoch": 0.9885253815589142, + "grad_norm": 
0.980902910232544, + "learning_rate": 2.6676582121839495e-05, + "loss": 0.333, + "step": 13310 + }, + { + "epoch": 0.9888967284340302, + "grad_norm": 0.8618834018707275, + "learning_rate": 2.6661581245008793e-05, + "loss": 0.3346, + "step": 13315 + }, + { + "epoch": 0.9892680753091463, + "grad_norm": 0.8352036476135254, + "learning_rate": 2.6646579767260417e-05, + "loss": 0.3466, + "step": 13320 + }, + { + "epoch": 0.9896394221842624, + "grad_norm": 1.0906356573104858, + "learning_rate": 2.6631577694019706e-05, + "loss": 0.3279, + "step": 13325 + }, + { + "epoch": 0.9900107690593783, + "grad_norm": 1.1875203847885132, + "learning_rate": 2.6616575030712243e-05, + "loss": 0.3378, + "step": 13330 + }, + { + "epoch": 0.9903821159344944, + "grad_norm": 0.7007048726081848, + "learning_rate": 2.660157178276378e-05, + "loss": 0.3276, + "step": 13335 + }, + { + "epoch": 0.9907534628096104, + "grad_norm": 1.1638414859771729, + "learning_rate": 2.658656795560031e-05, + "loss": 0.313, + "step": 13340 + }, + { + "epoch": 0.9911248096847265, + "grad_norm": 0.9487748742103577, + "learning_rate": 2.6571563554648033e-05, + "loss": 0.3097, + "step": 13345 + }, + { + "epoch": 0.9914961565598426, + "grad_norm": 0.878073513507843, + "learning_rate": 2.6556558585333347e-05, + "loss": 0.3296, + "step": 13350 + }, + { + "epoch": 0.9918675034349586, + "grad_norm": 0.6810947060585022, + "learning_rate": 2.6541553053082862e-05, + "loss": 0.3447, + "step": 13355 + }, + { + "epoch": 0.9922388503100746, + "grad_norm": 0.7682046890258789, + "learning_rate": 2.6526546963323395e-05, + "loss": 0.33, + "step": 13360 + }, + { + "epoch": 0.9926101971851907, + "grad_norm": 0.7372832298278809, + "learning_rate": 2.6511540321481958e-05, + "loss": 0.3364, + "step": 13365 + }, + { + "epoch": 0.9929815440603067, + "grad_norm": 1.080094575881958, + "learning_rate": 2.6496533132985758e-05, + "loss": 0.3333, + "step": 13370 + }, + { + "epoch": 0.9933528909354228, + "grad_norm": 0.8818208575248718, + "learning_rate": 2.6481525403262214e-05, + "loss": 0.3462, + "step": 13375 + }, + { + "epoch": 0.9937242378105389, + "grad_norm": 0.8809915781021118, + "learning_rate": 2.6466517137738932e-05, + "loss": 0.3592, + "step": 13380 + }, + { + "epoch": 0.9940955846856548, + "grad_norm": 1.348819613456726, + "learning_rate": 2.645150834184371e-05, + "loss": 0.33, + "step": 13385 + }, + { + "epoch": 0.9944669315607709, + "grad_norm": 0.714433491230011, + "learning_rate": 2.6436499021004534e-05, + "loss": 0.3374, + "step": 13390 + }, + { + "epoch": 0.994838278435887, + "grad_norm": 0.7457917928695679, + "learning_rate": 2.6421489180649604e-05, + "loss": 0.3482, + "step": 13395 + }, + { + "epoch": 0.995209625311003, + "grad_norm": 0.7141998410224915, + "learning_rate": 2.6406478826207275e-05, + "loss": 0.3359, + "step": 13400 + }, + { + "epoch": 0.9955809721861191, + "grad_norm": 0.9509761333465576, + "learning_rate": 2.6391467963106113e-05, + "loss": 0.3523, + "step": 13405 + }, + { + "epoch": 0.995952319061235, + "grad_norm": 0.9249238967895508, + "learning_rate": 2.6376456596774858e-05, + "loss": 0.3352, + "step": 13410 + }, + { + "epoch": 0.9963236659363511, + "grad_norm": 0.913701057434082, + "learning_rate": 2.636144473264241e-05, + "loss": 0.3387, + "step": 13415 + }, + { + "epoch": 0.9966950128114672, + "grad_norm": 0.7925592660903931, + "learning_rate": 2.634643237613791e-05, + "loss": 0.3378, + "step": 13420 + }, + { + "epoch": 0.9970663596865832, + "grad_norm": 1.0819846391677856, + "learning_rate": 2.6331419532690603e-05, + "loss": 
0.3384, + "step": 13425 + }, + { + "epoch": 0.9974377065616993, + "grad_norm": 0.7865422964096069, + "learning_rate": 2.6316406207729972e-05, + "loss": 0.347, + "step": 13430 + }, + { + "epoch": 0.9978090534368154, + "grad_norm": 0.7669934034347534, + "learning_rate": 2.630139240668562e-05, + "loss": 0.3611, + "step": 13435 + }, + { + "epoch": 0.9981804003119313, + "grad_norm": 0.7105921506881714, + "learning_rate": 2.628637813498738e-05, + "loss": 0.3548, + "step": 13440 + }, + { + "epoch": 0.9985517471870474, + "grad_norm": 0.9623938798904419, + "learning_rate": 2.6271363398065206e-05, + "loss": 0.3565, + "step": 13445 + }, + { + "epoch": 0.9989230940621635, + "grad_norm": 0.8119087219238281, + "learning_rate": 2.625634820134924e-05, + "loss": 0.3341, + "step": 13450 + }, + { + "epoch": 0.9992944409372795, + "grad_norm": 0.8674593567848206, + "learning_rate": 2.6241332550269794e-05, + "loss": 0.3478, + "step": 13455 + }, + { + "epoch": 0.9996657878123956, + "grad_norm": 1.0126354694366455, + "learning_rate": 2.6226316450257338e-05, + "loss": 0.3165, + "step": 13460 + } + ], + "logging_steps": 5, + "max_steps": 26928, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.530315460862699e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}